From 22dec52d2ef2b7e339b65cbb0856f4c76fdf6c61 Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Mon, 22 Jun 2026 16:26:54 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Crawl/Adapters/FacultyListHtmlAdapter.php | 225 +++++++++++++++++- tests/Unit/FacultyListHtmlAdapterTest.php | 66 +++++ 2 files changed, 288 insertions(+), 3 deletions(-) diff --git a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php index 3a3a288..fe537a7 100644 --- a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php +++ b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php @@ -91,10 +91,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface foreach ($pending as $externalId => $item) { $response = $responses[$externalId] ?? null; if ($response && $response->successful()) { - $email = $this->extractEmailFromProfileHtml((string) $response->body()); + $body = (string) $response->body(); + $email = $this->extractEmailFromProfileHtml($body); if ($email) { $item = $this->applyEmailToItem($item, $email); } + $item = $this->applyProfileMetadataToItem($item, $body); } $enriched[] = $item; } @@ -299,7 +301,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface return $items; } - return $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl); + $items = $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl); + if ($items !== []) { + return $items; + } + + return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl); } /** @@ -362,11 +369,223 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface } /** - * 上海交通大学等 tsites.CollegeTeacherList:div.list > ul > li 卡片。 + * 上海交大材料学院等:panel-item + a.staff-item(/people/detail_new/{id})。 * * @param list $keywords * @return list */ + protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array + { + $items = []; + $seen = []; + $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html); + $defaultCollege = $this->inferCollegeFromPageTitle($html); + + $panelChunks = preg_split('##u', $html) ?: []; + if (count($panelChunks) > 1) { + array_shift($panelChunks); + foreach ($panelChunks as $chunk) { + if (! preg_match('#\s*([^<]+?)\s*#u', $chunk, $titleMatch)) { + continue; + } + $department = CrawlAuthorParser::cleanText($titleMatch[1]); + foreach ($this->extractStaffItemLinks($chunk) as $link) { + $item = $this->makeStaffPanelItem( + $link, + $department ?: $defaultCollege, + $pageUniversity, + $keywords, + $sourceUrl, + ); + if ($item === null || isset($seen[$item->externalId])) { + continue; + } + $seen[$item->externalId] = true; + $items[] = $item; + } + } + } + + if ($items !== []) { + return $items; + } + + foreach ($this->extractStaffItemLinks($html) as $link) { + $item = $this->makeStaffPanelItem( + $link, + $defaultCollege, + $pageUniversity, + $keywords, + $sourceUrl, + ); + if ($item === null || isset($seen[$item->externalId])) { + continue; + } + $seen[$item->externalId] = true; + $items[] = $item; + } + + return $items; + } + + /** + * @return list + */ + protected function extractStaffItemLinks(string $html): array + { + $links = []; + $seen = []; + + if (! preg_match_all('#]*?)>([^<]+)#su', $html, $matches, PREG_SET_ORDER)) { + return []; + } + + foreach ($matches as $match) { + $attrs = (string) $match[1]; + if (! str_contains($attrs, 'staff-item')) { + continue; + } + if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) { + continue; + } + + $name = CrawlAuthorParser::cleanText($match[2]) ?? ''; + if ($name === '' || ! $this->looksLikePersonName($name)) { + continue; + } + + $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'); + $key = $href.'|'.$name; + if (isset($seen[$key])) { + continue; + } + $seen[$key] = true; + $links[] = ['href' => $href, 'name' => $name]; + } + + return $links; + } + + /** + * @param array{href:string,name:string} $link + * @param list $keywords + */ + protected function makeStaffPanelItem( + array $link, + ?string $department, + ?string $pageUniversity, + array $keywords, + string $sourceUrl, + ): ?CrawlItemDto { + $name = $link['name']; + $profileUrl = $this->resolveUrl($link['href'], $sourceUrl); + $plain = trim($name.' '.($department ?? '')); + + if (! $this->matchesKeywords($plain, $keywords)) { + return null; + } + + $dedupeKey = $profileUrl ?: ('name:'.md5($name)); + $affiliation = $department; + $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation); + + return $this->makeFacultyItem( + externalKey: 'faculty:'.md5($dedupeKey), + name: $name, + profileUrl: $profileUrl, + email: null, + affiliation: $affiliation, + universityName: $universityName, + summary: $department ? '单位:'.$department : null, + keywords: $keywords, + academicTitle: null, + platform: 'faculty_html_smse', + bio: null, + ); + } + + protected function inferCollegeFromPageTitle(string $html): ?string + { + if (! preg_match('/\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) { + return null; + } + + $title = CrawlAuthorParser::cleanText($match[1]); + if ($title === null || $title === '') { + return null; + } + + if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) { + return CrawlAuthorParser::cleanText($college[1]); + } + + return $title; + } + + protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto + { + $lead = is_array($item->extra['lead_author'] ?? null) ? $item->extra['lead_author'] : []; + $changed = false; + + if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $titleMatch)) { + $title = CrawlAuthorParser::cleanText($titleMatch[1]); + if ($title !== null && $title !== '') { + $lead['academic_title'] = $title; + $changed = true; + } + } + + if (empty($lead['college']) && empty($lead['affiliation'])) { + $dept = $this->parseLabeledField($html, '所属二级机构'); + if ($dept !== null && $dept !== '') { + $lead['affiliation'] = $dept; + $lead['college'] = $dept; + $changed = true; + } + } + + if (! $changed) { + return $item; + } + + $extra = $item->extra; + $extra['lead_author'] = $lead; + if (! empty($lead['academic_title'])) { + $extra['academic_title'] = $lead['academic_title']; + } + if (! empty($lead['college'])) { + $extra['college_name'] = $lead['college']; + } + + $authorsParsed = $item->authorsParsed; + if ($authorsParsed !== []) { + if (! empty($lead['academic_title'])) { + $authorsParsed[0]['academic_title'] = $lead['academic_title']; + } + if (! empty($lead['college'])) { + $authorsParsed[0]['affiliation'] = $lead['college']; + } + } + + return new CrawlItemDto( + externalId: $item->externalId, + title: $item->title, + canonicalUrl: $item->canonicalUrl, + authors: $item->authors, + summary: $item->summary, + publishedAt: $item->publishedAt, + schoolName: $item->schoolName, + section: $item->section, + contentHtml: $item->contentHtml, + extra: $extra, + authorsParsed: $authorsParsed, + ); + } + + /** + * @param list<string> $keywords + * @return list<CrawlItemDto> + */ protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array { $items = []; diff --git a/tests/Unit/FacultyListHtmlAdapterTest.php b/tests/Unit/FacultyListHtmlAdapterTest.php index afb7a12..3a966ad 100644 --- a/tests/Unit/FacultyListHtmlAdapterTest.php +++ b/tests/Unit/FacultyListHtmlAdapterTest.php @@ -143,4 +143,70 @@ HTML; $this->assertStringContainsString('PAGENUM=3', $url); $this->assertStringContainsString('totalpage=20', $url); } + + public function test_extracts_smse_staff_panel_list(): void + { + $html = <<<'HTML' +<title>教师名录 - 上海交通大学材料科学与工程学院 +
+
塑性成形技术与装备研究院
+
+
+ +
+HTML; + + $adapter = new FacultyListHtmlAdapter; + $method = new \ReflectionMethod($adapter, 'extractFromHtml'); + $method->setAccessible(true); + + $items = $method->invoke( + $adapter, + $html, + [], + 'https://smse.sjtu.edu.cn/people/staff_new/department', + ); + + $this->assertCount(2, $items); + $this->assertSame('陈军', $items[0]->title); + $this->assertSame('https://smse.sjtu.edu.cn/people/detail_new/20092', $items[0]->canonicalUrl); + $this->assertSame('上海交通大学', $items[0]->schoolName); + $this->assertSame('faculty_html_smse', $items[0]->extra['platform']); + $this->assertSame('塑性成形技术与装备研究院', $items[0]->extra['college_name']); + } + + public function test_apply_profile_metadata_from_smse_detail_page(): void + { + $html = <<<'HTML' +

陈军

教授
+
所属二级机构:塑性成形技术与装备研究院
+HTML; + + $adapter = new FacultyListHtmlAdapter; + $method = new \ReflectionMethod($adapter, 'applyProfileMetadataToItem'); + $method->setAccessible(true); + + $item = $method->invoke( + $adapter, + new \App\Services\Crawl\CrawlItemDto( + externalId: 'faculty:test', + title: '陈军', + canonicalUrl: 'https://smse.sjtu.edu.cn/people/detail_new/20092', + extra: [ + 'lead_author' => [ + 'name' => '陈军', + 'email' => null, + 'university_name' => '上海交通大学', + ], + ], + ), + $html, + ); + + $this->assertSame('教授', $item->extra['lead_author']['academic_title']); + $this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']); + } }