From bd8527fc559ec28a79d9ee8e273197ccf2401a72 Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Mon, 22 Jun 2026 17:17:27 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Crawl/Adapters/FacultyListHtmlAdapter.php | 679 +++++++++++++++++- tests/Unit/FacultyListHtmlAdapterTest.php | 136 ++++ 2 files changed, 807 insertions(+), 8 deletions(-) diff --git a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php index 0482067..01498a7 100644 --- a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php +++ b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php @@ -31,6 +31,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface return $this->enrichEmailsFromProfilePages($items, $params); } + if ($this->isNjuTeacherHomePage($firstHtml)) { + $items = $this->fetchNjuTeacherHomeItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages); + + return $this->enrichEmailsFromProfilePages($items, $params); + } + $totalPages = $this->detectTotalPages($firstHtml); $pagesToFetch = min($maxPages, $totalPages); @@ -485,6 +491,21 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface */ protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array { + $items = $this->extractFromSudyNewsFacultyList($html, $keywords, $sourceUrl); + if ($items !== []) { + return $items; + } + + $items = $this->extractFromRaTeacherList($html, $keywords, $sourceUrl); + if ($items !== []) { + return $items; + } + + $items = $this->extractFromVsbFacultyTable($html, $keywords, $sourceUrl); + if ($items !== []) { + return $items; + } + $items = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl); if ($items !== []) { return $items; @@ -498,6 +519,363 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl); } + /** + * 南大 Sudy CMS:ul.news_list 内 news_title / news_title1 链接(frontier、ic 等)。 + * + * @param list $keywords + * @return list + */ + protected function extractFromSudyNewsFacultyList(string $html, array $keywords, string $sourceUrl): array + { + if (! preg_match('/class="news_list/u', $html)) { + return []; + } + + $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html); + $defaultCollege = $this->inferCollegeFromPageTitle($html); + if ($defaultCollege === null && preg_match('#
  • ([^<]+)

    #u', $html, $titleMatch)) { + $defaultCollege = CrawlAuthorParser::cleanText($titleMatch[1]); + } + + $items = []; + $seen = []; + $chunks = preg_split('#
  • ([^<]+)#u', $chunk, $deptMatch)) { + $department = CrawlAuthorParser::cleanText($deptMatch[1]); + } + + foreach ($this->extractSudyNewsLinksFromChunk( + $chunk, + $department, + $keywords, + $sourceUrl, + $pageUniversity, + $seen, + ) as $item) { + $items[] = $item; + } + } + + return $items; + } + + /** + * @param array $seen + * @param list $keywords + * @return list + */ + protected function extractSudyNewsLinksFromChunk( + string $chunk, + ?string $department, + array $keywords, + string $sourceUrl, + ?string $pageUniversity, + array &$seen, + ): array { + $items = []; + + if (! preg_match_all( + '#<(?:div|span)\s+class="news_title1?">\s*]*?)>([^<]+)#su', + $chunk, + $matches, + PREG_SET_ORDER, + )) { + return []; + } + + foreach ($matches as $match) { + $attrs = (string) $match[1]; + $name = CrawlAuthorParser::cleanText($match[2]) ?? ''; + if ($name === '' || ! $this->looksLikePersonName($name)) { + continue; + } + if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) { + continue; + } + + $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'); + if (! $this->looksLikeTeacherProfileUrl($href, null)) { + continue; + } + + $profileUrl = $this->resolveUrl($href, $sourceUrl); + $dedupeKey = $profileUrl ?: ('name:'.md5($name)); + if (isset($seen[$dedupeKey])) { + continue; + } + + $plain = trim($name.' '.($department ?? '')); + if (! $this->matchesKeywords($plain, $keywords)) { + continue; + } + + $seen[$dedupeKey] = true; + $items[] = $this->makeFacultyItem( + externalKey: 'faculty:'.md5($dedupeKey), + name: $name, + profileUrl: $profileUrl, + email: null, + affiliation: $department, + universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department), + summary: $department ? '单位:'.$department : null, + keywords: $keywords, + academicTitle: null, + platform: 'faculty_html_sudy_news', + bio: null, + ); + } + + return $items; + } + + /** + * 南大机器人学院等博山 CMS:ul.teacher 卡片(div.xm 姓名)。 + * + * @param list $keywords + * @return list + */ + protected function extractFromRaTeacherList(string $html, array $keywords, string $sourceUrl): array + { + if (! preg_match('/