From a3a850b049e793ba9fe0dcd21f14bd45ec139a43 Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Mon, 22 Jun 2026 17:07:51 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Crawl/Adapters/FacultyListHtmlAdapter.php | 214 +++++++++++++++--- tests/Unit/FacultyListHtmlAdapterTest.php | 60 +++++ 2 files changed, 243 insertions(+), 31 deletions(-) diff --git a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php index be17572..0482067 100644 --- a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php +++ b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php @@ -26,7 +26,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface $firstHtml = $this->fetchHtml($baseUrl); if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) { - $items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults); + $items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages); return $this->enrichEmailsFromProfilePages($items, $params); } @@ -516,22 +516,90 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface string $pageHtml, array $keywords, int $maxResults, + int $maxPages = 1, ): array { $config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl); $search = implode(' ', $keywords); - $type = $search !== '' ? '2' : '1'; $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20)); + $maxPages = max(1, min(50, $maxPages)); - $payload = [ - 'cat_id' => $config['cat_id'], - 'cat_code' => $config['cat_code'], - 'type' => $type, - 'zm' => $search === '' ? 'All' : '', - 'zc' => '', - 'search' => $search, - ]; - if ($config['uses_page']) { - $payload['page'] = '1'; + $items = []; + $seen = []; + $page = 1; + $totalCount = null; + + while ($page <= $maxPages && count($items) < $maxResults) { + $body = $this->requestAjaxTeacherPage($config, $page, $search, $timeout); + if ($totalCount === null && isset($body['count'])) { + $totalCount = max(0, (int) $body['count']); + } + + $content = (string) ($body['content'] ?? ''); + if ($content === '') { + break; + } + + $before = count($items); + foreach ($this->extractFromAjaxTeacherContent( + $pageHtml.$content, + $keywords, + $requestUrl, + $config['cat_code'], + ) as $item) { + if (isset($seen[$item->externalId])) { + continue; + } + $seen[$item->externalId] = true; + $items[] = $item; + if (count($items) >= $maxResults) { + break 2; + } + } + + if ($config['variant'] === 'standard') { + break; + } + + if (count($items) === $before) { + break; + } + + if ($totalCount !== null && count($items) >= min($totalCount, $maxResults)) { + break; + } + + $page++; + } + + return $items; + } + + /** + * @param array{variant:string,cat_id:?string,cat_code:string,api_url:string} $config + * @return array + */ + protected function requestAjaxTeacherPage(array $config, int $page, string $search, int $timeout): array + { + if ($config['variant'] === 'simple') { + $payload = [ + 'page' => (string) $page, + 'cat_code' => $config['cat_code'], + 'yjszxfl' => '全部', + 'name' => $search, + 'zm' => $search === '' ? 'All' : '', + ]; + } else { + $payload = [ + 'cat_id' => $config['cat_id'], + 'cat_code' => $config['cat_code'], + 'type' => $search !== '' ? '2' : '1', + 'zm' => $search === '' ? 'All' : '', + 'zc' => '', + 'search' => $search, + ]; + if ($config['uses_page']) { + $payload['page'] = (string) $page; + } } $response = Http::timeout($timeout) @@ -553,26 +621,11 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface throw new \RuntimeException('教师列表接口返回格式异常'); } - $content = (string) ($body['content'] ?? ''); - if ($content === '') { - return []; - } - - $items = $this->extractFromAjaxTeacherContent( - $pageHtml.$content, - $keywords, - $requestUrl, - $config['cat_code'], - ); - if (count($items) > $maxResults) { - $items = array_slice($items, 0, $maxResults); - } - - return $items; + return $body; } /** - * @return array{cat_id:string,cat_code:string,api_url:string,uses_page:bool} + * @return array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool} */ protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array { @@ -597,11 +650,17 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface $apiUrl = $origin.$apiUrl; } - if ($catId === null || $catCode === null || $apiUrl === '') { - throw new \RuntimeException('无法解析教师列表接口参数(cat_id / cat_code)'); + if ($catCode === null || $apiUrl === '') { + throw new \RuntimeException('无法解析教师列表接口参数(cat_code)'); + } + + $variant = $catId !== null ? 'standard' : 'simple'; + if ($variant === 'simple') { + $usesPage = true; } return [ + 'variant' => $variant, 'cat_id' => $catId, 'cat_code' => $catCode, 'api_url' => $apiUrl, @@ -624,6 +683,18 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html); $defaultCollege = $this->inferCollegeFromPageTitle($html); + $cardItems = $this->extractFromAjaxTeacherCards( + $html, + $keywords, + $sourceUrl, + $pageUniversity, + $defaultCollege, + $catCode, + ); + if ($cardItems !== []) { + return $cardItems; + } + $parts = preg_split('##u', $html) ?: []; if (count($parts) > 1) { array_shift($parts); @@ -667,6 +738,87 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface ); } + /** + * ICISEE 等站点 AJAX 返回的卡片式教师列表(姓名在 div.name 内,职称在 span 内)。 + * + * @param list $keywords + * @return list + */ + protected function extractFromAjaxTeacherCards( + string $html, + array $keywords, + string $sourceUrl, + ?string $pageUniversity, + ?string $affiliation, + ?string $catCode, + ): array { + if (! preg_match_all( + '#]*?)>\s*(?:.*?\s*)?(.*?)#su', + $html, + $matches, + PREG_SET_ORDER, + )) { + return []; + } + + $items = []; + $seen = []; + + foreach ($matches as $match) { + $attrs = (string) $match[1]; + $nameBlock = (string) $match[2]; + if (! preg_match('/^([^<]+)/u', $nameBlock, $nameMatch)) { + continue; + } + + $name = CrawlAuthorParser::cleanText(trim($nameMatch[1])) ?? ''; + if ($name === '' || ! $this->looksLikePersonName($name)) { + continue; + } + if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) { + continue; + } + + $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'); + if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) { + continue; + } + + $profileUrl = $this->resolveUrl($href, $sourceUrl); + $dedupeKey = $profileUrl ?: ('name:'.md5($name)); + if (isset($seen[$dedupeKey])) { + continue; + } + + $plain = trim($name.' '.($affiliation ?? '')); + if (! $this->matchesKeywords($plain, $keywords)) { + continue; + } + + $academicTitle = null; + if (preg_match('#([^<]+)#u', $nameBlock, $titleMatch)) { + $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]); + } + + $seen[$dedupeKey] = true; + $items[] = $this->makeFacultyItem( + externalKey: 'faculty:'.md5($dedupeKey), + name: $name, + profileUrl: $profileUrl, + email: null, + affiliation: $affiliation, + universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation), + summary: $affiliation ? '单位:'.$affiliation : null, + keywords: $keywords, + academicTitle: $academicTitle, + platform: 'faculty_html_ajax', + bio: null, + ); + } + + return $items; + } + /** * @param list $keywords * @return list diff --git a/tests/Unit/FacultyListHtmlAdapterTest.php b/tests/Unit/FacultyListHtmlAdapterTest.php index 3c61b30..009ca6d 100644 --- a/tests/Unit/FacultyListHtmlAdapterTest.php +++ b/tests/Unit/FacultyListHtmlAdapterTest.php @@ -293,4 +293,64 @@ HTML; ))); $this->assertNull($method->invoke($adapter, null)); } + + public function test_parses_icisee_ajax_teacher_config_without_cat_id(): void + { + $html = <<<'HTML' + +HTML; + + $adapter = new FacultyListHtmlAdapter; + $method = new \ReflectionMethod($adapter, 'parseAjaxTeacherConfig'); + $method->setAccessible(true); + + $config = $method->invoke($adapter, $html, 'https://icisee.sjtu.edu.cn/jiaoshiml.html'); + + $this->assertSame('simple', $config['variant']); + $this->assertNull($config['cat_id']); + $this->assertSame('jiaoshiml', $config['cat_code']); + $this->assertSame('https://icisee.sjtu.edu.cn/active/ajax_teacher_list.html', $config['api_url']); + $this->assertTrue($config['uses_page']); + } + + public function test_extracts_icisee_card_style_teacher_list(): void + { + $html = <<<'HTML' +教师名录-上海交通大学集成电路学院(信息与电子工程学院) + +
+
蔡星汉教授

微纳全重党支部书记

+
+ +
+
张三副教授
+
+HTML; + + $adapter = new FacultyListHtmlAdapter; + $method = new \ReflectionMethod($adapter, 'extractFromAjaxTeacherContent'); + $method->setAccessible(true); + + $items = $method->invoke( + $adapter, + $html, + [], + 'https://icisee.sjtu.edu.cn/jiaoshiml.html', + 'jiaoshiml', + ); + + $this->assertCount(2, $items); + $this->assertSame('蔡星汉', $items[0]->title); + $this->assertSame('教授', $items[0]->extra['academic_title']); + $this->assertSame('https://icisee.sjtu.edu.cn/jiaoshiml/caixinghan.html', $items[0]->canonicalUrl); + $this->assertSame('张三', $items[1]->title); + $this->assertSame('副教授', $items[1]->extra['academic_title']); + $this->assertSame('faculty_html_ajax', $items[0]->extra['platform']); + } }