diff --git a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php index d4643d7..be17572 100644 --- a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php +++ b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php @@ -25,8 +25,8 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface $baseUrl = $this->normalizeRequestUrl($requestUrl); $firstHtml = $this->fetchHtml($baseUrl); - if ($this->isSaisAjaxFacultyPage($firstHtml, $requestUrl)) { - $items = $this->fetchSaisFacultyItems($requestUrl, $firstHtml, $keywords, $maxResults); + if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) { + $items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults); return $this->enrichEmailsFromProfilePages($items, $params); } @@ -498,37 +498,42 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl); } - protected function isSaisAjaxFacultyPage(string $html, string $sourceUrl): bool + protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool { if (str_contains(strtolower($sourceUrl), 'ajax_teacher_list')) { return true; } - if (str_contains($html, 'ajax_teacher_list.html')) { - return true; - } - - $host = strtolower((string) parse_url($sourceUrl, PHP_URL_HOST)); - - return str_contains($host, 'sais.sjtu.edu.cn') - && str_contains(strtolower($sourceUrl), 'faculty'); + return str_contains($html, 'ajax_teacher_list.html'); } /** * @param list $keywords * @return list */ - protected function fetchSaisFacultyItems( + protected function fetchAjaxTeacherItems( string $requestUrl, string $pageHtml, array $keywords, int $maxResults, ): array { - $config = $this->parseSaisAjaxConfig($pageHtml, $requestUrl); + $config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl); $search = implode(' ', $keywords); $type = $search !== '' ? '2' : '1'; $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20)); + $payload = [ + 'cat_id' => $config['cat_id'], + 'cat_code' => $config['cat_code'], + 'type' => $type, + 'zm' => $search === '' ? 'All' : '', + 'zc' => '', + 'search' => $search, + ]; + if ($config['uses_page']) { + $payload['page'] = '1'; + } + $response = Http::timeout($timeout) ->connectTimeout(min(8, $timeout)) ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) @@ -537,29 +542,28 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface 'Accept' => 'application/json, text/html', ]) ->asForm() - ->post($config['api_url'], [ - 'cat_id' => $config['cat_id'], - 'cat_code' => $config['cat_code'], - 'type' => $type, - 'zm' => $search === '' ? 'All' : '', - 'search' => $search, - ]); + ->post($config['api_url'], $payload); if (! $response->successful()) { - throw new \RuntimeException('SAIS 教师列表接口请求失败(HTTP '.$response->status().')'); + throw new \RuntimeException('教师列表接口请求失败(HTTP '.$response->status().')'); } - $payload = $response->json(); - if (! is_array($payload)) { - throw new \RuntimeException('SAIS 教师列表接口返回格式异常'); + $body = $response->json(); + if (! is_array($body)) { + throw new \RuntimeException('教师列表接口返回格式异常'); } - $content = (string) ($payload['content'] ?? ''); + $content = (string) ($body['content'] ?? ''); if ($content === '') { return []; } - $items = $this->extractFromSaisJsList($pageHtml.$content, $keywords, $requestUrl); + $items = $this->extractFromAjaxTeacherContent( + $pageHtml.$content, + $keywords, + $requestUrl, + $config['cat_code'], + ); if (count($items) > $maxResults) { $items = array_slice($items, 0, $maxResults); } @@ -568,13 +572,15 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface } /** - * @return array{cat_id:string,cat_code:string,api_url:string} + * @return array{cat_id:string,cat_code:string,api_url:string,uses_page:bool} */ - protected function parseSaisAjaxConfig(string $html, string $sourceUrl): array + protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array { - $catId = '18'; - $catCode = 'faculty'; - $apiUrl = 'https://sais.sjtu.edu.cn/active/ajax_teacher_list.html'; + $catId = null; + $catCode = null; + $usesPage = str_contains($html, 'page:page'); + $origin = $this->requestOrigin($sourceUrl); + $apiUrl = $origin !== null ? $origin.'/active/ajax_teacher_list.html' : ''; if (preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $match)) { $catId = $match[1]; @@ -587,15 +593,19 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface ?? $apiUrl; } - $origin = $this->requestOrigin($sourceUrl); if ($origin !== null && str_starts_with($apiUrl, '/')) { $apiUrl = $origin.$apiUrl; } + if ($catId === null || $catCode === null || $apiUrl === '') { + throw new \RuntimeException('无法解析教师列表接口参数(cat_id / cat_code)'); + } + return [ 'cat_id' => $catId, 'cat_code' => $catCode, 'api_url' => $apiUrl, + 'uses_page' => $usesPage, ]; } @@ -603,33 +613,92 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface * @param list $keywords * @return list */ - protected function extractFromSaisJsList(string $html, array $keywords, string $sourceUrl): array - { + protected function extractFromAjaxTeacherContent( + string $html, + array $keywords, + string $sourceUrl, + ?string $catCode = null, + ): array { $items = []; $seen = []; $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html); $defaultCollege = $this->inferCollegeFromPageTitle($html); - if (! preg_match_all('#]*?)>([^<]+)#su', $html, $matches, PREG_SET_ORDER)) { + $parts = preg_split('##u', $html) ?: []; + if (count($parts) > 1) { + array_shift($parts); + foreach ($parts as $block) { + $department = $defaultCollege; + if (preg_match('#.*?([^<]+)#su', $block, $deptMatch)) { + $sectionTitle = CrawlAuthorParser::cleanText($deptMatch[1]); + if ($sectionTitle !== null && $sectionTitle !== '' && ! $this->looksLikePersonName($sectionTitle)) { + $department = $sectionTitle; + } + } + + foreach ($this->extractTeacherLinksFromHtmlBlock( + $block, + $keywords, + $sourceUrl, + $pageUniversity, + $department, + $catCode, + ) as $item) { + if (isset($seen[$item->externalId])) { + continue; + } + $seen[$item->externalId] = true; + $items[] = $item; + } + } + + if ($items !== []) { + return $items; + } + } + + return $this->extractTeacherLinksFromHtmlBlock( + $html, + $keywords, + $sourceUrl, + $pageUniversity, + $defaultCollege, + $catCode, + ); + } + + /** + * @param list $keywords + * @return list + */ + protected function extractTeacherLinksFromHtmlBlock( + string $html, + array $keywords, + string $sourceUrl, + ?string $pageUniversity, + ?string $affiliation, + ?string $catCode, + ): array { + $items = []; + $seen = []; + + if (! preg_match_all('#]*?)>(.*?)#su', $html, $matches, PREG_SET_ORDER)) { return []; } foreach ($matches as $match) { $attrs = (string) $match[1]; - if (! preg_match('/\bclass="[^"]*\bname\b[^"]*"/u', $attrs)) { + $rawName = preg_replace('/\s+/u', '', strip_tags($match[2])) ?? ''; + $name = CrawlAuthorParser::cleanText($rawName) ?? ''; + if ($name === '' || ! $this->looksLikePersonName($name)) { continue; } if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) { continue; } - $name = CrawlAuthorParser::cleanText($match[2]) ?? ''; - if ($name === '' || ! $this->looksLikePersonName($name)) { - continue; - } - $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'); - if (! str_contains(strtolower($href), '/faculty/')) { + if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) { continue; } @@ -639,7 +708,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface continue; } - $plain = trim($name.' '.($defaultCollege ?? '')); + $plain = trim($name.' '.($affiliation ?? '')); if (! $this->matchesKeywords($plain, $keywords)) { continue; } @@ -650,12 +719,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface name: $name, profileUrl: $profileUrl, email: null, - affiliation: $defaultCollege, - universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege), - summary: $defaultCollege ? '单位:'.$defaultCollege : null, + affiliation: $affiliation, + universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation), + summary: $affiliation ? '单位:'.$affiliation : null, keywords: $keywords, academicTitle: null, - platform: 'faculty_html_sais', + platform: 'faculty_html_ajax', bio: null, ); } @@ -663,6 +732,26 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface return $items; } + protected function looksLikeTeacherProfileUrl(string $href, ?string $catCode): bool + { + $path = strtolower((string) parse_url($href, PHP_URL_PATH)); + if ($path === '') { + return false; + } + + if (preg_match('#/(faculty|jiaoshiml|people/detail_new)/[^/]+\.html$#', $path)) { + return true; + } + + if ($catCode !== null && $catCode !== '') { + $code = preg_quote(strtolower($catCode), '#'); + + return (bool) preg_match('#/'.$code.'/[^/]+\.html$#', $path); + } + + return false; + } + protected function requestOrigin(string $sourceUrl): ?string { $parts = parse_url($sourceUrl); diff --git a/tests/Unit/FacultyListHtmlAdapterTest.php b/tests/Unit/FacultyListHtmlAdapterTest.php index 7315bb6..3c61b30 100644 --- a/tests/Unit/FacultyListHtmlAdapterTest.php +++ b/tests/Unit/FacultyListHtmlAdapterTest.php @@ -221,7 +221,7 @@ HTML; HTML; $adapter = new FacultyListHtmlAdapter; - $method = new \ReflectionMethod($adapter, 'extractFromSaisJsList'); + $method = new \ReflectionMethod($adapter, 'extractFromAjaxTeacherContent'); $method->setAccessible(true); $items = $method->invoke( @@ -229,15 +229,47 @@ HTML; $html, [], 'https://sais.sjtu.edu.cn/faculty.html', + 'faculty', ); $this->assertCount(2, $items); $this->assertSame('白洋', $items[0]->title); $this->assertSame('https://sais.sjtu.edu.cn/faculty/baiyang.html', $items[0]->canonicalUrl); - $this->assertSame('faculty_html_sais', $items[0]->extra['platform']); + $this->assertSame('faculty_html_ajax', $items[0]->extra['platform']); $this->assertSame('上海交通大学', $items[0]->schoolName); } + public function test_extracts_cs_rc_item_teacher_list(): void + { + $html = <<<'HTML' +教师名录-上海交通大学计算机学院(网络空间安全学院、密码学院) +
+
并行与分布式系统研究所
+
+

所长:臧斌宇

+

陈海波

+
+
+HTML; + + $adapter = new FacultyListHtmlAdapter; + $method = new \ReflectionMethod($adapter, 'extractFromAjaxTeacherContent'); + $method->setAccessible(true); + + $items = $method->invoke( + $adapter, + $html, + [], + 'https://www.cs.sjtu.edu.cn/jiaoshiml.html', + 'jiaoshiml', + ); + + $this->assertCount(2, $items); + $this->assertSame('臧斌宇', $items[0]->title); + $this->assertSame('并行与分布式系统研究所', $items[0]->extra['college_name']); + $this->assertSame('https://www.cs.sjtu.edu.cn/jiaoshiml/chenhaibo.html', $items[1]->canonicalUrl); + } + public function test_resolve_profile_enrich_max_caps_large_batches(): void { $adapter = new FacultyListHtmlAdapter;