From d3418d2d3cd6db2f95fb4dc855b643370ea41e5f Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Mon, 22 Jun 2026 16:53:37 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Crawl/Adapters/FacultyListHtmlAdapter.php | 188 ++++++++++++++++++ tests/Unit/FacultyListHtmlAdapterTest.php | 28 +++ 2 files changed, 216 insertions(+) diff --git a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php index a5c4f20..d4643d7 100644 --- a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php +++ b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php @@ -24,6 +24,13 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface $baseUrl = $this->normalizeRequestUrl($requestUrl); $firstHtml = $this->fetchHtml($baseUrl); + + if ($this->isSaisAjaxFacultyPage($firstHtml, $requestUrl)) { + $items = $this->fetchSaisFacultyItems($requestUrl, $firstHtml, $keywords, $maxResults); + + return $this->enrichEmailsFromProfilePages($items, $params); + } + $totalPages = $this->detectTotalPages($firstHtml); $pagesToFetch = min($maxPages, $totalPages); @@ -342,6 +349,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface '/电子信箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u', '/E-?mail[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu', '/邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u', + '/电子邮件[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u', ]; foreach ($labeledPatterns as $pattern) { @@ -490,6 +498,186 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl); } + protected function isSaisAjaxFacultyPage(string $html, string $sourceUrl): bool + { + if (str_contains(strtolower($sourceUrl), 'ajax_teacher_list')) { + return true; + } + + if (str_contains($html, 'ajax_teacher_list.html')) { + return true; + } + + $host = strtolower((string) parse_url($sourceUrl, PHP_URL_HOST)); + + return str_contains($host, 'sais.sjtu.edu.cn') + && str_contains(strtolower($sourceUrl), 'faculty'); + } + + /** + * @param list $keywords + * @return list + */ + protected function fetchSaisFacultyItems( + string $requestUrl, + string $pageHtml, + array $keywords, + int $maxResults, + ): array { + $config = $this->parseSaisAjaxConfig($pageHtml, $requestUrl); + $search = implode(' ', $keywords); + $type = $search !== '' ? '2' : '1'; + $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20)); + + $response = Http::timeout($timeout) + ->connectTimeout(min(8, $timeout)) + ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) + ->withHeaders([ + 'User-Agent' => 'SlakeSchool-Crawler/1.0', + 'Accept' => 'application/json, text/html', + ]) + ->asForm() + ->post($config['api_url'], [ + 'cat_id' => $config['cat_id'], + 'cat_code' => $config['cat_code'], + 'type' => $type, + 'zm' => $search === '' ? 'All' : '', + 'search' => $search, + ]); + + if (! $response->successful()) { + throw new \RuntimeException('SAIS 教师列表接口请求失败(HTTP '.$response->status().')'); + } + + $payload = $response->json(); + if (! is_array($payload)) { + throw new \RuntimeException('SAIS 教师列表接口返回格式异常'); + } + + $content = (string) ($payload['content'] ?? ''); + if ($content === '') { + return []; + } + + $items = $this->extractFromSaisJsList($pageHtml.$content, $keywords, $requestUrl); + if (count($items) > $maxResults) { + $items = array_slice($items, 0, $maxResults); + } + + return $items; + } + + /** + * @return array{cat_id:string,cat_code:string,api_url:string} + */ + protected function parseSaisAjaxConfig(string $html, string $sourceUrl): array + { + $catId = '18'; + $catCode = 'faculty'; + $apiUrl = 'https://sais.sjtu.edu.cn/active/ajax_teacher_list.html'; + + if (preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $match)) { + $catId = $match[1]; + } + if (preg_match("/cat_code\s*:\s*'([^']+)'/i", $html, $match)) { + $catCode = $match[1]; + } + if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $match)) { + $apiUrl = $this->resolveUrl(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'), $sourceUrl) + ?? $apiUrl; + } + + $origin = $this->requestOrigin($sourceUrl); + if ($origin !== null && str_starts_with($apiUrl, '/')) { + $apiUrl = $origin.$apiUrl; + } + + return [ + 'cat_id' => $catId, + 'cat_code' => $catCode, + 'api_url' => $apiUrl, + ]; + } + + /** + * @param list $keywords + * @return list + */ + protected function extractFromSaisJsList(string $html, array $keywords, string $sourceUrl): array + { + $items = []; + $seen = []; + $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html); + $defaultCollege = $this->inferCollegeFromPageTitle($html); + + if (! preg_match_all('#]*?)>([^<]+)#su', $html, $matches, PREG_SET_ORDER)) { + return []; + } + + foreach ($matches as $match) { + $attrs = (string) $match[1]; + if (! preg_match('/\bclass="[^"]*\bname\b[^"]*"/u', $attrs)) { + continue; + } + if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) { + continue; + } + + $name = CrawlAuthorParser::cleanText($match[2]) ?? ''; + if ($name === '' || ! $this->looksLikePersonName($name)) { + continue; + } + + $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'); + if (! str_contains(strtolower($href), '/faculty/')) { + continue; + } + + $profileUrl = $this->resolveUrl($href, $sourceUrl); + $dedupeKey = $profileUrl ?: ('name:'.md5($name)); + if (isset($seen[$dedupeKey])) { + continue; + } + + $plain = trim($name.' '.($defaultCollege ?? '')); + if (! $this->matchesKeywords($plain, $keywords)) { + continue; + } + + $seen[$dedupeKey] = true; + $items[] = $this->makeFacultyItem( + externalKey: 'faculty:'.md5($dedupeKey), + name: $name, + profileUrl: $profileUrl, + email: null, + affiliation: $defaultCollege, + universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege), + summary: $defaultCollege ? '单位:'.$defaultCollege : null, + keywords: $keywords, + academicTitle: null, + platform: 'faculty_html_sais', + bio: null, + ); + } + + return $items; + } + + protected function requestOrigin(string $sourceUrl): ?string + { + $parts = parse_url($sourceUrl); + if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) { + return null; + } + + $origin = $parts['scheme'].'://'.$parts['host']; + if (! empty($parts['port'])) { + $origin .= ':'.$parts['port']; + } + + return $origin; + } + /** * @param list $keywords * @return list diff --git a/tests/Unit/FacultyListHtmlAdapterTest.php b/tests/Unit/FacultyListHtmlAdapterTest.php index 09911c9..7315bb6 100644 --- a/tests/Unit/FacultyListHtmlAdapterTest.php +++ b/tests/Unit/FacultyListHtmlAdapterTest.php @@ -210,6 +210,34 @@ HTML; $this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']); } + public function test_extracts_sais_js_list_from_ajax_content(): void + { + $html = <<<'HTML' +教师名录-上海交通大学自动化与感知学院 +
+
  • 白洋
  • +
  • 陈新
  • +
    +HTML; + + $adapter = new FacultyListHtmlAdapter; + $method = new \ReflectionMethod($adapter, 'extractFromSaisJsList'); + $method->setAccessible(true); + + $items = $method->invoke( + $adapter, + $html, + [], + 'https://sais.sjtu.edu.cn/faculty.html', + ); + + $this->assertCount(2, $items); + $this->assertSame('白洋', $items[0]->title); + $this->assertSame('https://sais.sjtu.edu.cn/faculty/baiyang.html', $items[0]->canonicalUrl); + $this->assertSame('faculty_html_sais', $items[0]->extra['platform']); + $this->assertSame('上海交通大学', $items[0]->schoolName); + } + public function test_resolve_profile_enrich_max_caps_large_batches(): void { $adapter = new FacultyListHtmlAdapter;