修改

2 days ago · bd8527fc55
parent a3a850b049
commit bd8527fc55
2 changed files with 807 additions and 8 deletions
--- a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php
+++ b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php
@ -31,6 +31,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
            return $this->enrichEmailsFromProfilePages($items, $params);
        }

+        if ($this->isNjuTeacherHomePage($firstHtml)) {
+            $items = $this->fetchNjuTeacherHomeItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages);
+
+            return $this->enrichEmailsFromProfilePages($items, $params);
+        }
+
        $totalPages = $this->detectTotalPages($firstHtml);
        $pagesToFetch = min($maxPages, $totalPages);

@ -485,6 +491,21 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
     */
    protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
    {
+        $items = $this->extractFromSudyNewsFacultyList($html, $keywords, $sourceUrl);
+        if ($items !== []) {
+            return $items;
+        }
+
+        $items = $this->extractFromRaTeacherList($html, $keywords, $sourceUrl);
+        if ($items !== []) {
+            return $items;
+        }
+
+        $items = $this->extractFromVsbFacultyTable($html, $keywords, $sourceUrl);
+        if ($items !== []) {
+            return $items;
+        }
+
        $items = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);
        if ($items !== []) {
            return $items;
@ -498,6 +519,363 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
        return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
    }

+    /**
+     * Extracts faculty entries from a Sudy CMS list page: `ul.news_list` markup whose
+     * teacher links sit in `news_title` / `news_title1` wrappers (NJU sites such as
+     * frontier and ic).
+     *
+     * When the page is divided into `li.wp_sublist` sections, the `subcolumn-name`
+     * label of each section is used as the department of the links inside it;
+     * otherwise the whole page is scanned with the page-level college.
+     *
+     * @param  list<string>  $keywords
+     * @return list<CrawlItemDto>
+     */
+    protected function extractFromSudyNewsFacultyList(string $html, array $keywords, string $sourceUrl): array
+    {
+        if (! preg_match('/class="news_list/u', $html)) {
+            return [];
+        }
+
+        $university = $this->inferUniversityFromSource($sourceUrl, $html);
+        $college = $this->inferCollegeFromPageTitle($html);
+        // Fall back to the column heading when the <title>/meta tags gave no college.
+        if ($college === null && preg_match('#<li class="col_title"><h2>([^<]+)</h2>#u', $html, $heading)) {
+            $college = CrawlAuthorParser::cleanText($heading[1]);
+        }
+
+        // Shared across sections so the same profile is emitted only once per page.
+        $seen = [];
+        $sections = preg_split('#<li class="wp_sublist#u', $html) ?: [];
+
+        // No sub-list sections: treat the whole page as one flat list.
+        if (count($sections) <= 1) {
+            return $this->extractSudyNewsLinksFromChunk($html, $college, $keywords, $sourceUrl, $university, $seen);
+        }
+
+        $collected = [];
+        // The first piece is the markup preceding the first section; skip it.
+        foreach (array_slice($sections, 1) as $section) {
+            $department = preg_match('#subcolumn-name">([^<]+)</span>#u', $section, $deptMatch)
+                ? CrawlAuthorParser::cleanText($deptMatch[1])
+                : $college;
+
+            $found = $this->extractSudyNewsLinksFromChunk(
+                $section,
+                $department,
+                $keywords,
+                $sourceUrl,
+                $university,
+                $seen,
+            );
+            $collected = [...$collected, ...$found];
+        }
+
+        return $collected;
+    }
+
+    /**
+     * Collects teacher links from one HTML fragment of a Sudy news list.
+     *
+     * A link is kept only when its text passes looksLikePersonName(), its href passes
+     * looksLikeTeacherProfileUrl(), it has not been seen before, and the name plus
+     * department matches the keywords.
+     *
+     * @param  array<string, true>  $seen  Dedupe keys shared between fragments; updated in place.
+     * @param  list<string>  $keywords
+     * @return list<CrawlItemDto>
+     */
+    protected function extractSudyNewsLinksFromChunk(
+        string $chunk,
+        ?string $department,
+        array $keywords,
+        string $sourceUrl,
+        ?string $pageUniversity,
+        array &$seen,
+    ): array {
+        $anchorPattern = '#<(?:div|span)\s+class="news_title1?">\s*<a\b([^>]*?)>([^<]+)</a>#su';
+        if (! preg_match_all($anchorPattern, $chunk, $anchors, PREG_SET_ORDER)) {
+            return [];
+        }
+
+        $collected = [];
+        foreach ($anchors as [, $attrs, $rawName]) {
+            $name = CrawlAuthorParser::cleanText($rawName) ?? '';
+            if ($name === '' || ! $this->looksLikePersonName($name)) {
+                continue;
+            }
+
+            if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', (string) $attrs, $hrefMatch)) {
+                continue;
+            }
+
+            $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
+            if (! $this->looksLikeTeacherProfileUrl($href, null)) {
+                continue;
+            }
+
+            $profileUrl = $this->resolveUrl($href, $sourceUrl);
+            // Prefer the profile URL as identity; fall back to a hash of the name.
+            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
+            $searchable = trim($name.' '.($department ?? ''));
+            if (isset($seen[$dedupeKey]) || ! $this->matchesKeywords($searchable, $keywords)) {
+                continue;
+            }
+
+            $seen[$dedupeKey] = true;
+            $collected[] = $this->makeFacultyItem(
+                externalKey: 'faculty:'.md5($dedupeKey),
+                name: $name,
+                profileUrl: $profileUrl,
+                email: null,
+                affiliation: $department,
+                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
+                summary: $department ? '单位：'.$department : null,
+                keywords: $keywords,
+                academicTitle: null,
+                platform: 'faculty_html_sudy_news',
+                bio: null,
+            );
+        }
+
+        return $collected;
+    }
+
+    /**
+     * Extracts faculty from `ul.teacher` card lists (e.g. the NJU robotics school site),
+     * where each card is an anchor wrapping a `div.xm` that holds the teacher's name,
+     * optionally followed by "职称：<span>…</span>" and "研究方向：<span>…</span>" rows.
+     *
+     * @param  list<string>  $keywords
+     * @return list<CrawlItemDto>
+     */
+    protected function extractFromRaTeacherList(string $html, array $keywords, string $sourceUrl): array
+    {
+        if (! preg_match('/<ul class="teacher">/u', $html)) {
+            return [];
+        }
+
+        // The `(?:(?!</a>).)*?` guard keeps the opening <a> and the name inside the SAME
+        // anchor. A plain `.*?` would let the match start at any earlier link on the page
+        // (navigation, breadcrumb) and attach that link's href to the first teacher.
+        if (! preg_match_all(
+            '#<a\b([^>]*?)>(?:(?!</a>).)*?<div class="xm">([^<]+)</div>(.*?)</a>#su',
+            $html,
+            $matches,
+            PREG_SET_ORDER,
+        )) {
+            return [];
+        }
+
+        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
+        $defaultCollege = $this->inferCollegeFromPageTitle($html);
+        $items = [];
+        $seen = [];
+
+        foreach ($matches as $match) {
+            $attrs = (string) $match[1];
+            $name = CrawlAuthorParser::cleanText($match[2]) ?? '';
+            // Card markup after the name; carries the optional title / research rows.
+            $tail = (string) $match[3];
+            if ($name === '' || ! $this->looksLikePersonName($name)) {
+                continue;
+            }
+            // Accept single- and double-quoted hrefs, like the Sudy news extractor.
+            if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) {
+                continue;
+            }
+
+            $profileUrl = $this->resolveUrl(html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'), $sourceUrl);
+            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
+            if (isset($seen[$dedupeKey])) {
+                continue;
+            }
+
+            $academicTitle = null;
+            if (preg_match('#职称：\s*<span>([^<]+)</span>#u', $tail, $titleMatch)) {
+                $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]);
+            }
+
+            $researchField = null;
+            if (preg_match('#研究方向：\s*<span>([^<]+)</span>#u', $tail, $fieldMatch)) {
+                $researchField = CrawlAuthorParser::cleanText($fieldMatch[1]);
+            }
+
+            // Keywords are matched against name, research field, title and college together.
+            $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
+            if (! $this->matchesKeywords($plain, $keywords)) {
+                continue;
+            }
+
+            $summaryParts = array_filter([
+                $defaultCollege ? '单位：'.$defaultCollege : null,
+                $academicTitle ? '职称：'.$academicTitle : null,
+                $researchField ? '研究方向：'.$researchField : null,
+            ]);
+
+            $seen[$dedupeKey] = true;
+            $items[] = $this->makeFacultyItem(
+                externalKey: 'faculty:'.md5($dedupeKey),
+                name: $name,
+                profileUrl: $profileUrl,
+                email: null,
+                affiliation: $defaultCollege,
+                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
+                summary: $summaryParts !== [] ? implode('；', $summaryParts) : null,
+                keywords: $keywords,
+                academicTitle: $academicTitle,
+                platform: 'faculty_html_ra',
+                bio: $researchField,
+            );
+        }
+
+        return $items;
+    }
+
+    /**
+     * Extracts faculty from WebPlus (VSB) table-style staff pages (e.g. the `zjzjs`
+     * pages of NJU / Tsinghua sites).
+     *
+     * The scanned region is the first of: `div.zjzjs`, `div#vsb_content*`, or a non-empty
+     * `ul.teach-list*`. Inside it, `<strong>` headings are treated as section titles and
+     * give the academic title of the people listed after them. People come from two
+     * sources: profile links, and table cells without a link (name only).
+     *
+     * @param  list<string>  $keywords
+     * @return list<CrawlItemDto>
+     */
+    protected function extractFromVsbFacultyTable(string $html, array $keywords, string $sourceUrl): array
+    {
+        $scope = null;
+        if (preg_match('#<div class="zjzjs">(.*?)</div>#su', $html, $match)) {
+            $scope = (string) $match[1];
+        } elseif (preg_match('#<div id="vsb_content[^"]*">(.*?)</div>\s*</div>\s*</div>#su', $html, $match)) {
+            $scope = (string) $match[1];
+        } elseif (preg_match('#<ul class="teach-list[^"]*">(.*?)</ul>#su', $html, $match) && trim(strip_tags($match[1])) !== '') {
+            $scope = (string) $match[1];
+        }
+
+        if ($scope === null || trim(strip_tags($scope)) === '') {
+            return [];
+        }
+
+        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
+        $defaultCollege = $this->inferCollegeFromPageTitle($html);
+        $items = [];
+        $seen = [];
+
+        // Section headings with their byte offsets inside $scope, in document order.
+        $sectionTitles = [];
+        if (preg_match_all('#<strong[^>]*>(.*?)</strong>#su', $scope, $sectionMatches, PREG_OFFSET_CAPTURE)) {
+            foreach ($sectionMatches[1] as $sectionMatch) {
+                $title = CrawlAuthorParser::cleanText(strip_tags($sectionMatch[0]));
+                if ($title !== null && $title !== '') {
+                    $sectionTitles[] = [
+                        'offset' => $sectionMatch[1],
+                        'title' => $title,
+                    ];
+                }
+            }
+        }
+
+        // Returns the last heading that starts at or before $offset, or null if none does.
+        $resolveSectionTitle = function (int $offset) use ($sectionTitles): ?string {
+            $title = null;
+            foreach ($sectionTitles as $section) {
+                if ($section['offset'] <= $offset) {
+                    $title = $section['title'];
+                } else {
+                    break;
+                }
+            }
+
+            return $title;
+        };
+
+        // Validates, de-duplicates and keyword-filters one person, then appends it to $items.
+        $addItem = function (
+            string $name,
+            ?string $profileUrl,
+            ?string $sectionTitle,
+        ) use (
+            $keywords,
+            $defaultCollege,
+            $pageUniversity,
+            &$items,
+            &$seen,
+        ): void {
+            if ($name === '' || ! $this->looksLikePersonName($name)) {
+                return;
+            }
+
+            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
+            if (isset($seen[$dedupeKey])) {
+                return;
+            }
+
+            $academicTitle = $this->inferAcademicTitleFromSection($sectionTitle);
+            $plain = trim($name.' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
+            if (! $this->matchesKeywords($plain, $keywords)) {
+                return;
+            }
+
+            $seen[$dedupeKey] = true;
+            $items[] = $this->makeFacultyItem(
+                externalKey: 'faculty:'.md5($dedupeKey),
+                name: $name,
+                profileUrl: $profileUrl,
+                email: null,
+                affiliation: $defaultCollege,
+                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
+                summary: $defaultCollege ? '单位：'.$defaultCollege : null,
+                keywords: $keywords,
+                academicTitle: $academicTitle,
+                platform: 'faculty_html_vsb',
+                bio: null,
+            );
+        };
+
+        // Pass 1: people that have a profile link.
+        if (preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $scope, $linkMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
+            foreach ($linkMatches as $linkMatch) {
+                $attrs = (string) $linkMatch[1][0];
+                $offset = (int) $linkMatch[0][1];
+                $name = CrawlAuthorParser::cleanText(strip_tags($linkMatch[2][0])) ?? '';
+                // Accept single- and double-quoted hrefs, like the Sudy news extractor.
+                if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) {
+                    continue;
+                }
+                $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
+                if (! $this->looksLikeTeacherProfileUrl($href, null)) {
+                    continue;
+                }
+                $addItem($name, $this->resolveUrl($href, $sourceUrl), $resolveSectionTitle($offset));
+            }
+        }
+
+        // Pass 2: table cells holding a bare name with no link.
+        if (preg_match_all('#<td[^>]*>(.*?)</td>#su', $scope, $cellMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
+            foreach ($cellMatches as $cellMatch) {
+                $cellHtml = (string) $cellMatch[1][0];
+                $offset = (int) $cellMatch[0][1];
+                // Cells with an anchor belong to pass 1. Match any whitespace after "<a"
+                // (space, tab, newline) so a link whose attributes start on the next line
+                // is not added a second time by name.
+                if (preg_match('#<a\s#', $cellHtml)) {
+                    continue;
+                }
+                $name = CrawlAuthorParser::cleanText(strip_tags($cellHtml)) ?? '';
+                $addItem($name, null, $resolveSectionTitle($offset));
+            }
+        }
+
+        return $items;
+    }
+
+    /**
+     * Maps a section heading of a staff page to a normalized academic title.
+     *
+     * Known titles are recognized by substring; any other non-empty heading is returned
+     * cleaned but otherwise unchanged. Returns null for a null or empty heading.
+     */
+    protected function inferAcademicTitleFromSection(?string $sectionTitle): ?string
+    {
+        if ($sectionTitle === null || $sectionTitle === '') {
+            return null;
+        }
+
+        // Order matters: "副教授" and "助理教授" both contain "教授", so the more specific
+        // titles must be tested before the generic one. Testing "教授" first made the
+        // "助理教授" mapping unreachable.
+        $titleByNeedle = [
+            '副教授' => '副教授',
+            '助理教授' => '准聘助理教授',
+            '教授' => '教授',
+            '博士后' => '博士后',
+            '专职科研' => '专职科研',
+        ];
+
+        foreach ($titleByNeedle as $needle => $academicTitle) {
+            if (str_contains($sectionTitle, $needle)) {
+                return $academicTitle;
+            }
+        }
+
+        return CrawlAuthorParser::cleanText($sectionTitle);
+    }
+
    protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool
    {
        if (str_contains(strtolower($sourceUrl), 'ajax_teacher_list')) {
@ -507,6 +885,250 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
        return str_contains($html, 'ajax_teacher_list.html');
    }

+    /**
+     * Tells whether the page is a "teacher home" list: it must reference `faculty.js`
+     * and its <body> class attribute must contain the word `faculty`.
+     */
+    protected function isNjuTeacherHomePage(string $html): bool
+    {
+        if (! str_contains($html, 'faculty.js')) {
+            return false;
+        }
+
+        return preg_match('/<body[^>]*class="[^"]*\bfaculty\b/u', $html) === 1;
+    }
+
+    /**
+     * Loads a "teacher home" faculty list through the site's `generalQuery` JSON service
+     * instead of parsing the list HTML.
+     *
+     * The site id and the currently selected category filters are read from $pageHtml.
+     * Result pages of 50 rows are then requested until $maxPages (clamped to 1..50), the
+     * page count reported by the service, or $maxResults is reached.
+     *
+     * @param  list<string>  $keywords
+     * @return list<CrawlItemDto>
+     *
+     * @throws \RuntimeException when the site id cannot be parsed or a service request fails
+     */
+    protected function fetchNjuTeacherHomeItems(
+        string $requestUrl,
+        string $pageHtml,
+        array $keywords,
+        int $maxResults,
+        int $maxPages = 1,
+    ): array {
+        $siteId = $this->parseNjuSiteId($pageHtml);
+        $filters = $this->parseNjuTeacherHomeFilters($pageHtml);
+        $conditions = $this->buildNjuTeacherHomeConditions($filters['career'], $filters['sub_career']);
+        $origin = $this->requestOrigin($requestUrl) ?? 'https://is.nju.edu.cn';
+        $apiUrl = $origin.'/_wp3services/generalQuery?queryObj=teacherHome';
+        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
+        $maxPages = max(1, min(50, $maxPages));
+        $rows = 50;
+
+        $pageUniversity = $this->inferUniversityFromSource($requestUrl, $pageHtml);
+        $defaultCollege = $this->inferCollegeFromPageTitle($pageHtml);
+
+        $items = [];
+        $seen = [];
+        $pageIndex = 1;
+        $pageCount = null;
+
+        while ($pageIndex <= $maxPages && count($items) < $maxResults) {
+            $body = $this->requestNjuTeacherHomePage($apiUrl, $siteId, $pageIndex, $rows, $conditions, $timeout);
+            // The total page count is taken from the first response only.
+            if ($pageCount === null) {
+                $pageCount = max(1, (int) ($body['pageCount'] ?? 1));
+            }
+
+            $data = $body['data'] ?? [];
+            if (! is_array($data) || $data === []) {
+                break;
+            }
+
+            foreach ($data as $art) {
+                if (! is_array($art)) {
+                    continue;
+                }
+
+                $name = CrawlAuthorParser::cleanText((string) ($art['title'] ?? '')) ?? '';
+                if ($name === '' || ! $this->looksLikePersonName($name)) {
+                    continue;
+                }
+
+                // A row without a cnUrl has no profile page. Keep the URL null so the row is
+                // de-duplicated by name instead of by whatever resolveUrl() yields for an
+                // empty href (possibly the list URL, which would collapse all such rows).
+                $rawUrl = trim((string) ($art['cnUrl'] ?? ''));
+                $profileUrl = $rawUrl !== '' ? $this->resolveUrl($rawUrl, $requestUrl) : null;
+                $dedupeKey = $profileUrl ?: ('name:'.md5($name));
+                if (isset($seen[$dedupeKey])) {
+                    continue;
+                }
+
+                // exField2 is used as the academic title and exField1 as the research field.
+                $academicTitle = CrawlAuthorParser::cleanText((string) ($art['exField2'] ?? ''));
+                $researchField = CrawlAuthorParser::cleanText((string) ($art['exField1'] ?? ''));
+                $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
+                if (! $this->matchesKeywords($plain, $keywords)) {
+                    continue;
+                }
+
+                $summaryParts = array_filter([
+                    $defaultCollege ? '单位：'.$defaultCollege : null,
+                    $academicTitle ? '职称：'.$academicTitle : null,
+                    $researchField ? '研究领域：'.$researchField : null,
+                ]);
+
+                $seen[$dedupeKey] = true;
+                $items[] = $this->makeFacultyItem(
+                    externalKey: 'faculty:'.md5($dedupeKey),
+                    name: $name,
+                    profileUrl: $profileUrl,
+                    email: null,
+                    affiliation: $defaultCollege,
+                    universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
+                    summary: $summaryParts !== [] ? implode('；', $summaryParts) : null,
+                    keywords: $keywords,
+                    academicTitle: $academicTitle,
+                    platform: 'faculty_html_nju_wp',
+                    bio: $researchField,
+                );
+
+                if (count($items) >= $maxResults) {
+                    break 2;
+                }
+            }
+
+            if ($pageIndex >= $pageCount) {
+                break;
+            }
+
+            $pageIndex++;
+        }
+
+        return $items;
+    }
+
+    /**
+     * Reads the numeric site id from the `sudy-wp-siteId="…"` attribute in the page HTML.
+     *
+     * @throws \RuntimeException when the attribute is not present
+     */
+    protected function parseNjuSiteId(string $html): int
+    {
+        if (! preg_match('/sudy-wp-siteId="(\d+)"/', $html, $siteIdMatch)) {
+            throw new \RuntimeException('无法解析教师列表站点 ID（siteId）');
+        }
+
+        return (int) $siteIdMatch[1];
+    }
+
+    /**
+     * Reads the currently selected category ("career") and sub-category ("sub_career")
+     * from the filter navigation of a teacher-home page.
+     *
+     * For each of the two, the patterns are tried in order and the first one that matches
+     * supplies the value; the value is null when none matches.
+     *
+     * @return array{career:?string,sub_career:?string}
+     */
+    protected function parseNjuTeacherHomeFilters(string $html): array
+    {
+        $firstCapture = static function (array $patterns) use ($html) {
+            foreach ($patterns as $pattern) {
+                if (preg_match($pattern, $html, $found)) {
+                    return CrawlAuthorParser::cleanText($found[1]);
+                }
+            }
+
+            return null;
+        };
+
+        return [
+            'career' => $firstCapture([
+                '#class="col_item_link\s+selected"[^>]*title="([^"]+)"#u',
+                '#class="col_item_link\s+selected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
+            ]),
+            'sub_career' => $firstCapture([
+                '#class="sub-item[^"]*\sselected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
+                '#class="sub-link[^"]*\sselected"[^>]*title="([^"]+)"#u',
+            ]),
+        ];
+    }
+
+    /**
+     * Builds the `conditions` payload of the teacher-home query from the selected filters.
+     *
+     * The list always starts with `published = 1`. A recognized sub-category adds a
+     * one-element `orConditions` group on `exField2`. A plain category adds a direct
+     * `exField2` equality. The two grouped categories add an `orConditions` group over
+     * their member titles, but only when no sub-category is selected.
+     *
+     * @return list<array<string, mixed>>
+     */
+    protected function buildNjuTeacherHomeConditions(?string $career, ?string $subCareer): array
+    {
+        $titleEquals = static fn (string $title): array => ['field' => 'exField2', 'value' => $title, 'judge' => '='];
+        $anyTitleOf = static fn (array $titles): array => ['orConditions' => array_map($titleEquals, $titles)];
+
+        $conditions = [
+            ['field' => 'published', 'value' => '1', 'judge' => '='],
+        ];
+
+        $knownSubCareers = ['长聘副教授', '准聘副教授', '准聘助理教授', '专职科研', '博士后'];
+        if (in_array($subCareer, $knownSubCareers, true)) {
+            $conditions[] = $anyTitleOf([$subCareer]);
+        }
+
+        if ($career === null || $career === '') {
+            return $conditions;
+        }
+
+        $plainCareers = ['教授', '副教授', '兼职教授', '行政管理人员'];
+        $groupedCareers = [
+            '准长聘' => ['长聘副教授', '准聘副教授', '准聘助理教授'],
+            '专职科研及博士后' => ['专职科研', '博士后'],
+        ];
+        $noSubCareer = $subCareer === null || $subCareer === '';
+
+        if (in_array($career, $plainCareers, true)) {
+            $conditions[] = $titleEquals($career);
+        } elseif ($noSubCareer && isset($groupedCareers[$career])) {
+            $conditions[] = $anyTitleOf($groupedCareers[$career]);
+        }
+
+        return $conditions;
+    }
+
+    /**
+     * Requests one result page from the teacher-home `generalQuery` service and returns
+     * the decoded JSON body.
+     *
+     * The request is a form-encoded POST; `orders`, `returnInfos` and `conditions` are
+     * sent as JSON strings.
+     *
+     * @param  list<array<string, mixed>>  $conditions
+     * @return array<string, mixed>
+     *
+     * @throws \RuntimeException when the response status is not successful or the body is not a JSON array/object
+     */
+    protected function requestNjuTeacherHomePage(
+        string $apiUrl,
+        int $siteId,
+        int $pageIndex,
+        int $rows,
+        array $conditions,
+        int $timeout,
+    ): array {
+        $returnInfos = [
+            ['field' => 'headerPic', 'name' => 'headerPic'],
+            ['field' => 'exField1', 'name' => 'exField1'],
+            ['field' => 'exField2', 'name' => 'exField2'],
+            ['field' => 'cnUrl', 'name' => 'cnUrl'],
+            ['field' => 'title', 'name' => 'title'],
+            ['field' => 'phone', 'name' => 'phone'],
+        ];
+
+        // retry()'s first argument is the TOTAL number of attempts, so 2 means one retry
+        // (after 300 ms) and only for connection failures. With 1 the request was never
+        // retried. `throw: false` leaves HTTP error statuses to the explicit check below.
+        $response = Http::timeout($timeout)
+            ->connectTimeout(min(8, $timeout))
+            ->retry(2, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
+            ->withHeaders([
+                'User-Agent' => 'SlakeSchool-Crawler/1.0',
+                'Accept' => 'application/json',
+            ])
+            ->asForm()
+            ->post($apiUrl, [
+                'siteId' => $siteId,
+                'pageIndex' => $pageIndex,
+                'rows' => $rows,
+                'orders' => json_encode([['field' => 'siteSort', 'type' => 'asc']], JSON_UNESCAPED_UNICODE),
+                'returnInfos' => json_encode($returnInfos, JSON_UNESCAPED_UNICODE),
+                'conditions' => json_encode($conditions, JSON_UNESCAPED_UNICODE),
+                'articleType' => 1,
+                'level' => 1,
+            ]);
+
+        if (! $response->successful()) {
+            throw new \RuntimeException('教师列表接口请求失败（HTTP '.$response->status().'）');
+        }
+
+        $body = $response->json();
+        if (! is_array($body)) {
+            throw new \RuntimeException('教师列表接口返回格式异常');
+        }
+
+        return $body;
+    }
+
    /**
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
@ -895,6 +1517,22 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
            return true;
        }

+        if (preg_match('#/c\d+a\d+/page\.htm$#', $path)) {
+            return true;
+        }
+
+        if (preg_match('#/(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#', $path)) {
+            return true;
+        }
+
+        if (preg_match('#^(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#', $path)) {
+            return true;
+        }
+
+        if (preg_match('#/info/\d+/\d+\.htm$#', $path)) {
+            return true;
+        }
+
        if ($catCode !== null && $catCode !== '') {
            $code = preg_quote(strtolower($catCode), '#');

@ -1116,20 +1754,39 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface

    protected function inferCollegeFromPageTitle(string $html): ?string
    {
-        if (! preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) {
-            return null;
+        if (preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) {
+            $title = CrawlAuthorParser::cleanText($match[1]);
+            if ($title !== null && $title !== '') {
+                if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) {
+                    return CrawlAuthorParser::cleanText($college[1]);
+                }
+
+                return $title;
+            }
        }

-        $title = CrawlAuthorParser::cleanText($match[1]);
-        if ($title === null || $title === '') {
-            return null;
+        if (preg_match('/<meta\s+name="description"\s+content="([^"]+)"/u', $html, $match)) {
+            $desc = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
+            if ($desc !== null && $desc !== '') {
+                if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) {
+                    return CrawlAuthorParser::cleanText($college[1]);
+                }
+                if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) {
+                    return CrawlAuthorParser::cleanText($college[1]);
+                }
+            }
        }

-        if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) {
-            return CrawlAuthorParser::cleanText($college[1]);
+        if (preg_match('/<meta\s+name=[\'"]SiteName[\'"]\s+content=[\'"]([^\'"]+)[\'"]/u', $html, $match)) {
+            $siteName = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
+            if ($siteName !== null && $siteName !== '') {
+                if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $siteName, $college)) {
+                    return CrawlAuthorParser::cleanText($college[1]);
+                }
+            }
        }

-        return $title;
+        return null;
    }

    protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
@ -1446,6 +2103,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
            if (str_contains($host, 'fudan.edu.cn')) {
                return '复旦大学';
            }
+            if (str_contains($host, 'nju.edu.cn')) {
+                return '南京大学';
+            }
+            if (str_contains($host, 'tsinghua.edu.cn')) {
+                return '清华大学';
+            }
        }

        if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match)) {
--- a/tests/Unit/FacultyListHtmlAdapterTest.php
+++ b/tests/Unit/FacultyListHtmlAdapterTest.php
@ -353,4 +353,140 @@ HTML;
        $this->assertSame('副教授', $items[1]->extra['academic_title']);
        $this->assertSame('faculty_html_ajax', $items[0]->extra['platform']);
    }
+
+    /** A page counts as a teacher-home list only with both the `faculty` body class and faculty.js. */
+    public function test_detects_nju_teacher_home_page(): void
+    {
+        $adapter = new FacultyListHtmlAdapter;
+        $detector = new \ReflectionMethod($adapter, 'isNjuTeacherHomePage');
+        $detector->setAccessible(true);
+
+        $facultyPage = '<body class="list faculty"><script src="/js/faculty.js"></script></body>';
+        $plainListPage = '<body class="list"><script src="/js/list.js"></script></body>';
+
+        $this->assertTrue($detector->invoke($adapter, $facultyPage));
+        $this->assertFalse($detector->invoke($adapter, $plainListPage));
+    }
+
+    /** Without any selected filter, only the `published` condition is sent. */
+    public function test_builds_nju_teacher_home_conditions_for_all_faculty(): void
+    {
+        $adapter = new FacultyListHtmlAdapter;
+        $builder = new \ReflectionMethod($adapter, 'buildNjuTeacherHomeConditions');
+        $builder->setAccessible(true);
+
+        $built = $builder->invokeArgs($adapter, [null, null]);
+
+        $this->assertCount(1, $built);
+        $this->assertSame('published', $built[0]['field']);
+    }
+
+    /** Selecting the professor category appends a direct `exField2` equality condition. */
+    public function test_builds_nju_teacher_home_conditions_for_professor_category(): void
+    {
+        $adapter = new FacultyListHtmlAdapter;
+        $builder = new \ReflectionMethod($adapter, 'buildNjuTeacherHomeConditions');
+        $builder->setAccessible(true);
+
+        $built = $builder->invokeArgs($adapter, ['教授', null]);
+
+        $this->assertCount(2, $built);
+        $titleCondition = $built[1];
+        $this->assertSame('exField2', $titleCondition['field']);
+        $this->assertSame('教授', $titleCondition['value']);
+    }
+
+    /** The site id is taken from the `sudy-wp-siteId` attribute and returned as an int. */
+    public function test_parses_nju_site_id_from_html(): void
+    {
+        $adapter = new FacultyListHtmlAdapter;
+        $parser = new \ReflectionMethod($adapter, 'parseNjuSiteId');
+        $parser->setAccessible(true);
+
+        $siteId = $parser->invoke($adapter, '<script src="/_js/jquery.min.js" sudy-wp-siteId="786"></script>');
+
+        $this->assertSame(786, $siteId);
+    }
+
+    /** When <title> carries no college, the meta description is used and the university prefix is dropped. */
+    public function test_infers_college_from_meta_description(): void
+    {
+        $adapter = new FacultyListHtmlAdapter;
+        $inferrer = new \ReflectionMethod($adapter, 'inferCollegeFromPageTitle');
+        $inferrer->setAccessible(true);
+
+        $college = $inferrer->invoke(
+            $adapter,
+            '<title>师资力量</title><meta name="description" content="南京大学智能科学与技术学院" >',
+        );
+
+        $this->assertSame('智能科学与技术学院', $college);
+    }
+
+    /** A sub-list section's name becomes the college of the teachers linked inside it. */
+    public function test_extracts_sudy_news_faculty_list(): void
+    {
+        $adapter = new FacultyListHtmlAdapter;
+        $extractor = new \ReflectionMethod($adapter, 'extractFromSudyNewsFacultyList');
+        $extractor->setAccessible(true);
+
+        $page = <<<'HTML'
+<title>师资力量-南京大学前沿科学学院</title>
+<li class="wp_sublist sublist-1">
+    <h3 class="sublist_title"><span class="subcolumn-name">功能材料与智能制造研究院</span></h3>
+    <ul class="news_list list2">
+        <li class="news n1 clearfix">
+            <div class="news_title"><a href='/85/ef/c59286a689647/page.htm' title='王保明'>王保明</a></div>
+        </li>
+    </ul>
+</li>
+HTML;
+
+        $found = $extractor->invokeArgs($adapter, [$page, [], 'https://frontier.nju.edu.cn/zrjs/list.htm']);
+
+        $this->assertCount(1, $found);
+        $teacher = $found[0];
+        $this->assertSame('王保明', $teacher->title);
+        $this->assertSame('功能材料与智能制造研究院', $teacher->extra['college_name']);
+        $this->assertSame('faculty_html_sudy_news', $teacher->extra['platform']);
+    }
+
+    /** A teacher card yields the name from `div.xm` and the title from the "职称" row. */
+    public function test_extracts_ra_teacher_cards(): void
+    {
+        $adapter = new FacultyListHtmlAdapter;
+        $extractor = new \ReflectionMethod($adapter, 'extractFromRaTeacherList');
+        $extractor->setAccessible(true);
+
+        $page = <<<'HTML'
+<title>专职教师-南京大学机器人与自动化学院</title>
+<ul class="teacher">
+<li>
+<a href="http://ra.nju.edu.cn/szll/zzjs/20250901/i335910.html" title="周克敏">
+<div class="data"><div class="name"><div class="xm">周克敏</div></div>
+<div class="research pro">职称：<span>教授</span></div>
+<div class="research">研究方向：<span>鲁棒控制</span></div></div>
+</a>
+</li>
+</ul>
+HTML;
+
+        $found = $extractor->invokeArgs($adapter, [$page, [], 'https://ra.nju.edu.cn/szll/zzjs/index.html']);
+
+        $this->assertCount(1, $found);
+        $teacher = $found[0];
+        $this->assertSame('周克敏', $teacher->title);
+        $this->assertSame('教授', $teacher->extra['academic_title']);
+        $this->assertSame('faculty_html_ra', $teacher->extra['platform']);
+    }
+
+    /**
+     * Linked and unlinked table cells are both extracted, and each person takes the
+     * academic title of the section heading that precedes them.
+     */
+    public function test_extracts_vsb_faculty_table(): void
+    {
+        $html = <<<'HTML'
+<title>专兼职教师-南京大学智能软件与工程学院</title>
+<div class="zjzjs"><p><strong><span>教授</span></strong></p>
+<table><tr><td><a href="zjzjs/yangkun.htm"><span>杨鲲</span></a></td><td><span>陶先平</span></td></tr></table>
+<p><strong><span>副教授</span></strong></p>
+<table><tr><td><a href="zjzjs/shaodong.htm"><span>邵栋</span></a></td></tr></table>
+</div>
+HTML;
+
+        $adapter = new FacultyListHtmlAdapter;
+        $method = new \ReflectionMethod($adapter, 'extractFromVsbFacultyTable');
+        $method->setAccessible(true);
+
+        $items = $method->invoke($adapter, $html, [], 'https://ise.nju.edu.cn/szll/zjzjs.htm');
+
+        $this->assertCount(3, $items);
+        $names = array_map(fn ($item) => $item->title, $items);
+        $this->assertContains('杨鲲', $names);
+        $this->assertContains('陶先平', $names);
+        $this->assertContains('邵栋', $names);
+
+        $titleOf = fn (string $name) => $items[array_search($name, $names, true)]->extra['academic_title'];
+        $this->assertSame('教授', $titleOf('杨鲲'));
+        // The unlinked cell sits under the same heading as 杨鲲 and must inherit it too.
+        $this->assertSame('教授', $titleOf('陶先平'));
+        $this->assertSame('副教授', $titleOf('邵栋'));
+    }
 }