slake-school-service/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;

/**
 * 通用院系/师资列表页：优先邮箱条目；无邮箱时解析 tsites 等列表卡片（姓名、单位、职称、主页）。
 */
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
{
    /**
     * Crawl a faculty list page and return the people found on it.
     *
     * The first page is fetched once. Pages recognised as AJAX teacher lists or
     * NJU "teacher home" pages are delegated to their dedicated fetchers;
     * everything else is parsed from the first page and then from the remaining
     * paginated pages. Every path ends with profile-page email enrichment.
     *
     * @param  array<string, mixed>  $params  Reads `keyword`, `max_results` (clamped to 1..500)
     *                                        and `max_pages` (clamped to 1..50); also forwarded to enrichment.
     * @return list<CrawlItemDto>
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $baseUrl = $this->normalizeRequestUrl($requestUrl);
        $firstHtml = $this->fetchHtml($baseUrl);

        if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) {
            return $this->enrichEmailsFromProfilePages(
                $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages),
                $params,
            );
        }

        if ($this->isNjuTeacherHomePage($firstHtml)) {
            return $this->enrichEmailsFromProfilePages(
                $this->fetchNjuTeacherHomeItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages),
                $params,
            );
        }

        $pagesToFetch = min($maxPages, $this->detectTotalPages($firstHtml));

        // Page 1 is already in hand; collect its unique items up to the result cap.
        $collected = [];
        $seenIds = [];
        foreach ($this->extractFromHtml($firstHtml, $keywords, $requestUrl) as $item) {
            if (isset($seenIds[$item->externalId])) {
                continue;
            }
            $seenIds[$item->externalId] = true;
            $collected[] = $item;
            if (count($collected) >= $maxResults) {
                break;
            }
        }

        // Pages 2..N are fetched concurrently, only when the cap has not been reached yet.
        $needsMorePages = $pagesToFetch > 1 && count($collected) < $maxResults;
        if ($needsMorePages) {
            $collected = $this->fetchRemainingListPages(
                $baseUrl,
                $firstHtml,
                $pagesToFetch,
                $keywords,
                $requestUrl,
                $collected,
                $seenIds,
                $maxResults,
            );
        }

        return $this->enrichEmailsFromProfilePages($collected, $params);
    }

    /**
     * Fetch list pages 2..$pagesToFetch in pooled batches and append their
     * unique items to $merged until $maxResults is reached.
     *
     * Pages whose fetch yields no body are simply absent from the batch result
     * and therefore skipped.
     *
     * @param  list<CrawlItemDto>  $merged  Items already collected from page 1.
     * @param  array<string, true>  $seen  External ids already collected.
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function fetchRemainingListPages(
        string $baseUrl,
        string $firstHtml,
        int $pagesToFetch,
        array $keywords,
        string $requestUrl,
        array $merged,
        array $seen,
        int $maxResults,
    ): array {
        $batchSize = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5)));

        $urlsByPage = [];
        $pageNumber = 2;
        while ($pageNumber <= $pagesToFetch) {
            $urlsByPage[$pageNumber] = $this->buildPageUrl($baseUrl, $pageNumber, $firstHtml);
            $pageNumber++;
        }

        // Keys (page numbers) are preserved so each batch can be replayed in page order.
        foreach (array_chunk($urlsByPage, $batchSize, true) as $batch) {
            $bodies = $this->fetchHtmlPool($batch);
            ksort($bodies);

            foreach ($bodies as $pageHtml) {
                foreach ($this->extractFromHtml($pageHtml, $keywords, $requestUrl) as $candidate) {
                    $id = $candidate->externalId;
                    if (isset($seen[$id])) {
                        continue;
                    }

                    $seen[$id] = true;
                    $merged[] = $candidate;

                    if (count($merged) >= $maxResults) {
                        return $merged;
                    }
                }
            }
        }

        return $merged;
    }

    /**
     * Fetch several list pages concurrently through the HTTP client pool.
     *
     * Only successful, non-empty bodies are returned; the page number keys of
     * the input are kept for the pages that made it.
     *
     * NOTE(review): Laravel documents the first argument of retry() as the
     * total number of attempts, so retry(1, ...) presumably performs no
     * actual retry — confirm the intent.
     *
     * @param  array<int, string>  $pageUrls  Page number => URL.
     * @return array<int, string>  Page number => HTML body.
     */
    protected function fetchHtmlPool(array $pageUrls): array
    {
        if ($pageUrls === []) {
            return [];
        }

        $timeoutSeconds = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $requestHeaders = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

        $poolResults = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $requestHeaders, $timeoutSeconds) {
            foreach ($pageUrls as $pageNumber => $pageUrl) {
                // Each request is keyed by its page number so the response can be looked up afterwards.
                $pool->as((string) $pageNumber)
                    ->timeout($timeoutSeconds)
                    ->connectTimeout(min(8, $timeoutSeconds))
                    ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                    ->withHeaders($requestHeaders)
                    ->get($pageUrl);
            }
        });

        $bodies = [];
        foreach (array_keys($pageUrls) as $pageNumber) {
            $body = $this->responseBodyFromPoolResult($poolResults[(string) $pageNumber] ?? null);
            if ($body === null || $body === '') {
                continue;
            }
            $bodies[$pageNumber] = $body;
        }

        return $bodies;
    }

    /**
     * Visit profile pages of items that have no email yet and try to fill in
     * the email plus profile metadata.
     *
     * - Disabled by config: items are returned untouched.
     * - Budget of zero (see resolveProfileEnrichMax): all items go through markProfileEnrichSkipped.
     * - Otherwise the first items (in list order) that lack an email and have
     *   a canonical URL are fetched, up to the budget; remaining eligible items
     *   are flagged as skipped.
     *
     * @param  list<CrawlItemDto>  $items
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     */
    protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
    {
        if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
            return $items;
        }

        $budget = $this->resolveProfileEnrichMax($params, count($items));
        if ($budget <= 0) {
            return $this->markProfileEnrichSkipped($items);
        }

        $concurrency = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
        $timeoutSeconds = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
        $requestHeaders = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

        // Select the items to fetch, keyed by their position in $items.
        $pending = [];
        foreach ($items as $position => $candidate) {
            if (count($pending) >= $budget) {
                break;
            }
            if ($this->itemHasEmail($candidate) || ! $candidate->canonicalUrl) {
                continue;
            }
            $pending[$position] = $candidate;
        }

        if ($pending === []) {
            return $items;
        }

        // Position => item after the fetch attempt (unchanged when the request failed).
        $processed = [];
        foreach (array_chunk($pending, $concurrency, true) as $batch) {
            $poolResults = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batch, $requestHeaders, $timeoutSeconds) {
                foreach ($batch as $position => $candidate) {
                    $pool->as((string) $position)
                        ->timeout($timeoutSeconds)
                        ->connectTimeout(min(8, $timeoutSeconds))
                        ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                        ->withHeaders($requestHeaders)
                        ->get($candidate->canonicalUrl);
                }
            });

            foreach ($batch as $position => $candidate) {
                $profileHtml = $this->responseBodyFromPoolResult($poolResults[(string) $position] ?? null);
                if ($profileHtml !== null) {
                    $email = $this->extractEmailFromProfileHtml($profileHtml);
                    if ($email) {
                        $candidate = $this->applyEmailToItem($candidate, $email);
                    }
                    $candidate = $this->applyProfileMetadataToItem($candidate, $profileHtml);
                }
                $processed[$position] = $candidate;
            }
        }

        // Rebuild the list in the original order.
        $result = [];
        foreach ($items as $position => $original) {
            $result[] = match (true) {
                isset($processed[$position]) => $processed[$position],
                ! $this->itemHasEmail($original) && $original->canonicalUrl => $this->markItemProfileEnrichSkipped($original),
                default => $original,
            };
        }

        return $result;
    }

    /**
     * Work out how many profile pages may be fetched for enrichment.
     *
     * Returns 0 when `skip_profile_enrich` is strictly `true`. Otherwise the
     * limit comes from `profile_enrich_max` in $params (falling back to
     * config) and is capped at 200 and at $itemCount, never below 0.
     *
     * @param  array<string, mixed>  $params
     */
    protected function resolveProfileEnrichMax(array $params, int $itemCount): int
    {
        $skipRequested = ($params['skip_profile_enrich'] ?? false) === true;
        if ($skipRequested) {
            return 0;
        }

        $requested = (int) ($params['profile_enrich_max'] ?? config('crawl.faculty.profile_enrich_max', 32));
        $ceiling = min(200, $requested, $itemCount);

        return max(0, $ceiling);
    }

    /**
     * Run every item through markItemProfileEnrichSkipped.
     *
     * @param  list<CrawlItemDto>  $items
     * @return list<CrawlItemDto>
     */
    protected function markProfileEnrichSkipped(array $items): array
    {
        $marked = [];
        foreach ($items as $key => $item) {
            $marked[$key] = $this->markItemProfileEnrichSkipped($item);
        }

        return $marked;
    }

    /**
     * Return a copy of the item with `extra['profile_enrich_skipped'] = true`.
     *
     * Items that already carry an email are returned as-is.
     */
    protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
    {
        $alreadyHasEmail = $this->itemHasEmail($item);
        if ($alreadyHasEmail) {
            return $item;
        }

        $flaggedExtra = $item->extra;
        $flaggedExtra['profile_enrich_skipped'] = true;

        // The DTO is rebuilt field by field; only `extra` differs from the input.
        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $flaggedExtra,
            authorsParsed: $item->authorsParsed,
        );
    }

    /**
     * Extract the body from one pool result.
     *
     * @param  mixed  $result  A pool entry; anything that is not a successful Response yields null.
     * @return string|null  The response body, or null when the request did not succeed.
     */
    protected function responseBodyFromPoolResult(mixed $result): ?string
    {
        if (! $result instanceof Response) {
            return null;
        }

        return $result->successful() ? (string) $result->body() : null;
    }

    /**
     * Tell whether the item already carries an email that normalizeEmail accepts,
     * either on `extra['lead_author']` or on any entry of `authorsParsed`.
     */
    protected function itemHasEmail(CrawlItemDto $item): bool
    {
        $leadAuthor = $item->extra['lead_author'] ?? null;
        if (is_array($leadAuthor)) {
            $leadEmail = $leadAuthor['email'] ?? null;
            if (CrawlAuthorParser::normalizeEmail($leadEmail)) {
                return true;
            }
        }

        foreach ($item->authorsParsed as $parsedAuthor) {
            $authorEmail = $parsedAuthor['email'] ?? null;
            if (CrawlAuthorParser::normalizeEmail($authorEmail)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Return a copy of the item with the email written to
     * `extra['lead_author']['email']` and to the first parsed author.
     *
     * When the item has no parsed authors, a single author entry is created
     * from the item title and the lead-author data.
     */
    protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
    {
        // Fall back to the raw value when normalisation returns null.
        $normalized = CrawlAuthorParser::normalizeEmail($email) ?? $email;

        $extra = $item->extra;
        $leadAuthor = $extra['lead_author'] ?? null;
        if (! is_array($leadAuthor)) {
            $leadAuthor = [];
        }
        $leadAuthor['email'] = $normalized;
        $extra['lead_author'] = $leadAuthor;

        $parsedAuthors = $item->authorsParsed;
        if ($parsedAuthors !== []) {
            $parsedAuthors[0]['email'] = $normalized;
        } else {
            $parsedAuthors = [[
                'name' => $item->title,
                'email' => $normalized,
                'affiliation' => $leadAuthor['affiliation'] ?? $leadAuthor['college'] ?? null,
                'university_name' => $leadAuthor['university_name'] ?? $item->schoolName,
            ]];
        }

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $parsedAuthors,
        );
    }

    /**
     * Pull the most plausible contact email out of a profile page.
     *
     * Strategy:
     *  1. Addresses that directly follow a label (电子邮箱 / 电子信箱 / E-mail /
     *     邮箱 / 电子邮件) win, in the order the patterns are listed.
     *  2. Otherwise every address-shaped token in the page is collected; an
     *     address ending in `.edu.cn` or `.edu` is preferred, else the first one.
     *
     * Addresses rejected by isNoiseEmail are ignored in both steps. In step 2,
     * tokens that are really asset file names (for example `logo@2x.png`) are
     * dropped as well, because they satisfy the generic address regex.
     *
     * @return string|null  The chosen email, or null when none was found.
     */
    protected function extractEmailFromProfileHtml(string $html): ?string
    {
        $labeledPatterns = [
            '/电子邮箱[：:]\s*<\/strong>\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子邮箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子信箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/E-?mail[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu',
            '/邮箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子邮件[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        ];

        foreach ($labeledPatterns as $pattern) {
            if (preg_match($pattern, $html, $match)) {
                $email = CrawlAuthorParser::normalizeEmail($match[1]);
                if ($email && ! $this->isNoiseEmail($email)) {
                    return $email;
                }
            }
        }

        // File extensions that the generic regex would otherwise accept as a TLD.
        $assetSuffixPattern = '/\.(?:png|jpe?g|gif|svg|webp|bmp|ico|css|js)$/i';

        $candidates = [];
        if (preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
        )) {
            foreach ($emailMatches[1] as $raw) {
                if (preg_match($assetSuffixPattern, $raw)) {
                    continue;
                }
                $email = CrawlAuthorParser::normalizeEmail($raw);
                if ($email && ! $this->isNoiseEmail($email)) {
                    $candidates[] = $email;
                }
            }
        }

        if ($candidates === []) {
            return null;
        }

        $candidates = array_values(array_unique($candidates));

        // Prefer academic domains over whatever address happens to come first.
        foreach ($candidates as $email) {
            if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) {
                return $email;
            }
        }

        return $candidates[0];
    }

    /**
     * Tell whether the address belongs to a generic mailbox (noreply, admin,
     * webmaster, ...) judging by its local part, case-insensitively.
     */
    protected function isNoiseEmail(string $email): bool
    {
        $genericMailbox = '/^(noreply|no-reply|admin|webmaster|postmaster|root|support|service|info|contact)@/i';

        return preg_match($genericMailbox, $email) === 1;
    }

    /**
     * GET a page and return its body.
     *
     * NOTE(review): Laravel documents the first argument of retry() as the
     * total number of attempts, so retry(1, ...) presumably performs no
     * actual retry — confirm the intent.
     *
     * @throws \RuntimeException When the response status is not successful.
     */
    protected function fetchHtml(string $url): string
    {
        $timeoutSeconds = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $retryOnConnectionError = fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException;

        $response = Http::timeout($timeoutSeconds)
            ->connectTimeout(min(8, $timeoutSeconds))
            ->retry(1, 300, $retryOnConnectionError, throw: false)
            ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
            ->get($url);

        if ($response->successful()) {
            return (string) $response->body();
        }

        throw new \RuntimeException('页面请求失败（HTTP '.$response->status().'）：'.$url);
    }

    /**
     * Estimate how many list pages exist, from the first page's HTML.
     *
     * An explicit `totalpage=N` wins. Otherwise the total record count
     * ("共 N 条") is divided by the number of `div.name` entries found inside
     * the `div.list > ul` block. Falls back to 1 when neither is available.
     *
     * @return int  Always at least 1.
     */
    protected function detectTotalPages(string $html): int
    {
        if (preg_match('/totalpage=(\d+)/i', $html, $explicit)) {
            return max(1, (int) $explicit[1]);
        }

        if (! preg_match('/共\s*(\d+)\s*条/u', $html, $total)) {
            return 1;
        }

        if (! preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $list)) {
            return 1;
        }

        // Entries on this page; a failed or empty count means the page size is unknown.
        $perPage = preg_match_all('/<div\s+class="name">/u', $list[1]) ?: 0;
        if ($perPage <= 0) {
            return 1;
        }

        $totalRecords = (int) $total[1];

        return max(1, (int) ceil($totalRecords / $perPage));
    }

    /**
     * Build the URL of list page $page from the base URL.
     *
     * The existing query string is kept, `PAGENUM` is set to $page, and
     * `totalpage` is copied from the first page's HTML when present there.
     * URLs without a scheme and host are returned unchanged.
     */
    protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
    {
        $parts = parse_url($baseUrl);
        $hasOrigin = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
        if (! $hasOrigin) {
            return $baseUrl;
        }

        $query = [];
        parse_str((string) ($parts['query'] ?? ''), $query);
        $query['PAGENUM'] = (string) $page;
        if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)) {
            $query['totalpage'] = $totalMatch[1];
        }

        $port = empty($parts['port']) ? '' : ':'.$parts['port'];
        $path = $parts['path'] ?? '/';

        // The query is never empty at this point because PAGENUM was just set.
        return $parts['scheme'].'://'.$parts['host'].$port.$path.'?'.http_build_query($query);
    }

    /**
     * Rebuild the URL from scheme, host, optional port, path (default "/")
     * and optional query; other components such as the fragment are left out.
     * URLs without a scheme and host are returned unchanged.
     */
    protected function normalizeRequestUrl(string $url): string
    {
        $parts = parse_url($url);
        $hasOrigin = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
        if (! $hasOrigin) {
            return $url;
        }

        $port = empty($parts['port']) ? '' : ':'.$parts['port'];
        $path = $parts['path'] ?? '/';
        $query = empty($parts['query']) ? '' : '?'.$parts['query'];

        return $parts['scheme'].'://'.$parts['host'].$port.$path.$query;
    }

    /**
     * Run the page through the layout-specific extractors in priority order
     * and return the first non-empty result. The staff-panel extractor is the
     * last resort and its result is returned even when empty.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
    {
        // Closures keep the evaluation lazy: later extractors only run when earlier ones found nothing.
        $extractors = [
            fn (): array => $this->extractFromSudyNewsFacultyList($html, $keywords, $sourceUrl),
            fn (): array => $this->extractFromRaTeacherList($html, $keywords, $sourceUrl),
            fn (): array => $this->extractFromVsbFacultyTable($html, $keywords, $sourceUrl),
            fn (): array => $this->extractFromEmailBlocks($html, $keywords, $sourceUrl),
            fn (): array => $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl),
        ];

        foreach ($extractors as $extract) {
            $found = $extract();
            if ($found !== []) {
                return $found;
            }
        }

        return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
    }

    /**
     * NJU Sudy CMS: news_title / news_title1 links inside ul.news_list
     * (frontier, ic and similar sites).
     *
     * When the page is split into `li.wp_sublist` sections, each section's
     * `subcolumn-name` is used as the department for its links; otherwise the
     * whole page is parsed with the page-level college.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromSudyNewsFacultyList(string $html, array $keywords, string $sourceUrl): array
    {
        if (! preg_match('/class="news_list/u', $html)) {
            return [];
        }

        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

        // College name: page title first, then the column heading.
        $defaultCollege = $this->inferCollegeFromPageTitle($html);
        if ($defaultCollege === null && preg_match('#<li class="col_title"><h2>([^<]+)</h2>#u', $html, $titleMatch)) {
            $defaultCollege = CrawlAuthorParser::cleanText($titleMatch[1]);
        }

        // Shared across sections so a person listed twice is emitted once.
        $seen = [];
        $sections = preg_split('#<li class="wp_sublist#u', $html) ?: [];

        if (count($sections) <= 1) {
            return $this->extractSudyNewsLinksFromChunk(
                $html,
                $defaultCollege,
                $keywords,
                $sourceUrl,
                $pageUniversity,
                $seen,
            );
        }

        $items = [];
        // The text before the first section marker carries no section content.
        foreach (array_slice($sections, 1) as $section) {
            $department = preg_match('#subcolumn-name">([^<]+)</span>#u', $section, $deptMatch)
                ? CrawlAuthorParser::cleanText($deptMatch[1])
                : $defaultCollege;

            $found = $this->extractSudyNewsLinksFromChunk(
                $section,
                $department,
                $keywords,
                $sourceUrl,
                $pageUniversity,
                $seen,
            );

            foreach ($found as $item) {
                $items[] = $item;
            }
        }

        return $items;
    }

    /**
     * Turn the news_title / news_title1 links of one HTML chunk into faculty items.
     *
     * A link is kept when its text looks like a person name, its href looks
     * like a teacher profile URL, it has not been seen before, and the name
     * plus department matches the keywords.
     *
     * @param  array<string, true>  $seen  Dedupe keys; updated in place for every emitted item.
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractSudyNewsLinksFromChunk(
        string $chunk,
        ?string $department,
        array $keywords,
        string $sourceUrl,
        ?string $pageUniversity,
        array &$seen,
    ): array {
        $linkCount = preg_match_all(
            '#<(?:div|span)\s+class="news_title1?">\s*<a\b([^>]*?)>([^<]+)</a>#su',
            $chunk,
            $links,
            PREG_SET_ORDER,
        );
        if (! $linkCount) {
            return [];
        }

        $items = [];
        foreach ($links as [, $rawAttrs, $rawName]) {
            $attrs = (string) $rawAttrs;
            $name = CrawlAuthorParser::cleanText($rawName) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) {
                continue;
            }

            $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (! $this->looksLikeTeacherProfileUrl($href, null)) {
                continue;
            }

            $profileUrl = $this->resolveUrl($href, $sourceUrl);
            // Fall back to a name-based key when the URL could not be resolved.
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }

            $searchable = trim($name.' '.($department ?? ''));
            if (! $this->matchesKeywords($searchable, $keywords)) {
                continue;
            }

            $seen[$dedupeKey] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $department,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
                summary: $department ? '单位：'.$department : null,
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html_sudy_news',
                bio: null,
            );
        }

        return $items;
    }

    /**
     * Boshan CMS pages such as NJU's robotics school: `ul.teacher` cards whose
     * name sits in `div.xm`.
     *
     * Each card is an anchor wrapping the name plus optional "职称" (title) and
     * "研究方向" (research field) spans. Cards are deduplicated by profile URL
     * (or by name when no URL resolves) and filtered by the keywords.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromRaTeacherList(string $html, array $keywords, string $sourceUrl): array
    {
        if (! preg_match('/<ul class="teacher">/u', $html)) {
            return [];
        }

        // The tempered token (?:(?!<a\b).)*? keeps the gap between the opening
        // tag and div.xm inside ONE anchor. A plain .*? would let a match begin
        // at an unrelated earlier link (navigation, logo) and stretch across
        // later <a> tags, giving the first card that link's href.
        if (! preg_match_all(
            '#<a\b([^>]*?)>(?:(?!<a\b).)*?<div class="xm">([^<]+)</div>(.*?)</a>#su',
            $html,
            $matches,
            PREG_SET_ORDER,
        )) {
            return [];
        }

        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        $defaultCollege = $this->inferCollegeFromPageTitle($html);
        $items = [];
        $seen = [];

        foreach ($matches as $match) {
            $attrs = (string) $match[1];
            $name = CrawlAuthorParser::cleanText($match[2]) ?? '';
            $tail = (string) $match[3];
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }
            // Accept single- and double-quoted hrefs, like the Sudy extractor does.
            if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) {
                continue;
            }

            $profileUrl = $this->resolveUrl(html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'), $sourceUrl);
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }

            $academicTitle = null;
            if (preg_match('#职称：\s*<span>([^<]+)</span>#u', $tail, $titleMatch)) {
                $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]);
            }

            $researchField = null;
            if (preg_match('#研究方向：\s*<span>([^<]+)</span>#u', $tail, $fieldMatch)) {
                $researchField = CrawlAuthorParser::cleanText($fieldMatch[1]);
            }

            // Keywords are matched against name, research field, title and college together.
            $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $summaryParts = array_filter([
                $defaultCollege ? '单位：'.$defaultCollege : null,
                $academicTitle ? '职称：'.$academicTitle : null,
                $researchField ? '研究方向：'.$researchField : null,
            ]);

            $seen[$dedupeKey] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $defaultCollege,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
                summary: $summaryParts !== [] ? implode('；', $summaryParts) : null,
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_ra',
                bio: $researchField,
            );
        }

        return $items;
    }

    /**
     * NJU / Tsinghua WebPlus (VSB) faculty table pages (e.g. ise "zjzjs").
     *
     * The content scope is the first of: `div.zjzjs`, `div#vsb_content…`, or a
     * non-empty `ul.teach-list…`. Inside that scope, `<strong>` headings act as
     * section titles; every person is attributed to the last heading that
     * appears at or before their position, and the heading is mapped to an
     * academic title. People come from two passes: profile links first, then
     * table cells that contain no link.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromVsbFacultyTable(string $html, array $keywords, string $sourceUrl): array
    {
        // Locate the content scope; the first matching container wins.
        $scope = null;
        if (preg_match('#<div class="zjzjs">(.*?)</div>#su', $html, $match)) {
            $scope = (string) $match[1];
        } elseif (preg_match('#<div id="vsb_content[^"]*">(.*?)</div>\s*</div>\s*</div>#su', $html, $match)) {
            $scope = (string) $match[1];
        } elseif (preg_match('#<ul class="teach-list[^"]*">(.*?)</ul>#su', $html, $match) && trim(strip_tags($match[1])) !== '') {
            $scope = (string) $match[1];
        }

        // No container, or a container without any text: not this layout.
        if ($scope === null || trim(strip_tags($scope)) === '') {
            return [];
        }

        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        $defaultCollege = $this->inferCollegeFromPageTitle($html);
        $items = [];
        $seen = [];

        // Collect <strong> headings with their offsets inside $scope (in document order).
        $sectionTitles = [];
        if (preg_match_all('#<strong[^>]*>(.*?)</strong>#su', $scope, $sectionMatches, PREG_OFFSET_CAPTURE)) {
            foreach ($sectionMatches[1] as $sectionMatch) {
                $title = CrawlAuthorParser::cleanText(strip_tags($sectionMatch[0]));
                if ($title !== null && $title !== '') {
                    $sectionTitles[] = [
                        'offset' => $sectionMatch[1],
                        'title' => $title,
                    ];
                }
            }
        }

        // Returns the title of the last heading located at or before $offset, or null when none precedes it.
        $resolveSectionTitle = function (int $offset) use ($sectionTitles): ?string {
            $title = null;
            foreach ($sectionTitles as $section) {
                if ($section['offset'] <= $offset) {
                    $title = $section['title'];
                } else {
                    break;
                }
            }

            return $title;
        };

        // Validates, deduplicates and keyword-filters one person, then appends it to $items.
        // $items and $seen are captured by reference so both passes below share them.
        $addItem = function (
            string $name,
            ?string $profileUrl,
            ?string $sectionTitle,
        ) use (
            $keywords,
            $defaultCollege,
            $pageUniversity,
            &$items,
            &$seen,
        ): void {
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                return;
            }

            // Dedupe by profile URL, or by name when there is no URL.
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                return;
            }

            $academicTitle = $this->inferAcademicTitleFromSection($sectionTitle);
            $plain = trim($name.' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
            if (! $this->matchesKeywords($plain, $keywords)) {
                return;
            }

            $seen[$dedupeKey] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $defaultCollege,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
                summary: $defaultCollege ? '单位：'.$defaultCollege : null,
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_vsb',
                bio: null,
            );
        };

        // Pass 1: anchors whose href looks like a teacher profile URL.
        if (preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $scope, $linkMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
            foreach ($linkMatches as $linkMatch) {
                $attrs = (string) $linkMatch[1][0];
                $offset = (int) $linkMatch[0][1];
                $name = CrawlAuthorParser::cleanText(strip_tags($linkMatch[2][0])) ?? '';
                // Only double-quoted href attributes are recognised here.
                if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
                    continue;
                }
                $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
                if (! $this->looksLikeTeacherProfileUrl($href, null)) {
                    continue;
                }
                $addItem($name, $this->resolveUrl($href, $sourceUrl), $resolveSectionTitle($offset));
            }
        }

        // Pass 2: table cells without a link; the cell text is taken as the name and no profile URL is set.
        if (preg_match_all('#<td[^>]*>(.*?)</td>#su', $scope, $cellMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
            foreach ($cellMatches as $cellMatch) {
                $cellHtml = (string) $cellMatch[1][0];
                $offset = (int) $cellMatch[0][1];
                // Cells containing an anchor were already covered by pass 1.
                if (str_contains($cellHtml, '<a ')) {
                    continue;
                }
                $name = CrawlAuthorParser::cleanText(strip_tags($cellHtml)) ?? '';
                $addItem($name, null, $resolveSectionTitle($offset));
            }
        }

        return $items;
    }

    /**
     * Map a section heading to an academic title.
     *
     * The checks run from the most specific term to the least specific one,
     * because "副教授" and "助理教授" both contain "教授". Headings that contain
     * none of the known terms are returned cleaned but otherwise unchanged.
     *
     * @param  string|null  $sectionTitle  Heading text of the section the person was listed under.
     * @return string|null  The inferred title, or null for a null/empty heading.
     */
    protected function inferAcademicTitleFromSection(?string $sectionTitle): ?string
    {
        if ($sectionTitle === null || $sectionTitle === '') {
            return null;
        }

        return match (true) {
            str_contains($sectionTitle, '副教授') => '副教授',
            // Must be tested before the bare "教授" check, which would otherwise swallow it.
            str_contains($sectionTitle, '助理教授') => '准聘助理教授',
            str_contains($sectionTitle, '教授') => '教授',
            str_contains($sectionTitle, '博士后') => '博士后',
            str_contains($sectionTitle, '专职科研') => '专职科研',
            default => CrawlAuthorParser::cleanText($sectionTitle),
        };
    }

    /**
     * Tell whether the page is served through an `ajax_teacher_list` endpoint:
     * either the request URL mentions it (case-insensitive) or the HTML
     * references `ajax_teacher_list.html` (case-sensitive).
     */
    protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool
    {
        $urlMentionsEndpoint = str_contains(strtolower($sourceUrl), 'ajax_teacher_list');

        return $urlMentionsEndpoint || str_contains($html, 'ajax_teacher_list.html');
    }

    /**
     * Tell whether the page is an NJU "teacher home" page: it references
     * `faculty.js` and its <body> class list contains the word `faculty`.
     */
    protected function isNjuTeacherHomePage(string $html): bool
    {
        if (! str_contains($html, 'faculty.js')) {
            return false;
        }

        return preg_match('/<body[^>]*class="[^"]*\bfaculty\b/u', $html) === 1;
    }

    /**
     * Collect faculty items from the "teacherHome" generalQuery JSON endpoint
     * that backs the teacher-home list page.
     *
     * The site id and the currently selected career filters are parsed from the
     * already-downloaded page HTML; the endpoint is then paged (50 rows per
     * request) until the page limit, the result limit, the reported page count,
     * or an empty data page is reached.
     *
     * @param  string  $requestUrl  URL of the list page; used for the API origin and to resolve profile links.
     * @param  string  $pageHtml  HTML of the list page.
     * @param  list<string>  $keywords  Keywords every returned item must match (as decided by matchesKeywords()).
     * @param  int  $maxResults  Upper bound on the number of returned items.
     * @param  int  $maxPages  Upper bound on API pages to request; clamped to 1..50.
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the site id cannot be parsed or the API request fails
     *                           (raised by parseNjuSiteId() / requestNjuTeacherHomePage()).
     */
    protected function fetchNjuTeacherHomeItems(
        string $requestUrl,
        string $pageHtml,
        array $keywords,
        int $maxResults,
        int $maxPages = 1,
    ): array {
        $siteId = $this->parseNjuSiteId($pageHtml);
        $filters = $this->parseNjuTeacherHomeFilters($pageHtml);
        $conditions = $this->buildNjuTeacherHomeConditions($filters['career'], $filters['sub_career']);
        // Fall back to a fixed host when the request URL has no usable scheme/host.
        $origin = $this->requestOrigin($requestUrl) ?? 'https://is.nju.edu.cn';
        $apiUrl = $origin.'/_wp3services/generalQuery?queryObj=teacherHome';
        // Configured timeout in seconds, never lower than 5.
        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $maxPages = max(1, min(50, $maxPages));
        $rows = 50;

        $pageUniversity = $this->inferUniversityFromSource($requestUrl, $pageHtml);
        $defaultCollege = $this->inferCollegeFromPageTitle($pageHtml);

        $items = [];
        // Dedupe keys (profile URL, or a name hash when no URL resolves) already emitted.
        $seen = [];
        $pageIndex = 1;
        // Total page count as reported by the first API response.
        $pageCount = null;

        while ($pageIndex <= $maxPages && count($items) < $maxResults) {
            $body = $this->requestNjuTeacherHomePage($apiUrl, $siteId, $pageIndex, $rows, $conditions, $timeout);
            if ($pageCount === null) {
                $pageCount = max(1, (int) ($body['pageCount'] ?? 1));
            }

            // An absent, non-array or empty "data" payload ends pagination.
            $data = $body['data'] ?? [];
            if (! is_array($data) || $data === []) {
                break;
            }

            foreach ($data as $art) {
                if (! is_array($art)) {
                    continue;
                }

                // "title" carries the person's name in this API's rows.
                $name = CrawlAuthorParser::cleanText((string) ($art['title'] ?? '')) ?? '';
                if ($name === '' || ! $this->looksLikePersonName($name)) {
                    continue;
                }

                $profileUrl = $this->resolveUrl((string) ($art['cnUrl'] ?? ''), $requestUrl);
                $dedupeKey = $profileUrl ?: ('name:'.md5($name));
                if (isset($seen[$dedupeKey])) {
                    continue;
                }

                // exField2 is used as the academic title and exField1 as the research field.
                $academicTitle = CrawlAuthorParser::cleanText((string) ($art['exField2'] ?? ''));
                $researchField = CrawlAuthorParser::cleanText((string) ($art['exField1'] ?? ''));
                // Keyword matching runs over name + research field + title + college.
                $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
                if (! $this->matchesKeywords($plain, $keywords)) {
                    continue;
                }

                // array_filter drops the null entries, leaving only the labelled parts that exist.
                $summaryParts = array_filter([
                    $defaultCollege ? '单位：'.$defaultCollege : null,
                    $academicTitle ? '职称：'.$academicTitle : null,
                    $researchField ? '研究领域：'.$researchField : null,
                ]);

                $seen[$dedupeKey] = true;
                $items[] = $this->makeFacultyItem(
                    externalKey: 'faculty:'.md5($dedupeKey),
                    name: $name,
                    profileUrl: $profileUrl,
                    email: null,
                    affiliation: $defaultCollege,
                    universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
                    summary: $summaryParts !== [] ? implode('；', $summaryParts) : null,
                    keywords: $keywords,
                    academicTitle: $academicTitle,
                    platform: 'faculty_html_nju_wp',
                    bio: $researchField,
                );

                // Result cap reached: leave both the row loop and the paging loop.
                if (count($items) >= $maxResults) {
                    break 2;
                }
            }

            // Stop once the last page reported by the API has been consumed.
            if ($pageIndex >= $pageCount) {
                break;
            }

            $pageIndex++;
        }

        return $items;
    }

    /**
     * Read the numeric site id from the page's sudy-wp-siteId="…" attribute.
     *
     * @throws \RuntimeException when the attribute is not present in the HTML.
     */
    protected function parseNjuSiteId(string $html): int
    {
        $found = preg_match('/sudy-wp-siteId="(\d+)"/', $html, $captures);
        if (! $found) {
            throw new \RuntimeException('无法解析教师列表站点 ID（siteId）');
        }

        return (int) $captures[1];
    }

    /**
     * Extract the currently selected career / sub-career filter labels from the
     * list page markup.
     *
     * For each filter the patterns are tried in order and the first one that
     * matches supplies the (cleaned) label; a filter with no matching pattern
     * is reported as null.
     *
     * @return array{career:?string,sub_career:?string}
     */
    protected function parseNjuTeacherHomeFilters(string $html): array
    {
        $careerPatterns = [
            '#class="col_item_link\s+selected"[^>]*title="([^"]+)"#u',
            '#class="col_item_link\s+selected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
        ];
        $subCareerPatterns = [
            '#class="sub-item[^"]*\sselected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
            '#class="sub-link[^"]*\sselected"[^>]*title="([^"]+)"#u',
        ];

        // Returns the cleaned capture of the first pattern that matches, else null.
        $firstLabel = static function (array $patterns) use ($html) {
            foreach ($patterns as $pattern) {
                if (preg_match($pattern, $html, $hit)) {
                    return CrawlAuthorParser::cleanText($hit[1]);
                }
            }

            return null;
        };

        return [
            'career' => $firstLabel($careerPatterns),
            'sub_career' => $firstLabel($subCareerPatterns),
        ];
    }

    /**
     * Translate the selected career / sub-career labels into the condition
     * list sent to the teacher-home query endpoint.
     *
     * The list always starts with the published = '1' condition. A recognised
     * sub-career adds a single-entry "orConditions" group on exField2. A
     * recognised career then adds either a plain exField2 equality, or — for
     * the two grouped careers, and only when no sub-career is selected — an
     * "orConditions" group listing every title in that group.
     *
     * @return list<array<string, mixed>>
     */
    protected function buildNjuTeacherHomeConditions(?string $career, ?string $subCareer): array
    {
        $titleEquals = static fn (string $title): array => ['field' => 'exField2', 'value' => $title, 'judge' => '='];

        $conditions = [
            ['field' => 'published', 'value' => '1', 'judge' => '='],
        ];

        $knownSubCareers = ['长聘副教授', '准聘副教授', '准聘助理教授', '专职科研', '博士后'];
        if (in_array($subCareer, $knownSubCareers, true)) {
            $conditions[] = ['orConditions' => [$titleEquals($subCareer)]];
        }

        if ($career === null || $career === '') {
            return $conditions;
        }

        $directCareers = ['教授', '副教授', '兼职教授', '行政管理人员'];
        if (in_array($career, $directCareers, true)) {
            $conditions[] = $titleEquals($career);

            return $conditions;
        }

        // Grouped careers expand to all of their titles only when no sub-career narrows them.
        $groupedCareers = [
            '准长聘' => ['长聘副教授', '准聘副教授', '准聘助理教授'],
            '专职科研及博士后' => ['专职科研', '博士后'],
        ];
        $subCareerMissing = $subCareer === null || $subCareer === '';
        if ($subCareerMissing && isset($groupedCareers[$career])) {
            $conditions[] = ['orConditions' => array_map($titleEquals, $groupedCareers[$career])];
        }

        return $conditions;
    }

    /**
     * POST one page of the teacher-home query and return the decoded JSON body.
     *
     * The request is form-encoded; the ordering, requested fields and
     * conditions are sent as JSON strings inside the form.
     *
     * @param  list<array<string, mixed>>  $conditions  Filter conditions, JSON-encoded into the "conditions" field.
     * @return array<string, mixed>
     *
     * @throws \RuntimeException when the response status is not successful or the body does not decode to an array.
     */
    protected function requestNjuTeacherHomePage(
        string $apiUrl,
        int $siteId,
        int $pageIndex,
        int $rows,
        array $conditions,
        int $timeout,
    ): array {
        // Every requested field is returned under its own name.
        $returnInfos = array_map(
            static fn (string $field): array => ['field' => $field, 'name' => $field],
            ['headerPic', 'exField1', 'exField2', 'cnUrl', 'title', 'phone'],
        );

        $formData = [
            'siteId' => $siteId,
            'pageIndex' => $pageIndex,
            'rows' => $rows,
            'orders' => json_encode([['field' => 'siteSort', 'type' => 'asc']], JSON_UNESCAPED_UNICODE),
            'returnInfos' => json_encode($returnInfos, JSON_UNESCAPED_UNICODE),
            'conditions' => json_encode($conditions, JSON_UNESCAPED_UNICODE),
            'articleType' => 1,
            'level' => 1,
        ];

        $requestHeaders = [
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json',
        ];

        // NOTE(review): retry(1, ...) appears to allow a single attempt in Laravel's
        // HTTP client — confirm whether one real retry was intended here.
        $response = Http::timeout($timeout)
            ->connectTimeout(min(8, $timeout))
            ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
            ->withHeaders($requestHeaders)
            ->asForm()
            ->post($apiUrl, $formData);

        if (! $response->successful()) {
            throw new \RuntimeException('教师列表接口请求失败（HTTP '.$response->status().'）');
        }

        $decoded = $response->json();
        if (is_array($decoded)) {
            return $decoded;
        }

        throw new \RuntimeException('教师列表接口返回格式异常');
    }

    /**
     * Collect faculty items from an "ajax_teacher_list" endpoint.
     *
     * The endpoint parameters are parsed from the list page HTML, the keywords
     * are joined into a single search string, and the endpoint is requested page
     * by page. Each response's "content" HTML fragment is parsed into items,
     * which are de-duplicated by external id across pages.
     *
     * @param  string  $requestUrl  URL of the list page; used to resolve the endpoint and profile links.
     * @param  string  $pageHtml  HTML of the list page.
     * @param  list<string>  $keywords  Keywords sent as the search term and used for item filtering.
     * @param  int  $maxResults  Upper bound on the number of returned items.
     * @param  int  $maxPages  Upper bound on requested pages; clamped to 1..50.
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the endpoint parameters cannot be parsed or a request fails
     *                           (raised by parseAjaxTeacherConfig() / requestAjaxTeacherPage()).
     */
    protected function fetchAjaxTeacherItems(
        string $requestUrl,
        string $pageHtml,
        array $keywords,
        int $maxResults,
        int $maxPages = 1,
    ): array {
        $config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl);
        $search = implode(' ', $keywords);
        // Configured timeout in seconds, never lower than 5.
        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $maxPages = max(1, min(50, $maxPages));

        $items = [];
        // External ids already emitted, so repeated entries across pages are skipped.
        $seen = [];
        $page = 1;
        // Total entry count as reported by the first response that carries "count".
        $totalCount = null;

        while ($page <= $maxPages && count($items) < $maxResults) {
            $body = $this->requestAjaxTeacherPage($config, $page, $search, $timeout);
            if ($totalCount === null && isset($body['count'])) {
                $totalCount = max(0, (int) $body['count']);
            }

            // An empty "content" fragment ends pagination.
            $content = (string) ($body['content'] ?? '');
            if ($content === '') {
                break;
            }

            $before = count($items);
            // The list page HTML is prepended to the fragment before parsing, so the
            // parser sees the page-level markup together with the returned entries.
            foreach ($this->extractFromAjaxTeacherContent(
                $pageHtml.$content,
                $keywords,
                $requestUrl,
                $config['cat_code'],
            ) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $items[] = $item;
                // Result cap reached: leave both the item loop and the paging loop.
                if (count($items) >= $maxResults) {
                    break 2;
                }
            }

            // The "standard" variant is only ever requested once.
            // NOTE(review): this holds even when $config['uses_page'] is true — confirm that is intended.
            if ($config['variant'] === 'standard') {
                break;
            }

            // A page that contributed no new items ends pagination.
            if (count($items) === $before) {
                break;
            }

            // Stop once everything the endpoint reported (or the result cap) has been collected.
            if ($totalCount !== null && count($items) >= min($totalCount, $maxResults)) {
                break;
            }

            $page++;
        }

        return $items;
    }

    /**
     * POST one request to the "ajax_teacher_list" endpoint and return the
     * decoded JSON body.
     *
     * The "simple" variant always sends a page number; any other variant sends
     * the category id and only adds a page number when the config says the
     * endpoint uses one.
     *
     * @param  array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool}  $config
     * @return array<string, mixed>
     *
     * @throws \RuntimeException when the response status is not successful or the body does not decode to an array.
     */
    protected function requestAjaxTeacherPage(array $config, int $page, string $search, int $timeout): array
    {
        // With no search term the initial-letter filter is set to "All"; otherwise it is left blank.
        $initialFilter = $search === '' ? 'All' : '';

        if ($config['variant'] === 'simple') {
            $payload = [
                'page' => (string) $page,
                'cat_code' => $config['cat_code'],
                'yjszxfl' => '全部',
                'name' => $search,
                'zm' => $initialFilter,
            ];
        } else {
            $payload = [
                'cat_id' => $config['cat_id'],
                'cat_code' => $config['cat_code'],
                'type' => $search !== '' ? '2' : '1',
                'zm' => $initialFilter,
                'zc' => '',
                'search' => $search,
            ];
            if ($config['uses_page']) {
                $payload['page'] = (string) $page;
            }
        }

        $requestHeaders = [
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json, text/html',
        ];

        // NOTE(review): retry(1, ...) appears to allow a single attempt in Laravel's
        // HTTP client — confirm whether one real retry was intended here.
        $response = Http::timeout($timeout)
            ->connectTimeout(min(8, $timeout))
            ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
            ->withHeaders($requestHeaders)
            ->asForm()
            ->post($config['api_url'], $payload);

        if (! $response->successful()) {
            throw new \RuntimeException('教师列表接口请求失败（HTTP '.$response->status().'）');
        }

        $decoded = $response->json();
        if (is_array($decoded)) {
            return $decoded;
        }

        throw new \RuntimeException('教师列表接口返回格式异常');
    }

    /**
     * Parse the "ajax_teacher_list" request parameters out of the list page's
     * inline script.
     *
     * The endpoint URL defaults to "/active/ajax_teacher_list.html" on the
     * source URL's origin and is replaced by a url: '…' declaration found in the
     * page when that declaration resolves. A page that declares a cat_id is the
     * "standard" variant; one without is the "simple" variant, which always
     * uses page numbers.
     *
     * @return array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool}
     *
     * @throws \RuntimeException when no cat_code is found or no endpoint URL can be determined.
     */
    protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array
    {
        $origin = $this->requestOrigin($sourceUrl);

        $catId = preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $idHit) ? $idHit[1] : null;
        $catCode = preg_match("/cat_code\s*:\s*'([^']+)'/i", $html, $codeHit) ? $codeHit[1] : null;

        $apiUrl = $origin === null ? '' : $origin.'/active/ajax_teacher_list.html';
        if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $urlHit)) {
            $declaredUrl = html_entity_decode($urlHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $apiUrl = $this->resolveUrl($declaredUrl, $sourceUrl) ?? $apiUrl;
        }

        // A still root-relative endpoint is anchored on the source origin.
        if ($origin !== null && str_starts_with($apiUrl, '/')) {
            $apiUrl = $origin.$apiUrl;
        }

        if ($catCode === null || $apiUrl === '') {
            throw new \RuntimeException('无法解析教师列表接口参数（cat_code）');
        }

        $isSimple = $catId === null;

        return [
            'variant' => $isSimple ? 'simple' : 'standard',
            'cat_id' => $catId,
            'cat_code' => $catCode,
            'api_url' => $apiUrl,
            'uses_page' => $isSimple || str_contains($html, 'page:page'),
        ];
    }

    /**
     * Parse faculty items out of AJAX teacher-list HTML, trying three layouts
     * in order and returning the first that yields items:
     *
     *  1. card markup (see extractFromAjaxTeacherCards());
     *  2. "rc-item" sections, each optionally titled with a department name that
     *     replaces the page-level college for the links inside it;
     *  3. every teacher profile link found anywhere in the HTML.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromAjaxTeacherContent(
        string $html,
        array $keywords,
        string $sourceUrl,
        ?string $catCode = null,
    ): array {
        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        $defaultCollege = $this->inferCollegeFromPageTitle($html);

        $fromCards = $this->extractFromAjaxTeacherCards(
            $html,
            $keywords,
            $sourceUrl,
            $pageUniversity,
            $defaultCollege,
            $catCode,
        );
        if ($fromCards !== []) {
            return $fromCards;
        }

        $sections = preg_split('#<div\s+class="rc-item">#u', $html) ?: [];
        $fromSections = [];
        $knownIds = [];

        // The chunk before the first "rc-item" marker is not a section, so it is skipped.
        foreach (array_slice($sections, 1) as $section) {
            $department = $defaultCollege;
            if (preg_match('#<div\s+class="tit">.*?<div\s+class="name">([^<]+)</div>#su', $section, $heading)) {
                $headingText = CrawlAuthorParser::cleanText($heading[1]);
                $usableHeading = $headingText !== null
                    && $headingText !== ''
                    && ! $this->looksLikePersonName($headingText);
                if ($usableHeading) {
                    $department = $headingText;
                }
            }

            $sectionItems = $this->extractTeacherLinksFromHtmlBlock(
                $section,
                $keywords,
                $sourceUrl,
                $pageUniversity,
                $department,
                $catCode,
            );
            foreach ($sectionItems as $candidate) {
                if (isset($knownIds[$candidate->externalId])) {
                    continue;
                }
                $knownIds[$candidate->externalId] = true;
                $fromSections[] = $candidate;
            }
        }

        if ($fromSections !== []) {
            return $fromSections;
        }

        return $this->extractTeacherLinksFromHtmlBlock(
            $html,
            $keywords,
            $sourceUrl,
            $pageUniversity,
            $defaultCollege,
            $catCode,
        );
    }

    /**
     * Card-style teacher list returned over AJAX by sites such as ICISEE: the
     * name is the leading text of div.name and the academic title sits in a
     * <span> inside that same div.
     *
     * Only anchors whose href looks like a teacher profile URL are kept; items
     * are de-duplicated by resolved profile URL (or by a hash of the name when
     * the URL does not resolve).
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromAjaxTeacherCards(
        string $html,
        array $keywords,
        string $sourceUrl,
        ?string $pageUniversity,
        ?string $affiliation,
        ?string $catCode,
    ): array {
        $cardPattern = '#<a\b([^>]*?)>\s*(?:<div\s+class="imgk">.*?</div>\s*)?<div\s+class="name">(.*?)</div>#su';
        if (! preg_match_all($cardPattern, $html, $cards, PREG_SET_ORDER)) {
            return [];
        }

        $collected = [];
        $knownKeys = [];

        foreach ($cards as $card) {
            $anchorAttrs = (string) $card[1];
            $nameMarkup = (string) $card[2];

            // The name is whatever text precedes the first tag inside div.name.
            if (! preg_match('/^([^<]+)/u', $nameMarkup, $leadingText)) {
                continue;
            }
            $name = CrawlAuthorParser::cleanText(trim($leadingText[1])) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            if (! preg_match('#\bhref="([^"]+)"#u', $anchorAttrs, $hrefHit)) {
                continue;
            }
            $href = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
                continue;
            }

            $profileUrl = $this->resolveUrl($href, $sourceUrl);
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($knownKeys[$dedupeKey])) {
                continue;
            }

            $searchable = trim($name.' '.($affiliation ?? ''));
            if (! $this->matchesKeywords($searchable, $keywords)) {
                continue;
            }

            $academicTitle = preg_match('#<span>([^<]+)</span>#u', $nameMarkup, $titleHit)
                ? CrawlAuthorParser::cleanText($titleHit[1])
                : null;

            $knownKeys[$dedupeKey] = true;
            $collected[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
                summary: $affiliation ? '单位：'.$affiliation : null,
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_ajax',
                bio: null,
            );
        }

        return $collected;
    }

    /**
     * Turn every anchor in an HTML block that reads like a person's name and
     * points at a teacher profile URL into a faculty item.
     *
     * The anchor text is stripped of tags and of all whitespace before it is
     * checked as a name. Items are de-duplicated by resolved profile URL (or by
     * a hash of the name when the URL does not resolve) and carry no academic
     * title.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractTeacherLinksFromHtmlBlock(
        string $html,
        array $keywords,
        string $sourceUrl,
        ?string $pageUniversity,
        ?string $affiliation,
        ?string $catCode,
    ): array {
        if (! preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
            return [];
        }

        $collected = [];
        $knownKeys = [];

        foreach ($anchors as $anchor) {
            $anchorAttrs = (string) $anchor[1];

            $compactText = preg_replace('/\s+/u', '', strip_tags($anchor[2])) ?? '';
            $name = CrawlAuthorParser::cleanText($compactText) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            if (! preg_match('#\bhref="([^"]+)"#u', $anchorAttrs, $hrefHit)) {
                continue;
            }
            $href = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
                continue;
            }

            $profileUrl = $this->resolveUrl($href, $sourceUrl);
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($knownKeys[$dedupeKey])) {
                continue;
            }

            $searchable = trim($name.' '.($affiliation ?? ''));
            if (! $this->matchesKeywords($searchable, $keywords)) {
                continue;
            }

            $knownKeys[$dedupeKey] = true;
            $collected[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
                summary: $affiliation ? '单位：'.$affiliation : null,
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html_ajax',
                bio: null,
            );
        }

        return $collected;
    }

    /**
     * Heuristically decide whether a link target is an individual teacher
     * profile page.
     *
     * Only the lower-cased URL path is inspected. It qualifies when it matches
     * one of the fixed path shapes below, or — when a category code is given —
     * when it is an ".html" page directly under a "/{catCode}/" segment.
     */
    protected function looksLikeTeacherProfileUrl(string $href, ?string $catCode): bool
    {
        $path = strtolower((string) parse_url($href, PHP_URL_PATH));
        if ($path === '') {
            return false;
        }

        $knownShapes = [
            '#/(faculty|jiaoshiml|people/detail_new)/[^/]+\.html$#',
            '#/c\d+a\d+/page\.htm$#',
            '#/(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#',
            // Same directories as above, written as a relative path without a leading slash.
            '#^(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#',
            '#/info/\d+/\d+\.htm$#',
        ];
        foreach ($knownShapes as $shape) {
            if (preg_match($shape, $path) === 1) {
                return true;
            }
        }

        if ($catCode === null || $catCode === '') {
            return false;
        }

        $segment = preg_quote(strtolower($catCode), '#');

        return preg_match('#/'.$segment.'/[^/]+\.html$#', $path) === 1;
    }

    /**
     * Reduce a URL to its origin ("scheme://host" plus ":port" when one is
     * given), or return null when the URL has no usable scheme or host.
     */
    protected function requestOrigin(string $sourceUrl): ?string
    {
        $parts = parse_url($sourceUrl);
        if (! is_array($parts)) {
            return null;
        }

        if (empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        $portSuffix = empty($parts['port']) ? '' : ':'.$parts['port'];

        return $parts['scheme'].'://'.$parts['host'].$portSuffix;
    }

    /**
     * Build faculty items from the e-mail addresses that appear in a page.
     *
     * For every distinct address, a context window of up to 800 bytes
     * (starting up to 400 bytes before the address) is converted to plain text;
     * the person's name and affiliation are guessed from that text, and the
     * text itself (limited to 300 characters) becomes the item summary.
     * Addresses whose context does not match the keywords, or from which no
     * name can be guessed, are skipped.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        if (! preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
            PREG_OFFSET_CAPTURE
        )) {
            return [];
        }

        foreach ($emailMatches[1] as $match) {
            $email = CrawlAuthorParser::normalizeEmail($match[0]);
            if (! $email || isset($seen[$email])) {
                continue;
            }

            // PREG_OFFSET_CAPTURE reports byte offsets. mb_strcut() takes byte
            // offsets too but moves both window edges onto character boundaries,
            // so a multi-byte character is never split the way a plain substr()
            // window could split it.
            $pos = (int) $match[1];
            $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
            $plain = $this->htmlToPlain($window);

            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $name = $this->guessName($plain, $email);
            if ($name === '') {
                continue;
            }

            $affiliation = $this->guessAffiliation($plain);
            $seen[$email] = true;

            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($email),
                name: $name,
                // No per-person link is available here, so the list page itself is used.
                profileUrl: $sourceUrl,
                email: $email,
                affiliation: $affiliation,
                universityName: CrawlAuthorParser::universityFromAffiliation($affiliation)
                    ?? $this->inferUniversityFromSource($sourceUrl, $html),
                summary: Str::limit($plain, 300),
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html',
                bio: null,
            );
        }

        return $items;
    }

    /**
     * Staff panel layout, e.g. the SJTU School of Materials: panel-item blocks
     * containing a.staff-item links (/people/detail_new/{id}).
     *
     * Links are first read panel by panel, using each panel's title as the
     * department (falling back to the page-level college when the title cleans
     * to nothing); panels without a title are skipped. When that yields no
     * items, every staff-item link on the page is used with the page-level
     * college instead.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array
    {
        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        $defaultCollege = $this->inferCollegeFromPageTitle($html);

        $collected = [];
        $knownIds = [];

        $panels = preg_split('#<div\s+class="panel-head">#u', $html) ?: [];

        // The chunk before the first panel head belongs to no panel, so it is skipped.
        foreach (array_slice($panels, 1) as $panel) {
            if (! preg_match('#<div\s+class="title">\s*([^<]+?)\s*</div>#u', $panel, $titleHit)) {
                continue;
            }
            $department = CrawlAuthorParser::cleanText($titleHit[1]);

            foreach ($this->extractStaffItemLinks($panel) as $link) {
                $candidate = $this->makeStaffPanelItem(
                    $link,
                    $department ?: $defaultCollege,
                    $pageUniversity,
                    $keywords,
                    $sourceUrl,
                );
                if ($candidate !== null && ! isset($knownIds[$candidate->externalId])) {
                    $knownIds[$candidate->externalId] = true;
                    $collected[] = $candidate;
                }
            }
        }

        if ($collected !== []) {
            return $collected;
        }

        // Fallback: no panel produced anything, so scan the whole page.
        foreach ($this->extractStaffItemLinks($html) as $link) {
            $candidate = $this->makeStaffPanelItem(
                $link,
                $defaultCollege,
                $pageUniversity,
                $keywords,
                $sourceUrl,
            );
            if ($candidate !== null && ! isset($knownIds[$candidate->externalId])) {
                $knownIds[$candidate->externalId] = true;
                $collected[] = $candidate;
            }
        }

        return $collected;
    }

    /**
     * List the distinct (href, name) pairs of anchors whose attributes mention
     * "staff-item" and whose text reads like a person's name.
     *
     * Only anchors with tag-free text are considered; the href is
     * entity-decoded but not resolved against any base URL.
     *
     * @return list<array{href:string,name:string}>
     */
    protected function extractStaffItemLinks(string $html): array
    {
        if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
            return [];
        }

        // Keyed by "href|name" so each pair is kept once, in first-seen order.
        $unique = [];

        foreach ($anchors as $anchor) {
            $anchorAttrs = (string) $anchor[1];

            $isStaffAnchor = str_contains($anchorAttrs, 'staff-item');
            if (! $isStaffAnchor || ! preg_match('#\bhref="([^"]+)"#u', $anchorAttrs, $hrefHit)) {
                continue;
            }

            $name = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            $href = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $unique[$href.'|'.$name] ??= ['href' => $href, 'name' => $name];
        }

        return array_values($unique);
    }

    /**
     * Build the faculty item for one staff-panel link, or return null when the
     * name and department together do not match the keywords.
     *
     * The department doubles as the item's affiliation; the university falls
     * back to one derived from that affiliation when the page-level value is
     * null.
     *
     * @param  array{href:string,name:string}  $link
     * @param  list<string>  $keywords
     */
    protected function makeStaffPanelItem(
        array $link,
        ?string $department,
        ?string $pageUniversity,
        array $keywords,
        string $sourceUrl,
    ): ?CrawlItemDto {
        ['href' => $href, 'name' => $name] = $link;

        $profileUrl = $this->resolveUrl($href, $sourceUrl);

        $searchable = trim($name.' '.($department ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            return null;
        }

        // Identity is the profile URL, or a hash of the name when no URL resolves.
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));

        return $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $department,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
            summary: $department ? '单位：'.$department : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_smse',
            bio: null,
        );
    }

    /**
     * Guess the college / institute name a page belongs to from its metadata.
     *
     * Sources are consulted in order: the part of <title> after the first
     * dash-like separator, the "description" meta tag, then the "SiteName" meta
     * tag. Within the title, a unit name ending in 学院/研究院/学部/系 is
     * preferred and the whole title remainder is used otherwise; the meta tags
     * only count when such a unit name is found in them.
     */
    protected function inferCollegeFromPageTitle(string $html): ?string
    {
        $unit = '[\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系)';
        $anyUnit = '/('.$unit.')/u';
        $unitAfterUniversity = '/大学('.$unit.')/u';

        if (preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $titleHit)) {
            $titleRest = CrawlAuthorParser::cleanText($titleHit[1]);
            if ($titleRest !== null && $titleRest !== '') {
                return preg_match($anyUnit, $titleRest, $college)
                    ? CrawlAuthorParser::cleanText($college[1])
                    : $titleRest;
            }
        }

        if (preg_match('/<meta\s+name="description"\s+content="([^"]+)"/u', $html, $descHit)) {
            $description = CrawlAuthorParser::cleanText(
                html_entity_decode($descHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8')
            );
            if ($description !== null && $description !== '') {
                // Prefer the unit that directly follows "大学"; otherwise take the first unit found.
                foreach ([$unitAfterUniversity, $anyUnit] as $pattern) {
                    if (preg_match($pattern, $description, $college)) {
                        return CrawlAuthorParser::cleanText($college[1]);
                    }
                }
            }
        }

        if (preg_match('/<meta\s+name=[\'"]SiteName[\'"]\s+content=[\'"]([^\'"]+)[\'"]/u', $html, $siteHit)) {
            $siteName = CrawlAuthorParser::cleanText(
                html_entity_decode($siteHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8')
            );
            $hasSiteName = $siteName !== null && $siteName !== '';
            if ($hasSiteName && preg_match($unitAfterUniversity, $siteName, $college)) {
                return CrawlAuthorParser::cleanText($college[1]);
            }
        }

        return null;
    }

    /**
     * Fill in an item's missing academic title and department from the HTML of
     * the person's profile page.
     *
     * The title is taken from the first <em> element, and only when the lead
     * author has none yet; the department is taken from the "所属二级机构"
     * labelled field, and only when the lead author has neither a college nor
     * an affiliation. When nothing new is found the original item is returned
     * untouched; otherwise a new DTO is returned with the lead author, the
     * top-level extra fields and the first parsed author updated.
     */
    protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
    {
        $storedLead = $item->extra['lead_author'] ?? null;
        $lead = is_array($storedLead) ? $storedLead : [];

        $foundTitle = null;
        if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $emHit)) {
            $candidate = CrawlAuthorParser::cleanText($emHit[1]);
            if ($candidate !== null && $candidate !== '') {
                $foundTitle = $candidate;
            }
        }

        $foundDepartment = null;
        if (empty($lead['college']) && empty($lead['affiliation'])) {
            $candidate = $this->parseLabeledField($html, '所属二级机构');
            if ($candidate !== null && $candidate !== '') {
                $foundDepartment = $candidate;
            }
        }

        if ($foundTitle === null && $foundDepartment === null) {
            return $item;
        }

        if ($foundTitle !== null) {
            $lead['academic_title'] = $foundTitle;
        }
        if ($foundDepartment !== null) {
            $lead['affiliation'] = $foundDepartment;
            $lead['college'] = $foundDepartment;
        }

        $hasTitle = ! empty($lead['academic_title']);
        $hasCollege = ! empty($lead['college']);

        $extra = $item->extra;
        $extra['lead_author'] = $lead;
        if ($hasTitle) {
            $extra['academic_title'] = $lead['academic_title'];
        }
        if ($hasCollege) {
            $extra['college_name'] = $lead['college'];
        }

        // Mirror the values onto the first parsed author, when there is one.
        $authorsParsed = $item->authorsParsed;
        if ($authorsParsed !== []) {
            if ($hasTitle) {
                $authorsParsed[0]['academic_title'] = $lead['academic_title'];
            }
            if ($hasCollege) {
                $authorsParsed[0]['affiliation'] = $lead['college'];
            }
        }

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Parses a structured faculty list page (tsites-style `<li>` cards) into crawl items.
     *
     * The college name is read from the first `<div class="title">` on the page. Cards are
     * taken from `<div class="list"><ul>…</ul></div>` when that wrapper is present, otherwise
     * every `<li>` of the page is considered. A card is kept only when it contains a
     * `<div class="name">` whose text looks like a person name and the card text matches the
     * keyword filter. Items are de-duplicated by profile URL, or by name when no URL is known.
     * Items produced here never carry an email address.
     *
     * @param  string  $html  HTML of the list page
     * @param  list<string>  $keywords  keyword filter; an empty list keeps every card
     * @param  string  $sourceUrl  URL the page was fetched from; used to resolve relative links
     * @return list<CrawlItemDto>
     */
    protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        $collegeName = null;
        if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
            $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
        }

        // Narrow the search to the list container when the page has one.
        $listHtml = $html;
        if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
            $listHtml = $listMatch[1];
        }

        // Only bare `<li>` tags (without attributes) are recognised as cards.
        if (! preg_match_all('#<li>(.*?)</li>#su', $listHtml, $liBlocks)) {
            return [];
        }

        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

        foreach ($liBlocks[1] as $inner) {
            $inner = (string) $inner;
            if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
                continue;
            }

            // cleanText() is treated as nullable elsewhere in this class; normalise to a string
            // so a null result skips the card instead of raising a TypeError in
            // looksLikePersonName(), which only accepts strings.
            $name = (string) CrawlAuthorParser::cleanText($nameMatch[1]);
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            $href = '';
            if (preg_match('/<a\s+[^>]*href="([^"]*)"/u', $inner, $hrefMatch)) {
                $href = (string) $hrefMatch[1];
            }

            // Prefer the card's own link; otherwise look for the profile path in page scripts.
            $profileUrl = $this->resolveUrl($href, $sourceUrl)
                ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);
            $plain = $this->htmlToPlain($inner);

            if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
                continue;
            }

            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }
            $seen[$dedupeKey] = true;

            $affiliation = $this->parseLabeledField($inner, '所在单位')
                ?? $collegeName;
            $academicTitle = $this->parseLabeledField($inner, '职称');
            // On list pages the "所在单位" field is usually the college, so the university
            // name is inferred from the site host / page header first.
            $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);

            $bio = $this->parseLabeledField($inner, '简介');
            $summaryParts = array_filter([
                $academicTitle ? '职称：'.$academicTitle : null,
                $affiliation ? '单位：'.$affiliation : null,
                $bio,
            ]);

            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $universityName,
                summary: Str::limit(implode('；', $summaryParts), 300),
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_tsites',
                bio: $bio,
            );
        }

        return $items;
    }

    /**
     * Assembles the CrawlItemDto that represents one faculty member.
     *
     * The person's name is used as both the item title and the authors string. The
     * affiliation doubles as the college name, and the same person data is exposed three
     * ways: as flat keys in `extra`, as `extra['lead_author']`, and as the single entry of
     * `authorsParsed`.
     *
     * @param  list<string>  $keywords  keywords of the crawl; stored space-joined in `extra['keyword']`
     */
    protected function makeFacultyItem(
        string $externalKey,
        string $name,
        ?string $profileUrl,
        ?string $email,
        ?string $affiliation,
        ?string $universityName,
        ?string $summary,
        array $keywords,
        ?string $academicTitle,
        string $platform,
        ?string $bio = null,
    ): CrawlItemDto {
        // Single-author list consumed by downstream author handling.
        $parsedAuthors = [
            [
                'name' => $name,
                'email' => $email,
                'affiliation' => $affiliation,
                'university_name' => $universityName,
                'academic_title' => $academicTitle,
            ],
        ];

        // Richer description of the same person, kept under extra['lead_author'].
        $leadAuthor = [
            'name' => $name,
            'email' => $email,
            'affiliation' => $affiliation,
            'college' => $affiliation,
            'university_name' => $universityName,
            'academic_title' => $academicTitle,
            'bio' => $bio,
            'profile_url' => $profileUrl,
        ];

        $extraData = [
            'platform' => $platform,
            'academic_title' => $academicTitle,
            'college_name' => $affiliation,
            'bio' => $bio,
            'profile_url' => $profileUrl,
            'lead_author' => $leadAuthor,
            'keyword' => implode(' ', $keywords),
        ];

        return new CrawlItemDto(
            externalId: $externalKey,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: $summary,
            schoolName: $universityName,
            extra: $extraData,
            authorsParsed: $parsedAuthors,
        );
    }

    /**
     * Tells whether the text contains at least one of the keywords (case-insensitive).
     *
     * An empty keyword list means "no filter" and always matches. Empty-string keywords are
     * ignored. Matching is multibyte-aware, so case folding also works for non-ASCII letters
     * (a byte-wise stripos() only folds ASCII).
     *
     * @param  string  $plain  plain text to search in
     * @param  list<string>  $keywords
     * @return bool true when no keywords were given or any keyword occurs in $plain
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $kw) {
            if ($kw !== '' && mb_stripos($plain, $kw, 0, 'UTF-8') !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reduces an HTML fragment to plain text: tags are stripped, HTML entities are decoded
     * and every run of whitespace is collapsed to a single space.
     *
     * Leading/trailing whitespace is collapsed but not trimmed. An empty string is returned
     * when the whitespace replacement fails.
     */
    protected function htmlToPlain(string $html): string
    {
        $withoutTags = strip_tags($html);
        $decoded = html_entity_decode($withoutTags, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $collapsed = preg_replace('/\s+/u', ' ', $decoded);

        // preg_replace() yields null on failure; fall back to an empty string in that case.
        return $collapsed === null ? '' : $collapsed;
    }

    /**
     * Extracts the value that follows "<label>：" (or "<label>:") in an HTML fragment.
     *
     * The value runs from after the colon and optional whitespace up to the next `<`, and is
     * passed through CrawlAuthorParser::cleanText(). Only the first occurrence is used.
     *
     * @param  string  $html  fragment to search
     * @param  string  $label  literal label text (regex metacharacters are escaped)
     * @return string|null cleaned value, or null when the label is not found
     */
    protected function parseLabeledField(string $html, string $label): ?string
    {
        $quotedLabel = preg_quote($label, '/');
        $found = preg_match('/'.$quotedLabel.'[：:]\s*([^<]+)/u', $html, $captures);

        return $found
            ? CrawlAuthorParser::cleanText($captures[1])
            : null;
    }

    /**
     * Heuristically decides whether a string is a person name rather than page chrome.
     *
     * Accepted shapes:
     *  - 2-4 CJK characters, optionally followed by "·" and another 2-4 CJK characters;
     *  - an ASCII letter followed by 1-30 letters, whitespace, dots or hyphens.
     * Strings starting with a navigation label (首页, 登录, 联系我们, 下页, 尾页, 转到) are rejected.
     */
    protected function looksLikePersonName(string $name): bool
    {
        $navigationLabel = '/^(首页|登录|联系我们|下页|尾页|转到)/u';
        if (preg_match($navigationLabel, $name)) {
            return false;
        }

        $chineseName = '/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u';
        if (preg_match($chineseName, $name)) {
            return true;
        }

        $latinName = '/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/';

        return (bool) preg_match($latinName, $name);
    }

    /**
     * Resolves an href found on a page against the URL of that page.
     *
     * Returns null for hrefs that do not point to another document: empty values,
     * fragment-only links ("#…") and javascript:/mailto:/tel:/data: URIs (scheme compared
     * case-insensitively). Absolute http(s) URLs are returned unchanged. When $baseUrl has no
     * scheme or host the decoded href is returned as-is.
     *
     * Relative references are resolved along the lines of RFC 3986 §5.2: protocol-relative
     * ("//host/…"), root-relative ("/…"), query-only ("?…", keeps the base document path) and
     * path-relative hrefs, with "." and ".." segments removed from path-relative results.
     *
     * @param  string  $href  raw attribute value (HTML entities are decoded here)
     * @param  string  $baseUrl  URL of the page the href was found on
     * @return string|null resolved URL, or null when the href is not a navigable link
     */
    protected function resolveUrl(string $href, string $baseUrl): ?string
    {
        $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));

        // A bare fragment points back at the list page itself; treating it as a profile URL
        // would give every card the same URL and collapse them during de-duplication.
        if ($href === '' || str_starts_with($href, '#')) {
            return null;
        }

        // Non-navigational schemes must not be glued onto the page origin.
        if (preg_match('#^(?:javascript|mailto|tel|data):#i', $href)) {
            return null;
        }

        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }

        $base = parse_url($baseUrl);
        if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            return $href;
        }

        if (str_starts_with($href, '//')) {
            return $base['scheme'].':'.$href;
        }

        $origin = $base['scheme'].'://'.$base['host'];
        if (! empty($base['port'])) {
            $origin .= ':'.$base['port'];
        }

        if (str_starts_with($href, '/')) {
            return $origin.$href;
        }

        $path = $base['path'] ?? '/';

        // A query-only reference keeps the base document and replaces just its query string.
        if (str_starts_with($href, '?')) {
            return $origin.$path.$href;
        }

        $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';
        $joined = $dir.$href;

        // Normalise dot segments in the path part only; query and fragment stay untouched.
        $pathLength = strcspn($joined, '?#');
        $parts = explode('/', substr($joined, 0, $pathLength));
        $segments = [];
        foreach ($parts as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Never pop the leading empty segment, so the path cannot climb above the root.
                if (count($segments) > 1) {
                    array_pop($segments);
                }

                continue;
            }
            $segments[] = $segment;
        }

        // A trailing "." or ".." refers to a directory, so keep the trailing slash.
        $lastPart = end($parts);
        if ($lastPart === '.' || $lastPart === '..') {
            $segments[] = '';
        }

        return $origin.implode('/', $segments).substr($joined, $pathLength);
    }

    /**
     * Looks for the person's profile path in inline `addimg(...)` script calls of the page.
     *
     * The expected call shape is `addimg(<quoted arg>, "/…index.htm", "<name>"`: the second
     * argument must be a double-quoted, root-relative path ending in "index.htm" and the
     * third argument must equal $name exactly. The first matching call wins.
     *
     * @param  string  $html  HTML of the whole page, including its scripts
     * @param  string  $name  person name to look for (regex metacharacters are escaped)
     * @param  string  $sourceUrl  page URL used to resolve the path that was found
     * @return string|null resolved profile URL, or null when no matching call exists
     */
    protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
    {
        $quotedName = preg_quote($name, '/');
        $pattern = '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$quotedName.'"/u';

        if (preg_match($pattern, $html, $captures)) {
            return $this->resolveUrl($captures[1], $sourceUrl);
        }

        return null;
    }

    /**
     * Infers the university name for a page, first from its host and then from its text.
     *
     * The host of $sourceUrl is compared against a table of known university domains; it
     * matches when it equals the domain or is a subdomain of it, so hosts that merely contain
     * the domain text elsewhere are not misattributed. When the host is unknown, the first
     * "…大学" phrase (2-20 CJK/Latin characters followed by 大学) in the page text is used.
     *
     * @param  string  $sourceUrl  URL the page was fetched from
     * @param  string  $html  HTML of the page, used only for the text fallback
     * @return string|null university name, or null when nothing could be inferred
     */
    protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
    {
        $knownDomains = [
            'sjtu.edu.cn' => '上海交通大学',
            'tsinghua.edu.cn' => '清华大学',
            'pku.edu.cn' => '北京大学',
            'zju.edu.cn' => '浙江大学',
            'fudan.edu.cn' => '复旦大学',
            'nju.edu.cn' => '南京大学',
        ];

        $host = parse_url($sourceUrl, PHP_URL_HOST);
        if (is_string($host)) {
            // Hosts are case-insensitive and may carry a trailing root dot.
            $host = rtrim(strtolower($host), '.');
            foreach ($knownDomains as $domain => $university) {
                if ($host === $domain || str_ends_with($host, '.'.$domain)) {
                    return $university;
                }
            }
        }

        if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match)) {
            return CrawlAuthorParser::cleanText($match[1]);
        }

        return null;
    }

    /**
     * Guesses a display name for an email-based entry.
     *
     * The first run of 2-4 CJK characters in $plain is used. When an academic title
     * (副教授, 教授, 讲师, 研究员, 博士, 老师) directly follows a shorter prefix of that run, the
     * title is left out of the name, so "张三教授" yields "张三" rather than "张三教授".
     * If $plain contains no such run, the name is derived from the local part of the email:
     * ".", "_" and "-" become spaces and the result is title-cased.
     *
     * @param  string  $plain  plain text surrounding the email address
     * @param  string  $email  email address used as a fallback source for the name
     */
    protected function guessName(string $plain, string $email): string
    {
        $cjk = '[\x{4e00}-\x{9fff}]';
        // Both alternatives are tried at the same position before the engine moves on, so the
        // match always starts at the first CJK run. The lazy, title-anchored alternative only
        // shortens the capture when a title follows; otherwise the greedy one applies.
        $pattern = '/('.$cjk.'{2,4}?)\s*(?:副教授|教授|讲师|研究员|博士|老师)|('.$cjk.'{2,4})/u';

        if (preg_match($pattern, $plain, $m)) {
            // Group 2 exists only when the greedy alternative matched (group 1 is then empty).
            return trim($m[2] ?? $m[1]);
        }

        $local = strstr($email, '@', true) ?: '';
        $local = str_replace(['.', '_', '-'], ' ', $local);

        return Str::title(trim($local));
    }

    /**
     * Guesses an affiliation from free text.
     *
     * Picks the first phrase of 2-40 CJK/Latin/whitespace characters that ends in one of
     * 大学, 学院, 研究院, 研究所, "University" or "College", and passes it through
     * CrawlAuthorParser::cleanText().
     *
     * @return string|null cleaned affiliation, or null when no such phrase is found
     */
    protected function guessAffiliation(string $plain): ?string
    {
        $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u';

        return preg_match($pattern, $plain, $captures)
            ? CrawlAuthorParser::cleanText($captures[1])
            : null;
    }
}