You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2142 lines
73 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
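/**
* Generic adapter for department / faculty list pages: entries that expose an e-mail address are preferred; when no e-mail is present it parses list cards such as those on tsites (name, affiliation, academic title, homepage).
*/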
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
{
/**
 * Crawl a faculty list page and return the people found on it.
 *
 * Reads `keyword`, `max_results` (clamped to 1..500, default 30) and
 * `max_pages` (clamped to 1..50, default 1) from $params. The first page is
 * downloaded once and then dispatched: AJAX teacher lists and NJU
 * "teacherHome" pages are handed to their dedicated fetchers, everything else
 * is parsed as static HTML and further list pages are fetched only when the
 * detected page count allows it. Every path ends with the profile-page
 * e-mail enrichment step.
 *
 * @param array<string, mixed> $params
 * @return list<CrawlItemDto>
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
    $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 30)));
    $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

    $baseUrl = $this->normalizeRequestUrl($requestUrl);
    $firstHtml = $this->fetchHtml($baseUrl);

    // Script-driven list pages have their own paginated back ends.
    if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) {
        return $this->enrichEmailsFromProfilePages(
            $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages),
            $params,
        );
    }
    if ($this->isNjuTeacherHomePage($firstHtml)) {
        return $this->enrichEmailsFromProfilePages(
            $this->fetchNjuTeacherHomeItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages),
            $params,
        );
    }

    $pagesToFetch = min($maxPages, $this->detectTotalPages($firstHtml));

    // Page 1 is already in memory: collect its unique items first.
    $collected = [];
    $knownIds = [];
    foreach ($this->extractFromHtml($firstHtml, $keywords, $requestUrl) as $candidate) {
        if (isset($knownIds[$candidate->externalId])) {
            continue;
        }
        $knownIds[$candidate->externalId] = true;
        $collected[] = $candidate;
        if (count($collected) >= $maxResults) {
            break;
        }
    }

    // Only go back to the network when more pages exist and there is room left.
    if ($pagesToFetch > 1 && count($collected) < $maxResults) {
        $collected = $this->fetchRemainingListPages(
            $baseUrl,
            $firstHtml,
            $pagesToFetch,
            $keywords,
            $requestUrl,
            $collected,
            $knownIds,
            $maxResults,
        );
    }

    return $this->enrichEmailsFromProfilePages($collected, $params);
}
/**
 * Download list pages 2..$pagesToFetch in pooled batches and append their
 * previously unseen items to $merged.
 *
 * Batches are processed in ascending page order and the method returns as
 * soon as $maxResults items have been collected. Pages whose download yields
 * no body are simply absent from the batch result and are skipped.
 *
 * @param list<CrawlItemDto> $merged  items collected so far
 * @param array<string, true> $seen   external ids already present in $merged
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function fetchRemainingListPages(
    string $baseUrl,
    string $firstHtml,
    int $pagesToFetch,
    array $keywords,
    string $requestUrl,
    array $merged,
    array $seen,
    int $maxResults,
): array {
    $concurrency = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5)));

    $urlsByPage = [];
    $pageNumber = 2;
    while ($pageNumber <= $pagesToFetch) {
        $urlsByPage[$pageNumber] = $this->buildPageUrl($baseUrl, $pageNumber, $firstHtml);
        $pageNumber++;
    }

    foreach (array_chunk($urlsByPage, $concurrency, true) as $batch) {
        $bodies = $this->fetchHtmlPool($batch);
        ksort($bodies);
        foreach ($bodies as $body) {
            foreach ($this->extractFromHtml($body, $keywords, $requestUrl) as $candidate) {
                if (! isset($seen[$candidate->externalId])) {
                    $seen[$candidate->externalId] = true;
                    $merged[] = $candidate;
                    if (count($merged) >= $maxResults) {
                        return $merged;
                    }
                }
            }
        }
    }

    return $merged;
}
/**
 * Fetch several list pages concurrently and return the bodies that loaded.
 *
 * Each request is attempted at most twice: Laravel's retry() takes the total
 * number of attempts, so the previous value of 1 never retried. The retry is
 * limited to connection failures and waits 300 ms.
 *
 * @param array<int, string> $pageUrls page number => URL
 * @return array<int, string> page number => HTML body; pages that failed or
 *                            returned an empty body are omitted
 */
protected function fetchHtmlPool(array $pageUrls): array
{
    if ($pageUrls === []) {
        return [];
    }

    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    $responses = Http::pool(static function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout) {
        foreach ($pageUrls as $page => $url) {
            $pool->as((string) $page)
                ->timeout($timeout)
                ->connectTimeout(min(8, $timeout))
                // 2 = one initial attempt + one retry.
                ->retry(2, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                ->withHeaders($headers)
                ->get($url);
        }
    });

    $htmlByPage = [];
    foreach (array_keys($pageUrls) as $page) {
        // A pool slot may hold an exception instead of a response; the helper maps that to null.
        $body = $this->responseBodyFromPoolResult($responses[(string) $page] ?? null);
        if ($body !== null && $body !== '') {
            $htmlByPage[$page] = $body;
        }
    }

    return $htmlByPage;
}
/**
 * Visit the profile pages of items that lack an e-mail address and merge in
 * whatever e-mail / profile metadata can be extracted from them.
 *
 * - Disabled via `crawl.faculty.profile_email_enrich_enabled`: items are
 *   returned untouched.
 * - A budget of 0 (see resolveProfileEnrichMax): every item without an
 *   e-mail is flagged `profile_enrich_skipped`.
 * - Otherwise the first N eligible items (no e-mail, has a canonical URL)
 *   are fetched in concurrent batches; eligible items beyond the budget are
 *   flagged as skipped. Item order is preserved.
 *
 * Each profile request is attempted at most twice: Laravel's retry() takes
 * the total number of attempts, so the previous value of 1 never retried.
 *
 * @param list<CrawlItemDto> $items
 * @param array<string, mixed> $params
 * @return list<CrawlItemDto>
 */
protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
{
    if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
        return $items;
    }

    $maxEnrich = $this->resolveProfileEnrichMax($params, count($items));
    if ($maxEnrich <= 0) {
        return $this->markProfileEnrichSkipped($items);
    }

    $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
    $timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    // Select the items to visit, keyed by their position in $items.
    $fetchMap = [];
    foreach ($items as $index => $item) {
        if (count($fetchMap) >= $maxEnrich) {
            break;
        }
        if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
            continue;
        }
        $fetchMap[$index] = $item;
    }
    if ($fetchMap === []) {
        return $items;
    }

    // Position => item after the profile visit (unchanged when the fetch failed).
    $visited = [];
    foreach (array_chunk($fetchMap, $poolSize, true) as $batch) {
        $responses = Http::pool(static function (\Illuminate\Http\Client\Pool $pool) use ($batch, $headers, $timeout) {
            foreach ($batch as $index => $item) {
                $pool->as((string) $index)
                    ->timeout($timeout)
                    ->connectTimeout(min(8, $timeout))
                    // 2 = one initial attempt + one retry.
                    ->retry(2, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                    ->withHeaders($headers)
                    ->get($item->canonicalUrl);
            }
        });

        foreach ($batch as $index => $item) {
            $body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null);
            if ($body !== null) {
                $email = $this->extractEmailFromProfileHtml($body);
                if ($email) {
                    $item = $this->applyEmailToItem($item, $email);
                }
                $item = $this->applyProfileMetadataToItem($item, $body);
            }
            $visited[$index] = $item;
        }
    }

    $result = [];
    foreach ($items as $index => $item) {
        if (isset($visited[$index])) {
            $result[] = $visited[$index];
        } elseif (! $this->itemHasEmail($item) && $item->canonicalUrl) {
            // Eligible but beyond the budget: record that no visit took place.
            $result[] = $this->markItemProfileEnrichSkipped($item);
        } else {
            $result[] = $item;
        }
    }

    return $result;
}
/**
 * Work out how many profile pages may be visited for this crawl.
 *
 * `skip_profile_enrich` is interpreted as a boolean flag, so besides `true`
 * it also honours `1`, `'1'`, `'true'`, `'on'` and `'yes'`, as produced by
 * query strings, CLI options or decoded form data. The previous strict
 * `=== true` comparison silently ignored those. Otherwise the budget is
 * `profile_enrich_max` from $params (falling back to the config value,
 * default 32), clamped to 0..200 and never larger than $itemCount.
 *
 * @param array<string, mixed> $params
 * @return int number of items that may be enriched; 0 disables enrichment
 */
protected function resolveProfileEnrichMax(array $params, int $itemCount): int
{
    if (filter_var($params['skip_profile_enrich'] ?? false, FILTER_VALIDATE_BOOLEAN)) {
        return 0;
    }

    $configured = (int) ($params['profile_enrich_max'] ?? config('crawl.faculty.profile_enrich_max', 32));

    return max(0, min($itemCount, 200, $configured));
}
/**
 * Flag every item in the list as "profile enrichment skipped".
 *
 * Delegates to markItemProfileEnrichSkipped() for each element, so items
 * that already carry an e-mail address come back unchanged.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
protected function markProfileEnrichSkipped(array $items): array
{
    $flagged = [];
    foreach ($items as $position => $entry) {
        $flagged[$position] = $this->markItemProfileEnrichSkipped($entry);
    }

    return $flagged;
}
/**
 * Return a copy of the item with `extra['profile_enrich_skipped'] = true`.
 *
 * Items that already have an e-mail address are returned as-is, because
 * there was nothing to enrich in the first place.
 */
protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
{
    if (! $this->itemHasEmail($item)) {
        $flaggedExtra = $item->extra;
        $flaggedExtra['profile_enrich_skipped'] = true;

        // The DTO is rebuilt rather than mutated; every other field is copied over.
        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $flaggedExtra,
            authorsParsed: $item->authorsParsed,
        );
    }

    return $item;
}
/**
 * Turn one slot of an Http::pool() result into a body string.
 *
 * @param mixed $result a Response, or anything else the pool stored for a
 *                      failed request (or null when the slot is missing)
 * @return string|null the body of a successful response, otherwise null
 */
protected function responseBodyFromPoolResult(mixed $result): ?string
{
    if (! ($result instanceof Response)) {
        return null;
    }
    if (! $result->successful()) {
        return null;
    }

    return (string) $result->body();
}
/**
 * Tell whether the item already carries a usable e-mail address.
 *
 * Looks at `extra['lead_author']['email']` first and then at the `email`
 * field of every parsed author; a value counts only if
 * CrawlAuthorParser::normalizeEmail() returns something truthy for it.
 */
protected function itemHasEmail(CrawlItemDto $item): bool
{
    $leadAuthor = $item->extra['lead_author'] ?? null;
    if (is_array($leadAuthor)) {
        if (CrawlAuthorParser::normalizeEmail($leadAuthor['email'] ?? null)) {
            return true;
        }
    }

    foreach ($item->authorsParsed as $parsedAuthor) {
        if (! CrawlAuthorParser::normalizeEmail($parsedAuthor['email'] ?? null)) {
            continue;
        }

        return true;
    }

    return false;
}
/**
 * Return a copy of the item with the given e-mail attached.
 *
 * The address is normalized when possible (the raw value is kept when
 * normalization yields null) and written to `extra['lead_author']['email']`
 * and to the first parsed author. When the item has no parsed authors, a
 * single author entry is synthesized from the item title and the lead-author
 * data.
 */
protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
{
    $address = CrawlAuthorParser::normalizeEmail($email) ?? $email;

    $existingLead = $item->extra['lead_author'] ?? null;
    $leadAuthor = is_array($existingLead) ? $existingLead : [];
    $leadAuthor['email'] = $address;

    $parsedAuthors = $item->authorsParsed;
    if ($parsedAuthors !== []) {
        $parsedAuthors[0]['email'] = $address;
    } else {
        $parsedAuthors = [[
            'name' => $item->title,
            'email' => $address,
            'affiliation' => $leadAuthor['affiliation'] ?? $leadAuthor['college'] ?? null,
            'university_name' => $leadAuthor['university_name'] ?? $item->schoolName,
        ]];
    }

    $updatedExtra = $item->extra;
    $updatedExtra['lead_author'] = $leadAuthor;

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $updatedExtra,
        authorsParsed: $parsedAuthors,
    );
}
/**
 * Pull the most plausible personal e-mail address out of a profile page.
 *
 * Strategy:
 *  1. Labelled addresses, tried in priority order ("电子邮箱" followed by a
 *     closing </strong>, "电子邮箱", "电子信箱", "E-mail"/"Email", "邮箱",
 *     "电子邮件"). The label may end in an ASCII colon or the full-width
 *     colon U+FF1A; the previous patterns accepted the ASCII form only.
 *  2. Otherwise every address-looking token in the page is collected.
 *     Tokens that are really asset file names (e.g. "logo@2x.png") are
 *     dropped, an `.edu.cn` / `.edu` address is preferred, and the first
 *     remaining candidate is the fallback.
 *
 * Addresses rejected by isNoiseEmail() (role mailboxes) are never returned.
 *
 * @return string|null normalized address, or null when nothing usable exists
 */
protected function extractEmailFromProfileHtml(string $html): ?string
{
    $address = '([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})';
    $colon = '[::]';

    // Order matters: the first label that yields a usable address wins.
    $labels = [
        '电子邮箱'.$colon.'\s*<\/strong>',
        '电子邮箱'.$colon,
        '电子信箱'.$colon,
        'E-?mail'.$colon,
        '邮箱'.$colon,
        '电子邮件'.$colon,
    ];
    foreach ($labels as $label) {
        // `i` only affects the Latin "E-mail" label; `u` is needed for the CJK text.
        if (preg_match('/'.$label.'\s*'.$address.'/iu', $html, $match)) {
            $email = CrawlAuthorParser::normalizeEmail($match[1]);
            if ($email && ! $this->isNoiseEmail($email)) {
                return $email;
            }
        }
    }

    $candidates = [];
    if (preg_match_all('#'.$address.'#', $html, $emailMatches)) {
        foreach ($emailMatches[1] as $raw) {
            // Retina-style asset names ("banner@2x.png") match the address shape but are not e-mails.
            if (preg_match('/\.(?:png|jpe?g|gif|svg|webp|bmp|ico|css|js)$/i', $raw)) {
                continue;
            }
            $email = CrawlAuthorParser::normalizeEmail($raw);
            if ($email && ! $this->isNoiseEmail($email)) {
                $candidates[] = $email;
            }
        }
    }
    if ($candidates === []) {
        return null;
    }

    $candidates = array_values(array_unique($candidates));
    foreach ($candidates as $email) {
        if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) {
            return $email;
        }
    }

    return $candidates[0];
}
/**
 * Tell whether the address is a generic role mailbox rather than a person.
 *
 * The part before the first "@" is compared case-insensitively against a
 * fixed list of role names; strings without "@" are never noise.
 */
protected function isNoiseEmail(string $email): bool
{
    $atPosition = strpos($email, '@');
    if ($atPosition === false) {
        return false;
    }

    $mailbox = strtolower(substr($email, 0, $atPosition));
    $roleMailboxes = [
        'noreply',
        'no-reply',
        'admin',
        'webmaster',
        'postmaster',
        'root',
        'support',
        'service',
        'info',
        'contact',
    ];

    return in_array($mailbox, $roleMailboxes, true);
}
/**
 * Download one HTML page.
 *
 * The request is attempted at most twice: Laravel's retry() takes the total
 * number of attempts, so the previous value of 1 never retried. The retry is
 * limited to connection failures and waits 300 ms.
 *
 * @throws \RuntimeException when the server answers with a non-2xx status;
 *                           the message now separates the status code from
 *                           the URL (they used to be glued together)
 */
protected function fetchHtml(string $url): string
{
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));

    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        // 2 = one initial attempt + one retry.
        ->retry(2, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
        ->get($url);

    if (! $response->successful()) {
        throw new \RuntimeException(sprintf('页面请求失败 (HTTP %d): %s', $response->status(), $url));
    }

    return (string) $response->body();
}
/**
 * Estimate how many list pages exist, based on the first page's HTML.
 *
 * An explicit `totalpage=N` marker wins. Otherwise, when the page shows a
 * "共 N 条" total and a `<div class="list"><ul>…</ul></div>` block, the page
 * count is the total divided by the number of `<div class="name">` entries
 * in that block, rounded up. Falls back to 1.
 *
 * @return int always >= 1
 */
protected function detectTotalPages(string $html): int
{
    if (preg_match('/totalpage=(\d+)/i', $html, $explicit)) {
        return max(1, (int) $explicit[1]);
    }

    $hasTotal = preg_match('/共\s*(\d+)\s*条/u', $html, $total);
    $hasList = $hasTotal
        && preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $list);
    if (! $hasList) {
        return 1;
    }

    $entriesPerPage = preg_match_all('/<div\s+class="name">/u', $list[1]) ?: 0;
    if ($entriesPerPage <= 0) {
        return 1;
    }

    return max(1, (int) ceil(((int) $total[1]) / $entriesPerPage));
}
/**
 * Build the URL of list page $page from the first page's URL.
 *
 * Keeps scheme, host, port, path and the existing query parameters, sets
 * `PAGENUM` to the requested page and, when the first page advertises a
 * `totalpage=N` marker, passes that value along as `totalpage`. A URL
 * without scheme or host is returned unchanged.
 */
protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
{
    $components = parse_url($baseUrl);
    if (! is_array($components) || empty($components['scheme']) || empty($components['host'])) {
        return $baseUrl;
    }

    parse_str((string) ($components['query'] ?? ''), $queryParams);
    $queryParams['PAGENUM'] = (string) $page;
    if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $advertised)) {
        $queryParams['totalpage'] = $advertised[1];
    }

    $authority = $components['scheme'].'://'.$components['host'];
    if (! empty($components['port'])) {
        $authority .= ':'.$components['port'];
    }

    // PAGENUM is always present, so the query string is never empty here.
    return $authority.($components['path'] ?? '/').'?'.http_build_query($queryParams);
}
/**
 * Rebuild a URL from scheme, host, optional port, path and query only.
 *
 * Anything else (fragment, credentials) is dropped and a missing path
 * becomes "/". A URL without scheme or host is returned unchanged.
 */
protected function normalizeRequestUrl(string $url): string
{
    $components = parse_url($url);
    if (! is_array($components) || empty($components['scheme']) || empty($components['host'])) {
        return $url;
    }

    $portSuffix = ! empty($components['port']) ? ':'.$components['port'] : '';
    $querySuffix = ! empty($components['query']) ? '?'.$components['query'] : '';

    return $components['scheme'].'://'.$components['host']
        .$portSuffix
        .($components['path'] ?? '/')
        .$querySuffix;
}
/**
 * Parse one list page with the first layout-specific extractor that finds
 * anything.
 *
 * Extractors are tried in a fixed order (Sudy news list, "ul.teacher"
 * cards, VSB table, e-mail blocks, structured faculty list); the staff
 * panel extractor is the last resort and its result is returned even when
 * empty.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
{
    $extractorsInPriorityOrder = [
        'extractFromSudyNewsFacultyList',
        'extractFromRaTeacherList',
        'extractFromVsbFacultyTable',
        'extractFromEmailBlocks',
        'extractFromStructuredFacultyList',
    ];

    foreach ($extractorsInPriorityOrder as $extractor) {
        $found = $this->{$extractor}($html, $keywords, $sourceUrl);
        if ($found !== []) {
            return $found;
        }
    }

    return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
}
/**
 * Extractor for NJU Sudy CMS pages: `ul.news_list` entries whose
 * `news_title` / `news_title1` element links to a person (used by sites
 * such as "frontier" and "ic").
 *
 * When the page is split into `li.wp_sublist` sections, each section's
 * `subcolumn-name` becomes the department of the people listed in it;
 * otherwise the whole page is treated as one section under the college
 * inferred from the page title (or from the `col_title` heading).
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromSudyNewsFacultyList(string $html, array $keywords, string $sourceUrl): array
{
    if (preg_match('/class="news_list/u', $html) !== 1) {
        return [];
    }

    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $college = $this->inferCollegeFromPageTitle($html);
    if ($college === null && preg_match('#<li class="col_title"><h2>([^<]+)</h2>#u', $html, $heading)) {
        $college = CrawlAuthorParser::cleanText($heading[1]);
    }

    // Shared across sections so one person is not emitted twice.
    $seen = [];

    $sections = preg_split('#<li class="wp_sublist#u', $html) ?: [];
    if (count($sections) <= 1) {
        return $this->extractSudyNewsLinksFromChunk($html, $college, $keywords, $sourceUrl, $university, $seen);
    }

    $collected = [];
    // The first piece is whatever precedes the first section marker.
    foreach (array_slice($sections, 1) as $section) {
        $department = preg_match('#subcolumn-name">([^<]+)</span>#u', $section, $label)
            ? CrawlAuthorParser::cleanText($label[1])
            : $college;

        $found = $this->extractSudyNewsLinksFromChunk(
            $section,
            $department,
            $keywords,
            $sourceUrl,
            $university,
            $seen,
        );
        $collected = array_merge($collected, $found);
    }

    return $collected;
}
/**
 * Turn the `news_title` / `news_title1` links inside one HTML chunk into
 * faculty items.
 *
 * A link is kept only when its text looks like a person name, it has an
 * href that looks like a teacher profile URL, it has not been seen before
 * and "name + department" matches the keywords. Accepted links are recorded
 * in $seen (by resolved profile URL, or by a hash of the name when no URL
 * could be resolved).
 *
 * @param array<string, true> $seen dedupe registry, updated in place
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractSudyNewsLinksFromChunk(
    string $chunk,
    ?string $department,
    array $keywords,
    string $sourceUrl,
    ?string $pageUniversity,
    array &$seen,
): array {
    $found = preg_match_all(
        '#<(?:div|span)\s+class="news_title1?">\s*<a\b([^>]*?)>([^<]+)</a>#su',
        $chunk,
        $links,
        PREG_SET_ORDER,
    );
    if (! $found) {
        return [];
    }

    $people = [];
    foreach ($links as [, $rawAttributes, $rawName]) {
        $name = CrawlAuthorParser::cleanText($rawName) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', (string) $rawAttributes, $hrefMatch)) {
            continue;
        }
        $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! $this->looksLikeTeacherProfileUrl($href, null)) {
            continue;
        }

        $profileUrl = $this->resolveUrl($href, $sourceUrl);
        $identity = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$identity])) {
            continue;
        }

        $searchable = trim($name.' '.($department ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        $seen[$identity] = true;
        $people[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($identity),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $department,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
            summary: $department ? '单位:'.$department : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_sudy_news',
            bio: null,
        );
    }

    return $people;
}
/**
 * Extractor for "ul.teacher" card lists (e.g. the NJU robotics school site),
 * where each card is an `<a>` wrapping a `div.xm` with the person's name.
 *
 * Academic title ("职称") and research field ("研究方向") are read from the
 * markup that follows the name inside the same link.
 *
 * Changes from the previous version:
 *  - summary parts are joined with a full-width semicolon instead of being
 *    concatenated with no separator;
 *  - `href` may be quoted with single or double quotes, matching the Sudy
 *    news extractor.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromRaTeacherList(string $html, array $keywords, string $sourceUrl): array
{
    if (! preg_match('/<ul class="teacher">/u', $html)) {
        return [];
    }
    if (! preg_match_all(
        '#<a\b([^>]*?)>.*?<div class="xm">([^<]+)</div>(.*?)</a>#su',
        $html,
        $matches,
        PREG_SET_ORDER,
    )) {
        return [];
    }

    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
    $defaultCollege = $this->inferCollegeFromPageTitle($html);

    $items = [];
    $seen = [];
    foreach ($matches as $match) {
        $attrs = (string) $match[1];
        $name = CrawlAuthorParser::cleanText($match[2]) ?? '';
        $tail = (string) $match[3];
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }
        if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) {
            continue;
        }

        $profileUrl = $this->resolveUrl(
            html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'),
            $sourceUrl,
        );
        // Prefer the profile URL as identity; fall back to the name when it cannot be resolved.
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }

        $academicTitle = null;
        if (preg_match('#职称:\s*<span>([^<]+)</span>#u', $tail, $titleMatch)) {
            $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]);
        }
        $researchField = null;
        if (preg_match('#研究方向:\s*<span>([^<]+)</span>#u', $tail, $fieldMatch)) {
            $researchField = CrawlAuthorParser::cleanText($fieldMatch[1]);
        }

        $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
        if (! $this->matchesKeywords($plain, $keywords)) {
            continue;
        }

        $summaryParts = array_filter([
            $defaultCollege ? '单位:'.$defaultCollege : null,
            $academicTitle ? '职称:'.$academicTitle : null,
            $researchField ? '研究方向:'.$researchField : null,
        ]);

        $seen[$dedupeKey] = true;
        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $defaultCollege,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
            summary: $summaryParts !== [] ? implode(';', $summaryParts) : null,
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_ra',
            bio: $researchField,
        );
    }

    return $items;
}
/**
* Extractor for NJU / Tsinghua WebPlus (VSB) faculty table pages (container
* classes such as "zjzjs").
*
* The people are looked up inside the first container that matches, in this
* order: `div.zjzjs`, `div#vsb_content…`, `ul.teach-list…`. Within that
* scope every `<strong>` is treated as a section heading; a person is
* assigned the heading that most recently precedes it, and the heading is
* mapped to an academic title by inferAcademicTitleFromSection(). People are
* collected in two passes: linked names first, then plain table cells that
* contain no link.
*
* @param list<string> $keywords
* @return list<CrawlItemDto>
*/
protected function extractFromVsbFacultyTable(string $html, array $keywords, string $sourceUrl): array
{
// Pick the HTML fragment to scan; each pattern is tried only if the previous one did not match.
$scope = null;
if (preg_match('#<div class="zjzjs">(.*?)</div>#su', $html, $match)) {
$scope = (string) $match[1];
} elseif (preg_match('#<div id="vsb_content[^"]*">(.*?)</div>\s*</div>\s*</div>#su', $html, $match)) {
$scope = (string) $match[1];
} elseif (preg_match('#<ul class="teach-list[^"]*">(.*?)</ul>#su', $html, $match) && trim(strip_tags($match[1])) !== '') {
$scope = (string) $match[1];
}
// No container, or a container with no visible text: this layout does not apply.
if ($scope === null || trim(strip_tags($scope)) === '') {
return [];
}
$pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
$defaultCollege = $this->inferCollegeFromPageTitle($html);
$items = [];
$seen = [];
// Section headings with their offset inside $scope (as reported by PREG_OFFSET_CAPTURE), in document order.
$sectionTitles = [];
if (preg_match_all('#<strong[^>]*>(.*?)</strong>#su', $scope, $sectionMatches, PREG_OFFSET_CAPTURE)) {
foreach ($sectionMatches[1] as $sectionMatch) {
$title = CrawlAuthorParser::cleanText(strip_tags($sectionMatch[0]));
if ($title !== null && $title !== '') {
$sectionTitles[] = [
'offset' => $sectionMatch[1],
'title' => $title,
];
}
}
}
// Returns the title of the last heading located at or before $offset, or null when none precedes it.
$resolveSectionTitle = function (int $offset) use ($sectionTitles): ?string {
$title = null;
foreach ($sectionTitles as $section) {
if ($section['offset'] <= $offset) {
$title = $section['title'];
} else {
// Headings are in ascending offset order, so nothing further can precede $offset.
break;
}
}
return $title;
};
// Validates, de-duplicates and keyword-filters one person, then appends it to $items.
$addItem = function (
string $name,
?string $profileUrl,
?string $sectionTitle,
) use (
$keywords,
$defaultCollege,
$pageUniversity,
&$items,
&$seen,
): void {
if ($name === '' || ! $this->looksLikePersonName($name)) {
return;
}
// Identity is the profile URL when there is one, otherwise a hash of the name.
$dedupeKey = $profileUrl ?: ('name:'.md5($name));
if (isset($seen[$dedupeKey])) {
return;
}
$academicTitle = $this->inferAcademicTitleFromSection($sectionTitle);
$plain = trim($name.' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
if (! $this->matchesKeywords($plain, $keywords)) {
return;
}
$seen[$dedupeKey] = true;
$items[] = $this->makeFacultyItem(
externalKey: 'faculty:'.md5($dedupeKey),
name: $name,
profileUrl: $profileUrl,
email: null,
affiliation: $defaultCollege,
universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
summary: $defaultCollege ? '单位:'.$defaultCollege : null,
keywords: $keywords,
academicTitle: $academicTitle,
platform: 'faculty_html_vsb',
bio: null,
);
};
// Pass 1: names that are links to something that looks like a teacher profile.
if (preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $scope, $linkMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
foreach ($linkMatches as $linkMatch) {
$attrs = (string) $linkMatch[1][0];
// Offset of the whole <a> element inside $scope, used to find its section heading.
$offset = (int) $linkMatch[0][1];
$name = CrawlAuthorParser::cleanText(strip_tags($linkMatch[2][0])) ?? '';
if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
continue;
}
$href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
if (! $this->looksLikeTeacherProfileUrl($href, null)) {
continue;
}
$addItem($name, $this->resolveUrl($href, $sourceUrl), $resolveSectionTitle($offset));
}
}
// Pass 2: table cells holding a bare name; these are added without a profile URL.
if (preg_match_all('#<td[^>]*>(.*?)</td>#su', $scope, $cellMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
foreach ($cellMatches as $cellMatch) {
$cellHtml = (string) $cellMatch[1][0];
$offset = (int) $cellMatch[0][1];
// Cells containing a link were already considered in pass 1.
if (str_contains($cellHtml, '<a ')) {
continue;
}
$name = CrawlAuthorParser::cleanText(strip_tags($cellHtml)) ?? '';
$addItem($name, null, $resolveSectionTitle($offset));
}
}
return $items;
}
/**
 * Map a section heading to a canonical academic title.
 *
 * The checks run from the most specific title to the least specific one.
 * Previously the plain "教授" test ran first and also matched "助理教授"
 * (which contains "教授" but not "副教授"), so the assistant-professor
 * branch could never be reached and such sections were labelled "教授".
 * Headings containing "副教授" still resolve to "副教授" as before.
 *
 * @return string|null canonical title, the cleaned heading when no known
 *                     title is recognised, or null for an empty heading
 */
protected function inferAcademicTitleFromSection(?string $sectionTitle): ?string
{
    if ($sectionTitle === null || $sectionTitle === '') {
        return null;
    }

    // Needle => canonical title. Order matters: "教授" must come after the titles that contain it.
    $knownTitles = [
        '副教授' => '副教授',
        '助理教授' => '准聘助理教授',
        '教授' => '教授',
        '博士后' => '博士后',
        '专职科研' => '专职科研',
    ];
    foreach ($knownTitles as $needle => $canonical) {
        if (str_contains($sectionTitle, $needle)) {
            return $canonical;
        }
    }

    return CrawlAuthorParser::cleanText($sectionTitle);
}
/**
 * Detect list pages that load their teachers from `ajax_teacher_list`.
 *
 * True when the request URL mentions `ajax_teacher_list` (case-insensitive)
 * or the page HTML references `ajax_teacher_list.html` (case-sensitive).
 */
protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool
{
    $urlPointsAtEndpoint = str_contains(strtolower($sourceUrl), 'ajax_teacher_list');

    return $urlPointsAtEndpoint || str_contains($html, 'ajax_teacher_list.html');
}
/**
 * Detect NJU "teacherHome" pages: the HTML must reference `faculty.js` and
 * the `<body>` tag must carry `faculty` as a whole word in its class list.
 */
protected function isNjuTeacherHomePage(string $html): bool
{
    if (! str_contains($html, 'faculty.js')) {
        return false;
    }

    return preg_match('/<body[^>]*class="[^"]*\bfaculty\b/u', $html) === 1;
}
/**
 * Load teachers from an NJU "teacherHome" page through its
 * `generalQuery?queryObj=teacherHome` JSON endpoint.
 *
 * The site id and the currently selected career filters are read from the
 * page HTML, then result pages of 50 rows are requested until $maxPages,
 * $maxResults, the reported `pageCount` or an empty `data` array stops the
 * loop. `exField2` is used as the academic title and `exField1` as the
 * research field.
 *
 * Change from the previous version: summary parts are joined with a
 * full-width semicolon instead of being concatenated with no separator.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function fetchNjuTeacherHomeItems(
    string $requestUrl,
    string $pageHtml,
    array $keywords,
    int $maxResults,
    int $maxPages = 1,
): array {
    $siteId = $this->parseNjuSiteId($pageHtml);
    $filters = $this->parseNjuTeacherHomeFilters($pageHtml);
    $conditions = $this->buildNjuTeacherHomeConditions($filters['career'], $filters['sub_career']);

    $origin = $this->requestOrigin($requestUrl) ?? 'https://is.nju.edu.cn';
    $apiUrl = $origin.'/_wp3services/generalQuery?queryObj=teacherHome';
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $maxPages = max(1, min(50, $maxPages));
    $rows = 50;

    $pageUniversity = $this->inferUniversityFromSource($requestUrl, $pageHtml);
    $defaultCollege = $this->inferCollegeFromPageTitle($pageHtml);

    $items = [];
    $seen = [];
    $pageIndex = 1;
    $pageCount = null;
    while ($pageIndex <= $maxPages && count($items) < $maxResults) {
        $body = $this->requestNjuTeacherHomePage($apiUrl, $siteId, $pageIndex, $rows, $conditions, $timeout);
        // The page count reported by the first response is trusted for the whole run.
        if ($pageCount === null) {
            $pageCount = max(1, (int) ($body['pageCount'] ?? 1));
        }

        $data = $body['data'] ?? [];
        if (! is_array($data) || $data === []) {
            break;
        }

        foreach ($data as $art) {
            if (! is_array($art)) {
                continue;
            }
            $name = CrawlAuthorParser::cleanText((string) ($art['title'] ?? '')) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            $profileUrl = $this->resolveUrl((string) ($art['cnUrl'] ?? ''), $requestUrl);
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }

            $academicTitle = CrawlAuthorParser::cleanText((string) ($art['exField2'] ?? ''));
            $researchField = CrawlAuthorParser::cleanText((string) ($art['exField1'] ?? ''));
            $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $summaryParts = array_filter([
                $defaultCollege ? '单位:'.$defaultCollege : null,
                $academicTitle ? '职称:'.$academicTitle : null,
                $researchField ? '研究领域:'.$researchField : null,
            ]);

            $seen[$dedupeKey] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $defaultCollege,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
                summary: $summaryParts !== [] ? implode(';', $summaryParts) : null,
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_nju_wp',
                bio: $researchField,
            );
            if (count($items) >= $maxResults) {
                break 2;
            }
        }

        if ($pageIndex >= $pageCount) {
            break;
        }
        $pageIndex++;
    }

    return $items;
}
/**
 * Read the numeric site id from the page's `sudy-wp-siteId="…"` attribute.
 *
 * @throws \RuntimeException when the attribute is not present
 */
protected function parseNjuSiteId(string $html): int
{
    $found = preg_match('/sudy-wp-siteId="(\d+)"/', $html, $attribute);
    if (! $found) {
        throw new \RuntimeException('无法解析教师列表站点 IDsiteId');
    }

    return (int) $attribute[1];
}
/**
 * Read the career / sub-career filter currently selected on the page.
 *
 * For each of the two filters a pair of markup variants is tried in order
 * and the first one that matches supplies the (cleaned) label; a filter
 * with no matching markup is reported as null.
 *
 * @return array{career:?string,sub_career:?string}
 */
protected function parseNjuTeacherHomeFilters(string $html): array
{
    $firstLabel = static function (array $patterns) use ($html): ?string {
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $html, $hit)) {
                return CrawlAuthorParser::cleanText($hit[1]);
            }
        }

        return null;
    };

    return [
        'career' => $firstLabel([
            '#class="col_item_link\s+selected"[^>]*title="([^"]+)"#u',
            '#class="col_item_link\s+selected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
        ]),
        'sub_career' => $firstLabel([
            '#class="sub-item[^"]*\sselected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
            '#class="sub-link[^"]*\sselected"[^>]*title="([^"]+)"#u',
        ]),
    ];
}
/**
 * Translate the selected career filters into `generalQuery` conditions.
 *
 * The result always starts with `published = 1`. A recognised sub-career
 * adds a one-element `orConditions` group on `exField2`. A simple career
 * adds a plain `exField2` equality; the grouped careers ("准长聘",
 * "专职科研及博士后") add an `orConditions` group covering their members,
 * but only when no sub-career is selected.
 *
 * @return list<array<string, mixed>>
 */
protected function buildNjuTeacherHomeConditions(?string $career, ?string $subCareer): array
{
    $titleEquals = static fn (string $title): array => ['field' => 'exField2', 'value' => $title, 'judge' => '='];
    $anyTitleOf = static fn (array $titles): array => ['orConditions' => array_map($titleEquals, $titles)];

    $conditions = [
        ['field' => 'published', 'value' => '1', 'judge' => '='],
    ];

    $knownSubCareers = ['长聘副教授', '准聘副教授', '准聘助理教授', '专职科研', '博士后'];
    if (in_array($subCareer, $knownSubCareers, true)) {
        $conditions[] = $anyTitleOf([$subCareer]);
    }

    if ($career === null || $career === '') {
        return $conditions;
    }

    $noSubCareer = $subCareer === null || $subCareer === '';
    $simpleCareers = ['教授', '副教授', '兼职教授', '行政管理人员'];
    $groupedCareers = [
        '准长聘' => ['长聘副教授', '准聘副教授', '准聘助理教授'],
        '专职科研及博士后' => ['专职科研', '博士后'],
    ];

    if (in_array($career, $simpleCareers, true)) {
        $conditions[] = $titleEquals($career);
    } elseif ($noSubCareer && isset($groupedCareers[$career])) {
        $conditions[] = $anyTitleOf($groupedCareers[$career]);
    }

    return $conditions;
}
/**
 * Request one result page from the NJU `teacherHome` query endpoint.
 *
 * Sends a form-encoded POST whose `orders`, `returnInfos` and `conditions`
 * fields are JSON strings. The request is attempted at most twice:
 * Laravel's retry() takes the total number of attempts, so the previous
 * value of 1 never retried. The retry is limited to connection failures.
 *
 * @param list<array<string, mixed>> $conditions
 * @return array<string, mixed> decoded JSON body
 *
 * @throws \RuntimeException on a non-2xx status or a body that does not
 *                           decode to an array
 */
protected function requestNjuTeacherHomePage(
    string $apiUrl,
    int $siteId,
    int $pageIndex,
    int $rows,
    array $conditions,
    int $timeout,
): array {
    $returnInfos = [
        ['field' => 'headerPic', 'name' => 'headerPic'],
        ['field' => 'exField1', 'name' => 'exField1'],
        ['field' => 'exField2', 'name' => 'exField2'],
        ['field' => 'cnUrl', 'name' => 'cnUrl'],
        ['field' => 'title', 'name' => 'title'],
        ['field' => 'phone', 'name' => 'phone'],
    ];

    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        // 2 = one initial attempt + one retry.
        ->retry(2, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json',
        ])
        ->asForm()
        ->post($apiUrl, [
            'siteId' => $siteId,
            'pageIndex' => $pageIndex,
            'rows' => $rows,
            'orders' => json_encode([['field' => 'siteSort', 'type' => 'asc']], JSON_UNESCAPED_UNICODE),
            'returnInfos' => json_encode($returnInfos, JSON_UNESCAPED_UNICODE),
            'conditions' => json_encode($conditions, JSON_UNESCAPED_UNICODE),
            'articleType' => 1,
            'level' => 1,
        ]);

    if (! $response->successful()) {
        throw new \RuntimeException('教师列表接口请求失败HTTP '.$response->status());
    }

    $body = $response->json();
    if (! is_array($body)) {
        throw new \RuntimeException('教师列表接口返回格式异常');
    }

    return $body;
}
/**
 * Load teachers from an `ajax_teacher_list` endpoint, page by page.
 *
 * The endpoint configuration is parsed from the page HTML and the keywords
 * are sent as one space-separated search string. Each response's `content`
 * fragment is parsed together with the original page HTML. Paging stops
 * when: the content is empty, $maxResults is reached, the variant is
 * "standard" (only one request is made), a page adds nothing new, or the
 * reported `count` has been collected.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function fetchAjaxTeacherItems(
    string $requestUrl,
    string $pageHtml,
    array $keywords,
    int $maxResults,
    int $maxPages = 1,
): array {
    $config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl);
    $search = implode(' ', $keywords);
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $maxPages = max(1, min(50, $maxPages));

    $collected = [];
    $knownIds = [];
    $reportedTotal = null;

    for ($page = 1; $page <= $maxPages && count($collected) < $maxResults; $page++) {
        $body = $this->requestAjaxTeacherPage($config, $page, $search, $timeout);
        if ($reportedTotal === null && isset($body['count'])) {
            $reportedTotal = max(0, (int) $body['count']);
        }

        $fragment = (string) ($body['content'] ?? '');
        if ($fragment === '') {
            break;
        }

        $countBeforePage = count($collected);
        $parsed = $this->extractFromAjaxTeacherContent(
            $pageHtml.$fragment,
            $keywords,
            $requestUrl,
            $config['cat_code'],
        );
        foreach ($parsed as $candidate) {
            if (isset($knownIds[$candidate->externalId])) {
                continue;
            }
            $knownIds[$candidate->externalId] = true;
            $collected[] = $candidate;
            if (count($collected) >= $maxResults) {
                break 2;
            }
        }

        $isSingleShot = $config['variant'] === 'standard';
        $pageAddedNothing = count($collected) === $countBeforePage;
        $gotEverything = $reportedTotal !== null
            && count($collected) >= min($reportedTotal, $maxResults);
        if ($isSingleShot || $pageAddedNothing || $gotEverything) {
            break;
        }
    }

    return $collected;
}
/**
 * Request one page from an `ajax_teacher_list` endpoint.
 *
 * The "simple" variant always sends `page`; every other variant sends
 * `cat_id` / `type` / `search` and adds `page` only when the configuration
 * says the endpoint is paged. The request is attempted at most twice:
 * Laravel's retry() takes the total number of attempts, so the previous
 * value of 1 never retried. The retry is limited to connection failures.
 *
 * @param array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool} $config
 * @return array<string, mixed> decoded JSON body
 *
 * @throws \RuntimeException on a non-2xx status or a body that does not
 *                           decode to an array
 */
protected function requestAjaxTeacherPage(array $config, int $page, string $search, int $timeout): array
{
    // With no search term the endpoint is asked for every initial letter.
    $letterFilter = $search === '' ? 'All' : '';

    if ($config['variant'] === 'simple') {
        $payload = [
            'page' => (string) $page,
            'cat_code' => $config['cat_code'],
            'yjszxfl' => '全部',
            'name' => $search,
            'zm' => $letterFilter,
        ];
    } else {
        $payload = [
            'cat_id' => $config['cat_id'],
            'cat_code' => $config['cat_code'],
            'type' => $search !== '' ? '2' : '1',
            'zm' => $letterFilter,
            'zc' => '',
            'search' => $search,
        ];
        if ($config['uses_page']) {
            $payload['page'] = (string) $page;
        }
    }

    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        // 2 = one initial attempt + one retry.
        ->retry(2, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json, text/html',
        ])
        ->asForm()
        ->post($config['api_url'], $payload);

    if (! $response->successful()) {
        throw new \RuntimeException('教师列表接口请求失败HTTP '.$response->status());
    }

    $body = $response->json();
    if (! is_array($body)) {
        throw new \RuntimeException('教师列表接口返回格式异常');
    }

    return $body;
}
/**
 * Derive the AJAX teacher-list endpoint configuration from the inline script of a list page.
 *
 * `cat_id`, `cat_code` and the endpoint `url` are read from single-quoted script
 * values. Without an explicit URL the endpoint defaults to
 * `<origin>/active/ajax_teacher_list.html`. The variant is "standard" when a
 * `cat_id` is present and "simple" otherwise; "simple" always uses paging, while
 * "standard" uses it only if the page contains the text `page:page`.
 *
 * @return array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool}
 *
 * @throws \RuntimeException When no cat_code is found or no endpoint URL can be determined.
 */
protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array
{
    $origin = $this->requestOrigin($sourceUrl);
    $apiUrl = $origin === null ? '' : $origin.'/active/ajax_teacher_list.html';

    $catId = preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $idHit) ? $idHit[1] : null;
    $catCode = preg_match("/cat_code\s*:\s*'([^']+)'/i", $html, $codeHit) ? $codeHit[1] : null;

    if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $urlHit)) {
        $declaredUrl = html_entity_decode($urlHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $apiUrl = $this->resolveUrl($declaredUrl, $sourceUrl) ?? $apiUrl;
    }

    // A still-relative endpoint path is anchored to the request origin.
    if ($origin !== null && str_starts_with($apiUrl, '/')) {
        $apiUrl = $origin.$apiUrl;
    }

    if ($catCode === null || $apiUrl === '') {
        throw new \RuntimeException('无法解析教师列表接口参数cat_code');
    }

    $variant = $catId === null ? 'simple' : 'standard';

    return [
        'variant' => $variant,
        'cat_id' => $catId,
        'cat_code' => $catCode,
        'api_url' => $apiUrl,
        'uses_page' => $variant === 'simple' || str_contains($html, 'page:page'),
    ];
}
/**
 * Extract teacher items from AJAX list HTML, trying three layouts in order.
 *
 * 1. Card layout (handled by extractFromAjaxTeacherCards); used if it yields anything.
 * 2. `div.rc-item` sections, each optionally carrying its own department heading.
 * 3. Every teacher-profile link found anywhere in the HTML.
 *
 * @param list<string> $keywords
 * @param string|null $catCode Category code forwarded to the profile-URL check.
 * @return list<CrawlItemDto>
 */
protected function extractFromAjaxTeacherContent(
    string $html,
    array $keywords,
    string $sourceUrl,
    ?string $catCode = null,
): array {
    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $college = $this->inferCollegeFromPageTitle($html);

    $fromCards = $this->extractFromAjaxTeacherCards(
        $html,
        $keywords,
        $sourceUrl,
        $university,
        $college,
        $catCode,
    );
    if ($fromCards !== []) {
        return $fromCards;
    }

    // The text before the first rc-item marker is not a section, so it is dropped.
    $sections = array_slice(preg_split('#<div\s+class="rc-item">#u', $html) ?: [], 1);

    $collected = [];
    $knownIds = [];
    foreach ($sections as $section) {
        $department = $college;
        if (preg_match('#<div\s+class="tit">.*?<div\s+class="name">([^<]+)</div>#su', $section, $heading)) {
            $headingText = CrawlAuthorParser::cleanText($heading[1]);
            // A heading that looks like a person name is not a department title.
            if ($headingText !== null && $headingText !== '' && ! $this->looksLikePersonName($headingText)) {
                $department = $headingText;
            }
        }

        $sectionItems = $this->extractTeacherLinksFromHtmlBlock(
            $section,
            $keywords,
            $sourceUrl,
            $university,
            $department,
            $catCode,
        );
        foreach ($sectionItems as $item) {
            if (! isset($knownIds[$item->externalId])) {
                $knownIds[$item->externalId] = true;
                $collected[] = $item;
            }
        }
    }

    if ($collected !== []) {
        return $collected;
    }

    return $this->extractTeacherLinksFromHtmlBlock(
        $html,
        $keywords,
        $sourceUrl,
        $university,
        $college,
        $catCode,
    );
}
/**
 * Card-style teacher lists returned via AJAX by sites such as ICISEE
 * (the name sits inside div.name, the academic title inside a span).
 *
 * An anchor is kept only when its leading text looks like a person name, its
 * double-quoted href looks like a teacher profile URL, it has not been seen
 * before, and "name + affiliation" matches the keywords.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromAjaxTeacherCards(
    string $html,
    array $keywords,
    string $sourceUrl,
    ?string $pageUniversity,
    ?string $affiliation,
    ?string $catCode,
): array {
    $cardPattern = '#<a\b([^>]*?)>\s*(?:<div\s+class="imgk">.*?</div>\s*)?<div\s+class="name">(.*?)</div>#su';
    if (! preg_match_all($cardPattern, $html, $cards, PREG_SET_ORDER)) {
        return [];
    }

    $result = [];
    $knownKeys = [];
    foreach ($cards as $card) {
        $anchorAttrs = (string) $card[1];
        $nameMarkup = (string) $card[2];

        // The person name is the text that precedes the first nested tag.
        if (! preg_match('/^([^<]+)/u', $nameMarkup, $leadingText)) {
            continue;
        }
        $name = CrawlAuthorParser::cleanText(trim($leadingText[1])) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        if (! preg_match('#\bhref="([^"]+)"#u', $anchorAttrs, $hrefHit)) {
            continue;
        }
        $href = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
            continue;
        }

        $profileUrl = $this->resolveUrl($href, $sourceUrl);
        // Fall back to a name-based key when no profile URL is available.
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($knownKeys[$dedupeKey])) {
            continue;
        }

        if (! $this->matchesKeywords(trim($name.' '.($affiliation ?? '')), $keywords)) {
            continue;
        }

        $academicTitle = preg_match('#<span>([^<]+)</span>#u', $nameMarkup, $titleHit)
            ? CrawlAuthorParser::cleanText($titleHit[1])
            : null;

        $knownKeys[$dedupeKey] = true;
        $result[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
            summary: $affiliation ? '单位:'.$affiliation : null,
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_ajax',
            bio: null,
        );
    }

    return $result;
}
/**
 * Turn every anchor in an HTML fragment whose text is a person name and whose
 * href is a teacher profile URL into a faculty item.
 *
 * Anchor text is stripped of tags and of all whitespace before the name check.
 * Items are de-duplicated by resolved profile URL (or by name when no URL
 * resolves) and filtered by matching "name + affiliation" against the keywords.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractTeacherLinksFromHtmlBlock(
    string $html,
    array $keywords,
    string $sourceUrl,
    ?string $pageUniversity,
    ?string $affiliation,
    ?string $catCode,
): array {
    if (! preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
        return [];
    }

    $result = [];
    $knownKeys = [];
    foreach ($anchors as $anchor) {
        $anchorAttrs = (string) $anchor[1];

        $compactText = preg_replace('/\s+/u', '', strip_tags($anchor[2])) ?? '';
        $name = CrawlAuthorParser::cleanText($compactText) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        if (! preg_match('#\bhref="([^"]+)"#u', $anchorAttrs, $hrefHit)) {
            continue;
        }
        $href = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
            continue;
        }

        $profileUrl = $this->resolveUrl($href, $sourceUrl);
        // Fall back to a name-based key when no profile URL is available.
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($knownKeys[$dedupeKey])) {
            continue;
        }

        if (! $this->matchesKeywords(trim($name.' '.($affiliation ?? '')), $keywords)) {
            continue;
        }

        $knownKeys[$dedupeKey] = true;
        $result[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
            summary: $affiliation ? '单位:'.$affiliation : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_ajax',
            bio: null,
        );
    }

    return $result;
}
/**
 * Decide whether an href points at an individual teacher profile page.
 *
 * The lower-cased URL path is tested against a fixed set of known profile path
 * shapes; if none matches and a category code is given, a path of the form
 * `/<catCode>/<file>.html` is accepted as well.
 *
 * @param string|null $catCode Optional category code, compared case-insensitively.
 */
protected function looksLikeTeacherProfileUrl(string $href, ?string $catCode): bool
{
    $path = strtolower((string) parse_url($href, PHP_URL_PATH));
    if ($path === '') {
        return false;
    }

    $profilePathPatterns = [
        '#/(faculty|jiaoshiml|people/detail_new)/[^/]+\.html$#',
        '#/c\d+a\d+/page\.htm$#',
        '#/(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#',
        // Same directories as above, but for relative hrefs without a leading slash.
        '#^(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#',
        '#/info/\d+/\d+\.htm$#',
    ];
    foreach ($profilePathPatterns as $pattern) {
        if (preg_match($pattern, $path)) {
            return true;
        }
    }

    if ($catCode === null || $catCode === '') {
        return false;
    }

    $quotedCode = preg_quote(strtolower($catCode), '#');

    return (bool) preg_match('#/'.$quotedCode.'/[^/]+\.html$#', $path);
}
/**
 * Build the `scheme://host[:port]` origin of a URL.
 *
 * @return string|null The origin, or null when the URL cannot be parsed or
 *                     lacks a scheme or a host.
 */
protected function requestOrigin(string $sourceUrl): ?string
{
    $parsed = parse_url($sourceUrl);
    if (! is_array($parsed)) {
        return null;
    }
    if (empty($parsed['scheme']) || empty($parsed['host'])) {
        return null;
    }

    $portSuffix = empty($parsed['port']) ? '' : ':'.$parsed['port'];

    return $parsed['scheme'].'://'.$parsed['host'].$portSuffix;
}
/**
 * Build faculty items from e-mail addresses found in the page.
 *
 * For every distinct normalized address, a window of surrounding HTML (up to
 * 400 bytes before the address, 800 bytes in total) is converted to plain text.
 * That text is matched against the keywords and used to guess the person's name
 * and affiliation. Addresses without a guessable name are skipped.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
{
    $items = [];
    $seen = [];
    if (! preg_match_all(
        '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
        $html,
        $emailMatches,
        PREG_OFFSET_CAPTURE
    )) {
        return [];
    }

    foreach ($emailMatches[1] as $match) {
        $email = CrawlAuthorParser::normalizeEmail($match[0]);
        if (! $email || isset($seen[$email])) {
            continue;
        }

        $pos = (int) $match[1];
        // The offset is in bytes. A plain substr() can cut a multi-byte UTF-8
        // character in half at either edge of the window, which makes the /u
        // regexes used downstream fail and leaves the plain text empty.
        // mb_strcut() snaps both edges to character boundaries.
        $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
        $plain = $this->htmlToPlain($window);
        if (! $this->matchesKeywords($plain, $keywords)) {
            continue;
        }

        $name = $this->guessName($plain, $email);
        if ($name === '') {
            continue;
        }

        $affiliation = $this->guessAffiliation($plain);
        $seen[$email] = true;
        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($email),
            name: $name,
            profileUrl: $sourceUrl,
            email: $email,
            affiliation: $affiliation,
            universityName: CrawlAuthorParser::universityFromAffiliation($affiliation)
                ?? $this->inferUniversityFromSource($sourceUrl, $html),
            summary: Str::limit($plain, 300),
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html',
            bio: null,
        );
    }

    return $items;
}
/**
 * Staff panel lists as used by e.g. the SJTU School of Materials Science and
 * Engineering: panel sections with a.staff-item links (/people/detail_new/{id}).
 *
 * Each section after a `div.panel-head` marker supplies its `div.title` text as
 * the department; sections without a title are skipped. If the sections yield
 * nothing, all staff-item links of the whole page are used with the college
 * inferred from the page title.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array
{
    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
    $defaultCollege = $this->inferCollegeFromPageTitle($html);

    $collected = [];
    $knownIds = [];

    // Appends the not-yet-seen staff items of one HTML fragment to the result.
    $gather = function (string $fragment, ?string $department) use (
        &$collected,
        &$knownIds,
        $pageUniversity,
        $keywords,
        $sourceUrl,
    ): void {
        foreach ($this->extractStaffItemLinks($fragment) as $link) {
            $item = $this->makeStaffPanelItem($link, $department, $pageUniversity, $keywords, $sourceUrl);
            if ($item === null || isset($knownIds[$item->externalId])) {
                continue;
            }
            $knownIds[$item->externalId] = true;
            $collected[] = $item;
        }
    };

    // The text before the first panel-head marker is not a panel, so it is dropped.
    $panels = array_slice(preg_split('#<div\s+class="panel-head">#u', $html) ?: [], 1);
    foreach ($panels as $panel) {
        if (! preg_match('#<div\s+class="title">\s*([^<]+?)\s*</div>#u', $panel, $titleHit)) {
            continue;
        }
        $department = CrawlAuthorParser::cleanText($titleHit[1]);
        $gather($panel, $department ?: $defaultCollege);
    }

    if ($collected !== []) {
        return $collected;
    }

    $gather($html, $defaultCollege);

    return $collected;
}
/**
 * List the distinct (href, name) pairs of anchors whose attributes mention
 * `staff-item`, have a double-quoted href and whose text is a person name.
 *
 * Only anchors with plain text content (no nested tags) are considered.
 *
 * @return list<array{href:string,name:string}>
 */
protected function extractStaffItemLinks(string $html): array
{
    if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
        return [];
    }

    $result = [];
    $knownPairs = [];
    foreach ($anchors as $anchor) {
        $anchorAttrs = (string) $anchor[1];
        if (! str_contains($anchorAttrs, 'staff-item')) {
            continue;
        }
        if (! preg_match('#\bhref="([^"]+)"#u', $anchorAttrs, $hrefHit)) {
            continue;
        }

        $name = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $pairKey = $href.'|'.$name;
        if (! isset($knownPairs[$pairKey])) {
            $knownPairs[$pairKey] = true;
            $result[] = ['href' => $href, 'name' => $name];
        }
    }

    return $result;
}
/**
 * Create the faculty item for one staff-panel link.
 *
 * The department doubles as the affiliation. The external key is derived from
 * the resolved profile URL, or from the name when no URL resolves.
 *
 * @param array{href:string,name:string} $link
 * @param list<string> $keywords
 * @return CrawlItemDto|null Null when "name + department" does not match the keywords.
 */
protected function makeStaffPanelItem(
    array $link,
    ?string $department,
    ?string $pageUniversity,
    array $keywords,
    string $sourceUrl,
): ?CrawlItemDto {
    $personName = $link['name'];
    $profileUrl = $this->resolveUrl($link['href'], $sourceUrl);

    $searchable = trim($personName.' '.($department ?? ''));
    if (! $this->matchesKeywords($searchable, $keywords)) {
        return null;
    }

    $dedupeKey = $profileUrl ?: ('name:'.md5($personName));

    return $this->makeFacultyItem(
        externalKey: 'faculty:'.md5($dedupeKey),
        name: $personName,
        profileUrl: $profileUrl,
        email: null,
        affiliation: $department,
        universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
        summary: $department ? '单位:'.$department : null,
        keywords: $keywords,
        academicTitle: null,
        platform: 'faculty_html_smse',
        bio: null,
    );
}
/**
 * Infer the college / institute name of a page from its head metadata.
 *
 * Sources are tried in order:
 * 1. the part of `<title>` after the first dash — the unit name found in it, or
 *    that whole part when no unit name is found;
 * 2. the `description` meta tag — a unit name following "大学", else any unit name;
 * 3. the `SiteName` meta tag — only a unit name following "大学".
 *
 * A "unit name" is 2–40 CJK/Latin characters ending in 学院, 研究院, 学部 or 系.
 *
 * @return string|null The inferred name, or null when nothing matches.
 */
protected function inferCollegeFromPageTitle(string $html): ?string
{
    $unitPattern = '/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u';
    $unitAfterUniversityPattern = '/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u';

    if (preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $titleHit)) {
        $titleTail = CrawlAuthorParser::cleanText($titleHit[1]);
        if ($titleTail !== null && $titleTail !== '') {
            return preg_match($unitPattern, $titleTail, $unitHit)
                ? CrawlAuthorParser::cleanText($unitHit[1])
                : $titleTail;
        }
    }

    if (preg_match('/<meta\s+name="description"\s+content="([^"]+)"/u', $html, $descHit)) {
        $description = CrawlAuthorParser::cleanText(
            html_entity_decode($descHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8')
        );
        if ($description !== null && $description !== '') {
            // Prefer the unit that directly follows the university name.
            foreach ([$unitAfterUniversityPattern, $unitPattern] as $pattern) {
                if (preg_match($pattern, $description, $unitHit)) {
                    return CrawlAuthorParser::cleanText($unitHit[1]);
                }
            }
        }
    }

    if (preg_match('/<meta\s+name=[\'"]SiteName[\'"]\s+content=[\'"]([^\'"]+)[\'"]/u', $html, $siteHit)) {
        $siteName = CrawlAuthorParser::cleanText(
            html_entity_decode($siteHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8')
        );
        if ($siteName !== null && $siteName !== ''
            && preg_match($unitAfterUniversityPattern, $siteName, $unitHit)) {
            return CrawlAuthorParser::cleanText($unitHit[1]);
        }
    }

    return null;
}
/**
 * Fill gaps in an item's lead-author data from the HTML of a profile page.
 *
 * - A missing academic title is taken from the first `<em>` element.
 * - When both college and affiliation are missing, both are set from the
 *   field labelled "所属二级机构".
 *
 * @return CrawlItemDto The same instance when nothing was added; otherwise a new
 *                      DTO with updated `extra` and first `authorsParsed` entry.
 */
protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
{
    $lead = $item->extra['lead_author'] ?? null;
    if (! is_array($lead)) {
        $lead = [];
    }
    $touched = false;

    if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $emHit)) {
        $pageTitle = CrawlAuthorParser::cleanText($emHit[1]);
        if ($pageTitle !== null && $pageTitle !== '') {
            $lead['academic_title'] = $pageTitle;
            $touched = true;
        }
    }

    if (empty($lead['college']) && empty($lead['affiliation'])) {
        $unit = $this->parseLabeledField($html, '所属二级机构');
        if ($unit !== null && $unit !== '') {
            $lead['affiliation'] = $unit;
            $lead['college'] = $unit;
            $touched = true;
        }
    }

    if (! $touched) {
        return $item;
    }

    $hasTitle = ! empty($lead['academic_title']);
    $hasCollege = ! empty($lead['college']);

    $extra = $item->extra;
    $extra['lead_author'] = $lead;
    if ($hasTitle) {
        $extra['academic_title'] = $lead['academic_title'];
    }
    if ($hasCollege) {
        $extra['college_name'] = $lead['college'];
    }

    // Mirror the values onto the first parsed author, if there is one.
    $authorsParsed = $item->authorsParsed;
    if ($authorsParsed !== []) {
        if ($hasTitle) {
            $authorsParsed[0]['academic_title'] = $lead['academic_title'];
        }
        if ($hasCollege) {
            $authorsParsed[0]['affiliation'] = $lead['college'];
        }
    }

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $extra,
        authorsParsed: $authorsParsed,
    );
}
/**
 * Extract faculty items from a structured `<li>` list (platform "faculty_html_tsites").
 *
 * The first `div.title` on the page is the fallback college name. List entries
 * are read from the `<ul>` inside `div.list` when present, otherwise from every
 * `<li>` of the page. Each entry needs a `div.name` that looks like a person
 * name; unit, academic title and bio come from labelled fields in the entry.
 * Entries are de-duplicated by profile URL, or by name when no URL is found.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
{
    $items = [];
    $seen = [];

    $collegeName = null;
    if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
        $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
    }

    $listHtml = $html;
    if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
        $listHtml = $listMatch[1];
    }
    if (! preg_match_all('#<li>(.*?)</li>#su', $listHtml, $liBlocks)) {
        return [];
    }

    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

    foreach ($liBlocks[1] as $inner) {
        $inner = (string) $inner;
        if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
            continue;
        }
        // cleanText() may return null; coalesce so the string-typed name check
        // below receives a string, as the other extractors in this class do.
        $name = CrawlAuthorParser::cleanText($nameMatch[1]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = '';
        if (preg_match('/<a\s+[^>]*href="([^"]*)"/u', $inner, $hrefMatch)) {
            $href = (string) $hrefMatch[1];
        }
        // When the entry has no usable link, look for the URL in the page scripts.
        $profileUrl = $this->resolveUrl($href, $sourceUrl)
            ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);

        $plain = $this->htmlToPlain($inner);
        if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
            continue;
        }

        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }
        $seen[$dedupeKey] = true;

        $affiliation = $this->parseLabeledField($inner, '所在单位')
            ?? $collegeName;
        $academicTitle = $this->parseLabeledField($inner, '职称');
        // On list pages the unit field is usually the college, so the university
        // name is inferred from the site / page header first.
        $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);
        $bio = $this->parseLabeledField($inner, '简介');

        $summaryParts = array_filter([
            $academicTitle ? '职称:'.$academicTitle : null,
            $affiliation ? '单位:'.$affiliation : null,
            $bio,
        ]);

        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $universityName,
            summary: Str::limit(implode('', $summaryParts), 300),
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_tsites',
            bio: $bio,
        );
    }

    return $items;
}
/**
 * Assemble the CrawlItemDto for one faculty member.
 *
 * The person's name is used as both title and authors, the profile URL as the
 * canonical URL, and the affiliation also as the college. The keywords are
 * stored space-joined under `extra.keyword`.
 *
 * @param list<string> $keywords
 */
protected function makeFacultyItem(
    string $externalKey,
    string $name,
    ?string $profileUrl,
    ?string $email,
    ?string $affiliation,
    ?string $universityName,
    ?string $summary,
    array $keywords,
    ?string $academicTitle,
    string $platform,
    ?string $bio = null,
): CrawlItemDto {
    $leadAuthor = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $affiliation,
        'college' => $affiliation,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
        'bio' => $bio,
        'profile_url' => $profileUrl,
    ];

    $parsedAuthor = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $affiliation,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
    ];

    $extra = [
        'platform' => $platform,
        'academic_title' => $academicTitle,
        'college_name' => $affiliation,
        'bio' => $bio,
        'profile_url' => $profileUrl,
        'lead_author' => $leadAuthor,
        'keyword' => implode(' ', $keywords),
    ];

    return new CrawlItemDto(
        externalId: $externalKey,
        title: $name,
        canonicalUrl: $profileUrl,
        authors: $name,
        summary: $summary,
        schoolName: $universityName,
        extra: $extra,
        authorsParsed: [$parsedAuthor],
    );
}
/**
 * Check whether a text contains at least one of the keywords.
 *
 * An empty keyword list matches everything. Empty keywords are ignored and
 * the comparison uses stripos().
 *
 * @param list<string> $keywords
 */
protected function matchesKeywords(string $plain, array $keywords): bool
{
    if ($keywords === []) {
        return true;
    }

    foreach ($keywords as $needle) {
        if ($needle === '') {
            continue;
        }
        if (stripos($plain, $needle) !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Reduce an HTML fragment to plain text: tags removed, entities decoded and
 * every whitespace run collapsed to one space.
 *
 * @return string The plain text, or an empty string when the whitespace regex fails.
 */
protected function htmlToPlain(string $html): string
{
    $withoutTags = strip_tags($html);
    $decoded = html_entity_decode($withoutTags, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $collapsed = preg_replace('/\s+/u', ' ', $decoded);

    return $collapsed ?? '';
}
/**
 * Read the value that follows "<label>:" in an HTML fragment.
 *
 * The value runs from the colon up to the next `<`. Both the ASCII colon and
 * the full-width colon (：) are accepted as the delimiter, since the previous
 * single-character class matched only the ASCII form.
 *
 * @param string $label Literal label text; it is regex-quoted before matching.
 * @return string|null The cleaned value, or null when the label is not present.
 */
protected function parseLabeledField(string $html, string $label): ?string
{
    $pattern = '/'.preg_quote($label, '/').'[:：]\s*([^<]+)/u';
    if (! preg_match($pattern, $html, $match)) {
        return null;
    }

    return CrawlAuthorParser::cleanText($match[1]);
}
/**
 * Heuristic check that a string is a person name rather than navigation text.
 *
 * Accepted are 2–4 CJK characters, optionally followed by "·" and 2–4 more, or
 * a Latin name of 2–31 characters made of letters, whitespace, dots and hyphens
 * that starts with a letter. Strings starting with common navigation labels
 * (首页, 登录, 联系我们, 下页, 尾页, 转到) are rejected first.
 */
protected function looksLikePersonName(string $name): bool
{
    $startsWithNavLabel = (bool) preg_match('/^(首页|登录|联系我们|下页|尾页|转到)/u', $name);
    if ($startsWithNavLabel) {
        return false;
    }

    if (preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)) {
        return true;
    }

    return (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/', $name);
}
/**
 * Resolve an href found on a page into an absolute URL.
 *
 * Handles absolute http(s) URLs, protocol-relative (`//host/...`), root-relative
 * (`/path`) and directory-relative hrefs. Dot segments are not normalized.
 *
 * Hrefs that do not lead to a page — empty, fragment-only (`#...`), or using the
 * `javascript:`, `mailto:` or `tel:` scheme in any letter case — yield null, so
 * callers fall back to their alternatives instead of treating a placeholder link
 * as a profile URL shared by many people.
 *
 * @param string $href Raw href; HTML entities are decoded and whitespace trimmed.
 * @param string $baseUrl URL of the page the href was found on.
 * @return string|null The absolute URL, the trimmed href unchanged when
 *                     $baseUrl has no scheme/host, or null for non-navigable hrefs.
 */
protected function resolveUrl(string $href, string $baseUrl): ?string
{
    $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    if ($href === '' || $href[0] === '#') {
        return null;
    }
    // URL schemes are case-insensitive, so "JavaScript:void(0)" must be caught too.
    if (preg_match('#^(?:javascript|mailto|tel):#i', $href)) {
        return null;
    }
    if (preg_match('#^https?://#i', $href)) {
        return $href;
    }

    $base = parse_url($baseUrl);
    if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
        return $href;
    }

    if (str_starts_with($href, '//')) {
        return $base['scheme'].':'.$href;
    }

    $origin = $base['scheme'].'://'.$base['host'];
    if (! empty($base['port'])) {
        $origin .= ':'.$base['port'];
    }
    if (str_starts_with($href, '/')) {
        return $origin.$href;
    }

    // Directory-relative: keep the base path up to and including its last slash.
    $path = $base['path'] ?? '/';
    $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';

    return $origin.$dir.$href;
}
/**
 * Find a person's profile URL in an inline `addimg(...)` script call.
 *
 * Looks for a call whose second argument is a root-relative path ending in
 * `index.htm` and whose third argument is exactly the given name.
 *
 * @return string|null The resolved URL, or null when no such call exists.
 */
protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
{
    $quotedName = preg_quote($name, '/');
    $pattern = '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$quotedName.'"/u';

    return preg_match($pattern, $html, $hit)
        ? $this->resolveUrl($hit[1], $sourceUrl)
        : null;
}
/**
 * Infer the university a page belongs to.
 *
 * The source URL's host is first compared against a table of known university
 * domains; a host matches when it equals the domain or is a subdomain of it.
 * Otherwise the first "...大学" phrase in the page's plain text is used.
 *
 * Compared with the earlier chain of substring checks, this drops a duplicated
 * (unreachable) branch and no longer matches hosts that merely contain a known
 * domain as a substring.
 *
 * @return string|null The university name, or null when nothing can be inferred.
 */
protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
{
    $knownDomains = [
        'sjtu.edu.cn' => '上海交通大学',
        'tsinghua.edu.cn' => '清华大学',
        'pku.edu.cn' => '北京大学',
        'zju.edu.cn' => '浙江大学',
        'fudan.edu.cn' => '复旦大学',
        'nju.edu.cn' => '南京大学',
    ];

    $host = parse_url($sourceUrl, PHP_URL_HOST);
    if (is_string($host)) {
        // Tolerate a fully-qualified host written with a trailing dot.
        $host = rtrim(strtolower($host), '.');
        foreach ($knownDomains as $domain => $university) {
            if ($host === $domain || str_ends_with($host, '.'.$domain)) {
                return $university;
            }
        }
    }

    if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match)) {
        return CrawlAuthorParser::cleanText($match[1]);
    }

    return null;
}
/**
 * Guess a person's name from the text around an e-mail address.
 *
 * The first run of 2–4 CJK characters in the text (optionally followed by an
 * academic title) is preferred. Without one, the e-mail's local part is used
 * with dots, underscores and hyphens turned into spaces and title-cased.
 *
 * @return string The guessed name; may be empty when the e-mail has no local part.
 */
protected function guessName(string $plain, string $email): string
{
    $cjkNamePattern = '/([\x{4e00}-\x{9fff}]{2,4})\s*(?:教授|副教授|讲师|研究员|博士|老师)?/u';
    if (preg_match($cjkNamePattern, $plain, $hit)) {
        return trim($hit[1]);
    }

    $localPart = strstr($email, '@', true);
    if (! $localPart) {
        $localPart = '';
    }
    $spaced = str_replace(['.', '_', '-'], ' ', $localPart);

    return Str::title(trim($spaced));
}
/**
 * Guess an affiliation from plain text: the first run of 2–40 CJK/Latin
 * characters or whitespace that ends in 大学, 学院, 研究院, 研究所,
 * University or College.
 *
 * @return string|null The cleaned phrase, or null when the text has none.
 */
protected function guessAffiliation(string $plain): ?string
{
    $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u';

    return preg_match($pattern, $plain, $hit)
        ? CrawlAuthorParser::cleanText($hit[1])
        : null;
}
}