master
lion 2 days ago
parent a3a850b049
commit bd8527fc55

@ -31,6 +31,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return $this->enrichEmailsFromProfilePages($items, $params);
}
if ($this->isNjuTeacherHomePage($firstHtml)) {
$items = $this->fetchNjuTeacherHomeItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages);
return $this->enrichEmailsFromProfilePages($items, $params);
}
$totalPages = $this->detectTotalPages($firstHtml);
$pagesToFetch = min($maxPages, $totalPages);
@ -485,6 +491,21 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
*/
protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
{
$items = $this->extractFromSudyNewsFacultyList($html, $keywords, $sourceUrl);
if ($items !== []) {
return $items;
}
$items = $this->extractFromRaTeacherList($html, $keywords, $sourceUrl);
if ($items !== []) {
return $items;
}
$items = $this->extractFromVsbFacultyTable($html, $keywords, $sourceUrl);
if ($items !== []) {
return $items;
}
$items = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);
if ($items !== []) {
return $items;
@ -498,6 +519,363 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
}
/**
 * Extracts faculty entries from a Sudy CMS "news list" page (Nanjing University
 * sites such as frontier and ic), where each person is a link inside a
 * news_title / news_title1 element of ul.news_list.
 *
 * When the page is divided into `wp_sublist` sections, the `subcolumn-name`
 * label of each section is used as the affiliation for the people listed in
 * it; otherwise the whole page is scanned with the page-level college name.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromSudyNewsFacultyList(string $html, array $keywords, string $sourceUrl): array
{
    $hasNewsList = (bool) preg_match('/class="news_list/u', $html);
    if (! $hasNewsList) {
        return [];
    }

    $university = $this->inferUniversityFromSource($sourceUrl, $html);

    // Page-level affiliation: the <title> first, then the column heading as a fallback.
    $fallbackUnit = $this->inferCollegeFromPageTitle($html);
    if ($fallbackUnit === null) {
        if (preg_match('#<li class="col_title"><h2>([^<]+)</h2>#u', $html, $heading)) {
            $fallbackUnit = CrawlAuthorParser::cleanText($heading[1]);
        }
    }

    // Shared across all sections so a person listed twice is emitted only once.
    $seenKeys = [];

    $sections = preg_split('#<li class="wp_sublist#u', $html) ?: [];
    if (count($sections) < 2) {
        // No sub-lists: treat the whole document as a single chunk.
        return $this->extractSudyNewsLinksFromChunk(
            $html,
            $fallbackUnit,
            $keywords,
            $sourceUrl,
            $university,
            $seenKeys,
        );
    }

    $collected = [];
    // The first piece is whatever precedes the first sub-list and is ignored.
    foreach (array_slice($sections, 1) as $section) {
        $unit = preg_match('#subcolumn-name">([^<]+)</span>#u', $section, $unitMatch)
            ? CrawlAuthorParser::cleanText($unitMatch[1])
            : $fallbackUnit;

        $found = $this->extractSudyNewsLinksFromChunk(
            $section,
            $unit,
            $keywords,
            $sourceUrl,
            $university,
            $seenKeys,
        );
        foreach ($found as $entry) {
            $collected[] = $entry;
        }
    }

    return $collected;
}
/**
 * Collects faculty items from the news_title / news_title1 links found in one
 * HTML fragment of a Sudy news list.
 *
 * A link is kept only when its text looks like a person name, its href looks
 * like a teacher profile URL, it has not been seen before, and the name plus
 * department matches the keywords.
 *
 * @param array<string, true> $seen  Dedupe keys already emitted; updated in place.
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractSudyNewsLinksFromChunk(
    string $chunk,
    ?string $department,
    array $keywords,
    string $sourceUrl,
    ?string $pageUniversity,
    array &$seen,
): array {
    $found = preg_match_all(
        '#<(?:div|span)\s+class="news_title1?">\s*<a\b([^>]*?)>([^<]+)</a>#su',
        $chunk,
        $anchors,
        PREG_SET_ORDER,
    );
    if (! $found) {
        return [];
    }

    $result = [];
    foreach ($anchors as [, $attrText, $rawName]) {
        $attrText = (string) $attrText;
        $personName = CrawlAuthorParser::cleanText($rawName) ?? '';
        if ($personName === '' || ! $this->looksLikePersonName($personName)) {
            continue;
        }

        if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrText, $hrefParts)) {
            continue;
        }
        $link = html_entity_decode($hrefParts[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! $this->looksLikeTeacherProfileUrl($link, null)) {
            continue;
        }

        $absoluteUrl = $this->resolveUrl($link, $sourceUrl);
        // Prefer the profile URL as identity; fall back to a hash of the name.
        $identity = $absoluteUrl ? $absoluteUrl : 'name:'.md5($personName);
        if (isset($seen[$identity])) {
            continue;
        }

        $searchable = trim($personName.' '.($department ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        $seen[$identity] = true;
        $result[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($identity),
            name: $personName,
            profileUrl: $absoluteUrl,
            email: null,
            affiliation: $department,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
            summary: $department ? '单位:'.$department : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_sudy_news',
            bio: null,
        );
    }

    return $result;
}
/**
 * Extracts faculty entries from "ul.teacher" card lists (e.g. the Nanjing
 * University robotics school site), where each card is an anchor wrapping a
 * `div.xm` element that holds the person's name.
 *
 * The academic title and research field are read from the "职称:" and
 * "研究方向:" spans that follow the name inside the same anchor.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromRaTeacherList(string $html, array $keywords, string $sourceUrl): array
{
    if (! preg_match('/<ul class="teacher">/u', $html)) {
        return [];
    }
    // The gap between the opening <a> and div.xm must not cross a closing </a>.
    // Without that guard the match can start at an unrelated earlier anchor
    // (navigation, breadcrumb) and attach its href to the first teacher card.
    if (! preg_match_all(
        '#<a\b([^>]*?)>(?:(?!</a>).)*?<div class="xm">([^<]+)</div>(.*?)</a>#su',
        $html,
        $matches,
        PREG_SET_ORDER,
    )) {
        return [];
    }
    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
    $defaultCollege = $this->inferCollegeFromPageTitle($html);
    $items = [];
    $seen = [];
    foreach ($matches as $match) {
        $attrs = (string) $match[1];
        $name = CrawlAuthorParser::cleanText($match[2]) ?? '';
        $tail = (string) $match[3];
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }
        // Accept single- or double-quoted href values, like the Sudy news extractor does.
        if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) {
            continue;
        }
        $profileUrl = $this->resolveUrl(
            html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'),
            $sourceUrl,
        );
        // Prefer the profile URL as identity; fall back to a hash of the name.
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }
        $academicTitle = null;
        if (preg_match('#职称:\s*<span>([^<]+)</span>#u', $tail, $titleMatch)) {
            $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]);
        }
        $researchField = null;
        if (preg_match('#研究方向:\s*<span>([^<]+)</span>#u', $tail, $fieldMatch)) {
            $researchField = CrawlAuthorParser::cleanText($fieldMatch[1]);
        }
        // Keyword filtering runs over name, research field, title and college together.
        $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
        if (! $this->matchesKeywords($plain, $keywords)) {
            continue;
        }
        $summaryParts = array_filter([
            $defaultCollege ? '单位:'.$defaultCollege : null,
            $academicTitle ? '职称:'.$academicTitle : null,
            $researchField ? '研究方向:'.$researchField : null,
        ]);
        $seen[$dedupeKey] = true;
        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $defaultCollege,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
            summary: $summaryParts !== [] ? implode('', $summaryParts) : null,
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_ra',
            bio: $researchField,
        );
    }
    return $items;
}
/**
 * Extracts faculty entries from WebPlus (VSB) faculty table pages (Nanjing
 * University / Tsinghua sites such as ise "zjzjs").
 *
 * The content area is located first; inside it, <strong> headings act as
 * section titles. Every profile link, and every table cell without a link, is
 * treated as a person who belongs to the closest preceding section.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromVsbFacultyTable(string $html, array $keywords, string $sourceUrl): array
{
    // Candidate content containers, tried in order; the first match wins.
    $scopePatterns = [
        '#<div class="zjzjs">(.*?)</div>#su',
        '#<div id="vsb_content[^"]*">(.*?)</div>\s*</div>\s*</div>#su',
        '#<ul class="teach-list[^"]*">(.*?)</ul>#su',
    ];
    $scope = null;
    foreach ($scopePatterns as $scopePattern) {
        if (preg_match($scopePattern, $html, $scopeMatch)) {
            $scope = (string) $scopeMatch[1];
            break;
        }
    }
    if ($scope === null || trim(strip_tags($scope)) === '') {
        return [];
    }

    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $college = $this->inferCollegeFromPageTitle($html);
    $collected = [];
    $seenKeys = [];

    // Section headings with their offsets inside $scope, in document order.
    $sections = [];
    if (preg_match_all('#<strong[^>]*>(.*?)</strong>#su', $scope, $strongMatches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
        foreach ($strongMatches as $strong) {
            [$innerHtml, $innerOffset] = $strong[1];
            $label = CrawlAuthorParser::cleanText(strip_tags($innerHtml));
            if ($label !== null && $label !== '') {
                $sections[] = [
                    'offset' => $innerOffset,
                    'title' => $label,
                ];
            }
        }
    }

    // Title of the last heading that starts at or before the given offset.
    $sectionAt = static function (int $position) use ($sections): ?string {
        $current = null;
        foreach ($sections as ['offset' => $start, 'title' => $label]) {
            if ($start > $position) {
                break;
            }
            $current = $label;
        }

        return $current;
    };

    // Validates, de-duplicates, keyword-filters and appends one person.
    $register = function (
        string $personName,
        ?string $absoluteUrl,
        ?string $heading,
    ) use (
        $keywords,
        $college,
        $university,
        &$collected,
        &$seenKeys,
    ): void {
        if ($personName === '' || ! $this->looksLikePersonName($personName)) {
            return;
        }
        $identity = $absoluteUrl ? $absoluteUrl : 'name:'.md5($personName);
        if (isset($seenKeys[$identity])) {
            return;
        }
        $rank = $this->inferAcademicTitleFromSection($heading);
        $searchable = trim($personName.' '.($rank ?? '').' '.($college ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            return;
        }
        $seenKeys[$identity] = true;
        $collected[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($identity),
            name: $personName,
            profileUrl: $absoluteUrl,
            email: null,
            affiliation: $college,
            universityName: $university ?? CrawlAuthorParser::universityFromAffiliation($college),
            summary: $college ? '单位:'.$college : null,
            keywords: $keywords,
            academicTitle: $rank,
            platform: 'faculty_html_vsb',
            bio: null,
        );
    };

    // Pass 1: people who have a profile link.
    if (preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $scope, $anchorMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
        foreach ($anchorMatches as $anchor) {
            $attrText = (string) $anchor[1][0];
            $position = (int) $anchor[0][1];
            $personName = CrawlAuthorParser::cleanText(strip_tags($anchor[2][0])) ?? '';
            if (! preg_match('#\bhref="([^"]+)"#u', $attrText, $hrefParts)) {
                continue;
            }
            $link = html_entity_decode($hrefParts[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (! $this->looksLikeTeacherProfileUrl($link, null)) {
                continue;
            }
            $register($personName, $this->resolveUrl($link, $sourceUrl), $sectionAt($position));
        }
    }

    // Pass 2: plain table cells (no link) that carry only a name.
    if (preg_match_all('#<td[^>]*>(.*?)</td>#su', $scope, $tdMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
        foreach ($tdMatches as $td) {
            $cellBody = (string) $td[1][0];
            $position = (int) $td[0][1];
            if (str_contains($cellBody, '<a ')) {
                continue;
            }
            $personName = CrawlAuthorParser::cleanText(strip_tags($cellBody)) ?? '';
            $register($personName, null, $sectionAt($position));
        }
    }

    return $collected;
}
/**
 * Maps a section heading of a faculty table to a normalised academic title.
 *
 * Returns null for a missing or empty heading. Headings that contain none of
 * the known title keywords are returned after text clean-up.
 */
protected function inferAcademicTitleFromSection(?string $sectionTitle): ?string
{
    if ($sectionTitle === null || $sectionTitle === '') {
        return null;
    }
    // Order matters: "副教授" and "助理教授" both contain "教授", so the more
    // specific titles must be tested before the generic one. Otherwise the
    // assistant-professor branch can never be reached.
    if (str_contains($sectionTitle, '副教授')) {
        return '副教授';
    }
    if (str_contains($sectionTitle, '助理教授')) {
        return '准聘助理教授';
    }
    if (str_contains($sectionTitle, '教授')) {
        return '教授';
    }
    if (str_contains($sectionTitle, '博士后')) {
        return '博士后';
    }
    if (str_contains($sectionTitle, '专职科研')) {
        return '专职科研';
    }
    return CrawlAuthorParser::cleanText($sectionTitle);
}
protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool
{
if (str_contains(strtolower($sourceUrl), 'ajax_teacher_list')) {
@ -507,6 +885,250 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return str_contains($html, 'ajax_teacher_list.html');
}
/**
 * Tells whether the page is a "teacher home" listing: it must reference
 * faculty.js and its <body> class list must contain the word "faculty".
 */
protected function isNjuTeacherHomePage(string $html): bool
{
    if (! str_contains($html, 'faculty.js')) {
        return false;
    }

    return preg_match('/<body[^>]*class="[^"]*\bfaculty\b/u', $html) === 1;
}
/**
 * Loads faculty entries for a "teacher home" page through the site's
 * generalQuery JSON endpoint instead of parsing the HTML list.
 *
 * The site id and the selected career filters are read from the page HTML.
 * Result pages are then requested one by one until the page limit, the
 * reported page count, an empty page, or $maxResults is reached.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function fetchNjuTeacherHomeItems(
    string $requestUrl,
    string $pageHtml,
    array $keywords,
    int $maxResults,
    int $maxPages = 1,
): array {
    $siteId = $this->parseNjuSiteId($pageHtml);
    $selected = $this->parseNjuTeacherHomeFilters($pageHtml);
    $conditions = $this->buildNjuTeacherHomeConditions($selected['career'], $selected['sub_career']);
    $endpoint = ($this->requestOrigin($requestUrl) ?? 'https://is.nju.edu.cn')
        .'/_wp3services/generalQuery?queryObj=teacherHome';
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    // Clamp the caller's page budget to 1..50.
    $pageLimit = max(1, min(50, $maxPages));
    $rowsPerPage = 50;
    $university = $this->inferUniversityFromSource($requestUrl, $pageHtml);
    $college = $this->inferCollegeFromPageTitle($pageHtml);

    $collected = [];
    $seenKeys = [];
    $totalPages = null;

    for ($page = 1; $page <= $pageLimit && count($collected) < $maxResults; $page++) {
        $payload = $this->requestNjuTeacherHomePage($endpoint, $siteId, $page, $rowsPerPage, $conditions, $timeout);
        // The page count is taken from the first response only.
        $totalPages ??= max(1, (int) ($payload['pageCount'] ?? 1));

        $rows = $payload['data'] ?? [];
        if (! is_array($rows) || $rows === []) {
            break;
        }

        foreach ($rows as $row) {
            if (! is_array($row)) {
                continue;
            }
            $personName = CrawlAuthorParser::cleanText((string) ($row['title'] ?? '')) ?? '';
            if ($personName === '' || ! $this->looksLikePersonName($personName)) {
                continue;
            }
            $absoluteUrl = $this->resolveUrl((string) ($row['cnUrl'] ?? ''), $requestUrl);
            $identity = $absoluteUrl ? $absoluteUrl : 'name:'.md5($personName);
            if (isset($seenKeys[$identity])) {
                continue;
            }
            $rank = CrawlAuthorParser::cleanText((string) ($row['exField2'] ?? ''));
            $field = CrawlAuthorParser::cleanText((string) ($row['exField1'] ?? ''));
            $searchable = trim($personName.' '.($field ?? '').' '.($rank ?? '').' '.($college ?? ''));
            if (! $this->matchesKeywords($searchable, $keywords)) {
                continue;
            }
            $summaryBits = array_filter([
                $college ? '单位:'.$college : null,
                $rank ? '职称:'.$rank : null,
                $field ? '研究领域:'.$field : null,
            ]);
            $seenKeys[$identity] = true;
            $collected[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($identity),
                name: $personName,
                profileUrl: $absoluteUrl,
                email: null,
                affiliation: $college,
                universityName: $university ?? CrawlAuthorParser::universityFromAffiliation($college),
                summary: $summaryBits !== [] ? implode('', $summaryBits) : null,
                keywords: $keywords,
                academicTitle: $rank,
                platform: 'faculty_html_nju_wp',
                bio: $field,
            );
            if (count($collected) >= $maxResults) {
                return $collected;
            }
        }

        if ($page >= $totalPages) {
            break;
        }
    }

    return $collected;
}
/**
 * Reads the numeric site id from the `sudy-wp-siteId="…"` attribute in the page.
 *
 * @throws \RuntimeException when the attribute is not present.
 */
protected function parseNjuSiteId(string $html): int
{
    $found = preg_match('/sudy-wp-siteId="(\d+)"/', $html, $idMatch);
    if (! $found) {
        throw new \RuntimeException('无法解析教师列表站点 IDsiteId');
    }

    return (int) $idMatch[1];
}
/**
 * Reads the currently selected career category and sub-category from the
 * filter navigation of a teacher home page.
 *
 * For each of the two values, two markup variants are tried in order and the
 * first one that matches is used; a value stays null when neither matches.
 *
 * @return array{career:?string,sub_career:?string}
 */
protected function parseNjuTeacherHomeFilters(string $html): array
{
    // Returns the cleaned first capture of the first pattern that matches.
    $firstCapture = static function (array $patterns) use ($html): ?string {
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $html, $hit)) {
                return CrawlAuthorParser::cleanText($hit[1]);
            }
        }

        return null;
    };

    return [
        'career' => $firstCapture([
            '#class="col_item_link\s+selected"[^>]*title="([^"]+)"#u',
            '#class="col_item_link\s+selected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
        ]),
        'sub_career' => $firstCapture([
            '#class="sub-item[^"]*\sselected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
            '#class="sub-link[^"]*\sselected"[^>]*title="([^"]+)"#u',
        ]),
    ];
}
/**
 * Builds the `conditions` list for the teacherHome query from the selected
 * career category and sub-category.
 *
 * The result always starts with the "published = 1" condition. A recognised
 * sub-category adds a one-element OR group on exField2. A recognised category
 * adds either a plain exField2 equality or, for the two grouped categories
 * when no sub-category is selected, an OR group over their member titles.
 *
 * @return list<array<string, mixed>>
 */
protected function buildNjuTeacherHomeConditions(?string $career, ?string $subCareer): array
{
    $titleIs = static fn (string $title): array => ['field' => 'exField2', 'value' => $title, 'judge' => '='];
    $titleIsAnyOf = static fn (array $titles): array => ['orConditions' => array_map($titleIs, $titles)];

    $conditions = [
        ['field' => 'published', 'value' => '1', 'judge' => '='],
    ];

    $knownSubCareers = ['长聘副教授', '准聘副教授', '准聘助理教授', '专职科研', '博士后'];
    if (in_array($subCareer, $knownSubCareers, true)) {
        $conditions[] = $titleIsAnyOf([$subCareer]);
    }

    if ($career === null || $career === '') {
        return $conditions;
    }

    $noSubCareer = $subCareer === null || $subCareer === '';
    $plainCareers = ['教授', '副教授', '兼职教授', '行政管理人员'];

    if (in_array($career, $plainCareers, true)) {
        $conditions[] = $titleIs($career);
    } elseif ($noSubCareer && $career === '准长聘') {
        $conditions[] = $titleIsAnyOf(['长聘副教授', '准聘副教授', '准聘助理教授']);
    } elseif ($noSubCareer && $career === '专职科研及博士后') {
        $conditions[] = $titleIsAnyOf(['专职科研', '博士后']);
    }

    return $conditions;
}
/**
 * Posts one page request to the teacherHome query endpoint and returns the
 * decoded JSON body.
 *
 * The request is form-encoded; the nested `orders`, `returnInfos` and
 * `conditions` values are sent as JSON strings. Connection failures are
 * retried once after 300 ms.
 *
 * @param list<array<string, mixed>> $conditions
 * @return array<string, mixed>
 *
 * @throws \RuntimeException on a non-successful HTTP status or a non-array JSON body.
 */
protected function requestNjuTeacherHomePage(
    string $apiUrl,
    int $siteId,
    int $pageIndex,
    int $rows,
    array $conditions,
    int $timeout,
): array {
    // Every requested column is returned under its own field name.
    $returnInfos = array_map(
        static fn (string $column): array => ['field' => $column, 'name' => $column],
        ['headerPic', 'exField1', 'exField2', 'cnUrl', 'title', 'phone'],
    );

    $formFields = [
        'siteId' => $siteId,
        'pageIndex' => $pageIndex,
        'rows' => $rows,
        'orders' => json_encode([['field' => 'siteSort', 'type' => 'asc']], JSON_UNESCAPED_UNICODE),
        'returnInfos' => json_encode($returnInfos, JSON_UNESCAPED_UNICODE),
        'conditions' => json_encode($conditions, JSON_UNESCAPED_UNICODE),
        'articleType' => 1,
        'level' => 1,
    ];

    $request = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json',
        ])
        ->asForm();

    $response = $request->post($apiUrl, $formFields);
    if (! $response->successful()) {
        throw new \RuntimeException('教师列表接口请求失败HTTP '.$response->status().'');
    }

    $decoded = $response->json();
    if (is_array($decoded)) {
        return $decoded;
    }

    throw new \RuntimeException('教师列表接口返回格式异常');
}
/**
* @param list<string> $keywords
* @return list<CrawlItemDto>
@ -895,6 +1517,22 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return true;
}
if (preg_match('#/c\d+a\d+/page\.htm$#', $path)) {
return true;
}
if (preg_match('#/(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#', $path)) {
return true;
}
if (preg_match('#^(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#', $path)) {
return true;
}
if (preg_match('#/info/\d+/\d+\.htm$#', $path)) {
return true;
}
if ($catCode !== null && $catCode !== '') {
$code = preg_quote(strtolower($catCode), '#');
@ -1116,20 +1754,39 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
/**
 * Infers the college / institute name for a page.
 *
 * Sources are tried in this order:
 *  1. the part of <title> after the first dash; a trailing "…学院/研究院/学部/系"
 *     name is preferred, otherwise the cleaned remainder is returned;
 *  2. the `description` meta tag, preferring the unit name that follows "大学";
 *  3. the `SiteName` meta tag, only for a unit name that follows "大学".
 *
 * Returns null when none of the sources yields a name.
 */
protected function inferCollegeFromPageTitle(string $html): ?string
{
    if (preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) {
        $title = CrawlAuthorParser::cleanText($match[1]);
        if ($title !== null && $title !== '') {
            if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) {
                return CrawlAuthorParser::cleanText($college[1]);
            }
            return $title;
        }
    }
    if (preg_match('/<meta\s+name="description"\s+content="([^"]+)"/u', $html, $match)) {
        $desc = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        if ($desc !== null && $desc !== '') {
            // Prefer the unit that follows the university name so the university itself is not included.
            if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) {
                return CrawlAuthorParser::cleanText($college[1]);
            }
            if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) {
                return CrawlAuthorParser::cleanText($college[1]);
            }
        }
    }
    if (preg_match('/<meta\s+name=[\'"]SiteName[\'"]\s+content=[\'"]([^\'"]+)[\'"]/u', $html, $match)) {
        $siteName = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        if ($siteName !== null && $siteName !== '') {
            if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $siteName, $college)) {
                return CrawlAuthorParser::cleanText($college[1]);
            }
        }
    }
    return null;
}
protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
@ -1446,6 +2103,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
if (str_contains($host, 'fudan.edu.cn')) {
return '复旦大学';
}
if (str_contains($host, 'nju.edu.cn')) {
return '南京大学';
}
if (str_contains($host, 'tsinghua.edu.cn')) {
return '清华大学';
}
}
if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match)) {

@ -353,4 +353,140 @@ HTML;
$this->assertSame('副教授', $items[1]->extra['academic_title']);
$this->assertSame('faculty_html_ajax', $items[0]->extra['platform']);
}
/**
 * A page counts as a teacher home page only when it loads faculty.js and its
 * body carries the "faculty" class.
 */
public function test_detects_nju_teacher_home_page(): void
{
    $subject = new FacultyListHtmlAdapter;
    $detector = new \ReflectionMethod($subject, 'isNjuTeacherHomePage');
    $detector->setAccessible(true);

    $facultyPage = '<body class="list faculty"><script src="/js/faculty.js"></script></body>';
    $ordinaryPage = '<body class="list"><script src="/js/list.js"></script></body>';

    $this->assertTrue($detector->invoke($subject, $facultyPage));
    $this->assertFalse($detector->invoke($subject, $ordinaryPage));
}
/**
 * With no category selected, only the "published" condition is produced.
 */
public function test_builds_nju_teacher_home_conditions_for_all_faculty(): void
{
    $subject = new FacultyListHtmlAdapter;
    $builder = new \ReflectionMethod($subject, 'buildNjuTeacherHomeConditions');
    $builder->setAccessible(true);

    $built = $builder->invoke($subject, null, null);

    $this->assertCount(1, $built);
    $this->assertSame('published', $built[0]['field']);
}
/**
 * Selecting the professor category adds an exField2 equality condition.
 */
public function test_builds_nju_teacher_home_conditions_for_professor_category(): void
{
    $subject = new FacultyListHtmlAdapter;
    $builder = new \ReflectionMethod($subject, 'buildNjuTeacherHomeConditions');
    $builder->setAccessible(true);

    $built = $builder->invoke($subject, '教授', null);

    $this->assertCount(2, $built);
    $titleCondition = $built[1];
    $this->assertSame('exField2', $titleCondition['field']);
    $this->assertSame('教授', $titleCondition['value']);
}
/**
 * The site id is read from the sudy-wp-siteId attribute as an integer.
 */
public function test_parses_nju_site_id_from_html(): void
{
    $subject = new FacultyListHtmlAdapter;
    $parser = new \ReflectionMethod($subject, 'parseNjuSiteId');
    $parser->setAccessible(true);

    $markup = '<script src="/_js/jquery.min.js" sudy-wp-siteId="786"></script>';

    $this->assertSame(786, $parser->invoke($subject, $markup));
}
/**
 * When the <title> has no "section - site" form, the college name is taken
 * from the description meta tag, without the university prefix.
 */
public function test_infers_college_from_meta_description(): void
{
    $subject = new FacultyListHtmlAdapter;
    $inferrer = new \ReflectionMethod($subject, 'inferCollegeFromPageTitle');
    $inferrer->setAccessible(true);

    $markup = '<title>师资力量</title><meta name="description" content="南京大学智能科学与技术学院" >';

    $this->assertSame('智能科学与技术学院', $inferrer->invoke($subject, $markup));
}
/**
 * A Sudy news list with one sub-list yields one item whose college is the
 * sub-list's column name.
 */
public function test_extracts_sudy_news_faculty_list(): void
{
    $subject = new FacultyListHtmlAdapter;
    $extractor = new \ReflectionMethod($subject, 'extractFromSudyNewsFacultyList');
    $extractor->setAccessible(true);

    $markup = <<<'HTML'
<title>师资力量-南京大学前沿科学学院</title>
<li class="wp_sublist sublist-1">
<h3 class="sublist_title"><span class="subcolumn-name">功能材料与智能制造研究院</span></h3>
<ul class="news_list list2">
<li class="news n1 clearfix">
<div class="news_title"><a href='/85/ef/c59286a689647/page.htm' title='王保明'>王保明</a></div>
</li>
</ul>
</li>
HTML;

    $found = $extractor->invoke($subject, $markup, [], 'https://frontier.nju.edu.cn/zrjs/list.htm');

    $this->assertCount(1, $found);
    $person = $found[0];
    $this->assertSame('王保明', $person->title);
    $this->assertSame('功能材料与智能制造研究院', $person->extra['college_name']);
    $this->assertSame('faculty_html_sudy_news', $person->extra['platform']);
}
/**
 * A ul.teacher card yields one item with the name from div.xm and the
 * academic title from the "职称" span.
 */
public function test_extracts_ra_teacher_cards(): void
{
    $subject = new FacultyListHtmlAdapter;
    $extractor = new \ReflectionMethod($subject, 'extractFromRaTeacherList');
    $extractor->setAccessible(true);

    $markup = <<<'HTML'
<title>专职教师-南京大学机器人与自动化学院</title>
<ul class="teacher">
<li>
<a href="http://ra.nju.edu.cn/szll/zzjs/20250901/i335910.html" title="周克敏">
<div class="data"><div class="name"><div class="xm">周克敏</div></div>
<div class="research pro">职称:<span>教授</span></div>
<div class="research">研究方向:<span>鲁棒控制</span></div></div>
</a>
</li>
</ul>
HTML;

    $found = $extractor->invoke($subject, $markup, [], 'https://ra.nju.edu.cn/szll/zzjs/index.html');

    $this->assertCount(1, $found);
    $person = $found[0];
    $this->assertSame('周克敏', $person->title);
    $this->assertSame('教授', $person->extra['academic_title']);
    $this->assertSame('faculty_html_ra', $person->extra['platform']);
}
/**
 * A VSB faculty table yields linked and unlinked people, and each person gets
 * the academic title of the <strong> section heading that precedes them.
 */
public function test_extracts_vsb_faculty_table(): void
{
    $html = <<<'HTML'
<title>专兼职教师-南京大学智能软件与工程学院</title>
<div class="zjzjs"><p><strong><span>教授</span></strong></p>
<table><tr><td><a href="zjzjs/yangkun.htm"><span>杨鲲</span></a></td><td><span>陶先平</span></td></tr></table>
<p><strong><span>副教授</span></strong></p>
<table><tr><td><a href="zjzjs/shaodong.htm"><span>邵栋</span></a></td></tr></table>
</div>
HTML;
    $adapter = new FacultyListHtmlAdapter;
    $method = new \ReflectionMethod($adapter, 'extractFromVsbFacultyTable');
    $method->setAccessible(true);
    $items = $method->invoke($adapter, $html, [], 'https://ise.nju.edu.cn/szll/zjzjs.htm');
    $this->assertCount(3, $items);

    // Index titles by person name so a missing person fails with a clear
    // message instead of array_search() returning false and indexing item 0.
    $titleByName = [];
    foreach ($items as $item) {
        $titleByName[$item->title] = $item->extra['academic_title'];
    }
    $this->assertArrayHasKey('杨鲲', $titleByName);
    $this->assertArrayHasKey('陶先平', $titleByName);
    $this->assertArrayHasKey('邵栋', $titleByName);
    $this->assertSame('教授', $titleByName['杨鲲']);
    $this->assertSame('副教授', $titleByName['邵栋']);
}
}

Loading…
Cancel
Save