master
lion 1 day ago
parent 68f30c05d6
commit a3a850b049

@ -26,7 +26,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
$firstHtml = $this->fetchHtml($baseUrl);
if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) {
$items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults);
$items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages);
return $this->enrichEmailsFromProfilePages($items, $params);
}
@ -516,22 +516,90 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
string $pageHtml,
array $keywords,
int $maxResults,
int $maxPages = 1,
): array {
$config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl);
$search = implode(' ', $keywords);
$type = $search !== '' ? '2' : '1';
$timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
$maxPages = max(1, min(50, $maxPages));
$payload = [
'cat_id' => $config['cat_id'],
'cat_code' => $config['cat_code'],
'type' => $type,
'zm' => $search === '' ? 'All' : '',
'zc' => '',
'search' => $search,
];
if ($config['uses_page']) {
$payload['page'] = '1';
$items = [];
$seen = [];
$page = 1;
$totalCount = null;
while ($page <= $maxPages && count($items) < $maxResults) {
$body = $this->requestAjaxTeacherPage($config, $page, $search, $timeout);
if ($totalCount === null && isset($body['count'])) {
$totalCount = max(0, (int) $body['count']);
}
$content = (string) ($body['content'] ?? '');
if ($content === '') {
break;
}
$before = count($items);
foreach ($this->extractFromAjaxTeacherContent(
$pageHtml.$content,
$keywords,
$requestUrl,
$config['cat_code'],
) as $item) {
if (isset($seen[$item->externalId])) {
continue;
}
$seen[$item->externalId] = true;
$items[] = $item;
if (count($items) >= $maxResults) {
break 2;
}
}
if ($config['variant'] === 'standard') {
break;
}
if (count($items) === $before) {
break;
}
if ($totalCount !== null && count($items) >= min($totalCount, $maxResults)) {
break;
}
$page++;
}
return $items;
}
/**
* @param array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool} $config
* @return array<string, mixed>
*/
protected function requestAjaxTeacherPage(array $config, int $page, string $search, int $timeout): array
{
if ($config['variant'] === 'simple') {
$payload = [
'page' => (string) $page,
'cat_code' => $config['cat_code'],
'yjszxfl' => '全部',
'name' => $search,
'zm' => $search === '' ? 'All' : '',
];
} else {
$payload = [
'cat_id' => $config['cat_id'],
'cat_code' => $config['cat_code'],
'type' => $search !== '' ? '2' : '1',
'zm' => $search === '' ? 'All' : '',
'zc' => '',
'search' => $search,
];
if ($config['uses_page']) {
$payload['page'] = (string) $page;
}
}
$response = Http::timeout($timeout)
@ -553,26 +621,11 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
throw new \RuntimeException('教师列表接口返回格式异常');
}
$content = (string) ($body['content'] ?? '');
if ($content === '') {
return [];
}
$items = $this->extractFromAjaxTeacherContent(
$pageHtml.$content,
$keywords,
$requestUrl,
$config['cat_code'],
);
if (count($items) > $maxResults) {
$items = array_slice($items, 0, $maxResults);
}
return $items;
return $body;
}
/**
* @return array{cat_id:string,cat_code:string,api_url:string,uses_page:bool}
* @return array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool}
*/
protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array
{
@ -597,11 +650,17 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
$apiUrl = $origin.$apiUrl;
}
if ($catId === null || $catCode === null || $apiUrl === '') {
throw new \RuntimeException('无法解析教师列表接口参数cat_id / cat_code');
if ($catCode === null || $apiUrl === '') {
throw new \RuntimeException('无法解析教师列表接口参数cat_code');
}
$variant = $catId !== null ? 'standard' : 'simple';
if ($variant === 'simple') {
$usesPage = true;
}
return [
'variant' => $variant,
'cat_id' => $catId,
'cat_code' => $catCode,
'api_url' => $apiUrl,
@ -624,6 +683,18 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
$pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
$defaultCollege = $this->inferCollegeFromPageTitle($html);
$cardItems = $this->extractFromAjaxTeacherCards(
$html,
$keywords,
$sourceUrl,
$pageUniversity,
$defaultCollege,
$catCode,
);
if ($cardItems !== []) {
return $cardItems;
}
$parts = preg_split('#<div\s+class="rc-item">#u', $html) ?: [];
if (count($parts) > 1) {
array_shift($parts);
@ -667,6 +738,87 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
);
}
/**
 * Extracts faculty entries from the card-style teacher markup that sites such
 * as ICISEE return over AJAX: each card is an <a> wrapping an optional
 * div.imgk and a div.name whose leading text is the person's name and whose
 * <span> holds the academic title.
 *
 * A card is skipped when its leading text does not pass looksLikePersonName(),
 * when the anchor has no double-quoted href, when the href does not pass
 * looksLikeTeacherProfileUrl(), when the same profile was already emitted, or
 * when "name + affiliation" does not pass matchesKeywords().
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromAjaxTeacherCards(
    string $html,
    array $keywords,
    string $sourceUrl,
    ?string $pageUniversity,
    ?string $affiliation,
    ?string $catCode,
): array {
    $cardCount = preg_match_all(
        '#<a\b([^>]*?)>\s*(?:<div\s+class="imgk">.*?</div>\s*)?<div\s+class="name">(.*?)</div>#su',
        $html,
        $cards,
        PREG_SET_ORDER,
    );
    // Both "no cards" (0) and a PCRE failure (false) yield an empty result.
    if ($cardCount === false || $cardCount === 0) {
        return [];
    }

    // These depend only on the affiliation, so compute them once up front.
    $affiliationText = $affiliation ?? '';
    $summary = $affiliation ? '单位:'.$affiliation : null;

    $collected = [];
    $emittedKeys = [];

    // Group 1 holds the anchor's attributes, group 2 the inner HTML of div.name.
    foreach ($cards as [1 => $anchorAttrs, 2 => $nameHtml]) {
        // The name is whatever text precedes the first nested tag.
        if (preg_match('/^([^<]+)/u', $nameHtml, $leadingText) !== 1) {
            continue;
        }

        $name = CrawlAuthorParser::cleanText(trim($leadingText[1])) ?? '';
        if ($name === '') {
            continue;
        }
        if (! $this->looksLikePersonName($name)) {
            continue;
        }

        if (preg_match('#\bhref="([^"]+)"#u', $anchorAttrs, $hrefParts) !== 1) {
            continue;
        }

        $href = html_entity_decode($hrefParts[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
            continue;
        }

        $profileUrl = $this->resolveUrl($href, $sourceUrl);

        // Deduplicate by resolved profile URL; fall back to a hash of the name
        // when the URL resolves to an empty/falsy value.
        $identity = $profileUrl;
        if (! $identity) {
            $identity = 'name:'.md5($name);
        }
        if (isset($emittedKeys[$identity])) {
            continue;
        }

        $searchable = trim($name.' '.$affiliationText);
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        // The title is the first attribute-less <span> inside div.name, if any.
        $academicTitle = preg_match('#<span>([^<]+)</span>#u', $nameHtml, $titleParts)
            ? CrawlAuthorParser::cleanText($titleParts[1])
            : null;

        $emittedKeys[$identity] = true;
        $collected[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($identity),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
            summary: $summary,
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_ajax',
            bio: null,
        );
    }

    return $collected;
}
/**
* @param list<string> $keywords
* @return list<CrawlItemDto>

@ -293,4 +293,64 @@ HTML;
)));
$this->assertNull($method->invoke($adapter, null));
}
/**
 * A listing page whose inline AJAX call sends cat_code but no cat_id must be
 * parsed as the "simple" variant, with paging enabled and the relative API
 * URL resolved against the page's origin.
 */
public function test_parses_icisee_ajax_teacher_config_without_cat_id(): void
{
    $pageHtml = <<<'HTML'
<script>
$.ajax({
url: '/active/ajax_teacher_list.html',
type: 'post',
data: {page:page, cat_code:'jiaoshiml', yjszxfl:global_yjszxfl, name:global_name, zm:global_zm},
});
</script>
HTML;

    $adapter = new FacultyListHtmlAdapter;

    // parseAjaxTeacherConfig() is protected, so reach it through reflection.
    $parseConfig = (new \ReflectionClass($adapter))->getMethod('parseAjaxTeacherConfig');
    $parseConfig->setAccessible(true);

    $parsed = $parseConfig->invokeArgs($adapter, [
        $pageHtml,
        'https://icisee.sjtu.edu.cn/jiaoshiml.html',
    ]);

    $this->assertSame('simple', $parsed['variant']);
    $this->assertNull($parsed['cat_id']);
    $this->assertSame('jiaoshiml', $parsed['cat_code']);
    $this->assertSame('https://icisee.sjtu.edu.cn/active/ajax_teacher_list.html', $parsed['api_url']);
    $this->assertTrue($parsed['uses_page']);
}
/**
 * Card-style markup (anchor > div.imgk + div.name with a <span> title) must
 * yield one item per card, carrying the name, academic title, absolute
 * profile URL and the AJAX platform tag.
 */
public function test_extracts_icisee_card_style_teacher_list(): void
{
    $cardsHtml = <<<'HTML'
<title>教师名录-上海交通大学集成电路学院(信息与电子工程学院)</title>
<a href="/jiaoshiml/caixinghan.html" target="_blank">
<div class="imgk"><img src="/upload/x.png" alt=""></div>
<div class="name">蔡星汉<span>教授</span><p class="line-2">微纳全重党支部书记</p></div>
</a>
<a href="/jiaoshiml/zhangsan.html" target="_blank">
<div class="imgk"><img src="/upload/y.png" alt=""></div>
<div class="name">张三<span>副教授</span></div>
</a>
HTML;

    $adapter = new FacultyListHtmlAdapter;

    // extractFromAjaxTeacherContent() is protected, so reach it through reflection.
    $extract = (new \ReflectionClass($adapter))->getMethod('extractFromAjaxTeacherContent');
    $extract->setAccessible(true);

    $found = $extract->invokeArgs($adapter, [
        $cardsHtml,
        [],
        'https://icisee.sjtu.edu.cn/jiaoshiml.html',
        'jiaoshiml',
    ]);

    $this->assertCount(2, $found);
    [$firstTeacher, $secondTeacher] = $found;

    // First card: full professor; the trailing <p> must not leak into the name.
    $this->assertSame('蔡星汉', $firstTeacher->title);
    $this->assertSame('教授', $firstTeacher->extra['academic_title']);
    $this->assertSame('https://icisee.sjtu.edu.cn/jiaoshiml/caixinghan.html', $firstTeacher->canonicalUrl);

    // Second card: associate professor.
    $this->assertSame('张三', $secondTeacher->title);
    $this->assertSame('副教授', $secondTeacher->extra['academic_title']);

    $this->assertSame('faculty_html_ajax', $firstTeacher->extra['platform']);
}
}

Loading…
Cancel
Save