|
|
|
|
@ -26,7 +26,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
$firstHtml = $this->fetchHtml($baseUrl);
|
|
|
|
|
|
|
|
|
|
if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) {
|
|
|
|
|
$items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults);
|
|
|
|
|
$items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages);
|
|
|
|
|
|
|
|
|
|
return $this->enrichEmailsFromProfilePages($items, $params);
|
|
|
|
|
}
|
|
|
|
|
@ -516,22 +516,90 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
string $pageHtml,
|
|
|
|
|
array $keywords,
|
|
|
|
|
int $maxResults,
|
|
|
|
|
int $maxPages = 1,
|
|
|
|
|
): array {
|
|
|
|
|
$config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl);
|
|
|
|
|
$search = implode(' ', $keywords);
|
|
|
|
|
$type = $search !== '' ? '2' : '1';
|
|
|
|
|
$timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
|
|
|
|
|
$maxPages = max(1, min(50, $maxPages));
|
|
|
|
|
|
|
|
|
|
$payload = [
|
|
|
|
|
'cat_id' => $config['cat_id'],
|
|
|
|
|
'cat_code' => $config['cat_code'],
|
|
|
|
|
'type' => $type,
|
|
|
|
|
'zm' => $search === '' ? 'All' : '',
|
|
|
|
|
'zc' => '',
|
|
|
|
|
'search' => $search,
|
|
|
|
|
];
|
|
|
|
|
if ($config['uses_page']) {
|
|
|
|
|
$payload['page'] = '1';
|
|
|
|
|
$items = [];
|
|
|
|
|
$seen = [];
|
|
|
|
|
$page = 1;
|
|
|
|
|
$totalCount = null;
|
|
|
|
|
|
|
|
|
|
while ($page <= $maxPages && count($items) < $maxResults) {
|
|
|
|
|
$body = $this->requestAjaxTeacherPage($config, $page, $search, $timeout);
|
|
|
|
|
if ($totalCount === null && isset($body['count'])) {
|
|
|
|
|
$totalCount = max(0, (int) $body['count']);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$content = (string) ($body['content'] ?? '');
|
|
|
|
|
if ($content === '') {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$before = count($items);
|
|
|
|
|
foreach ($this->extractFromAjaxTeacherContent(
|
|
|
|
|
$pageHtml.$content,
|
|
|
|
|
$keywords,
|
|
|
|
|
$requestUrl,
|
|
|
|
|
$config['cat_code'],
|
|
|
|
|
) as $item) {
|
|
|
|
|
if (isset($seen[$item->externalId])) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
$seen[$item->externalId] = true;
|
|
|
|
|
$items[] = $item;
|
|
|
|
|
if (count($items) >= $maxResults) {
|
|
|
|
|
break 2;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($config['variant'] === 'standard') {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (count($items) === $before) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($totalCount !== null && count($items) >= min($totalCount, $maxResults)) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$page++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param array{variant:string,cat_id:?string,cat_code:string,api_url:string} $config
|
|
|
|
|
* @return array<string, mixed>
|
|
|
|
|
*/
|
|
|
|
|
protected function requestAjaxTeacherPage(array $config, int $page, string $search, int $timeout): array
|
|
|
|
|
{
|
|
|
|
|
if ($config['variant'] === 'simple') {
|
|
|
|
|
$payload = [
|
|
|
|
|
'page' => (string) $page,
|
|
|
|
|
'cat_code' => $config['cat_code'],
|
|
|
|
|
'yjszxfl' => '全部',
|
|
|
|
|
'name' => $search,
|
|
|
|
|
'zm' => $search === '' ? 'All' : '',
|
|
|
|
|
];
|
|
|
|
|
} else {
|
|
|
|
|
$payload = [
|
|
|
|
|
'cat_id' => $config['cat_id'],
|
|
|
|
|
'cat_code' => $config['cat_code'],
|
|
|
|
|
'type' => $search !== '' ? '2' : '1',
|
|
|
|
|
'zm' => $search === '' ? 'All' : '',
|
|
|
|
|
'zc' => '',
|
|
|
|
|
'search' => $search,
|
|
|
|
|
];
|
|
|
|
|
if ($config['uses_page']) {
|
|
|
|
|
$payload['page'] = (string) $page;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$response = Http::timeout($timeout)
|
|
|
|
|
@ -553,26 +621,11 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
throw new \RuntimeException('教师列表接口返回格式异常');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$content = (string) ($body['content'] ?? '');
|
|
|
|
|
if ($content === '') {
|
|
|
|
|
return [];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$items = $this->extractFromAjaxTeacherContent(
|
|
|
|
|
$pageHtml.$content,
|
|
|
|
|
$keywords,
|
|
|
|
|
$requestUrl,
|
|
|
|
|
$config['cat_code'],
|
|
|
|
|
);
|
|
|
|
|
if (count($items) > $maxResults) {
|
|
|
|
|
$items = array_slice($items, 0, $maxResults);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $items;
|
|
|
|
|
return $body;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return array{cat_id:string,cat_code:string,api_url:string,uses_page:bool}
|
|
|
|
|
* @return array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool}
|
|
|
|
|
*/
|
|
|
|
|
protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array
|
|
|
|
|
{
|
|
|
|
|
@ -597,11 +650,17 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
$apiUrl = $origin.$apiUrl;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($catId === null || $catCode === null || $apiUrl === '') {
|
|
|
|
|
throw new \RuntimeException('无法解析教师列表接口参数(cat_id / cat_code)');
|
|
|
|
|
if ($catCode === null || $apiUrl === '') {
|
|
|
|
|
throw new \RuntimeException('无法解析教师列表接口参数(cat_code)');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$variant = $catId !== null ? 'standard' : 'simple';
|
|
|
|
|
if ($variant === 'simple') {
|
|
|
|
|
$usesPage = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return [
|
|
|
|
|
'variant' => $variant,
|
|
|
|
|
'cat_id' => $catId,
|
|
|
|
|
'cat_code' => $catCode,
|
|
|
|
|
'api_url' => $apiUrl,
|
|
|
|
|
@ -624,6 +683,18 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
$pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
|
|
|
|
|
$defaultCollege = $this->inferCollegeFromPageTitle($html);
|
|
|
|
|
|
|
|
|
|
$cardItems = $this->extractFromAjaxTeacherCards(
|
|
|
|
|
$html,
|
|
|
|
|
$keywords,
|
|
|
|
|
$sourceUrl,
|
|
|
|
|
$pageUniversity,
|
|
|
|
|
$defaultCollege,
|
|
|
|
|
$catCode,
|
|
|
|
|
);
|
|
|
|
|
if ($cardItems !== []) {
|
|
|
|
|
return $cardItems;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$parts = preg_split('#<div\s+class="rc-item">#u', $html) ?: [];
|
|
|
|
|
if (count($parts) > 1) {
|
|
|
|
|
array_shift($parts);
|
|
|
|
|
@ -667,6 +738,87 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Parse the card-style teacher list that sites such as ICISEE return over AJAX
 * (the name sits inside div.name, the academic title inside a span).
 *
 * A card is an anchor that wraps an optional div.imgk followed by a div.name.
 * A card becomes an item only when the leading text of its name block passes
 * looksLikePersonName(), its anchor carries a double-quoted href accepted by
 * looksLikeTeacherProfileUrl(), it has not been emitted already, and the name
 * plus affiliation passes matchesKeywords(). Returns an empty list when no
 * card markup is found.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromAjaxTeacherCards(
    string $html,
    array $keywords,
    string $sourceUrl,
    ?string $pageUniversity,
    ?string $affiliation,
    ?string $catCode,
): array {
    $cardPattern = '#<a\b([^>]*?)>\s*(?:<div\s+class="imgk">.*?</div>\s*)?<div\s+class="name">(.*?)</div>#su';
    $cardCount = preg_match_all($cardPattern, $html, $cards, PREG_SET_ORDER);
    if ($cardCount === false || $cardCount === 0) {
        return [];
    }

    $collected = [];
    $usedKeys = [];

    foreach ($cards as $card) {
        $anchorAttributes = (string) $card[1];
        $nameMarkup = (string) $card[2];

        // The name is the text at the very start of the name block, up to the first tag.
        if (preg_match('/^([^<]+)/u', $nameMarkup, $leadingText) !== 1) {
            continue;
        }

        $personName = CrawlAuthorParser::cleanText(trim($leadingText[1])) ?? '';
        if ($personName === '') {
            continue;
        }
        if (! $this->looksLikePersonName($personName)) {
            continue;
        }

        // Only double-quoted href attributes are recognised.
        if (preg_match('#\bhref="([^"]+)"#u', $anchorAttributes, $hrefHit) !== 1) {
            continue;
        }

        $link = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! $this->looksLikeTeacherProfileUrl($link, $catCode)) {
            continue;
        }

        $profileUrl = $this->resolveUrl($link, $sourceUrl);

        // De-duplicate on the resolved profile URL; fall back to a hash of the
        // name when the resolved URL is empty/falsy.
        $identity = $profileUrl;
        if (! $identity) {
            $identity = 'name:'.md5($personName);
        }
        if (isset($usedKeys[$identity])) {
            continue;
        }

        // Keyword filtering runs before the key is recorded, so a card rejected
        // here does not block a later card with the same key.
        $searchable = trim($personName.' '.($affiliation ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        // The academic title, when present, is the first bare <span> in the name block.
        $title = preg_match('#<span>([^<]+)</span>#u', $nameMarkup, $titleHit)
            ? CrawlAuthorParser::cleanText($titleHit[1])
            : null;

        $usedKeys[$identity] = true;
        $collected[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($identity),
            name: $personName,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
            summary: $affiliation ? '单位:'.$affiliation : null,
            keywords: $keywords,
            academicTitle: $title,
            platform: 'faculty_html_ajax',
            bio: null,
        );
    }

    return $collected;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param list<string> $keywords
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
|