|
|
|
|
@ -25,8 +25,8 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
$baseUrl = $this->normalizeRequestUrl($requestUrl);
|
|
|
|
|
$firstHtml = $this->fetchHtml($baseUrl);
|
|
|
|
|
|
|
|
|
|
if ($this->isSaisAjaxFacultyPage($firstHtml, $requestUrl)) {
|
|
|
|
|
$items = $this->fetchSaisFacultyItems($requestUrl, $firstHtml, $keywords, $maxResults);
|
|
|
|
|
if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) {
|
|
|
|
|
$items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults);
|
|
|
|
|
|
|
|
|
|
return $this->enrichEmailsFromProfilePages($items, $params);
|
|
|
|
|
}
|
|
|
|
|
@ -498,37 +498,42 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected function isSaisAjaxFacultyPage(string $html, string $sourceUrl): bool
|
|
|
|
|
protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool
|
|
|
|
|
{
|
|
|
|
|
if (str_contains(strtolower($sourceUrl), 'ajax_teacher_list')) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (str_contains($html, 'ajax_teacher_list.html')) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$host = strtolower((string) parse_url($sourceUrl, PHP_URL_HOST));
|
|
|
|
|
|
|
|
|
|
return str_contains($host, 'sais.sjtu.edu.cn')
|
|
|
|
|
&& str_contains(strtolower($sourceUrl), 'faculty');
|
|
|
|
|
return str_contains($html, 'ajax_teacher_list.html');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param list<string> $keywords
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
*/
|
|
|
|
|
protected function fetchSaisFacultyItems(
|
|
|
|
|
protected function fetchAjaxTeacherItems(
|
|
|
|
|
string $requestUrl,
|
|
|
|
|
string $pageHtml,
|
|
|
|
|
array $keywords,
|
|
|
|
|
int $maxResults,
|
|
|
|
|
): array {
|
|
|
|
|
$config = $this->parseSaisAjaxConfig($pageHtml, $requestUrl);
|
|
|
|
|
$config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl);
|
|
|
|
|
$search = implode(' ', $keywords);
|
|
|
|
|
$type = $search !== '' ? '2' : '1';
|
|
|
|
|
$timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
|
|
|
|
|
|
|
|
|
|
$payload = [
|
|
|
|
|
'cat_id' => $config['cat_id'],
|
|
|
|
|
'cat_code' => $config['cat_code'],
|
|
|
|
|
'type' => $type,
|
|
|
|
|
'zm' => $search === '' ? 'All' : '',
|
|
|
|
|
'zc' => '',
|
|
|
|
|
'search' => $search,
|
|
|
|
|
];
|
|
|
|
|
if ($config['uses_page']) {
|
|
|
|
|
$payload['page'] = '1';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$response = Http::timeout($timeout)
|
|
|
|
|
->connectTimeout(min(8, $timeout))
|
|
|
|
|
->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
|
|
|
|
|
@ -537,29 +542,28 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
'Accept' => 'application/json, text/html',
|
|
|
|
|
])
|
|
|
|
|
->asForm()
|
|
|
|
|
->post($config['api_url'], [
|
|
|
|
|
'cat_id' => $config['cat_id'],
|
|
|
|
|
'cat_code' => $config['cat_code'],
|
|
|
|
|
'type' => $type,
|
|
|
|
|
'zm' => $search === '' ? 'All' : '',
|
|
|
|
|
'search' => $search,
|
|
|
|
|
]);
|
|
|
|
|
->post($config['api_url'], $payload);
|
|
|
|
|
|
|
|
|
|
if (! $response->successful()) {
|
|
|
|
|
throw new \RuntimeException('SAIS 教师列表接口请求失败(HTTP '.$response->status().')');
|
|
|
|
|
throw new \RuntimeException('教师列表接口请求失败(HTTP '.$response->status().')');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$payload = $response->json();
|
|
|
|
|
if (! is_array($payload)) {
|
|
|
|
|
throw new \RuntimeException('SAIS 教师列表接口返回格式异常');
|
|
|
|
|
$body = $response->json();
|
|
|
|
|
if (! is_array($body)) {
|
|
|
|
|
throw new \RuntimeException('教师列表接口返回格式异常');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$content = (string) ($payload['content'] ?? '');
|
|
|
|
|
$content = (string) ($body['content'] ?? '');
|
|
|
|
|
if ($content === '') {
|
|
|
|
|
return [];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$items = $this->extractFromSaisJsList($pageHtml.$content, $keywords, $requestUrl);
|
|
|
|
|
$items = $this->extractFromAjaxTeacherContent(
|
|
|
|
|
$pageHtml.$content,
|
|
|
|
|
$keywords,
|
|
|
|
|
$requestUrl,
|
|
|
|
|
$config['cat_code'],
|
|
|
|
|
);
|
|
|
|
|
if (count($items) > $maxResults) {
|
|
|
|
|
$items = array_slice($items, 0, $maxResults);
|
|
|
|
|
}
|
|
|
|
|
@ -568,13 +572,15 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return array{cat_id:string,cat_code:string,api_url:string}
|
|
|
|
|
* @return array{cat_id:string,cat_code:string,api_url:string,uses_page:bool}
|
|
|
|
|
*/
|
|
|
|
|
protected function parseSaisAjaxConfig(string $html, string $sourceUrl): array
|
|
|
|
|
protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array
|
|
|
|
|
{
|
|
|
|
|
$catId = '18';
|
|
|
|
|
$catCode = 'faculty';
|
|
|
|
|
$apiUrl = 'https://sais.sjtu.edu.cn/active/ajax_teacher_list.html';
|
|
|
|
|
$catId = null;
|
|
|
|
|
$catCode = null;
|
|
|
|
|
$usesPage = str_contains($html, 'page:page');
|
|
|
|
|
$origin = $this->requestOrigin($sourceUrl);
|
|
|
|
|
$apiUrl = $origin !== null ? $origin.'/active/ajax_teacher_list.html' : '';
|
|
|
|
|
|
|
|
|
|
if (preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $match)) {
|
|
|
|
|
$catId = $match[1];
|
|
|
|
|
@ -587,15 +593,19 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
?? $apiUrl;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$origin = $this->requestOrigin($sourceUrl);
|
|
|
|
|
if ($origin !== null && str_starts_with($apiUrl, '/')) {
|
|
|
|
|
$apiUrl = $origin.$apiUrl;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($catId === null || $catCode === null || $apiUrl === '') {
|
|
|
|
|
throw new \RuntimeException('无法解析教师列表接口参数(cat_id / cat_code)');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return [
|
|
|
|
|
'cat_id' => $catId,
|
|
|
|
|
'cat_code' => $catCode,
|
|
|
|
|
'api_url' => $apiUrl,
|
|
|
|
|
'uses_page' => $usesPage,
|
|
|
|
|
];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -603,33 +613,92 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
* @param list<string> $keywords
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
*/
|
|
|
|
|
protected function extractFromSaisJsList(string $html, array $keywords, string $sourceUrl): array
|
|
|
|
|
{
|
|
|
|
|
protected function extractFromAjaxTeacherContent(
|
|
|
|
|
string $html,
|
|
|
|
|
array $keywords,
|
|
|
|
|
string $sourceUrl,
|
|
|
|
|
?string $catCode = null,
|
|
|
|
|
): array {
|
|
|
|
|
$items = [];
|
|
|
|
|
$seen = [];
|
|
|
|
|
$pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
|
|
|
|
|
$defaultCollege = $this->inferCollegeFromPageTitle($html);
|
|
|
|
|
|
|
|
|
|
if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $matches, PREG_SET_ORDER)) {
|
|
|
|
|
$parts = preg_split('#<div\s+class="rc-item">#u', $html) ?: [];
|
|
|
|
|
if (count($parts) > 1) {
|
|
|
|
|
array_shift($parts);
|
|
|
|
|
foreach ($parts as $block) {
|
|
|
|
|
$department = $defaultCollege;
|
|
|
|
|
if (preg_match('#<div\s+class="tit">.*?<div\s+class="name">([^<]+)</div>#su', $block, $deptMatch)) {
|
|
|
|
|
$sectionTitle = CrawlAuthorParser::cleanText($deptMatch[1]);
|
|
|
|
|
if ($sectionTitle !== null && $sectionTitle !== '' && ! $this->looksLikePersonName($sectionTitle)) {
|
|
|
|
|
$department = $sectionTitle;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
foreach ($this->extractTeacherLinksFromHtmlBlock(
|
|
|
|
|
$block,
|
|
|
|
|
$keywords,
|
|
|
|
|
$sourceUrl,
|
|
|
|
|
$pageUniversity,
|
|
|
|
|
$department,
|
|
|
|
|
$catCode,
|
|
|
|
|
) as $item) {
|
|
|
|
|
if (isset($seen[$item->externalId])) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
$seen[$item->externalId] = true;
|
|
|
|
|
$items[] = $item;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($items !== []) {
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $this->extractTeacherLinksFromHtmlBlock(
|
|
|
|
|
$html,
|
|
|
|
|
$keywords,
|
|
|
|
|
$sourceUrl,
|
|
|
|
|
$pageUniversity,
|
|
|
|
|
$defaultCollege,
|
|
|
|
|
$catCode,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param list<string> $keywords
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
*/
|
|
|
|
|
protected function extractTeacherLinksFromHtmlBlock(
|
|
|
|
|
string $html,
|
|
|
|
|
array $keywords,
|
|
|
|
|
string $sourceUrl,
|
|
|
|
|
?string $pageUniversity,
|
|
|
|
|
?string $affiliation,
|
|
|
|
|
?string $catCode,
|
|
|
|
|
): array {
|
|
|
|
|
$items = [];
|
|
|
|
|
$seen = [];
|
|
|
|
|
|
|
|
|
|
if (! preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $html, $matches, PREG_SET_ORDER)) {
|
|
|
|
|
return [];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
foreach ($matches as $match) {
|
|
|
|
|
$attrs = (string) $match[1];
|
|
|
|
|
if (! preg_match('/\bclass="[^"]*\bname\b[^"]*"/u', $attrs)) {
|
|
|
|
|
$rawName = preg_replace('/\s+/u', '', strip_tags($match[2])) ?? '';
|
|
|
|
|
$name = CrawlAuthorParser::cleanText($rawName) ?? '';
|
|
|
|
|
if ($name === '' || ! $this->looksLikePersonName($name)) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$name = CrawlAuthorParser::cleanText($match[2]) ?? '';
|
|
|
|
|
if ($name === '' || ! $this->looksLikePersonName($name)) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
|
|
|
|
|
if (! str_contains(strtolower($href), '/faculty/')) {
|
|
|
|
|
if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -639,7 +708,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$plain = trim($name.' '.($defaultCollege ?? ''));
|
|
|
|
|
$plain = trim($name.' '.($affiliation ?? ''));
|
|
|
|
|
if (! $this->matchesKeywords($plain, $keywords)) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
@ -650,12 +719,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
name: $name,
|
|
|
|
|
profileUrl: $profileUrl,
|
|
|
|
|
email: null,
|
|
|
|
|
affiliation: $defaultCollege,
|
|
|
|
|
universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
|
|
|
|
|
summary: $defaultCollege ? '单位:'.$defaultCollege : null,
|
|
|
|
|
affiliation: $affiliation,
|
|
|
|
|
universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
|
|
|
|
|
summary: $affiliation ? '单位:'.$affiliation : null,
|
|
|
|
|
keywords: $keywords,
|
|
|
|
|
academicTitle: null,
|
|
|
|
|
platform: 'faculty_html_sais',
|
|
|
|
|
platform: 'faculty_html_ajax',
|
|
|
|
|
bio: null,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
@ -663,6 +732,26 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decide whether a link target looks like an individual teacher profile page.
 *
 * Only the path component of the URL is inspected, compared case-insensitively.
 * The path must end in "/<directory>/<file>.html", where <directory> is one of
 * the built-in names (faculty, jiaoshiml, people/detail_new) or, failing that,
 * the supplied category code.
 *
 * @param string      $href    Link target taken from an anchor element.
 * @param string|null $catCode Optional category code accepted as an extra directory name.
 *
 * @return bool True when the path matches one of the accepted profile layouts.
 */
protected function looksLikeTeacherProfileUrl(string $href, ?string $catCode): bool
{
    // parse_url() yields null/false when there is no usable path; both collapse to ''.
    $normalizedPath = strtolower((string) parse_url($href, PHP_URL_PATH));

    if ($normalizedPath === '') {
        return false;
    }

    $inKnownDirectory = preg_match(
        '#/(faculty|jiaoshiml|people/detail_new)/[^/]+\.html$#',
        $normalizedPath,
    ) === 1;

    if ($inKnownDirectory) {
        return true;
    }

    // Without a category code there is no further directory name to try.
    if ($catCode === null || $catCode === '') {
        return false;
    }

    // Quote the code so it is matched literally inside the '#'-delimited pattern.
    $quotedCode = preg_quote(strtolower($catCode), '#');

    return (bool) preg_match('#/'.$quotedCode.'/[^/]+\.html$#', $normalizedPath);
}
|
|
|
|
|
|
|
|
|
|
protected function requestOrigin(string $sourceUrl): ?string
|
|
|
|
|
{
|
|
|
|
|
$parts = parse_url($sourceUrl);
|
|
|
|
|
|