|
|
|
|
@ -24,6 +24,13 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
|
|
|
|
|
$baseUrl = $this->normalizeRequestUrl($requestUrl);
|
|
|
|
|
$firstHtml = $this->fetchHtml($baseUrl);
|
|
|
|
|
|
|
|
|
|
if ($this->isSaisAjaxFacultyPage($firstHtml, $requestUrl)) {
|
|
|
|
|
$items = $this->fetchSaisFacultyItems($requestUrl, $firstHtml, $keywords, $maxResults);
|
|
|
|
|
|
|
|
|
|
return $this->enrichEmailsFromProfilePages($items, $params);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$totalPages = $this->detectTotalPages($firstHtml);
|
|
|
|
|
$pagesToFetch = min($maxPages, $totalPages);
|
|
|
|
|
|
|
|
|
|
@ -342,6 +349,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
'/电子信箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
|
|
|
|
|
'/E-?mail[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu',
|
|
|
|
|
'/邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
|
|
|
|
|
'/电子邮件[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
foreach ($labeledPatterns as $pattern) {
|
|
|
|
|
@ -490,6 +498,186 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decide whether a faculty page should be crawled through the SAIS AJAX teacher-list endpoint.
 *
 * The page qualifies when any of the following holds:
 *  - the request URL mentions `ajax_teacher_list` (case-insensitive);
 *  - the fetched markup references `ajax_teacher_list.html`;
 *  - the URL host is `sais.sjtu.edu.cn` (or a subdomain of it) and the URL mentions `faculty`.
 *
 * @param string $html      Markup of the page that was fetched for $sourceUrl.
 * @param string $sourceUrl URL the crawl was requested for.
 */
protected function isSaisAjaxFacultyPage(string $html, string $sourceUrl): bool
{
    $lowerUrl = strtolower($sourceUrl);

    if (str_contains($lowerUrl, 'ajax_teacher_list') || str_contains($html, 'ajax_teacher_list.html')) {
        return true;
    }

    // parse_url() yields null/false for URLs without a host; the cast turns that into ''.
    // A trailing dot (fully-qualified form) is dropped so it compares like the plain host.
    $host = rtrim(strtolower((string) parse_url($sourceUrl, PHP_URL_HOST)), '.');

    // Compare the host exactly or as a parent domain. A plain substring test would also
    // accept look-alike hosts such as "sais.sjtu.edu.cn.example.com" or "notsais.sjtu.edu.cn".
    $isSaisHost = $host === 'sais.sjtu.edu.cn' || str_ends_with($host, '.sais.sjtu.edu.cn');

    return $isSaisHost && str_contains($lowerUrl, 'faculty');
}
|
|
|
|
|
|
|
|
|
|
/**
 * Fetch faculty entries from the SAIS AJAX teacher-list endpoint.
 *
 * The endpoint address and category parameters are read from the page markup via
 * parseSaisAjaxConfig(). The keywords are joined with spaces and sent as the `search`
 * field; the HTML fragment returned in the JSON `content` field is appended to the page
 * markup and handed to extractFromSaisJsList().
 *
 * @param string       $requestUrl URL of the faculty page being crawled.
 * @param string       $pageHtml   Markup already fetched for $requestUrl.
 * @param list<string> $keywords   Search keywords; an empty list requests the unfiltered list.
 * @param int          $maxResults Upper bound on the number of returned items.
 *
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException When the endpoint answers with a non-successful status, or when
 *                           the payload is not a JSON object with a scalar `content` field.
 */
protected function fetchSaisFacultyItems(
    string $requestUrl,
    string $pageHtml,
    array $keywords,
    int $maxResults,
): array {
    $config = $this->parseSaisAjaxConfig($pageHtml, $requestUrl);
    $search = implode(' ', $keywords);
    $hasSearch = $search !== '';
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));

    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        // NOTE(review): the first argument of retry() is the total number of attempts, so 1
        // means the request is never repeated - confirm whether one real retry was intended.
        ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json, text/html',
        ])
        ->asForm()
        ->post($config['api_url'], [
            'cat_id' => $config['cat_id'],
            'cat_code' => $config['cat_code'],
            // The request switches to type "2" with an empty `zm` whenever a search term is present.
            'type' => $hasSearch ? '2' : '1',
            'zm' => $hasSearch ? '' : 'All',
            'search' => $search,
        ]);

    if (! $response->successful()) {
        throw new \RuntimeException('SAIS 教师列表接口请求失败(HTTP '.$response->status().')');
    }

    $payload = $response->json();
    if (! is_array($payload)) {
        throw new \RuntimeException('SAIS 教师列表接口返回格式异常');
    }

    $content = $payload['content'] ?? '';
    if (is_array($content)) {
        // A nested structure cannot be an HTML fragment. Casting it would produce the literal
        // string "Array" (plus a conversion warning), so report it as a malformed payload.
        throw new \RuntimeException('SAIS 教师列表接口返回格式异常');
    }

    $content = (string) $content;
    if ($content === '') {
        return [];
    }

    $items = $this->extractFromSaisJsList($pageHtml.$content, $keywords, $requestUrl);

    // Clamp the limit: a negative length would make array_slice() trim from the end instead.
    $limit = max(0, $maxResults);

    return count($items) > $limit ? array_slice($items, 0, $limit) : $items;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read the SAIS teacher-list AJAX settings out of the page's inline script.
 *
 * Built-in defaults are used for every value that cannot be found in the markup.
 * Only single-quoted values are recognised. A discovered endpoint path is resolved
 * against $sourceUrl, and a result that is still root-relative is prefixed with the
 * origin of $sourceUrl.
 *
 * @param string $html      Markup to scan for `cat_id`, `cat_code` and `url` assignments.
 * @param string $sourceUrl URL the markup was fetched from.
 *
 * @return array{cat_id:string,cat_code:string,api_url:string}
 */
protected function parseSaisAjaxConfig(string $html, string $sourceUrl): array
{
    $settings = [
        'cat_id' => '18',
        'cat_code' => 'faculty',
        'api_url' => 'https://sais.sjtu.edu.cn/active/ajax_teacher_list.html',
    ];

    if (preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $idHit) === 1) {
        $settings['cat_id'] = $idHit[1];
    }

    if (preg_match("/cat_code\s*:\s*'([^']+)'/i", $html, $codeHit) === 1) {
        $settings['cat_code'] = $codeHit[1];
    }

    if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $urlHit) === 1) {
        $rawEndpoint = html_entity_decode($urlHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // Keep the previous value when the URL cannot be resolved.
        $settings['api_url'] = $this->resolveUrl($rawEndpoint, $sourceUrl) ?? $settings['api_url'];
    }

    // Make a root-relative endpoint absolute when the source URL has a usable origin.
    $origin = $this->requestOrigin($sourceUrl);
    if (str_starts_with($settings['api_url'], '/') && $origin !== null) {
        $settings['api_url'] = $origin.$settings['api_url'];
    }

    return $settings;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build faculty items from the anchors of a SAIS teacher list.
 *
 * An anchor is accepted when all of the following hold:
 *  - it contains only text and carries a double-quoted `class` attribute with the word `name`;
 *  - it has a double-quoted `href` whose decoded value contains `/faculty/`;
 *  - its cleaned text passes looksLikePersonName();
 *  - its name plus the page's college passes matchesKeywords().
 *
 * Entries are de-duplicated by resolved profile URL, falling back to a hash of the name.
 *
 * @param string       $html      Markup to scan.
 * @param list<string> $keywords  Keywords each entry must match; also stored on the item.
 * @param string       $sourceUrl URL used to resolve relative profile links.
 *
 * @return list<CrawlItemDto>
 */
protected function extractFromSaisJsList(string $html, array $keywords, string $sourceUrl): array
{
    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $college = $this->inferCollegeFromPageTitle($html);

    $anchorCount = preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER);
    if (! $anchorCount) {
        return [];
    }

    $collected = [];
    $visited = [];

    foreach ($anchors as $anchor) {
        $attributes = (string) $anchor[1];

        // Only anchors styled as a name and carrying a link are candidates.
        $hasNameClass = preg_match('/\bclass="[^"]*\bname\b[^"]*"/u', $attributes);
        if (! $hasNameClass || ! preg_match('#\bhref="([^"]+)"#u', $attributes, $hrefHit)) {
            continue;
        }

        $personName = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
        if ($personName === '' || ! $this->looksLikePersonName($personName)) {
            continue;
        }

        $link = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! str_contains(strtolower($link), '/faculty/')) {
            continue;
        }

        $profileUrl = $this->resolveUrl($link, $sourceUrl);

        // Prefer the profile URL as identity; fall back to the name when it is unavailable.
        $identity = $profileUrl;
        if (! $identity) {
            $identity = 'name:'.md5($personName);
        }

        if ($visited[$identity] ?? false) {
            continue;
        }

        $searchable = trim($personName.' '.($college ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        $visited[$identity] = true;

        $collected[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($identity),
            name: $personName,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $college,
            universityName: $university ?? CrawlAuthorParser::universityFromAffiliation($college),
            summary: $college ? '单位:'.$college : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_sais',
            bio: null,
        );
    }

    return $collected;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reduce a URL to its origin: `scheme://host`, plus `:port` when the URL names one.
 *
 * @param string $sourceUrl URL to take the origin from.
 *
 * @return string|null The origin, or null when the URL cannot be parsed or lacks a scheme or host.
 */
protected function requestOrigin(string $sourceUrl): ?string
{
    $components = parse_url($sourceUrl);
    if (! is_array($components)) {
        return null;
    }

    $scheme = $components['scheme'] ?? null;
    $host = $components['host'] ?? null;
    if (! $scheme || ! $host) {
        return null;
    }

    $port = $components['port'] ?? null;
    $portSuffix = $port ? ':'.$port : '';

    return $scheme.'://'.$host.$portSuffix;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param list<string> $keywords
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
|