|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
|
|
use App\Models\CrawlSource;
|
|
|
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
|
|
|
use App\Services\Crawl\CrawlAuthorParser;
|
|
|
use App\Services\Crawl\CrawlItemDto;
|
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
|
use Illuminate\Http\Client\Response;
|
|
|
use Illuminate\Support\Facades\Http;
|
|
|
use Illuminate\Support\Str;
|
|
|
|
|
|
/**
|
|
|
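 * Generic department/faculty list page adapter: entries that carry an e-mail address are preferred; when the page has none, list cards (tsites and similar layouts) are parsed for name, affiliation, academic title and homepage.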
|
|
|
*/
|
|
|
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
{
|
|
|
/**
 * Crawl a faculty list page and return the people found on it.
 *
 * Reads `keyword`, `max_results` (clamped to 1..500, default 30) and
 * `max_pages` (clamped to 1..50, default 1) from $params. Pages recognised by
 * isSaisAjaxFacultyPage() are delegated to fetchSaisFacultyItems(); any other
 * page is parsed directly and, when more pages are both allowed and detected,
 * continued through fetchRemainingListPages(). The collected items always pass
 * through enrichEmailsFromProfilePages() before they are returned.
 *
 * $source is not read by this implementation.
 *
 * @param array<string, mixed> $params
 * @return list<CrawlItemDto>
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
    $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 30)));
    $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

    $baseUrl = $this->normalizeRequestUrl($requestUrl);
    $firstHtml = $this->fetchHtml($baseUrl);

    if ($this->isSaisAjaxFacultyPage($firstHtml, $requestUrl)) {
        $saisItems = $this->fetchSaisFacultyItems($requestUrl, $firstHtml, $keywords, $maxResults);

        return $this->enrichEmailsFromProfilePages($saisItems, $params);
    }

    $pagesToFetch = min($maxPages, $this->detectTotalPages($firstHtml));

    // Page 1 is already downloaded, so it is parsed here; later pages are fetched in batches below.
    $merged = [];
    $seen = [];
    foreach ($this->extractFromHtml($firstHtml, $keywords, $requestUrl) as $candidate) {
        if (isset($seen[$candidate->externalId])) {
            continue;
        }

        $seen[$candidate->externalId] = true;
        $merged[] = $candidate;

        if (count($merged) >= $maxResults) {
            break;
        }
    }

    $needsMorePages = $pagesToFetch > 1 && count($merged) < $maxResults;
    if ($needsMorePages) {
        $merged = $this->fetchRemainingListPages(
            $baseUrl,
            $firstHtml,
            $pagesToFetch,
            $keywords,
            $requestUrl,
            $merged,
            $seen,
            $maxResults,
        );
    }

    return $this->enrichEmailsFromProfilePages($merged, $params);
}
|
|
|
|
|
|
/**
 * Download list pages 2..$pagesToFetch in batches and append every item that
 * has not been seen yet to $merged, stopping as soon as $maxResults is reached.
 *
 * The batch size comes from `crawl.faculty.list_fetch_pool_size` (default 5,
 * clamped to 1..10). Within a batch the pages are processed in ascending page
 * order; pages for which fetchHtmlPool() returned no body are simply absent.
 *
 * @param list<string> $keywords
 * @param list<CrawlItemDto> $merged items already collected from page 1
 * @param array<string, true> $seen external ids already present in $merged
 * @return list<CrawlItemDto>
 */
protected function fetchRemainingListPages(
    string $baseUrl,
    string $firstHtml,
    int $pagesToFetch,
    array $keywords,
    string $requestUrl,
    array $merged,
    array $seen,
    int $maxResults,
): array {
    $concurrency = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5)));

    $urlsByPage = [];
    for ($pageNo = 2; $pageNo <= $pagesToFetch; $pageNo++) {
        $urlsByPage[$pageNo] = $this->buildPageUrl($baseUrl, $pageNo, $firstHtml);
    }

    // preserve_keys = true keeps the page numbers as keys inside every batch.
    foreach (array_chunk($urlsByPage, $concurrency, true) as $batch) {
        $bodies = $this->fetchHtmlPool($batch);
        ksort($bodies);

        foreach ($bodies as $pageHtml) {
            foreach ($this->extractFromHtml($pageHtml, $keywords, $requestUrl) as $candidate) {
                $id = $candidate->externalId;
                if (isset($seen[$id])) {
                    continue;
                }

                $seen[$id] = true;
                $merged[] = $candidate;

                if (count($merged) >= $maxResults) {
                    return $merged;
                }
            }
        }
    }

    return $merged;
}
|
|
|
|
|
|
/**
 * Fetch several list pages concurrently through Http::pool().
 *
 * Only pages whose response is successful and has a non-empty body appear in
 * the result; all other pages are left out.
 *
 * @param array<int, string> $pageUrls page number => URL
 * @return array<int, string> page number => response body
 */
protected function fetchHtmlPool(array $pageUrls): array
{
    if ($pageUrls === []) {
        return [];
    }

    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout) {
        foreach ($pageUrls as $pageNo => $pageUrl) {
            // Each request is registered under its page number so the response can be looked up afterwards.
            $pool->as((string) $pageNo)
                ->timeout($timeout)
                ->connectTimeout(min(8, $timeout))
                ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                ->withHeaders($headers)
                ->get($pageUrl);
        }
    });

    $htmlByPage = [];
    foreach (array_keys($pageUrls) as $pageNo) {
        $body = $this->responseBodyFromPoolResult($responses[(string) $pageNo] ?? null);
        if ($body === null || $body === '') {
            continue;
        }

        $htmlByPage[$pageNo] = $body;
    }

    return $htmlByPage;
}
|
|
|
|
|
|
/**
 * Visit the profile page of items that have no e-mail yet and try to fill in
 * the e-mail plus some profile metadata.
 *
 * Does nothing when `crawl.faculty.profile_email_enrich_enabled` is false.
 * At most resolveProfileEnrichMax() items are visited; a budget of zero marks
 * every item through markProfileEnrichSkipped(). Items that would have
 * qualified but fell outside the budget are flagged through
 * markItemProfileEnrichSkipped(). The order of $items is preserved.
 *
 * @param list<CrawlItemDto> $items
 * @param array<string, mixed> $params
 * @return list<CrawlItemDto>
 */
protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
{
    if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
        return $items;
    }

    $maxEnrich = $this->resolveProfileEnrichMax($params, count($items));
    if ($maxEnrich <= 0) {
        return $this->markProfileEnrichSkipped($items);
    }

    $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
    $timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    // Select the first $maxEnrich items that lack an e-mail but do have a profile URL.
    $pending = [];
    foreach ($items as $index => $item) {
        if (count($pending) >= $maxEnrich) {
            break;
        }
        if (! $this->itemHasEmail($item) && $item->canonicalUrl) {
            $pending[$index] = $item;
        }
    }

    if ($pending === []) {
        return $items;
    }

    $enriched = [];
    foreach (array_chunk($pending, $poolSize, true) as $batch) {
        $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batch, $headers, $timeout) {
            foreach ($batch as $index => $item) {
                $pool->as((string) $index)
                    ->timeout($timeout)
                    ->connectTimeout(min(8, $timeout))
                    ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                    ->withHeaders($headers)
                    ->get($item->canonicalUrl);
            }
        });

        foreach ($batch as $index => $item) {
            $body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null);

            if ($body !== null) {
                $email = $this->extractEmailFromProfileHtml($body);
                if ($email) {
                    $item = $this->applyEmailToItem($item, $email);
                }
                $item = $this->applyProfileMetadataToItem($item, $body);
            }

            // Recorded even when the download failed, so the item is not flagged as "skipped" below.
            $enriched[$index] = $item;
        }
    }

    $result = [];
    foreach ($items as $index => $item) {
        $result[] = match (true) {
            isset($enriched[$index]) => $enriched[$index],
            ! $this->itemHasEmail($item) && $item->canonicalUrl => $this->markItemProfileEnrichSkipped($item),
            default => $item,
        };
    }

    return $result;
}
|
|
|
|
|
|
/**
 * Work out how many profile pages may be visited for one crawl.
 *
 * Returns 0 when `skip_profile_enrich` is exactly `true`. Otherwise the value
 * of `profile_enrich_max` (from $params, falling back to the
 * `crawl.faculty.profile_enrich_max` config, default 32) is capped at 200 and
 * at $itemCount and never drops below 0.
 *
 * @param array<string, mixed> $params
 */
protected function resolveProfileEnrichMax(array $params, int $itemCount): int
{
    $skipRequested = $params['skip_profile_enrich'] ?? false;
    if ($skipRequested === true) {
        return 0;
    }

    $requested = (int) ($params['profile_enrich_max'] ?? config('crawl.faculty.profile_enrich_max', 32));
    $capped = min($itemCount, 200, $requested);

    return max(0, $capped);
}
|
|
|
|
|
|
/**
 * Apply markItemProfileEnrichSkipped() to every item, keeping the keys.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
protected function markProfileEnrichSkipped(array $items): array
{
    $marked = [];
    foreach ($items as $key => $item) {
        $marked[$key] = $this->markItemProfileEnrichSkipped($item);
    }

    return $marked;
}
|
|
|
|
|
|
/**
 * Return a copy of the item with `extra['profile_enrich_skipped'] = true`.
 *
 * Items that already have an e-mail are returned untouched.
 */
protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
{
    $alreadyHasEmail = $this->itemHasEmail($item);
    if ($alreadyHasEmail) {
        return $item;
    }

    $flaggedExtra = $item->extra;
    $flaggedExtra['profile_enrich_skipped'] = true;

    // The DTO is rebuilt field by field; only `extra` differs from the source item.
    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $flaggedExtra,
        authorsParsed: $item->authorsParsed,
    );
}
|
|
|
|
|
|
/**
 * Extract the body from one Http::pool() result.
 *
 * @param mixed $result a pool entry; anything that is not a Response yields null
 * @return string|null the body of a successful response, otherwise null
 */
protected function responseBodyFromPoolResult(mixed $result): ?string
{
    if (! $result instanceof Response) {
        return null;
    }

    return $result->successful() ? (string) $result->body() : null;
}
|
|
|
|
|
|
/**
 * Tell whether the item already carries an e-mail that
 * CrawlAuthorParser::normalizeEmail() accepts, either on
 * `extra['lead_author']` or on any entry of `authorsParsed`.
 */
protected function itemHasEmail(CrawlItemDto $item): bool
{
    $leadAuthor = $item->extra['lead_author'] ?? null;
    if (is_array($leadAuthor)) {
        $leadEmail = CrawlAuthorParser::normalizeEmail($leadAuthor['email'] ?? null);
        if ($leadEmail) {
            return true;
        }
    }

    foreach ($item->authorsParsed as $parsedAuthor) {
        $authorEmail = CrawlAuthorParser::normalizeEmail($parsedAuthor['email'] ?? null);
        if ($authorEmail) {
            return true;
        }
    }

    return false;
}
|
|
|
|
|
|
/**
 * Return a copy of the item with $email written to `extra['lead_author']`
 * and to the first entry of `authorsParsed`.
 *
 * The e-mail is normalised first; when normalisation yields null the raw
 * value is kept. If the item has no parsed authors, a single author entry is
 * created from the item title and the lead-author data.
 */
protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
{
    $email = CrawlAuthorParser::normalizeEmail($email) ?? $email;

    $existingLead = $item->extra['lead_author'] ?? null;
    $lead = is_array($existingLead) ? $existingLead : [];
    $lead['email'] = $email;

    $authorsParsed = $item->authorsParsed;
    if ($authorsParsed !== []) {
        $authorsParsed[0]['email'] = $email;
    } else {
        $authorsParsed = [[
            'name' => $item->title,
            'email' => $email,
            'affiliation' => $lead['affiliation'] ?? $lead['college'] ?? null,
            'university_name' => $lead['university_name'] ?? $item->schoolName,
        ]];
    }

    $extra = $item->extra;
    $extra['lead_author'] = $lead;

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $extra,
        authorsParsed: $authorsParsed,
    );
}
|
|
|
|
|
|
/**
 * Pick the most plausible personal e-mail address from a profile page.
 *
 * Addresses that directly follow a known label (e.g. "E-mail:" or its Chinese
 * equivalents) win. Otherwise every address in the page is considered: the
 * first one ending in `.edu.cn` or `.edu` is preferred, then the first one
 * found. Addresses rejected by isNoiseEmail() are never returned.
 *
 * @return string|null the normalised address, or null when none qualifies
 */
protected function extractEmailFromProfileHtml(string $html): ?string
{
    $labeledPatterns = [
        '/电子邮箱[::]\s*<\/strong>\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        '/电子邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        '/电子信箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        '/E-?mail[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu',
        '/邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        '/电子邮件[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
    ];

    foreach ($labeledPatterns as $pattern) {
        if (! preg_match($pattern, $html, $labeled)) {
            continue;
        }

        $labeledEmail = CrawlAuthorParser::normalizeEmail($labeled[1]);
        if ($labeledEmail && ! $this->isNoiseEmail($labeledEmail)) {
            return $labeledEmail;
        }
    }

    // Fallback: collect every address on the page, de-duplicated in order of first appearance.
    $found = [];
    preg_match_all('#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#', $html, $found);

    $unique = [];
    foreach ($found[1] ?? [] as $raw) {
        $normalized = CrawlAuthorParser::normalizeEmail($raw);
        if ($normalized && ! $this->isNoiseEmail($normalized)) {
            $unique[$normalized] = true;
        }
    }

    $candidates = array_keys($unique);
    if ($candidates === []) {
        return null;
    }

    foreach ($candidates as $candidate) {
        $isAcademic = str_ends_with($candidate, '.edu.cn') || str_ends_with($candidate, '.edu');
        if ($isAcademic) {
            return $candidate;
        }
    }

    return $candidates[0];
}
|
|
|
|
|
|
/**
 * Tell whether the address belongs to a generic mailbox (noreply, admin,
 * webmaster, ...) rather than to a person. Only the local part is checked,
 * case-insensitively, and it must match one of the listed names exactly.
 */
protected function isNoiseEmail(string $email): bool
{
    $genericMailbox = '/^(noreply|no-reply|admin|webmaster|postmaster|root|support|service|info|contact)@/i';

    return preg_match($genericMailbox, $email) === 1;
}
|
|
|
|
|
|
/**
 * Download one page and return its body.
 *
 * The timeout comes from `crawl.faculty.list_http_timeout_seconds`
 * (default 20, never below 5); the connect timeout is capped at 8 seconds.
 *
 * @throws \RuntimeException when the response status is not successful
 */
protected function fetchHtml(string $url): string
{
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $requestHeaders = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    // NOTE(review): retry(1, ...) looks like a single attempt in total — confirm that no real retry is intended.
    $request = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders($requestHeaders);

    $response = $request->get($url);

    if ($response->successful()) {
        return (string) $response->body();
    }

    throw new \RuntimeException('页面请求失败(HTTP '.$response->status().'):'.$url);
}
|
|
|
|
|
|
/**
 * Estimate how many list pages exist, based on the first page's HTML.
 *
 * An explicit `totalpage=N` marker wins. Otherwise the total entry count
 * (the "共 N 条" text) is divided by the number of name cards inside the
 * `<div class="list">` block. Falls back to 1 when neither is available.
 *
 * @return int always at least 1
 */
protected function detectTotalPages(string $html): int
{
    if (preg_match('/totalpage=(\d+)/i', $html, $explicit)) {
        return max(1, (int) $explicit[1]);
    }

    if (! preg_match('/共\s*(\d+)\s*条/u', $html, $total)) {
        return 1;
    }

    if (! preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $list)) {
        return 1;
    }

    // Number of cards on this page = page size.
    $perPage = preg_match_all('/<div\s+class="name">/u', $list[1]) ?: 0;
    if ($perPage <= 0) {
        return 1;
    }

    $pages = (int) ceil(((int) $total[1]) / $perPage);

    return max(1, $pages);
}
|
|
|
|
|
|
/**
 * Build the URL of list page $page from the first page's URL.
 *
 * Sets the `PAGENUM` query parameter and, when the first page's HTML contains
 * a `totalpage=N` marker, also `totalpage`. The URL is rebuilt from scheme,
 * host, optional port, path and query only. A URL without scheme or host is
 * returned unchanged.
 */
protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
{
    $parts = parse_url($baseUrl);
    $isAbsolute = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
    if (! $isAbsolute) {
        return $baseUrl;
    }

    $query = [];
    parse_str((string) ($parts['query'] ?? ''), $query);
    $query['PAGENUM'] = (string) $page;
    if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)) {
        $query['totalpage'] = $totalMatch[1];
    }

    $port = empty($parts['port']) ? '' : ':'.$parts['port'];
    $path = $parts['path'] ?? '/';

    // $query always holds at least PAGENUM at this point, so a query string is always appended.
    return $parts['scheme'].'://'.$parts['host'].$port.$path.'?'.http_build_query($query);
}
|
|
|
|
|
|
/**
 * Rebuild an absolute URL from scheme, host, optional port, path (default
 * "/") and query. Any other URL component is not carried over. A URL without
 * scheme or host is returned unchanged.
 */
protected function normalizeRequestUrl(string $url): string
{
    $parts = parse_url($url);
    $isAbsolute = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
    if (! $isAbsolute) {
        return $url;
    }

    $port = empty($parts['port']) ? '' : ':'.$parts['port'];
    $path = $parts['path'] ?? '/';
    $query = empty($parts['query']) ? '' : '?'.$parts['query'];

    return $parts['scheme'].'://'.$parts['host'].$port.$path.$query;
}
|
|
|
|
|
|
/**
 * Run the list-page extractors in priority order and return the result of
 * the first one that yields any item: e-mail blocks, then the structured
 * faculty list, then the staff-panel list.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
{
    // An empty array is falsy, so ?: falls through to the next extractor only when nothing was found.
    return $this->extractFromEmailBlocks($html, $keywords, $sourceUrl)
        ?: ($this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl)
            ?: $this->extractFromStaffPanelList($html, $keywords, $sourceUrl));
}
|
|
|
|
|
|
/**
 * Decide whether the page loads its teacher list through the
 * `ajax_teacher_list` endpoint.
 *
 * True when the URL mentions `ajax_teacher_list` (any case), when the HTML
 * references `ajax_teacher_list.html`, or when the URL contains "faculty"
 * and its host contains `sais.sjtu.edu.cn`.
 */
protected function isSaisAjaxFacultyPage(string $html, string $sourceUrl): bool
{
    $lowerUrl = strtolower($sourceUrl);

    $mentionsEndpoint = str_contains($lowerUrl, 'ajax_teacher_list')
        || str_contains($html, 'ajax_teacher_list.html');
    if ($mentionsEndpoint) {
        return true;
    }

    $host = strtolower((string) parse_url($sourceUrl, PHP_URL_HOST));
    if (! str_contains($host, 'sais.sjtu.edu.cn')) {
        return false;
    }

    return str_contains($lowerUrl, 'faculty');
}
|
|
|
|
|
|
/**
 * Query the teacher-list AJAX endpoint and turn its HTML fragment into items.
 *
 * The endpoint and category parameters come from parseSaisAjaxConfig(). The
 * keywords are joined with spaces into the `search` field; `type` is '2' for
 * a search and '1' otherwise, and `zm` is 'All' only when there is no search.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto> at most $maxResults items
 *
 * @throws \RuntimeException when the request fails or the payload is not a JSON array
 */
protected function fetchSaisFacultyItems(
    string $requestUrl,
    string $pageHtml,
    array $keywords,
    int $maxResults,
): array {
    $config = $this->parseSaisAjaxConfig($pageHtml, $requestUrl);
    $search = implode(' ', $keywords);
    $isSearch = $search !== '';
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));

    $formFields = [
        'cat_id' => $config['cat_id'],
        'cat_code' => $config['cat_code'],
        'type' => $isSearch ? '2' : '1',
        'zm' => $isSearch ? '' : 'All',
        'search' => $search,
    ];

    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json, text/html',
        ])
        ->asForm()
        ->post($config['api_url'], $formFields);

    if (! $response->successful()) {
        throw new \RuntimeException('SAIS 教师列表接口请求失败(HTTP '.$response->status().')');
    }

    $payload = $response->json();
    if (! is_array($payload)) {
        throw new \RuntimeException('SAIS 教师列表接口返回格式异常');
    }

    $fragment = (string) ($payload['content'] ?? '');
    if ($fragment === '') {
        return [];
    }

    // The page HTML is prepended so that the parser sees it together with the fragment.
    $items = $this->extractFromSaisJsList($pageHtml.$fragment, $keywords, $requestUrl);

    return count($items) > $maxResults
        ? array_slice($items, 0, $maxResults)
        : $items;
}
|
|
|
|
|
|
/**
 * Read the AJAX parameters (`cat_id`, `cat_code`, endpoint URL) from the
 * page's inline script. Each value keeps its built-in default when the page
 * does not declare it in single-quoted form.
 *
 * @return array{cat_id:string,cat_code:string,api_url:string}
 */
protected function parseSaisAjaxConfig(string $html, string $sourceUrl): array
{
    $config = [
        'cat_id' => '18',
        'cat_code' => 'faculty',
        'api_url' => 'https://sais.sjtu.edu.cn/active/ajax_teacher_list.html',
    ];

    if (preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $idMatch)) {
        $config['cat_id'] = $idMatch[1];
    }

    if (preg_match("/cat_code\s*:\s*'([^']+)'/i", $html, $codeMatch)) {
        $config['cat_code'] = $codeMatch[1];
    }

    if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $urlMatch)) {
        $declared = html_entity_decode($urlMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $config['api_url'] = $this->resolveUrl($declared, $sourceUrl) ?? $config['api_url'];
    }

    // A URL that is still root-relative is anchored to the origin of the source page.
    $origin = $this->requestOrigin($sourceUrl);
    if ($origin !== null && str_starts_with($config['api_url'], '/')) {
        $config['api_url'] = $origin.$config['api_url'];
    }

    return $config;
}
|
|
|
|
|
|
/**
 * Build items from `<a class="... name ..." href="...">Name</a>` links whose
 * href contains `/faculty/`.
 *
 * Links are de-duplicated by resolved profile URL (or by name when the URL
 * cannot be resolved). The keyword filter is applied to the name plus the
 * college inferred from the page title. No e-mail is available at this stage.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromSaisJsList(string $html, array $keywords, string $sourceUrl): array
{
    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
    $defaultCollege = $this->inferCollegeFromPageTitle($html);

    if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
        return [];
    }

    $items = [];
    $seen = [];

    foreach ($anchors as $anchor) {
        $attrs = (string) $anchor[1];

        $hasNameClass = (bool) preg_match('/\bclass="[^"]*\bname\b[^"]*"/u', $attrs);
        if (! $hasNameClass || ! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
            continue;
        }

        $name = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! str_contains(strtolower($href), '/faculty/')) {
            continue;
        }

        $profileUrl = $this->resolveUrl($href, $sourceUrl);
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }

        $searchable = trim($name.' '.($defaultCollege ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        // Marked as seen only after the keyword filter has passed.
        $seen[$dedupeKey] = true;

        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $defaultCollege,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
            summary: $defaultCollege ? '单位:'.$defaultCollege : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_sais',
            bio: null,
        );
    }

    return $items;
}
|
|
|
|
|
|
/**
 * Return `scheme://host[:port]` for an absolute URL, or null when the URL
 * has no scheme or no host.
 */
protected function requestOrigin(string $sourceUrl): ?string
{
    $parts = parse_url($sourceUrl);
    $isAbsolute = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
    if (! $isAbsolute) {
        return null;
    }

    $port = empty($parts['port']) ? '' : ':'.$parts['port'];

    return $parts['scheme'].'://'.$parts['host'].$port;
}
|
|
|
|
|
|
/**
 * Build one item per distinct e-mail address found in the page.
 *
 * For every address a window of roughly 400 bytes before and after it is
 * converted to plain text; the keyword filter, the guessed name and the
 * guessed affiliation are all derived from that window. Addresses without a
 * guessable name are dropped.
 *
 * Generic mailboxes (see isNoiseEmail()) are ignored, consistent with
 * extractEmailFromProfileHtml(); otherwise a footer address such as a
 * webmaster contact could produce a bogus entry and keep extractFromHtml()
 * from falling through to the structured-list parsers.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
{
    $items = [];
    $seen = [];

    if (! preg_match_all(
        '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
        $html,
        $emailMatches,
        PREG_OFFSET_CAPTURE
    )) {
        return [];
    }

    foreach ($emailMatches[1] as $match) {
        $email = CrawlAuthorParser::normalizeEmail($match[0]);
        if (! $email || isset($seen[$email]) || $this->isNoiseEmail($email)) {
            continue;
        }

        // PREG_OFFSET_CAPTURE reports byte offsets. mb_strcut() also works on byte
        // offsets but snaps both ends to character boundaries, so the window never
        // starts or ends in the middle of a multi-byte character. A plain substr()
        // could split one, after which the /u regex in htmlToPlain() fails and the
        // whole window collapses to an empty string.
        $pos = (int) $match[1];
        $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
        $plain = $this->htmlToPlain($window);

        if (! $this->matchesKeywords($plain, $keywords)) {
            continue;
        }

        $name = $this->guessName($plain, $email);
        if ($name === '') {
            continue;
        }

        $affiliation = $this->guessAffiliation($plain);
        $seen[$email] = true;

        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($email),
            name: $name,
            profileUrl: $sourceUrl,
            email: $email,
            affiliation: $affiliation,
            universityName: CrawlAuthorParser::universityFromAffiliation($affiliation)
                ?? $this->inferUniversityFromSource($sourceUrl, $html),
            summary: Str::limit($plain, 300),
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html',
            bio: null,
        );
    }

    return $items;
}
|
|
|
|
|
|
/**
 * Parse pages built from `panel-head` sections that contain `a.staff-item`
 * links (e.g. /people/detail_new/{id}), as used by the SJTU School of
 * Materials Science and similar sites.
 *
 * Each panel's `<div class="title">` supplies the department for the links in
 * that panel. When the panel pass yields nothing, every staff link on the
 * page is used with the college inferred from the page title.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array
{
    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
    $defaultCollege = $this->inferCollegeFromPageTitle($html);

    $items = [];
    $seen = [];

    // Shared "append unless null or duplicate" step for both passes.
    $collect = function (?CrawlItemDto $item) use (&$items, &$seen): void {
        if ($item === null || isset($seen[$item->externalId])) {
            return;
        }
        $seen[$item->externalId] = true;
        $items[] = $item;
    };

    $panelChunks = preg_split('#<div\s+class="panel-head">#u', $html) ?: [];
    if (count($panelChunks) > 1) {
        // The first chunk is whatever precedes the first panel head.
        array_shift($panelChunks);

        foreach ($panelChunks as $panelHtml) {
            if (! preg_match('#<div\s+class="title">\s*([^<]+?)\s*</div>#u', $panelHtml, $titleMatch)) {
                continue;
            }

            $department = CrawlAuthorParser::cleanText($titleMatch[1]);
            foreach ($this->extractStaffItemLinks($panelHtml) as $link) {
                $collect($this->makeStaffPanelItem(
                    $link,
                    $department ?: $defaultCollege,
                    $pageUniversity,
                    $keywords,
                    $sourceUrl,
                ));
            }
        }
    }

    if ($items !== []) {
        return $items;
    }

    foreach ($this->extractStaffItemLinks($html) as $link) {
        $collect($this->makeStaffPanelItem(
            $link,
            $defaultCollege,
            $pageUniversity,
            $keywords,
            $sourceUrl,
        ));
    }

    return $items;
}
|
|
|
|
|
|
/**
 * Collect `<a ... staff-item ... href="...">Name</a>` links whose text looks
 * like a person name, de-duplicated by href + name.
 *
 * @return list<array{href:string,name:string}>
 */
protected function extractStaffItemLinks(string $html): array
{
    if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
        return [];
    }

    $unique = [];
    foreach ($anchors as $anchor) {
        $attrs = (string) $anchor[1];

        $isStaffLink = str_contains($attrs, 'staff-item');
        if (! $isStaffLink || ! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
            continue;
        }

        $name = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Keyed by href|name: the first occurrence wins and insertion order is preserved.
        $unique[$href.'|'.$name] ??= ['href' => $href, 'name' => $name];
    }

    return array_values($unique);
}
|
|
|
|
|
|
/**
 * Turn one staff link into an item, or return null when the name plus
 * department does not match the keywords.
 *
 * The external id is derived from the resolved profile URL, or from the name
 * when the URL cannot be resolved.
 *
 * @param array{href:string,name:string} $link
 * @param list<string> $keywords
 */
protected function makeStaffPanelItem(
    array $link,
    ?string $department,
    ?string $pageUniversity,
    array $keywords,
    string $sourceUrl,
): ?CrawlItemDto {
    $name = $link['name'];
    $profileUrl = $this->resolveUrl($link['href'], $sourceUrl);

    $searchable = trim($name.' '.($department ?? ''));
    if (! $this->matchesKeywords($searchable, $keywords)) {
        return null;
    }

    $dedupeKey = $profileUrl ?: ('name:'.md5($name));

    return $this->makeFacultyItem(
        externalKey: 'faculty:'.md5($dedupeKey),
        name: $name,
        profileUrl: $profileUrl,
        email: null,
        affiliation: $department,
        universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
        summary: $department ? '单位:'.$department : null,
        keywords: $keywords,
        academicTitle: null,
        platform: 'faculty_html_smse',
        bio: null,
    );
}
|
|
|
|
|
|
/**
 * Derive a college name from a `<title>` of the form "something - rest".
 *
 * The part after the first dash is used. If it contains a run of 2–40 CJK or
 * Latin letters ending in 学院 / 研究院 / 学部 / 系, that run is returned;
 * otherwise the whole part is returned.
 *
 * @return string|null null when the title has no dash-separated second part
 */
protected function inferCollegeFromPageTitle(string $html): ?string
{
    $titlePattern = '/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u';
    if (! preg_match($titlePattern, $html, $titleMatch)) {
        return null;
    }

    $title = CrawlAuthorParser::cleanText($titleMatch[1]);
    if ($title === null || $title === '') {
        return null;
    }

    $collegePattern = '/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u';

    return preg_match($collegePattern, $title, $collegeMatch)
        ? CrawlAuthorParser::cleanText($collegeMatch[1])
        : $title;
}
|
|
|
|
|
|
/**
 * Fill in missing lead-author metadata from a profile page.
 *
 * - academic title: text of the first `<em>` element, only when none is set;
 * - college/affiliation: the "所属二级机构" labelled field, only when both
 *   are empty.
 *
 * When nothing was filled in, the original item is returned. Otherwise a copy
 * is returned with `extra` and the first parsed author updated accordingly.
 */
protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
{
    $existingLead = $item->extra['lead_author'] ?? null;
    $lead = is_array($existingLead) ? $existingLead : [];
    $changed = false;

    if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $titleMatch)) {
        $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]);
        if ($academicTitle !== null && $academicTitle !== '') {
            $lead['academic_title'] = $academicTitle;
            $changed = true;
        }
    }

    $lacksUnit = empty($lead['college']) && empty($lead['affiliation']);
    if ($lacksUnit) {
        $unit = $this->parseLabeledField($html, '所属二级机构');
        if ($unit !== null && $unit !== '') {
            $lead['affiliation'] = $unit;
            $lead['college'] = $unit;
            $changed = true;
        }
    }

    if (! $changed) {
        return $item;
    }

    $hasTitle = ! empty($lead['academic_title']);
    $hasCollege = ! empty($lead['college']);

    $extra = $item->extra;
    $extra['lead_author'] = $lead;
    if ($hasTitle) {
        $extra['academic_title'] = $lead['academic_title'];
    }
    if ($hasCollege) {
        $extra['college_name'] = $lead['college'];
    }

    // Mirror the values onto the first parsed author, if there is one.
    $authorsParsed = $item->authorsParsed;
    if ($authorsParsed !== []) {
        if ($hasTitle) {
            $authorsParsed[0]['academic_title'] = $lead['academic_title'];
        }
        if ($hasCollege) {
            $authorsParsed[0]['affiliation'] = $lead['college'];
        }
    }

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $extra,
        authorsParsed: $authorsParsed,
    );
}
|
|
|
|
|
|
/**
 * Parse card-style faculty lists (tsites and similar): one `<li>` per person
 * with a `<div class="name">` plus labelled fields.
 *
 * The `<li>` blocks are taken from the `<div class="list"><ul>` container when
 * it exists, otherwise from the whole page. Per card the affiliation
 * ("所在单位", falling back to the page's first `<div class="title">`),
 * academic title ("职称") and bio ("简介") are read. Cards are de-duplicated
 * by profile URL, or by name when no URL is available.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
{
    $items = [];
    $seen = [];

    $collegeName = null;
    if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
        $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
    }

    $listHtml = $html;
    if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
        $listHtml = $listMatch[1];
    }

    if (! preg_match_all('#<li>(.*?)</li>#su', $listHtml, $liBlocks)) {
        return [];
    }

    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

    foreach ($liBlocks[1] as $inner) {
        $inner = (string) $inner;
        if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
            continue;
        }

        // cleanText() is treated as nullable by the sibling extractors; coalesce here
        // as well so a null never reaches looksLikePersonName().
        $name = CrawlAuthorParser::cleanText($nameMatch[1]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = '';
        if (preg_match('/<a\s+[^>]*href="([^"]*)"/u', $inner, $hrefMatch)) {
            // Attribute values are entity-encoded in HTML (e.g. "&amp;"); decode them as
            // the other extractors do before the URL is resolved and used as dedupe key.
            $href = html_entity_decode((string) $hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        }

        $profileUrl = $this->resolveUrl($href, $sourceUrl)
            ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);
        $plain = $this->htmlToPlain($inner);

        if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
            continue;
        }

        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }
        $seen[$dedupeKey] = true;

        $affiliation = $this->parseLabeledField($inner, '所在单位')
            ?? $collegeName;
        $academicTitle = $this->parseLabeledField($inner, '职称');
        // On list pages the "所在单位" field is usually the college, so the university
        // name is inferred from the site / page header first.
        $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);

        $bio = $this->parseLabeledField($inner, '简介');
        $summaryParts = array_filter([
            $academicTitle ? '职称:'.$academicTitle : null,
            $affiliation ? '单位:'.$affiliation : null,
            $bio,
        ]);

        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $universityName,
            summary: Str::limit(implode(';', $summaryParts), 300),
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_tsites',
            bio: $bio,
        );
    }

    return $items;
}
|
|
|
|
|
|
/**
 * Assemble the CrawlItemDto for one faculty member.
 *
 * The person's name is used as both title and authors string, the profile URL
 * as canonical URL and the university as school name. The affiliation is
 * stored as both `affiliation` and `college`. The same person data is written
 * to `extra['lead_author']` and to the single `authorsParsed` entry.
 *
 * @param list<string> $keywords
 */
protected function makeFacultyItem(
    string $externalKey,
    string $name,
    ?string $profileUrl,
    ?string $email,
    ?string $affiliation,
    ?string $universityName,
    ?string $summary,
    array $keywords,
    ?string $academicTitle,
    string $platform,
    ?string $bio = null,
): CrawlItemDto {
    $leadAuthor = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $affiliation,
        'college' => $affiliation,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
        'bio' => $bio,
        'profile_url' => $profileUrl,
    ];

    $parsedAuthor = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $affiliation,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
    ];

    $extra = [
        'platform' => $platform,
        'academic_title' => $academicTitle,
        'college_name' => $affiliation,
        'bio' => $bio,
        'profile_url' => $profileUrl,
        'lead_author' => $leadAuthor,
        'keyword' => implode(' ', $keywords),
    ];

    return new CrawlItemDto(
        externalId: $externalKey,
        title: $name,
        canonicalUrl: $profileUrl,
        authors: $name,
        summary: $summary,
        schoolName: $universityName,
        extra: $extra,
        authorsParsed: [$parsedAuthor],
    );
}
|
|
|
|
|
|
/**
 * Check whether a text mentions at least one of the search keywords.
 *
 * An empty keyword list means "no filter" and always matches. Empty keyword
 * strings are ignored. Matching is case-insensitive for ASCII and, through
 * the multibyte fallback, for other scripts that have letter case (accented
 * Latin, Cyrillic, Greek, ...).
 *
 * @param list<string> $keywords
 *
 * @return bool True when the list is empty or any non-empty keyword occurs in $plain.
 */
protected function matchesKeywords(string $plain, array $keywords): bool
{
    if ($keywords === []) {
        return true;
    }

    foreach ($keywords as $kw) {
        if ($kw === '') {
            continue;
        }

        // stripos() folds ASCII case only, so "müller" would never match
        // "MÜLLER". The byte-wise check stays first so every text that
        // matched before still matches; mb_stripos() adds Unicode folding.
        if (stripos($plain, $kw) !== false || mb_stripos($plain, $kw, 0, 'UTF-8') !== false) {
            return true;
        }
    }

    return false;
}
|
|
|
|
|
|
/**
 * Reduce an HTML fragment to single-spaced plain text.
 *
 * Tags are stripped first, HTML entities are then decoded as UTF-8, and
 * finally every run of whitespace is collapsed to one space. No trimming is
 * performed. When the whitespace replacement fails (preg_replace() returns
 * null, e.g. for input that is not valid UTF-8) an empty string is returned.
 */
protected function htmlToPlain(string $html): string
{
    $withoutTags = strip_tags($html);
    $decoded = html_entity_decode($withoutTags, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $collapsed = preg_replace('/\s+/u', ' ', $decoded);

    return $collapsed === null ? '' : $collapsed;
}
|
|
|
|
|
|
/**
 * Extract the value that follows a "<label>:" marker inside an HTML fragment.
 *
 * The label may be followed by a full-width colon (U+FF1A) or an ASCII
 * colon; the value runs from there up to the next "<". The captured text is
 * passed through CrawlAuthorParser::cleanText().
 *
 * @param string $html  HTML fragment to search (tags are not stripped first).
 * @param string $label Literal label text; it is regex-quoted before use.
 *
 * @return string|null The cleaned value, or null when the label is absent or
 *                     its value is blank.
 */
protected function parseLabeledField(string $html, string $label): ?string
{
    // The full-width colon is spelled as an escape and placed first. A class
    // written with two ASCII colons ("[::]") is read by PCRE as a malformed
    // POSIX class at top level and makes the whole pattern fail to compile.
    $pattern = '/'.preg_quote($label, '/').'[\x{FF1A}:]\s*([^<]+)/u';
    if (! preg_match($pattern, $html, $match)) {
        return null;
    }

    $value = CrawlAuthorParser::cleanText($match[1]);

    // A whitespace-only capture presumably cleans down to '' (cleanText() is
    // not visible here); report it as missing so callers' `??` fallbacks apply.
    return $value === '' ? null : $value;
}
|
|
|
|
|
|
/**
 * Heuristically decide whether a string is a person's name rather than page
 * navigation text.
 *
 * Strings that start with a common navigation label (home, login, contact,
 * next page, last page, go to) are rejected. Otherwise the string is accepted
 * when it is either 2-4 CJK ideographs, optionally followed by a middle dot
 * and 2-4 more ideographs, or a Latin letter followed by 1-30 letters,
 * whitespace characters, dots or hyphens.
 */
protected function looksLikePersonName(string $name): bool
{
    $navigationLabel = '/^(首页|登录|联系我们|下页|尾页|转到)/u';
    if (preg_match($navigationLabel, $name) === 1) {
        return false;
    }

    $chineseName = '/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u';
    if (preg_match($chineseName, $name) === 1) {
        return true;
    }

    $latinName = '/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/';

    return preg_match($latinName, $name) === 1;
}
|
|
|
|
|
|
/**
 * Resolve an href taken from a page into an absolute URL.
 *
 * Handles absolute http(s) URLs (returned unchanged), protocol-relative
 * ("//host/..."), root-relative ("/..."), query-only ("?...") and
 * document-relative references. "." and ".." path segments are resolved.
 *
 * @param string $href    Raw attribute value; HTML entities are decoded and
 *                        surrounding whitespace is trimmed.
 * @param string $baseUrl URL of the page the href was found on.
 *
 * @return string|null The resolved URL; null when the href is empty, is a
 *                     same-page anchor, or uses a scheme that does not lead
 *                     to a page (javascript:, mailto:, tel:, data:). When
 *                     $baseUrl has no scheme or host, a relative href is
 *                     returned unchanged.
 */
protected function resolveUrl(string $href, string $baseUrl): ?string
{
    $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));

    // Placeholder links must not become URLs: a card list full of href="#"
    // would otherwise resolve every card to the same address.
    if ($href === '' || $href[0] === '#' || preg_match('#^(?:javascript|mailto|tel|data):#i', $href)) {
        return null;
    }

    if (preg_match('#^https?://#i', $href)) {
        return $href;
    }

    $base = parse_url($baseUrl);
    if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
        return $href;
    }

    if (str_starts_with($href, '//')) {
        return $base['scheme'].':'.$href;
    }

    $origin = $base['scheme'].'://'.$base['host'];
    if (! empty($base['port'])) {
        $origin .= ':'.$base['port'];
    }

    $basePath = $base['path'] ?? '/';

    // A query-only reference keeps the base document ("list.php" + "?id=5").
    if ($href[0] === '?') {
        return $origin.$basePath.$href;
    }

    // Only the path part takes part in resolution; query/fragment ride along.
    $pathLength = strcspn($href, '?#');
    $path = substr($href, 0, $pathLength);
    $suffix = substr($href, $pathLength);

    if ($path[0] !== '/') {
        $dir = str_contains($basePath, '/')
            ? substr($basePath, 0, (int) strrpos($basePath, '/') + 1)
            : '/';
        $path = $dir.$path;
    }

    // Resolve "." and ".." segments; paths without them are left untouched.
    if (str_contains($path, '/.')) {
        $segments = explode('/', substr($path, 1));
        $lastIndex = count($segments) - 1;
        $resolved = [];

        foreach ($segments as $index => $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $resolved[] = $segment;

                continue;
            }

            if ($segment === '..') {
                array_pop($resolved);
            }

            // A trailing dot segment names a directory: keep the final slash.
            if ($index === $lastIndex) {
                $resolved[] = '';
            }
        }

        $path = '/'.implode('/', $resolved);
    }

    return $origin.$path.$suffix;
}
|
|
|
|
|
|
/**
 * Look up a person's profile URL in the page's inline `addimg(...)` calls.
 *
 * Searches $html for an `addimg(` call whose first argument is any quoted
 * string, whose second argument is a double-quoted path that starts with "/"
 * and ends in "index.htm", and whose third argument is exactly the given
 * name in double quotes. The captured path is resolved against $sourceUrl.
 *
 * @return string|null The resolved profile URL, or null when no call matches.
 */
protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
{
    $quotedName = preg_quote($name, '/');
    $pattern = '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$quotedName.'"/u';

    $found = preg_match($pattern, $html, $captures);

    return $found
        ? $this->resolveUrl($captures[1], $sourceUrl)
        : null;
}
|
|
|
|
|
|
/**
 * Infer the university name for a page from its URL, falling back to its text.
 *
 * The lower-cased host of $sourceUrl is checked, in order, for a handful of
 * known domains; the first one contained in the host wins. If none applies,
 * the first "...大学" phrase (2-20 CJK ideographs or Latin letters followed by
 * "大学") in the page's plain text is returned after cleaning.
 *
 * @return string|null The university name, or null when neither source yields one.
 */
protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
{
    // Checked in declaration order; matching is by substring of the host.
    $knownDomains = [
        'sjtu.edu.cn' => '上海交通大学',
        'tsinghua.edu.cn' => '清华大学',
        'pku.edu.cn' => '北京大学',
        'zju.edu.cn' => '浙江大学',
        'fudan.edu.cn' => '复旦大学',
    ];

    $host = parse_url($sourceUrl, PHP_URL_HOST);
    if (is_string($host)) {
        $host = strtolower($host);
        foreach ($knownDomains as $domain => $university) {
            if (str_contains($host, $domain)) {
                return $university;
            }
        }
    }

    $found = preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match);

    return $found
        ? CrawlAuthorParser::cleanText($match[1])
        : null;
}
|
|
|
|
|
|
/**
 * Guess a person's display name from the text around an e-mail address.
 *
 * The first run of 2-4 CJK ideographs in $plain is taken as the name. When
 * an academic title is attached to that run, the title is left out of the
 * name ("张三教授" gives "张三"). If the text contains no such run, the name
 * is derived from the e-mail's local part: ".", "_" and "-" become spaces
 * and the result is title-cased.
 *
 * @param string $plain Plain text near the e-mail address.
 * @param string $email E-mail address used for the fallback.
 *
 * @return string The guessed name; may be empty when neither source yields one.
 */
protected function guessName(string $plain, string $email): string
{
    // A greedy {2,4} capture would swallow an attached title ("张三教授", or
    // "张三副教" out of "张三副教授"). At each start position the shortest run
    // directly followed by a title is tried first; only if that fails is the
    // plain greedy run used. The match position is therefore the same as
    // with the plain run alone. (?|...) keeps the name in group 1 either way.
    $pattern = '/(?|([\x{4e00}-\x{9fff}]{2,4}?)\s*(?:副教授|教授|讲师|研究员|博士|老师)'
        .'|([\x{4e00}-\x{9fff}]{2,4}))/u';

    if (preg_match($pattern, $plain, $m)) {
        return $m[1];
    }

    $local = strstr($email, '@', true) ?: '';
    $local = str_replace(['.', '_', '-'], ' ', $local);

    return Str::title(trim($local));
}
|
|
|
|
|
|
/**
 * Guess an institution name from free text.
 *
 * Looks for the first stretch of 2-40 CJK ideographs, Latin letters or
 * whitespace characters that ends in an institution keyword (大学, 学院,
 * 研究院, 研究所, "University" or "College") and returns it after cleaning.
 *
 * @return string|null The cleaned institution name, or null when none is found.
 */
protected function guessAffiliation(string $plain): ?string
{
    $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u';

    $found = preg_match($pattern, $plain, $hit);
    if (! $found) {
        return null;
    }

    return CrawlAuthorParser::cleanText($hit[1]);
}
|
|
|
}
|