You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

220 lines
7.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Support\Facades\Http;
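/**
* Adapter for the research-center page (a Vue SPA) of the Shanghai Jiao Tong University Artificial Intelligence Institute.
* Data comes from the list endpoint GET /api/researchCenter; each center's "研究团队" (research team) tab corresponds to the `teams` field.
*/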
class AiSjtuResearchCenterAdapter implements CrawlerAdapterInterface
{
protected const API_BASE = 'https://ai.sjtu.edu.cn/api';
protected const LIST_PATH = '/researchCenter';
protected const UNIVERSITY_NAME = '上海交通大学';
/**
 * Fetch research-team members from the research-center API and map them to crawl items.
 *
 * The request always targets API_BASE.LIST_PATH; $requestUrl is only forwarded to
 * memberToItem(), and $source is not read here.
 * Recognised $params keys: 'keyword' (parsed by CrawlKeywordParser::parse) and
 * 'max_results' (clamped to 1..500, default 200).
 *
 * Items are de-duplicated by externalId, and collection stops as soon as
 * max_results items have been gathered.
 *
 * @param array<string, mixed> $params
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException when the API answers with a non-2xx status, a non-array
 *                           JSON body, or a non-array `researchCenters` value
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
    $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 200)));

    $response = Http::timeout(30)
        ->connectTimeout(10)
        // retry()'s first argument is the TOTAL number of attempts, so the previous
        // value of 1 never retried. Two attempts = one retry after 300 ms, and only
        // for connection-level failures. throw: false lets HTTP error responses
        // reach the explicit status check below instead of raising here.
        ->retry(
            2,
            300,
            static fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException,
            throw: false,
        )
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json',
        ])
        // Ask for everything in a single page.
        ->get(self::API_BASE.self::LIST_PATH, [
            'page' => 1,
            'limit' => 99999,
        ]);

    if (! $response->successful()) {
        throw new \RuntimeException('无法访问人工智能研究院研究中心 API,HTTP '.$response->status());
    }

    $json = $response->json();
    if (! is_array($json)) {
        throw new \RuntimeException('研究中心 API 返回格式异常');
    }

    // NOTE: an absent `researchCenters` key is treated as "no centers" and yields an
    // empty result; only a present-but-non-array value is reported as an error.
    $centers = $json['researchCenters'] ?? [];
    if (! is_array($centers)) {
        throw new \RuntimeException('研究中心 API 缺少 researchCenters 字段');
    }

    $items = [];
    $seen = []; // externalId => true, used to drop duplicates

    foreach ($centers as $center) {
        if (! is_array($center)) {
            continue;
        }

        // A center without a positive id and a non-empty name is skipped.
        $centerId = (int) ($center['id'] ?? 0);
        $centerName = trim((string) ($center['name'] ?? ''));
        if ($centerId <= 0 || $centerName === '') {
            continue;
        }

        $teams = $center['teams'] ?? [];
        if (! is_array($teams)) {
            continue;
        }

        foreach ($teams as $member) {
            if (! is_array($member)) {
                continue;
            }

            $item = $this->memberToItem($member, $centerId, $centerName, $keywords, $requestUrl);
            if ($item === null || isset($seen[$item->externalId])) {
                continue;
            }

            $seen[$item->externalId] = true;
            $items[] = $item;

            if (count($items) >= $maxResults) {
                return $items;
            }
        }
    }

    return $items;
}
/**
 * Convert one raw `teams` entry of a research center into a crawl item.
 *
 * Returns null when the entry has no name that passes looksLikePersonName(), or when
 * the member's combined text (name, center, title, direction, email, phone) does not
 * pass matchesKeywords().
 *
 * @param array<string, mixed> $member raw member record; the keys read are
 *                                     name, email, phone, title, direction and id
 * @param list<string> $keywords
 * @param string $requestUrl not used by this method; kept so the signature stays stable
 */
protected function memberToItem(
    array $member,
    int $centerId,
    string $centerName,
    array $keywords,
    string $requestUrl,
): ?CrawlItemDto {
    $name = trim((string) ($member['name'] ?? ''));
    if ($name === '' || ! $this->looksLikePersonName($name)) {
        return null;
    }

    $email = CrawlAuthorParser::normalizeEmail(trim((string) ($member['email'] ?? '')));
    $phone = $this->normalizePhone((string) ($member['phone'] ?? ''));
    $title = trim((string) ($member['title'] ?? ''));
    $direction = trim((string) ($member['direction'] ?? ''));

    // Use the API id when it is usable. An empty, whitespace-only or non-scalar id
    // would give every such member of a center the same externalId, and the caller's
    // de-duplication would then silently drop all but the first one, so fall back to
    // a hash of name + email in that case (same fallback as for a missing id).
    $rawId = $member['id'] ?? null;
    $memberKey = is_scalar($rawId) ? (string) $rawId : '';
    if (trim($memberKey) === '') {
        $memberKey = md5($name.$email);
    }

    $plain = implode(' ', array_filter([$name, $centerName, $title, $direction, $email, $phone]));
    if (! $this->matchesKeywords($plain, $keywords)) {
        return null;
    }

    // Members have no individual page here; the link points at their center.
    $profileUrl = 'https://ai.sjtu.edu.cn/center?centerId='.$centerId;
    $externalId = 'ai_sjtu_center_'.$centerId.'_team_'.$memberKey;
    $researchDirectionNames = $this->parseResearchDirectionNames($direction);

    // array_filter() removes the null placeholders of empty fields.
    $summaryParts = array_filter([
        $title !== '' ? '职称:'.$title : null,
        $phone !== '' ? '电话:'.$phone : null,
        $direction !== '' ? '研究方向:'.$direction : null,
        '所属中心:'.$centerName,
    ]);

    $lead = [
        'name' => $name,
        'email' => $email,
        'phone' => $phone !== '' ? $phone : null,
        'affiliation' => $centerName,
        'college' => $centerName,
        'university_name' => self::UNIVERSITY_NAME,
        'academic_title' => $title !== '' ? $title : null,
        'research_direction_names' => $researchDirectionNames,
    ];

    return new CrawlItemDto(
        externalId: $externalId,
        title: $name,
        canonicalUrl: $profileUrl,
        authors: $name,
        // Join with a separator so the labelled parts do not run into each other.
        summary: implode(';', $summaryParts),
        schoolName: self::UNIVERSITY_NAME,
        section: $centerName,
        extra: [
            'platform' => 'ai_sjtu_research_center',
            'academic_title' => $title !== '' ? $title : null,
            'college_name' => $centerName,
            'profile_url' => $profileUrl,
            'phone' => $phone !== '' ? $phone : null,
            'research_direction_names' => $researchDirectionNames,
            'lead_author' => $lead,
        ],
        authorsParsed: [[
            'name' => $name,
            'email' => $email,
            'affiliation' => $centerName,
            'university_name' => self::UNIVERSITY_NAME,
            'academic_title' => $title !== '' ? $title : null,
        ]],
    );
}
/**
 * Decide whether a member's flattened text passes the keyword filter.
 *
 * An empty keyword list accepts everything. Otherwise a single case-insensitive
 * substring hit on any non-empty keyword is enough; empty keywords never match.
 *
 * @param list<string> $keywords
 */
protected function matchesKeywords(string $plain, array $keywords): bool
{
    if (count($keywords) === 0) {
        return true;
    }

    // Blank entries are ignored up front so they can never produce a hit.
    $needles = array_filter($keywords, static fn ($needle) => $needle !== '');

    foreach ($needles as $needle) {
        if (mb_stripos($plain, $needle) === false) {
            continue;
        }

        return true;
    }

    return false;
}
/**
 * Split a free-text research-direction string into a de-duplicated list of names.
 *
 * Recognised separators (one or more in a row): the ideographic comma "、", ASCII
 * "," ";" "/", and the full-width comma (U+FF0C) and semicolon (U+FF1B). The
 * full-width forms are written as \x{...} escapes so the pattern contains no
 * look-alike characters. Each part is trimmed; empty parts are discarded and the
 * first occurrence of a duplicate wins.
 *
 * @return list<string>
 */
protected function parseResearchDirectionNames(string $direction): array
{
    $direction = trim($direction);
    if ($direction === '') {
        return [];
    }

    // preg_split() returns false on failure (e.g. invalid UTF-8); treat that as "no parts".
    $parts = preg_split('/[、,;\/\x{FF0C}\x{FF1B}]+/u', $direction) ?: [];

    $trimmed = array_map(static fn (string $part): string => trim($part), $parts);

    // Compare against '' explicitly: a bare array_filter() would also drop a literal "0".
    $nonEmpty = array_filter($trimmed, static fn (string $part): bool => $part !== '');

    return array_values(array_unique($nonEmpty));
}
/**
 * Tidy a raw phone string: every run of whitespace becomes one space and the
 * result is trimmed. If the regex replacement fails (preg_replace() returns
 * null), an empty string is returned.
 */
protected function normalizePhone(string $phone): string
{
    $collapsed = preg_replace('/\s+/u', ' ', $phone);

    if ($collapsed === null) {
        return '';
    }

    return trim($collapsed);
}
/**
 * Heuristic check that a string has the shape of a person's name.
 *
 * Accepted shapes:
 *  - 2-4 characters in the range U+4E00..U+9FFF, optionally followed by a middle
 *    dot and another 2-4 such characters;
 *  - an ASCII letter followed by 1-40 ASCII letters, whitespace, dots or hyphens.
 */
protected function looksLikePersonName(string $name): bool
{
    $hanPattern = '/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u';
    if (preg_match($hanPattern, $name) === 1) {
        return true;
    }

    $latinPattern = '/^[A-Za-z][A-Za-z\s\.\-]{1,40}$/';

    return preg_match($latinPattern, $name) === 1;
}
}