|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
|
|
use App\Models\CrawlSource;
|
|
|
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
|
|
|
use App\Services\Crawl\CrawlAuthorParser;
|
|
|
use App\Services\Crawl\CrawlItemDto;
|
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
|
use Illuminate\Support\Facades\Http;
|
|
|
|
|
|
/**
 * Adapter for the research-center pages of the Shanghai Jiao Tong University
 * Artificial Intelligence Institute (the site itself is a Vue SPA).
 *
 * GET /api/researchCenter returns the list of centers; the "research team" tab
 * of each center corresponds to that center's `teams` field.
 */
class AiSjtuResearchCenterAdapter implements CrawlerAdapterInterface
{
    protected const API_BASE = 'https://ai.sjtu.edu.cn/api';

    protected const LIST_PATH = '/researchCenter';

    protected const UNIVERSITY_NAME = '上海交通大学';

    /**
     * Fetch every research-center team member and convert each one to a crawl item.
     *
     * The endpoint is built from the class constants; `$requestUrl` is only
     * forwarded to memberToItem() and `$source` is not used by this adapter.
     *
     * Recognised `$params` keys:
     *  - `keyword`:     raw keyword string handed to CrawlKeywordParser::parse().
     *  - `max_results`: upper bound on returned items, clamped to 1..500 (default 200).
     *
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the API answers with a non-2xx status or an unexpected payload
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse($this->scalarString($params, 'keyword'));
        $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 200)));

        $items = [];
        $seen = [];

        foreach ($this->fetchCenters() as $center) {
            if (! is_array($center)) {
                continue;
            }

            // A non-scalar id (e.g. a nested array) must not be coerced to 1 by the int cast.
            $rawCenterId = $center['id'] ?? 0;
            $centerId = is_scalar($rawCenterId) ? (int) $rawCenterId : 0;
            $centerName = trim($this->scalarString($center, 'name'));
            if ($centerId <= 0 || $centerName === '') {
                continue;
            }

            $teams = $center['teams'] ?? [];
            if (! is_array($teams)) {
                continue;
            }

            foreach ($teams as $member) {
                if (! is_array($member)) {
                    continue;
                }

                $item = $this->memberToItem($member, $centerId, $centerName, $keywords, $requestUrl);
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                // De-duplicate on the external id (which embeds the center id).
                $seen[$item->externalId] = true;
                $items[] = $item;

                if (count($items) >= $maxResults) {
                    return $items;
                }
            }
        }

        return $items;
    }

    /**
     * Call the research-center list endpoint and return its `researchCenters` array.
     *
     * A missing `researchCenters` key is treated as an empty list; only a
     * present-but-non-array value is rejected.
     *
     * @return array<int|string, mixed>
     *
     * @throws \RuntimeException on a non-2xx response or a malformed body
     */
    private function fetchCenters(): array
    {
        $response = Http::timeout(30)
            ->connectTimeout(10)
            // retry() takes the TOTAL number of attempts, so 2 means "one retry".
            // The callback restricts retries to connection failures; `throw: false`
            // lets HTTP error responses fall through to the status check below.
            ->retry(
                2,
                300,
                static fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException,
                throw: false,
            )
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0',
                'Accept' => 'application/json',
            ])
            ->get(self::API_BASE.self::LIST_PATH, [
                'page' => 1,
                // Oversized page size so the whole list arrives in one request.
                'limit' => 99999,
            ]);

        if (! $response->successful()) {
            throw new \RuntimeException('无法访问人工智能研究院研究中心 API:HTTP '.$response->status());
        }

        $json = $response->json();
        if (! is_array($json)) {
            throw new \RuntimeException('研究中心 API 返回格式异常');
        }

        $centers = $json['researchCenters'] ?? [];
        if (! is_array($centers)) {
            throw new \RuntimeException('研究中心 API 缺少 researchCenters 字段');
        }

        return $centers;
    }

    /**
     * Convert one entry of a center's `teams` array into a crawl item.
     *
     * Returns null when the member has no name, the name does not look like a
     * person name, or none of the keywords match the member's text fields.
     * `$requestUrl` is accepted for signature compatibility but not used here.
     *
     * @param  array<string, mixed>  $member
     * @param  list<string>  $keywords
     */
    protected function memberToItem(
        array $member,
        int $centerId,
        string $centerName,
        array $keywords,
        string $requestUrl,
    ): ?CrawlItemDto {
        $name = trim($this->scalarString($member, 'name'));
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            return null;
        }

        $email = CrawlAuthorParser::normalizeEmail(trim($this->scalarString($member, 'email')));
        $phone = $this->normalizePhone($this->scalarString($member, 'phone'));
        $title = trim($this->scalarString($member, 'title'));
        $direction = trim($this->scalarString($member, 'direction'));

        // Prefer the API's own id. A missing, non-scalar or blank id falls back to a
        // hash of name + email; otherwise every id-less member of a center would share
        // one external id and be dropped by the caller's de-duplication.
        $memberKey = $this->scalarString($member, 'id');
        if (trim($memberKey) === '') {
            $memberKey = md5($name.$email);
        }

        $plain = implode(' ', array_filter([$name, $centerName, $title, $direction, $email, $phone]));
        if (! $this->matchesKeywords($plain, $keywords)) {
            return null;
        }

        $profileUrl = 'https://ai.sjtu.edu.cn/center?centerId='.$centerId;
        $externalId = 'ai_sjtu_center_'.$centerId.'_team_'.$memberKey;
        $researchDirectionNames = $this->parseResearchDirectionNames($direction);

        $academicTitle = $title !== '' ? $title : null;
        $phoneOrNull = $phone !== '' ? $phone : null;

        // Empty optional parts are dropped; the owning center is always present.
        $summaryParts = array_filter([
            $title !== '' ? '职称:'.$title : null,
            $phone !== '' ? '电话:'.$phone : null,
            $direction !== '' ? '研究方向:'.$direction : null,
            '所属中心:'.$centerName,
        ]);

        $lead = [
            'name' => $name,
            'email' => $email,
            'phone' => $phoneOrNull,
            'affiliation' => $centerName,
            'college' => $centerName,
            'university_name' => self::UNIVERSITY_NAME,
            'academic_title' => $academicTitle,
            'research_direction_names' => $researchDirectionNames,
        ];

        return new CrawlItemDto(
            externalId: $externalId,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: implode(';', $summaryParts),
            schoolName: self::UNIVERSITY_NAME,
            section: $centerName,
            extra: [
                'platform' => 'ai_sjtu_research_center',
                'academic_title' => $academicTitle,
                'college_name' => $centerName,
                'profile_url' => $profileUrl,
                'phone' => $phoneOrNull,
                'research_direction_names' => $researchDirectionNames,
                'lead_author' => $lead,
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $centerName,
                'university_name' => self::UNIVERSITY_NAME,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Tell whether the text contains at least one keyword (case-insensitive).
     *
     * An empty keyword list matches everything; empty keyword strings are ignored.
     *
     * @param  list<string>  $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $keyword) {
            if ($keyword !== '' && mb_stripos($plain, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Split a free-text research-direction string into distinct, trimmed names.
     *
     * Separators are the ideographic comma, ASCII/full-width commas and
     * semicolons, and the slash. If the split itself fails, an empty list is returned.
     *
     * @return list<string>
     */
    protected function parseResearchDirectionNames(string $direction): array
    {
        $direction = trim($direction);
        if ($direction === '') {
            return [];
        }

        $parts = preg_split('/[、,,;;\/]+/u', $direction) ?: [];

        $trimmed = array_map(static fn (string $part): string => trim($part), $parts);

        // array_filter drops empty fragments; array_values re-indexes into a list.
        return array_values(array_unique(array_filter($trimmed)));
    }

    /**
     * Collapse every whitespace run in a phone string to a single space and trim it.
     *
     * Yields an empty string when the regex replacement fails.
     */
    protected function normalizePhone(string $phone): string
    {
        return trim(preg_replace('/\s+/u', ' ', $phone) ?? '');
    }

    /**
     * Heuristic check that a string is a person name.
     *
     * Accepts either 2-4 CJK ideographs, optionally followed by "·" and another
     * 2-4 ideographs, or a Latin name that starts with a letter and continues
     * with 1-40 letters, whitespace, dots or hyphens.
     */
    protected function looksLikePersonName(string $name): bool
    {
        return (bool) preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)
            || (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,40}$/', $name);
    }

    /**
     * Read a field from a decoded JSON row as a string.
     *
     * Missing, null and non-scalar values (arrays, objects) yield '' so that a
     * plain string cast can never turn them into the literal text "Array".
     *
     * @param  array<string, mixed>  $row
     */
    private function scalarString(array $row, string $key): string
    {
        $value = $row[$key] ?? null;

        return is_scalar($value) ? (string) $value : '';
    }
}
|