交大智能研究院

master
lion 2 days ago
parent f472eec19d
commit fc3e050c76

@ -0,0 +1,219 @@
<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Support\Facades\Http;
/**
 * Crawler adapter for the research-center page of the SJTU Artificial
 * Intelligence Institute, which is rendered client-side as a Vue SPA.
 *
 * Instead of scraping HTML, the adapter reads the JSON endpoint
 * GET /api/researchCenter and turns every entry of each center's `teams`
 * field (the data behind the "research team" tab) into one crawl item.
 */
class AiSjtuResearchCenterAdapter implements CrawlerAdapterInterface
{
    protected const API_BASE = 'https://ai.sjtu.edu.cn/api';

    protected const LIST_PATH = '/researchCenter';

    protected const UNIVERSITY_NAME = '上海交通大学';

    /**
     * Total number of HTTP attempts (first try + retries).
     *
     * Laravel's retry() counts attempts, not retries, so 2 means
     * "retry once after a connection failure".
     */
    protected const HTTP_ATTEMPTS = 2;

    /**
     * Fetch team members of all research centers as crawl items.
     *
     * The request always goes to API_BASE.LIST_PATH; $requestUrl is only
     * forwarded to memberToItem() and $source is not read here.
     *
     * @param  array  $params  Reads `keyword` (passed to CrawlKeywordParser::parse)
     *                         and `max_results` (default 200, clamped to 1..500).
     * @return array List of CrawlItemDto, de-duplicated by externalId and
     *               truncated to `max_results`.
     *
     * @throws \RuntimeException When the API answers with a non-successful
     *                           status or an unexpected payload shape.
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 200)));

        $items = [];
        $seen = [];

        foreach ($this->fetchCenters() as $center) {
            if (! is_array($center)) {
                continue;
            }

            $centerId = (int) ($center['id'] ?? 0);
            $centerName = trim((string) ($center['name'] ?? ''));
            if ($centerId <= 0 || $centerName === '') {
                continue;
            }

            $teams = $center['teams'] ?? [];
            if (! is_array($teams)) {
                continue;
            }

            foreach ($teams as $member) {
                if (! is_array($member)) {
                    continue;
                }

                $item = $this->memberToItem($member, $centerId, $centerName, $keywords, $requestUrl);
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $items[] = $item;

                if (count($items) >= $maxResults) {
                    return $items;
                }
            }
        }

        return $items;
    }

    /**
     * Call the research-center list endpoint and return its `researchCenters` array.
     *
     * @return array Raw center records as decoded from JSON (entries are not validated here).
     *
     * @throws \RuntimeException On a non-successful HTTP status, a non-array
     *                           JSON body, or a non-array `researchCenters` field.
     */
    protected function fetchCenters(): array
    {
        $response = Http::timeout(30)
            ->connectTimeout(10)
            // Only connection-level failures are retried; throw: false keeps
            // HTTP error statuses as a response so they are reported below.
            ->retry(
                self::HTTP_ATTEMPTS,
                300,
                fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException,
                throw: false,
            )
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0',
                'Accept' => 'application/json',
            ])
            ->get(self::API_BASE.self::LIST_PATH, [
                'page' => 1,
                // Ask for everything in a single page.
                'limit' => 99999,
            ]);

        if (! $response->successful()) {
            throw new \RuntimeException('无法访问人工智能研究院研究中心 API(HTTP '.$response->status().')');
        }

        $json = $response->json();
        if (! is_array($json)) {
            throw new \RuntimeException('研究中心 API 返回格式异常');
        }

        $centers = $json['researchCenters'] ?? [];
        if (! is_array($centers)) {
            throw new \RuntimeException('研究中心 API 缺少 researchCenters 字段');
        }

        return $centers;
    }

    /**
     * Convert one `teams` entry into a crawl item.
     *
     * Returns null when the name is empty or does not look like a person
     * name, or when keywords are given and none of them occurs in the
     * member's name/center/title/direction/email/phone text.
     *
     * @param  array  $member  Raw member record; reads `id`, `name`, `email`,
     *                         `phone`, `title`, `direction`.
     * @param  list<string>  $keywords
     * @param  string  $requestUrl  Currently unused; kept for signature compatibility.
     */
    protected function memberToItem(
        array $member,
        int $centerId,
        string $centerName,
        array $keywords,
        string $requestUrl,
    ): ?CrawlItemDto {
        $name = trim((string) ($member['name'] ?? ''));
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            return null;
        }

        $email = CrawlAuthorParser::normalizeEmail(trim((string) ($member['email'] ?? '')));
        $phone = $this->normalizePhone((string) ($member['phone'] ?? ''));
        $title = trim((string) ($member['title'] ?? ''));
        $direction = trim((string) ($member['direction'] ?? ''));

        $plain = implode(' ', array_filter([$name, $centerName, $title, $direction, $email, $phone]));
        if (! $this->matchesKeywords($plain, $keywords)) {
            return null;
        }

        // A missing, empty, zero or non-scalar id cannot identify a member:
        // using it verbatim would give every such member of a center the same
        // externalId and the de-duplication in fetch() would drop all but the
        // first. Fall back to a hash of name + email in that case.
        $rawId = $member['id'] ?? null;
        $hasUsableId = is_scalar($rawId)
            && ! is_bool($rawId)
            && trim((string) $rawId) !== ''
            && (string) $rawId !== '0';
        $memberKey = $hasUsableId ? (string) $rawId : md5($name.$email);

        $profileUrl = 'https://ai.sjtu.edu.cn/center?centerId='.$centerId;
        $externalId = 'ai_sjtu_center_'.$centerId.'_team_'.$memberKey;
        $researchDirectionNames = $this->parseResearchDirectionNames($direction);

        $academicTitle = $title !== '' ? $title : null;
        $phoneOrNull = $phone !== '' ? $phone : null;

        $summaryParts = array_filter([
            $title !== '' ? '职称:'.$title : null,
            $phone !== '' ? '电话:'.$phone : null,
            $direction !== '' ? '研究方向:'.$direction : null,
            '所属中心:'.$centerName,
        ]);

        $lead = [
            'name' => $name,
            'email' => $email,
            'phone' => $phoneOrNull,
            'affiliation' => $centerName,
            'college' => $centerName,
            'university_name' => self::UNIVERSITY_NAME,
            'academic_title' => $academicTitle,
            'research_direction_names' => $researchDirectionNames,
        ];

        return new CrawlItemDto(
            externalId: $externalId,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            // Separate the labelled fields so the summary stays readable.
            summary: implode('; ', $summaryParts),
            schoolName: self::UNIVERSITY_NAME,
            section: $centerName,
            extra: [
                'platform' => 'ai_sjtu_research_center',
                'academic_title' => $academicTitle,
                'college_name' => $centerName,
                'profile_url' => $profileUrl,
                'phone' => $phoneOrNull,
                'research_direction_names' => $researchDirectionNames,
                'lead_author' => $lead,
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $centerName,
                'university_name' => self::UNIVERSITY_NAME,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Tell whether the text matches the keyword filter.
     *
     * An empty keyword list matches everything; otherwise at least one
     * non-empty keyword must occur in $plain (case-insensitive, multibyte-safe).
     *
     * @param  list<string>  $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $keyword) {
            if ($keyword !== '' && mb_stripos($plain, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Split a free-text research-direction string into distinct names.
     *
     * Separators: ideographic comma (U+3001), full-width comma (U+FF0C),
     * full-width semicolon (U+FF1B), ASCII comma, ASCII semicolon and slash.
     * Code points are written as \x{...} escapes so the pattern does not
     * depend on how this file's punctuation is encoded.
     *
     * @return list<string> Trimmed, non-empty, de-duplicated names in input order.
     */
    protected function parseResearchDirectionNames(string $direction): array
    {
        $direction = trim($direction);
        if ($direction === '') {
            return [];
        }

        $parts = preg_split('/[\x{3001}\x{FF0C}\x{FF1B},;\/]+/u', $direction) ?: [];

        return array_values(array_unique(array_filter(array_map(
            fn (string $part) => trim($part),
            $parts,
        ))));
    }

    /**
     * Collapse every whitespace run in a phone string to a single space and trim it.
     *
     * Yields an empty string if preg_replace() fails and returns null.
     */
    protected function normalizePhone(string $phone): string
    {
        return trim(preg_replace('/\s+/u', ' ', $phone) ?? '');
    }

    /**
     * Heuristic check that a string is a person's name.
     *
     * Accepts either 2-4 CJK ideographs (U+4E00..U+9FFF), optionally followed
     * by a middle dot and 2-4 more, or a Latin name: a letter followed by
     * 1-40 letters, whitespace, dots or hyphens.
     */
    protected function looksLikePersonName(string $name): bool
    {
        return (bool) preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)
            || (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,40}$/', $name);
    }
}

@ -10,6 +10,7 @@ use App\Models\News;
use App\Models\Paper;
use App\Models\Teacher;
use App\Models\University;
use App\Services\ResearchDirectionResolver;
use Carbon\Carbon;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Log;
@ -192,6 +193,8 @@ class CrawlImportService
return null;
}
$phone = trim((string) ($lead['phone'] ?? $payload['phone'] ?? ''));
$academicTitle = trim((string) ($lead['academic_title'] ?? $payload['academic_title'] ?? ''));
$collegeName = trim((string) ($lead['college'] ?? $lead['affiliation'] ?? $payload['college_name'] ?? ''));
$defaultDepartment = trim((string) ($defaults['department'] ?? ''));
@ -257,13 +260,26 @@ class CrawlImportService
'city' => $city ?: '待补充',
'title' => $academicTitle !== '' ? $academicTitle : '待补充',
'email' => $email,
'phone' => $phone !== '' ? $phone : null,
'source_dict_item_id' => $sourceId,
'status_dict_item_id' => $statusId,
'remark' => implode('', $remarkParts),
]);
$directionIds = [];
if (! empty($defaults['research_direction_ids']) && is_array($defaults['research_direction_ids'])) {
$teacher->researchDirections()->sync($defaults['research_direction_ids']);
$directionIds = array_map('intval', $defaults['research_direction_ids']);
} else {
$directionNames = $payload['research_direction_names'] ?? $lead['research_direction_names'] ?? [];
if (is_string($directionNames)) {
$directionNames = preg_split('/[、,,;\/]+/u', $directionNames) ?: [];
}
if (is_array($directionNames) && $directionNames !== []) {
$directionIds = app(ResearchDirectionResolver::class)->resolveIds([], $directionNames);
}
}
if ($directionIds !== []) {
$teacher->researchDirections()->sync($directionIds);
}
$paperExternalId = $payload['paper_external_id'] ?? null;

@ -3,6 +3,7 @@
namespace App\Services\Crawl;
use App\Models\CrawlSource;
use App\Services\Crawl\Adapters\AiSjtuResearchCenterAdapter;
use App\Services\Crawl\Adapters\ArxivApiAdapter;
use App\Services\Crawl\Adapters\FacultyListHtmlAdapter;
use App\Services\Crawl\Adapters\GenericNewsHtmlAdapter;
@ -20,6 +21,7 @@ class CrawlJobDispatcher
protected HuxiuHtmlAdapter $huxiu,
protected GenericPaperHtmlAdapter $genericPaper,
protected FacultyListHtmlAdapter $facultyList,
protected AiSjtuResearchCenterAdapter $aiSjtuResearchCenter,
) {}
/**
@ -40,6 +42,7 @@ class CrawlJobDispatcher
'generic_news_html' => $this->genericNews,
'generic_paper_html' => $this->genericPaper,
'faculty_list_html' => $this->facultyList,
'ai_sjtu_research_center_api' => $this->aiSjtuResearchCenter,
default => throw new \InvalidArgumentException("未支持的适配器:{$code}"),
};
}

@ -208,6 +208,9 @@ class CrawlJobRunnerService
'academic_title' => $dto->extra['academic_title'] ?? (is_array($lead) ? ($lead['academic_title'] ?? null) : null),
'college_name' => $dto->extra['college_name'] ?? (is_array($lead) ? ($lead['college'] ?? $lead['affiliation'] ?? null) : null),
'profile_url' => $dto->extra['profile_url'] ?? $dto->canonicalUrl,
'phone' => $dto->extra['phone'] ?? (is_array($lead) ? ($lead['phone'] ?? null) : null),
'research_direction_names' => $dto->extra['research_direction_names']
?? (is_array($lead) ? ($lead['research_direction_names'] ?? null) : null),
],
'status' => $status,
'target_type' => 'teacher',

@ -97,6 +97,10 @@ class CrawlSourceResolver
return $sources->firstWhere('adapter_code', 'arxiv_api');
}
if ($targetType === 'teacher' && str_contains($lower, 'ai.sjtu.edu.cn')) {
return $sources->firstWhere('adapter_code', 'ai_sjtu_research_center_api');
}
return null;
}
}

@ -180,6 +180,19 @@ class CrawlSourcesSeeder extends Seeder
]
);
CrawlSource::query()->updateOrCreate(
['adapter_code' => 'ai_sjtu_research_center_api', 'target_type' => 'teacher'],
[
'name' => '交大人工智能研究院研究中心',
'entry_url' => 'https://ai.sjtu.edu.cn/center',
'match_domains' => ['ai.sjtu.edu.cn'],
'config' => ['api_base' => 'https://ai.sjtu.edu.cn/api'],
'param_schema' => $teacherSchema,
'status' => 1,
'sort' => 25,
]
);
$this->command?->info('采集源 arXiv / 虎嗅 / 投资界 / 通用 HTML / 师资列表 已写入。');
}
}

@ -0,0 +1,64 @@
<?php
namespace Tests\Unit;
use App\Models\CrawlSource;
use App\Services\Crawl\Adapters\AiSjtuResearchCenterAdapter;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
class AiSjtuResearchCenterAdapterTest extends TestCase
{
    /**
     * With the list endpoint faked to return one center holding one team
     * member, fetch() yields a single item exposing that member's fields.
     */
    public function test_fetches_research_team_members_from_api(): void
    {
        $teamMember = [
            'id' => 0,
            'name' => '范金燕',
            'email' => 'jyfan@sjtu.edu.cn',
            'phone' => '54740206',
            'title' => '教授',
            'direction' => '最优化理论与方法、多项式优化',
        ];
        $payload = [
            'researchCenters' => [
                [
                    'id' => 3,
                    'name' => '数学基础研究中心',
                    'teams' => [$teamMember],
                ],
            ],
        ];

        Http::fake([
            'ai.sjtu.edu.cn/api/researchCenter*' => Http::response($payload, 200),
        ]);

        $crawlSource = new CrawlSource([
            'adapter_code' => 'ai_sjtu_research_center_api',
            'target_type' => 'teacher',
        ]);
        $crawler = new AiSjtuResearchCenterAdapter;

        $results = $crawler->fetch(
            'https://ai.sjtu.edu.cn/center',
            $crawlSource,
            ['max_results' => 10],
        );

        $this->assertCount(1, $results);

        $first = $results[0];
        $this->assertSame('范金燕', $first->title);
        $this->assertSame('jyfan@sjtu.edu.cn', $first->extra['lead_author']['email']);
        $this->assertSame('54740206', $first->extra['phone']);
        $this->assertSame(['最优化理论与方法', '多项式优化'], $first->extra['research_direction_names']);
        $this->assertSame('数学基础研究中心', $first->extra['college_name']);
    }

    /**
     * The protected direction parser splits a comma-separated string into its names.
     */
    public function test_parse_research_direction_names(): void
    {
        $crawler = new AiSjtuResearchCenterAdapter;

        // The parser is protected, so reach it through reflection.
        $parser = (new \ReflectionClass($crawler))->getMethod('parseResearchDirectionNames');
        $parser->setAccessible(true);

        $parsed = $parser->invokeArgs($crawler, ['图像与信号处理,数据科学,最优化方法']);

        $this->assertSame(['图像与信号处理', '数据科学', '最优化方法'], $parsed);
    }
}
Loading…
Cancel
Save