From fc3e050c762f4c6221e81d267fc4c86b4822f1ef Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Tue, 23 Jun 2026 16:17:54 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BA=A4=E5=A4=A7=E6=99=BA=E8=83=BD=E7=A0=94?= =?UTF-8?q?=E7=A9=B6=E9=99=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Adapters/AiSjtuResearchCenterAdapter.php | 219 ++++++++++++++++++ app/Services/Crawl/CrawlImportService.php | 18 +- app/Services/Crawl/CrawlJobDispatcher.php | 3 + app/Services/Crawl/CrawlJobRunnerService.php | 3 + app/Services/Crawl/CrawlSourceResolver.php | 4 + database/seeders/CrawlSourcesSeeder.php | 13 ++ .../Unit/AiSjtuResearchCenterAdapterTest.php | 64 +++++ 7 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 app/Services/Crawl/Adapters/AiSjtuResearchCenterAdapter.php create mode 100644 tests/Unit/AiSjtuResearchCenterAdapterTest.php diff --git a/app/Services/Crawl/Adapters/AiSjtuResearchCenterAdapter.php b/app/Services/Crawl/Adapters/AiSjtuResearchCenterAdapter.php new file mode 100644 index 0000000..39727f9 --- /dev/null +++ b/app/Services/Crawl/Adapters/AiSjtuResearchCenterAdapter.php @@ -0,0 +1,219 @@ +connectTimeout(10) + ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) + ->withHeaders([ + 'User-Agent' => 'SlakeSchool-Crawler/1.0', + 'Accept' => 'application/json', + ]) + ->get(self::API_BASE.self::LIST_PATH, [ + 'page' => 1, + 'limit' => 99999, + ]); + + if (! $response->successful()) { + throw new \RuntimeException('无法访问人工智能研究院研究中心 API:HTTP '.$response->status()); + } + + $json = $response->json(); + if (! is_array($json)) { + throw new \RuntimeException('研究中心 API 返回格式异常'); + } + + $centers = $json['researchCenters'] ?? []; + if (! is_array($centers)) { + throw new \RuntimeException('研究中心 API 缺少 researchCenters 字段'); + } + + $items = []; + $seen = []; + + foreach ($centers as $center) { + if (! 
is_array($center)) { + continue; + } + + $centerId = (int) ($center['id'] ?? 0); + $centerName = trim((string) ($center['name'] ?? '')); + if ($centerId <= 0 || $centerName === '') { + continue; + } + + $teams = $center['teams'] ?? []; + if (! is_array($teams)) { + continue; + } + + foreach ($teams as $member) { + if (! is_array($member)) { + continue; + } + + $item = $this->memberToItem($member, $centerId, $centerName, $keywords, $requestUrl); + if ($item === null || isset($seen[$item->externalId])) { + continue; + } + + $seen[$item->externalId] = true; + $items[] = $item; + + if (count($items) >= $maxResults) { + return $items; + } + } + } + + return $items; + } + + /** + * @param list<string> $keywords Keywords to filter by; an empty list matches every member. + */ + protected function memberToItem( + array $member, + int $centerId, + string $centerName, + array $keywords, + string $requestUrl, + ): ?CrawlItemDto { + $name = trim((string) ($member['name'] ?? '')); + if ($name === '' || ! $this->looksLikePersonName($name)) { + return null; + } + + $email = CrawlAuthorParser::normalizeEmail(trim((string) ($member['email'] ?? ''))); + $phone = $this->normalizePhone((string) ($member['phone'] ?? '')); + $title = trim((string) ($member['title'] ?? '')); + $direction = trim((string) ($member['direction'] ?? '')); + $memberKey = ! empty($member['id']) ? (string) $member['id'] : md5($name.$email); /* An empty or 0 id is treated as missing (same rule as the $centerId <= 0 guard): hashing name+email keeps members of one center from sharing an externalId and being dropped as duplicates. */ + + $plain = implode(' ', array_filter([$name, $centerName, $title, $direction, $email, $phone])); + if (! $this->matchesKeywords($plain, $keywords)) { + return null; + } + + $profileUrl = 'https://ai.sjtu.edu.cn/center?centerId='.$centerId; + $externalId = 'ai_sjtu_center_'.$centerId.'_team_'.$memberKey; + $researchDirectionNames = $this->parseResearchDirectionNames($direction); + + $summaryParts = array_filter([ + $title !== '' ? '职称:'.$title : null, + $phone !== '' ? '电话:'.$phone : null, + $direction !== '' ? '研究方向:'.$direction : null, + '所属中心:'.$centerName, + ]); + + $lead = [ + 'name' => $name, + 'email' => $email, + 'phone' => $phone !== '' ? 
$phone : null, + 'affiliation' => $centerName, + 'college' => $centerName, + 'university_name' => self::UNIVERSITY_NAME, + 'academic_title' => $title !== '' ? $title : null, + 'research_direction_names' => $researchDirectionNames, + ]; + + return new CrawlItemDto( + externalId: $externalId, + title: $name, + canonicalUrl: $profileUrl, + authors: $name, + summary: implode(';', $summaryParts), + schoolName: self::UNIVERSITY_NAME, + section: $centerName, + extra: [ + 'platform' => 'ai_sjtu_research_center', + 'academic_title' => $title !== '' ? $title : null, + 'college_name' => $centerName, + 'profile_url' => $profileUrl, + 'phone' => $phone !== '' ? $phone : null, + 'research_direction_names' => $researchDirectionNames, + 'lead_author' => $lead, + ], + authorsParsed: [[ + 'name' => $name, + 'email' => $email, + 'affiliation' => $centerName, + 'university_name' => self::UNIVERSITY_NAME, + 'academic_title' => $title !== '' ? $title : null, + ]], + ); + } + + /** + * @param list<string> $keywords Case-insensitive substrings; an empty list matches everything, empty entries are ignored. + */ + protected function matchesKeywords(string $plain, array $keywords): bool + { + if ($keywords === []) { + return true; + } + + foreach ($keywords as $keyword) { + if ($keyword !== '' && mb_stripos($plain, $keyword) !== false) { + return true; + } + } + + return false; + } + + /** + * @return list<string> Trimmed, de-duplicated direction names split on 、 , , ; ; and /; empty for blank input. + */ + protected function parseResearchDirectionNames(string $direction): array + { + $direction = trim($direction); + if ($direction === '') { + return []; + } + + $parts = preg_split('/[、,,;;\/]+/u', $direction) ?: []; + + return array_values(array_unique(array_filter(array_map( + fn (string $part) => trim($part), + $parts, + )))); + } + + protected function normalizePhone(string $phone): string + { + $phone = trim(preg_replace('/\s+/u', ' ', $phone) ?? 
''); + + return $phone; + } + + protected function looksLikePersonName(string $name): bool + { + return (bool) preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name) + || (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,40}$/', $name); + } +} diff --git a/app/Services/Crawl/CrawlImportService.php b/app/Services/Crawl/CrawlImportService.php index 090a504..0a71204 100644 --- a/app/Services/Crawl/CrawlImportService.php +++ b/app/Services/Crawl/CrawlImportService.php @@ -10,6 +10,7 @@ use App\Models\News; use App\Models\Paper; use App\Models\Teacher; use App\Models\University; +use App\Services\ResearchDirectionResolver; use Carbon\Carbon; use Illuminate\Support\Facades\DB; use Illuminate\Support\Facades\Log; @@ -192,6 +193,8 @@ class CrawlImportService return null; } + $phone = trim((string) ($lead['phone'] ?? $payload['phone'] ?? '')); + $academicTitle = trim((string) ($lead['academic_title'] ?? $payload['academic_title'] ?? '')); $collegeName = trim((string) ($lead['college'] ?? $lead['affiliation'] ?? $payload['college_name'] ?? '')); $defaultDepartment = trim((string) ($defaults['department'] ?? '')); @@ -257,13 +260,26 @@ class CrawlImportService 'city' => $city ?: '待补充', 'title' => $academicTitle !== '' ? $academicTitle : '待补充', 'email' => $email, + 'phone' => $phone !== '' ? $phone : null, 'source_dict_item_id' => $sourceId, 'status_dict_item_id' => $statusId, 'remark' => implode(';', $remarkParts), ]); + $directionIds = []; if (! empty($defaults['research_direction_ids']) && is_array($defaults['research_direction_ids'])) { - $teacher->researchDirections()->sync($defaults['research_direction_ids']); + $directionIds = array_map('intval', $defaults['research_direction_ids']); + } else { + $directionNames = $payload['research_direction_names'] ?? $lead['research_direction_names'] ?? 
[]; + if (is_string($directionNames)) { + $directionNames = preg_split('/[、,,;;\/]+/u', $directionNames) ?: []; + } + if (is_array($directionNames) && $directionNames !== []) { + $directionIds = app(ResearchDirectionResolver::class)->resolveIds([], $directionNames); + } + } + if ($directionIds !== []) { + $teacher->researchDirections()->sync($directionIds); } $paperExternalId = $payload['paper_external_id'] ?? null; diff --git a/app/Services/Crawl/CrawlJobDispatcher.php b/app/Services/Crawl/CrawlJobDispatcher.php index 545b0a3..467ce5c 100644 --- a/app/Services/Crawl/CrawlJobDispatcher.php +++ b/app/Services/Crawl/CrawlJobDispatcher.php @@ -3,6 +3,7 @@ namespace App\Services\Crawl; use App\Models\CrawlSource; +use App\Services\Crawl\Adapters\AiSjtuResearchCenterAdapter; use App\Services\Crawl\Adapters\ArxivApiAdapter; use App\Services\Crawl\Adapters\FacultyListHtmlAdapter; use App\Services\Crawl\Adapters\GenericNewsHtmlAdapter; @@ -20,6 +21,7 @@ class CrawlJobDispatcher protected HuxiuHtmlAdapter $huxiu, protected GenericPaperHtmlAdapter $genericPaper, protected FacultyListHtmlAdapter $facultyList, + protected AiSjtuResearchCenterAdapter $aiSjtuResearchCenter, ) {} /** @@ -40,6 +42,7 @@ class CrawlJobDispatcher 'generic_news_html' => $this->genericNews, 'generic_paper_html' => $this->genericPaper, 'faculty_list_html' => $this->facultyList, + 'ai_sjtu_research_center_api' => $this->aiSjtuResearchCenter, default => throw new \InvalidArgumentException("未支持的适配器:{$code}"), }; } diff --git a/app/Services/Crawl/CrawlJobRunnerService.php b/app/Services/Crawl/CrawlJobRunnerService.php index 858ca15..b8b0ec4 100644 --- a/app/Services/Crawl/CrawlJobRunnerService.php +++ b/app/Services/Crawl/CrawlJobRunnerService.php @@ -208,6 +208,9 @@ class CrawlJobRunnerService 'academic_title' => $dto->extra['academic_title'] ?? (is_array($lead) ? ($lead['academic_title'] ?? null) : null), 'college_name' => $dto->extra['college_name'] ?? (is_array($lead) ? ($lead['college'] ?? 
$lead['affiliation'] ?? null) : null), 'profile_url' => $dto->extra['profile_url'] ?? $dto->canonicalUrl, + 'phone' => $dto->extra['phone'] ?? (is_array($lead) ? ($lead['phone'] ?? null) : null), + 'research_direction_names' => $dto->extra['research_direction_names'] + ?? (is_array($lead) ? ($lead['research_direction_names'] ?? null) : null), ], 'status' => $status, 'target_type' => 'teacher', diff --git a/app/Services/Crawl/CrawlSourceResolver.php b/app/Services/Crawl/CrawlSourceResolver.php index cf5e564..d1fcce6 100644 --- a/app/Services/Crawl/CrawlSourceResolver.php +++ b/app/Services/Crawl/CrawlSourceResolver.php @@ -97,6 +97,10 @@ class CrawlSourceResolver return $sources->firstWhere('adapter_code', 'arxiv_api'); } + if ($targetType === 'teacher' && str_contains($lower, 'ai.sjtu.edu.cn')) { + return $sources->firstWhere('adapter_code', 'ai_sjtu_research_center_api'); + } + return null; } } diff --git a/database/seeders/CrawlSourcesSeeder.php b/database/seeders/CrawlSourcesSeeder.php index 766492f..b8d7094 100644 --- a/database/seeders/CrawlSourcesSeeder.php +++ b/database/seeders/CrawlSourcesSeeder.php @@ -180,6 +180,19 @@ class CrawlSourcesSeeder extends Seeder ] ); + CrawlSource::query()->updateOrCreate( + ['adapter_code' => 'ai_sjtu_research_center_api', 'target_type' => 'teacher'], + [ + 'name' => '交大人工智能研究院研究中心', + 'entry_url' => 'https://ai.sjtu.edu.cn/center', + 'match_domains' => ['ai.sjtu.edu.cn'], + 'config' => ['api_base' => 'https://ai.sjtu.edu.cn/api'], + 'param_schema' => $teacherSchema, + 'status' => 1, + 'sort' => 25, + ] + ); + $this->command?->info('采集源 arXiv / 虎嗅 / 投资界 / 通用 HTML / 师资列表 已写入。'); } } diff --git a/tests/Unit/AiSjtuResearchCenterAdapterTest.php b/tests/Unit/AiSjtuResearchCenterAdapterTest.php new file mode 100644 index 0000000..b7d0bf9 --- /dev/null +++ b/tests/Unit/AiSjtuResearchCenterAdapterTest.php @@ -0,0 +1,64 @@ + Http::response([ + 'researchCenters' => [ + [ + 'id' => 3, + 'name' => '数学基础研究中心', + 'teams' => [ + [ + 'id' 
=> 0, + 'name' => '范金燕', + 'email' => 'jyfan@sjtu.edu.cn', + 'phone' => '54740206', + 'title' => '教授', + 'direction' => '最优化理论与方法、多项式优化', + ], + ], + ], + ], + ], 200), + ]); + + $adapter = new AiSjtuResearchCenterAdapter; + $source = new CrawlSource([ + 'adapter_code' => 'ai_sjtu_research_center_api', + 'target_type' => 'teacher', + ]); + + $items = $adapter->fetch('https://ai.sjtu.edu.cn/center', $source, [ + 'max_results' => 10, + ]); + + $this->assertCount(1, $items); + $this->assertSame('范金燕', $items[0]->title); + $this->assertSame('jyfan@sjtu.edu.cn', $items[0]->extra['lead_author']['email']); + $this->assertSame('54740206', $items[0]->extra['phone']); + $this->assertSame(['最优化理论与方法', '多项式优化'], $items[0]->extra['research_direction_names']); + $this->assertSame('数学基础研究中心', $items[0]->extra['college_name']); + } + + public function test_parse_research_direction_names(): void + { + $adapter = new AiSjtuResearchCenterAdapter; + $method = new \ReflectionMethod($adapter, 'parseResearchDirectionNames'); + $method->setAccessible(true); + + $this->assertSame( + ['图像与信号处理', '数据科学', '最优化方法'], + $method->invoke($adapter, '图像与信号处理,数据科学,最优化方法'), + ); + } +}