connectTimeout(10)
            ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0',
                'Accept' => 'application/json',
            ])
            ->get(self::API_BASE.self::LIST_PATH, [
                'page' => 1,
                'limit' => 99999,
            ]);

        if (! $response->successful()) {
            throw new \RuntimeException('无法访问人工智能研究院研究中心 API:HTTP '.$response->status());
        }

        $json = $response->json();
        if (! is_array($json)) {
            throw new \RuntimeException('研究中心 API 返回格式异常');
        }

        $centers = $json['researchCenters'] ?? [];
        if (! is_array($centers)) {
            throw new \RuntimeException('研究中心 API 缺少 researchCenters 字段');
        }

        $items = [];
        $seen = [];

        foreach ($centers as $center) {
            if (! is_array($center)) {
                continue;
            }

            // A center is only usable with a positive id and a non-empty name.
            $centerId = (int) ($center['id'] ?? 0);
            $centerName = trim((string) ($center['name'] ?? ''));
            if ($centerId <= 0 || $centerName === '') {
                continue;
            }

            $teams = $center['teams'] ?? [];
            if (! is_array($teams)) {
                continue;
            }

            foreach ($teams as $member) {
                if (! is_array($member)) {
                    continue;
                }

                $item = $this->memberToItem($member, $centerId, $centerName, $keywords, $requestUrl);

                // Skip rejected members and members whose external id was already emitted.
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $items[] = $item;

                // Stop as soon as the requested number of items has been collected.
                if (count($items) >= $maxResults) {
                    return $items;
                }
            }
        }

        return $items;
    }

    /**
     * Convert one team-member record of a research center into a crawl item.
     *
     * Returns null when the record has no name that looks like a person name,
     * or when the member's combined text does not match any keyword.
     *
     * The external id is "ai_sjtu_center_{centerId}_team_{memberKey}", where
     * memberKey is the member's "id" field, or md5(name.email) when that field
     * is missing, non-scalar or blank. The fallback for blank ids keeps members
     * without a usable id from all sharing one external id (which would make
     * the caller's de-duplication drop every one of them but the first).
     *
     * @param array $member Raw member record; read by the keys name, email, phone, title, direction and id.
     * @param int $centerId Id of the center the member belongs to.
     * @param string $centerName Name of that center; used as affiliation, college and section.
     * @param list<string> $keywords Keyword filter, see matchesKeywords().
     * @param string $requestUrl Not read by this method.
     */
    protected function memberToItem(
        array $member,
        int $centerId,
        string $centerName,
        array $keywords,
        string $requestUrl,
    ): ?CrawlItemDto {
        $name = trim((string) ($member['name'] ?? ''));
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            return null;
        }

        $email = CrawlAuthorParser::normalizeEmail(trim((string) ($member['email'] ?? '')));
        $phone = $this->normalizePhone((string) ($member['phone'] ?? ''));
        $title = trim((string) ($member['title'] ?? ''));
        $direction = trim((string) ($member['direction'] ?? ''));

        // `??` alone only covers a missing/null id. A blank or non-scalar id
        // must also fall back to the hash, otherwise the external id would be
        // ambiguous. Non-blank ids are used unchanged.
        $rawId = $member['id'] ?? null;
        $memberKey = is_scalar($rawId) ? (string) $rawId : '';
        if (trim($memberKey) === '') {
            $memberKey = md5($name.$email);
        }

        $plain = implode(' ', array_filter([$name, $centerName, $title, $direction, $email, $phone]));
        if (! $this->matchesKeywords($plain, $keywords)) {
            return null;
        }

        // Every member of a center shares the center page as profile URL.
        $profileUrl = 'https://ai.sjtu.edu.cn/center?centerId='.$centerId;
        $externalId = 'ai_sjtu_center_'.$centerId.'_team_'.$memberKey;
        $researchDirectionNames = $this->parseResearchDirectionNames($direction);

        $academicTitle = $title !== '' ? $title : null;
        $phoneOrNull = $phone !== '' ? $phone : null;

        // Null entries (absent fields) are removed before joining.
        $summaryParts = array_filter([
            $title !== '' ? '职称:'.$title : null,
            $phone !== '' ? '电话:'.$phone : null,
            $direction !== '' ? '研究方向:'.$direction : null,
            '所属中心:'.$centerName,
        ]);

        $lead = [
            'name' => $name,
            'email' => $email,
            'phone' => $phoneOrNull,
            'affiliation' => $centerName,
            'college' => $centerName,
            'university_name' => self::UNIVERSITY_NAME,
            'academic_title' => $academicTitle,
            'research_direction_names' => $researchDirectionNames,
        ];

        return new CrawlItemDto(
            externalId: $externalId,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: implode(';', $summaryParts),
            schoolName: self::UNIVERSITY_NAME,
            section: $centerName,
            extra: [
                'platform' => 'ai_sjtu_research_center',
                'academic_title' => $academicTitle,
                'college_name' => $centerName,
                'profile_url' => $profileUrl,
                'phone' => $phoneOrNull,
                'research_direction_names' => $researchDirectionNames,
                'lead_author' => $lead,
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $centerName,
                'university_name' => self::UNIVERSITY_NAME,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Tell whether the text contains at least one of the keywords.
     *
     * An empty keyword list matches everything. Empty-string keywords are
     * ignored; matching is a case-insensitive, multibyte-aware substring test.
     *
     * @param list<string> $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $keyword) {
            if ($keyword === '') {
                continue;
            }

            if (mb_stripos($plain, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Split a free-text research direction into distinct direction names.
     *
     * The text is split on runs of "、", ",", ",", ";", ";" and "/"; each part
     * is trimmed, empty parts are dropped and duplicates are removed while the
     * order of first occurrence is kept.
     *
     * @return list<string>
     */
    protected function parseResearchDirectionNames(string $direction): array
    {
        $direction = trim($direction);
        if ($direction === '') {
            return [];
        }

        $parts = preg_split('/[、,,;;\/]+/u', $direction);
        if ($parts === false) {
            // preg_split failed (e.g. malformed UTF-8): nothing can be extracted.
            return [];
        }

        $trimmed = array_map(
            static fn (string $part): string => trim($part),
            $parts,
        );

        // Filter on emptiness explicitly: a callback-less array_filter would
        // also discard falsy but legitimate names such as "0".
        $nonEmpty = array_filter(
            $trimmed,
            static fn (string $part): bool => $part !== '',
        );

        return array_values(array_unique($nonEmpty));
    }

    /**
     * Collapse every run of whitespace in a phone string into a single space
     * and trim the ends. Yields '' when the replacement fails.
     */
    protected function normalizePhone(string $phone): string
    {
        $collapsed = preg_replace('/\s+/u', ' ', $phone);

        return trim($collapsed ?? '');
    }

    /**
     * Heuristic check that a string is a person name.
     *
     * Accepts either 2-4 characters from U+4E00..U+9FFF, optionally followed by
     * "·" and another 2-4 such characters, or an ASCII letter followed by 1-40
     * ASCII letters, whitespace characters, dots or hyphens.
     */
    protected function looksLikePersonName(string $name): bool
    {
        if (preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)) {
            return true;
        }

        return (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,40}$/', $name);
    }
}