slake-school-service/app/Services/Crawl/Adapters/AiSjtuResearchCenterAdapter...

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Support\Facades\Http;

/**
 * Adapter for the research-center page of the SJTU Artificial Intelligence Institute (a Vue SPA).
 *
 * The data comes from GET /api/researchCenter; every entry of the returned list carries a
 * `teams` field, which backs the "research team" tab of that center on the website.
 * Each team member that passes the name and keyword filters becomes one CrawlItemDto.
 */
class AiSjtuResearchCenterAdapter implements CrawlerAdapterInterface
{
    protected const API_BASE = 'https://ai.sjtu.edu.cn/api';

    protected const LIST_PATH = '/researchCenter';

    protected const UNIVERSITY_NAME = '上海交通大学';

    /**
     * Download the research-center list and convert team members into crawl items.
     *
     * @param  string  $requestUrl  Forwarded to memberToItem(); the API endpoint itself is fixed by the class constants.
     * @param  CrawlSource  $source  Not read by this adapter.
     * @param  array  $params  Reads `keyword` (filter expression) and `max_results` (clamped to 1..500, default 200).
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException When the API answers with a non-2xx status or an unexpected payload.
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse($this->scalarToString($params['keyword'] ?? ''));
        $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 200)));

        $response = Http::timeout(30)
            ->connectTimeout(10)
            // The first argument is the total number of attempts, so 2 means "one retry".
            // (The previous value of 1 never retried at all.) Only connection failures are retried.
            ->retry(
                2,
                300,
                static fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException,
                throw: false,
            )
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0',
                'Accept' => 'application/json',
            ])
            ->get(self::API_BASE.self::LIST_PATH, [
                'page' => 1,
                // Large limit so the whole list arrives in a single page.
                'limit' => 99999,
            ]);

        if (! $response->successful()) {
            throw new \RuntimeException('无法访问人工智能研究院研究中心 API：HTTP '.$response->status());
        }

        $json = $response->json();
        if (! is_array($json)) {
            throw new \RuntimeException('研究中心 API 返回格式异常');
        }

        $centers = $json['researchCenters'] ?? [];
        if (! is_array($centers)) {
            throw new \RuntimeException('研究中心 API 缺少 researchCenters 字段');
        }

        $items = [];
        $seen = [];

        foreach ($centers as $center) {
            if (! is_array($center)) {
                continue;
            }

            // Only accept numeric ids; anything else (arrays, free text) counts as "no id".
            $rawCenterId = $center['id'] ?? 0;
            $centerId = is_numeric($rawCenterId) ? (int) $rawCenterId : 0;
            $centerName = trim($this->scalarToString($center['name'] ?? ''));
            if ($centerId <= 0 || $centerName === '') {
                continue;
            }

            $teams = $center['teams'] ?? [];
            if (! is_array($teams)) {
                continue;
            }

            foreach ($teams as $member) {
                if (! is_array($member)) {
                    continue;
                }

                $item = $this->memberToItem($member, $centerId, $centerName, $keywords, $requestUrl);
                // Skip filtered-out members and ids already emitted in this run.
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $items[] = $item;

                if (count($items) >= $maxResults) {
                    return $items;
                }
            }
        }

        return $items;
    }

    /**
     * Build a crawl item from one `teams` entry, or return null when the entry is filtered out.
     *
     * An entry is dropped when its name does not look like a person name or when none of the
     * keywords occurs in its searchable text (name, center, title, direction, email, phone).
     *
     * @param  array  $member  Raw team entry; reads `name`, `email`, `phone`, `title`, `direction`, `id`.
     * @param  list<string>  $keywords
     * @param  string  $requestUrl  Currently unused; kept for interface stability.
     */
    protected function memberToItem(
        array $member,
        int $centerId,
        string $centerName,
        array $keywords,
        string $requestUrl,
    ): ?CrawlItemDto {
        $name = trim($this->scalarToString($member['name'] ?? ''));
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            return null;
        }

        $email = CrawlAuthorParser::normalizeEmail(trim($this->scalarToString($member['email'] ?? '')));
        $phone = $this->normalizePhone($this->scalarToString($member['phone'] ?? ''));
        $title = trim($this->scalarToString($member['title'] ?? ''));
        $direction = trim($this->scalarToString($member['direction'] ?? ''));

        // A missing, empty or zero id would give every such member of a center the same
        // external id (and de-duplication would drop all but one), so fall back to a hash.
        $memberId = trim($this->scalarToString($member['id'] ?? ''));
        $memberKey = ($memberId !== '' && $memberId !== '0') ? $memberId : md5($name.$email);

        $plain = implode(' ', array_filter([$name, $centerName, $title, $direction, $email, $phone]));
        if (! $this->matchesKeywords($plain, $keywords)) {
            return null;
        }

        // Members are linked to their center's page; no per-member URL is built here.
        $profileUrl = 'https://ai.sjtu.edu.cn/center?centerId='.$centerId;
        $externalId = 'ai_sjtu_center_'.$centerId.'_team_'.$memberKey;
        $researchDirectionNames = $this->parseResearchDirectionNames($direction);

        // Empty strings are stored as null so downstream consumers see "absent" rather than "".
        $academicTitle = $title !== '' ? $title : null;
        $phoneOrNull = $phone !== '' ? $phone : null;

        $summaryParts = array_filter([
            $title !== '' ? '职称：'.$title : null,
            $phone !== '' ? '电话：'.$phone : null,
            $direction !== '' ? '研究方向：'.$direction : null,
            '所属中心：'.$centerName,
        ]);

        $lead = [
            'name' => $name,
            'email' => $email,
            'phone' => $phoneOrNull,
            'affiliation' => $centerName,
            'college' => $centerName,
            'university_name' => self::UNIVERSITY_NAME,
            'academic_title' => $academicTitle,
            'research_direction_names' => $researchDirectionNames,
        ];

        return new CrawlItemDto(
            externalId: $externalId,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: implode('；', $summaryParts),
            schoolName: self::UNIVERSITY_NAME,
            section: $centerName,
            extra: [
                'platform' => 'ai_sjtu_research_center',
                'academic_title' => $academicTitle,
                'college_name' => $centerName,
                'profile_url' => $profileUrl,
                'phone' => $phoneOrNull,
                'research_direction_names' => $researchDirectionNames,
                'lead_author' => $lead,
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $centerName,
                'university_name' => self::UNIVERSITY_NAME,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Tell whether the text contains at least one keyword (case-insensitive, multibyte-safe).
     *
     * An empty keyword list matches everything; empty keyword strings are ignored.
     *
     * @param  list<string>  $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $keyword) {
            if ($keyword !== '' && mb_stripos($plain, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Split a free-text research direction into distinct, trimmed names.
     *
     * Separators are the Chinese and ASCII commas, the enumeration comma, both semicolons
     * and the slash; runs of separators count as one.
     *
     * @return list<string>
     */
    protected function parseResearchDirectionNames(string $direction): array
    {
        $direction = trim($direction);
        if ($direction === '') {
            return [];
        }

        $parts = preg_split('/[、，,;；\/]+/u', $direction) ?: [];
        $names = array_map(static fn (string $part): string => trim($part), $parts);

        // Explicit predicate: a bare array_filter() would also discard a literal "0".
        $names = array_filter($names, static fn (string $part): bool => $part !== '');

        return array_values(array_unique($names));
    }

    /**
     * Collapse every whitespace run in a phone string to a single space and trim the ends.
     *
     * Returns an empty string when the regex engine reports an error (e.g. invalid UTF-8).
     */
    protected function normalizePhone(string $phone): string
    {
        return trim(preg_replace('/\s+/u', ' ', $phone) ?? '');
    }

    /**
     * Heuristic check that a string is a person name rather than a heading or placeholder.
     *
     * Accepts 2-4 CJK ideographs, optionally followed by a middle dot and 2-4 more, or a
     * Latin name of 2-41 characters made of letters, whitespace, dots and hyphens.
     */
    protected function looksLikePersonName(string $name): bool
    {
        return (bool) preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)
            || (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,40}$/', $name);
    }

    /**
     * Convert a decoded JSON value to a string without tripping over non-scalar values.
     *
     * Strings, integers and floats are cast; everything else (null, booleans, arrays,
     * objects) yields an empty string instead of an "Array to string conversion" error.
     */
    protected function scalarToString(mixed $value): string
    {
        return is_string($value) || is_int($value) || is_float($value) ? (string) $value : '';
    }
}