loadImportedExternalIds() : [];
        $maxScanPages = $this->resolveMaxScanPages($maxPages, $maxResults, $skipImported);

        // Configuration can route the whole search through the HTML search page.
        if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
            return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
        }

        $items = [];
        $seen = [];

        for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) {
            $start = $page * $pageSize;
            $batch = $this->fetchApiPage($keywordRaw, $start, $pageSize);
            if ($batch === []) {
                break;
            }

            foreach ($batch as $item) {
                // De-duplicate across pages by external id.
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;

                if ($skipImported && isset($importedIds[$item->externalId])) {
                    continue;
                }

                $items[] = $item;
                if (count($items) >= $maxResults) {
                    break 2;
                }
            }

            // Stop paging once a page comes back shorter than requested.
            if (count($batch) < $pageSize) {
                break;
            }
        }

        if ($items !== []) {
            return $this->finalizeItems($items);
        }

        if ($keywordRaw === '') {
            throw new \RuntimeException('arXiv API 未返回结果,请稍后重试');
        }

        // The API produced nothing for a non-empty keyword: fall back to the HTML search page.
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize)));
    }

    /**
     * Fetch one page of results from the arXiv export API.
     *
     * Connection/request exceptions, unsuccessful responses and arXiv error
     * feeds all produce an empty list instead of an exception.
     *
     * @param string $keywordRaw Raw keyword, converted by CrawlKeywordParser::buildArxivSearchQuery().
     * @param int    $start      Offset sent as the `start` query parameter.
     * @param int    $maxResults Page size; clamped to the range 1..50.
     *
     * @return list<CrawlItemDto>
     */
    protected function fetchApiPage(string $keywordRaw, int $start, int $maxResults): array
    {
        $maxResults = min(50, max(1, $maxResults));

        try {
            $response = $this->requestApiOnce([
                'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw),
                'start' => $start,
                'max_results' => $maxResults,
                'sortBy' => 'submittedDate',
                'sortOrder' => 'descending',
            ]);
        } catch (ConnectionException|RequestException) {
            return [];
        }

        if (! $response->successful()) {
            return [];
        }

        $body = $response->body();

        // arXiv reports API errors as an Atom feed whose entry id lives under
        // arxiv.org/api/errors and whose title is exactly "Error". Testing for the
        // bare word "Error" anywhere in the body would also throw away valid feeds
        // whose papers merely mention it (e.g. "Quantum Error Correction").
        if (str_contains($body, 'arxiv.org/api/errors') || str_contains($body, '<title>Error</title>')) {
            return [];
        }

        return $this->parseAtomFeed($body, $keywordRaw);
    }

    /**
     * Send a single API request through the gate, retrying once after a
     * three-second pause when the first attempt raises a ConnectionException.
     *
     * @param array $queryParams Query-string parameters forwarded to sendRequest().
     */
    protected function requestApiOnce(array $queryParams): Response
    {
        try {
            return $this->gate->run(fn () => $this->sendRequest(self::API_URL, $queryParams));
        } catch (ConnectionException) {
            sleep(3);

            return $this->gate->run(fn () => $this->sendRequest(self::API_URL, $queryParams));
        }
    }

    /**
     * Run the HTML search and insist on a non-empty result.
     *
     * @param string          $keyword    Keyword passed to fetchViaHtmlSearch().
     * @param int             $maxResults Maximum number of items to return.
     * @param \Throwable|null $previous   Earlier failure; chained onto the thrown exception and
     *                                    used to pick the rate-limit (HTTP 429) message.
     *
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException When the HTML search yields no items.
     */
    protected function requireHtmlSearchItems(string $keyword, int $maxResults, ?\Throwable $previous = null): array
    {
        $items = $this->fetchViaHtmlSearch($keyword, $maxResults);
        if ($items !== []) {
            return $items;
        }

        $hint = $previous instanceof RequestException && $previous->response?->status() === 429
            ? 'arXiv 访问过于频繁(HTTP 429),请等待 1~2 分钟后再试'
            : 'arXiv 搜索页抓取失败,请检查网络或稍后重试';

        throw new \RuntimeException($hint, 0, $previous);
    }

    /**
     * Convert an arXiv Atom feed into crawl items.
     *
     * Entries whose id does not yield an arXiv identifier are skipped. An empty
     * or unparseable body yields an empty list, matching how fetchApiPage()
     * reports every other kind of failure.
     *
     * @param string $body    Raw Atom XML.
     * @param string $keyword Keyword recorded in each item's `extra` data.
     *
     * @return CrawlItemDto[]
     */
    protected function parseAtomFeed(string $body, string $keyword): array
    {
        if (trim($body) === '') {
            return [];
        }

        // The SimpleXMLElement constructor raises warnings and throws on malformed
        // XML (for instance an HTML page served with status 200). Collect libxml
        // errors internally and treat a parse failure as "no results".
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            $xml = new SimpleXMLElement($body);
        } catch (\Exception) {
            return [];
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        $xml->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom');
        $entries = $xml->xpath('//atom:entry') ?: [];
        $items = [];

        foreach ($entries as $entry) {
            $entry->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom');
            $idUrl = (string) ($entry->id ?? '');
            $arxivId = $this->extractArxivId($idUrl);
            if (! $arxivId) {
                continue;
            }

            $authorsParsed = [];
            foreach ($entry->author as $author) {
                $author->registerXPathNamespace('arxiv', 'http://arxiv.org/schemas/atom');
                $name = trim((string) ($author->name ?? ''));
                $affNodes = $author->xpath('arxiv:affiliation') ?: [];
                // Only the first affiliation node is used.
                $affiliation = trim((string) ($affNodes[0] ?? ''));

                if ($name !== '') {
                    $authorsParsed[] = [
                        'name' => $name,
                        'email' => null,
                        'affiliation' => $affiliation !== '' ? $affiliation : null,
                        'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
                    ];
                }
            }

            $authorNames = array_column($authorsParsed, 'name');
            $published = (string) ($entry->published ?? '');
            // Keep only the leading 10 characters (the date part) of the timestamp.
            $publishedAt = $published ? substr($published, 0, 10) : null;
            $lead = CrawlAuthorParser::leadAuthor(implode('; ', $authorNames), $authorsParsed);

            $items[] = new CrawlItemDto(
                externalId: 'arxiv:'.$arxivId,
                title: ArxivTextNormalizer::normalize(trim((string) ($entry->title ?? ''))) ?? '',
                canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
                authors: implode('; ', $authorNames),
                summary: ArxivTextNormalizer::normalize(trim((string) ($entry->summary ?? ''))),
                publishedAt: $publishedAt,
                schoolName: $lead['university_name'] ?? null,
                extra: [
                    'platform' => 'arxiv',
                    'arxiv_id' => $arxivId,
                    'keyword' => $keyword,
                    'source' => 'api',
                    'authors_parsed' => $authorsParsed,
                    'lead_author' => $lead,
                ],
                authorsParsed: $authorsParsed,
            );
        }

        return $items;
    }

    /**
     * Search-page fallback (used when the export API answers with HTTP 429).
     * Do not send a `size` parameter: the request would fail with HTTP 400.
     *
     * @param string $keyword    Sent as the `query` parameter with `searchtype=all`.
     * @param int    $maxResults Maximum number of items parsed from the page.
     *
     * @return CrawlItemDto[] Empty when the response is not successful.
     */
    protected function fetchViaHtmlSearch(string $keyword, int $maxResults): array
    {
        $response = $this->gate->run(fn () => $this->sendRequest(self::SEARCH_URL, [
            'query' => $keyword,
            'searchtype' => 'all',
        ]));

        if (! $response->successful()) {
            return [];
        }

        return $this->parseSearchHtml($response->body(), $keyword, $maxResults);
    }

    /**
     * Extract crawl items from the HTML of an arXiv search result page.
     *
     * NOTE(review): the tag-matching regular expressions in this method look as
     * if their HTML markup was stripped somewhere before this review (both
     * abstract patterns are identical, which makes the elseif unreachable). They
     * are reproduced unchanged here; confirm them against version control.
     *
     * @param string $html       Search page HTML.
     * @param string $keyword    Keyword recorded in each item's `extra` data.
     * @param int    $maxResults Maximum number of items to return.
     *
     * @return CrawlItemDto[]
     */
    protected function parseSearchHtml(string $html, string $keyword, int $maxResults): array
    {
        if (! preg_match_all('#
  • (.*?)
  • #s', $html, $blocks)) {
            return [];
        }

        $items = [];

        foreach ($blocks[1] as $block) {
            if (count($items) >= $maxResults) {
                break;
            }

            // A result block without an abs link cannot be identified; skip it.
            if (! preg_match('#arxiv\.org/abs/([^"?\s]+)#', $block, $idMatch)) {
                continue;
            }
            $arxivId = $idMatch[1];

            $title = '';
            if (preg_match('#

    \s*(.*?)\s*

    #s', $block, $titleMatch)) {
                $title = ArxivTextNormalizer::normalize(trim(html_entity_decode(strip_tags($titleMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8'))) ?? '';
            }

            $authors = '';
            if (preg_match('#

    (.*?)

    #s', $block, $authorMatch)) {
                if (preg_match_all('#]*>([^<]+)#', $authorMatch[1], $authorNames)) {
                    $authors = implode('; ', array_map('trim', $authorNames[1]));
                }
            }

            $summary = '';
            if (preg_match('#]*>(.*?)#s', $block, $abstractMatch)) {
                $summary = ArxivTextNormalizer::normalize(trim(html_entity_decode(strip_tags($abstractMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8')));
            } elseif (preg_match('#]*>(.*?)#s', $block, $abstractShort)) {
                $summary = ArxivTextNormalizer::normalize(trim(html_entity_decode(strip_tags($abstractShort[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8')));
            }

            $publishedAt = ArxivMetadataParser::parsePublishedDate($block);

            // The search page provides names only, so the remaining author fields stay null.
            $authorsParsed = [];
            if ($authors !== '') {
                foreach (preg_split('/\s*;\s*/', $authors) ?: [] as $name) {
                    $name = trim($name);
                    if ($name !== '') {
                        $authorsParsed[] = [
                            'name' => $name,
                            'email' => null,
                            'affiliation' => null,
                            'university_name' => null,
                        ];
                    }
                }
            }

            $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

            $items[] = new CrawlItemDto(
                externalId: 'arxiv:'.$arxivId,
                title: $title,
                canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
                authors: $authors,
                summary: $summary,
                publishedAt: $publishedAt,
                schoolName: $lead['university_name'] ?? null,
                extra: [
                    'platform' => 'arxiv',
                    'arxiv_id' => $arxivId,
                    'keyword' => $keyword,
                    'source' => 'html_search',
                    'pdf_url' => ArxivMetadataParser::extractPdfUrl($block, $arxivId),
                    'html_url' => ArxivMetadataParser::extractHtmlUrl($block, $arxivId),
                    'authors_parsed' => $authorsParsed,
                    'lead_author' => $lead,
                ],
                authorsParsed: $authorsParsed,
            );
        }

        return $items;
    }

    /**
     * Issue a GET request with the crawler's User-Agent and Accept headers.
     *
     * Timeouts and the contact e-mail come from the `crawl.arxiv.*` config keys.
     * The HTTP client retries connection exceptions (2 attempts, 2000 ms apart).
     *
     * @param string $url         Target URL.
     * @param array  $queryParams Query-string parameters.
     */
    protected function sendRequest(string $url, array $queryParams): Response
    {
        $email = (string) config('crawl.arxiv.contact_email', 'support@example.com');
        $timeout = (int) config('crawl.arxiv.http_timeout_seconds', 60);
        $connectTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 30);

        return Http::timeout($timeout)
            ->connectTimeout($connectTimeout)
            ->retry(2, 2000, fn ($exception) => $exception instanceof ConnectionException, throw: false)
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org; mailto:'.$email.')',
                'Accept' => 'application/atom+xml, text/html;q=0.9',
            ])
            ->get($url, $queryParams);
    }

    /**
     * Pull the arXiv identifier out of an `/abs/` URL.
     *
     * @param string $idUrl URL such as the Atom entry id.
     *
     * @return string|null The identifier (version suffix kept as-is), or null when none is found.
     */
    protected function extractArxivId(string $idUrl): ?string
    {
        // Old-style identifiers contain a slash ("hep-th/9901001v1", "math.AG/0601001").
        // Try them first: the generic patterns below stop at the first slash and
        // would reduce every such paper to its archive name.
        if (preg_match('~/abs/([a-z][a-z-]*(?:\.[a-z]{2})?/\d{7}(?:v\d+)?)~i', $idUrl, $m)) {
            return $m[1];
        }

        if (preg_match('~arxiv\.org/abs/([^\s/?]+)~i', $idUrl, $m)) {
            return $m[1];
        }

        if (preg_match('~/abs/([^\s/?]+)~', $idUrl, $m)) {
            return $m[1];
        }

        return null;
    }

    /**
     * Optionally pass the items through the abs-page enricher.
     *
     * @param list<CrawlItemDto> $items
     * @param bool               $enrichAbs When false, or when $items is empty, the list is returned untouched.
     *
     * @return list<CrawlItemDto> NOTE(review): assumes enrichMany() returns the same item type; confirm.
     */
    protected function finalizeItems(array $items, bool $enrichAbs = true): array
    {
        if ($items === []) {
            return $items;
        }

        return $enrichAbs ? $this->absEnricher->enrichMany($items) : $items;
    }

    /**
     * Load the external ids of all crawl-sourced papers as a lookup set.
     *
     * @return array<array-key, true> External id => true, suitable for isset() checks.
     */
    protected function loadImportedExternalIds(): array
    {
        $ids = Paper::query()
            ->where('source', 'crawl')
            ->whereNotNull('external_id')
            ->pluck('external_id')
            ->all();

        return array_fill_keys($ids, true);
    }

    /**
     * Decide how many API pages may be scanned.
     *
     * $maxPages is clamped to 1..20. When imported items are being skipped the
     * budget is widened to ten times the number of 50-item pages needed to reach
     * $maxResults, capped at 200 pages.
     */
    protected function resolveMaxScanPages(int $maxPages, int $maxResults, bool $skipImported): int
    {
        $maxPages = min(20, max(1, $maxPages));
        if (! $skipImported) {
            return $maxPages;
        }

        $minForTarget = (int) ceil($maxResults / 50);

        return min(200, max($maxPages, $minForTarget * 10));
    }
}