$items * @return list */
    public function enrichMany(array $items): array
    {
        // Enrichment switched off: only make sure each item carries lead-author metadata.
        if (! config('crawl.arxiv.abs_enrich_enabled', true)) {
            return array_map(fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d), $items);
        }

        // Upper bound on how many items may be enriched during this call.
        $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 32));

        // Sort a copy so the enrichment budget is spent on the items with the
        // lowest priority value first; the caller's ordering is restored below.
        $sorted = $items;
        usort($sorted, fn (CrawlItemDto $a, CrawlItemDto $b) => $this->enrichPriority($a) <=> $this->enrichPriority($b));

        $enriched = 0;
        $enrichedMap = [];
        foreach ($sorted as $dto) {
            // Budget exhausted or nothing to gain: fall back to the cheap local pass.
            if ($enriched >= $max || ! $this->shouldEnrich($dto)) {
                $enrichedMap[$dto->externalId] = $this->ensureLeadAuthor($dto);

                continue;
            }

            $enrichedMap[$dto->externalId] = $this->enrichOne($dto);
            $enriched++;
        }

        // Results are looked up by external id and returned in the original order.
        return array_map(
            fn (CrawlItemDto $dto) => $enrichedMap[$dto->externalId] ?? $this->ensureLeadAuthor($dto),
            $items,
        );
    }

    /**
     * Rank an item for enrichment; lower values are processed first by enrichMany().
     *
     * @return int 0 when the item has no published date, 1 when it has a date but no
     *             school/affiliation information, 2 when it has both.
     */
    protected function enrichPriority(CrawlItemDto $dto): int
    {
        if (($dto->publishedAt ?? '') === '') {
            return 0;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        $hasSchool = ($dto->schoolName ?? null) !== null
            || ($lead['university_name'] ?? null) !== null
            || ($lead['affiliation'] ?? null) !== null;

        return $hasSchool ? 2 : 1;
    }

    /**
     * Enrich a single item from its arXiv HTML version and/or abstract page.
     *
     * Items without an `arxiv_id` in `extra`, without a canonical URL, or for which
     * shouldEnrich() is false are only passed through ensureLeadAuthor(). When no
     * page can be fetched, the item is rebuilt with a published date derived from
     * the arXiv id (if it had none) and passed through ensureLeadAuthor().
     *
     * @return CrawlItemDto A new DTO with updated date, authors, school and `extra`
     *                      entries, or the ensureLeadAuthor() result on the fallback paths.
     */
    public function enrichOne(CrawlItemDto $dto): CrawlItemDto
    {
        $arxivId = $dto->extra['arxiv_id'] ?? null;
        if (! $arxivId || ! $dto->canonicalUrl) {
            return $this->ensureLeadAuthor($dto);
        }

        if (! $this->shouldEnrich($dto)) {
            return $this->ensureLeadAuthor($dto);
        }

        $publishedAt = $dto->publishedAt;
        $authorsParsed = $dto->authorsParsed;
        $enrichedFrom = null;
        $pageHtml = '';
        $absHtml = '';
        $preferHtml = $this->shouldPreferHtmlEnrich($dto);

        // First choice: the rendered HTML version of the paper.
        if ($preferHtml && (bool) config('crawl.arxiv.try_html_version', true)) {
            $pageHtml = $this->fetchHtmlVersion((string) $arxivId);
            if ($pageHtml !== '') {
                $enrichedFrom = 'arxiv_html';
                $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;
            }
        }

        // The abstract page is fetched when the date is still missing or no HTML version was obtained.
        if (($publishedAt ?? '') === '' || $pageHtml === '') {
            $absHtml = $this->fetchAbsHtml((string) $arxivId);
            if ($absHtml !== '') {
                if ($enrichedFrom === null) {
                    $enrichedFrom = 'abs_html';
                }
                if (($publishedAt ?? '') === '') {
                    $publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt;
                }
            }
        }

        // Only the abstract page is available: treat it as the working page and
        // switch author parsing to the abstract-page parser.
        if ($pageHtml === '' && $absHtml !== '') {
            $pageHtml = $absHtml;
            $preferHtml = false;
        }

        // Nothing could be fetched: keep the item's data, deriving a date from the id if needed.
        if ($pageHtml === '') {
            return $this->ensureLeadAuthor(new CrawlItemDto(
                externalId: $dto->externalId,
                title: $dto->title,
                canonicalUrl: $dto->canonicalUrl,
                authors: $dto->authors,
                summary: $dto->summary,
                publishedAt: $publishedAt ?: ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId),
                schoolName: $dto->schoolName,
                section: $dto->section,
                contentHtml: $dto->contentHtml,
                extra: $dto->extra,
                authorsParsed: $dto->authorsParsed,
            ));
        }

        // Date fallbacks, in order: working page, abstract page, arXiv id.
        if (($publishedAt ?? '') === '') {
            $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;
        }
        if (($publishedAt ?? '') === '' && $absHtml !== '') {
            $publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt;
        }
        if (($publishedAt ?? '') === '') {
            $publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId);
        }

        $parsed = $preferHtml
            ? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
            : ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);

        // The HTML version yielded no authors: retry with the abstract page if it was fetched.
        if ($parsed === [] && $preferHtml && $absHtml !== '') {
            $parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($absHtml);
        }

        // Keep the item's existing parsed authors when the pages produced none.
        if ($parsed !== []) {
            $authorsParsed = $parsed;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed);
        $schoolName = $lead['university_name'] ?? $dto->schoolName;

        $extra = array_merge($dto->extra, [
            'authors_parsed' => $authorsParsed,
            'lead_author' => $lead,
        ]);

        if ($enrichedFrom !== null) {
            $extra['enriched_from'] = $enrichedFrom;
        }

        // URLs already present in `extra` are kept as they are.
        if (! isset($extra['pdf_url'])) {
            $extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml ?: $absHtml, (string) $arxivId);
        }
        if (! isset($extra['html_url'])) {
            $extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml ?: $absHtml, (string) $arxivId);
        }

        return new CrawlItemDto(
            externalId: $dto->externalId,
            title: $dto->title,
            canonicalUrl: $dto->canonicalUrl,
            authors: $dto->authors,
            summary: $dto->summary,
            publishedAt: $publishedAt,
            schoolName: $schoolName,
            section: $dto->section,
            contentHtml: $dto->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Decide whether an item should be enriched from arXiv pages.
     *
     * Config `crawl.arxiv.abs_enrich_mode` of 'never' / 'always' forces the answer;
     * any other value enriches unless the item already has both a published date
     * and school/affiliation information.
     */
    protected function shouldEnrich(CrawlItemDto $dto): bool
    {
        $mode = (string) config('crawl.arxiv.abs_enrich_mode', 'auto');
        if ($mode === 'never') {
            return false;
        }
        if ($mode === 'always') {
            return true;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        $hasDate = ($dto->publishedAt ?? '') !== '';
        $hasSchool = ($dto->schoolName ?? null) !== null
            || ($lead['university_name'] ?? null) !== null
            || ($lead['affiliation'] ?? null) !== null;

        if ($hasDate && $hasSchool) {
            return false;
        }

        return true;
    }

    /**
     * Whether the HTML version should be tried before the abstract page.
     *
     * True when `crawl.arxiv.enrich_prefer_html` is enabled, or when the item's
     * `extra` has source 'html_search' or a non-empty `html_url`.
     */
    protected function shouldPreferHtmlEnrich(CrawlItemDto $dto): bool
    {
        if ((bool) config('crawl.arxiv.enrich_prefer_html', true)) {
            return true;
        }

        return ($dto->extra['source'] ?? '') === 'html_search' || ! empty($dto->extra['html_url']);
    }

    /**
     * Fill in lead-author metadata and a fallback published date without any network access.
     *
     * Returns the same instance when `extra['lead_author']` is already set and the
     * published date did not change; otherwise returns a new DTO.
     */
    protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto
    {
        $publishedAt = $dto->publishedAt;
        if (($publishedAt ?? '') === '') {
            $arxivId = $dto->extra['arxiv_id'] ?? null;
            if ($arxivId) {
                $publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId);
            }
        }

        // Lead author already recorded: at most the published date needs updating.
        if (! empty($dto->extra['lead_author'])) {
            if (($publishedAt ?? '') === ($dto->publishedAt ?? '')) {
                return $dto;
            }

            return new CrawlItemDto(
                externalId: $dto->externalId,
                title: $dto->title,
                canonicalUrl: $dto->canonicalUrl,
                authors: $dto->authors,
                summary: $dto->summary,
                publishedAt: $publishedAt,
                schoolName: $dto->schoolName,
                section: $dto->section,
                contentHtml: $dto->contentHtml,
                extra: $dto->extra,
                authorsParsed: $dto->authorsParsed,
            );
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        $extra = array_merge($dto->extra, [
            'lead_author' => $lead,
            // Prefer the DTO's parsed authors; otherwise keep whatever `extra` already held.
            'authors_parsed' => $dto->authorsParsed !== []
                ? $dto->authorsParsed
                : ($dto->extra['authors_parsed'] ?? []),
        ]);

        return new CrawlItemDto(
            externalId: $dto->externalId,
            title: $dto->title,
            canonicalUrl: $dto->canonicalUrl,
            authors: $dto->authors,
            summary: $dto->summary,
            publishedAt: $publishedAt,
            schoolName: $dto->schoolName ?? $lead['university_name'] ?? null,
            section: $dto->section,
            contentHtml: $dto->contentHtml,
            extra: $extra,
            authorsParsed: $dto->authorsParsed,
        );
    }

    /**
     * Fetch (with caching) the arXiv abstract page for an id.
     *
     * @return string The page HTML, or '' when no candidate URL returned a page
     *                containing the 'abs-outer' marker.
     */
    protected function fetchAbsHtml(string $arxivId): string
    {
        return $this->fetchCachedPage('abs', $arxivId, function () use ($arxivId) {
            foreach ($this->versionIdCandidates($arxivId) as $id) {
                $html = $this->fetchPage('https://arxiv.org/abs/'.$id);
                if ($html !== '' && str_contains($html, 'abs-outer')) {
                    return $html;
                }
            }

            return '';
        });
    }

    /**
     * Fetch (with caching) the arXiv HTML version of a paper.
     *
     * @return string The page HTML, or '' when no candidate URL returned a page
     *                containing 'ltx_document' or 'ltx_authors'.
     */
    protected function fetchHtmlVersion(string $arxivId): string
    {
        return $this->fetchCachedPage('html', $arxivId, function () use ($arxivId) {
            foreach ($this->versionIdCandidates($arxivId) as $id) {
                $html = $this->fetchPage('https://arxiv.org/html/'.$id);
                if ($html !== '' && (str_contains($html, 'ltx_document') || str_contains($html, 'ltx_authors'))) {
                    return $html;
                }
            }

            return '';
        });
    }

    /**
     * Ids to try when fetching a page: the id itself when it already ends in a
     * version suffix (e.g. "v2"), otherwise the id with "v1" appended.
     *
     * @return list<string>
     */
    protected function versionIdCandidates(string $arxivId): array
    {
        if (preg_match('/v\d+$/i', $arxivId)) {
            return [$arxivId];
        }

        return [$arxivId.'v1'];
    }

    /**
     * Return a cached page for ($kind, $arxivId), invoking $fetcher on a cache miss.
     *
     * Successful (non-empty) pages are cached for `crawl.arxiv.page_cache_seconds`
     * (minimum 60 seconds). Empty results are cached only for the shorter
     * `crawl.arxiv.page_cache_miss_seconds` (default 900, never longer than the page
     * TTL): fetchPage() returns '' for rate limiting, timeouts and other failures,
     * and such a result should not suppress retries for the full page TTL.
     *
     * @param  callable  $fetcher  Called without arguments; its result is cast to string.
     */
    protected function fetchCachedPage(string $kind, string $arxivId, callable $fetcher): string
    {
        $ttl = max(60, (int) config('crawl.arxiv.page_cache_seconds', 86400));
        $key = 'arxiv_'.$kind.':'.preg_replace('/[^a-zA-Z0-9._-]/', '_', $arxivId);

        $cached = Cache::get($key);
        if ($cached !== null) {
            return (string) $cached;
        }

        $html = (string) $fetcher();

        // Keep a brief negative entry so repeated lookups do not hammer the
        // remote site, while still letting failed fetches be retried soon.
        $missTtl = min($ttl, max(1, (int) config('crawl.arxiv.page_cache_miss_seconds', 900)));
        Cache::put($key, $html, $html !== '' ? $ttl : $missTtl);

        return $html;
    }

    /**
     * GET a URL through the request gate and return the response body.
     *
     * Best-effort: returns '' on HTTP 429, on any non-successful status, and on
     * any thrown error, so callers only need to check for an empty string.
     */
    protected function fetchPage(string $url): string
    {
        try {
            $timeout = (int) config('crawl.arxiv.enrich_http_timeout_seconds', 25);
            $connectTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 15);

            $response = $this->gate->run(fn () => Http::timeout($timeout)
                ->connectTimeout($connectTimeout)
                ->withHeaders([
                    'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org)',
                    'Accept' => 'text/html',
                ])
                ->get($url));

            if ($response->status() === 429) {
                return '';
            }

            return $response->successful() ? (string) $response->body() : '';
        } catch (\Throwable) {
            // Deliberately swallowed: enrichment must never break the crawl.
            return '';
        }
    }
}