|
|
|
|
@ -24,21 +24,41 @@ class ArxivAbsEnricher
|
|
|
|
|
return array_map(fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d), $items);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 8));
|
|
|
|
|
$max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 32));
|
|
|
|
|
$sorted = $items;
|
|
|
|
|
usort($sorted, fn (CrawlItemDto $a, CrawlItemDto $b) => $this->enrichPriority($a) <=> $this->enrichPriority($b));
|
|
|
|
|
|
|
|
|
|
$enriched = 0;
|
|
|
|
|
$out = [];
|
|
|
|
|
$enrichedMap = [];
|
|
|
|
|
|
|
|
|
|
foreach ($items as $dto) {
|
|
|
|
|
foreach ($sorted as $dto) {
|
|
|
|
|
if ($enriched >= $max || ! $this->shouldEnrich($dto)) {
|
|
|
|
|
$out[] = $this->ensureLeadAuthor($dto);
|
|
|
|
|
$enrichedMap[$dto->externalId] = $this->ensureLeadAuthor($dto);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$out[] = $this->enrichOne($dto);
|
|
|
|
|
$enrichedMap[$dto->externalId] = $this->enrichOne($dto);
|
|
|
|
|
$enriched++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $out;
|
|
|
|
|
return array_map(
|
|
|
|
|
fn (CrawlItemDto $dto) => $enrichedMap[$dto->externalId] ?? $this->ensureLeadAuthor($dto),
|
|
|
|
|
$items,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Rank an item for enrichment ordering.
 *
 * Returns 0 when the item has no published date, 1 when it has a date but
 * no school information, and 2 when it has a date and school information.
 * School information counts as present when the DTO's own school name, or
 * the lead author's `university_name` or `affiliation` entry, is non-null.
 * With an ascending comparator, lower ranks sort first.
 */
protected function enrichPriority(CrawlItemDto $dto): int
{
    $published = $dto->publishedAt ?? '';
    if ($published === '') {
        return 0;
    }

    // Resolved only after the date guard, so dateless items never trigger it.
    $leadAuthor = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

    if (($dto->schoolName ?? null) !== null) {
        return 2;
    }

    if (($leadAuthor['university_name'] ?? null) !== null) {
        return 2;
    }

    return ($leadAuthor['affiliation'] ?? null) !== null ? 2 : 1;
}
|
|
|
|
|
|
|
|
|
|
public function enrichOne(CrawlItemDto $dto): CrawlItemDto
|
|
|
|
|
@ -56,6 +76,7 @@ class ArxivAbsEnricher
|
|
|
|
|
$authorsParsed = $dto->authorsParsed;
|
|
|
|
|
$enrichedFrom = null;
|
|
|
|
|
$pageHtml = '';
|
|
|
|
|
$absHtml = '';
|
|
|
|
|
|
|
|
|
|
$preferHtml = $this->shouldPreferHtmlEnrich($dto);
|
|
|
|
|
|
|
|
|
|
@ -63,31 +84,63 @@ class ArxivAbsEnricher
|
|
|
|
|
$pageHtml = $this->fetchHtmlVersion((string) $arxivId);
|
|
|
|
|
if ($pageHtml !== '') {
|
|
|
|
|
$enrichedFrom = 'arxiv_html';
|
|
|
|
|
$publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($pageHtml === '') {
|
|
|
|
|
$pageHtml = $this->fetchAbsHtml((string) $arxivId);
|
|
|
|
|
if ($pageHtml !== '') {
|
|
|
|
|
$enrichedFrom = 'abs_html';
|
|
|
|
|
$preferHtml = false;
|
|
|
|
|
if (($publishedAt ?? '') === '' || $pageHtml === '') {
|
|
|
|
|
$absHtml = $this->fetchAbsHtml((string) $arxivId);
|
|
|
|
|
if ($absHtml !== '') {
|
|
|
|
|
if ($enrichedFrom === null) {
|
|
|
|
|
$enrichedFrom = 'abs_html';
|
|
|
|
|
}
|
|
|
|
|
if (($publishedAt ?? '') === '') {
|
|
|
|
|
$publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($pageHtml !== '') {
|
|
|
|
|
if ($pageHtml === '' && $absHtml !== '') {
|
|
|
|
|
$pageHtml = $absHtml;
|
|
|
|
|
$preferHtml = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($pageHtml === '') {
|
|
|
|
|
return $this->ensureLeadAuthor(new CrawlItemDto(
|
|
|
|
|
externalId: $dto->externalId,
|
|
|
|
|
title: $dto->title,
|
|
|
|
|
canonicalUrl: $dto->canonicalUrl,
|
|
|
|
|
authors: $dto->authors,
|
|
|
|
|
summary: $dto->summary,
|
|
|
|
|
publishedAt: $publishedAt ?: ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId),
|
|
|
|
|
schoolName: $dto->schoolName,
|
|
|
|
|
section: $dto->section,
|
|
|
|
|
contentHtml: $dto->contentHtml,
|
|
|
|
|
extra: $dto->extra,
|
|
|
|
|
authorsParsed: $dto->authorsParsed,
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (($publishedAt ?? '') === '') {
|
|
|
|
|
$publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;
|
|
|
|
|
}
|
|
|
|
|
if (($publishedAt ?? '') === '' && $absHtml !== '') {
|
|
|
|
|
$publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt;
|
|
|
|
|
}
|
|
|
|
|
if (($publishedAt ?? '') === '') {
|
|
|
|
|
$publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$parsed = $preferHtml
|
|
|
|
|
? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
|
|
|
|
|
: ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
|
|
|
|
|
$parsed = $preferHtml
|
|
|
|
|
? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
|
|
|
|
|
: ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
|
|
|
|
|
|
|
|
|
|
if ($parsed === [] && $preferHtml) {
|
|
|
|
|
$parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
|
|
|
|
|
}
|
|
|
|
|
if ($parsed === [] && $preferHtml && $absHtml !== '') {
|
|
|
|
|
$parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($absHtml);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($parsed !== []) {
|
|
|
|
|
$authorsParsed = $parsed;
|
|
|
|
|
}
|
|
|
|
|
if ($parsed !== []) {
|
|
|
|
|
$authorsParsed = $parsed;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed);
|
|
|
|
|
@ -101,10 +154,10 @@ class ArxivAbsEnricher
|
|
|
|
|
$extra['enriched_from'] = $enrichedFrom;
|
|
|
|
|
}
|
|
|
|
|
if (! isset($extra['pdf_url'])) {
|
|
|
|
|
$extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml, (string) $arxivId);
|
|
|
|
|
$extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml ?: $absHtml, (string) $arxivId);
|
|
|
|
|
}
|
|
|
|
|
if (! isset($extra['html_url'])) {
|
|
|
|
|
$extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml, (string) $arxivId);
|
|
|
|
|
$extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml ?: $absHtml, (string) $arxivId);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return new CrawlItemDto(
|
|
|
|
|
@ -157,8 +210,32 @@ class ArxivAbsEnricher
|
|
|
|
|
|
|
|
|
|
protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto
|
|
|
|
|
{
|
|
|
|
|
$publishedAt = $dto->publishedAt;
|
|
|
|
|
if (($publishedAt ?? '') === '') {
|
|
|
|
|
$arxivId = $dto->extra['arxiv_id'] ?? null;
|
|
|
|
|
if ($arxivId) {
|
|
|
|
|
$publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (! empty($dto->extra['lead_author'])) {
|
|
|
|
|
return $dto;
|
|
|
|
|
if (($publishedAt ?? '') === ($dto->publishedAt ?? '')) {
|
|
|
|
|
return $dto;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return new CrawlItemDto(
|
|
|
|
|
externalId: $dto->externalId,
|
|
|
|
|
title: $dto->title,
|
|
|
|
|
canonicalUrl: $dto->canonicalUrl,
|
|
|
|
|
authors: $dto->authors,
|
|
|
|
|
summary: $dto->summary,
|
|
|
|
|
publishedAt: $publishedAt,
|
|
|
|
|
schoolName: $dto->schoolName,
|
|
|
|
|
section: $dto->section,
|
|
|
|
|
contentHtml: $dto->contentHtml,
|
|
|
|
|
extra: $dto->extra,
|
|
|
|
|
authorsParsed: $dto->authorsParsed,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
|
|
|
|
|
@ -173,7 +250,7 @@ class ArxivAbsEnricher
|
|
|
|
|
canonicalUrl: $dto->canonicalUrl,
|
|
|
|
|
authors: $dto->authors,
|
|
|
|
|
summary: $dto->summary,
|
|
|
|
|
publishedAt: $dto->publishedAt,
|
|
|
|
|
publishedAt: $publishedAt,
|
|
|
|
|
schoolName: $dto->schoolName ?? $lead['university_name'] ?? null,
|
|
|
|
|
section: $dto->section,
|
|
|
|
|
contentHtml: $dto->contentHtml,
|
|
|
|
|
|