From 4de9b4675f526f64cb78ca4332a6d45c92f8107b Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Wed, 24 Jun 2026 10:56:28 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BA=A4=E5=A4=A7=E6=99=BA=E8=83=BD=E7=A0=94?= =?UTF-8?q?=E7=A9=B6=E9=99=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Crawl/Adapters/ArxivApiAdapter.php | 5 +- app/Services/Crawl/ArxivAbsEnricher.php | 127 ++++++++++++++---- app/Services/Crawl/ArxivMetadataParser.php | 102 +++++++++++++- config/crawl.php | 2 +- tests/Unit/ArxivMetadataParserTest.php | 29 ++++ 5 files changed, 235 insertions(+), 30 deletions(-) create mode 100644 tests/Unit/ArxivMetadataParserTest.php diff --git a/app/Services/Crawl/Adapters/ArxivApiAdapter.php b/app/Services/Crawl/Adapters/ArxivApiAdapter.php index 87fe2f1..a6f2c42 100644 --- a/app/Services/Crawl/Adapters/ArxivApiAdapter.php +++ b/app/Services/Crawl/Adapters/ArxivApiAdapter.php @@ -481,13 +481,16 @@ class ArxivApiAdapter implements CrawlerAdapterInterface } $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed); + $publishedAt = ArxivMetadataParser::parsePublishedDate($body) + ?? ArxivMetadataParser::parsePublishedDateFromArxivId($arxivId); + $items[] = new CrawlItemDto( externalId: 'arxiv:'.$arxivId, title: $title, canonicalUrl: 'https://arxiv.org/abs/'.$arxivId, authors: $authors, summary: $summary, - publishedAt: ArxivMetadataParser::parsePublishedDate($body), + publishedAt: $publishedAt, schoolName: $lead['university_name'] ?? null, extra: [ 'platform' => 'arxiv', diff --git a/app/Services/Crawl/ArxivAbsEnricher.php b/app/Services/Crawl/ArxivAbsEnricher.php index 7118766..853f81c 100644 --- a/app/Services/Crawl/ArxivAbsEnricher.php +++ b/app/Services/Crawl/ArxivAbsEnricher.php @@ -24,21 +24,41 @@ class ArxivAbsEnricher return array_map(fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d), $items); } - $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 8)); + $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 32)); + $sorted = $items; + usort($sorted, fn (CrawlItemDto $a, CrawlItemDto $b) => $this->enrichPriority($a) <=> $this->enrichPriority($b)); + $enriched = 0; - $out = []; + $enrichedMap = []; - foreach ($items as $dto) { + foreach ($sorted as $dto) { if ($enriched >= $max || ! $this->shouldEnrich($dto)) { - $out[] = $this->ensureLeadAuthor($dto); + $enrichedMap[$dto->externalId] = $this->ensureLeadAuthor($dto); continue; } - $out[] = $this->enrichOne($dto); + $enrichedMap[$dto->externalId] = $this->enrichOne($dto); $enriched++; } - return $out; + return array_map( + fn (CrawlItemDto $dto) => $enrichedMap[$dto->externalId] ?? $this->ensureLeadAuthor($dto), + $items, + ); + } + + protected function enrichPriority(CrawlItemDto $dto): int + { + if (($dto->publishedAt ?? '') === '') { + return 0; + } + + $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed); + $hasSchool = ($dto->schoolName ?? null) !== null + || ($lead['university_name'] ?? null) !== null + || ($lead['affiliation'] ?? null) !== null; + + return $hasSchool ? 2 : 1; } public function enrichOne(CrawlItemDto $dto): CrawlItemDto @@ -56,6 +76,7 @@ class ArxivAbsEnricher $authorsParsed = $dto->authorsParsed; $enrichedFrom = null; $pageHtml = ''; + $absHtml = ''; $preferHtml = $this->shouldPreferHtmlEnrich($dto); @@ -63,31 +84,63 @@ class ArxivAbsEnricher $pageHtml = $this->fetchHtmlVersion((string) $arxivId); if ($pageHtml !== '') { $enrichedFrom = 'arxiv_html'; + $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt; } } - if ($pageHtml === '') { - $pageHtml = $this->fetchAbsHtml((string) $arxivId); - if ($pageHtml !== '') { - $enrichedFrom = 'abs_html'; - $preferHtml = false; + if (($publishedAt ?? '') === '' || $pageHtml === '') { + $absHtml = $this->fetchAbsHtml((string) $arxivId); + if ($absHtml !== '') { + if ($enrichedFrom === null) { + $enrichedFrom = 'abs_html'; + } + if (($publishedAt ?? '') === '') { + $publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt; + } } } - if ($pageHtml !== '') { + if ($pageHtml === '' && $absHtml !== '') { + $pageHtml = $absHtml; + $preferHtml = false; + } + + if ($pageHtml === '') { + return $this->ensureLeadAuthor(new CrawlItemDto( + externalId: $dto->externalId, + title: $dto->title, + canonicalUrl: $dto->canonicalUrl, + authors: $dto->authors, + summary: $dto->summary, + publishedAt: $publishedAt ?: ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId), + schoolName: $dto->schoolName, + section: $dto->section, + contentHtml: $dto->contentHtml, + extra: $dto->extra, + authorsParsed: $dto->authorsParsed, + )); + } + + if (($publishedAt ?? '') === '') { $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt; + } + if (($publishedAt ?? '') === '' && $absHtml !== '') { + $publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt; + } + if (($publishedAt ?? '') === '') { + $publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId); + } - $parsed = $preferHtml - ? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml) - : ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml); + $parsed = $preferHtml + ? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml) + : ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml); - if ($parsed === [] && $preferHtml) { - $parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml); - } + if ($parsed === [] && $preferHtml && $absHtml !== '') { + $parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($absHtml); + } - if ($parsed !== []) { - $authorsParsed = $parsed; - } + if ($parsed !== []) { + $authorsParsed = $parsed; } $lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed); @@ -101,10 +154,10 @@ class ArxivAbsEnricher $extra['enriched_from'] = $enrichedFrom; } if (! isset($extra['pdf_url'])) { - $extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml, (string) $arxivId); + $extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml ?: $absHtml, (string) $arxivId); } if (! isset($extra['html_url'])) { - $extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml, (string) $arxivId); + $extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml ?: $absHtml, (string) $arxivId); } return new CrawlItemDto( @@ -157,8 +210,32 @@ class ArxivAbsEnricher protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto { + $publishedAt = $dto->publishedAt; + if (($publishedAt ?? '') === '') { + $arxivId = $dto->extra['arxiv_id'] ?? null; + if ($arxivId) { + $publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId); + } + } + if (! empty($dto->extra['lead_author'])) { - return $dto; + if (($publishedAt ?? '') === ($dto->publishedAt ?? '')) { + return $dto; + } + + return new CrawlItemDto( + externalId: $dto->externalId, + title: $dto->title, + canonicalUrl: $dto->canonicalUrl, + authors: $dto->authors, + summary: $dto->summary, + publishedAt: $publishedAt, + schoolName: $dto->schoolName, + section: $dto->section, + contentHtml: $dto->contentHtml, + extra: $dto->extra, + authorsParsed: $dto->authorsParsed, + ); } $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed); @@ -173,7 +250,7 @@ class ArxivAbsEnricher canonicalUrl: $dto->canonicalUrl, authors: $dto->authors, summary: $dto->summary, - publishedAt: $dto->publishedAt, + publishedAt: $publishedAt, schoolName: $dto->schoolName ?? $lead['university_name'] ?? null, section: $dto->section, contentHtml: $dto->contentHtml, diff --git a/app/Services/Crawl/ArxivMetadataParser.php b/app/Services/Crawl/ArxivMetadataParser.php index 07a2e1d..2fdcdeb 100644 --- a/app/Services/Crawl/ArxivMetadataParser.php +++ b/app/Services/Crawl/ArxivMetadataParser.php @@ -23,9 +23,23 @@ class ArxivMetadataParser return null; } + $raw = $text; + + if ($date = self::parseCitationMetaDate($raw)) { + return $date; + } + + if ($date = self::parseSubmissionHistoryDate($raw)) { + return $date; + } + + if ($date = self::parseDatelineDate($raw)) { + return $date; + } + if (preg_match( '/Generated on\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+([A-Za-z]+)\s+(\d{1,2})\s+\d{1,2}:\d{2}:\d{2}\s+(\d{4})/i', - $text, + $raw, $gen )) { $date = self::toYmd($gen[3], $gen[1], $gen[2]); @@ -34,7 +48,7 @@ class ArxivMetadataParser } } - $text = html_entity_decode(strip_tags($text), ENT_QUOTES | ENT_HTML5, 'UTF-8'); + $text = html_entity_decode(strip_tags($raw), ENT_QUOTES | ENT_HTML5, 'UTF-8'); $text = preg_replace('/\s+/u', ' ', $text) ?? ''; if (preg_match('/(\d{4})-(\d{2})-(\d{2})/', $text, $iso)) { @@ -59,7 +73,6 @@ class ArxivMetadataParser if (! preg_match($pattern, $text, $m)) { continue; } - // Generated on Thu May 28 ... 2026 → 月、日、年顺序 if (str_starts_with($pattern, '/Generated on')) { $date = self::toYmd($m[3], $m[1], $m[2]); } else { @@ -73,6 +86,89 @@ class ArxivMetadataParser return null; } + /** + * 新格式 arXiv ID(YYMM.NNNNN)可推断提交年月,作为最后兜底(取当月 1 日)。 + */ + public static function parsePublishedDateFromArxivId(?string $arxivId): ?string + { + $arxivId = trim((string) $arxivId); + if ($arxivId === '') { + return null; + } + + $arxivId = preg_replace('/v\d+$/i', '', $arxivId) ?? $arxivId; + + if (preg_match('/^(\d{2})(\d{2})\.\d+(?:v\d+)?$/i', $arxivId, $m)) { + $year = 2000 + (int) $m[1]; + $month = (int) $m[2]; + if ($month >= 1 && $month <= 12 && $year >= 2007 && $year <= 2100) { + return sprintf('%04d-%02d-01', $year, $month); + } + } + + return null; + } + + protected static function parseCitationMetaDate(string $html): ?string + { + if (preg_match('#]+name=["\']citation_date["\'][^>]+content=["\']([^"\']+)["\']#i', $html, $m) + || preg_match('#]+content=["\']([^"\']+)["\'][^>]+name=["\']citation_date["\']#i', $html, $m)) { + return self::normalizeLooseDate($m[1]); + } + + if (preg_match('#]+name=["\']citation_online_date["\'][^>]+content=["\']([^"\']+)["\']#i', $html, $m) + || preg_match('#]+content=["\']([^"\']+)["\'][^>]+name=["\']citation_online_date["\']#i', $html, $m)) { + return self::normalizeLooseDate($m[1]); + } + + return null; + } + + protected static function parseDatelineDate(string $html): ?string + { + if (! preg_match('#
\s*\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]\s*
#i', $html, $m)) { + return null; + } + + return self::toYmd($m[3], $m[2], $m[1]); + } + + protected static function parseSubmissionHistoryDate(string $html): ?string + { + if (! preg_match('#
(.*?)
#is', $html, $block)) { + return null; + } + + $section = $block[1]; + if (preg_match('/\[v1\][^<]*(?:)?\s*(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i', $section, $m)) { + return self::toYmd($m[3], $m[2], $m[1]); + } + + if (preg_match('/\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]/i', $section, $m)) { + return self::toYmd($m[3], $m[2], $m[1]); + } + + return null; + } + + protected static function normalizeLooseDate(string $value): ?string + { + $value = trim(html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8')); + if ($value === '') { + return null; + } + + if (preg_match('#^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$#', $value, $m)) { + return sprintf('%04d-%02d-%02d', (int) $m[1], (int) $m[2], (int) $m[3]); + } + + if (preg_match('/(\d{4})-(\d{2})-(\d{2})/', $value, $iso)) { + return sprintf('%s-%s-%s', $iso[1], $iso[2], $iso[3]); + } + + return self::parsePublishedDate($value); + } + /** * @return list */ diff --git a/config/crawl.php b/config/crawl.php index 1ff2027..712d81a 100644 --- a/config/crawl.php +++ b/config/crawl.php @@ -18,7 +18,7 @@ return [ */ 'abs_enrich_mode' => env('ARXIV_ABS_ENRICH_MODE', 'auto'), /** 单次任务最多补全篇数(每篇至多 1 次 arXiv 页面请求) */ - 'abs_enrich_max' => (int) env('ARXIV_ABS_ENRICH_MAX', 8), + 'abs_enrich_max' => (int) env('ARXIV_ABS_ENRICH_MAX', 32), /** 补全时优先 HTML 版(机构更全),失败再试 abs */ 'enrich_prefer_html' => (bool) env('ARXIV_ENRICH_PREFER_HTML', true), 'try_html_version' => (bool) env('ARXIV_TRY_HTML_VERSION', true), diff --git a/tests/Unit/ArxivMetadataParserTest.php b/tests/Unit/ArxivMetadataParserTest.php new file mode 100644 index 0000000..a9548c3 --- /dev/null +++ b/tests/Unit/ArxivMetadataParserTest.php @@ -0,0 +1,29 @@ + + +
+

Submission history

+ [v1] Thu, 9 Apr 2026 06:52:51 UTC (1,821 KB)
+
+HTML; + + $this->assertSame('2026-04-09', ArxivMetadataParser::parsePublishedDate($html)); + } + + public function test_parses_published_date_from_arxiv_id(): void + { + $this->assertSame('2026-06-01', ArxivMetadataParser::parsePublishedDateFromArxivId('2606.23690')); + $this->assertSame('2026-06-01', ArxivMetadataParser::parsePublishedDateFromArxivId('2606.23690v1')); + } +}