From 4de9b4675f526f64cb78ca4332a6d45c92f8107b Mon Sep 17 00:00:00 2001
From: lion <120344285@qq.com>
Date: Wed, 24 Jun 2026 10:56:28 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BA=A4=E5=A4=A7=E6=99=BA=E8=83=BD=E7=A0=94?=
 =?UTF-8?q?=E7=A9=B6=E9=99=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../Crawl/Adapters/ArxivApiAdapter.php        |   5 +-
 app/Services/Crawl/ArxivAbsEnricher.php       | 127 ++++++++++++++----
 app/Services/Crawl/ArxivMetadataParser.php    | 102 +++++++++++++-
 config/crawl.php                              |   2 +-
 tests/Unit/ArxivMetadataParserTest.php        |  29 ++++
 5 files changed, 235 insertions(+), 30 deletions(-)
 create mode 100644 tests/Unit/ArxivMetadataParserTest.php

diff --git a/app/Services/Crawl/Adapters/ArxivApiAdapter.php b/app/Services/Crawl/Adapters/ArxivApiAdapter.php
index 87fe2f1..a6f2c42 100644
--- a/app/Services/Crawl/Adapters/ArxivApiAdapter.php
+++ b/app/Services/Crawl/Adapters/ArxivApiAdapter.php
@@ -481,13 +481,16 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
             }
             $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);
 
+            $publishedAt = ArxivMetadataParser::parsePublishedDate($body)
+                ?? ArxivMetadataParser::parsePublishedDateFromArxivId($arxivId);
+
             $items[] = new CrawlItemDto(
                 externalId: 'arxiv:'.$arxivId,
                 title: $title,
                 canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
                 authors: $authors,
                 summary: $summary,
-                publishedAt: ArxivMetadataParser::parsePublishedDate($body),
+                publishedAt: $publishedAt,
                 schoolName: $lead['university_name'] ?? null,
                 extra: [
                     'platform' => 'arxiv',
diff --git a/app/Services/Crawl/ArxivAbsEnricher.php b/app/Services/Crawl/ArxivAbsEnricher.php
index 7118766..853f81c 100644
--- a/app/Services/Crawl/ArxivAbsEnricher.php
+++ b/app/Services/Crawl/ArxivAbsEnricher.php
@@ -24,21 +24,41 @@ class ArxivAbsEnricher
             return array_map(fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d), $items);
         }
 
-        $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 8));
+        $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 32));
+        $sorted = $items;
+        usort($sorted, fn (CrawlItemDto $a, CrawlItemDto $b) => $this->enrichPriority($a) <=> $this->enrichPriority($b));
+
         $enriched = 0;
-        $out = [];
+        $enrichedMap = [];
 
-        foreach ($items as $dto) {
+        foreach ($sorted as $dto) {
             if ($enriched >= $max || ! $this->shouldEnrich($dto)) {
-                $out[] = $this->ensureLeadAuthor($dto);
+                $enrichedMap[$dto->externalId] = $this->ensureLeadAuthor($dto);
                 continue;
             }
 
-            $out[] = $this->enrichOne($dto);
+            $enrichedMap[$dto->externalId] = $this->enrichOne($dto);
             $enriched++;
         }
 
-        return $out;
+        return array_map(
+            fn (CrawlItemDto $dto) => $enrichedMap[$dto->externalId] ?? $this->ensureLeadAuthor($dto),
+            $items,
+        );
+    }
+
+    /** Ascending sort key for enrichment order: 0 = no publish date, 1 = dated but no school/affiliation, 2 = both. */
+    protected function enrichPriority(CrawlItemDto $dto): int
+    {
+        $hasDate = ($dto->publishedAt ?? '') !== '';
+        if (! $hasDate) {
+            return 0;
+        }
+
+        $leadAuthor = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
+        $institution = $dto->schoolName ?? $leadAuthor['university_name'] ?? $leadAuthor['affiliation'] ?? null;
+
+        return $institution === null ? 1 : 2;
+    }
 
     public function enrichOne(CrawlItemDto $dto): CrawlItemDto
@@ -56,6 +76,7 @@ class ArxivAbsEnricher
         $authorsParsed = $dto->authorsParsed;
         $enrichedFrom = null;
         $pageHtml = '';
+        $absHtml = '';
 
         $preferHtml = $this->shouldPreferHtmlEnrich($dto);
 
@@ -63,31 +84,63 @@ class ArxivAbsEnricher
             $pageHtml = $this->fetchHtmlVersion((string) $arxivId);
             if ($pageHtml !== '') {
                 $enrichedFrom = 'arxiv_html';
+                $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;
             }
         }
 
-        if ($pageHtml === '') {
-            $pageHtml = $this->fetchAbsHtml((string) $arxivId);
-            if ($pageHtml !== '') {
-                $enrichedFrom = 'abs_html';
-                $preferHtml = false;
+        if (($publishedAt ?? '') === '' || $pageHtml === '') {
+            $absHtml = $this->fetchAbsHtml((string) $arxivId);
+            if ($absHtml !== '') {
+                if ($enrichedFrom === null) {
+                    $enrichedFrom = 'abs_html';
+                }
+                if (($publishedAt ?? '') === '') {
+                    $publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt;
+                }
             }
         }
 
-        if ($pageHtml !== '') {
+        if ($pageHtml === '' && $absHtml !== '') {
+            $pageHtml = $absHtml;
+            $preferHtml = false;
+        }
+
+        if ($pageHtml === '') {
+            return $this->ensureLeadAuthor(new CrawlItemDto(
+                externalId: $dto->externalId,
+                title: $dto->title,
+                canonicalUrl: $dto->canonicalUrl,
+                authors: $dto->authors,
+                summary: $dto->summary,
+                publishedAt: $publishedAt ?: ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId),
+                schoolName: $dto->schoolName,
+                section: $dto->section,
+                contentHtml: $dto->contentHtml,
+                extra: $dto->extra,
+                authorsParsed: $dto->authorsParsed,
+            ));
+        }
+
+        if (($publishedAt ?? '') === '') {
             $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;
+        }
+        if (($publishedAt ?? '') === '' && $absHtml !== '') {
+            $publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt;
+        }
+        if (($publishedAt ?? '') === '') {
+            $publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId);
+        }
 
-            $parsed = $preferHtml
-                ? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
-                : ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
+        $parsed = $preferHtml
+            ? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
+            : ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
 
-            if ($parsed === [] && $preferHtml) {
-                $parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
-            }
+        if ($parsed === [] && $preferHtml && $absHtml !== '') {
+            $parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($absHtml);
+        }
 
-            if ($parsed !== []) {
-                $authorsParsed = $parsed;
-            }
+        if ($parsed !== []) {
+            $authorsParsed = $parsed;
         }
 
         $lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed);
@@ -101,10 +154,10 @@ class ArxivAbsEnricher
             $extra['enriched_from'] = $enrichedFrom;
         }
         if (! isset($extra['pdf_url'])) {
-            $extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml, (string) $arxivId);
+            $extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml ?: $absHtml, (string) $arxivId);
         }
         if (! isset($extra['html_url'])) {
-            $extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml, (string) $arxivId);
+            $extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml ?: $absHtml, (string) $arxivId);
         }
 
         return new CrawlItemDto(
@@ -157,8 +210,32 @@ class ArxivAbsEnricher
 
     protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto
     {
+        $publishedAt = $dto->publishedAt;
+        if (($publishedAt ?? '') === '') {
+            $arxivId = $dto->extra['arxiv_id'] ?? null;
+            if ($arxivId) {
+                $publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId);
+            }
+        }
+
         if (! empty($dto->extra['lead_author'])) {
-            return $dto;
+            if (($publishedAt ?? '') === ($dto->publishedAt ?? '')) {
+                return $dto;
+            }
+
+            return new CrawlItemDto(
+                externalId: $dto->externalId,
+                title: $dto->title,
+                canonicalUrl: $dto->canonicalUrl,
+                authors: $dto->authors,
+                summary: $dto->summary,
+                publishedAt: $publishedAt,
+                schoolName: $dto->schoolName,
+                section: $dto->section,
+                contentHtml: $dto->contentHtml,
+                extra: $dto->extra,
+                authorsParsed: $dto->authorsParsed,
+            );
         }
 
         $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
@@ -173,7 +250,7 @@ class ArxivAbsEnricher
             canonicalUrl: $dto->canonicalUrl,
             authors: $dto->authors,
             summary: $dto->summary,
-            publishedAt: $dto->publishedAt,
+            publishedAt: $publishedAt,
             schoolName: $dto->schoolName ?? $lead['university_name'] ?? null,
             section: $dto->section,
             contentHtml: $dto->contentHtml,
diff --git a/app/Services/Crawl/ArxivMetadataParser.php b/app/Services/Crawl/ArxivMetadataParser.php
index 07a2e1d..2fdcdeb 100644
--- a/app/Services/Crawl/ArxivMetadataParser.php
+++ b/app/Services/Crawl/ArxivMetadataParser.php
@@ -23,9 +23,23 @@ class ArxivMetadataParser
             return null;
         }
 
+        $raw = $text;
+
+        if ($date = self::parseCitationMetaDate($raw)) {
+            return $date;
+        }
+
+        if ($date = self::parseSubmissionHistoryDate($raw)) {
+            return $date;
+        }
+
+        if ($date = self::parseDatelineDate($raw)) {
+            return $date;
+        }
+
         if (preg_match(
             '/Generated on\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+([A-Za-z]+)\s+(\d{1,2})\s+\d{1,2}:\d{2}:\d{2}\s+(\d{4})/i',
-            $text,
+            $raw,
             $gen
         )) {
             $date = self::toYmd($gen[3], $gen[1], $gen[2]);
@@ -34,7 +48,7 @@ class ArxivMetadataParser
             }
         }
 
-        $text = html_entity_decode(strip_tags($text), ENT_QUOTES | ENT_HTML5, 'UTF-8');
+        $text = html_entity_decode(strip_tags($raw), ENT_QUOTES | ENT_HTML5, 'UTF-8');
         $text = preg_replace('/\s+/u', ' ', $text) ?? '';
 
         if (preg_match('/(\d{4})-(\d{2})-(\d{2})/', $text, $iso)) {
@@ -59,7 +73,6 @@ class ArxivMetadataParser
             if (! preg_match($pattern, $text, $m)) {
                 continue;
             }
-            // Generated on Thu May 28 ... 2026 → 月、日、年顺序
             if (str_starts_with($pattern, '/Generated on')) {
                 $date = self::toYmd($m[3], $m[1], $m[2]);
             } else {
@@ -73,6 +86,89 @@ class ArxivMetadataParser
         return null;
     }
 
+    /**
+     * Last-resort fallback: infer the submission month from a new-style arXiv id (YYMM.NNNNN), using day 01.
+     * Accepts an optional "arXiv:" prefix and a trailing version suffix; returns null for any other shape.
+     */
+    public static function parsePublishedDateFromArxivId(?string $arxivId): ?string
+    {
+        $arxivId = trim((string) $arxivId);
+        if ($arxivId === '') {
+            return null;
+        }
+        $arxivId = preg_replace('/v\d+$/i', '', $arxivId) ?? $arxivId;
+
+        if (preg_match('/^(?:arxiv:)?(\d{2})(\d{2})\.\d+(?:v\d+)?$/i', $arxivId, $m)) {
+            $year = 2000 + (int) $m[1];
+            $month = (int) $m[2];
+            // A two-digit year caps at 2099, so only the 2007 lower bound (first year of this id scheme) is checked.
+            if ($month >= 1 && $month <= 12 && $year >= 2007) {
+                return sprintf('%04d-%02d-01', $year, $month);
+            }
+        }
+
+        return null;
+    }
+
+    /** Date from the citation_date meta tag, else citation_online_date; an unparseable value falls through to the next tag. */
+    protected static function parseCitationMetaDate(string $html): ?string
+    {
+        foreach (['citation_date', 'citation_online_date'] as $name) {
+            $found = preg_match('#<meta[^>]+name=["\']'.$name.'["\'][^>]+content=["\']([^"\']+)["\']#i', $html, $m)
+                || preg_match('#<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']'.$name.'["\']#i', $html, $m);
+            $date = $found ? self::normalizeLooseDate($m[1]) : null;
+            if ($date !== null) {
+                return $date;
+            }
+        }
+
+        return null;
+    }
+
+    /** First "[Submitted on D Mon YYYY" date in the dateline div; a tail such as "(v1), last revised …]" is tolerated. */
+    protected static function parseDatelineDate(string $html): ?string
+    {
+        if (! preg_match('#<div class=["\']dateline["\']>\s*\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\b#i', $html, $m)) {
+            return null;
+        }
+        return self::toYmd($m[3], $m[2], $m[1]);
+    }
+
+    /** Reads the v1 date from the submission-history div, falling back to a "[Submitted on …]" stamp inside it. */
+    protected static function parseSubmissionHistoryDate(string $html): ?string
+    {
+        if (! preg_match('#<div class=["\']submission-history["\']>(.*?)</div>#is', $html, $block)) {
+            return null;
+        }
+        $section = $block[1];
+        // Markup such as "<strong>[v1]</strong> Thu, …" puts closing tags before the weekday, so skip any run of tags.
+        if (preg_match('/\[v1\](?:\s*<[^>]+>)*[^<]*(?:<br\s*\/?>)?\s*(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i', $section, $m)) {
+            return self::toYmd($m[3], $m[2], $m[1]);
+        }
+        if (preg_match('/\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]/i', $section, $m)) {
+            return self::toYmd($m[3], $m[2], $m[1]);
+        }
+
+        return null;
+    }
+
+    /** Normalises a loosely formatted date string to Y-m-d; anything unrecognised is handed to parsePublishedDate(). */
+    protected static function normalizeLooseDate(string $value): ?string
+    {
+        $candidate = trim(html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
+        if ($candidate === '') {
+            return null;
+        }
+        // The whole value is Y/M/D or Y-M-D with 1–2 digit month/day: zero-pad it.
+        if (preg_match('#^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$#', $candidate, $parts)) {
+            return sprintf('%04d-%02d-%02d', (int) $parts[1], (int) $parts[2], (int) $parts[3]);
+        }
+        if (preg_match('/(\d{4})-(\d{2})-(\d{2})/', $candidate, $parts)) {
+            return sprintf('%s-%s-%s', ...array_slice($parts, 1));
+        }
+
+        return self::parsePublishedDate($candidate);
+    }
+
     /**
      * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
      */
diff --git a/config/crawl.php b/config/crawl.php
index 1ff2027..712d81a 100644
--- a/config/crawl.php
+++ b/config/crawl.php
@@ -18,7 +18,7 @@ return [
          */
         'abs_enrich_mode' => env('ARXIV_ABS_ENRICH_MODE', 'auto'),
         /** 单次任务最多补全篇数（每篇至多 1 次 arXiv 页面请求） */
-        'abs_enrich_max' => (int) env('ARXIV_ABS_ENRICH_MAX', 8),
+        'abs_enrich_max' => (int) env('ARXIV_ABS_ENRICH_MAX', 32),
         /** 补全时优先 HTML 版（机构更全），失败再试 abs */
         'enrich_prefer_html' => (bool) env('ARXIV_ENRICH_PREFER_HTML', true),
         'try_html_version' => (bool) env('ARXIV_TRY_HTML_VERSION', true),
diff --git a/tests/Unit/ArxivMetadataParserTest.php b/tests/Unit/ArxivMetadataParserTest.php
new file mode 100644
index 0000000..a9548c3
--- /dev/null
+++ b/tests/Unit/ArxivMetadataParserTest.php
@@ -0,0 +1,29 @@
+<?php
+
+namespace Tests\Unit;
+
+use App\Services\Crawl\ArxivMetadataParser;
+use PHPUnit\Framework\TestCase;
+
+class ArxivMetadataParserTest extends TestCase
+{
+    /** An abs-page snippet carrying meta, dateline and submission-history dates resolves to that shared date. */
+    public function test_parses_citation_meta_and_submission_history(): void
+    {
+        $absPage = <<<'HTML'
+<meta name="citation_date" content="2026/04/09" />
+<div class="dateline">[Submitted on 9 Apr 2026]</div>
+<div class="submission-history">
+  <h2>Submission history</h2>
+  <strong>[v1]</strong> Thu, 9 Apr 2026 06:52:51 UTC (1,821 KB)<br/>
+</div>
+HTML;
+        self::assertSame('2026-04-09', ArxivMetadataParser::parsePublishedDate($absPage));
+    }
+
+    public function test_parses_published_date_from_arxiv_id(): void
+    {
+        self::assertSame('2026-06-01', ArxivMetadataParser::parsePublishedDateFromArxivId('2606.23690'));
+        self::assertSame('2026-06-01', ArxivMetadataParser::parsePublishedDateFromArxivId('2606.23690v1')); // version suffix ignored
+    }
+}