交大智能研究院

master
lion 8 hours ago
parent 322baf9bfa
commit 4de9b4675f

@ -481,13 +481,16 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
}
$lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);
$publishedAt = ArxivMetadataParser::parsePublishedDate($body)
?? ArxivMetadataParser::parsePublishedDateFromArxivId($arxivId);
$items[] = new CrawlItemDto(
externalId: 'arxiv:'.$arxivId,
title: $title,
canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
authors: $authors,
summary: $summary,
publishedAt: ArxivMetadataParser::parsePublishedDate($body),
publishedAt: $publishedAt,
schoolName: $lead['university_name'] ?? null,
extra: [
'platform' => 'arxiv',

@ -24,21 +24,41 @@ class ArxivAbsEnricher
return array_map(fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d), $items);
}
$max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 8));
$max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 32));
$sorted = $items;
usort($sorted, fn (CrawlItemDto $a, CrawlItemDto $b) => $this->enrichPriority($a) <=> $this->enrichPriority($b));
$enriched = 0;
$out = [];
$enrichedMap = [];
foreach ($items as $dto) {
foreach ($sorted as $dto) {
if ($enriched >= $max || ! $this->shouldEnrich($dto)) {
$out[] = $this->ensureLeadAuthor($dto);
$enrichedMap[$dto->externalId] = $this->ensureLeadAuthor($dto);
continue;
}
$out[] = $this->enrichOne($dto);
$enrichedMap[$dto->externalId] = $this->enrichOne($dto);
$enriched++;
}
return $out;
return array_map(
fn (CrawlItemDto $dto) => $enrichedMap[$dto->externalId] ?? $this->ensureLeadAuthor($dto),
$items,
);
}
/**
 * Scores a crawl item for enrichment ordering.
 *
 * @return int 0 when the item has no publication date; otherwise 2 when a school,
 *             university name or affiliation is already known, and 1 when none is.
 */
protected function enrichPriority(CrawlItemDto $dto): int
{
    $knownDate = $dto->publishedAt ?? '';
    if ($knownDate === '') {
        return 0;
    }

    $leadAuthor = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

    // Any one of these three signals is enough to treat the institution as known.
    if (
        isset($dto->schoolName)
        || isset($leadAuthor['university_name'])
        || isset($leadAuthor['affiliation'])
    ) {
        return 2;
    }

    return 1;
}
public function enrichOne(CrawlItemDto $dto): CrawlItemDto
@ -56,6 +76,7 @@ class ArxivAbsEnricher
$authorsParsed = $dto->authorsParsed;
$enrichedFrom = null;
$pageHtml = '';
$absHtml = '';
$preferHtml = $this->shouldPreferHtmlEnrich($dto);
@ -63,31 +84,63 @@ class ArxivAbsEnricher
$pageHtml = $this->fetchHtmlVersion((string) $arxivId);
if ($pageHtml !== '') {
$enrichedFrom = 'arxiv_html';
$publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;
}
}
if ($pageHtml === '') {
$pageHtml = $this->fetchAbsHtml((string) $arxivId);
if ($pageHtml !== '') {
$enrichedFrom = 'abs_html';
$preferHtml = false;
if (($publishedAt ?? '') === '' || $pageHtml === '') {
$absHtml = $this->fetchAbsHtml((string) $arxivId);
if ($absHtml !== '') {
if ($enrichedFrom === null) {
$enrichedFrom = 'abs_html';
}
if (($publishedAt ?? '') === '') {
$publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt;
}
}
}
if ($pageHtml !== '') {
if ($pageHtml === '' && $absHtml !== '') {
$pageHtml = $absHtml;
$preferHtml = false;
}
if ($pageHtml === '') {
return $this->ensureLeadAuthor(new CrawlItemDto(
externalId: $dto->externalId,
title: $dto->title,
canonicalUrl: $dto->canonicalUrl,
authors: $dto->authors,
summary: $dto->summary,
publishedAt: $publishedAt ?: ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId),
schoolName: $dto->schoolName,
section: $dto->section,
contentHtml: $dto->contentHtml,
extra: $dto->extra,
authorsParsed: $dto->authorsParsed,
));
}
if (($publishedAt ?? '') === '') {
$publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;
}
if (($publishedAt ?? '') === '' && $absHtml !== '') {
$publishedAt = ArxivMetadataParser::parsePublishedDate($absHtml) ?? $publishedAt;
}
if (($publishedAt ?? '') === '') {
$publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId);
}
$parsed = $preferHtml
? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
: ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
$parsed = $preferHtml
? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
: ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
if ($parsed === [] && $preferHtml) {
$parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
}
if ($parsed === [] && $preferHtml && $absHtml !== '') {
$parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($absHtml);
}
if ($parsed !== []) {
$authorsParsed = $parsed;
}
if ($parsed !== []) {
$authorsParsed = $parsed;
}
$lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed);
@ -101,10 +154,10 @@ class ArxivAbsEnricher
$extra['enriched_from'] = $enrichedFrom;
}
if (! isset($extra['pdf_url'])) {
$extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml, (string) $arxivId);
$extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml ?: $absHtml, (string) $arxivId);
}
if (! isset($extra['html_url'])) {
$extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml, (string) $arxivId);
$extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml ?: $absHtml, (string) $arxivId);
}
return new CrawlItemDto(
@ -157,8 +210,32 @@ class ArxivAbsEnricher
protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto
{
$publishedAt = $dto->publishedAt;
if (($publishedAt ?? '') === '') {
$arxivId = $dto->extra['arxiv_id'] ?? null;
if ($arxivId) {
$publishedAt = ArxivMetadataParser::parsePublishedDateFromArxivId((string) $arxivId);
}
}
if (! empty($dto->extra['lead_author'])) {
return $dto;
if (($publishedAt ?? '') === ($dto->publishedAt ?? '')) {
return $dto;
}
return new CrawlItemDto(
externalId: $dto->externalId,
title: $dto->title,
canonicalUrl: $dto->canonicalUrl,
authors: $dto->authors,
summary: $dto->summary,
publishedAt: $publishedAt,
schoolName: $dto->schoolName,
section: $dto->section,
contentHtml: $dto->contentHtml,
extra: $dto->extra,
authorsParsed: $dto->authorsParsed,
);
}
$lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
@ -173,7 +250,7 @@ class ArxivAbsEnricher
canonicalUrl: $dto->canonicalUrl,
authors: $dto->authors,
summary: $dto->summary,
publishedAt: $dto->publishedAt,
publishedAt: $publishedAt,
schoolName: $dto->schoolName ?? $lead['university_name'] ?? null,
section: $dto->section,
contentHtml: $dto->contentHtml,

@ -23,9 +23,23 @@ class ArxivMetadataParser
return null;
}
$raw = $text;
if ($date = self::parseCitationMetaDate($raw)) {
return $date;
}
if ($date = self::parseSubmissionHistoryDate($raw)) {
return $date;
}
if ($date = self::parseDatelineDate($raw)) {
return $date;
}
if (preg_match(
'/Generated on\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+([A-Za-z]+)\s+(\d{1,2})\s+\d{1,2}:\d{2}:\d{2}\s+(\d{4})/i',
$text,
$raw,
$gen
)) {
$date = self::toYmd($gen[3], $gen[1], $gen[2]);
@ -34,7 +48,7 @@ class ArxivMetadataParser
}
}
$text = html_entity_decode(strip_tags($text), ENT_QUOTES | ENT_HTML5, 'UTF-8');
$text = html_entity_decode(strip_tags($raw), ENT_QUOTES | ENT_HTML5, 'UTF-8');
$text = preg_replace('/\s+/u', ' ', $text) ?? '';
if (preg_match('/(\d{4})-(\d{2})-(\d{2})/', $text, $iso)) {
@ -59,7 +73,6 @@ class ArxivMetadataParser
if (! preg_match($pattern, $text, $m)) {
continue;
}
// Generated on Thu May 28 ... 2026 → 月、日、年顺序
if (str_starts_with($pattern, '/Generated on')) {
$date = self::toYmd($m[3], $m[1], $m[2]);
} else {
@ -73,6 +86,89 @@ class ArxivMetadataParser
return null;
}
/**
 * Infers a coarse publication date from a new-style arXiv identifier (YYMM.NNNNN).
 *
 * Meant as a last-resort fallback: the identifier only encodes year and month, so the
 * first day of that month is returned.
 *
 * @param string|null $arxivId Identifier such as "2606.23690"; an "arXiv:" prefix and a
 *                             trailing version ("v2") are both tolerated.
 * @return string|null "YYYY-MM-01", or null when the input is empty, is not a new-style
 *                     identifier, or encodes a month outside the scheme's range.
 */
public static function parsePublishedDateFromArxivId(?string $arxivId): ?string
{
    $arxivId = trim((string) $arxivId);
    if ($arxivId === '') {
        return null;
    }

    // A single anchored pattern covers the optional prefix and version suffix.
    // New-style sequence numbers are 4 digits (0704-1412) or 5 digits (1501 onwards).
    if (preg_match('/^(?:arxiv:)?(\d{2})(\d{2})\.\d{4,5}(?:v\d+)?$/i', $arxivId, $m) !== 1) {
        return null;
    }

    $year = 2000 + (int) $m[1];
    $month = (int) $m[2];
    if ($month < 1 || $month > 12) {
        return null;
    }

    // The YYMM.number scheme starts with 0704 (April 2007); anything earlier is not a real ID.
    if ($year < 2007 || ($year === 2007 && $month < 4)) {
        return null;
    }

    return sprintf('%04d-%02d-01', $year, $month);
}
/**
 * Extracts a publication date from `citation_date` / `citation_online_date` <meta> tags.
 *
 * Candidates are tried in order: `citation_date` first, then `citation_online_date`,
 * each with the `name` attribute either before or after `content`. A tag whose content
 * cannot be normalised is skipped so that a later candidate can still supply the date.
 *
 * @param string $html Raw HTML to scan.
 * @return string|null The first value normalizeLooseDate() accepts, or null when no
 *                     candidate tag yields a date.
 */
protected static function parseCitationMetaDate(string $html): ?string
{
    $patterns = [
        '#<meta[^>]+name=["\']citation_date["\'][^>]+content=["\']([^"\']+)["\']#i',
        '#<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']citation_date["\']#i',
        '#<meta[^>]+name=["\']citation_online_date["\'][^>]+content=["\']([^"\']+)["\']#i',
        '#<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']citation_online_date["\']#i',
    ];

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $html, $m) !== 1) {
            continue;
        }

        $date = self::normalizeLooseDate($m[1]);
        if ($date !== null) {
            return $date;
        }
        // Tag present but unparseable: keep looking instead of giving up.
    }

    return null;
}
/**
 * Reads the submission date from a `<div class="dateline">[Submitted on D Mon YYYY ...` block.
 *
 * Only the opening "[Submitted on <day> <month> <year>" part is required. Whatever follows
 * the year (a closing bracket, or extra text before it) is ignored.
 * NOTE(review): arXiv abs pages for revised papers appear to append
 * "(v1), last revised ..." after the first date - confirm against a live page.
 *
 * @param string $html Raw HTML to scan.
 * @return string|null Result of toYmd(year, month name, day), or null when no dateline matches.
 */
protected static function parseDatelineDate(string $html): ?string
{
    $pattern = '#<div\s+class=["\']dateline["\']\s*>\s*\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\b#i';
    if (preg_match($pattern, $html, $m) !== 1) {
        return null;
    }

    // Captures are day, month name, year; toYmd() takes them as year, month, day.
    return self::toYmd($m[3], $m[2], $m[1]);
}
/**
 * Reads the first-version date from a `<div class="submission-history">` block.
 *
 * The block's markup is flattened to text before matching, so a version marker wrapped
 * in tags (e.g. `<strong>[v1]</strong> Thu, 9 Apr 2026 ...`) is still recognised.
 * Falls back to a "[Submitted on D Mon YYYY]" note inside the same block.
 *
 * @param string $html Raw HTML to scan.
 * @return string|null Result of toYmd(year, month name, day), or null when the block is
 *                     missing or contains neither form.
 */
protected static function parseSubmissionHistoryDate(string $html): ?string
{
    if (preg_match('#<div class=["\']submission-history["\']>(.*?)</div>#is', $html, $block) !== 1) {
        return null;
    }

    // Replace every tag with a space: closing tags such as </strong> or </a> directly
    // after "[v1]" would otherwise sit between the marker and its date.
    $section = preg_replace('/<[^>]*>/', ' ', $block[1]) ?? $block[1];

    // "[^\[\]]*?" keeps the match inside the v1 entry (it cannot run into "[v2]").
    $firstVersion = '/\[v1\][^\[\]]*?(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i';
    if (preg_match($firstVersion, $section, $m) === 1) {
        return self::toYmd($m[3], $m[2], $m[1]);
    }

    if (preg_match('/\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]/i', $section, $m) === 1) {
        return self::toYmd($m[3], $m[2], $m[1]);
    }

    return null;
}
/**
 * Normalises a loosely formatted date string to "YYYY-MM-DD".
 *
 * Accepted directly: a value that is exactly "YYYY/M/D" or "YYYY-M-D" (either separator),
 * or any text containing an ISO "YYYY-MM-DD" date. Numeric dates are checked against the
 * calendar, so impossible values such as "2026/13/45" are not returned. Anything else is
 * handed to parsePublishedDate().
 *
 * @param string $value Raw attribute/text value; HTML entities are decoded first.
 * @return string|null Normalised date, or null when nothing usable is found.
 */
protected static function normalizeLooseDate(string $value): ?string
{
    $value = trim(html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    if ($value === '') {
        return null;
    }

    if (preg_match('#^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$#', $value, $m) === 1) {
        // The whole value is a numeric date: it is either a real calendar day or unusable.
        return checkdate((int) $m[2], (int) $m[3], (int) $m[1])
            ? sprintf('%04d-%02d-%02d', (int) $m[1], (int) $m[2], (int) $m[3])
            : null;
    }

    if (
        preg_match('/(\d{4})-(\d{2})-(\d{2})/', $value, $iso) === 1
        && checkdate((int) $iso[2], (int) $iso[3], (int) $iso[1])
    ) {
        return sprintf('%s-%s-%s', $iso[1], $iso[2], $iso[3]);
    }

    // Free-form text (or an embedded ISO-looking token that is not a real date):
    // let the general parser look for other date forms.
    return self::parsePublishedDate($value);
}
/**
* @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
*/

@ -18,7 +18,7 @@ return [
*/
'abs_enrich_mode' => env('ARXIV_ABS_ENRICH_MODE', 'auto'),
/** Maximum number of papers enriched per task (a paper may need up to 2 arXiv page requests: the HTML version, then the abs page) */
'abs_enrich_max' => (int) env('ARXIV_ABS_ENRICH_MAX', 8),
'abs_enrich_max' => (int) env('ARXIV_ABS_ENRICH_MAX', 32),
/** 补全时优先 HTML 版(机构更全),失败再试 abs */
'enrich_prefer_html' => (bool) env('ARXIV_ENRICH_PREFER_HTML', true),
'try_html_version' => (bool) env('ARXIV_TRY_HTML_VERSION', true),

@ -0,0 +1,29 @@
<?php
namespace Tests\Unit;
use App\Services\Crawl\ArxivMetadataParser;
use PHPUnit\Framework\TestCase;
/**
 * Unit tests for ArxivMetadataParser's publication-date extraction.
 */
class ArxivMetadataParserTest extends TestCase
{
    /**
     * A page carrying citation meta, dateline and submission history resolves to the
     * submission day.
     */
    public function test_parses_citation_meta_and_submission_history(): void
    {
        $html = <<<'HTML'
        <meta name="citation_date" content="2026/04/09" />
        <div class="dateline">[Submitted on 9 Apr 2026]</div>
        <div class="submission-history">
        <h2>Submission history</h2>
        <strong>[v1]</strong> Thu, 9 Apr 2026 06:52:51 UTC (1,821 KB)<br/>
        </div>
        HTML;

        self::assertSame('2026-04-09', ArxivMetadataParser::parsePublishedDate($html));
    }

    /**
     * A new-style identifier yields the first day of its encoded month, with or without
     * a version suffix.
     */
    public function test_parses_published_date_from_arxiv_id(): void
    {
        self::assertSame('2026-06-01', ArxivMetadataParser::parsePublishedDateFromArxivId('2606.23690'));
        self::assertSame('2026-06-01', ArxivMetadataParser::parsePublishedDateFromArxivId('2606.23690v1'));
    }

    /**
     * Inputs that carry no usable year/month must produce null rather than a made-up date.
     */
    public function test_returns_null_for_unusable_arxiv_id(): void
    {
        self::assertNull(ArxivMetadataParser::parsePublishedDateFromArxivId(null));
        self::assertNull(ArxivMetadataParser::parsePublishedDateFromArxivId(''));
        self::assertNull(ArxivMetadataParser::parsePublishedDateFromArxivId('   '));
        // Month 13 does not exist.
        self::assertNull(ArxivMetadataParser::parsePublishedDateFromArxivId('2613.00001'));
        // Old-style "archive/number" identifiers are not handled by this fallback.
        self::assertNull(ArxivMetadataParser::parsePublishedDateFromArxivId('hep-th/0701001'));
    }
}
Loading…
Cancel
Save