master
lion 1 day ago
parent 1cb0480eeb
commit 6d5bfc16bd

@ -3,6 +3,7 @@
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Models\Paper;
use App\Services\Crawl\ArxivAbsEnricher;
use App\Services\Crawl\ArxivMetadataParser;
use App\Services\Crawl\ArxivRequestGate;
@ -41,6 +42,9 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
$maxResults = min(200, max(1, (int) ($params['max_results'] ?? 50)));
$maxPages = min(20, max(1, (int) ($params['max_pages'] ?? 1)));
$pageSize = 50;
$skipImported = ($params['skip_imported'] ?? true) !== false;
$importedIds = $skipImported ? $this->loadImportedExternalIds() : [];
$maxScanPages = $this->resolveMaxScanPages($maxPages, $maxResults, $skipImported);
if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
@ -49,10 +53,9 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
$items = [];
$seen = [];
for ($page = 0; $page < $maxPages && count($items) < $maxResults; $page++) {
for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) {
$start = $page * $pageSize;
$batchSize = min($pageSize, $maxResults - count($items));
$batch = $this->fetchApiPage($keywordRaw, $start, $batchSize);
$batch = $this->fetchApiPage($keywordRaw, $start, $pageSize);
if ($batch === []) {
break;
}
@ -62,10 +65,18 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
continue;
}
$seen[$item->externalId] = true;
if ($skipImported && isset($importedIds[$item->externalId])) {
continue;
}
$items[] = $item;
if (count($items) >= $maxResults) {
break 2;
}
}
if (count($batch) < $batchSize) {
if (count($batch) < $pageSize) {
break;
}
}
@ -74,6 +85,10 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
return $this->finalizeItems($items);
}
if ($keywordRaw === '') {
throw new \RuntimeException('arXiv API 未返回结果,请稍后重试');
}
return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize)));
}
@ -100,7 +115,12 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
return [];
}
return $this->parseAtomFeed($response->body(), $keywordRaw);
$body = $response->body();
if (str_contains($body, 'arxiv.org/api/errors') || str_contains($body, '<title>Error</title>')) {
return [];
}
return $this->parseAtomFeed($body, $keywordRaw);
}
/**
@ -340,4 +360,30 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
return $enrichAbs ? $this->absEnricher->enrichMany($items) : $items;
}
/**
 * Build a lookup set of the external IDs of papers that were stored by crawling.
 *
 * Only rows whose `source` column equals 'crawl' and whose `external_id` is not
 * null are read. Each ID becomes an array key mapped to `true`, so membership
 * can be checked with isset() instead of scanning a list.
 *
 * @return array<string, true>
 */
protected function loadImportedExternalIds(): array
{
    $crawledPapers = Paper::query()
        ->where('source', 'crawl')
        ->whereNotNull('external_id');

    $externalIds = $crawledPapers->pluck('external_id')->all();

    // Duplicate IDs collapse naturally because they map onto the same key.
    return array_fill_keys($externalIds, true);
}
/**
 * Work out how many result pages may be scanned during one run.
 *
 * The requested page count is first clamped to the range 1..20. When
 * already-imported papers are not being skipped, that clamped value is the
 * answer. When they are skipped, the budget is widened to ten times the
 * number of 50-item pages needed to hold $maxResults (never below the
 * clamped page count), with a hard ceiling of 200 pages.
 *
 * @param int  $maxPages     Requested page count; values outside 1..20 are clamped.
 * @param int  $maxResults   Number of items the caller wants to collect.
 * @param bool $skipImported Whether already-imported papers are filtered out.
 *
 * @return int Maximum number of pages to scan.
 */
protected function resolveMaxScanPages(int $maxPages, int $maxResults, bool $skipImported): int
{
    // Clamp the requested page count into 1..20.
    $pageLimit = max(1, $maxPages);
    if ($pageLimit > 20) {
        $pageLimit = 20;
    }

    if ($skipImported) {
        // Pages of 50 items needed to reach the target, widened tenfold.
        $pagesForTarget = (int) ceil($maxResults / 50);
        $widened = $pagesForTarget * 10;

        $scanBudget = $widened > $pageLimit ? $widened : $pageLimit;

        return $scanBudget > 200 ? 200 : $scanBudget;
    }

    return $pageLimit;
}
}

@ -86,7 +86,7 @@ class CrawlKeywordParser
{
$phrases = self::parsePhrases($raw);
if ($phrases === []) {
return 'all:*';
return 'cat:*';
}
$clauseParts = [];
@ -106,7 +106,7 @@ class CrawlKeywordParser
}
if ($clauseParts === []) {
return 'all:*';
return 'cat:*';
}
return count($clauseParts) === 1

@ -0,0 +1,20 @@
<?php
namespace Tests\Unit;
use App\Services\Crawl\CrawlKeywordParser;
use PHPUnit\Framework\TestCase;
/**
 * Unit tests for the arXiv search-query string built by CrawlKeywordParser.
 */
class CrawlKeywordParserTest extends TestCase
{
    /**
     * An empty string and a null keyword must both produce the category wildcard query.
     */
    public function test_build_arxiv_search_query_without_keyword_uses_category_wildcard(): void
    {
        foreach (['', null] as $blankKeyword) {
            self::assertSame('cat:*', CrawlKeywordParser::buildArxivSearchQuery($blankKeyword));
        }
    }

    /**
     * A two-word keyword is expected to become two `all:` terms joined by AND in parentheses.
     */
    public function test_build_arxiv_search_query_with_phrase(): void
    {
        $query = CrawlKeywordParser::buildArxivSearchQuery('machine learning');

        self::assertSame('(all:machine AND all:learning)', $query);
    }
}
Loading…
Cancel
Save