diff --git a/app/Services/Crawl/Adapters/ArxivApiAdapter.php b/app/Services/Crawl/Adapters/ArxivApiAdapter.php index 3a10f4c..a8081ba 100644 --- a/app/Services/Crawl/Adapters/ArxivApiAdapter.php +++ b/app/Services/Crawl/Adapters/ArxivApiAdapter.php @@ -3,6 +3,7 @@ namespace App\Services\Crawl\Adapters; use App\Models\CrawlSource; +use App\Models\Paper; use App\Services\Crawl\ArxivAbsEnricher; use App\Services\Crawl\ArxivMetadataParser; use App\Services\Crawl\ArxivRequestGate; @@ -41,6 +42,9 @@ class ArxivApiAdapter implements CrawlerAdapterInterface $maxResults = min(200, max(1, (int) ($params['max_results'] ?? 50))); $maxPages = min(20, max(1, (int) ($params['max_pages'] ?? 1))); $pageSize = 50; + $skipImported = ($params['skip_imported'] ?? true) !== false; + $importedIds = $skipImported ? $this->loadImportedExternalIds() : []; + $maxScanPages = $this->resolveMaxScanPages($maxPages, $maxResults, $skipImported); if ((bool) config('crawl.arxiv.prefer_html_search', false)) { return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults)); @@ -49,10 +53,9 @@ class ArxivApiAdapter implements CrawlerAdapterInterface $items = []; $seen = []; - for ($page = 0; $page < $maxPages && count($items) < $maxResults; $page++) { + for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) { $start = $page * $pageSize; - $batchSize = min($pageSize, $maxResults - count($items)); - $batch = $this->fetchApiPage($keywordRaw, $start, $batchSize); + $batch = $this->fetchApiPage($keywordRaw, $start, $pageSize); if ($batch === []) { break; } @@ -62,10 +65,18 @@ class ArxivApiAdapter implements CrawlerAdapterInterface continue; } $seen[$item->externalId] = true; + + if ($skipImported && isset($importedIds[$item->externalId])) { + continue; + } + $items[] = $item; + if (count($items) >= $maxResults) { + break 2; + } } - if (count($batch) < $batchSize) { + if (count($batch) < $pageSize) { break; } } @@ -74,6 +85,10 @@ class ArxivApiAdapter implements CrawlerAdapterInterface return $this->finalizeItems($items); } + if ($keywordRaw === '') { + throw new \RuntimeException('arXiv API 未返回结果,请稍后重试'); + } + return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize))); } @@ -100,7 +115,12 @@ class ArxivApiAdapter implements CrawlerAdapterInterface return []; } - return $this->parseAtomFeed($response->body(), $keywordRaw); + $body = $response->body(); + if (str_contains($body, 'arxiv.org/api/errors') || str_contains($body, 'Error')) { + return []; + } + + return $this->parseAtomFeed($body, $keywordRaw); } /** @@ -340,4 +360,30 @@ class ArxivApiAdapter implements CrawlerAdapterInterface return $enrichAbs ? $this->absEnricher->enrichMany($items) : $items; } + + /** + * @return array + */ + protected function loadImportedExternalIds(): array + { + $ids = Paper::query() + ->where('source', 'crawl') + ->whereNotNull('external_id') + ->pluck('external_id') + ->all(); + + return array_fill_keys($ids, true); + } + + protected function resolveMaxScanPages(int $maxPages, int $maxResults, bool $skipImported): int + { + $maxPages = min(20, max(1, $maxPages)); + if (! $skipImported) { + return $maxPages; + } + + $minForTarget = (int) ceil($maxResults / 50); + + return min(200, max($maxPages, $minForTarget * 10)); + } } diff --git a/app/Services/Crawl/CrawlKeywordParser.php b/app/Services/Crawl/CrawlKeywordParser.php index 1a416fc..442b87a 100644 --- a/app/Services/Crawl/CrawlKeywordParser.php +++ b/app/Services/Crawl/CrawlKeywordParser.php @@ -86,7 +86,7 @@ class CrawlKeywordParser { $phrases = self::parsePhrases($raw); if ($phrases === []) { - return 'all:*'; + return 'cat:*'; } $clauseParts = []; @@ -106,7 +106,7 @@ class CrawlKeywordParser } if ($clauseParts === []) { - return 'all:*'; + return 'cat:*'; } return count($clauseParts) === 1 diff --git a/tests/Unit/CrawlKeywordParserTest.php b/tests/Unit/CrawlKeywordParserTest.php new file mode 100644 index 0000000..85a5d26 --- /dev/null +++ b/tests/Unit/CrawlKeywordParserTest.php @@ -0,0 +1,20 @@ +assertSame('cat:*', CrawlKeywordParser::buildArxivSearchQuery('')); + $this->assertSame('cat:*', CrawlKeywordParser::buildArxivSearchQuery(null)); + } + + public function test_build_arxiv_search_query_with_phrase(): void + { + $this->assertSame('(all:machine AND all:learning)', CrawlKeywordParser::buildArxivSearchQuery('machine learning')); + } +}