diff --git a/app/Http/Controllers/Admin/CrawlAddressController.php b/app/Http/Controllers/Admin/CrawlAddressController.php index 819f641..445d5c4 100644 --- a/app/Http/Controllers/Admin/CrawlAddressController.php +++ b/app/Http/Controllers/Admin/CrawlAddressController.php @@ -4,9 +4,11 @@ namespace App\Http\Controllers\Admin; use App\Http\Controllers\Controller; use App\Models\CrawlAddress; +use App\Models\CrawlSource; use App\Models\DictItem; use App\Models\DictType; -use App\Support\ApiResponse; +use App\Services\Crawl\CrawlAddressSourceResolver; +use App\Services\Crawl\CrawlSourceResolver; use Illuminate\Http\JsonResponse; use Illuminate\Http\Request; use Illuminate\Validation\Rule; @@ -17,7 +19,7 @@ class CrawlAddressController extends Controller public function index(Request $request): JsonResponse { - $query = CrawlAddress::query()->with(['categoryDictItem', 'university']); + $query = CrawlAddress::query()->with(['categoryDictItem', 'university', 'crawlSource']); if ($type = $request->query('target_type')) { $query->where('target_type', $type); @@ -47,6 +49,7 @@ class CrawlAddressController extends Controller public function options(Request $request): JsonResponse { $query = CrawlAddress::query() + ->with(['categoryDictItem', 'university', 'crawlSource']) ->where('status', 1) ->orderBy('sort') ->orderBy('name'); @@ -60,22 +63,24 @@ class CrawlAddressController extends Controller return $this->ok(['items' => $items]); } - public function store(Request $request): JsonResponse + public function store(Request $request, CrawlSourceResolver $resolver): JsonResponse { $data = $this->validatePayload($request); + $this->assignCrawlSourceId($data, $resolver); $row = CrawlAddress::query()->create($data); - return $this->ok($this->serialize($row->fresh(['categoryDictItem', 'university'])), '已创建'); + return $this->ok($this->serialize($row->fresh(['categoryDictItem', 'university', 'crawlSource'])), '已创建'); } - public function update(Request $request, int $crawlAddress): JsonResponse + public function update(Request $request, int $crawlAddress, CrawlSourceResolver $resolver): JsonResponse { $row = CrawlAddress::query()->findOrFail($crawlAddress); $data = $this->validatePayload($request, $row); + $this->assignCrawlSourceId($data, $resolver); $row->fill($data); $row->save(); - return $this->ok($this->serialize($row->fresh(['categoryDictItem', 'university'])), '已保存'); + return $this->ok($this->serialize($row->fresh(['categoryDictItem', 'university', 'crawlSource'])), '已保存'); } public function destroy(int $crawlAddress): JsonResponse @@ -99,6 +104,7 @@ class CrawlAddressController extends Controller 'target_type' => ['required', 'in:paper,industry_news,teacher'], 'name' => ['required', 'string', 'max:128'], 'request_url' => ['required', 'url', 'max:512'], + 'crawl_source_id' => ['nullable', 'integer', 'exists:crawl_sources,id'], 'keyword' => ['nullable', 'string', 'max:512'], 'category_dict_item_id' => [ 'nullable', @@ -137,6 +143,21 @@ class CrawlAddressController extends Controller return $data; } + /** + * @param array $data + */ + protected function assignCrawlSourceId(array &$data, CrawlSourceResolver $resolver): void + { + if (! empty($data['crawl_source_id'])) { + return; + } + + $source = $resolver->suggestSource($data['request_url'], $data['target_type']); + if ($source) { + $data['crawl_source_id'] = $source->id; + } + } + /** * @return array */ @@ -147,6 +168,9 @@ class CrawlAddressController extends Controller 'target_type' => $row->target_type, 'name' => $row->name, 'request_url' => $row->request_url, + 'crawl_source_id' => $row->crawl_source_id, + 'crawl_source_name' => $row->crawlSource?->name, + 'adapter_code' => $row->crawlSource?->adapter_code, 'keyword' => $row->keyword, 'category_dict_item_id' => $row->category_dict_item_id, 'category_label' => $row->categoryDictItem?->label, diff --git a/app/Http/Controllers/Admin/CrawlJobController.php b/app/Http/Controllers/Admin/CrawlJobController.php index 4bbd56a..0ee987c 100644 --- a/app/Http/Controllers/Admin/CrawlJobController.php +++ b/app/Http/Controllers/Admin/CrawlJobController.php @@ -29,9 +29,14 @@ class CrawlJobController extends Controller $data = $request->validate([ 'request_url' => ['required', 'url', 'max:512'], 'target_type' => ['required', 'in:paper,industry_news,teacher'], + 'crawl_address_id' => ['nullable', 'integer', 'exists:crawl_addresses,id'], ]); - $source = $resolver->resolve($data['request_url'], $data['target_type']); + $source = $resolver->resolve( + $data['request_url'], + $data['target_type'], + isset($data['crawl_address_id']) ? (int) $data['crawl_address_id'] : null, + ); if (! $source) { return $this->fail('无法识别该地址,请确认 URL 可访问且入库类型正确', 422); } @@ -117,11 +122,16 @@ class CrawlJobController extends Controller 'news_defaults' => ['nullable', 'array'], 'news_defaults.source' => ['nullable', 'string', 'max:128'], 'news_defaults.category_dict_item_id' => ['nullable', 'integer'], + 'crawl_address_id' => ['nullable', 'integer', 'exists:crawl_addresses,id'], ]); $params = $data['params'] ?? []; - $source = $resolver->resolve($data['request_url'], $data['target_type']); + $source = $resolver->resolve( + $data['request_url'], + $data['target_type'], + isset($data['crawl_address_id']) ? (int) $data['crawl_address_id'] : null, + ); if (! $source) { return $this->fail('无法识别该地址对应的采集源', 422); } diff --git a/app/Models/CrawlAddress.php b/app/Models/CrawlAddress.php index cc7a86d..19c62a6 100644 --- a/app/Models/CrawlAddress.php +++ b/app/Models/CrawlAddress.php @@ -11,6 +11,7 @@ class CrawlAddress extends Model 'target_type', 'name', 'request_url', + 'crawl_source_id', 'keyword', 'category_dict_item_id', 'university_id', @@ -20,6 +21,7 @@ class CrawlAddress extends Model ]; protected $casts = [ + 'crawl_source_id' => 'integer', 'category_dict_item_id' => 'integer', 'university_id' => 'integer', 'sort' => 'integer', @@ -35,4 +37,9 @@ class CrawlAddress extends Model { return $this->belongsTo(University::class); } + + public function crawlSource(): BelongsTo + { + return $this->belongsTo(CrawlSource::class); + } } diff --git a/app/Services/Crawl/CrawlAddressSourceResolver.php b/app/Services/Crawl/CrawlAddressSourceResolver.php index 7152d28..ece4680 100644 --- a/app/Services/Crawl/CrawlAddressSourceResolver.php +++ b/app/Services/Crawl/CrawlAddressSourceResolver.php @@ -66,6 +66,11 @@ class CrawlAddressSourceResolver ?? $this->resolveBySourceUrl($articleSourceUrl, $targetType); } + public function normalizeRequestUrl(?string $url): string + { + return $this->normalizeUrl($url); + } + /** * @return list */ diff --git a/app/Services/Crawl/CrawlSourceResolver.php b/app/Services/Crawl/CrawlSourceResolver.php index a9bda36..6172256 100644 --- a/app/Services/Crawl/CrawlSourceResolver.php +++ b/app/Services/Crawl/CrawlSourceResolver.php @@ -2,11 +2,72 @@ namespace App\Services\Crawl; +use App\Models\CrawlAddress; use App\Models\CrawlSource; class CrawlSourceResolver { - public function resolve(string $requestUrl, string $targetType): ?CrawlSource + public function __construct( + protected CrawlAddressSourceResolver $crawlAddressSourceResolver, + ) {} + + public function resolve(string $requestUrl, string $targetType, ?int $crawlAddressId = null): ?CrawlSource + { + $fromAddress = $this->resolveFromCrawlAddress($requestUrl, $targetType, $crawlAddressId); + if ($fromAddress) { + return $fromAddress; + } + + return $this->resolveCore($requestUrl, $targetType); + } + + public function suggestSource(string $requestUrl, string $targetType): ?CrawlSource + { + return $this->resolveCore($requestUrl, $targetType); + } + + protected function resolveFromCrawlAddress(string $requestUrl, string $targetType, ?int $crawlAddressId): ?CrawlSource + { + if ($crawlAddressId) { + $address = CrawlAddress::query() + ->whereKey($crawlAddressId) + ->where('status', 1) + ->where('target_type', $targetType) + ->first(); + + if ($address?->crawl_source_id) { + return $this->findActiveSource((int) $address->crawl_source_id, $targetType); + } + } + + $normalized = $this->crawlAddressSourceResolver->normalizeRequestUrl($requestUrl); + if ($normalized === '') { + return null; + } + + $addresses = CrawlAddress::query() + ->where('target_type', $targetType) + ->where('status', 1) + ->whereNotNull('crawl_source_id') + ->orderBy('sort') + ->orderBy('name') + ->get(); + + foreach ($addresses as $address) { + if ($this->crawlAddressSourceResolver->normalizeRequestUrl($address->request_url) !== $normalized) { + continue; + } + + $source = $this->findActiveSource((int) $address->crawl_source_id, $targetType); + if ($source) { + return $source; + } + } + + return null; + } + + protected function resolveCore(string $requestUrl, string $targetType): ?CrawlSource { $host = strtolower((string) parse_url($requestUrl, PHP_URL_HOST)); if ($host === '') { @@ -43,6 +104,15 @@ class CrawlSourceResolver return $wildcard; } + protected function findActiveSource(int $crawlSourceId, string $targetType): ?CrawlSource + { + return CrawlSource::query() + ->whereKey($crawlSourceId) + ->where('status', 1) + ->where('target_type', $targetType) + ->first(); + } + protected function hostMatchesDomain(string $host, string $domain): bool { $host = $this->normalizeHost($host); @@ -73,8 +143,6 @@ class CrawlSourceResolver } /** - * 域名未精确命中时,按 URL 特征回退(如 pedaily 子域、arxiv 路径)。 - * * @param \Illuminate\Support\Collection $sources */ protected function resolveByUrlHint(string $requestUrl, string $targetType, $sources): ?CrawlSource diff --git a/database/migrations/2026_06_23_000002_add_crawl_source_id_to_crawl_addresses_table.php b/database/migrations/2026_06_23_000002_add_crawl_source_id_to_crawl_addresses_table.php new file mode 100644 index 0000000..2ccb4db --- /dev/null +++ b/database/migrations/2026_06_23_000002_add_crawl_source_id_to_crawl_addresses_table.php @@ -0,0 +1,42 @@ +foreignId('crawl_source_id') + ->nullable() + ->after('request_url') + ->constrained('crawl_sources') + ->nullOnDelete(); + }); + + $sourceId = DB::table('crawl_sources') + ->where('adapter_code', 'ai_sjtu_research_center_api') + ->where('target_type', 'teacher') + ->value('id'); + + if ($sourceId) { + DB::table('crawl_addresses') + ->where('target_type', 'teacher') + ->where(function ($query) { + $query->where('request_url', 'like', '%ai.sjtu.edu.cn/center%') + ->orWhere('request_url', 'like', '%ai.sjtu.edu.cn/center'); + }) + ->update(['crawl_source_id' => $sourceId]); + } + } + + public function down(): void + { + Schema::table('crawl_addresses', function (Blueprint $table) { + $table->dropConstrainedForeignId('crawl_source_id'); + }); + } +}; diff --git a/tests/Unit/CrawlSourceResolverTest.php b/tests/Unit/CrawlSourceResolverTest.php index 4554230..771984c 100644 --- a/tests/Unit/CrawlSourceResolverTest.php +++ b/tests/Unit/CrawlSourceResolverTest.php @@ -2,6 +2,7 @@ namespace Tests\Unit; +use App\Models\CrawlAddress; use App\Models\CrawlSource; use App\Services\Crawl\CrawlSourceResolver; use Illuminate\Foundation\Testing\RefreshDatabase; @@ -33,10 +34,47 @@ class CrawlSourceResolverTest extends TestCase 'sort' => 25, ]); - $resolver = new CrawlSourceResolver; + $resolver = app(CrawlSourceResolver::class); $source = $resolver->resolve('https://ai.sjtu.edu.cn/center', 'teacher'); $this->assertNotNull($source); $this->assertSame('ai_sjtu_research_center_api', $source->adapter_code); } + + public function test_prefers_crawl_address_bound_source_over_wildcard(): void + { + $wildcard = CrawlSource::query()->create([ + 'name' => '师资列表页(通用 HTML)', + 'target_type' => 'teacher', + 'adapter_code' => 'faculty_list_html', + 'entry_url' => 'https://', + 'match_domains' => ['*'], + 'status' => 1, + 'sort' => 30, + ]); + + $aiSource = CrawlSource::query()->create([ + 'name' => '交大人工智能研究院研究中心', + 'target_type' => 'teacher', + 'adapter_code' => 'ai_sjtu_research_center_api', + 'entry_url' => 'https://ai.sjtu.edu.cn/center', + 'match_domains' => ['ai.sjtu.edu.cn'], + 'status' => 1, + 'sort' => 25, + ]); + + CrawlAddress::query()->create([ + 'target_type' => 'teacher', + 'name' => '交大 AI 中心', + 'request_url' => 'https://ai.sjtu.edu.cn/center', + 'crawl_source_id' => $aiSource->id, + 'status' => 1, + ]); + + $resolver = app(CrawlSourceResolver::class); + $source = $resolver->resolve('https://ai.sjtu.edu.cn/center', 'teacher'); + + $this->assertSame('ai_sjtu_research_center_api', $source?->adapter_code); + $this->assertNotSame($wildcard->id, $source?->id); + } }