交大智能研究院

master
lion 17 hours ago
parent 4d93e3e041
commit 33596a43b8

@ -4,9 +4,11 @@ namespace App\Http\Controllers\Admin;
use App\Http\Controllers\Controller;
use App\Models\CrawlAddress;
use App\Models\CrawlSource;
use App\Models\DictItem;
use App\Models\DictType;
use App\Support\ApiResponse;
use App\Services\Crawl\CrawlAddressSourceResolver;
use App\Services\Crawl\CrawlSourceResolver;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Request;
use Illuminate\Validation\Rule;
@ -17,7 +19,7 @@ class CrawlAddressController extends Controller
public function index(Request $request): JsonResponse
{
$query = CrawlAddress::query()->with(['categoryDictItem', 'university']);
$query = CrawlAddress::query()->with(['categoryDictItem', 'university', 'crawlSource']);
if ($type = $request->query('target_type')) {
$query->where('target_type', $type);
@ -47,6 +49,7 @@ class CrawlAddressController extends Controller
public function options(Request $request): JsonResponse
{
$query = CrawlAddress::query()
->with(['categoryDictItem', 'university', 'crawlSource'])
->where('status', 1)
->orderBy('sort')
->orderBy('name');
@ -60,22 +63,24 @@ class CrawlAddressController extends Controller
return $this->ok(['items' => $items]);
}
public function store(Request $request): JsonResponse
public function store(Request $request, CrawlSourceResolver $resolver): JsonResponse
{
$data = $this->validatePayload($request);
$this->assignCrawlSourceId($data, $resolver);
$row = CrawlAddress::query()->create($data);
return $this->ok($this->serialize($row->fresh(['categoryDictItem', 'university'])), '已创建');
return $this->ok($this->serialize($row->fresh(['categoryDictItem', 'university', 'crawlSource'])), '已创建');
}
public function update(Request $request, int $crawlAddress): JsonResponse
public function update(Request $request, int $crawlAddress, CrawlSourceResolver $resolver): JsonResponse
{
$row = CrawlAddress::query()->findOrFail($crawlAddress);
$data = $this->validatePayload($request, $row);
$this->assignCrawlSourceId($data, $resolver);
$row->fill($data);
$row->save();
return $this->ok($this->serialize($row->fresh(['categoryDictItem', 'university'])), '已保存');
return $this->ok($this->serialize($row->fresh(['categoryDictItem', 'university', 'crawlSource'])), '已保存');
}
public function destroy(int $crawlAddress): JsonResponse
@ -99,6 +104,7 @@ class CrawlAddressController extends Controller
'target_type' => ['required', 'in:paper,industry_news,teacher'],
'name' => ['required', 'string', 'max:128'],
'request_url' => ['required', 'url', 'max:512'],
'crawl_source_id' => ['nullable', 'integer', 'exists:crawl_sources,id'],
'keyword' => ['nullable', 'string', 'max:512'],
'category_dict_item_id' => [
'nullable',
@ -137,6 +143,21 @@ class CrawlAddressController extends Controller
return $data;
}
/**
 * Fill in `crawl_source_id` on the payload when the caller did not supply one.
 *
 * A value counts as "not supplied" when it is missing or empty() (null, 0, '').
 * In that case the resolver is asked to suggest a source for the payload's
 * `request_url` / `target_type`; when it suggests nothing the payload is left
 * untouched.
 *
 * @param array<string, mixed> $data Validated payload, modified in place.
 */
protected function assignCrawlSourceId(array &$data, CrawlSourceResolver $resolver): void
{
    $alreadyBound = ! empty($data['crawl_source_id']);

    if (! $alreadyBound) {
        $suggested = $resolver->suggestSource($data['request_url'], $data['target_type']);

        if ($suggested) {
            $data['crawl_source_id'] = $suggested->id;
        }
    }
}
/**
* @return array<string, mixed>
*/
@ -147,6 +168,9 @@ class CrawlAddressController extends Controller
'target_type' => $row->target_type,
'name' => $row->name,
'request_url' => $row->request_url,
'crawl_source_id' => $row->crawl_source_id,
'crawl_source_name' => $row->crawlSource?->name,
'adapter_code' => $row->crawlSource?->adapter_code,
'keyword' => $row->keyword,
'category_dict_item_id' => $row->category_dict_item_id,
'category_label' => $row->categoryDictItem?->label,

@ -29,9 +29,14 @@ class CrawlJobController extends Controller
$data = $request->validate([
'request_url' => ['required', 'url', 'max:512'],
'target_type' => ['required', 'in:paper,industry_news,teacher'],
'crawl_address_id' => ['nullable', 'integer', 'exists:crawl_addresses,id'],
]);
$source = $resolver->resolve($data['request_url'], $data['target_type']);
$source = $resolver->resolve(
$data['request_url'],
$data['target_type'],
isset($data['crawl_address_id']) ? (int) $data['crawl_address_id'] : null,
);
if (! $source) {
return $this->fail('无法识别该地址,请确认 URL 可访问且入库类型正确', 422);
}
@ -117,11 +122,16 @@ class CrawlJobController extends Controller
'news_defaults' => ['nullable', 'array'],
'news_defaults.source' => ['nullable', 'string', 'max:128'],
'news_defaults.category_dict_item_id' => ['nullable', 'integer'],
'crawl_address_id' => ['nullable', 'integer', 'exists:crawl_addresses,id'],
]);
$params = $data['params'] ?? [];
$source = $resolver->resolve($data['request_url'], $data['target_type']);
$source = $resolver->resolve(
$data['request_url'],
$data['target_type'],
isset($data['crawl_address_id']) ? (int) $data['crawl_address_id'] : null,
);
if (! $source) {
return $this->fail('无法识别该地址对应的采集源', 422);
}

@ -11,6 +11,7 @@ class CrawlAddress extends Model
'target_type',
'name',
'request_url',
'crawl_source_id',
'keyword',
'category_dict_item_id',
'university_id',
@ -20,6 +21,7 @@ class CrawlAddress extends Model
];
protected $casts = [
'crawl_source_id' => 'integer',
'category_dict_item_id' => 'integer',
'university_id' => 'integer',
'sort' => 'integer',
@ -35,4 +37,9 @@ class CrawlAddress extends Model
{
return $this->belongsTo(University::class);
}
/**
 * Relation to the CrawlSource this address is bound to.
 *
 * No explicit keys are passed to belongsTo(), so Eloquent's default key
 * naming applies (presumably the `crawl_source_id` column listed in this
 * model's fillable/casts — confirm against the framework convention).
 */
public function crawlSource(): BelongsTo
{
return $this->belongsTo(CrawlSource::class);
}
}

@ -66,6 +66,11 @@ class CrawlAddressSourceResolver
?? $this->resolveBySourceUrl($articleSourceUrl, $targetType);
}
/**
 * Public entry point to this class's URL normalisation.
 *
 * Delegates directly to normalizeUrl() and returns its result unchanged, so
 * other services can compare request URLs using the same normal form.
 */
public function normalizeRequestUrl(?string $url): string
{
    $normalized = $this->normalizeUrl($url);

    return $normalized;
}
/**
* @return list<string>
*/

@ -2,11 +2,72 @@
namespace App\Services\Crawl;
use App\Models\CrawlAddress;
use App\Models\CrawlSource;
class CrawlSourceResolver
{
public function resolve(string $requestUrl, string $targetType): ?CrawlSource
/**
 * @param CrawlAddressSourceResolver $crawlAddressSourceResolver Used by this class to normalise
 *                                                               request URLs before comparing them.
 */
public function __construct(
protected CrawlAddressSourceResolver $crawlAddressSourceResolver,
) {}
/**
 * Resolve the crawl source for a request URL and target type.
 *
 * A source bound through a crawl address (looked up by id and/or by
 * normalised URL) takes precedence; only when none is found does the
 * URL-based core resolution run.
 *
 * @param int|null $crawlAddressId Optional id of the crawl address the request came from.
 */
public function resolve(string $requestUrl, string $targetType, ?int $crawlAddressId = null): ?CrawlSource
{
    return $this->resolveFromCrawlAddress($requestUrl, $targetType, $crawlAddressId)
        ?? $this->resolveCore($requestUrl, $targetType);
}
/**
 * Suggest a source for a URL using only the URL-based core resolution.
 *
 * Unlike resolve(), crawl-address bindings are not consulted here.
 */
public function suggestSource(string $requestUrl, string $targetType): ?CrawlSource
{
    $suggestion = $this->resolveCore($requestUrl, $targetType);

    return $suggestion;
}
/**
 * Resolve a source through crawl-address bindings.
 *
 * Lookup order:
 *  1. The enabled address with the given id (when provided) and matching target type.
 *  2. Enabled addresses of the same target type whose normalised request URL equals
 *     the normalised input URL, in `sort`, then `name` order.
 *
 * In both steps a binding only counts when findActiveSource() accepts the bound
 * source; otherwise the search continues. Returns null when nothing usable is found.
 */
protected function resolveFromCrawlAddress(string $requestUrl, string $targetType, ?int $crawlAddressId): ?CrawlSource
{
    if ($crawlAddressId) {
        $address = CrawlAddress::query()
            ->whereKey($crawlAddressId)
            ->where('status', 1)
            ->where('target_type', $targetType)
            ->first();

        if ($address?->crawl_source_id) {
            $source = $this->findActiveSource((int) $address->crawl_source_id, $targetType);

            // Only stop here when the bound source is actually usable. Returning null
            // straight away would skip the URL-based matching below, which is
            // inconsistent with how that loop treats unusable sources.
            if ($source) {
                return $source;
            }
        }
    }

    $normalized = $this->crawlAddressSourceResolver->normalizeRequestUrl($requestUrl);
    if ($normalized === '') {
        return null;
    }

    $addresses = CrawlAddress::query()
        ->where('target_type', $targetType)
        ->where('status', 1)
        ->whereNotNull('crawl_source_id')
        ->orderBy('sort')
        ->orderBy('name')
        ->get();

    foreach ($addresses as $address) {
        // Stored URLs are normalised at comparison time, so the match is done in PHP.
        if ($this->crawlAddressSourceResolver->normalizeRequestUrl($address->request_url) !== $normalized) {
            continue;
        }

        $source = $this->findActiveSource((int) $address->crawl_source_id, $targetType);
        if ($source) {
            return $source;
        }
    }

    return null;
}
protected function resolveCore(string $requestUrl, string $targetType): ?CrawlSource
{
$host = strtolower((string) parse_url($requestUrl, PHP_URL_HOST));
if ($host === '') {
@ -43,6 +104,15 @@ class CrawlSourceResolver
return $wildcard;
}
/**
 * Fetch a crawl source by primary key, but only if its status is 1 and its
 * target type equals the requested one. Returns null otherwise.
 */
protected function findActiveSource(int $crawlSourceId, string $targetType): ?CrawlSource
{
    $eligible = CrawlSource::query()
        ->where('target_type', $targetType)
        ->where('status', 1);

    return $eligible->find($crawlSourceId);
}
protected function hostMatchesDomain(string $host, string $domain): bool
{
$host = $this->normalizeHost($host);
@ -73,8 +143,6 @@ class CrawlSourceResolver
}
/**
* 域名未精确命中时,按 URL 特征回退(如 pedaily 子域、arxiv 路径)。
*
* @param \Illuminate\Support\Collection<int, CrawlSource> $sources
*/
protected function resolveByUrlHint(string $requestUrl, string $targetType, $sources): ?CrawlSource

@ -0,0 +1,42 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Add a nullable `crawl_source_id` foreign key to `crawl_addresses` and
     * backfill it for existing teacher addresses that point at the SJTU AI
     * research-centre pages.
     */
    public function up(): void
    {
        Schema::table('crawl_addresses', function (Blueprint $table) {
            // Nullable and set to NULL when the referenced source row is deleted.
            $table->foreignId('crawl_source_id')
                ->nullable()
                ->after('request_url')
                ->constrained('crawl_sources')
                ->nullOnDelete();
        });

        $sourceId = DB::table('crawl_sources')
            ->where('adapter_code', 'ai_sjtu_research_center_api')
            ->where('target_type', 'teacher')
            ->value('id');

        // Backfill only when the dedicated source row exists in this database.
        if ($sourceId) {
            DB::table('crawl_addresses')
                ->where('target_type', 'teacher')
                // A single "contains" pattern is enough: it also matches URLs that
                // end with the fragment, so no separate suffix pattern is needed.
                ->where('request_url', 'like', '%ai.sjtu.edu.cn/center%')
                ->update(['crawl_source_id' => $sourceId]);
        }
    }

    /**
     * Drop the foreign key constraint together with the `crawl_source_id` column.
     */
    public function down(): void
    {
        Schema::table('crawl_addresses', function (Blueprint $table) {
            $table->dropConstrainedForeignId('crawl_source_id');
        });
    }
};

@ -2,6 +2,7 @@
namespace Tests\Unit;
use App\Models\CrawlAddress;
use App\Models\CrawlSource;
use App\Services\Crawl\CrawlSourceResolver;
use Illuminate\Foundation\Testing\RefreshDatabase;
@ -33,10 +34,47 @@ class CrawlSourceResolverTest extends TestCase
'sort' => 25,
]);
$resolver = new CrawlSourceResolver;
$resolver = app(CrawlSourceResolver::class);
$source = $resolver->resolve('https://ai.sjtu.edu.cn/center', 'teacher');
$this->assertNotNull($source);
$this->assertSame('ai_sjtu_research_center_api', $source->adapter_code);
}
/**
 * A source bound to a crawl address must win over the wildcard source, even
 * when the bound source's own domain list does not cover the request URL.
 */
public function test_prefers_crawl_address_bound_source_over_wildcard(): void
{
    $wildcard = CrawlSource::query()->create([
        'name' => '师资列表页(通用 HTML',
        'target_type' => 'teacher',
        'adapter_code' => 'faculty_list_html',
        'entry_url' => 'https://',
        'match_domains' => ['*'],
        'status' => 1,
        'sort' => 30,
    ]);

    // The domain list deliberately does NOT cover the request host. Otherwise
    // plain domain matching would already select this source and the test would
    // pass without the address binding being consulted at all.
    $aiSource = CrawlSource::query()->create([
        'name' => '交大人工智能研究院研究中心',
        'target_type' => 'teacher',
        'adapter_code' => 'ai_sjtu_research_center_api',
        'entry_url' => 'https://ai.sjtu.edu.cn/center',
        'match_domains' => ['unmatched.example.invalid'],
        'status' => 1,
        'sort' => 25,
    ]);

    CrawlAddress::query()->create([
        'target_type' => 'teacher',
        'name' => '交大 AI 中心',
        'request_url' => 'https://ai.sjtu.edu.cn/center',
        'crawl_source_id' => $aiSource->id,
        'status' => 1,
    ]);

    $resolver = app(CrawlSourceResolver::class);
    $source = $resolver->resolve('https://ai.sjtu.edu.cn/center', 'teacher');

    // Fail loudly on null instead of letting null-safe access mask it.
    $this->assertNotNull($source);
    $this->assertSame($aiSource->id, $source->id);
    $this->assertSame('ai_sjtu_research_center_api', $source->adapter_code);
    $this->assertNotSame($wildcard->id, $source->id);
}
}

Loading…
Cancel
Save