mergeNormalizedRequestUrl($request);

        $data = $request->validate([
            'request_url' => ['required', 'url', 'max:512'],
            'target_type' => ['required', 'in:paper,industry_news,teacher'],
        ]);

        $source = $resolver->resolve($data['request_url'], $data['target_type']);
        if (! $source) {
            return $this->fail('无法识别该地址,请确认 URL 可访问且入库类型正确', 422);
        }

        return $this->ok([
            'crawl_source_id' => $source->id,
            'source_name' => $source->name,
            'adapter_code' => $source->adapter_code,
            'target_type' => $source->target_type,
            'param_schema' => $source->param_schema,
            'entry_url' => $source->entry_url,
        ]);
    }

    /**
     * Return a single crawl job (with its crawl source eager-loaded) in serialized form.
     *
     * Uses findOrFail(), so an unknown id aborts the request instead of returning null.
     */
    public function show(int $crawlJob): JsonResponse
    {
        $job = CrawlJob::query()->with('crawlSource')->findOrFail($crawlJob);

        return $this->ok($this->serializeJob($job));
    }

    /**
     * Paginated list of the items collected by one crawl job, newest id first.
     *
     * Supported query-string parameters:
     *  - item_kind: one of paper / teacher_lead / teacher; any other value applies no filter.
     *  - keyword:   substring matched against title or canonical_url with LIKE.
     *               NOTE(review): LIKE wildcards (% and _) inside the keyword are not escaped.
     *  - page_size: clamped to the range 1..500, default 100.
     *
     * For item_kind=teacher_lead the current page is re-sorted in memory so rows that carry a
     * university/school come first, then by lead author name. The sort is per page only.
     */
    public function items(Request $request, int $crawlJob): JsonResponse
    {
        $job = CrawlJob::query()->findOrFail($crawlJob);

        $query = CrawlJobItem::query()
            ->where('crawl_job_id', $job->id)
            ->orderByDesc('id');

        $kind = $request->query('item_kind');
        if (in_array($kind, ['paper', 'teacher_lead', 'teacher'], true)) {
            $query->where('target_type', $kind);
        }

        // Require a non-empty string: a truthiness test would drop the keyword "0", and an
        // array-valued parameter (keyword[]=x) must not be interpolated into the LIKE pattern.
        $keyword = $request->query('keyword');
        if (is_string($keyword) && $keyword !== '') {
            $query->where(function ($q) use ($keyword) {
                $q->where('title', 'like', "%{$keyword}%")
                    ->orWhere('canonical_url', 'like', "%{$keyword}%");
            });
        }

        $pageSize = min(500, max(1, (int) $request->query('page_size', 100)));

        $paginator = $query->paginate($pageSize)->withQueryString();
        $paginator->getCollection()->transform(fn (CrawlJobItem $i) => $this->serializeItem($i));

        if ($kind === 'teacher_lead') {
            $hasSchool = static fn (array $row): bool => ($row['lead_author_university'] ?? '') !== ''
                || ($row['school_name'] ?? '') !== '';

            $sorted = $paginator->getCollection()
                ->sort(function (array $a, array $b) use ($hasSchool) {
                    $aHas = $hasSchool($a);
                    $bHas = $hasSchool($b);
                    if ($aHas !== $bHas) {
                        return $aHas ? -1 : 1;
                    }

                    return strcmp(
                        (string) ($a['lead_author_name'] ?? ''),
                        (string) ($b['lead_author_name'] ?? ''),
                    );
                })
                ->values();

            $paginator->setCollection($sorted);
        }

        return $this->paginated($paginator);
    }

    /**
     * Create a crawl job for the given URL, run it, then import everything it fetched.
     *
     * Flow: validate input -> resolve the crawl source -> create a pending CrawlJob -> run it and
     * stamp the optional defaults onto the preview items -> import with select-all -> store a
     * result summary on the job and return the serialized job.
     *
     * A failure while crawling marks the job as failed and answers with HTTP 500. A failure while
     * importing still answers through ok(), with a message that reports the import error.
     */
    public function store(
        Request $request,
        CrawlSourceResolver $resolver,
        CrawlJobRunnerService $runner,
        CrawlImportService $importService,
    ): JsonResponse {
        $this->mergeNormalizedRequestUrl($request);

        $data = $request->validate([
            'target_type' => ['required', 'in:paper,industry_news,teacher'],
            'request_url' => ['required', 'url', 'max:512'],
            'params' => ['nullable', 'array'],
            'teacher_defaults' => ['nullable', 'array'],
            'teacher_defaults.university_id' => ['nullable', 'integer', 'exists:universities,id'],
            'teacher_defaults.city' => ['nullable', 'string', 'max:64'],
            'teacher_defaults.research_direction_ids' => ['nullable', 'array'],
            'teacher_defaults.research_direction_ids.*' => ['integer', 'exists:research_directions,id'],
            'news_defaults' => ['nullable', 'array'],
            'news_defaults.source' => ['nullable', 'string', 'max:128'],
            'news_defaults.category_dict_item_id' => ['nullable', 'integer'],
        ]);

        $params = $data['params'] ?? [];
        $teacherDefaults = $data['teacher_defaults'] ?? [];
        $newsDefaults = $data['news_defaults'] ?? [];

        $source = $resolver->resolve($data['request_url'], $data['target_type']);
        if (! $source) {
            return $this->fail('无法识别该地址对应的采集源', 422);
        }

        $job = CrawlJob::query()->create([
            'target_type' => $data['target_type'],
            'request_url' => $data['request_url'],
            'platform_url' => $data['request_url'],
            'keyword' => (string) ($params['keyword'] ?? ''),
            'params' => $params,
            'crawl_source_id' => $source->id,
            'adapter_code' => $source->adapter_code,
            'admin_user_id' => $this->resolveCrawlJobAdminUserId($request),
            'status' => 'pending',
        ]);

        try {
            $job = $runner->run($job, $source, $params);
            $this->applyCrawlDefaultsToPreviewItems($job, $newsDefaults, $teacherDefaults);
        } catch (\Throwable $e) {
            // Persist the failure on the job so it stays visible after this request ends.
            $job->update([
                'status' => 'failed',
                'result_summary' => '抓取失败:'.$e->getMessage(),
                'completed_at' => now(),
            ]);

            return $this->fail('抓取任务失败:'.$e->getMessage(), 500, ['id' => $job->id]);
        }

        try {
            // item ids = null together with select-all = true (same argument order as import()).
            $importResult = $importService->import($job, null, true, $teacherDefaults, $newsDefaults);
        } catch (\Throwable $e) {
            // The crawl itself succeeded, so the job is still returned through ok().
            return $this->ok(
                $this->serializeJob($job->fresh(['crawlSource'])),
                '抓取完成,自动入库失败:'.$e->getMessage()
            );
        }

        $job = $job->fresh(['crawlSource']);
        $fetched = (int) ($job->items_fetched ?? 0);
        $sourceName = $job->crawlSource?->name ?? '采集源';

        $job->update([
            'result_summary' => $this->buildImportResultSummary($job, $fetched, $importResult, $sourceName),
        ]);

        $successMessage = $this->buildImportSuccessMessage($job, $importResult);

        return $this->ok($this->serializeJob($job->fresh(['crawlSource'])), $successMessage);
    }

    /**
     * Import selected (or all) items of an existing crawl job.
     *
     * The request may carry explicit item_ids, a select_all flag, and the same optional
     * teacher/news defaults accepted by store(). Returns the import counters plus the job's
     * refreshed items_imported value.
     */
    public function import(Request $request, int $crawlJob, CrawlImportService $importService): JsonResponse
    {
        $job = CrawlJob::query()->findOrFail($crawlJob);

        $data = $request->validate([
            'item_ids' => ['nullable', 'array'],
            'item_ids.*' => ['integer'],
            'select_all' => ['nullable', 'boolean'],
            'teacher_defaults' => ['nullable', 'array'],
            'teacher_defaults.university_id' => ['nullable', 'integer', 'exists:universities,id'],
            'teacher_defaults.city' => ['nullable', 'string', 'max:64'],
            'teacher_defaults.research_direction_ids' => ['nullable', 'array'],
            'teacher_defaults.research_direction_ids.*' => ['integer', 'exists:research_directions,id'],
            'news_defaults' => ['nullable', 'array'],
            'news_defaults.source' => ['nullable', 'string', 'max:128'],
            'news_defaults.category_dict_item_id' => ['nullable', 'integer'],
        ]);

        $result = $importService->import(
            $job,
            $data['item_ids'] ?? null,
            (bool) ($data['select_all'] ?? false),
            $data['teacher_defaults'] ?? [],
            $data['news_defaults'] ?? [],
        );

        return $this->ok([
            'imported' => $result['imported'],
            'imported_primary' => $result['imported_primary'],
            'skipped' => $result['skipped'],
            'failed' => $result['failed'],
            'items_imported' => $job->fresh()->items_imported,
        ], $this->buildImportSuccessMessage($job, $result));
    }

    /**
     * Edit the per-item import settings (news category and import source) of one news item.
     *
     * Only rows of the given job whose target_type is "news" can be edited. The category must be
     * an active dict item of the active "news_category" dict type. A null or absent
     * category_dict_item_id leaves the stored category untouched.
     */
    public function updateItem(Request $request, int $crawlJob, int $item): JsonResponse
    {
        // Existence check only; the job model itself is not needed below.
        CrawlJob::query()->findOrFail($crawlJob);

        $row = CrawlJobItem::query()
            ->where('crawl_job_id', $crawlJob)
            ->where('target_type', 'news')
            ->findOrFail($item);

        $typeId = DictType::query()
            ->where('code', 'news_category')
            ->where('status', 1)
            ->value('id');
        if (! $typeId) {
            return $this->fail('资讯分类字典未配置', 422);
        }

        $data = $request->validate([
            'category_dict_item_id' => [
                'nullable',
                'integer',
                \Illuminate\Validation\Rule::exists('dict_items', 'id')->where(
                    fn ($q) => $q->where('dict_type_id', $typeId)->where('status', 1)
                ),
            ],
            'import_source' => ['nullable', 'string', 'max:128'],
        ]);

        $payload = $row->payload ?? [];
        $extra = $payload['extra'] ?? [];

        if (array_key_exists('category_dict_item_id', $data) && $data['category_dict_item_id']) {
            $dictItem = DictItem::query()->findOrFail($data['category_dict_item_id']);
            $extra['category_dict_item_id'] = (int) $dictItem->id;
            $extra['category_label'] = $dictItem->label;
        }

        if (array_key_exists('import_source', $data)) {
            // An explicit null clears the value: (string) null is '' after trim().
            $extra['import_source'] = trim((string) $data['import_source']);
        }

        $payload['extra'] = $extra;
        $row->update(['payload' => $payload]);

        return $this->ok($this->serializeItem($row->fresh()), '已更新');
    }

    /**
     * Build the API representation of a crawl job, including per-kind item counters.
     *
     * The counters come from one grouped query over (target_type, status) rather than one COUNT
     * query per counter. Group keys are compared exactly against the literals used below.
     *
     * @return array<string, mixed>
     */
    protected function serializeJob(CrawlJob $job): array
    {
        $rows = CrawlJobItem::query()
            ->where('crawl_job_id', $job->id)
            ->whereIn('status', ['preview', 'imported'])
            ->selectRaw('target_type, status, COUNT(*) AS item_count')
            ->groupBy('target_type', 'status')
            ->toBase()
            ->get();

        // $counts[status][target_type] => number of items.
        $counts = ['preview' => [], 'imported' => []];
        foreach ($rows as $row) {
            $counts[(string) $row->status][(string) $row->target_type] = (int) $row->item_count;
        }

        $preview = $counts['preview'];
        $imported = $counts['imported'];

        return [
            'id' => $job->id,
            'target_type' => $job->target_type,
            'request_url' => $job->request_url ?? $job->platform_url,
            'platform_url' => $job->platform_url,
            'keyword' => $job->keyword,
            'params' => $job->params,
            'status' => $job->status,
            'source_name' => $job->crawlSource?->name,
            'adapter_code' => $job->adapter_code,
            'items_fetched' => $job->items_fetched ?? $job->papers_created,
            'items_imported' => $job->items_imported ?? 0,
            'papers_created' => $job->papers_created,
            // Preview items of every target type.
            'preview_count' => array_sum($preview),
            'preview_paper_count' => $preview['paper'] ?? 0,
            'preview_teacher_lead_count' => $preview['teacher_lead'] ?? 0,
            'preview_teacher_count' => $preview['teacher'] ?? 0,
            'papers_imported' => $imported['paper'] ?? 0,
            'teacher_leads_imported' => $imported['teacher_lead'] ?? 0,
            'result_summary' => $job->result_summary,
            'completed_at' => $job->completed_at?->toIso8601String(),
        ];
    }

    /**
     * Build the API representation of one crawl job item from its columns and payload.
     *
     * Lead-author fields are read from payload['lead_author'] when that entry is an array;
     * affiliation and academic title fall back to the payload's top-level college_name /
     * academic_title. Category and import source are read from payload['extra'].
     *
     * @return array<string, mixed>
     */
    protected function serializeItem(CrawlJobItem $item): array
    {
        $payload = $item->payload ?? [];

        // Normalise once so every lookup below can rely on `??` instead of repeated is_array() checks.
        $lead = $payload['lead_author'] ?? null;
        $lead = is_array($lead) ? $lead : [];
        $extra = $payload['extra'] ?? [];

        return [
            'id' => $item->id,
            'external_id' => $item->external_id,
            'title' => $item->title,
            'authors' => $payload['authors'] ?? null,
            'school_name' => $payload['school_name'] ?? $lead['university_name'] ?? null,
            'published_at' => $this->formatPublishedAt($payload['published_at'] ?? null),
            'lead_author_name' => $lead['name'] ?? null,
            'lead_author_email' => $lead['email'] ?? null,
            'lead_author_affiliation' => $lead['college']
                ?? $lead['affiliation']
                ?? $payload['college_name']
                ?? null,
            'lead_author_university' => $lead['university_name'] ?? null,
            'lead_author_academic_title' => $lead['academic_title'] ?? $payload['academic_title'] ?? null,
            'paper_title' => $payload['paper_title'] ?? null,
            'paper_external_id' => $payload['paper_external_id'] ?? null,
            'url' => $item->canonical_url,
            'summary' => $payload['summary'] ?? null,
            'content_html' => $payload['content_html'] ?? null,
            'section' => $payload['section'] ?? null,
            'category_dict_item_id' => isset($extra['category_dict_item_id'])
                ? (int) $extra['category_dict_item_id']
                : null,
            'category_label' => $extra['category_label'] ?? null,
            'import_source' => $extra['import_source'] ?? null,
            'status' => $item->status,
            'target_type' => $item->target_type,
            'source_name' => $item->source_name,
            'selectable' => $item->status === 'preview',
            'is_duplicate' => $item->status === 'duplicate',
        ];
    }

    /**
     * Reduce a payload date to YYYY-MM-DD when it starts with such a date.
     *
     * Returns null for empty values and for values that cannot be cast to string (arrays,
     * non-stringable objects). Any other value is returned as a string unchanged.
     */
    protected function formatPublishedAt(mixed $value): ?string
    {
        if (! $value) {
            return null;
        }

        // Casting an array or a plain object to string would raise; treat them as "no date".
        if (! is_scalar($value) && ! $value instanceof \Stringable) {
            return null;
        }

        $str = (string) $value;

        // The pattern matches exactly ten characters, so the match already is the date part.
        if (preg_match('/^\d{4}-\d{2}-\d{2}/', $str, $m) === 1) {
            return $m[0];
        }

        return $str;
    }

    /**
     * Build the summary text stored on the job after an automatic import.
     *
     * The wording depends on the job's target type: paper, teacher, or (any other value) news.
     *
     * @param array{
     *     imported:int,
     *     imported_primary:int,
     *     skipped:int,
     *     failed:int,
     *     papers_imported:int,
     *     teacher_leads_imported:int,
     *     teachers_imported:int,
     *     news_imported:int
     * } $importResult
     */
    protected function buildImportResultSummary(
        CrawlJob $job,
        int $fetched,
        array $importResult,
        string $sourceName,
    ): string {
        return match ($job->target_type) {
            'paper' => sprintf(
                '已从 %s 抓取 %d 篇论文,已入库 %d 篇论文、%d 位第一作者',
                $sourceName,
                $fetched,
                (int) ($importResult['papers_imported'] ?? 0),
                (int) ($importResult['teacher_leads_imported'] ?? 0),
            ),
            'teacher' => sprintf(
                '已从 %s 抓取 %d 位老师,已入库 %d 位老师',
                $sourceName,
                $fetched,
                (int) ($importResult['teachers_imported'] ?? 0),
            ),
            default => sprintf(
                '已从 %s 抓取 %d 条资讯,已入库 %d 条资讯',
                $sourceName,
                $fetched,
                (int) ($importResult['news_imported'] ?? 0),
            ),
        };
    }

    /**
     * Build the success message returned to the client after an import.
     *
     * The wording depends on the job's target type: paper, teacher, or (any other value) news.
     *
     * @param array{
     *     imported:int,
     *     imported_primary:int,
     *     skipped:int,
     *     failed:int,
     *     papers_imported:int,
     *     teacher_leads_imported:int,
     *     teachers_imported:int,
     *     news_imported:int
     * } $importResult
     */
    protected function buildImportSuccessMessage(CrawlJob $job, array $importResult): string
    {
        $count = static fn (string $key): int => (int) ($importResult[$key] ?? 0);

        if ($job->target_type === 'paper') {
            $papers = $count('papers_imported');
            $leads = $count('teacher_leads_imported');

            return "抓取完成,已入库 {$papers} 篇论文、{$leads} 位第一作者";
        }

        if ($job->target_type === 'teacher') {
            $teachers = $count('teachers_imported');

            return "抓取完成,已入库 {$teachers} 位老师";
        }

        $news = $count('news_imported');

        return "抓取完成,已入库 {$news} 条资讯";
    }

    /**
     * Determine which admin user id to record on a new crawl job.
     *
     * A MiniappUser contributes its admin_user_id, but only when that admin user still exists.
     * An AdminUser contributes its own id. Any other caller yields null.
     */
    protected function resolveCrawlJobAdminUserId(Request $request): ?int
    {
        $user = $request->user();

        if ($user instanceof MiniappUser) {
            $adminUserId = $user->admin_user_id;
            if ($adminUserId && AdminUser::query()->whereKey($adminUserId)->exists()) {
                return (int) $adminUserId;
            }

            return null;
        }

        if ($user instanceof AdminUser) {
            return $user->id;
        }

        return null;
    }

    /**
     * Stamp request-level defaults onto the job's preview items, inside payload['extra'].
     *
     * - industry_news jobs: copy the chosen category id and its label onto every preview item
     *   whose target_type is "news". Nothing is written when the dict item does not exist.
     * - teacher jobs: copy the default university id onto every preview teacher item.
     *
     * @param array{source?:string, category_dict_item_id?:int} $newsDefaults
     * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $teacherDefaults
     */
    protected function applyCrawlDefaultsToPreviewItems(
        CrawlJob $job,
        array $newsDefaults,
        array $teacherDefaults,
    ): void {
        if ($job->target_type === 'industry_news' && ! empty($newsDefaults['category_dict_item_id'])) {
            $categoryId = (int) $newsDefaults['category_dict_item_id'];
            $dictItem = DictItem::query()->find($categoryId);
            if (! $dictItem) {
                return;
            }

            CrawlJobItem::query()
                ->where('crawl_job_id', $job->id)
                ->where('target_type', 'news')
                ->where('status', 'preview')
                ->get()
                ->each(function (CrawlJobItem $item) use ($categoryId, $dictItem) {
                    $payload = $item->payload ?? [];
                    $extra = $payload['extra'] ?? [];
                    $extra['category_dict_item_id'] = $categoryId;
                    $extra['category_label'] = $dictItem->label;
                    $payload['extra'] = $extra;
                    $item->update(['payload' => $payload]);
                });
        }

        if ($job->target_type === 'teacher' && ! empty($teacherDefaults['university_id'])) {
            $universityId = (int) $teacherDefaults['university_id'];

            CrawlJobItem::query()
                ->where('crawl_job_id', $job->id)
                ->where('target_type', 'teacher')
                ->where('status', 'preview')
                ->get()
                ->each(function (CrawlJobItem $item) use ($universityId) {
                    $payload = $item->payload ?? [];
                    $extra = $payload['extra'] ?? [];
                    $extra['default_university_id'] = $universityId;
                    $payload['extra'] = $extra;
                    $item->update(['payload' => $payload]);
                });
        }
    }

    /**
     * Pasted addresses often lack a scheme; prepend https:// so the value can pass the `url`
     * validation rule and be matched by domain.
     *
     * Values that already start with http:// or https:// are left as they are. Non-scalar input
     * (e.g. an array) is left untouched so that validation rejects it. A protocol-relative
     * "//host/path" has its leading slashes dropped before the scheme is added.
     */
    protected function mergeNormalizedRequestUrl(Request $request): void
    {
        $input = $request->input('request_url', '');
        if (! is_scalar($input)) {
            return;
        }

        $raw = trim((string) $input);
        if ($raw === '' || preg_match('#^https?://#i', $raw)) {
            return;
        }

        // Without this, "//example.com" would become "https:////example.com".
        if (str_starts_with($raw, '//')) {
            $raw = substr($raw, 2);
        }

        $request->merge(['request_url' => 'https://'.$raw]);
    }
}