[ [ 'key' => 'keyword', 'type' => 'textarea', 'label' => '搜索关键词(选填)', 'required' => false, 'placeholder' => '多个关键词用逗号或换行分隔,如:graph neural, AI', ], [ 'key' => 'max_pages', 'type' => 'number', 'label' => '抓取页数', 'default' => 1, 'min' => 1, 'max' => 20, 'placeholder' => 'arXiv 按提交时间分页,每页 50 条', ], [ 'key' => 'max_results', 'type' => 'number', 'label' => '条数上限', 'default' => 50, 'min' => 1, 'max' => 200, ], ], ]; CrawlSource::query()->updateOrCreate( ['adapter_code' => 'arxiv_api', 'target_type' => 'paper'], [ 'name' => 'arXiv', 'entry_url' => 'https://arxiv.org/', 'match_domains' => ['arxiv.org', 'export.arxiv.org'], 'config' => ['api_base' => 'https://export.arxiv.org/api/query'], 'param_schema' => $paperSchema, 'status' => 1, 'sort' => 10, ] ); $newsSchema = [ 'fields' => [ [ 'key' => 'keyword', 'type' => 'textarea', 'label' => '搜索关键词(选填)', 'required' => false, 'placeholder' => '多个关键词用空格、逗号或换行分隔,如:融资 科创板 AI', ], [ 'key' => 'max_pages', 'type' => 'number', 'label' => '抓取页数', 'default' => 5, 'min' => 1, 'max' => 50, 'placeholder' => '列表分页时连续抓取多页', ], [ 'key' => 'max_results', 'type' => 'number', 'label' => '条数上限', 'default' => 30, 'min' => 1, 'max' => 50, ], ], ]; CrawlSource::query()->updateOrCreate( ['adapter_code' => 'huxiu_html', 'target_type' => 'industry_news'], [ 'name' => '虎嗅', 'entry_url' => 'https://www.huxiu.com/', 'match_domains' => ['huxiu.com', 'www.huxiu.com'], 'config' => [], 'param_schema' => $newsSchema, 'status' => 1, 'sort' => 15, ] ); CrawlSource::query() ->where('target_type', 'industry_news') ->where('name', '虎嗅') ->where('adapter_code', '!=', 'huxiu_html') ->delete(); CrawlSource::query()->updateOrCreate( ['adapter_code' => 'pedaily_html', 'target_type' => 'industry_news'], [ 'name' => '投资界', 'entry_url' => 'https://www.pedaily.cn/all/', 'match_domains' => ['pedaily.cn', 'www.pedaily.cn', '*.pedaily.cn'], 'config' => [], 'param_schema' => $newsSchema, 'status' => 1, 'sort' => 20, ] ); CrawlSource::query()->updateOrCreate( ['adapter_code' => 'generic_news_html', 'target_type' => 'industry_news'], [ 'name' => '通用资讯 HTML', 'entry_url' => 'https://', 'match_domains' => ['*'], 'config' => [], 'param_schema' => $newsSchema, 'status' => 1, 'sort' => 100, ] ); CrawlSource::query()->updateOrCreate( ['adapter_code' => 'generic_paper_html', 'target_type' => 'paper'], [ 'name' => '通用论文 HTML', 'entry_url' => 'https://', 'match_domains' => ['*'], 'config' => [], 'param_schema' => $paperSchema, 'status' => 1, 'sort' => 100, ] ); $teacherSchema = [ 'fields' => [ [ 'key' => 'keyword', 'type' => 'textarea', 'label' => '搜索关键词(选填)', 'required' => false, 'placeholder' => '多个关键词用空格、逗号或换行分隔', ], [ 'key' => 'max_pages', 'type' => 'number', 'label' => '抓取页数', 'default' => 5, 'min' => 1, 'max' => 50, 'placeholder' => '列表分页时连续抓取多页', ], [ 'key' => 'max_results', 'type' => 'number', 'label' => '条数上限', 'default' => 200, 'min' => 1, 'max' => 500, ], ], ]; CrawlSource::query()->updateOrCreate( ['adapter_code' => 'faculty_list_html', 'target_type' => 'teacher'], [ 'name' => '师资列表页(通用 HTML)', 'entry_url' => 'https://', 'match_domains' => ['*'], 'config' => [], 'param_schema' => $teacherSchema, 'status' => 1, 'sort' => 30, ] ); CrawlSource::query()->updateOrCreate( ['adapter_code' => 'ai_sjtu_research_center_api', 'target_type' => 'teacher'], [ 'name' => '交大人工智能研究院研究中心', 'entry_url' => 'https://ai.sjtu.edu.cn/center', 'match_domains' => ['ai.sjtu.edu.cn'], 'config' => ['api_base' => 'https://ai.sjtu.edu.cn/api'], 'param_schema' => $teacherSchema, 'status' => 1, 'sort' => 25, ] ); $this->command?->info('采集源 arXiv / 虎嗅 / 投资界 / 通用 HTML / 师资列表 已写入。'); } }