|
|
<?php
|
|
|
|
|
|
namespace App\Console\Commands;
|
|
|
|
|
|
use App\Models\CrawlJob;
|
|
|
use App\Models\News;
|
|
|
use App\Services\Crawl\CrawlAddressSourceResolver;
|
|
|
use Illuminate\Console\Command;
|
|
|
|
|
|
/**
 * Artisan command that rewrites the `source` / `source_site` columns of News
 * rows whose current value is one of a set of "old" source names, replacing
 * them with the name returned by CrawlAddressSourceResolver::resolveForNews().
 *
 * Use --dry-run to print the planned changes without writing anything.
 */
class FixCrawlNewsSourceCommand extends Command
{
    /**
     * Command signature.
     *
     * --from=*   Old source values to replace (may be repeated). When omitted,
     *            the list comes from the resolver's genericAdapterSourceNames().
     * --dry-run  Preview only; no database writes.
     *
     * @var string
     */
    protected $signature = 'crawl:fix-news-source
        {--from=* : 需要替换的旧来源值,默认:通用资讯 HTML、爬虫采集}
        {--dry-run : 仅预览,不写入数据库}';

    /**
     * Human-readable description shown in the Artisan command list.
     *
     * @var string
     */
    protected $description = '将爬虫入库资讯的来源字段修正为爬虫地址名称(如交大要闻、南大要闻)';

    /**
     * Walk all matching News rows in id-ordered chunks and fix their source.
     *
     * A row is counted as skipped when the resolver returns null / an empty
     * string, or when both columns already equal the resolved name. Every
     * other row is counted as updated (in dry-run mode it is only reported).
     *
     * @param CrawlAddressSourceResolver $resolver Resolves the new source name
     *                                             from a job URL and a news URL.
     *
     * @return int Always self::SUCCESS.
     */
    public function handle(CrawlAddressSourceResolver $resolver): int
    {
        $fromValues = $this->option('from');
        if ($fromValues === [] || $fromValues === null) {
            // No explicit --from given: fall back to the resolver-provided list.
            $fromValues = $resolver->genericAdapterSourceNames();
        }

        $dryRun = (bool) $this->option('dry-run');
        $updated = 0;
        $skipped = 0;

        News::query()
            ->where(static function ($query) use ($fromValues) {
                // Grouped so the OR does not leak into other constraints.
                $query->whereIn('source', $fromValues)
                    ->orWhereIn('source_site', $fromValues);
            })
            ->orderBy('id')
            ->chunkById(200, function ($rows) use ($resolver, $dryRun, &$updated, &$skipped) {
                /** @var \Illuminate\Support\Collection<int, News> $rows */

                // One query per chunk instead of one query per news row.
                $jobUrls = $this->loadJobRequestUrls($rows);

                foreach ($rows as $news) {
                    $jobUrl = $news->crawl_job_id
                        ? ($jobUrls[$news->crawl_job_id] ?? null)
                        : null;

                    $newSource = $resolver->resolveForNews($jobUrl, $news->source_url);

                    if ($newSource === null || $newSource === '') {
                        $skipped++;
                        $this->line(sprintf(
                            '跳过 #%d %s(无法匹配爬虫地址,job=%s url=%s)',
                            $news->id,
                            $news->title,
                            $news->crawl_job_id ?: '—',
                            $news->source_url ?: '—',
                        ));

                        continue;
                    }

                    // Both columns already hold the resolved name: nothing to do.
                    if ($news->source === $newSource && $news->source_site === $newSource) {
                        $skipped++;

                        continue;
                    }

                    $this->line(sprintf(
                        '%s #%d:%s → %s',
                        $dryRun ? '预览' : '更新',
                        $news->id,
                        $news->source ?: '—',
                        $newSource,
                    ));

                    if (! $dryRun) {
                        $news->update([
                            'source' => $newSource,
                            'source_site' => $newSource,
                        ]);
                    }

                    // Counted in dry-run too, so the preview reports how many rows would change.
                    $updated++;
                }
            });

        $this->info(sprintf(
            '%s完成:修正 %d 条,跳过 %d 条。',
            $dryRun ? '预览' : '批量',
            $updated,
            $skipped,
        ));

        if ($dryRun && $updated > 0) {
            $this->comment('确认无误后,去掉 --dry-run 再执行一次即可写入。');
        }

        return self::SUCCESS;
    }

    /**
     * Load the `request_url` of every CrawlJob referenced by the given rows
     * with a single query.
     *
     * @param iterable<News> $rows News rows of the current chunk.
     *
     * @return array<int|string, mixed> Map of crawl job key => request_url.
     *                                  Jobs that do not exist are absent.
     */
    private function loadJobRequestUrls(iterable $rows): array
    {
        $jobIds = [];
        foreach ($rows as $news) {
            if ($news->crawl_job_id) {
                // Used as a set: array keys de-duplicate the ids.
                $jobIds[$news->crawl_job_id] = true;
            }
        }

        if ($jobIds === []) {
            return [];
        }

        return CrawlJob::query()
            ->whereKey(array_keys($jobIds))
            ->pluck('request_url', (new CrawlJob())->getKeyName())
            ->all();
    }
}
|