You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

96 lines
3.3 KiB

1 day ago
<?php
namespace App\Console\Commands;
use App\Models\CrawlJob;
use App\Models\News;
use App\Services\Crawl\CrawlAddressSourceResolver;
use Illuminate\Console\Command;
/**
 * Artisan command that rewrites the `source` and `source_site` columns of news
 * rows whose current value is one of a set of "old" source names.
 *
 * For every matching row the replacement name is obtained from
 * CrawlAddressSourceResolver::resolveForNews(), which receives the request URL
 * of the row's crawl job (when the row has one) and the row's own source URL.
 * Rows for which no name can be resolved, or which already carry the resolved
 * name in both columns, are counted as skipped.
 */
class FixCrawlNewsSourceCommand extends Command
{
    /**
     * Console signature: command name plus the --from and --dry-run options.
     *
     * Written as a nowdoc so the code can be indented; the closing marker's
     * indentation is stripped from every line of the resulting string.
     *
     * @var string
     */
    protected $signature = <<<'SIGNATURE'
        crawl:fix-news-source
        {--from=* : 需要替换的旧来源值,默认:通用资讯 HTML、爬虫采集}
        {--dry-run : 仅预览,不写入数据库}
        SIGNATURE;

    /**
     * Human-readable description shown in the Artisan command list.
     *
     * @var string
     */
    protected $description = '将爬虫入库资讯的来源字段修正为爬虫地址名称(如交大要闻、南大要闻)';

    /**
     * Run the correction over all news rows whose source matches an old value.
     *
     * @param CrawlAddressSourceResolver $resolver Supplies the default list of old
     *                                             source names and resolves the
     *                                             replacement name for each row.
     *
     * @return int Always self::SUCCESS.
     */
    public function handle(CrawlAddressSourceResolver $resolver): int
    {
        // Fall back to the resolver's own list when no --from value was given.
        $fromValues = $this->option('from');
        if ($fromValues === [] || $fromValues === null) {
            $fromValues = $resolver->genericAdapterSourceNames();
        }

        $dryRun = (bool) $this->option('dry-run');
        $updated = 0;
        $skipped = 0;

        News::query()
            ->where(function ($query) use ($fromValues) {
                $query->whereIn('source', $fromValues)
                    ->orWhereIn('source_site', $fromValues);
            })
            ->orderBy('id')
            // chunkById pages by primary key, so rows that stop matching the
            // filter after being updated do not shift the following pages.
            ->chunkById(200, function ($rows) use ($resolver, $dryRun, &$updated, &$skipped) {
                /** @var \Illuminate\Support\Collection<int, News> $rows */

                // One lookup per chunk instead of one query per news row.
                $jobUrls = $this->requestUrlsByJobId($rows);

                foreach ($rows as $news) {
                    $jobUrl = $news->crawl_job_id
                        ? ($jobUrls[$news->crawl_job_id] ?? null)
                        : null;

                    $newSource = $resolver->resolveForNews($jobUrl, $news->source_url);

                    if ($newSource === null || $newSource === '') {
                        $skipped++;
                        $this->line(sprintf(
                            '跳过 #%d %s:无法匹配爬虫地址(job=%s, url=%s)',
                            $news->id,
                            $news->title,
                            $news->crawl_job_id ?: '—',
                            $news->source_url ?: '—',
                        ));

                        continue;
                    }

                    // Nothing to do when both columns already hold the resolved name.
                    if ($news->source === $newSource && $news->source_site === $newSource) {
                        $skipped++;

                        continue;
                    }

                    // The separator after the id keeps it distinguishable from a
                    // source value that itself starts with digits.
                    $this->line(sprintf(
                        '%s #%d:%s → %s',
                        $dryRun ? '预览' : '更新',
                        $news->id,
                        $news->source ?: '—',
                        $newSource,
                    ));

                    if (! $dryRun) {
                        $news->update([
                            'source' => $newSource,
                            'source_site' => $newSource,
                        ]);
                    }

                    // In dry-run mode this counts the rows that would be updated.
                    $updated++;
                }
            });

        $this->info(sprintf(
            '%s完成修正 %d 条,跳过 %d 条。',
            $dryRun ? '预览' : '批量',
            $updated,
            $skipped,
        ));

        if ($dryRun && $updated > 0) {
            $this->comment('确认无误后,去掉 --dry-run 再执行一次即可写入。');
        }

        return self::SUCCESS;
    }

    /**
     * Load the request URLs of all crawl jobs referenced by a chunk of news rows.
     *
     * @param iterable<int, News> $rows News rows of the current chunk.
     *
     * @return array<int|string, mixed> `request_url` values keyed by crawl job key;
     *                                  jobs that do not exist are simply absent.
     */
    private function requestUrlsByJobId(iterable $rows): array
    {
        // Collect distinct, truthy job ids; array keys de-duplicate for free.
        $jobIds = [];
        foreach ($rows as $news) {
            if ($news->crawl_job_id) {
                $jobIds[$news->crawl_job_id] = true;
            }
        }

        if ($jobIds === []) {
            return [];
        }

        return CrawlJob::query()
            ->whereKey(array_keys($jobIds))
            ->pluck('request_url', (new CrawlJob())->getKeyName())
            ->all();
    }
}