You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

96 lines
3.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Console\Commands;
use App\Models\CrawlJob;
use App\Models\News;
use App\Services\Crawl\CrawlAddressSourceResolver;
use Illuminate\Console\Command;
/**
 * Artisan command `crawl:fix-news-source`.
 *
 * Walks every News row whose `source` or `source_site` equals one of the
 * "generic" source values and rewrites both columns to the name returned by
 * CrawlAddressSourceResolver::resolveForNews() for that row.
 *
 * Options:
 *  --from=*   Old source values to replace. When omitted, the list comes from
 *             CrawlAddressSourceResolver::genericAdapterSourceNames().
 *  --dry-run  Print what would change without writing to the database.
 */
class FixCrawlNewsSourceCommand extends Command
{
    /**
     * Console signature (command name and options).
     *
     * Written as a nowdoc so the string content is unaffected by the
     * indentation of this file.
     *
     * @var string
     */
    protected $signature = <<<'SIGNATURE'
        crawl:fix-news-source
        {--from=* : 需要替换的旧来源值,默认:通用资讯 HTML、爬虫采集}
        {--dry-run : 仅预览,不写入数据库}
        SIGNATURE;

    /**
     * Human-readable command description shown by Artisan.
     *
     * @var string
     */
    protected $description = '将爬虫入库资讯的来源字段修正为爬虫地址名称(如交大要闻、南大要闻)';

    /**
     * Execute the command.
     *
     * Rows are processed in id-ordered chunks of 200. For each row the new
     * source name is resolved from the crawl job's request URL (if the row has
     * a crawl_job_id) and the row's own source_url. Rows that cannot be
     * resolved, or that already carry the resolved name in both columns, are
     * counted as skipped; all others are counted as updated (and written unless
     * --dry-run is set).
     *
     * @param CrawlAddressSourceResolver $resolver Resolves source names; injected by the container.
     *
     * @return int Always self::SUCCESS.
     */
    public function handle(CrawlAddressSourceResolver $resolver): int
    {
        $fromValues = $this->option('from');
        if ($fromValues === [] || $fromValues === null) {
            // No explicit --from given: fall back to the resolver's generic names.
            $fromValues = $resolver->genericAdapterSourceNames();
        }

        $dryRun = (bool) $this->option('dry-run');
        $updated = 0;
        $skipped = 0;

        News::query()
            ->where(function ($query) use ($fromValues) {
                // Grouped so the OR does not leak into chunkById's id constraint.
                $query->whereIn('source', $fromValues)
                    ->orWhereIn('source_site', $fromValues);
            })
            ->orderBy('id')
            // chunkById pages by primary key, so rows dropping out of the WHERE
            // clause after being updated do not cause later rows to be missed.
            ->chunkById(200, function ($rows) use ($resolver, $dryRun, &$updated, &$skipped) {
                /** @var \Illuminate\Support\Collection<int, News> $rows */

                // One query per chunk instead of one query per news row.
                $jobUrls = $this->jobRequestUrlsFor($rows);

                foreach ($rows as $news) {
                    $jobUrl = null;
                    if ($news->crawl_job_id) {
                        $jobUrl = $jobUrls[$news->crawl_job_id] ?? null;
                    }

                    $newSource = $resolver->resolveForNews($jobUrl, $news->source_url);

                    if ($newSource === null || $newSource === '') {
                        // The resolver could not map this row to a crawl address.
                        $skipped++;
                        $this->line(sprintf(
                            '跳过 #%d %s无法匹配爬虫地址job=%s url=%s',
                            $news->id,
                            $news->title,
                            $news->crawl_job_id ?: '—',
                            $news->source_url ?: '—',
                        ));

                        continue;
                    }

                    if ($news->source === $newSource && $news->source_site === $newSource) {
                        // Both columns already hold the resolved name; nothing to do.
                        $skipped++;

                        continue;
                    }

                    $this->line(sprintf(
                        '%s #%d%s → %s',
                        $dryRun ? '预览' : '更新',
                        $news->id,
                        $news->source ?: '—',
                        $newSource,
                    ));

                    if (! $dryRun) {
                        $news->update([
                            'source' => $newSource,
                            'source_site' => $newSource,
                        ]);
                    }

                    // Counted in dry-run mode too, so the preview reports how many rows would change.
                    $updated++;
                }
            });

        $this->info(sprintf(
            '%s完成修正 %d 条,跳过 %d 条。',
            $dryRun ? '预览' : '批量',
            $updated,
            $skipped,
        ));

        if ($dryRun && $updated > 0) {
            $this->comment('确认无误后,去掉 --dry-run 再执行一次即可写入。');
        }

        return self::SUCCESS;
    }

    /**
     * Load the request URLs of all crawl jobs referenced by a chunk of news rows.
     *
     * @param \Illuminate\Support\Collection<int, News> $rows Current chunk of news rows.
     *
     * @return array<int|string, mixed> request_url values keyed by CrawlJob primary key;
     *                                  jobs that no longer exist are simply absent.
     */
    private function jobRequestUrlsFor(\Illuminate\Support\Collection $rows): array
    {
        // filter() without a callback drops falsy ids, mirroring the truthiness
        // check applied to crawl_job_id in the per-row loop.
        $jobIds = $rows->pluck('crawl_job_id')->filter()->unique()->values()->all();

        if ($jobIds === []) {
            return [];
        }

        $keyName = (new CrawlJob())->getKeyName();

        return CrawlJob::query()
            ->whereKey($jobIds)
            ->pluck('request_url', $keyName)
            ->all();
    }
}