You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

167 lines
5.4 KiB

2 days ago
<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\HtmlCrawlSupport;
use Illuminate\Support\Facades\Http;
/**
 * Crawl adapter for huxiu.com.
 *
 * URLs that resolve to a channel id (see resolveChannelId()) are read from the
 * JSON article-list endpoint in API_URL; every other URL is delegated to the
 * parent adapter's fetch().
 */
class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
{
    /** Endpoint that returns one page of a channel's article list. */
    protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1';

    /** Number of rows requested per API call. */
    protected const PAGE_SIZE = 20;

    /**
     * Fetch items for the given URL.
     *
     * @param array<string, mixed> $params Crawl options; this class reads `keyword`, `max_results` and `max_pages`.
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $channelId = $this->resolveChannelId($requestUrl);

        return $channelId === null
            ? parent::fetch($requestUrl, $source, $params)
            : $this->fetchChannelViaApi($requestUrl, $params, $channelId);
    }

    /**
     * Page through the channel list API and collect de-duplicated items.
     *
     * `max_results` is clamped to 1..50 (default 30) and `max_pages` to 1..50 (default 1).
     * Paging follows the `last_id` cursor returned by the API and stops when the cursor is
     * missing, non-positive or unchanged, or when a page has no rows.
     *
     * @param array<string, mixed> $params
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException When the list API request fails (see requestChannelArticleList()).
     */
    protected function fetchChannelViaApi(string $requestUrl, array $params, int $channelId): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $listItems = [];
        $seen = [];
        $lastId = null;

        for ($page = 1; $page <= $maxPages && count($listItems) < $maxResults; $page++) {
            $payload = $this->requestChannelArticleList($channelId, self::PAGE_SIZE, $lastId);

            // Treat a missing or malformed `datalist` as the end of the list rather than iterating it.
            $rows = $payload['datalist'] ?? [];
            if (! is_array($rows) || $rows === []) {
                break;
            }

            foreach ($rows as $row) {
                if (count($listItems) >= $maxResults) {
                    break 2;
                }

                // mapApiRowToDto() is typed `array`; skip malformed rows instead of raising a TypeError.
                if (! is_array($row)) {
                    continue;
                }

                $item = $this->mapApiRowToDto($row);
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $listItems[] = $item;
            }

            // Stop when the cursor is absent or did not advance, so the same page is never re-requested.
            $nextLastId = isset($payload['last_id']) ? (int) $payload['last_id'] : 0;
            if ($nextLastId <= 0 || $nextLastId === $lastId) {
                break;
            }

            $lastId = $nextLastId;
        }

        $applyKeywordFilter = HtmlCrawlSupport::shouldApplyKeywordFilter($requestUrl, '', $keywords);

        return $this->enrichNewsItems($listItems, $keywords, $applyKeywordFilter, $maxResults, 'huxiu');
    }

    /**
     * Derive the API channel id from a list-page URL.
     *
     * @return int|null The numeric id for paths ending in `/channel/<digits>.htm|.html|.shtml`,
     *                  0 for paths ending in `/article` (optionally followed by `/` or one of
     *                  those extensions), or null when the path matches neither form.
     */
    protected function resolveChannelId(string $url): ?int
    {
        $path = (string) parse_url($url, PHP_URL_PATH);

        if (preg_match('#/channel/(\d+)\.(?:html?|shtml)$#i', $path, $match)) {
            return (int) $match[1];
        }

        if (preg_match('#/article/?(?:\.html?|\.shtml)?$#i', $path)) {
            return 0;
        }

        return null;
    }

    /**
     * POST one list request and return the `data` member of the JSON response.
     *
     * The page size sent to the API is clamped to 1..30. `last_id` is only sent when a
     * non-zero cursor is supplied.
     *
     * @return array{name?:string, datalist?:list<array<string, mixed>>, last_id?:int|string}
     *
     * @throws \RuntimeException On a non-2xx HTTP status, a non-array JSON body, or a falsy `success` flag.
     */
    protected function requestChannelArticleList(int $channelId, int $pageSize, ?int $lastId): array
    {
        $form = [
            'platform' => 'www',
            'channel_id' => (string) $channelId,
            'pagesize' => (string) max(1, min(30, $pageSize)),
        ];

        if ($lastId !== null && $lastId !== 0) {
            $form['last_id'] = (string) $lastId;
        }

        $response = Http::timeout(30)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'application/json',
                'Origin' => 'https://www.huxiu.com',
                'Referer' => 'https://www.huxiu.com/',
            ])
            ->asForm()
            ->post(self::API_URL, $form);

        if (! $response->successful()) {
            throw new \RuntimeException('虎嗅列表接口请求失败HTTP '.$response->status());
        }

        $json = $response->json();
        if (! is_array($json) || empty($json['success'])) {
            $message = is_array($json) ? (string) ($json['message'] ?? '未知错误') : '响应格式异常';

            throw new \RuntimeException('虎嗅列表接口返回失败:'.$message);
        }

        $data = $json['data'] ?? [];

        return is_array($data) ? $data : [];
    }

    /**
     * Convert one API row into a DTO, or return null when the row is unusable.
     *
     * A row is rejected when its title is empty or flagged by HtmlCrawlSupport as
     * skippable/weak, or when neither a normalizable `url` nor an `aid` is present.
     *
     * @param array<string, mixed> $row
     */
    protected function mapApiRowToDto(array $row): ?CrawlItemDto
    {
        // Only scalar fields are cast to string; arrays and null count as missing
        // so they never turn into the literal "Array".
        $field = static fn (string $key): string => is_scalar($row[$key] ?? null) ? (string) $row[$key] : '';

        $title = trim($field('title'));
        if (
            $title === ''
            || HtmlCrawlSupport::isSkippableLinkTitle($title)
            || HtmlCrawlSupport::isWeakLinkTitle($title)
        ) {
            return null;
        }

        $url = HtmlCrawlSupport::normalizeNewsUrl($field('url'));
        if ($url === null || $url === '') {
            // No usable URL in the row: rebuild the article URL from its id.
            $aid = trim($field('aid'));
            if ($aid === '') {
                return null;
            }

            $url = 'https://www.huxiu.com/article/'.$aid.'.html';
        }

        // Only numeric values are treated as timestamps; the date is formatted in UTC.
        $publishedAt = null;
        $dateline = $row['dateline'] ?? null;
        if (is_numeric($dateline)) {
            $timestamp = (int) $dateline;
            if ($timestamp > 0) {
                $publishedAt = gmdate('Y-m-d', $timestamp);
            }
        }

        // Fall back to `short_content` when `summary` is missing or blank, not only when it is null.
        $summary = trim($field('summary'));
        if ($summary === '') {
            $summary = trim($field('short_content'));
        }

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            summary: $summary !== '' ? $summary : null,
            publishedAt: $publishedAt,
            extra: ['platform' => 'huxiu'],
        );
    }
}