You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

167 lines
5.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\HtmlCrawlSupport;
use Illuminate\Support\Facades\Http;
/**
 * Crawl adapter for huxiu.com.
 *
 * Channel pages (`/channel/<id>.html`) and the article index (`/article`) are
 * read through Huxiu's JSON list endpoint; every other URL is delegated to the
 * generic HTML adapter this class extends.
 */
class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
{
    /** JSON endpoint that returns one page of a channel's article list. */
    protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1';

    /** Page size requested from the list endpoint on every call. */
    protected const PAGE_SIZE = 20;

    /**
     * Crawl the given URL.
     *
     * When the URL resolves to a channel id the JSON API is used; otherwise the
     * request is handed to the parent adapter unchanged.
     *
     * @param array<string, mixed> $params Reads `keyword`, `max_results` and `max_pages` on the API path.
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $channelId = $this->resolveChannelId($requestUrl);
        if ($channelId === null) {
            return parent::fetch($requestUrl, $source, $params);
        }

        return $this->fetchChannelViaApi($requestUrl, $params, $channelId);
    }

    /**
     * Page through the channel list API and collect de-duplicated items.
     *
     * `max_results` is clamped to 1..50 (default 30) and `max_pages` to 1..50
     * (default 1). Paging stops as soon as the API returns an empty or malformed
     * list, stops advancing its `last_id` cursor, or enough items were gathered.
     *
     * @param array<string, mixed> $params
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException When the list endpoint reports a failure.
     */
    protected function fetchChannelViaApi(string $requestUrl, array $params, int $channelId): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $listItems = [];
        $seen = [];
        $lastId = null;

        for ($page = 1; $page <= $maxPages && count($listItems) < $maxResults; $page++) {
            $payload = $this->requestChannelArticleList($channelId, self::PAGE_SIZE, $lastId);

            // The payload comes from a remote service: treat anything that is
            // not a non-empty array as "no more data" instead of iterating it.
            $rows = $payload['datalist'] ?? [];
            if (! is_array($rows) || $rows === []) {
                break;
            }

            foreach ($rows as $row) {
                if (count($listItems) >= $maxResults) {
                    break 2;
                }

                // mapApiRowToDto() is typed `array`; a malformed row would
                // otherwise raise a TypeError and abort the whole crawl.
                if (! is_array($row)) {
                    continue;
                }

                $item = $this->mapApiRowToDto($row);
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $listItems[] = $item;
            }

            // `last_id` is the cursor for the next page. A missing, non-numeric,
            // non-positive or unchanged cursor means there is nothing left to fetch.
            $rawLastId = $payload['last_id'] ?? null;
            $nextLastId = is_numeric($rawLastId) ? (int) $rawLastId : 0;
            if ($nextLastId <= 0 || $nextLastId === $lastId) {
                break;
            }
            $lastId = $nextLastId;
        }

        // NOTE(review): keyword filtering happens after the list was capped at
        // $maxResults, so a filtered crawl may return fewer items than requested
        // even when more pages exist — confirm this is intended.
        $applyKeywordFilter = HtmlCrawlSupport::shouldApplyKeywordFilter($requestUrl, '', $keywords);

        return $this->enrichNewsItems($listItems, $keywords, $applyKeywordFilter, $maxResults, 'huxiu');
    }

    /**
     * Derive the API channel id from a Huxiu URL.
     *
     * @return int|null The numeric id for `/channel/<id>.html|.htm|.shtml` paths,
     *                  0 for paths ending in `/article` (optionally with a trailing
     *                  slash and/or one of those extensions), or null when the URL
     *                  matches neither form.
     */
    protected function resolveChannelId(string $url): ?int
    {
        // parse_url() may yield null/false; the cast turns both into ''.
        $path = (string) parse_url($url, PHP_URL_PATH);

        if (preg_match('#/channel/(\d+)\.(?:html?|shtml)$#i', $path, $match)) {
            return (int) $match[1];
        }

        // The article index has no id of its own; 0 is sent to the API as-is
        // (presumably "all channels" — TODO confirm against the API).
        if (preg_match('#/article/?(?:\.html?|\.shtml)?$#i', $path)) {
            return 0;
        }

        return null;
    }

    /**
     * Request one page of the channel article list.
     *
     * Sends a form-encoded POST with `platform`, `channel_id`, `pagesize`
     * (clamped to 1..30) and, when a cursor is given, `last_id`.
     *
     * @param int|null $lastId Cursor returned by the previous page; null or 0 requests the first page.
     * @return array{name?:string, datalist?:list<array<string, mixed>>, last_id?:int|string}
     *         The `data` member of the response, or an empty array when it is not an array.
     *
     * @throws \RuntimeException When the HTTP status is not successful, the body
     *                           is not a JSON array, or `success` is falsy.
     */
    protected function requestChannelArticleList(int $channelId, int $pageSize, ?int $lastId): array
    {
        $form = [
            'platform' => 'www',
            'channel_id' => (string) $channelId,
            'pagesize' => (string) max(1, min(30, $pageSize)),
        ];
        if ($lastId !== null && $lastId !== 0) {
            $form['last_id'] = (string) $lastId;
        }

        $response = Http::timeout(30)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'application/json',
                'Origin' => 'https://www.huxiu.com',
                'Referer' => 'https://www.huxiu.com/',
            ])
            ->asForm()
            ->post(self::API_URL, $form);

        if (! $response->successful()) {
            throw new \RuntimeException('虎嗅列表接口请求失败HTTP '.$response->status());
        }

        $json = $response->json();
        if (! is_array($json) || empty($json['success'])) {
            $message = is_array($json) ? (string) ($json['message'] ?? '未知错误') : '响应格式异常';
            throw new \RuntimeException('虎嗅列表接口返回失败:'.$message);
        }

        $data = $json['data'] ?? [];

        return is_array($data) ? $data : [];
    }

    /**
     * Convert one API row into a crawl item.
     *
     * Returns null when the row has no usable title, or when it has neither a
     * normalizable `url` nor an `aid` to build an article URL from. Non-scalar
     * field values are treated as absent rather than cast to the string "Array".
     *
     * @param array<string, mixed> $row
     */
    protected function mapApiRowToDto(array $row): ?CrawlItemDto
    {
        // Read a field as a string, ignoring arrays/objects/null from the API.
        $field = static fn (string $key): string => is_scalar($row[$key] ?? null) ? (string) $row[$key] : '';

        $title = trim($field('title'));
        if ($title === '' || HtmlCrawlSupport::isSkippableLinkTitle($title) || HtmlCrawlSupport::isWeakLinkTitle($title)) {
            return null;
        }

        $url = HtmlCrawlSupport::normalizeNewsUrl($field('url'));
        if ($url === null || $url === '') {
            // No usable URL in the row: rebuild the article URL from its id.
            $aid = trim($field('aid'));
            if ($aid === '') {
                return null;
            }
            $url = 'https://www.huxiu.com/article/'.$aid.'.html';
        }

        $publishedAt = null;
        $dateline = $row['dateline'] ?? null;
        // Only numeric values are interpreted as a Unix timestamp; anything else
        // would cast to a meaningless 1970 date.
        if (is_numeric($dateline)) {
            $timestamp = (int) $dateline;
            if ($timestamp > 0) {
                // NOTE(review): the date is rendered in UTC; confirm whether a
                // China Standard Time date is expected for this source.
                $publishedAt = gmdate('Y-m-d', $timestamp);
            }
        }

        // Fall back to `short_content` when `summary` is missing OR blank; a
        // plain `??` would keep an empty-string summary and skip the fallback.
        $summary = trim($field('summary'));
        if ($summary === '') {
            $summary = trim($field('short_content'));
        }

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            summary: $summary !== '' ? $summary : null,
            publishedAt: $publishedAt,
            extra: ['platform' => 'huxiu'],
        );
    }
}