resolveChannelId($requestUrl); if ($channelId === null) { return parent::fetch($requestUrl, $source, $params); } return $this->fetchChannelViaApi($requestUrl, $params, $channelId); }

    /**
     * Collect articles for one channel through the JSON list API.
     *
     * Pages are requested with a cursor (`last_id`) until one of the limits
     * is reached: `max_results` items collected, `max_pages` pages fetched,
     * an empty page is returned, or the cursor stops advancing. Items are
     * de-duplicated by their external id before being handed to
     * enrichNewsItems().
     *
     * @param string $requestUrl Original listing URL; only used to decide whether the keyword filter applies.
     * @param array<string, mixed> $params Reads `keyword`, `max_results` (clamped to 1..50, default 30)
     *                                     and `max_pages` (clamped to 1..50, default 1).
     * @param int $channelId Channel id sent to the API.
     * @return array Whatever enrichNewsItems() returns (documented as a list in the original annotation;
     *               element type is not visible from here — presumably CrawlItemDto, confirm).
     *
     * @throws \RuntimeException Propagated from requestChannelArticleList() when the API call fails.
     */
    protected function fetchChannelViaApi(string $requestUrl, array $params, int $channelId): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $listItems = [];
        $seen = [];
        $lastId = null;

        for ($page = 1; $page <= $maxPages && count($listItems) < $maxResults; $page++) {
            $payload = $this->requestChannelArticleList($channelId, self::PAGE_SIZE, $lastId);
            $rows = $payload['datalist'] ?? [];

            // A missing, empty or malformed (non-array) list means there is nothing more to read.
            if (! is_array($rows) || $rows === []) {
                break;
            }

            foreach ($rows as $row) {
                if (count($listItems) >= $maxResults) {
                    break 2;
                }

                // Skip malformed rows instead of letting the typed mapper abort the whole crawl.
                if (! is_array($row)) {
                    continue;
                }

                $item = $this->mapApiRowToDto($row);
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $listItems[] = $item;
            }

            // Stop when the API gives no usable cursor or repeats the previous one (would loop forever).
            $nextLastId = isset($payload['last_id']) ? (int) $payload['last_id'] : 0;
            if ($nextLastId <= 0 || $nextLastId === $lastId) {
                break;
            }

            $lastId = $nextLastId;
        }

        $applyKeywordFilter = HtmlCrawlSupport::shouldApplyKeywordFilter($requestUrl, '', $keywords);

        return $this->enrichNewsItems($listItems, $keywords, $applyKeywordFilter, $maxResults, 'huxiu');
    }

    /**
     * Derive the API channel id from a listing URL.
     *
     * - A path ending in `/channel/{digits}.html`, `.htm` or `.shtml` yields that number.
     * - A path ending in `/article`, optionally followed by `/` and/or one of the same
     *   extensions, yields 0 (sent to the API as channel "0"; presumably "all channels" — confirm).
     * - Anything else yields null, which makes the caller fall back to the parent fetch.
     *
     * @param string $url Listing URL; only its path component is inspected.
     * @return int|null Channel id, 0 for the article index, or null when the URL is not recognised.
     */
    protected function resolveChannelId(string $url): ?int
    {
        // parse_url() may return null/false; the cast turns both into an empty path.
        $path = (string) parse_url($url, PHP_URL_PATH);

        if (preg_match('#/channel/(\d+)\.(?:html?|shtml)$#i', $path, $match)) {
            return (int) $match[1];
        }

        if (preg_match('#/article/?(?:\.html?|\.shtml)?$#i', $path)) {
            return 0;
        }

        return null;
    }

    /**
     * Request one page of the channel article list from the API.
     *
     * Sends a form-encoded POST to self::API_URL and returns the `data` member
     * of the JSON response, or an empty array when `data` is absent or not an array.
     *
     * @param int $channelId Channel id, sent as a string.
     * @param int $pageSize Requested page size; clamped to 1..30 before sending.
     * @param int|null $lastId Cursor from the previous page; omitted from the request when null or 0.
     * @return array{name?:string, datalist?:list<array<string, mixed>>, last_id?:int|string}
     *
     * @throws \RuntimeException When the HTTP status is not successful, the body is not a JSON
     *                           array, or the `success` flag is missing/falsy.
     */
    protected function requestChannelArticleList(int $channelId, int $pageSize, ?int $lastId): array
    {
        $form = [
            'platform' => 'www',
            'channel_id' => (string) $channelId,
            'pagesize' => (string) max(1, min(30, $pageSize)),
        ];

        // The first page is requested without a cursor.
        if ($lastId !== null && $lastId !== 0) {
            $form['last_id'] = (string) $lastId;
        }

        $response = Http::timeout(30)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'application/json',
                'Origin' => 'https://www.huxiu.com',
                'Referer' => 'https://www.huxiu.com/',
            ])
            ->asForm()
            ->post(self::API_URL, $form);

        if (! $response->successful()) {
            throw new \RuntimeException('虎嗅列表接口请求失败:HTTP '.$response->status());
        }

        $json = $response->json();
        if (! is_array($json) || empty($json['success'])) {
            $message = is_array($json) ? (string) ($json['message'] ?? '未知错误') : '响应格式异常';

            throw new \RuntimeException('虎嗅列表接口返回失败:'.$message);
        }

        $data = $json['data'] ?? [];

        return is_array($data) ? $data : [];
    }

    /**
     * Convert one API row into a crawl item.
     *
     * Rows are rejected (null) when the title is empty or flagged by
     * HtmlCrawlSupport as skippable/weak, or when neither a normalisable `url`
     * nor an `aid` to build one from is present.
     *
     * @param array<string, mixed> $row Reads `title`, `url`, `aid`, `dateline`, `summary`, `short_content`.
     * @return CrawlItemDto|null The mapped item, or null when the row is unusable.
     */
    protected function mapApiRowToDto(array $row): ?CrawlItemDto
    {
        $title = trim((string) ($row['title'] ?? ''));
        if (
            $title === ''
            || HtmlCrawlSupport::isSkippableLinkTitle($title)
            || HtmlCrawlSupport::isWeakLinkTitle($title)
        ) {
            return null;
        }

        $url = HtmlCrawlSupport::normalizeNewsUrl((string) ($row['url'] ?? ''));
        if ($url === null || $url === '') {
            // No usable URL in the row: rebuild the article URL from its id.
            $aid = trim((string) ($row['aid'] ?? ''));
            if ($aid === '') {
                return null;
            }

            $url = 'https://www.huxiu.com/article/'.$aid.'.html';
        }

        $publishedAt = null;
        if (! empty($row['dateline'])) {
            $timestamp = (int) $row['dateline'];
            if ($timestamp > 0) {
                // NOTE(review): gmdate() formats the calendar date in UTC — confirm that is the
                // intended date for this feed rather than a local-time date.
                $publishedAt = gmdate('Y-m-d', $timestamp);
            }
        }

        // Take the first non-empty text: `??` alone would keep an empty-string `summary`
        // and never reach `short_content`.
        $summary = '';
        foreach (['summary', 'short_content'] as $field) {
            $candidate = trim((string) ($row[$field] ?? ''));
            if ($candidate !== '') {
                $summary = $candidate;
                break;
            }
        }

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            summary: $summary !== '' ? $summary : null,
            publishedAt: $publishedAt,
            extra: ['platform' => 'huxiu'],
        );
    }
}