withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'text/html,application/xhtml+xml',
                'Accept-Language' => 'zh-CN,zh;q=0.9,en;q=0.8',
            ])
            ->get($url);

        // NOTE(review): the beginning of this method lies outside the visible chunk;
        // its tail is reproduced unchanged.
        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败:HTTP '.$response->status());
        }

        return $response->body();
    }

    /**
     * Reduce a URL to a comparable host key: the lower-cased host without a leading "www.".
     *
     * @return string|null the host key, or null when no host can be parsed from $url
     */
    public static function hostKey(?string $url): ?string
    {
        $host = strtolower((string) parse_url((string) $url, PHP_URL_HOST));
        if ($host === '') {
            return null;
        }

        return str_starts_with($host, 'www.') ? substr($host, 4) : $host;
    }

    /**
     * Tell whether two host keys are both present and identical.
     */
    public static function sameHost(?string $a, ?string $b): bool
    {
        // $a === $b with a non-null $a already implies a non-null $b.
        return $a !== null && $a === $b;
    }

    /**
     * Resolve an href found on a page against the URL of that page.
     *
     * @param string $href raw attribute value; HTML entities are decoded first
     * @param string $base absolute URL of the document that contained the link
     *
     * @return string|null an absolute http(s) URL, or null for empty hrefs, fragment-only
     *                     hrefs, non-http(s) schemes (javascript:, mailto:, tel:, ...) and
     *                     bases without scheme or host
     */
    public static function absoluteUrl(string $href, string $base): ?string
    {
        $href = html_entity_decode(trim($href));
        if ($href === '' || $href[0] === '#') {
            return null;
        }

        // A leading "scheme:" makes the reference absolute. Matching the scheme syntax
        // (case-insensitively) instead of the bare prefix "http" keeps relative files
        // such as "http-guide.html" out of this branch and rejects every non-web scheme.
        if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $href) === 1) {
            return preg_match('#^https?://#i', $href) === 1 ? $href : null;
        }

        $parts = parse_url($base);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        // Protocol-relative reference: only the scheme is inherited from the base.
        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        // Keep an explicit port so links found on e.g. http://host:8080/ stay on that port.
        $origin = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        if ($href[0] === '/') {
            return $origin.$href;
        }

        $path = $parts['path'] ?? '/';

        // Query-only reference: same document, different query string.
        if ($href[0] === '?') {
            return $origin.$path.$href;
        }

        $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';

        // Only the path part may carry "." / ".." segments; query and fragment are appended untouched.
        $pathLength = strcspn($href, '?#');
        $relativePath = substr($href, 0, $pathLength);
        $suffix = substr($href, $pathLength);

        return $origin.self::removeDotSegments($dir.$relativePath).$suffix;
    }

    /**
     * Collapse "." and ".." segments of a slash-separated path.
     *
     * The first segment is never removed, so for a path starting with "/" a ".." cannot
     * climb above the root.
     */
    private static function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $kept = [];

        foreach ($segments as $index => $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $kept[] = $segment;

                continue;
            }

            if ($segment === '..' && count($kept) > 1) {
                array_pop($kept);
            }

            // A trailing dot segment refers to a directory, so keep the closing slash.
            if ($index === $lastIndex) {
                $kept[] = '';
            }
        }

        return implode('/', $kept);
    }

    /**
     * Normalize a raw date snippet (possibly containing markup) to "YYYY-MM-DD".
     *
     * @return string|null the normalized date, or null when $raw is null, blank or
     *                     contains no valid calendar date
     */
    public static function normalizeDate(?string $raw): ?string
    {
        if ($raw === null || trim($raw) === '') {
            return null;
        }

        return self::extractDateFromText(trim(html_entity_decode(strip_tags($raw))));
    }

    /**
     * Tell whether a link title carries no information of its own: blank, a bare date,
     * or starting with a generic "read more" style label.
     */
    public static function isWeakLinkTitle(string $title): bool
    {
        $title = trim($title);
        if ($title === '') {
            return true;
        }

        // Date-only titles. The optional trailing 月/日 lets "2024年1月" and "2024年1月2日"
        // match as well; "." is accepted as a separator next to "-" and "/".
        if (preg_match('#^\d{4}[-/.年]\d{1,2}(?:[-/.月日]\d{1,2})?[月日]?$#u', $title) === 1) {
            return true;
        }

        foreach (['阅读全文', '查看更多', '查看详情', '详情', '点击进入', '更多>>', 'More'] as $noise) {
            // A prefix match also covers the exact-match case.
            if (str_starts_with($title, $noise)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strip markup from an article title and drop a trailing " - Site name" style suffix.
     *
     * The text before the first separator (-, _, |, en dash, em dash) replaces the title
     * only when it is at least 8 characters long; otherwise the full text is kept.
     *
     * @return string|null the cleaned title, or null when $title is null or blank
     */
    public static function cleanArticleTitle(?string $title): ?string
    {
        if ($title === null) {
            return null;
        }

        $title = trim(html_entity_decode(strip_tags($title)));
        if ($title === '') {
            return null;
        }

        if (preg_match('/^(.+?)\s*[-_|–—]\s*.+$/u', $title, $m) && mb_strlen($m[1]) >= 8) {
            $title = trim($m[1]);
        }

        return $title;
    }

    /**
     * Find the first valid calendar date in free text and return it as "YYYY-MM-DD".
     *
     * Dates written as "YYYY年M月D日" take precedence over numeric "YYYY-M-D", "YYYY/M/D"
     * and "YYYY.M.D" forms. Candidates that are not real dates (month 13, day 45,
     * year 0, ...) are skipped rather than returned.
     */
    public static function extractDateFromText(string $text): ?string
    {
        // The lookarounds stop a match from starting or ending inside a longer digit run.
        $patterns = [
            '#(?<!\d)(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日#u',
            '#(?<!\d)(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})(?!\d)#',
        ];

        foreach ($patterns as $pattern) {
            if (! preg_match_all($pattern, $text, $matches, PREG_SET_ORDER)) {
                continue;
            }

            foreach ($matches as [, $year, $month, $day]) {
                if (checkdate((int) $month, (int) $day, (int) $year)) {
                    return sprintf('%04d-%02d-%02d', (int) $year, (int) $month, (int) $day);
                }
            }
        }

        return null;
    }

    /**
     * Tell whether a link title should be skipped: shorter than 8 characters, or
     * containing one of the navigation/boilerplate words listed below.
     */
    public static function isSkippableLinkTitle(string $title): bool
    {
        if (Str::length($title) < 8) {
            return true;
        }

        foreach (['登录', '注册', '更多', '下一页', '上一页', '首页', '关于我们', '联系我们', '隐私', '版权'] as $noise) {
            if (Str::contains($title, $noise)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tell whether a URL points at a static asset or download, judged by its file extension.
     *
     * The extension must be followed by the end of the string, a query string or a fragment.
     */
    public static function isAssetPath(string $url): bool
    {
        return preg_match('~\.(?:css|js|png|jpe?g|gif|svg|ico|woff2?|pdf|zip)(?:[?#]|$)~i', $url) === 1;
    }

    /**
     * Reduce a URL to scheme, host, optional port and path.
     *
     * Query string and fragment are dropped. A URL without scheme or host is returned
     * unchanged.
     *
     * @return string|null the reduced URL, or null when $url is null or blank
     */
    public static function normalizeNewsUrl(?string $url): ?string
    {
        if ($url === null || trim($url) === '') {
            return null;
        }

        $parts = parse_url($url);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $url;
        }

        $normalized = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $normalized .= ':'.$parts['port'];
        }

        return $normalized.($parts['path'] ?? '/');
    }

    /**
     * Decide whether list items should additionally be filtered by keyword.
     *
     * On channel/column list pages the keywords are mostly category labels, so the
     * items must not be filtered a second time by title. The filter is skipped when
     * no keywords are given, when the request path looks like a channel/column/tag/
     * topic/special page, or when the page heading already contains a keyword.
     *
     * @param list<string> $keywords
     */
    public static function shouldApplyKeywordFilter(string $requestUrl, string $listHtml, array $keywords): bool
    {
        if ($keywords === []) {
            return false;
        }

        $path = strtolower((string) parse_url($requestUrl, PHP_URL_PATH));
        if (preg_match('#/(channel|column|tag|topic|special)/[^/]+\.(?:html?|shtml)$#i', $path)) {
            return false;
        }

        $heading = self::extractListPageHeading($listHtml);
        if ($heading === null) {
            return true;
        }

        foreach ($keywords as $keyword) {
            if ($keyword !== '' && mb_stripos($heading, $keyword) !== false) {
                return false;
            }
        }

        return true;
    }

    /**
     * Extract the heading of a list page.
     *
     * The first element whose class attribute contains "channel-name" wins; if such an
     * element exists but is empty, null is returned without looking further. Otherwise
     * the first <h1> is used, provided its text is at most 40 characters long.
     */
    public static function extractListPageHeading(string $html): ?string
    {
        // Both patterns are anchored on the closing tag: a lazy (.*?) at the very end
        // of a pattern would always capture the empty string.
        $channelPattern = '#<([a-z][a-z0-9]*)\b[^>]*\sclass=["\'][^"\']*channel-name[^"\']*["\'][^>]*>(.*?)</\1\s*>#is';
        if (preg_match($channelPattern, $html, $m)) {
            $title = trim(html_entity_decode(strip_tags($m[2])));

            return $title !== '' ? $title : null;
        }

        if (preg_match('#<h1\b[^>]*>(.*?)</h1\s*>#is', $html, $m)) {
            $title = trim(html_entity_decode(strip_tags($m[1])));
            if ($title !== '' && mb_strlen($title) <= 40) {
                return $title;
            }
        }

        return null;
    }
}