resolveFromCrawlAddress($requestUrl, $targetType, $crawlAddressId);

        if ($fromAddress) {
            return $fromAddress;
        }

        return $this->resolveCore($requestUrl, $targetType);
    }

    /**
     * Suggest a crawl source for a URL.
     *
     * Delegates straight to resolveCore(); CrawlAddress rows are not consulted.
     *
     * @param string $requestUrl URL that is going to be crawled.
     * @param string $targetType Target type the source must be configured for.
     *
     * @return CrawlSource|null The matching source, or null when nothing matches.
     */
    public function suggestSource(string $requestUrl, string $targetType): ?CrawlSource
    {
        return $this->resolveCore($requestUrl, $targetType);
    }

    /**
     * Resolve a crawl source through the CrawlAddress table.
     *
     * Lookup order:
     *  1. The address identified by $crawlAddressId (status = 1, same target type),
     *     if it is bound to a source that findActiveSource() accepts.
     *  2. Any status = 1 address of the same target type whose normalized
     *     request URL equals the normalized $requestUrl, ordered by `sort`
     *     then `name`; the first one bound to a usable source wins.
     *
     * @param string   $requestUrl     URL that is going to be crawled.
     * @param string   $targetType     Target type the address and source must share.
     * @param int|null $crawlAddressId Optional explicit address id; a falsy value skips step 1.
     *
     * @return CrawlSource|null The resolved source, or null when no address yields one.
     */
    protected function resolveFromCrawlAddress(string $requestUrl, string $targetType, ?int $crawlAddressId): ?CrawlSource
    {
        // Source id that step 1 already looked up and rejected, so step 2 does
        // not repeat the same failing query.
        $rejectedSourceId = null;

        if ($crawlAddressId) {
            $address = CrawlAddress::query()
                ->whereKey($crawlAddressId)
                ->where('status', 1)
                ->where('target_type', $targetType)
                ->first();

            if ($address?->crawl_source_id) {
                $boundSourceId = (int) $address->crawl_source_id;
                $boundSource = $this->findActiveSource($boundSourceId, $targetType);

                if ($boundSource !== null) {
                    return $boundSource;
                }

                // The bound source is unusable (wrong status or target type).
                // Fall through to URL matching instead of giving up, which is
                // how the loop below already treats unusable sources.
                $rejectedSourceId = $boundSourceId;
            }
        }

        // NOTE(review): normalizeRequestUrl() is defined elsewhere; an empty
        // string is treated here as "URL cannot be matched".
        $normalized = $this->crawlAddressSourceResolver->normalizeRequestUrl($requestUrl);

        if ($normalized === '') {
            return null;
        }

        $addresses = CrawlAddress::query()
            ->where('target_type', $targetType)
            ->where('status', 1)
            ->whereNotNull('crawl_source_id')
            ->orderBy('sort')
            ->orderBy('name')
            ->get();

        foreach ($addresses as $address) {
            if ($this->crawlAddressSourceResolver->normalizeRequestUrl($address->request_url) !== $normalized) {
                continue;
            }

            $sourceId = (int) $address->crawl_source_id;

            if ($sourceId === $rejectedSourceId) {
                continue;
            }

            $source = $this->findActiveSource($sourceId, $targetType);

            if ($source) {
                return $source;
            }
        }

        return null;
    }

    /**
     * Resolve a crawl source from the URL alone.
     *
     * Returns null when the URL has no parseable host. Otherwise loads every
     * status = 1 source of the target type (ordered by `sort`) and picks, in
     * this order of precedence:
     *  1. the source chosen by resolveByUrlHint();
     *  2. the first source having a match_domains entry that matches the host;
     *  3. the first source that lists the catch-all entry `*` or `any`.
     *
     * @param string $requestUrl URL that is going to be crawled.
     * @param string $targetType Target type the source must be configured for.
     *
     * @return CrawlSource|null The resolved source, or null when nothing matches.
     */
    protected function resolveCore(string $requestUrl, string $targetType): ?CrawlSource
    {
        // parse_url() yields null/false when there is no host; the cast turns both into ''.
        $host = strtolower((string) parse_url($requestUrl, PHP_URL_HOST));

        if ($host === '') {
            return null;
        }

        $sources = CrawlSource::query()
            ->where('status', 1)
            ->where('target_type', $targetType)
            ->orderBy('sort')
            ->get();

        $hint = $this->resolveByUrlHint($requestUrl, $targetType, $sources);

        if ($hint) {
            return $hint;
        }

        $wildcard = null;

        foreach ($sources as $source) {
            foreach ($source->match_domains ?? [] as $domain) {
                $domain = strtolower(trim((string) $domain));

                if (in_array($domain, ['*', 'any'], true)) {
                    // Remember only the first catch-all source; a concrete
                    // domain match on any later source still takes precedence.
                    $wildcard ??= $source;

                    continue;
                }

                if ($this->hostMatchesDomain($host, $domain)) {
                    return $source;
                }
            }
        }

        return $wildcard;
    }

    /**
     * Fetch a source by primary key, but only if it has status = 1 and the
     * requested target type.
     *
     * @param int    $crawlSourceId Primary key of the source.
     * @param string $targetType    Target type the source must be configured for.
     *
     * @return CrawlSource|null The source, or null when it does not exist or fails the filters.
     */
    protected function findActiveSource(int $crawlSourceId, string $targetType): ?CrawlSource
    {
        return CrawlSource::query()
            ->whereKey($crawlSourceId)
            ->where('status', 1)
            ->where('target_type', $targetType)
            ->first();
    }

    /**
     * Tell whether a host is covered by a configured domain pattern.
     *
     * Supported patterns:
     *  - `example.com`   matches `example.com` and any subdomain of it;
     *  - `*.example.com` matches the same set (base domain and subdomains).
     * A leading `www.` is ignored on the host and on non-wildcard patterns.
     * Empty patterns and the catch-all markers `*` / `any` never match here.
     *
     * @param string $host   Host name taken from the request URL.
     * @param string $domain Domain pattern to test against.
     *
     * @return bool True when the host falls under the pattern.
     */
    protected function hostMatchesDomain(string $host, string $domain): bool
    {
        $host = $this->normalizeHost($host);
        $domain = strtolower(trim($domain));

        if ($domain === '' || $domain === '*' || $domain === 'any') {
            return false;
        }

        if (str_starts_with($domain, '*.')) {
            $base = substr($domain, 2);

            // A bare "*." carries no domain; without this guard it would match
            // every host that merely ends with a dot.
            if ($base === '') {
                return false;
            }

            return $host === $base || str_ends_with($host, '.'.$base);
        }

        $domain = $this->normalizeHost($domain);

        return $host === $domain || str_ends_with($host, '.'.$domain);
    }

    /**
     * Lower-case and trim a host name and drop a single leading `www.`.
     *
     * @param string $host Raw host name.
     *
     * @return string The normalized host name.
     */
    protected function normalizeHost(string $host): string
    {
        $host = strtolower(trim($host));

        if (str_starts_with($host, 'www.')) {
            return substr($host, 4);
        }

        return $host;
    }

    /**
     * Pick a source by adapter code from hard-coded URL substring hints.
     *
     * The first rule whose target type equals $targetType and whose substring
     * occurs anywhere in the lower-cased URL decides the result. If no loaded
     * source carries that rule's adapter code, the result is null and later
     * rules are not tried.
     *
     * @param string                         $requestUrl URL that is going to be crawled.
     * @param string                         $targetType Target type being resolved.
     * @param \Illuminate\Support\Collection $sources    Candidate sources to choose from.
     *
     * @return CrawlSource|null The hinted source, or null when no rule applies or the adapter is absent.
     */
    protected function resolveByUrlHint(string $requestUrl, string $targetType, $sources): ?CrawlSource
    {
        // Ordered rules: [target type, URL substring, adapter code].
        $rules = [
            ['industry_news', 'pedaily', 'pedaily_html'],
            ['industry_news', 'huxiu.com', 'huxiu_html'],
            ['industry_news', 'pedata.cn', 'generic_news_html'],
            // 'arxiv' also covers 'arxiv.org', so one substring is enough.
            ['paper', 'arxiv', 'arxiv_api'],
            ['teacher', 'ai.sjtu.edu.cn', 'ai_sjtu_research_center_api'],
        ];

        $lower = strtolower($requestUrl);

        foreach ($rules as [$ruleTargetType, $needle, $adapterCode]) {
            if ($targetType === $ruleTargetType && str_contains($lower, $needle)) {
                return $sources->firstWhere('adapter_code', $adapterCode);
            }
        }

        return null;
    }
}