You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
175 lines
5.3 KiB
175 lines
5.3 KiB
<?php
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
use App\Models\CrawlAddress;
|
|
use App\Models\CrawlSource;
|
|
|
|
/**
 * Picks the CrawlSource that should handle a crawl request.
 *
 * resolve() tries, in order:
 *  1. a CrawlAddress bound to a source (looked up by id, then by normalized request URL);
 *  2. the hard-coded URL hints in self::URL_HINTS;
 *  3. the `match_domains` list of each candidate source, keeping the first source that
 *     lists a wildcard entry as the fallback when no concrete domain matches.
 */
class CrawlSourceResolver
{
    /** `status` value every query in this class filters on (see findActiveSource()). */
    private const STATUS_ACTIVE = 1;

    /** `match_domains` entries that mean "any host" rather than a concrete domain. */
    private const WILDCARD_DOMAINS = ['*', 'any'];

    /**
     * Ordered URL hints: the first entry whose target type equals the requested one and
     * whose needle occurs anywhere in the lower-cased request URL selects the adapter.
     */
    private const URL_HINTS = [
        ['target_type' => 'industry_news', 'needle' => 'pedaily', 'adapter_code' => 'pedaily_html'],
        ['target_type' => 'industry_news', 'needle' => 'huxiu.com', 'adapter_code' => 'huxiu_html'],
        ['target_type' => 'industry_news', 'needle' => 'pedata.cn', 'adapter_code' => 'generic_news_html'],
        ['target_type' => 'paper', 'needle' => 'arxiv', 'adapter_code' => 'arxiv_api'],
        ['target_type' => 'teacher', 'needle' => 'ai.sjtu.edu.cn', 'adapter_code' => 'ai_sjtu_research_center_api'],
    ];

    public function __construct(
        protected CrawlAddressSourceResolver $crawlAddressSourceResolver,
    ) {}

    /**
     * Resolve the source for a request, preferring a CrawlAddress binding over URL matching.
     *
     * @param string   $requestUrl     URL about to be crawled.
     * @param string   $targetType     Target type the source must be registered for.
     * @param int|null $crawlAddressId Optional id of the CrawlAddress the request came from.
     *
     * @return CrawlSource|null The matching source, or null when nothing matches.
     */
    public function resolve(string $requestUrl, string $targetType, ?int $crawlAddressId = null): ?CrawlSource
    {
        return $this->resolveFromCrawlAddress($requestUrl, $targetType, $crawlAddressId)
            ?? $this->resolveCore($requestUrl, $targetType);
    }

    /**
     * Resolve a source from the URL alone, skipping the CrawlAddress lookup.
     */
    public function suggestSource(string $requestUrl, string $targetType): ?CrawlSource
    {
        return $this->resolveCore($requestUrl, $targetType);
    }

    /**
     * Resolve a source through CrawlAddress rows.
     *
     * When $crawlAddressId points at a matching address that is bound to a source, the
     * result of looking that source up is returned as-is (including null) and the URL
     * scan below is not attempted. Otherwise every candidate address is compared against
     * the request URL after both are passed through
     * CrawlAddressSourceResolver::normalizeRequestUrl().
     */
    protected function resolveFromCrawlAddress(string $requestUrl, string $targetType, ?int $crawlAddressId): ?CrawlSource
    {
        if ($crawlAddressId) {
            $address = CrawlAddress::query()
                ->whereKey($crawlAddressId)
                ->where('status', self::STATUS_ACTIVE)
                ->where('target_type', $targetType)
                ->first();

            if ($address?->crawl_source_id) {
                return $this->findActiveSource((int) $address->crawl_source_id, $targetType);
            }
        }

        $normalized = $this->crawlAddressSourceResolver->normalizeRequestUrl($requestUrl);
        if ($normalized === '') {
            return null;
        }

        $addresses = CrawlAddress::query()
            ->where('target_type', $targetType)
            ->where('status', self::STATUS_ACTIVE)
            ->whereNotNull('crawl_source_id')
            ->orderBy('sort')
            ->orderBy('name')
            ->get();

        foreach ($addresses as $address) {
            if ($this->crawlAddressSourceResolver->normalizeRequestUrl($address->request_url) !== $normalized) {
                continue;
            }

            // An address whose source fails the lookup is skipped so that a later
            // address with the same URL can still provide one.
            $source = $this->findActiveSource((int) $address->crawl_source_id, $targetType);
            if ($source) {
                return $source;
            }
        }

        return null;
    }

    /**
     * Resolve a source from the URL: URL hints first, then `match_domains`.
     *
     * Returns null when the URL has no parsable host. A concrete domain match always
     * wins over a wildcard entry, regardless of the order the sources are loaded in.
     */
    protected function resolveCore(string $requestUrl, string $targetType): ?CrawlSource
    {
        // parse_url() yields null/false when there is no host; the cast turns both into ''.
        $host = strtolower((string) parse_url($requestUrl, PHP_URL_HOST));
        if ($host === '') {
            return null;
        }

        $sources = CrawlSource::query()
            ->where('status', self::STATUS_ACTIVE)
            ->where('target_type', $targetType)
            ->orderBy('sort')
            ->get();

        $hint = $this->resolveByUrlHint($requestUrl, $targetType, $sources);
        if ($hint) {
            return $hint;
        }

        $wildcard = null;

        foreach ($sources as $source) {
            foreach ($source->match_domains ?? [] as $domain) {
                $domain = strtolower(trim((string) $domain));

                if ($this->isWildcardDomain($domain)) {
                    // Remember only the first wildcard source; keep scanning for a real match.
                    $wildcard ??= $source;

                    continue;
                }

                if ($this->hostMatchesDomain($host, $domain)) {
                    return $source;
                }
            }
        }

        return $wildcard;
    }

    /**
     * Load the source with the given key, restricted to self::STATUS_ACTIVE and $targetType.
     */
    protected function findActiveSource(int $crawlSourceId, string $targetType): ?CrawlSource
    {
        return CrawlSource::query()
            ->whereKey($crawlSourceId)
            ->where('status', self::STATUS_ACTIVE)
            ->where('target_type', $targetType)
            ->first();
    }

    /**
     * Tell whether $host equals $domain or is a subdomain of it.
     *
     * A leading "www." is ignored on both sides. "*.example.com" matches "example.com"
     * itself as well as any subdomain. Empty and wildcard-only entries never match here.
     */
    protected function hostMatchesDomain(string $host, string $domain): bool
    {
        $host = $this->normalizeHost($host);
        $domain = strtolower(trim($domain));

        if ($domain === '' || $this->isWildcardDomain($domain)) {
            return false;
        }

        if (str_starts_with($domain, '*.')) {
            $bare = substr($domain, 2);

            return $host === $bare || str_ends_with($host, '.'.$bare);
        }

        $domain = $this->normalizeHost($domain);

        return $host === $domain || str_ends_with($host, '.'.$domain);
    }

    /**
     * Lower-case and trim a host name, then drop one leading "www." label if present.
     */
    protected function normalizeHost(string $host): string
    {
        $host = strtolower(trim($host));

        return str_starts_with($host, 'www.') ? substr($host, 4) : $host;
    }

    /**
     * Pick a source by adapter code when the URL matches one of self::URL_HINTS.
     *
     * The first matching hint is final: if none of the given sources uses that adapter
     * the result is null and later hints are not consulted.
     *
     * @param \Illuminate\Support\Collection<int, CrawlSource> $sources
     */
    protected function resolveByUrlHint(string $requestUrl, string $targetType, $sources): ?CrawlSource
    {
        $lower = strtolower($requestUrl);

        foreach (self::URL_HINTS as $hint) {
            if ($hint['target_type'] === $targetType && str_contains($lower, $hint['needle'])) {
                return $sources->firstWhere('adapter_code', $hint['adapter_code']);
            }
        }

        return null;
    }

    /**
     * Tell whether an already trimmed, lower-cased `match_domains` entry is a wildcard.
     */
    private function isWildcardDomain(string $domain): bool
    {
        return in_array($domain, self::WILDCARD_DOMAINS, true);
    }
}
|