From f9256f17bb2c753978b81c8d6fe2f9fde69c3e33 Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Mon, 22 Jun 2026 16:45:16 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Controllers/Admin/CrawlJobController.php | 24 +- .../Crawl/Adapters/FacultyListHtmlAdapter.php | 226 +++++++++++++++--- config/crawl.php | 9 +- tests/Unit/FacultyListHtmlAdapterTest.php | 11 + 4 files changed, 239 insertions(+), 31 deletions(-) diff --git a/app/Http/Controllers/Admin/CrawlJobController.php b/app/Http/Controllers/Admin/CrawlJobController.php index d23ee5c..6fedc0c 100644 --- a/app/Http/Controllers/Admin/CrawlJobController.php +++ b/app/Http/Controllers/Admin/CrawlJobController.php @@ -137,6 +137,8 @@ class CrawlJobController extends Controller 'status' => 'pending', ]); + @set_time_limit(300); + try { $job = $runner->run($job, $source, $params); $this->applyCrawlDefaultsToPreviewItems( @@ -412,12 +414,18 @@ class CrawlJobController extends Controller } if ($job->target_type === 'teacher') { - return sprintf( + $summary = sprintf( '已从 %s 抓取 %d 位老师,已入库 %d 位老师', $sourceName, $fetched, (int) ($importResult['teachers_imported'] ?? 0), ); + $skippedProfiles = $this->countProfileEnrichSkipped($job); + if ($skippedProfiles > 0) { + $summary .= sprintf('(%d 位未访问主页补邮箱,避免超时)', $skippedProfiles); + } + + return $summary; } return sprintf( @@ -428,6 +436,20 @@ class CrawlJobController extends Controller ); } + protected function countProfileEnrichSkipped(CrawlJob $job): int + { + return (int) CrawlJobItem::query() + ->where('crawl_job_id', $job->id) + ->where('target_type', 'teacher') + ->get(['payload']) + ->filter(function (CrawlJobItem $item) { + $extra = $item->payload['extra'] ?? []; + + return ($extra['profile_enrich_skipped'] ?? false) === true; + }) + ->count(); + } + /** * @param array{ * imported:int, diff --git a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php index b372e01..a5c4f20 100644 --- a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php +++ b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php @@ -30,10 +30,14 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface $merged = []; $seen = []; - for ($page = 1; $page <= $pagesToFetch; $page++) { + for ($page = 1; $page <= $pagesToFetch && count($merged) < $maxResults; $page++) { $html = $page === 1 ? $firstHtml - : $this->fetchHtml($this->buildPageUrl($baseUrl, $page, $firstHtml)); + : null; + + if ($html === null) { + break; + } foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) { if (isset($seen[$item->externalId])) { @@ -47,52 +51,156 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface } } - return $this->enrichEmailsFromProfilePages($merged); + if ($pagesToFetch > 1 && count($merged) < $maxResults) { + $merged = $this->fetchRemainingListPages( + $baseUrl, + $firstHtml, + $pagesToFetch, + $keywords, + $requestUrl, + $merged, + $seen, + $maxResults, + ); + } + + return $this->enrichEmailsFromProfilePages($merged, $params); + } + + /** + * @param list $merged + * @param array $seen + * @param list $keywords + * @return list + */ + protected function fetchRemainingListPages( + string $baseUrl, + string $firstHtml, + int $pagesToFetch, + array $keywords, + string $requestUrl, + array $merged, + array $seen, + int $maxResults, + ): array { + $poolSize = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5))); + $pageUrls = []; + for ($page = 2; $page <= $pagesToFetch; $page++) { + $pageUrls[$page] = $this->buildPageUrl($baseUrl, $page, $firstHtml); + } + + foreach (array_chunk($pageUrls, $poolSize, true) as $chunk) { + $htmlByPage = $this->fetchHtmlPool($chunk); + ksort($htmlByPage); + + foreach ($htmlByPage as $html) { + foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) { + if (isset($seen[$item->externalId])) { + continue; + } + $seen[$item->externalId] = true; + $merged[] = $item; + if (count($merged) >= $maxResults) { + return $merged; + } + } + } + } + + return $merged; + } + + /** + * @param array $pageUrls + * @return array + */ + protected function fetchHtmlPool(array $pageUrls): array + { + if ($pageUrls === []) { + return []; + } + + $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20)); + $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']; + $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout) { + foreach ($pageUrls as $page => $url) { + $pool->as((string) $page) + ->timeout($timeout) + ->connectTimeout(min(8, $timeout)) + ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) + ->withHeaders($headers) + ->get($url); + } + }); + + $htmlByPage = []; + foreach ($pageUrls as $page => $url) { + $body = $this->responseBodyFromPoolResult($responses[(string) $page] ?? null); + if ($body !== null && $body !== '') { + $htmlByPage[$page] = $body; + } + } + + return $htmlByPage; } /** * @param list $items + * @param array $params * @return list */ - protected function enrichEmailsFromProfilePages(array $items): array + protected function enrichEmailsFromProfilePages(array $items, array $params = []): array { if (! config('crawl.faculty.profile_email_enrich_enabled', true)) { return $items; } - $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 6))); - $timeout = max(8, (int) config('crawl.faculty.profile_http_timeout_seconds', 20)); - $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']; + $maxEnrich = $this->resolveProfileEnrichMax($params, count($items)); + if ($maxEnrich <= 0) { + return $this->markProfileEnrichSkipped($items); + } - $enriched = []; - foreach (array_chunk($items, $poolSize) as $chunk) { - $pending = []; - foreach ($chunk as $item) { - if ($this->itemHasEmail($item) || ! $item->canonicalUrl) { - $enriched[] = $item; + $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8))); + $timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10)); + $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']; - continue; - } - $pending[$item->externalId] = $item; + $fetchMap = []; + $enrichBudget = $maxEnrich; + foreach ($items as $index => $item) { + if ($enrichBudget <= 0) { + break; } - - if ($pending === []) { + if ($this->itemHasEmail($item) || ! $item->canonicalUrl) { continue; } + $fetchMap[$index] = $item; + $enrichBudget--; + } - $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pending, $headers, $timeout) { - foreach ($pending as $externalId => $item) { - $pool->as($externalId) + if ($fetchMap === []) { + return $items; + } + + $fetchedBodies = []; + foreach (array_chunk($fetchMap, $poolSize, true) as $chunk) { + $batchPending = []; + foreach ($chunk as $index => $item) { + $batchPending[$index] = $item; + } + + $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batchPending, $headers, $timeout) { + foreach ($batchPending as $index => $item) { + $pool->as((string) $index) ->timeout($timeout) - ->connectTimeout(min(10, $timeout)) - ->retry(1, 500, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) + ->connectTimeout(min(8, $timeout)) + ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) ->withHeaders($headers) ->get($item->canonicalUrl); } }); - foreach ($pending as $externalId => $item) { - $body = $this->responseBodyFromPoolResult($responses[$externalId] ?? null); + foreach ($batchPending as $index => $item) { + $body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null); if ($body !== null) { $email = $this->extractEmailFromProfileHtml($body); if ($email) { @@ -100,11 +208,69 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface } $item = $this->applyProfileMetadataToItem($item, $body); } - $enriched[] = $item; + $fetchedBodies[$index] = $item; } } - return $enriched; + $result = []; + foreach ($items as $index => $item) { + if (isset($fetchedBodies[$index])) { + $result[] = $fetchedBodies[$index]; + } elseif (! $this->itemHasEmail($item) && $item->canonicalUrl) { + $result[] = $this->markItemProfileEnrichSkipped($item); + } else { + $result[] = $item; + } + } + + return $result; + } + + /** + * @param array $params + */ + protected function resolveProfileEnrichMax(array $params, int $itemCount): int + { + if (($params['skip_profile_enrich'] ?? false) === true) { + return 0; + } + + $configured = (int) ($params['profile_enrich_max'] ?? config('crawl.faculty.profile_enrich_max', 32)); + + return max(0, min($itemCount, min(200, $configured))); + } + + /** + * @param list $items + * @return list + */ + protected function markProfileEnrichSkipped(array $items): array + { + return array_map(fn (CrawlItemDto $item) => $this->markItemProfileEnrichSkipped($item), $items); + } + + protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto + { + if ($this->itemHasEmail($item)) { + return $item; + } + + $extra = $item->extra; + $extra['profile_enrich_skipped'] = true; + + return new CrawlItemDto( + externalId: $item->externalId, + title: $item->title, + canonicalUrl: $item->canonicalUrl, + authors: $item->authors, + summary: $item->summary, + publishedAt: $item->publishedAt, + schoolName: $item->schoolName, + section: $item->section, + contentHtml: $item->contentHtml, + extra: $extra, + authorsParsed: $item->authorsParsed, + ); } protected function responseBodyFromPoolResult(mixed $result): ?string @@ -226,7 +392,11 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface protected function fetchHtml(string $url): string { - $response = Http::timeout(30) + $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20)); + + $response = Http::timeout($timeout) + ->connectTimeout(min(8, $timeout)) + ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']) ->get($url); diff --git a/config/crawl.php b/config/crawl.php index a360cd4..1ff2027 100644 --- a/config/crawl.php +++ b/config/crawl.php @@ -31,8 +31,13 @@ return [ 'faculty' => [ /** 列表项无邮箱时,是否请求教师主页补全邮箱 */ 'profile_email_enrich_enabled' => (bool) env('FACULTY_PROFILE_EMAIL_ENRICH', true), - 'profile_http_timeout_seconds' => (int) env('FACULTY_PROFILE_HTTP_TIMEOUT', 20), + /** 单次任务最多补全主页数(其余仍入库,仅无邮箱) */ + 'profile_enrich_max' => (int) env('FACULTY_PROFILE_ENRICH_MAX', 32), + 'profile_http_timeout_seconds' => (int) env('FACULTY_PROFILE_HTTP_TIMEOUT', 10), /** 并发请求教师主页数 */ - 'profile_enrich_pool_size' => (int) env('FACULTY_PROFILE_ENRICH_POOL', 6), + 'profile_enrich_pool_size' => (int) env('FACULTY_PROFILE_ENRICH_POOL', 8), + 'list_http_timeout_seconds' => (int) env('FACULTY_LIST_HTTP_TIMEOUT', 20), + /** 师资列表分页并发抓取数 */ + 'list_fetch_pool_size' => (int) env('FACULTY_LIST_FETCH_POOL', 5), ], ]; diff --git a/tests/Unit/FacultyListHtmlAdapterTest.php b/tests/Unit/FacultyListHtmlAdapterTest.php index 07fbb2f..09911c9 100644 --- a/tests/Unit/FacultyListHtmlAdapterTest.php +++ b/tests/Unit/FacultyListHtmlAdapterTest.php @@ -210,6 +210,17 @@ HTML; $this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']); } + public function test_resolve_profile_enrich_max_caps_large_batches(): void + { + $adapter = new FacultyListHtmlAdapter; + $method = new \ReflectionMethod($adapter, 'resolveProfileEnrichMax'); + $method->setAccessible(true); + + $this->assertSame(32, $method->invoke($adapter, [], 500)); + $this->assertSame(10, $method->invoke($adapter, ['profile_enrich_max' => 10], 500)); + $this->assertSame(0, $method->invoke($adapter, ['skip_profile_enrich' => true], 500)); + } + public function test_response_body_from_pool_result_ignores_connection_exception(): void { $adapter = new FacultyListHtmlAdapter;