diff --git a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php
index fe537a7..b372e01 100644
--- a/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php
+++ b/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php
@@ -7,6 +7,7 @@ use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
 use App\Services\Crawl\CrawlAuthorParser;
 use App\Services\Crawl\CrawlItemDto;
 use App\Services\Crawl\CrawlKeywordParser;
+use Illuminate\Http\Client\Response;
 use Illuminate\Support\Facades\Http;
 use Illuminate\Support\Str;
 
@@ -83,15 +84,18 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
             foreach ($pending as $externalId => $item) {
                 $pool->as($externalId)
                     ->timeout($timeout)
+                    // Cap the connect phase at 10 seconds, or at the overall timeout if that is shorter.
+                    ->connectTimeout(min(10, $timeout))
+                    // retry() takes the total number of attempts, so 2 means one retry on connection failure.
+                    ->retry(2, 500, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                     ->withHeaders($headers)
                     ->get($item->canonicalUrl);
             }
         });
 
         foreach ($pending as $externalId => $item) {
-            $response = $responses[$externalId] ?? null;
-            if ($response && $response->successful()) {
-                $body = (string) $response->body();
+            $body = $this->responseBodyFromPoolResult($responses[$externalId] ?? null);
+            if ($body !== null) {
                 $email = $this->extractEmailFromProfileHtml($body);
                 if ($email) {
                     $item = $this->applyEmailToItem($item, $email);
@@ -105,6 +109,24 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
         return $enriched;
     }
 
+    /**
+     * Extract the body from a single pooled HTTP result.
+     *
+     * A pool entry is not always a Response: it may be an exception object
+     * (for example after a connection failure) or null when the key is absent.
+     *
+     * @param  mixed  $result  Pool entry for one request, or null.
+     * @return string|null Body of a successful response; null in every other case.
+     */
+    protected function responseBodyFromPoolResult(mixed $result): ?string
+    {
+        if ($result instanceof Response && $result->successful()) {
+            return (string) $result->body();
+        }
+
+        return null;
+    }
+
     protected function itemHasEmail(CrawlItemDto $item): bool
     {
         $lead = $item->extra['lead_author'] ?? null;
diff --git a/tests/Unit/FacultyListHtmlAdapterTest.php b/tests/Unit/FacultyListHtmlAdapterTest.php
index 3a966ad..07fbb2f 100644
--- a/tests/Unit/FacultyListHtmlAdapterTest.php
+++ b/tests/Unit/FacultyListHtmlAdapterTest.php
@@ -209,4 +209,20 @@ HTML;
         $this->assertSame('教授', $item->extra['lead_author']['academic_title']);
         $this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']);
     }
+
+    public function test_response_body_from_pool_result_ignores_connection_exception(): void
+    {
+        $adapter = new FacultyListHtmlAdapter;
+        $method = new \ReflectionMethod($adapter, 'responseBodyFromPoolResult');
+        $method->setAccessible(true);
+
+        // Exception objects in place of a Response must be ignored, not dereferenced.
+        $this->assertNull($method->invoke($adapter, new \GuzzleHttp\Exception\ConnectException(
+            'Connection timed out',
+            new \GuzzleHttp\Psr7\Request('GET', 'https://faculty.sjtu.edu.cn/test'),
+        )));
+        $this->assertNull($method->invoke($adapter, new \Illuminate\Http\Client\ConnectionException('Connection timed out')));
+        // A key missing from the pool results reaches the helper as null.
+        $this->assertNull($method->invoke($adapter, null));
+    }
 }