master
lion 2 weeks ago
parent 22dec52d2e
commit 099c609328

@ -7,6 +7,7 @@ use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
@ -83,15 +84,16 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
foreach ($pending as $externalId => $item) {
$pool->as($externalId)
->timeout($timeout)
->connectTimeout(min(10, $timeout))
->retry(1, 500, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
->withHeaders($headers)
->get($item->canonicalUrl);
}
});
foreach ($pending as $externalId => $item) {
$response = $responses[$externalId] ?? null;
if ($response && $response->successful()) {
$body = (string) $response->body();
$body = $this->responseBodyFromPoolResult($responses[$externalId] ?? null);
if ($body !== null) {
$email = $this->extractEmailFromProfileHtml($body);
if ($email) {
$item = $this->applyEmailToItem($item, $email);
@ -105,6 +107,15 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return $enriched;
}
/**
 * Extract the body from a single pooled-request result.
 *
 * The argument is typed `mixed` because a pool slot is not guaranteed to
 * hold a Response: anything that is not a Response instance (for example
 * an exception object or null) is treated as "no body available".
 *
 * @param  mixed  $result  Value stored under one key of the pool's result set.
 * @return string|null Body of a successful Response, or null for a
 *                     non-Response value or an unsuccessful Response.
 */
protected function responseBodyFromPoolResult(mixed $result): ?string
{
    // Anything other than a real Response carries no usable body.
    if (! $result instanceof Response) {
        return null;
    }

    return $result->successful() ? (string) $result->body() : null;
}
protected function itemHasEmail(CrawlItemDto $item): bool
{
$lead = $item->extra['lead_author'] ?? null;

@ -209,4 +209,17 @@ HTML;
$this->assertSame('教授', $item->extra['lead_author']['academic_title']);
$this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']);
}
/**
 * A pool slot that holds a Guzzle connection exception, or nothing at all,
 * must produce a null body instead of being treated as a response.
 */
public function test_response_body_from_pool_result_ignores_connection_exception(): void
{
    $adapter = new FacultyListHtmlAdapter;

    // The helper is protected, so it is reached through reflection.
    $extractBody = new \ReflectionMethod($adapter, 'responseBodyFromPoolResult');
    $extractBody->setAccessible(true);

    $connectionFailure = new \GuzzleHttp\Exception\ConnectException(
        'Connection timed out',
        new \GuzzleHttp\Psr7\Request('GET', 'https://faculty.sjtu.edu.cn/test'),
    );

    // Same order as before: the exception first, then the missing slot.
    foreach ([$connectionFailure, null] as $poolResult) {
        $this->assertNull($extractBody->invoke($adapter, $poolResult));
    }
}
}

Loading…
Cancel
Save