master
lion 1 day ago
parent 6d5bfc16bd
commit 22dec52d2e

@ -91,10 +91,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
foreach ($pending as $externalId => $item) {
$response = $responses[$externalId] ?? null;
if ($response && $response->successful()) {
$email = $this->extractEmailFromProfileHtml((string) $response->body());
$body = (string) $response->body();
$email = $this->extractEmailFromProfileHtml($body);
if ($email) {
$item = $this->applyEmailToItem($item, $email);
}
$item = $this->applyProfileMetadataToItem($item, $body);
}
$enriched[] = $item;
}
@ -299,7 +301,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return $items;
}
return $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl);
$items = $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl);
if ($items !== []) {
return $items;
}
return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
}
/**
@ -362,11 +369,223 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
}
/**
 * Extract faculty entries from a "staff panel" listing page, e.g. the SJTU School of
 * Materials directory: `panel-head` blocks carrying a department title, each followed by
 * `a.staff-item` links pointing at /people/detail_new/{id}.
 *
 * The page is cut at every `<div class="panel-head">`. Links inside a panel are attributed
 * to that panel's title. Links in a panel whose title cannot be parsed, and links that
 * appear before the first panel head, are attributed to the college inferred from the page
 * <title> instead of being dropped. When the panel pass yields nothing (no panels, or every
 * entry was filtered out), the whole page is scanned once more with the inferred college.
 * Items are de-duplicated by external id; the first occurrence wins.
 *
 * @param  list<string>  $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array
{
    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
    $defaultCollege = $this->inferCollegeFromPageTitle($html);

    // Turns [html fragment, department] pairs into de-duplicated items, in document order.
    $collect = function (array $fragments) use ($pageUniversity, $keywords, $sourceUrl): array {
        $items = [];
        $seen = [];
        foreach ($fragments as [$fragment, $department]) {
            foreach ($this->extractStaffItemLinks($fragment) as $link) {
                $item = $this->makeStaffPanelItem(
                    $link,
                    $department,
                    $pageUniversity,
                    $keywords,
                    $sourceUrl,
                );
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $items[] = $item;
            }
        }

        return $items;
    };

    // preg_split() returns false on a PCRE error; treat that like "no panels found".
    $chunks = preg_split('#<div\s+class="panel-head">#u', $html) ?: [];

    $fragments = [];
    if (count($chunks) > 1) {
        foreach ($chunks as $index => $chunk) {
            // Chunk 0 is the markup before the first panel head, so it has no panel title.
            $department = null;
            if ($index > 0 && preg_match('#<div\s+class="title">\s*([^<]+?)\s*</div>#u', $chunk, $titleMatch) === 1) {
                $department = CrawlAuthorParser::cleanText($titleMatch[1]);
            }
            // A missing or empty title falls back to the page-level college rather than
            // discarding the staff listed in that chunk.
            $fragments[] = [$chunk, $department ?: $defaultCollege];
        }
    }

    $items = $collect($fragments);
    if ($items !== []) {
        return $items;
    }

    // No usable panel structure: scan the page as a single list.
    return $collect([[$html, $defaultCollege]]);
}
/**
 * Collect the anchors marked as staff entries whose text looks like a person name.
 *
 * An anchor qualifies when its attribute string contains `staff-item`, it has a non-empty
 * href, and its content is plain text (anchors with nested tags are not matched). The href
 * may be written with double or single quotes. Each href/name pair is returned once, in
 * document order, with HTML entities in the href decoded.
 *
 * @return list<array{href:string,name:string}>
 */
protected function extractStaffItemLinks(string $html): array
{
    if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $matches, PREG_SET_ORDER)) {
        return [];
    }

    $links = [];
    $seen = [];
    foreach ($matches as $match) {
        $attrs = (string) $match[1];
        if (! str_contains($attrs, 'staff-item')) {
            continue;
        }
        // The look-behind keeps attributes such as data-href from being read as the href
        // (a plain \b would match after the hyphen); the branch-reset group puts the value
        // in group 1 for either quoting style.
        if (! preg_match('#(?<![\w\-])href\s*=\s*(?|"([^"]+)"|\'([^\']+)\')#u', $attrs, $hrefMatch)) {
            continue;
        }
        $name = CrawlAuthorParser::cleanText($match[2]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }
        $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $key = $href.'|'.$name;
        if (isset($seen[$key])) {
            continue;
        }
        $seen[$key] = true;
        $links[] = ['href' => $href, 'name' => $name];
    }

    return $links;
}
/**
 * Build the crawl item for one staff-panel link, or return null when the keyword filter
 * rejects it.
 *
 * The filter is applied to the name followed by the department (when one is given). The
 * external key is an md5 of the resolved profile URL, or of the name when no URL could be
 * resolved. The department doubles as the affiliation and, prefixed with "单位:", as the
 * summary.
 *
 * @param  array{href:string,name:string}  $link
 * @param  list<string>  $keywords
 */
protected function makeStaffPanelItem(
    array $link,
    ?string $department,
    ?string $pageUniversity,
    array $keywords,
    string $sourceUrl,
): ?CrawlItemDto {
    $personName = $link['name'];
    $resolvedUrl = $this->resolveUrl($link['href'], $sourceUrl);

    $haystack = trim($personName.' '.($department ?? ''));
    if (! $this->matchesKeywords($haystack, $keywords)) {
        return null;
    }

    // Prefer the profile URL as identity; the name is only a last resort.
    if ($resolvedUrl) {
        $identity = $resolvedUrl;
    } else {
        $identity = 'name:'.md5($personName);
    }

    // Only consult the affiliation when the page itself did not reveal the university.
    $university = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department);

    $summaryText = null;
    if ($department) {
        $summaryText = '单位:'.$department;
    }

    return $this->makeFacultyItem(
        externalKey: 'faculty:'.md5($identity),
        name: $personName,
        profileUrl: $resolvedUrl,
        email: null,
        affiliation: $department,
        universityName: $university,
        summary: $summaryText,
        keywords: $keywords,
        academicTitle: null,
        platform: 'faculty_html_smse',
        bio: null,
    );
}
/**
 * Guess the college name from the page <title>.
 *
 * Looks at the title text that follows the first dash-like separator (-, – or —). When
 * that text contains a run of CJK/Latin letters ending in 学院, 研究院, 学部 or 系, the run is
 * returned; otherwise the cleaned text itself is returned. Returns null when the title has
 * no such separator or the remaining text is empty.
 */
protected function inferCollegeFromPageTitle(string $html): ?string
{
    $titlePattern = '/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u';
    if (preg_match($titlePattern, $html, $titleParts) !== 1) {
        return null;
    }

    $tail = CrawlAuthorParser::cleanText($titleParts[1]);
    if ($tail === null || $tail === '') {
        return null;
    }

    $collegePattern = '/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u';

    return preg_match($collegePattern, $tail, $collegeParts) === 1
        ? CrawlAuthorParser::cleanText($collegeParts[1])
        : $tail;
}
/**
 * Fill in lead-author metadata that is still missing, using a profile page's HTML.
 *
 * - Academic title: taken from the first plain-text <em> element, only when the lead author
 *   has none yet.
 * - College / affiliation: taken from the field labelled 所属二级机构 (read through
 *   parseLabeledField()), only when both are empty.
 *
 * When nothing was added the original item is returned untouched. Otherwise a new
 * CrawlItemDto is returned with the updated lead author mirrored into
 * extra['academic_title'], extra['college_name'] and the first parsed author.
 */
protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
{
    $storedLead = $item->extra['lead_author'] ?? null;
    $lead = is_array($storedLead) ? $storedLead : [];
    $updated = false;

    if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $emMatch) === 1) {
        $academicTitle = CrawlAuthorParser::cleanText($emMatch[1]);
        if ($academicTitle !== null && $academicTitle !== '') {
            $lead['academic_title'] = $academicTitle;
            $updated = true;
        }
    }

    $lacksAffiliation = empty($lead['college']) && empty($lead['affiliation']);
    if ($lacksAffiliation) {
        $unit = $this->parseLabeledField($html, '所属二级机构');
        if ($unit !== null && $unit !== '') {
            $lead['affiliation'] = $unit;
            $lead['college'] = $unit;
            $updated = true;
        }
    }

    if (! $updated) {
        return $item;
    }

    $hasTitle = ! empty($lead['academic_title']);
    $hasCollege = ! empty($lead['college']);

    // Mirror the lead author's values into the flat "extra" keys.
    $extra = $item->extra;
    $extra['lead_author'] = $lead;
    if ($hasTitle) {
        $extra['academic_title'] = $lead['academic_title'];
    }
    if ($hasCollege) {
        $extra['college_name'] = $lead['college'];
    }

    // Keep the first parsed author in sync, when there is one.
    $parsedAuthors = $item->authorsParsed;
    if ($parsedAuthors !== []) {
        if ($hasTitle) {
            $parsedAuthors[0]['academic_title'] = $lead['academic_title'];
        }
        if ($hasCollege) {
            $parsedAuthors[0]['affiliation'] = $lead['college'];
        }
    }

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $extra,
        authorsParsed: $parsedAuthors,
    );
}
/**
* @param list<string> $keywords
* @return list<CrawlItemDto>
*/
protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
{
$items = [];

@ -143,4 +143,70 @@ HTML;
$this->assertStringContainsString('PAGENUM=3', $url);
$this->assertStringContainsString('totalpage=20', $url);
}
/**
 * A staff-panel listing yields one item per staff link, carrying the resolved profile URL,
 * the university, the platform tag and the panel title as college name.
 */
public function test_extracts_smse_staff_panel_list(): void
{
    $listingHtml = <<<'HTML'
<title>教师名录 - 上海交通大学材料科学与工程学院</title>
<div class="panel-head">
<div class="title">塑性成形技术与装备研究院</div>
</div>
<div class="panel-body">
<div class="staff-list">
<a href="/people/detail_new/20092" class="staff-item">陈军</a>
<a href="/people/detail_new/20111" class="staff-item">韩先洪</a>
</div>
</div>
HTML;
    $listingUrl = 'https://smse.sjtu.edu.cn/people/staff_new/department';

    // extractFromHtml is not public, so it is reached through reflection.
    $adapter = new FacultyListHtmlAdapter();
    $extractFromHtml = new \ReflectionMethod($adapter, 'extractFromHtml');
    $extractFromHtml->setAccessible(true);

    $items = $extractFromHtml->invokeArgs($adapter, [$listingHtml, [], $listingUrl]);

    $this->assertCount(2, $items);

    $first = $items[0];
    $this->assertSame('陈军', $first->title);
    $this->assertSame('https://smse.sjtu.edu.cn/people/detail_new/20092', $first->canonicalUrl);
    $this->assertSame('上海交通大学', $first->schoolName);
    $this->assertSame('faculty_html_smse', $first->extra['platform']);
    $this->assertSame('塑性成形技术与装备研究院', $first->extra['college_name']);
}
/**
 * A detail page supplies the academic title (from <em>) and the college name (from the
 * 所属二级机构 field) for an item whose lead author has neither.
 */
public function test_apply_profile_metadata_from_smse_detail_page(): void
{
    $detailHtml = <<<'HTML'
<div class="people-name"><p>陈军</p><em>教授</em></div>
<div class="info jigou">所属二级机构:塑性成形技术与装备研究院</div>
HTML;

    // Lead author deliberately has no title, college or affiliation yet.
    $bareItem = new \App\Services\Crawl\CrawlItemDto(
        externalId: 'faculty:test',
        title: '陈军',
        canonicalUrl: 'https://smse.sjtu.edu.cn/people/detail_new/20092',
        extra: [
            'lead_author' => [
                'name' => '陈军',
                'email' => null,
                'university_name' => '上海交通大学',
            ],
        ],
    );

    // applyProfileMetadataToItem is not public, so it is reached through reflection.
    $adapter = new FacultyListHtmlAdapter();
    $applyMetadata = new \ReflectionMethod($adapter, 'applyProfileMetadataToItem');
    $applyMetadata->setAccessible(true);

    $enriched = $applyMetadata->invokeArgs($adapter, [$bareItem, $detailHtml]);

    $this->assertSame('教授', $enriched->extra['lead_author']['academic_title']);
    $this->assertSame('塑性成形技术与装备研究院', $enriched->extra['college_name']);
}
}

Loading…
Cancel
Save