You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

357 lines
13 KiB

<?php
namespace Tests\Unit;
use App\Services\Crawl\Adapters\FacultyListHtmlAdapter;
use Tests\TestCase;
/**
 * Unit tests for the parsing helpers of FacultyListHtmlAdapter.
 *
 * Every test feeds an inline HTML (or script) fixture to one adapter method
 * through reflection and asserts on the value that method returns; no HTTP
 * request is issued by the tests themselves.
 */
class FacultyListHtmlAdapterTest extends TestCase
{
    /**
     * A "tsites" college teacher list without e-mail addresses yields one item
     * per <li>, carrying name, profile URL, school, platform tag, academic
     * title and college name.
     */
    public function test_extracts_sjtu_college_teacher_list_without_email(): void
    {
        $html = <<<'HTML'
        <div class="jssy-b">
        <div class="title">电子信息与电气工程学院</div>
        <div class="list">
        <ul>
        <li>
        <a href="http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm" target="_blank">
        <div class="name">沈备军</div>
        <p>所在单位:电子信息与电气工程学院</p>
        <p>职称:副教授</p>
        </a>
        </li>
        <li>
        <a href="http://faculty.sjtu.edu.cn/xiabin/zh_CN/index.htm" target="_blank">
        <div class="name">夏斌</div>
        <p>所在单位:电子信息与电气工程学院</p>
        <p>职称:教授</p>
        </a>
        </li>
        </ul>
        </div>
        </div>
        HTML;

        $items = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractFromHtml',
            $html,
            [],
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?urltype=tsites.CollegeTeacherList&wbtreeid=1001&st=0&id=1701&lang=zh_CN',
        );

        $this->assertCount(2, $items);

        $first = $items[0];
        $this->assertSame('沈备军', $first->title);
        $this->assertSame('http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm', $first->canonicalUrl);
        $this->assertSame('上海交通大学', $first->schoolName);
        $this->assertSame('faculty_html_tsites', $first->extra['platform']);
        $this->assertSame('副教授', $first->extra['academic_title']);
        $this->assertSame('电子信息与电气工程学院', $first->extra['college_name']);
        $this->assertSame('副教授', $first->extra['lead_author']['academic_title']);

        $this->assertSame('夏斌', $items[1]->title);
    }

    /**
     * A list entry whose anchor has an empty href must still produce an item;
     * the canonical URL is expected to contain the profile slug that only
     * appears in the accompanying inline script.
     */
    public function test_extracts_teacher_when_profile_href_is_empty(): void
    {
        $html = <<<'HTML'
        <script>u_u11_pic.addimg("/__local/x.png","/shaohaibin/zh_CN/index.htm","邵海滨","1");</script>
        <div class="list"><ul>
        <li><a href="" target="_blank">
        <div class="name">邵海滨</div>
        <p>所在单位:电子信息与电气工程学院</p>
        <p>职称:副研究员</p>
        </a></li>
        </ul></div>
        HTML;

        $items = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractFromHtml',
            $html,
            [],
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?id=1701',
        );

        $this->assertCount(1, $items);

        $teacher = $items[0];
        $this->assertSame('邵海滨', $teacher->title);
        $this->assertStringContainsString('shaohaibin', (string) $teacher->canonicalUrl);
        $this->assertSame('副研究员', $teacher->extra['academic_title']);
        $this->assertSame('电子信息与电气工程学院', $teacher->extra['college_name']);
    }

    /**
     * The e-mail address following the "电子邮箱" label of a profile page is
     * returned as a plain string.
     */
    public function test_extracts_email_from_teacher_profile_html(): void
    {
        $email = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractEmailFromProfileHtml',
            '<li><strong>电子邮箱:</strong>bjshen@sjtu.edu.cn</li>',
        );

        $this->assertSame('bjshen@sjtu.edu.cn', $email);
    }

    /**
     * Applying an e-mail to an item fills the previously null
     * extra.lead_author.email entry of the returned item.
     */
    public function test_apply_email_to_item_updates_lead_author(): void
    {
        $dto = new \App\Services\Crawl\CrawlItemDto(
            externalId: 'faculty:test',
            title: '沈备军',
            canonicalUrl: 'http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm',
            extra: [
                'lead_author' => [
                    'name' => '沈备军',
                    'email' => null,
                    'university_name' => '上海交通大学',
                ],
            ],
        );

        $item = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'applyEmailToItem',
            $dto,
            'bjshen@sjtu.edu.cn',
        );

        $this->assertSame('bjshen@sjtu.edu.cn', $item->extra['lead_author']['email']);
    }

    /**
     * The page count is read from the "totalpage" query parameter of a pager
     * link, and the URL built for page 3 carries both PAGENUM and totalpage.
     */
    public function test_detects_total_pages_and_builds_pagenum_url(): void
    {
        $html = '<a href="?totalpage=20&PAGENUM=2&urltype=tsites.CollegeTeacherList&id=1701">下页</a>';

        // Both calls deliberately share one adapter instance.
        $adapter = new FacultyListHtmlAdapter;

        $this->assertSame(20, $this->callAdapterMethod($adapter, 'detectTotalPages', $html));

        $url = $this->callAdapterMethod(
            $adapter,
            'buildPageUrl',
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?urltype=tsites.CollegeTeacherList&wbtreeid=1001&st=0&id=1701&lang=zh_CN',
            3,
            $html,
        );

        $this->assertStringContainsString('PAGENUM=3', $url);
        $this->assertStringContainsString('totalpage=20', $url);
    }

    /**
     * The SMSE "staff-item" panel layout yields one item per anchor, with the
     * relative href resolved against the list URL and the panel title used as
     * the college name.
     */
    public function test_extracts_smse_staff_panel_list(): void
    {
        $html = <<<'HTML'
        <title>教师名录 - 上海交通大学材料科学与工程学院</title>
        <div class="panel-head">
        <div class="title">塑性成形技术与装备研究院</div>
        </div>
        <div class="panel-body">
        <div class="staff-list">
        <a href="/people/detail_new/20092" class="staff-item">陈军</a>
        <a href="/people/detail_new/20111" class="staff-item">韩先洪</a>
        </div>
        </div>
        HTML;

        $items = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractFromHtml',
            $html,
            [],
            'https://smse.sjtu.edu.cn/people/staff_new/department',
        );

        $this->assertCount(2, $items);

        $first = $items[0];
        $this->assertSame('陈军', $first->title);
        $this->assertSame('https://smse.sjtu.edu.cn/people/detail_new/20092', $first->canonicalUrl);
        $this->assertSame('上海交通大学', $first->schoolName);
        $this->assertSame('faculty_html_smse', $first->extra['platform']);
        $this->assertSame('塑性成形技术与装备研究院', $first->extra['college_name']);
    }

    /**
     * Metadata found on an SMSE detail page (academic title and second-level
     * organisation) is copied onto the returned item.
     */
    public function test_apply_profile_metadata_from_smse_detail_page(): void
    {
        $html = <<<'HTML'
        <div class="people-name"><p>陈军</p><em>教授</em></div>
        <div class="info jigou">所属二级机构:塑性成形技术与装备研究院</div>
        HTML;

        $dto = new \App\Services\Crawl\CrawlItemDto(
            externalId: 'faculty:test',
            title: '陈军',
            canonicalUrl: 'https://smse.sjtu.edu.cn/people/detail_new/20092',
            extra: [
                'lead_author' => [
                    'name' => '陈军',
                    'email' => null,
                    'university_name' => '上海交通大学',
                ],
            ],
        );

        $item = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'applyProfileMetadataToItem',
            $dto,
            $html,
        );

        $this->assertSame('教授', $item->extra['lead_author']['academic_title']);
        $this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']);
    }

    /**
     * An AJAX fragment using the SAIS "js-list" markup yields one item per
     * named anchor, tagged with the "faculty_html_ajax" platform.
     */
    public function test_extracts_sais_js_list_from_ajax_content(): void
    {
        $html = <<<'HTML'
        <title>教师名录-上海交通大学自动化与感知学院</title>
        <div class="js-list">
        <li><a href="https://sais.sjtu.edu.cn/faculty/baiyang.html" class="name">白洋</a></li>
        <li><a href="https://sais.sjtu.edu.cn/faculty/chenxin.html" class="name">陈新</a></li>
        </div>
        HTML;

        $items = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractFromAjaxTeacherContent',
            $html,
            [],
            'https://sais.sjtu.edu.cn/faculty.html',
            'faculty',
        );

        $this->assertCount(2, $items);

        $first = $items[0];
        $this->assertSame('白洋', $first->title);
        $this->assertSame('https://sais.sjtu.edu.cn/faculty/baiyang.html', $first->canonicalUrl);
        $this->assertSame('faculty_html_ajax', $first->extra['platform']);
        $this->assertSame('上海交通大学', $first->schoolName);
    }

    /**
     * The CS "rc-item" layout groups teachers under an institute heading; the
     * heading is expected as the college name and a role prefix such as
     * "所长:" must not leak into the teacher's name.
     */
    public function test_extracts_cs_rc_item_teacher_list(): void
    {
        $html = <<<'HTML'
        <title>教师名录-上海交通大学计算机学院(网络空间安全学院、密码学院)</title>
        <div class="rc-item">
        <div class="tit"><div class="name">并行与分布式系统研究所</div></div>
        <div class="dt">
        <p>所长:<a href="https://www.cs.sjtu.edu.cn/jiaoshiml/zangbinyu.html" target="_blank">臧斌宇</a></p>
        <p><a href="https://www.cs.sjtu.edu.cn/jiaoshiml/chenhaibo.html" target="_blank">陈海波</a></p>
        </div>
        </div>
        HTML;

        $items = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractFromAjaxTeacherContent',
            $html,
            [],
            'https://www.cs.sjtu.edu.cn/jiaoshiml.html',
            'jiaoshiml',
        );

        $this->assertCount(2, $items);
        $this->assertSame('臧斌宇', $items[0]->title);
        $this->assertSame('并行与分布式系统研究所', $items[0]->extra['college_name']);
        $this->assertSame('https://www.cs.sjtu.edu.cn/jiaoshiml/chenhaibo.html', $items[1]->canonicalUrl);
    }

    /**
     * For a batch of 500 items the enrichment limit is 32 without options,
     * follows an explicit "profile_enrich_max", and is 0 when
     * "skip_profile_enrich" is set.
     */
    public function test_resolve_profile_enrich_max_caps_large_batches(): void
    {
        // All three calls deliberately share one adapter instance.
        $adapter = new FacultyListHtmlAdapter;

        $this->assertSame(32, $this->callAdapterMethod($adapter, 'resolveProfileEnrichMax', [], 500));
        $this->assertSame(
            10,
            $this->callAdapterMethod($adapter, 'resolveProfileEnrichMax', ['profile_enrich_max' => 10], 500),
        );
        $this->assertSame(
            0,
            $this->callAdapterMethod($adapter, 'resolveProfileEnrichMax', ['skip_profile_enrich' => true], 500),
        );
    }

    /**
     * A pool result that is a Guzzle ConnectException, or null, produces a
     * null body instead of raising.
     */
    public function test_response_body_from_pool_result_ignores_connection_exception(): void
    {
        // Both calls deliberately share one adapter instance.
        $adapter = new FacultyListHtmlAdapter;

        $connectionFailure = new \GuzzleHttp\Exception\ConnectException(
            'Connection timed out',
            new \GuzzleHttp\Psr7\Request('GET', 'https://faculty.sjtu.edu.cn/test'),
        );

        $this->assertNull($this->callAdapterMethod($adapter, 'responseBodyFromPoolResult', $connectionFailure));
        $this->assertNull($this->callAdapterMethod($adapter, 'responseBodyFromPoolResult', null));
    }

    /**
     * An inline $.ajax() call that passes only "cat_code" (no cat id) is
     * parsed into the "simple" variant with an absolute API URL and paging
     * enabled.
     */
    public function test_parses_icisee_ajax_teacher_config_without_cat_id(): void
    {
        $html = <<<'HTML'
        <script>
        $.ajax({
        url: '/active/ajax_teacher_list.html',
        type: 'post',
        data: {page:page, cat_code:'jiaoshiml', yjszxfl:global_yjszxfl, name:global_name, zm:global_zm},
        });
        </script>
        HTML;

        $config = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'parseAjaxTeacherConfig',
            $html,
            'https://icisee.sjtu.edu.cn/jiaoshiml.html',
        );

        $this->assertSame('simple', $config['variant']);
        $this->assertNull($config['cat_id']);
        $this->assertSame('jiaoshiml', $config['cat_code']);
        $this->assertSame('https://icisee.sjtu.edu.cn/active/ajax_teacher_list.html', $config['api_url']);
        $this->assertTrue($config['uses_page']);
    }

    /**
     * The ICISEE card layout nests the academic title (and optional extra
     * text) inside the name element; name and title must be separated and the
     * relative href resolved against the list URL.
     */
    public function test_extracts_icisee_card_style_teacher_list(): void
    {
        $html = <<<'HTML'
        <title>教师名录-上海交通大学集成电路学院(信息与电子工程学院)</title>
        <a href="/jiaoshiml/caixinghan.html" target="_blank">
        <div class="imgk"><img src="/upload/x.png" alt=""></div>
        <div class="name">蔡星汉<span>教授</span><p class="line-2">微纳全重党支部书记</p></div>
        </a>
        <a href="/jiaoshiml/zhangsan.html" target="_blank">
        <div class="imgk"><img src="/upload/y.png" alt=""></div>
        <div class="name">张三<span>副教授</span></div>
        </a>
        HTML;

        $items = $this->callAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractFromAjaxTeacherContent',
            $html,
            [],
            'https://icisee.sjtu.edu.cn/jiaoshiml.html',
            'jiaoshiml',
        );

        $this->assertCount(2, $items);

        $this->assertSame('蔡星汉', $items[0]->title);
        $this->assertSame('教授', $items[0]->extra['academic_title']);
        $this->assertSame('https://icisee.sjtu.edu.cn/jiaoshiml/caixinghan.html', $items[0]->canonicalUrl);

        $this->assertSame('张三', $items[1]->title);
        $this->assertSame('副教授', $items[1]->extra['academic_title']);

        $this->assertSame('faculty_html_ajax', $items[0]->extra['platform']);
    }

    /**
     * Invokes an adapter method by name through reflection, so that methods
     * which are not part of the public API can be exercised directly.
     *
     * @param FacultyListHtmlAdapter $adapter   instance to invoke the method on
     * @param string                 $method    name of the method to call
     * @param mixed                  ...$arguments positional arguments forwarded unchanged
     *
     * @return mixed whatever the invoked method returns
     *
     * @throws \ReflectionException when the adapter has no method of that name
     */
    private function callAdapterMethod(FacultyListHtmlAdapter $adapter, string $method, mixed ...$arguments): mixed
    {
        $reflection = new \ReflectionMethod($adapter, $method);

        // setAccessible() has been a no-op since PHP 8.1 (every method is
        // invokable through reflection); only older runtimes still need it.
        if (\PHP_VERSION_ID < 80100) {
            $reflection->setAccessible(true);
        }

        return $reflection->invoke($adapter, ...$arguments);
    }
}