电子信息与电气工程学院
HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromHtml');
        $method->setAccessible(true);

        $items = $method->invoke(
            $adapter,
            $html,
            [],
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?urltype=tsites.CollegeTeacherList&wbtreeid=1001&st=0&id=1701&lang=zh_CN',
        );

        $this->assertCount(2, $items);
        $this->assertSame('沈备军', $items[0]->title);
        $this->assertSame('http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm', $items[0]->canonicalUrl);
        $this->assertSame('上海交通大学', $items[0]->schoolName);
        $this->assertSame('faculty_html_tsites', $items[0]->extra['platform']);
        $this->assertSame('副教授', $items[0]->extra['academic_title']);
        $this->assertSame('电子信息与电气工程学院', $items[0]->extra['college_name']);
        $this->assertSame('副教授', $items[0]->extra['lead_author']['academic_title']);
        $this->assertSame('夏斌', $items[1]->title);
    }

    /**
     * Invokes extractFromHtml() through reflection and expects exactly one item,
     * with the teacher name as title and a canonical URL containing "shaohaibin".
     *
     * NOTE(review): the heredoc fixture below is empty, and the other HTML
     * fixtures in this class contain text only. The markup appears to have been
     * stripped; restore the original fixtures before relying on these tests.
     */
    public function test_extracts_teacher_when_profile_href_is_empty(): void
    {
        $html = <<<'HTML'
HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromHtml');
        $method->setAccessible(true);

        $items = $method->invoke(
            $adapter,
            $html,
            [],
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?id=1701',
        );

        $this->assertCount(1, $items);
        $this->assertSame('邵海滨', $items[0]->title);
        $this->assertStringContainsString('shaohaibin', (string) $items[0]->canonicalUrl);
        $this->assertSame('副研究员', $items[0]->extra['academic_title']);
        $this->assertSame('电子信息与电气工程学院', $items[0]->extra['college_name']);
    }

    /**
     * Invokes extractEmailFromProfileHtml() through reflection and expects the
     * address embedded in the profile snippet to be returned.
     */
    public function test_extracts_email_from_teacher_profile_html(): void
    {
        $html = '
  • 电子邮箱:bjshen@sjtu.edu.cn
  • ';
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractEmailFromProfileHtml');
        $method->setAccessible(true);

        $email = $method->invoke($adapter, $html);

        $this->assertSame('bjshen@sjtu.edu.cn', $email);
    }

    /**
     * Invokes applyEmailToItem() through reflection with a DTO whose
     * lead_author email is null and expects the returned item to carry the
     * supplied address in extra['lead_author']['email'].
     */
    public function test_apply_email_to_item_updates_lead_author(): void
    {
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'applyEmailToItem');
        $method->setAccessible(true);

        $item = $method->invoke(
            $adapter,
            new \App\Services\Crawl\CrawlItemDto(
                externalId: 'faculty:test',
                title: '沈备军',
                canonicalUrl: 'http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm',
                extra: [
                    'lead_author' => [
                        'name' => '沈备军',
                        'email' => null,
                        'university_name' => '上海交通大学',
                    ],
                ],
            ),
            'bjshen@sjtu.edu.cn',
        );

        $this->assertSame('bjshen@sjtu.edu.cn', $item->extra['lead_author']['email']);
    }

    /**
     * Invokes detectTotalPages() and buildPageUrl() through reflection; expects
     * 20 total pages and a page-3 URL containing "PAGENUM=3" and "totalpage=20".
     *
     * NOTE(review): the fixture holds only the link text, so the page count
     * cannot come from it as written; confirm against the original markup.
     */
    public function test_detects_total_pages_and_builds_pagenum_url(): void
    {
        $html = '下页';
        $adapter = new FacultyListHtmlAdapter;

        $detect = new \ReflectionMethod($adapter, 'detectTotalPages');
        $detect->setAccessible(true);
        $this->assertSame(20, $detect->invoke($adapter, $html));

        $build = new \ReflectionMethod($adapter, 'buildPageUrl');
        $build->setAccessible(true);
        $url = $build->invoke(
            $adapter,
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?urltype=tsites.CollegeTeacherList&wbtreeid=1001&st=0&id=1701&lang=zh_CN',
            3,
            $html,
        );

        $this->assertStringContainsString('PAGENUM=3', $url);
        $this->assertStringContainsString('totalpage=20', $url);
    }

    /**
     * Invokes extractFromHtml() through reflection for the smse.sjtu.edu.cn
     * staff listing and expects two items tagged with platform
     * "faculty_html_smse".
     */
    public function test_extracts_smse_staff_panel_list(): void
    {
        $html = <<<'HTML'
    教师名录 - 上海交通大学材料科学与工程学院
    塑性成形技术与装备研究院
    陈军 韩先洪
    HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromHtml');
        $method->setAccessible(true);

        $items = $method->invoke(
            $adapter,
            $html,
            [],
            'https://smse.sjtu.edu.cn/people/staff_new/department',
        );

        $this->assertCount(2, $items);
        $this->assertSame('陈军', $items[0]->title);
        $this->assertSame('https://smse.sjtu.edu.cn/people/detail_new/20092', $items[0]->canonicalUrl);
        $this->assertSame('上海交通大学', $items[0]->schoolName);
        $this->assertSame('faculty_html_smse', $items[0]->extra['platform']);
        $this->assertSame('塑性成形技术与装备研究院', $items[0]->extra['college_name']);
    }

    /**
     * Invokes applyProfileMetadataToItem() through reflection and expects the
     * returned item to carry the academic title on lead_author and the
     * college name in extra['college_name'].
     */
    public function test_apply_profile_metadata_from_smse_detail_page(): void
    {
        $html = <<<'HTML'

    陈军

    教授
    所属二级机构:塑性成形技术与装备研究院
    HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'applyProfileMetadataToItem');
        $method->setAccessible(true);

        $item = $method->invoke(
            $adapter,
            new \App\Services\Crawl\CrawlItemDto(
                externalId: 'faculty:test',
                title: '陈军',
                canonicalUrl: 'https://smse.sjtu.edu.cn/people/detail_new/20092',
                extra: [
                    'lead_author' => [
                        'name' => '陈军',
                        'email' => null,
                        'university_name' => '上海交通大学',
                    ],
                ],
            ),
            $html,
        );

        $this->assertSame('教授', $item->extra['lead_author']['academic_title']);
        $this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']);
    }

    /**
     * Invokes extractFromAjaxTeacherContent() through reflection for the
     * sais.sjtu.edu.cn listing and expects two items tagged with platform
     * "faculty_html_ajax".
     */
    public function test_extracts_sais_js_list_from_ajax_content(): void
    {
        $html = <<<'HTML'
  教师名录-上海交通大学自动化与感知学院
  • 白洋
  • 陈新
  •
  HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromAjaxTeacherContent');
        $method->setAccessible(true);

        $items = $method->invoke(
            $adapter,
            $html,
            [],
            'https://sais.sjtu.edu.cn/faculty.html',
            'faculty',
        );

        $this->assertCount(2, $items);
        $this->assertSame('白洋', $items[0]->title);
        $this->assertSame('https://sais.sjtu.edu.cn/faculty/baiyang.html', $items[0]->canonicalUrl);
        $this->assertSame('faculty_html_ajax', $items[0]->extra['platform']);
        $this->assertSame('上海交通大学', $items[0]->schoolName);
    }

    /**
     * Invokes extractFromAjaxTeacherContent() through reflection for the
     * www.cs.sjtu.edu.cn listing and expects two items, the first carrying the
     * institute name as college_name.
     */
    public function test_extracts_cs_rc_item_teacher_list(): void
    {
        $html = <<<'HTML'
    教师名录-上海交通大学计算机学院(网络空间安全学院、密码学院)
    并行与分布式系统研究所

    所长:臧斌宇

    陈海波

    HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromAjaxTeacherContent');
        $method->setAccessible(true);

        $items = $method->invoke(
            $adapter,
            $html,
            [],
            'https://www.cs.sjtu.edu.cn/jiaoshiml.html',
            'jiaoshiml',
        );

        $this->assertCount(2, $items);
        $this->assertSame('臧斌宇', $items[0]->title);
        $this->assertSame('并行与分布式系统研究所', $items[0]->extra['college_name']);
        $this->assertSame('https://www.cs.sjtu.edu.cn/jiaoshiml/chenhaibo.html', $items[1]->canonicalUrl);
    }

    /**
     * Invokes resolveProfileEnrichMax() through reflection with 500 as the last
     * argument; expects 32 with no options, the explicit profile_enrich_max
     * value when given, and 0 when skip_profile_enrich is true.
     */
    public function test_resolve_profile_enrich_max_caps_large_batches(): void
    {
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'resolveProfileEnrichMax');
        $method->setAccessible(true);

        $this->assertSame(32, $method->invoke($adapter, [], 500));
        $this->assertSame(10, $method->invoke($adapter, ['profile_enrich_max' => 10], 500));
        $this->assertSame(0, $method->invoke($adapter, ['skip_profile_enrich' => true], 500));
    }

    /**
     * Invokes responseBodyFromPoolResult() through reflection and expects null
     * both for a Guzzle ConnectException and for a null pool result.
     */
    public function test_response_body_from_pool_result_ignores_connection_exception(): void
    {
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'responseBodyFromPoolResult');
        $method->setAccessible(true);

        $connectionFailure = new \GuzzleHttp\Exception\ConnectException(
            'Connection timed out',
            new \GuzzleHttp\Psr7\Request('GET', 'https://faculty.sjtu.edu.cn/test'),
        );

        $this->assertNull($method->invoke($adapter, $connectionFailure));
        $this->assertNull($method->invoke($adapter, null));
    }

    /**
     * Invokes parseAjaxTeacherConfig() through reflection and checks the
     * returned variant, cat_id, cat_code, api_url and uses_page entries.
     *
     * NOTE(review): the fixture is empty as written; confirm against the
     * original page source.
     */
    public function test_parses_icisee_ajax_teacher_config_without_cat_id(): void
    {
        $html = <<<'HTML'
        HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'parseAjaxTeacherConfig');
        $method->setAccessible(true);

        $config = $method->invoke($adapter, $html, 'https://icisee.sjtu.edu.cn/jiaoshiml.html');

        $this->assertSame('simple', $config['variant']);
        $this->assertNull($config['cat_id']);
        $this->assertSame('jiaoshiml', $config['cat_code']);
        $this->assertSame('https://icisee.sjtu.edu.cn/active/ajax_teacher_list.html', $config['api_url']);
        $this->assertTrue($config['uses_page']);
    }

    /**
     * Invokes extractFromAjaxTeacherContent() through reflection for the
     * icisee.sjtu.edu.cn listing and expects two items whose name and academic
     * title are reported separately.
     */
    public function test_extracts_icisee_card_style_teacher_list(): void
    {
        $html = <<<'HTML'
    教师名录-上海交通大学集成电路学院(信息与电子工程学院)
    蔡星汉教授

    微纳全重党支部书记

    张三副教授
    HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromAjaxTeacherContent');
        $method->setAccessible(true);

        $items = $method->invoke(
            $adapter,
            $html,
            [],
            'https://icisee.sjtu.edu.cn/jiaoshiml.html',
            'jiaoshiml',
        );

        $this->assertCount(2, $items);
        $this->assertSame('蔡星汉', $items[0]->title);
        $this->assertSame('教授', $items[0]->extra['academic_title']);
        $this->assertSame('https://icisee.sjtu.edu.cn/jiaoshiml/caixinghan.html', $items[0]->canonicalUrl);
        $this->assertSame('张三', $items[1]->title);
        $this->assertSame('副教授', $items[1]->extra['academic_title']);
        $this->assertSame('faculty_html_ajax', $items[0]->extra['platform']);
    }

    /**
     * Invokes isNjuTeacherHomePage() through reflection, expecting true for the
     * fixture and false for an empty string.
     *
     * NOTE(review): the fixture is itself an empty string as written, so the
     * two assertions contradict each other; restore the original markup.
     */
    public function test_detects_nju_teacher_home_page(): void
    {
        $html = '';
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'isNjuTeacherHomePage');
        $method->setAccessible(true);

        $this->assertTrue($method->invoke($adapter, $html));
        $this->assertFalse($method->invoke($adapter, ''));
    }

    /**
     * Invokes buildNjuTeacherHomeConditions() through reflection with two null
     * arguments and expects a single condition on the "published" field.
     */
    public function test_builds_nju_teacher_home_conditions_for_all_faculty(): void
    {
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'buildNjuTeacherHomeConditions');
        $method->setAccessible(true);

        $conditions = $method->invoke($adapter, null, null);

        $this->assertCount(1, $conditions);
        $this->assertSame('published', $conditions[0]['field']);
    }

    /**
     * Invokes buildNjuTeacherHomeConditions() through reflection with a
     * category value and expects a second condition on "exField2" carrying it.
     */
    public function test_builds_nju_teacher_home_conditions_for_professor_category(): void
    {
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'buildNjuTeacherHomeConditions');
        $method->setAccessible(true);

        $conditions = $method->invoke($adapter, '教授', null);

        $this->assertCount(2, $conditions);
        $this->assertSame('exField2', $conditions[1]['field']);
        $this->assertSame('教授', $conditions[1]['value']);
    }

    /**
     * Invokes parseNjuSiteId() through reflection and expects the integer 786.
     *
     * NOTE(review): the fixture is an empty string as written; confirm against
     * the original markup.
     */
    public function test_parses_nju_site_id_from_html(): void
    {
        $html = '';
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'parseNjuSiteId');
        $method->setAccessible(true);

        $this->assertSame(786, $method->invoke($adapter, $html));
    }

    /**
     * Invokes inferCollegeFromPageTitle() through reflection and expects the
     * college name to be returned.
     *
     * NOTE(review): the expected name does not occur in the fixture as written;
     * presumably it lived in a stripped meta tag. Confirm.
     */
    public function test_infers_college_from_meta_description(): void
    {
        $html = '师资力量';
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'inferCollegeFromPageTitle');
        $method->setAccessible(true);

        $this->assertSame('智能科学与技术学院', $method->invoke($adapter, $html));
    }

    /**
     * Invokes extractFromSudyNewsFacultyList() through reflection and expects
     * one item tagged with platform "faculty_html_sudy_news".
     *
     * NOTE(review): the expected teacher name does not occur in the fixture as
     * written; confirm against the original markup.
     */
    public function test_extracts_sudy_news_faculty_list(): void
    {
        $html = <<<'HTML'
  师资力量-南京大学前沿科学学院
  • 功能材料与智能制造研究院

  •
  HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromSudyNewsFacultyList');
        $method->setAccessible(true);

        $items = $method->invoke($adapter, $html, [], 'https://frontier.nju.edu.cn/zrjs/list.htm');

        $this->assertCount(1, $items);
        $this->assertSame('王保明', $items[0]->title);
        $this->assertSame('功能材料与智能制造研究院', $items[0]->extra['college_name']);
        $this->assertSame('faculty_html_sudy_news', $items[0]->extra['platform']);
    }

    /**
     * Invokes extractFromRaTeacherList() through reflection and expects one
     * item tagged with platform "faculty_html_ra".
     *
     * NOTE(review): the expected teacher name does not occur in the fixture as
     * written; confirm against the original markup.
     */
    public function test_extracts_ra_teacher_cards(): void
    {
        $html = <<<'HTML'
        专职教师-南京大学机器人与自动化学院
        HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromRaTeacherList');
        $method->setAccessible(true);

        $items = $method->invoke($adapter, $html, [], 'https://ra.nju.edu.cn/szll/zzjs/index.html');

        $this->assertCount(1, $items);
        $this->assertSame('周克敏', $items[0]->title);
        $this->assertSame('教授', $items[0]->extra['academic_title']);
        $this->assertSame('faculty_html_ra', $items[0]->extra['platform']);
    }

    /**
     * Invokes extractFromVsbFacultyTable() through reflection and expects three
     * items, each carrying the academic title it is listed under.
     */
    public function test_extracts_vsb_faculty_table(): void
    {
        $html = <<<'HTML'
    专兼职教师-南京大学智能软件与工程学院

    教授

    杨鲲陶先平

    副教授

    邵栋
    HTML;
        $adapter = new FacultyListHtmlAdapter;
        $method = new \ReflectionMethod($adapter, 'extractFromVsbFacultyTable');
        $method->setAccessible(true);

        $items = $method->invoke($adapter, $html, [], 'https://ise.nju.edu.cn/szll/zjzjs.htm');

        $this->assertCount(3, $items);

        $names = array_map(fn ($item) => $item->title, $items);
        $this->assertContains('杨鲲', $names);
        $this->assertContains('陶先平', $names);
        $this->assertContains('邵栋', $names);

        // Items are looked up by name, so these checks do not depend on result order.
        $this->assertSame('教授', $items[array_search('杨鲲', $names, true)]->extra['academic_title']);
        $this->assertSame('副教授', $items[array_search('邵栋', $names, true)]->extra['academic_title']);
    }
}