|
|
|
|
@ -31,6 +31,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return $this->enrichEmailsFromProfilePages($items, $params);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($this->isNjuTeacherHomePage($firstHtml)) {
|
|
|
|
|
$items = $this->fetchNjuTeacherHomeItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages);
|
|
|
|
|
|
|
|
|
|
return $this->enrichEmailsFromProfilePages($items, $params);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$totalPages = $this->detectTotalPages($firstHtml);
|
|
|
|
|
$pagesToFetch = min($maxPages, $totalPages);
|
|
|
|
|
|
|
|
|
|
@ -485,6 +491,21 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
*/
|
|
|
|
|
protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
|
|
|
|
|
{
|
|
|
|
|
$items = $this->extractFromSudyNewsFacultyList($html, $keywords, $sourceUrl);
|
|
|
|
|
if ($items !== []) {
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$items = $this->extractFromRaTeacherList($html, $keywords, $sourceUrl);
|
|
|
|
|
if ($items !== []) {
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$items = $this->extractFromVsbFacultyTable($html, $keywords, $sourceUrl);
|
|
|
|
|
if ($items !== []) {
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$items = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);
|
|
|
|
|
if ($items !== []) {
|
|
|
|
|
return $items;
|
|
|
|
|
@ -498,6 +519,363 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * NJU Sudy CMS listing pages: faculty links rendered as news_title / news_title1
 * entries inside ul.news_list (used by the frontier, ic and similar sites).
 *
 * The page is split on `<li class="wp_sublist` markers. When no such marker exists
 * the whole page is scanned as a single chunk; otherwise the text before the first
 * marker is discarded and each following chunk is scanned with its own department
 * name (taken from the chunk's `subcolumn-name` span, falling back to the
 * page-level college).
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromSudyNewsFacultyList(string $html, array $keywords, string $sourceUrl): array
{
    // Cheap marker check: pages without a news_list are not handled by this extractor.
    if (preg_match('/class="news_list/u', $html) !== 1) {
        return [];
    }

    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $college = $this->inferCollegeFromPageTitle($html);
    if ($college === null && preg_match('#<li class="col_title"><h2>([^<]+)</h2>#u', $html, $headingMatch)) {
        // Fall back to the column heading when the page title yields no college.
        $college = CrawlAuthorParser::cleanText($headingMatch[1]);
    }

    // Shared across all chunks so the same profile is emitted only once per page.
    $seen = [];
    $sections = preg_split('#<li class="wp_sublist#u', $html) ?: [];

    if (count($sections) <= 1) {
        return $this->extractSudyNewsLinksFromChunk(
            $html,
            $college,
            $keywords,
            $sourceUrl,
            $university,
            $seen,
        );
    }

    $collected = [];
    // Index 0 is the markup before the first sub-list and is intentionally ignored.
    foreach (array_slice($sections, 1) as $section) {
        $sectionDepartment = preg_match('#subcolumn-name">([^<]+)</span>#u', $section, $deptMatch)
            ? CrawlAuthorParser::cleanText($deptMatch[1])
            : $college;

        $sectionItems = $this->extractSudyNewsLinksFromChunk(
            $section,
            $sectionDepartment,
            $keywords,
            $sourceUrl,
            $university,
            $seen,
        );
        $collected = array_merge($collected, $sectionItems);
    }

    return $collected;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Scans one HTML chunk for `news_title` / `news_title1` anchors and turns each
 * qualifying anchor into a faculty item.
 *
 * An anchor is skipped when its text does not look like a person name, when it has
 * no href, when the href does not look like a teacher profile URL, when its dedupe
 * key is already present in $seen, or when "name + department" does not match the
 * keywords. Accepted keys are recorded in $seen (passed by reference) so callers can
 * share de-duplication state across chunks.
 *
 * @param array<string, true> $seen
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractSudyNewsLinksFromChunk(
    string $chunk,
    ?string $department,
    array $keywords,
    string $sourceUrl,
    ?string $pageUniversity,
    array &$seen,
): array {
    $found = preg_match_all(
        '#<(?:div|span)\s+class="news_title1?">\s*<a\b([^>]*?)>([^<]+)</a>#su',
        $chunk,
        $anchors,
        PREG_SET_ORDER,
    );
    if (! $found) {
        return [];
    }

    $result = [];
    foreach ($anchors as [, $anchorAttrs, $anchorText]) {
        $name = CrawlAuthorParser::cleanText($anchorText) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', (string) $anchorAttrs, $hrefMatch)) {
            continue;
        }

        $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! $this->looksLikeTeacherProfileUrl($href, null)) {
            continue;
        }

        $profileUrl = $this->resolveUrl($href, $sourceUrl);
        // Prefer the resolved URL as identity; fall back to a hash of the name.
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }

        $searchable = trim($name.' '.($department ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        $seen[$dedupeKey] = true;
        $result[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $department,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
            summary: $department ? '单位:'.$department : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_sudy_news',
            bio: null,
        );
    }

    return $result;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Boshan CMS pages (e.g. the NJU School of Robotics): `ul.teacher` card lists where
 * each card is an anchor wrapping a `div.xm` that holds the teacher's name.
 *
 * For every card the method reads the href from the anchor attributes, the name
 * from `div.xm`, and, from the markup after the name inside the same anchor, the
 * optional academic title ("职称:<span>…</span>") and research field
 * ("研究方向:<span>…</span>"). Cards are de-duplicated by resolved profile URL (or a
 * hash of the name) and filtered by keywords.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromRaTeacherList(string $html, array $keywords, string $sourceUrl): array
{
    if (! preg_match('/<ul class="teacher">/u', $html)) {
        return [];
    }

    // The tempered token `(?:(?!</a>).)*?` keeps the span between the opening <a …>
    // and div.xm inside ONE anchor. A plain `.*?` would let the leftmost match start
    // at an unrelated earlier anchor (e.g. a navigation link) and run across its
    // closing </a>, attaching that anchor's href to the first teacher card.
    if (! preg_match_all(
        '#<a\b([^>]*?)>(?:(?!</a>).)*?<div class="xm">([^<]+)</div>(.*?)</a>#su',
        $html,
        $matches,
        PREG_SET_ORDER,
    )) {
        return [];
    }

    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
    $defaultCollege = $this->inferCollegeFromPageTitle($html);
    $items = [];
    $seen = [];

    foreach ($matches as $match) {
        $attrs = (string) $match[1];
        $name = CrawlAuthorParser::cleanText($match[2]) ?? '';
        // Markup following the name inside the same anchor (title / research field).
        $tail = (string) $match[3];
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        // Accept both quote styles, consistent with the Sudy news-list extractor.
        if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) {
            continue;
        }

        $profileUrl = $this->resolveUrl(
            html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'),
            $sourceUrl,
        );
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }

        $academicTitle = null;
        if (preg_match('#职称:\s*<span>([^<]+)</span>#u', $tail, $titleMatch)) {
            $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]);
        }

        $researchField = null;
        if (preg_match('#研究方向:\s*<span>([^<]+)</span>#u', $tail, $fieldMatch)) {
            $researchField = CrawlAuthorParser::cleanText($fieldMatch[1]);
        }

        // Keyword matching runs over name, research field, title and college together.
        $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
        if (! $this->matchesKeywords($plain, $keywords)) {
            continue;
        }

        $summaryParts = array_filter([
            $defaultCollege ? '单位:'.$defaultCollege : null,
            $academicTitle ? '职称:'.$academicTitle : null,
            $researchField ? '研究方向:'.$researchField : null,
        ]);

        $seen[$dedupeKey] = true;
        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $defaultCollege,
            universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
            summary: $summaryParts !== [] ? implode(';', $summaryParts) : null,
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_ra',
            bio: $researchField,
        );
    }

    return $items;
}
|
|
|
|
|
|
|
|
|
|
/**
 * NJU / Tsinghua WebPlus (VSB) faculty table pages (e.g. the ise "zjzjs" column).
 *
 * The content scope is the first of: `div.zjzjs`, a `div#vsb_content…` block, or a
 * `ul.teach-list…` list; an empty scope yields no items. `<strong>` elements inside
 * the scope are treated as section headings, and each candidate is assigned the
 * last heading that starts at or before its byte offset. Candidates are collected
 * in two passes: anchors whose href looks like a teacher profile URL, then `<td>`
 * cells that contain no anchor (name only, no profile URL).
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromVsbFacultyTable(string $html, array $keywords, string $sourceUrl): array
{
    // Tried in order; the first pattern that matches defines the scope.
    $scopePatterns = [
        '#<div class="zjzjs">(.*?)</div>#su',
        '#<div id="vsb_content[^"]*">(.*?)</div>\s*</div>\s*</div>#su',
        '#<ul class="teach-list[^"]*">(.*?)</ul>#su',
    ];

    $scope = null;
    foreach ($scopePatterns as $scopePattern) {
        if (preg_match($scopePattern, $html, $scopeMatch)) {
            $scope = (string) $scopeMatch[1];
            break;
        }
    }

    if ($scope === null || trim(strip_tags($scope)) === '') {
        return [];
    }

    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $college = $this->inferCollegeFromPageTitle($html);
    $items = [];
    $seen = [];

    // Section headings with the byte offset of their inner content, in document order.
    $sections = [];
    if (preg_match_all('#<strong[^>]*>(.*?)</strong>#su', $scope, $strongMatches, PREG_OFFSET_CAPTURE)) {
        foreach ($strongMatches[1] as [$strongHtml, $strongOffset]) {
            $heading = CrawlAuthorParser::cleanText(strip_tags($strongHtml));
            if ($heading === null || $heading === '') {
                continue;
            }

            $sections[] = [
                'offset' => $strongOffset,
                'title' => $heading,
            ];
        }
    }

    // Offsets are ascending, so walking backwards finds the nearest preceding heading.
    $sectionTitleAt = static function (int $offset) use ($sections): ?string {
        for ($index = count($sections) - 1; $index >= 0; $index--) {
            if ($sections[$index]['offset'] <= $offset) {
                return $sections[$index]['title'];
            }
        }

        return null;
    };

    // Validates, de-duplicates and keyword-filters one candidate before storing it.
    $register = function (
        string $name,
        ?string $profileUrl,
        ?string $sectionTitle,
    ) use (
        $keywords,
        $college,
        $university,
        &$items,
        &$seen,
    ): void {
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            return;
        }

        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            return;
        }

        $academicTitle = $this->inferAcademicTitleFromSection($sectionTitle);
        $searchable = trim($name.' '.($academicTitle ?? '').' '.($college ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            return;
        }

        $seen[$dedupeKey] = true;
        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $college,
            universityName: $university ?? CrawlAuthorParser::universityFromAffiliation($college),
            summary: $college ? '单位:'.$college : null,
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_vsb',
            bio: null,
        );
    };

    // Pass 1: anchors pointing at profile pages.
    $flags = PREG_OFFSET_CAPTURE | PREG_SET_ORDER;
    if (preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $scope, $anchors, $flags)) {
        foreach ($anchors as $anchor) {
            $anchorAttrs = (string) $anchor[1][0];
            $anchorOffset = (int) $anchor[0][1];
            $anchorName = CrawlAuthorParser::cleanText(strip_tags($anchor[2][0])) ?? '';

            if (! preg_match('#\bhref="([^"]+)"#u', $anchorAttrs, $hrefMatch)) {
                continue;
            }

            $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (! $this->looksLikeTeacherProfileUrl($href, null)) {
                continue;
            }

            $register($anchorName, $this->resolveUrl($href, $sourceUrl), $sectionTitleAt($anchorOffset));
        }
    }

    // Pass 2: plain-text table cells (cells that contain an anchor were covered above).
    if (preg_match_all('#<td[^>]*>(.*?)</td>#su', $scope, $cells, $flags)) {
        foreach ($cells as $cell) {
            $cellHtml = (string) $cell[1][0];
            $cellOffset = (int) $cell[0][1];
            if (str_contains($cellHtml, '<a ')) {
                continue;
            }

            $cellName = CrawlAuthorParser::cleanText(strip_tags($cellHtml)) ?? '';
            $register($cellName, null, $sectionTitleAt($cellOffset));
        }
    }

    return $items;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Maps a section heading of a faculty page to a normalised academic title.
 *
 * Returns null for a null/empty heading. Otherwise the heading is tested for known
 * title substrings and the corresponding label is returned; headings containing
 * none of them are returned after CrawlAuthorParser::cleanText().
 *
 * The substrings overlap ("副教授" and "助理教授" both contain "教授"), so the more
 * specific ones are tested before the bare "教授". Testing "教授" first would label
 * an "助理教授" heading as "教授".
 */
protected function inferAcademicTitleFromSection(?string $sectionTitle): ?string
{
    if ($sectionTitle === null || $sectionTitle === '') {
        return null;
    }

    // Insertion order is the matching priority: most specific substrings first.
    $labelsByNeedle = [
        '副教授' => '副教授',
        '助理教授' => '准聘助理教授',
        '教授' => '教授',
        '博士后' => '博士后',
        '专职科研' => '专职科研',
    ];

    foreach ($labelsByNeedle as $needle => $label) {
        if (str_contains($sectionTitle, $needle)) {
            return $label;
        }
    }

    return CrawlAuthorParser::cleanText($sectionTitle);
}
|
|
|
|
|
|
|
|
|
|
protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool
|
|
|
|
|
{
|
|
|
|
|
if (str_contains(strtolower($sourceUrl), 'ajax_teacher_list')) {
|
|
|
|
|
@ -507,6 +885,250 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return str_contains($html, 'ajax_teacher_list.html');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Detects the NJU "teacher home" page type: the HTML must mention `faculty.js`
 * and its <body> tag must carry a class attribute containing the word `faculty`.
 */
protected function isNjuTeacherHomePage(string $html): bool
{
    if (! str_contains($html, 'faculty.js')) {
        return false;
    }

    $bodyHasFacultyClass = preg_match('/<body[^>]*class="[^"]*\bfaculty\b/u', $html);

    return (bool) $bodyHasFacultyClass;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Loads faculty entries for an NJU "teacher home" page through the site's
 * `/_wp3services/generalQuery?queryObj=teacherHome` JSON endpoint.
 *
 * The site id and the currently selected career filters are parsed from the page
 * HTML and sent as query conditions. Pages of 50 rows are requested until one of:
 * $maxPages (clamped to 1..50) is reached, the `pageCount` reported by the first
 * response is reached, a response has no `data` rows, or $maxResults items have
 * been collected.
 *
 * Each row supplies the name (`title`), profile link (`cnUrl`), academic title
 * (`exField2`) and research field (`exField1`).
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException when the site id cannot be parsed or the endpoint fails
 *                           (raised by parseNjuSiteId / requestNjuTeacherHomePage)
 */
protected function fetchNjuTeacherHomeItems(
    string $requestUrl,
    string $pageHtml,
    array $keywords,
    int $maxResults,
    int $maxPages = 1,
): array {
    $siteId = $this->parseNjuSiteId($pageHtml);
    $filters = $this->parseNjuTeacherHomeFilters($pageHtml);
    $conditions = $this->buildNjuTeacherHomeConditions($filters['career'], $filters['sub_career']);
    // Fallback origin is used only when none can be derived from the request URL.
    $origin = $this->requestOrigin($requestUrl) ?? 'https://is.nju.edu.cn';
    $apiUrl = $origin.'/_wp3services/generalQuery?queryObj=teacherHome';
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $maxPages = max(1, min(50, $maxPages));
    $rows = 50;

    $pageUniversity = $this->inferUniversityFromSource($requestUrl, $pageHtml);
    $defaultCollege = $this->inferCollegeFromPageTitle($pageHtml);

    $items = [];
    $seen = [];
    $pageIndex = 1;
    $pageCount = null;

    while ($pageIndex <= $maxPages && count($items) < $maxResults) {
        $body = $this->requestNjuTeacherHomePage($apiUrl, $siteId, $pageIndex, $rows, $conditions, $timeout);
        if ($pageCount === null) {
            // Total page count is taken from the first response only.
            $pageCount = max(1, (int) ($body['pageCount'] ?? 1));
        }

        $data = $body['data'] ?? [];
        if (! is_array($data) || $data === []) {
            break;
        }

        foreach ($data as $art) {
            if (! is_array($art)) {
                continue;
            }

            $name = CrawlAuthorParser::cleanText((string) ($art['title'] ?? '')) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            // A row without a link must not be resolved against the list URL: that
            // could give every link-less teacher the same profile URL (and thus the
            // same dedupe key). Use null so the name-hash key applies instead.
            $rawProfileUrl = trim((string) ($art['cnUrl'] ?? ''));
            $profileUrl = $rawProfileUrl !== ''
                ? $this->resolveUrl($rawProfileUrl, $requestUrl)
                : null;
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }

            $academicTitle = CrawlAuthorParser::cleanText((string) ($art['exField2'] ?? ''));
            $researchField = CrawlAuthorParser::cleanText((string) ($art['exField1'] ?? ''));
            $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? ''));
            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $summaryParts = array_filter([
                $defaultCollege ? '单位:'.$defaultCollege : null,
                $academicTitle ? '职称:'.$academicTitle : null,
                $researchField ? '研究领域:'.$researchField : null,
            ]);

            $seen[$dedupeKey] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $defaultCollege,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
                summary: $summaryParts !== [] ? implode(';', $summaryParts) : null,
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_nju_wp',
                bio: $researchField,
            );

            if (count($items) >= $maxResults) {
                // Result cap reached: leave both the row loop and the page loop.
                break 2;
            }
        }

        if ($pageIndex >= $pageCount) {
            break;
        }

        $pageIndex++;
    }

    return $items;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the numeric site id from the page's `sudy-wp-siteId="…"` attribute.
 *
 * @throws \RuntimeException when the attribute is not present in the HTML
 */
protected function parseNjuSiteId(string $html): int
{
    if (! preg_match('/sudy-wp-siteId="(\d+)"/', $html, $siteIdMatch)) {
        throw new \RuntimeException('无法解析教师列表站点 ID(siteId)');
    }

    return (int) $siteIdMatch[1];
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the currently selected career / sub-career filter labels from the page.
 *
 * For each filter two markup variants are tried in order and the first one that
 * matches supplies the label (cleaned with CrawlAuthorParser::cleanText); a filter
 * with no matching markup is reported as null.
 *
 * @return array{career:?string,sub_career:?string}
 */
protected function parseNjuTeacherHomeFilters(string $html): array
{
    // Returns the cleaned first capture of the first pattern that matches, else null.
    $firstLabel = static function (array $patterns) use ($html): ?string {
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $html, $labelMatch)) {
                return CrawlAuthorParser::cleanText($labelMatch[1]);
            }
        }

        return null;
    };

    $selectedCareer = $firstLabel([
        '#class="col_item_link\s+selected"[^>]*title="([^"]+)"#u',
        '#class="col_item_link\s+selected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
    ]);

    $selectedSubCareer = $firstLabel([
        '#class="sub-item[^"]*\sselected"[^>]*>.*?class="column-name">([^<]+)</span>#su',
        '#class="sub-link[^"]*\sselected"[^>]*title="([^"]+)"#u',
    ]);

    return [
        'career' => $selectedCareer,
        'sub_career' => $selectedSubCareer,
    ];
}
|
|
|
|
|
|
|
|
|
|
/**
 * Builds the `conditions` list for the teacherHome query from the selected filters.
 *
 * The list always starts with `published = 1`. A recognised sub-career adds a
 * single-entry `orConditions` group on `exField2`. A career that maps directly to a
 * title adds a plain `exField2` equality. The two group careers ("准长聘",
 * "专职科研及博士后") add an `orConditions` group of their member titles, but only
 * when no sub-career is selected.
 *
 * @return list<array<string, mixed>>
 */
protected function buildNjuTeacherHomeConditions(?string $career, ?string $subCareer): array
{
    $titleEquals = static fn (string $value): array => ['field' => 'exField2', 'value' => $value, 'judge' => '='];

    $conditions = [
        ['field' => 'published', 'value' => '1', 'judge' => '='],
    ];

    $knownSubCareers = ['长聘副教授', '准聘副教授', '准聘助理教授', '专职科研', '博士后'];
    if (in_array($subCareer, $knownSubCareers, true)) {
        $conditions[] = ['orConditions' => [$titleEquals($subCareer)]];
    }

    if ($career === null || $career === '') {
        return $conditions;
    }

    $directCareers = ['教授', '副教授', '兼职教授', '行政管理人员'];
    $noSubCareer = $subCareer === null || $subCareer === '';

    if (in_array($career, $directCareers, true)) {
        $conditions[] = $titleEquals($career);
    } elseif ($noSubCareer && $career === '准长聘') {
        $conditions[] = [
            'orConditions' => [
                $titleEquals('长聘副教授'),
                $titleEquals('准聘副教授'),
                $titleEquals('准聘助理教授'),
            ],
        ];
    } elseif ($noSubCareer && $career === '专职科研及博士后') {
        $conditions[] = [
            'orConditions' => [
                $titleEquals('专职科研'),
                $titleEquals('博士后'),
            ],
        ];
    }

    return $conditions;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Requests one page of the teacherHome query and returns the decoded JSON body.
 *
 * The request is a form-encoded POST; `orders`, `returnInfos` and `conditions` are
 * sent as JSON-encoded strings. Results are ordered by `siteSort` ascending.
 *
 * @param list<array<string, mixed>> $conditions
 * @return array<string, mixed>
 *
 * @throws \RuntimeException when the response status is not successful or the body
 *                           does not decode to an array
 */
protected function requestNjuTeacherHomePage(
    string $apiUrl,
    int $siteId,
    int $pageIndex,
    int $rows,
    array $conditions,
    int $timeout,
): array {
    // Fields the endpoint is asked to return for each teacher row.
    $returnInfos = [
        ['field' => 'headerPic', 'name' => 'headerPic'],
        ['field' => 'exField1', 'name' => 'exField1'],
        ['field' => 'exField2', 'name' => 'exField2'],
        ['field' => 'cnUrl', 'name' => 'cnUrl'],
        ['field' => 'title', 'name' => 'title'],
        ['field' => 'phone', 'name' => 'phone'],
    ];

    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        // The first argument is the TOTAL number of attempts, so 2 means "one retry".
        // With 1 the connection-error predicate below could never trigger.
        // `throw: false` keeps HTTP error statuses as responses for the check below.
        ->retry(2, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json',
        ])
        ->asForm()
        ->post($apiUrl, [
            'siteId' => $siteId,
            'pageIndex' => $pageIndex,
            'rows' => $rows,
            'orders' => json_encode([['field' => 'siteSort', 'type' => 'asc']], JSON_UNESCAPED_UNICODE),
            'returnInfos' => json_encode($returnInfos, JSON_UNESCAPED_UNICODE),
            'conditions' => json_encode($conditions, JSON_UNESCAPED_UNICODE),
            'articleType' => 1,
            'level' => 1,
        ]);

    if (! $response->successful()) {
        throw new \RuntimeException('教师列表接口请求失败(HTTP '.$response->status().')');
    }

    $body = $response->json();
    if (! is_array($body)) {
        throw new \RuntimeException('教师列表接口返回格式异常');
    }

    return $body;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param list<string> $keywords
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
@ -895,6 +1517,22 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('#/c\d+a\d+/page\.htm$#', $path)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('#/(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#', $path)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('#^(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#', $path)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('#/info/\d+/\d+\.htm$#', $path)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($catCode !== null && $catCode !== '') {
|
|
|
|
|
$code = preg_quote(strtolower($catCode), '#');
|
|
|
|
|
|
|
|
|
|
@ -1116,20 +1754,39 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
|
|
|
|
|
protected function inferCollegeFromPageTitle(string $html): ?string
|
|
|
|
|
{
|
|
|
|
|
if (! preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) {
|
|
|
|
|
return null;
|
|
|
|
|
if (preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) {
|
|
|
|
|
$title = CrawlAuthorParser::cleanText($match[1]);
|
|
|
|
|
if ($title !== null && $title !== '') {
|
|
|
|
|
if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) {
|
|
|
|
|
return CrawlAuthorParser::cleanText($college[1]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $title;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$title = CrawlAuthorParser::cleanText($match[1]);
|
|
|
|
|
if ($title === null || $title === '') {
|
|
|
|
|
return null;
|
|
|
|
|
if (preg_match('/<meta\s+name="description"\s+content="([^"]+)"/u', $html, $match)) {
|
|
|
|
|
$desc = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
|
|
|
|
|
if ($desc !== null && $desc !== '') {
|
|
|
|
|
if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) {
|
|
|
|
|
return CrawlAuthorParser::cleanText($college[1]);
|
|
|
|
|
}
|
|
|
|
|
if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) {
|
|
|
|
|
return CrawlAuthorParser::cleanText($college[1]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) {
|
|
|
|
|
return CrawlAuthorParser::cleanText($college[1]);
|
|
|
|
|
if (preg_match('/<meta\s+name=[\'"]SiteName[\'"]\s+content=[\'"]([^\'"]+)[\'"]/u', $html, $match)) {
|
|
|
|
|
$siteName = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
|
|
|
|
|
if ($siteName !== null && $siteName !== '') {
|
|
|
|
|
if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $siteName, $college)) {
|
|
|
|
|
return CrawlAuthorParser::cleanText($college[1]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $title;
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
|
|
|
|
|
@ -1446,6 +2103,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
if (str_contains($host, 'fudan.edu.cn')) {
|
|
|
|
|
return '复旦大学';
|
|
|
|
|
}
|
|
|
|
|
if (str_contains($host, 'nju.edu.cn')) {
|
|
|
|
|
return '南京大学';
|
|
|
|
|
}
|
|
|
|
|
if (str_contains($host, 'tsinghua.edu.cn')) {
|
|
|
|
|
return '清华大学';
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match)) {
|
|
|
|
|
|