You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

477 lines
17 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl;
/**
 * Parses publication dates and author metadata out of arXiv search-result
 * blocks, abs pages and HTML full-text pages.
 *
 * Everything here is regex-based, best-effort extraction: methods return null
 * (or an empty list) when nothing recognisable is found instead of throwing.
 */
class ArxivMetadataParser
{
    /**
     * Lower-cased English month names and abbreviations mapped to two-digit
     * month numbers ("may" serves as both the full name and the abbreviation).
     *
     * @var array<string, string>
     */
    private const MONTHS = [
        'january' => '01', 'february' => '02', 'march' => '03', 'april' => '04',
        'may' => '05', 'june' => '06', 'july' => '07', 'august' => '08',
        'september' => '09', 'october' => '10', 'november' => '11', 'december' => '12',
        'jan' => '01', 'feb' => '02', 'mar' => '03', 'apr' => '04',
        'jun' => '06', 'jul' => '07', 'aug' => '08', 'sep' => '09', 'sept' => '09',
        'oct' => '10', 'nov' => '11', 'dec' => '12',
    ];

    /**
     * "Generated on Tue Mar 12 10:11:12 2024" footer.
     * Captures: 1 = month name, 2 = day, 3 = year (month comes BEFORE day here).
     */
    private const GENERATED_ON_PATTERN =
        '/Generated on\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+([A-Za-z]+)\s+(\d{1,2})\s+\d{1,2}:\d{2}:\d{2}\s+(\d{4})/i';

    /**
     * Extracts a publication date from raw HTML or plain text.
     *
     * Sources are tried from most to least structured:
     *  1. citation_date / citation_online_date meta tags,
     *  2. the submission-history block,
     *  3. the dateline block,
     *  4. a "Generated on ..." footer in the raw markup,
     *  5. the first calendar-valid ISO date (YYYY-MM-DD) in the tag-stripped text,
     *  6. a list of free-text patterns ("Submitted on 5 Jan 2024", "[v1] Mon, 5 Jan 2024", ...).
     *
     * Candidates that are not real calendar dates are skipped.
     *
     * @param string|null $text Raw HTML or text; null/blank yields null.
     *
     * @return string|null Date formatted as YYYY-MM-DD, or null when nothing matched.
     */
    public static function parsePublishedDate(?string $text): ?string
    {
        if ($text === null || trim($text) === '') {
            return null;
        }

        $raw = $text;

        $date = self::parseCitationMetaDate($raw)
            ?? self::parseSubmissionHistoryDate($raw)
            ?? self::parseDatelineDate($raw)
            ?? self::parseGeneratedOnDate($raw);
        if ($date !== null) {
            return $date;
        }

        $plain = html_entity_decode(strip_tags($raw), ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // The /u replace returns null on invalid UTF-8; fall back to a byte-wise
        // replace so malformed input is still searched instead of being wiped.
        $plain = preg_replace('/\s+/u', ' ', $plain)
            ?? preg_replace('/\s+/', ' ', $plain)
            ?? $plain;

        // This also covers ISO timestamps ("2024-01-05T10:00"): the date part
        // is all that is returned, so no separate timestamp pattern is needed.
        $date = self::firstValidIsoDate($plain);
        if ($date !== null) {
            return $date;
        }

        // The tag-stripped text may expose a footer that was split by markup in $raw.
        $date = self::parseGeneratedOnDate($plain);
        if ($date !== null) {
            return $date;
        }

        // All remaining patterns capture: 1 = day, 2 = month name, 3 = year.
        $dayFirstPatterns = [
            '/\[v\d+\]\s*(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i',
            '/\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]/i',
            '/Submitted\s+on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i',
            '/Submitted\s+(\d{1,2})\s+([A-Za-z]+),\s+(\d{4})/i',
            '/(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\d{1,2}:\d{2}:\d{2}\s+UTC/i',
            '/(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\d{1,2}:\d{2}:\d{2}\s+UTC/i',
        ];
        foreach ($dayFirstPatterns as $pattern) {
            if (! preg_match($pattern, $plain, $m)) {
                continue;
            }
            $date = self::toYmd($m[3], $m[2], $m[1]);
            if ($date !== null) {
                return $date;
            }
        }

        return null;
    }

    /**
     * Infers a date from a new-style arXiv identifier (YYMM.NNNNN).
     *
     * The identifier encodes the submission year and month only, so this is a
     * last-resort fallback that returns the first day of that month. A trailing
     * version suffix ("v2") is ignored. Identifiers that do not match the
     * YYMM.NNNNN shape, or whose year is before 2007, yield null.
     *
     * @param string|null $arxivId Identifier such as "2401.12345" or "2401.12345v2".
     *
     * @return string|null YYYY-MM-01, or null when the id cannot be interpreted.
     */
    public static function parsePublishedDateFromArxivId(?string $arxivId): ?string
    {
        $id = trim((string) $arxivId);
        if ($id === '') {
            return null;
        }

        $id = preg_replace('/v\d+$/i', '', $id) ?? $id;
        if (! preg_match('/^(\d{2})(\d{2})\.\d+(?:v\d+)?$/i', $id, $m)) {
            return null;
        }

        $year = 2000 + (int) $m[1];
        $month = (int) $m[2];
        if ($month < 1 || $month > 12 || $year < 2007 || $year > 2100) {
            return null;
        }

        return sprintf('%04d-%02d-01', $year, $month);
    }

    /**
     * Reads the date from a citation_date meta tag, falling back to
     * citation_online_date when the former is missing or unparseable.
     *
     * Both attribute orders (name before content, content before name) are accepted.
     *
     * @return string|null YYYY-MM-DD, or null when neither tag yields a date.
     */
    protected static function parseCitationMetaDate(string $html): ?string
    {
        foreach (['citation_date', 'citation_online_date'] as $metaName) {
            $nameFirst = '#<meta[^>]+name=["\']'.$metaName.'["\'][^>]+content=["\']([^"\']+)["\']#i';
            $contentFirst = '#<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']'.$metaName.'["\']#i';

            if (! preg_match($nameFirst, $html, $m) && ! preg_match($contentFirst, $html, $m)) {
                continue;
            }

            $date = self::normalizeLooseDate($m[1]);
            if ($date !== null) {
                return $date;
            }
        }

        return null;
    }

    /**
     * Reads the date from a `<div class="dateline">[Submitted on D Mon YYYY]</div>` block.
     *
     * @return string|null YYYY-MM-DD, or null when the block is absent or the date is invalid.
     */
    protected static function parseDatelineDate(string $html): ?string
    {
        $pattern = '#<div class=["\']dateline["\']>\s*\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]\s*</div>#i';
        if (! preg_match($pattern, $html, $m)) {
            return null;
        }

        return self::toYmd($m[3], $m[2], $m[1]);
    }

    /**
     * Reads the first-version date from a `<div class="submission-history">` block.
     *
     * Prefers the "[v1] Mon, D Mon YYYY" entry and falls back to a
     * "[Submitted on D Mon YYYY]" entry inside the same block.
     *
     * @return string|null YYYY-MM-DD, or null when the block or a usable date is missing.
     */
    protected static function parseSubmissionHistoryDate(string $html): ?string
    {
        if (! preg_match('#<div class=["\']submission-history["\']>(.*?)</div>#is', $html, $block)) {
            return null;
        }
        $section = $block[1];

        $candidates = [
            '/\[v1\][^<]*(?:<br\s*\/?>)?\s*(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i',
            '/\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]/i',
        ];
        foreach ($candidates as $pattern) {
            if (! preg_match($pattern, $section, $m)) {
                continue;
            }
            $date = self::toYmd($m[3], $m[2], $m[1]);
            if ($date !== null) {
                return $date;
            }
        }

        return null;
    }

    /**
     * Normalises a loosely formatted date value (typically a meta tag's content).
     *
     * Accepts "YYYY/M/D" and "YYYY-M-D" exactly, an ISO date embedded in a
     * longer string, and finally anything parsePublishedDate() understands.
     *
     * @return string|null YYYY-MM-DD, or null when the value is empty or not a real date.
     */
    protected static function normalizeLooseDate(string $value): ?string
    {
        $value = trim(html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        if ($value === '') {
            return null;
        }

        if (preg_match('#^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$#', $value, $m)) {
            $year = (int) $m[1];
            $month = (int) $m[2];
            $day = (int) $m[3];
            if (checkdate($month, $day, $year)) {
                return sprintf('%04d-%02d-%02d', $year, $month, $day);
            }
        }

        return self::firstValidIsoDate($value) ?? self::parsePublishedDate($value);
    }

    /**
     * Extracts authors from an arXiv abs page.
     *
     * Names come from the first element whose class contains "authors",
     * trying in order: mailto links, author-search links, "link-author" links,
     * descriptor spans, and finally any anchor whose text does not mention
     * ORCID. When no such element exists, every citation_author meta tag is
     * used instead. E-mails and affiliations found on the page are then
     * attached to the rows by position, which is a heuristic: the n-th e-mail
     * or affiliation is assumed to belong to the n-th author.
     *
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    public static function parseAuthorsFromAbsHtml(string $html): array
    {
        if (! preg_match('#<div[^>]*class="[^"]*authors[^"]*"[^>]*>(.*?)</div>#is', $html, $authorsDiv)) {
            // citation_author is emitted once per author, so collect all of them.
            if (preg_match_all('#<meta[^>]*name="citation_author"[^>]*content="([^"]+)"#i', $html, $meta)) {
                return self::rowsFromNames($meta[1], $html);
            }

            return [];
        }

        $block = $authorsDiv[1];
        if ($block === '') {
            return [];
        }

        $rows = [];
        if (preg_match_all('#<a[^>]*href="mailto:([^"]+)"[^>]*>([^<]*)</a>#i', $block, $mailto, PREG_SET_ORDER)) {
            foreach ($mailto as $link) {
                $rows[] = [
                    // Fall back to the address itself when the link has no usable text.
                    'name' => CrawlAuthorParser::cleanText($link[2]) ?: CrawlAuthorParser::cleanText($link[1]),
                    'email' => CrawlAuthorParser::normalizeEmail($link[1]),
                    'affiliation' => null,
                    'university_name' => null,
                ];
            }
        }
        if ($rows === [] && preg_match_all(
            '#<a[^>]*href="[^"]*searchtype=author[^"]*"[^>]*>([^<]+)</a>#i',
            $block,
            $links
        )) {
            $rows = self::rowsFromNames($links[1], $html);
        }
        if ($rows === [] && preg_match_all('#<a[^>]*class="[^"]*link-author[^"]*"[^>]*>([^<]+)</a>#i', $block, $links)) {
            $rows = self::rowsFromNames($links[1], $html);
        }
        if ($rows === [] && preg_match_all('#<span class="descriptor">([^<]*)</span>#', $block, $descriptors)) {
            // Labels such as "Authors:" are discarded by rowsFromNames().
            $rows = self::rowsFromNames($descriptors[1], $html);
        }
        if ($rows === [] && preg_match_all('#<a[^>]*>([^<]+)</a>#', $block, $links)) {
            $candidates = [];
            foreach ($links[1] as $linkText) {
                $linkText = trim($linkText);
                if ($linkText !== '' && ! str_contains(strtolower($linkText), 'orcid')) {
                    $candidates[] = $linkText;
                }
            }
            $rows = self::rowsFromNames($candidates, $html);
        }

        if (preg_match_all('#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#', $block, $emails)) {
            foreach ($emails[1] as $i => $email) {
                // Never overwrite an address taken from a mailto link: the same
                // address usually appears in both href and link text, so the
                // positional index drifts and would assign it to the wrong author.
                if (isset($rows[$i]) && ($rows[$i]['email'] ?? '') === '') {
                    $rows[$i]['email'] = CrawlAuthorParser::normalizeEmail($email);
                }
            }
        }

        $affiliations = [];
        if (preg_match_all('#<span class="affiliation">([^<]+)</span>#', $html, $affs)) {
            $affiliations = array_map(
                static fn ($a) => CrawlAuthorParser::cleanText(html_entity_decode($a, ENT_QUOTES | ENT_HTML5, 'UTF-8')),
                $affs[1],
            );
        }
        if ($affiliations === [] && preg_match_all(
            '/((?:[\x{4e00}-\x{9fff}A-Za-z\s,&.-]{2,60}(?:大学|学院|研究院|研究所|University|College|Institute|School)[^.;]{0,80}))/u',
            $block,
            $inlineAff
        )) {
            $affiliations = array_map(static fn ($a) => CrawlAuthorParser::cleanText($a), $inlineAff[1]);
        }
        foreach ($affiliations as $i => $aff) {
            if (! isset($rows[$i])) {
                continue;
            }
            $rows[$i]['affiliation'] = $aff;
            $rows[$i]['university_name'] = CrawlAuthorParser::universityFromAffiliation($aff);
        }

        // Last resort for the first author: any institution-looking text on the page.
        // NOTE(review): this scans the whole page markup, so site chrome that
        // mentions a university may be picked up - confirm this is acceptable.
        if ($rows !== [] && ($rows[0]['university_name'] ?? null) === null) {
            $uni = self::firstUniversityInText($block.' '.$html);
            if ($uni !== null) {
                $rows[0]['affiliation'] = $rows[0]['affiliation'] ?? $uni;
                $rows[0]['university_name'] = CrawlAuthorParser::universityFromAffiliation($uni);
            }
        }

        return array_values(array_filter($rows, static fn ($r) => ($r['name'] ?? '') !== ''));
    }

    /**
     * Extracts authors from an arXiv HTML full-text page (ltx_* class names).
     *
     * Tries the ltx_authors / ltx_personname block first, then individual
     * spans whose class contains "ltx_author", and finally falls back to a
     * single nameless row carrying the first e-mail address on the page.
     *
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    public static function parseAuthorsFromHtmlVersion(string $html): array
    {
        $rows = self::parseLtxPersonnameBlock($html);
        if ($rows !== []) {
            return $rows;
        }

        if (preg_match_all(
            '#<span[^>]*class="[^"]*ltx_author[^"]*"[^>]*>(.*?)</span>#is',
            $html,
            $blocks
        )) {
            // Each chunk ends just before the first closing </span>, so it can
            // only contain the opening part of a nested ltx_personname span.
            foreach ($blocks[1] as $chunk) {
                if (! preg_match('#<span[^>]*class="[^"]*ltx_personname[^"]*"[^>]*>([^<]+)#i', $chunk, $n)) {
                    continue;
                }
                $name = CrawlAuthorParser::cleanText($n[1]) ?? '';
                if ($name === '') {
                    continue;
                }

                $email = null;
                if (preg_match('#mailto:([^"\'>\s]+)#i', $chunk, $em)) {
                    $email = CrawlAuthorParser::normalizeEmail($em[1]);
                }

                $aff = null;
                if (preg_match('#<span[^>]*class="[^"]*ltx_author_affiliation[^"]*"[^>]*>([^<]+)#i', $chunk, $af)) {
                    $aff = CrawlAuthorParser::cleanText($af[1]);
                }

                $rows[] = [
                    'name' => $name,
                    'email' => $email,
                    'affiliation' => $aff,
                    'university_name' => CrawlAuthorParser::universityFromAffiliation($aff),
                ];
            }
        }

        if ($rows === [] && preg_match('#[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}#', $html, $firstEmail)) {
            $uni = self::firstUniversityInText($html);
            $rows[] = [
                'name' => '',
                'email' => CrawlAuthorParser::normalizeEmail($firstEmail[0]),
                'affiliation' => $uni,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($uni),
            ];
        }

        return array_values(array_filter(
            $rows,
            static fn ($r) => ($r['name'] ?? '') !== '' || ($r['email'] ?? '') !== ''
        ));
    }

    /**
     * Returns the PDF URL for a paper.
     *
     * Uses the first arxiv.org/pdf/... link found in the markup; otherwise
     * builds the URL from the identifier with any version suffix removed.
     *
     * @param string $htmlOrBlock Markup that may contain a PDF link.
     * @param string $arxivId     Identifier used when no link is present.
     *
     * @return string|null Absolute URL, or null when there is no link and the id is blank.
     */
    public static function extractPdfUrl(string $htmlOrBlock, string $arxivId): ?string
    {
        // Stop at quotes and angle brackets too, so single-quoted attributes and
        // bare text links do not drag trailing markup into the URL.
        if (preg_match('#arxiv\.org/pdf/([^"\'<>?\s]+)#i', $htmlOrBlock, $m)) {
            return 'https://arxiv.org/pdf/'.$m[1];
        }

        $id = trim($arxivId);
        if ($id === '') {
            return null;
        }

        $base = preg_replace('/v\d+$/i', '', $id) ?: $id;

        return 'https://arxiv.org/pdf/'.$base;
    }

    /**
     * Returns the HTML full-text URL for a paper.
     *
     * Uses the first arxiv.org/html/... link found in the markup; otherwise
     * builds the URL from the identifier, keeping an explicit version suffix
     * and defaulting to "v1" when the identifier has none.
     *
     * @param string $htmlOrBlock Markup that may contain an HTML link.
     * @param string $arxivId     Identifier used when no link is present.
     *
     * @return string|null Absolute URL, or null when there is no link and the id is blank.
     */
    public static function extractHtmlUrl(string $htmlOrBlock, string $arxivId): ?string
    {
        if (preg_match('#arxiv\.org/html/([^"\'<>?\s]+)#i', $htmlOrBlock, $m)) {
            return 'https://arxiv.org/html/'.$m[1];
        }

        $id = trim($arxivId);
        if ($id === '') {
            return null;
        }

        $hasVersion = preg_match('/v\d+$/i', $id) === 1;

        return 'https://arxiv.org/html/'.($hasVersion ? $id : $id.'v1');
    }

    /**
     * Parses the first ltx_personname span that follows a div whose class contains "ltx_authors".
     *
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    protected static function parseLtxPersonnameBlock(string $html): array
    {
        $pattern = '#<div[^>]*class="[^"]*ltx_authors[^"]*"[^>]*>.*?'
            .'<span[^>]*class="[^"]*ltx_personname[^"]*"[^>]*>(.*?)</span>#is';
        if (! preg_match($pattern, $html, $m)) {
            return [];
        }

        return self::parseLtxPersonnameInner($m[1]);
    }

    /**
     * Splits the inner HTML of an ltx_personname span into author rows.
     *
     * The markup before the first <br> holds the name(s); the first later
     * line that looks like an affiliation is applied to every author. Names
     * are separated on em/en/thin spaces or runs of two or more whitespace
     * characters. When that yields a single entry ending in two capitalised
     * words, the text is split in two, but only if the leading part is itself
     * a multi-word name, so "Mary Jane Watson" stays one author.
     *
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    protected static function parseLtxPersonnameInner(string $innerHtml): array
    {
        $lines = preg_split('#<br[^>]*>#i', $innerHtml) ?: [];
        $namesRaw = strip_tags($lines[0] ?? '');
        $namesPart = CrawlAuthorParser::cleanText($namesRaw) ?? '';

        $affiliation = null;
        foreach (array_slice($lines, 1) as $line) {
            $lineText = CrawlAuthorParser::cleanText(strip_tags($line));
            if ($lineText !== null && $lineText !== '' && self::looksLikeAffiliation($lineText)) {
                $affiliation = $lineText;
                break;
            }
        }

        if ($namesPart === '') {
            return [];
        }

        $names = preg_split('/[\x{2003}\x{2002}\x{2009}]|\s{2,}/u', $namesRaw) ?: [];
        $names = array_values(array_filter(array_map(
            static fn ($n) => CrawlAuthorParser::cleanText($n) ?? '',
            $names
        )));

        if (count($names) <= 1 && preg_match('/\s+[A-Z][a-z]+\s+[A-Z][a-z]+\s*$/u', $namesPart)) {
            $split = preg_split('/\s+(?=[A-Z][a-z]+\s+[A-Z][a-z]+\s*$)/u', $namesPart, 2) ?: [];
            // A one-word head means this is most likely a single three-word name.
            if (count($split) === 2 && preg_match('/\s/u', trim($split[0])) === 1) {
                $names = $split;
            }
        }
        if ($names === []) {
            $names = [$namesPart];
        }

        $rows = [];
        foreach ($names as $candidate) {
            $name = CrawlAuthorParser::cleanText($candidate) ?? '';
            if ($name === '') {
                continue;
            }
            $rows[] = [
                'name' => $name,
                'email' => null,
                'affiliation' => $affiliation,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
            ];
        }

        return $rows;
    }

    /**
     * Tells whether a line of text contains an institution keyword (case-insensitive substring match).
     */
    protected static function looksLikeAffiliation(string $text): bool
    {
        return preg_match(
            '/(?:大学|学院|研究院|研究所|University|College|Institute|School|Jerusalem|Laboratory|Lab)/iu',
            $text
        ) === 1;
    }

    /**
     * Builds author rows that carry only a name.
     *
     * Names are entity-decoded and cleaned; empty names and labels ending in
     * a colon (for example "Authors:") are skipped.
     *
     * @param list<string> $names
     * @param string       $fullHtml Currently unused; kept for signature compatibility.
     *
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    protected static function rowsFromNames(array $names, string $fullHtml): array
    {
        $rows = [];
        foreach ($names as $rawName) {
            $decoded = html_entity_decode($rawName, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            // cleanText() is treated as nullable elsewhere in this class; coalesce
            // so a null never reaches str_ends_with().
            $name = CrawlAuthorParser::cleanText($decoded) ?? '';
            if ($name === '' || str_ends_with($name, ':')) {
                continue;
            }
            $rows[] = [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ];
        }

        return $rows;
    }

    /**
     * Finds the first institution-looking fragment in a text.
     *
     * Matches either text ending in a Chinese institution suffix, or text
     * starting at "University", "College" or "Institute" plus up to 60
     * following word characters, whitespace, commas, dots or hyphens.
     *
     * @return string|null Cleaned fragment, or null when nothing matched.
     */
    protected static function firstUniversityInText(string $text): ?string
    {
        $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s,&.-]{2,50}(?:大学|学院|研究院|研究所))|(?:University|College|Institute)[\s\w,.-]{0,60})/u';
        if (preg_match($pattern, $text, $m)) {
            return CrawlAuthorParser::cleanText($m[1]);
        }

        return null;
    }

    /**
     * Formats year / English month name / day as YYYY-MM-DD.
     *
     * @param string $year      Four-digit year.
     * @param string $monthName Full or abbreviated English month name; a trailing dot is ignored.
     * @param string $day       Day of month, with or without a leading zero.
     *
     * @return string|null The formatted date, or null when the month name is
     *                     unknown or the combination is not a real calendar date.
     */
    protected static function toYmd(string $year, string $monthName, string $day): ?string
    {
        $month = self::MONTHS[strtolower(rtrim($monthName, '.'))] ?? null;
        if ($month === null) {
            return null;
        }
        if (! checkdate((int) $month, (int) $day, (int) $year)) {
            return null;
        }

        return sprintf('%s-%s-%02d', $year, $month, (int) $day);
    }

    /**
     * Matches the "Generated on <weekday> <month> <day> <time> <year>" footer.
     *
     * @return string|null YYYY-MM-DD, or null when the footer is absent or invalid.
     */
    private static function parseGeneratedOnDate(string $text): ?string
    {
        if (! preg_match(self::GENERATED_ON_PATTERN, $text, $m)) {
            return null;
        }

        return self::toYmd($m[3], $m[1], $m[2]);
    }

    /**
     * Returns the first YYYY-MM-DD occurrence that is a real calendar date.
     *
     * Digit-boundary guards keep the pattern from matching inside longer
     * digit runs, and impossible dates such as 1234-56-78 are skipped.
     *
     * @return string|null The date, or null when no valid ISO date is present.
     */
    private static function firstValidIsoDate(string $text): ?string
    {
        if (! preg_match_all('/(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)/', $text, $matches, PREG_SET_ORDER)) {
            return null;
        }
        foreach ($matches as $iso) {
            if (checkdate((int) $iso[2], (int) $iso[3], (int) $iso[1])) {
                return sprintf('%s-%s-%s', $iso[1], $iso[2], $iso[3]);
            }
        }

        return null;
    }
}