研学线路

master
lion 2 days ago
parent 2e15299690
commit 228413f7f1

@ -139,6 +139,9 @@ class StudyTourDeclarationParser
continue;
}
if ($key === 'implementation_html' && mb_strlen($plain) > 150) {
$score -= 80;
}
$score += 12 + min(mb_strlen($plain), 120) / 20;
}
@ -159,7 +162,7 @@ class StudyTourDeclarationParser
$introText = self::joinSectionLines($sections['intro'] ?? []);
$feeText = self::joinSectionLines($sections['fee'] ?? []);
$implText = self::joinSectionLines($sections['impl'] ?? []);
$implText = self::joinSectionLines(self::filterImplementationLines($sections['impl'] ?? []));
$routePlans = self::parseRouteSection($sections['route'] ?? []);
$courses = self::parseCoursesSection($sections['courses'] ?? [], $routePlans);
@ -815,6 +818,71 @@ class StudyTourDeclarationParser
return (bool) preg_match('/(体验|探秘|制作|参观|采摘|仪式|晚宴|午餐|晚餐|早餐|午休|活动|课程|王国|有趣|小夜灯|缫丝|挂件|面包)/u', $line);
}
/**
 * Collects the leading run of usable implementation-section lines.
 *
 * Empty strings are skipped without ending the scan. The scan stops at the
 * first non-empty line rejected by isValidImplementationLine(); that line and
 * everything after it are discarded, even if later lines would be valid.
 *
 * @param array<int, string> $lines
 * @return array<int, string> accepted lines in input order, re-indexed from 0
 */
private static function filterImplementationLines(array $lines): array
{
    $accepted = [];

    foreach ($lines as $candidate) {
        if ($candidate !== '') {
            // First rejected line terminates the section: return what we have.
            if (! self::isValidImplementationLine($candidate)) {
                return $accepted;
            }
            $accepted[] = $candidate;
        }
    }

    return $accepted;
}
/**
 * Decides whether a single line may be kept as implementation-section text.
 *
 * Checks run in this order:
 *  1. Reject lines flagged by isGarbledLine() or isWordMetadataLine(), lines
 *     containing the U+FFFF character, and lines containing XML/theme markers
 *     (xmlns, http:, ContentTypes, ...; case-insensitive).
 *  2. Accept immediately when the line starts with a "<1-2 digits>月<1-2 digits>日" date.
 *  3. Reject when the line contains none of the keyword characters.
 *  4. Otherwise accept only when the line has at least 4 characters in
 *     U+4E00..U+9FFF and they make up at least 45% of its mb_strlen().
 *
 * @param string $line candidate line
 * @return bool true when the line passes the checks above
 */
private static function isValidImplementationLine(string $line): bool
{
    // Short-circuit evaluation keeps the original check order.
    $rejected = self::isGarbledLine($line)
        || self::isWordMetadataLine($line)
        || preg_match('/[￿]/u', $line)
        || preg_match('/(?:xmlns|http:|xml |ContentTypes|accent[0-9]|folHlink|theme\/|drawingml)/iu', $line);
    if ($rejected) {
        return false;
    }

    // A leading month/day date is accepted without the density checks below.
    if (preg_match('/^\d{1,2}月\d{1,2}日/u', $line) === 1) {
        return true;
    }

    // No keyword character at all (or a PCRE failure): not implementation text.
    if (preg_match('/[月日场活动安排如下夏令营研学实施时间共]/u', $line) !== 1) {
        return false;
    }

    $hanCount = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
    if ($hanCount < 4) {
        return false;
    }

    // max(..., 1) guards the division against a zero-length line.
    $length = max(mb_strlen($line), 1);

    return ($hanCount / $length) >= 0.45;
}
/**
 * Reports whether a line looks like Word document metadata rather than content.
 *
 * A line matches when it is exactly one of the listed font names, or when it
 * contains any of the listed style/container marker substrings. Both checks
 * are case-sensitive.
 *
 * @param string $line candidate line
 * @return bool true when either check matches
 */
private static function isWordMetadataLine(string $line): bool
{
    // Whole-line font names.
    $fontNamePattern = '/^(黑体|宋体|微软雅黑|等线(?: Light)?|Arial|Symbol|Tahoma|Times New Roman|Calibri|Cambria Math|Segoe UI Emoji|SimHei|SimSun|DengXian)$/u';
    if (preg_match($fontNamePattern, $line) === 1) {
        return true;
    }

    // Substrings: style names, OLE stream names and OOXML part names.
    static $markers = [
        '正文', '默认段落字体', '普通表格', '无列表', '页脚', '页眉', '网格型',
        '微软用户', 'WordDocument', 'DocumentSummary', 'SummaryInformation',
        'CompObj', 'ProductBuild', 'KSOProduct', 'Normal.dotm', 'Microsoft Office',
        'xmlns', 'ContentTypes', 'theme/theme', 'themeManager', 'fontTable',
        'Root Entry', 'SMWordDoc', 'Word.Document',
    ];

    foreach ($markers as $marker) {
        if (str_contains($line, $marker)) {
            return true;
        }
    }

    return false;
}
private static function isGarbledLine(string $line): bool
{
if ($line === '') {
@ -829,10 +897,18 @@ class StudyTourDeclarationParser
return false;
}
if (self::isWordMetadataLine($line)) {
return true;
}
if (preg_match('/[\x{0080}-\x{009F}]/u', $line)) {
return true;
}
if (preg_match('/[\x{0200}-\x{024F}\x{0400}-\x{04FF}\x{0500}-\x{052F}\x{0600}-\x{06FF}]/u', $line)) {
return true;
}
if (preg_match('/[ᘀ-᛿]/u', $line)) {
return true;
}
@ -841,17 +917,29 @@ class StudyTourDeclarationParser
return true;
}
if (preg_match('/[￿]/u', $line)) {
return true;
}
if (preg_match('/(?:xmlns|http:|ContentTypes|Normal\.dotm|Word\.Document)/iu', $line)) {
return true;
}
if (preg_match('/[^\x{4e00}-\x{9fff}0-9A-Za-z。、""\'\-\s\/&“”]/u', $line) && mb_strlen($line) <= 8) {
return true;
}
$cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
if ($cjk === 0 && mb_strlen($line) >= 3) {
return true;
}
$len = mb_strlen($line);
if ($len < 6) {
return $cjk === 0;
if ($len <= 5) {
return $cjk < 2;
}
$readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z。、\-\s\/&]/u', $line) ?: 0;
$readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z。、\-\s\/&“”]/u', $line) ?: 0;
return ($readable / max($len, 1)) < 0.45;
}

Loading…
Cancel
Save