From 228413f7f1e558dad23a86f862ba8bfb3d878f01 Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Tue, 30 Jun 2026 15:04:32 +0800 Subject: [PATCH] =?UTF-8?q?=E7=A0=94=E5=AD=A6=E7=BA=BF=E8=B7=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/Support/StudyTourDeclarationParser.php | 96 +++++++++++++++++++++- 1 file changed, 92 insertions(+), 4 deletions(-) diff --git a/app/Support/StudyTourDeclarationParser.php b/app/Support/StudyTourDeclarationParser.php index 7c2bf09..d117a38 100644 --- a/app/Support/StudyTourDeclarationParser.php +++ b/app/Support/StudyTourDeclarationParser.php @@ -139,6 +139,9 @@ class StudyTourDeclarationParser continue; } + if ($key === 'implementation_html' && mb_strlen($plain) > 150) { + $score -= 80; + } $score += 12 + min(mb_strlen($plain), 120) / 20; } @@ -159,7 +162,7 @@ class StudyTourDeclarationParser $introText = self::joinSectionLines($sections['intro'] ?? []); $feeText = self::joinSectionLines($sections['fee'] ?? []); - $implText = self::joinSectionLines($sections['impl'] ?? []); + $implText = self::joinSectionLines(self::filterImplementationLines($sections['impl'] ?? [])); $routePlans = self::parseRouteSection($sections['route'] ?? []); $courses = self::parseCoursesSection($sections['courses'] ?? [], $routePlans); @@ -815,6 +818,71 @@ class StudyTourDeclarationParser return (bool) preg_match('/(体验|探秘|制作|参观|采摘|仪式|晚宴|午餐|晚餐|早餐|午休|活动|课程|王国|有趣|小夜灯|缫丝|挂件|面包)/u', $line); } + /** + * @param array $lines + * @return array + */ + private static function filterImplementationLines(array $lines): array + { + $kept = []; + foreach ($lines as $line) { + if ($line === '') { + continue; + } + if (! self::isValidImplementationLine($line)) { + break; + } + $kept[] = $line; + } + + return $kept; + } + + private static function isValidImplementationLine(string $line): bool + { + if (self::isGarbledLine($line) || self::isWordMetadataLine($line)) { + return false; + } + + if (preg_match('/[￿]/u', $line)) { + return false; + } + + if (preg_match('/(?:xmlns|http:|xml |ContentTypes|accent[0-9]|folHlink|theme\/|drawingml)/iu', $line)) { + return false; + } + + if (preg_match('/^\d{1,2}月\d{1,2}日/u', $line)) { + return true; + } + + if (! preg_match('/[月日场活动安排如下夏令营研学实施时间共]/u', $line)) { + return false; + } + + $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0; + + return $cjk >= 4 && ($cjk / max(mb_strlen($line), 1)) >= 0.45; + } + + private static function isWordMetadataLine(string $line): bool + { + static $needles = [ + '正文', '默认段落字体', '普通表格', '无列表', '页脚', '页眉', '网格型', + '微软用户', 'WordDocument', 'DocumentSummary', 'SummaryInformation', + 'CompObj', 'ProductBuild', 'KSOProduct', 'Normal.dotm', 'Microsoft Office', + 'xmlns', 'ContentTypes', 'theme/theme', 'themeManager', 'fontTable', + 'Root Entry', 'SMWordDoc', 'Word.Document', + ]; + foreach ($needles as $needle) { + if (str_contains($line, $needle)) { + return true; + } + } + + return (bool) preg_match('/^(黑体|宋体|微软雅黑|等线(?: Light)?|Arial|Symbol|Tahoma|Times New Roman|Calibri|Cambria Math|Segoe UI Emoji|SimHei|SimSun|DengXian)$/u', $line); + } + private static function isGarbledLine(string $line): bool { if ($line === '') { @@ -829,10 +897,18 @@ class StudyTourDeclarationParser return false; } + if (self::isWordMetadataLine($line)) { + return true; + } + if (preg_match('/[\x{0080}-\x{009F}]/u', $line)) { return true; } + if (preg_match('/[\x{0200}-\x{024F}\x{0400}-\x{04FF}\x{0500}-\x{052F}\x{0600}-\x{06FF}]/u', $line)) { + return true; + } + if (preg_match('/[ᘀ-᛿]/u', $line)) { return true; } @@ -841,17 +917,29 @@ class StudyTourDeclarationParser return true; } + if (preg_match('/[￿]/u', $line)) { + return true; + } + + if (preg_match('/(?:xmlns|http:|ContentTypes|Normal\.dotm|Word\.Document)/iu', $line)) { + return true; + } + + if (preg_match('/[^\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()""\'\-\s\/&“”]/u', $line) && mb_strlen($line) <= 8) { + return true; + } + $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0; if ($cjk === 0 && mb_strlen($line) >= 3) { return true; } $len = mb_strlen($line); - if ($len < 6) { - return $cjk === 0; + if ($len <= 5) { + return $cjk < 2; } - $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&]/u', $line) ?: 0; + $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&“”]/u', $line) ?: 0; return ($readable / max($len, 1)) < 0.45; }