From 4fc30d238d8699ebed69d6fe6793749f95c77fa9 Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Tue, 30 Jun 2026 15:46:28 +0800 Subject: [PATCH] =?UTF-8?q?=E7=A0=94=E5=AD=A6=E7=BA=BF=E8=B7=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/Support/StudyTourDeclarationParser.php | 149 ++++++++++++++++++++- 1 file changed, 143 insertions(+), 6 deletions(-) diff --git a/app/Support/StudyTourDeclarationParser.php b/app/Support/StudyTourDeclarationParser.php index 3118a4d..1bf0b02 100644 --- a/app/Support/StudyTourDeclarationParser.php +++ b/app/Support/StudyTourDeclarationParser.php @@ -162,7 +162,7 @@ class StudyTourDeclarationParser $introText = self::joinSectionLines($sections['intro'] ?? []); $feeText = self::joinSectionLines($sections['fee'] ?? []); - $implText = self::joinSectionLines(self::filterImplementationLines($sections['impl'] ?? [])); + $implText = self::sanitizeImplementationText(self::joinSectionLines(self::filterImplementationLines($sections['impl'] ?? []))); $routePlans = self::parseRouteSection($sections['route'] ?? []); $courses = self::parseCoursesSection($sections['courses'] ?? [], $routePlans); @@ -814,18 +814,135 @@ class StudyTourDeclarationParser if ($line === '') { continue; } - if (self::isGarbledLine($line) || self::isWordMetadataLine($line)) { - break; - } if (preg_match('/^\(?实施场次[、,].*等\)?$/u', $line)) { continue; } + if (! self::isPlausibleImplementationLine($line)) { + break; + } $kept[] = $line; } return $kept; } + private static function sanitizeImplementationText(string $text): string + { + $text = StudyTourPayload::compactMultilineText($text); + if ($text === '') { + return ''; + } + + $kept = []; + foreach (preg_split('/\R/u', $text) ?: [] as $line) { + $line = trim((string) $line); + if ($line === '') { + continue; + } + if (! self::isPlausibleImplementationLine($line)) { + break; + } + $kept[] = $line; + } + + return implode("\n", $kept); + } + + private static function isPlausibleImplementationLine(string $line): bool + { + if (self::isImplementationStopLine($line)) { + return false; + } + + if (preg_match('/(?:\d{1,2}月\d{1,2}(?:-\d{1,2})?日|\d{1,2}-\d{1,2}月|\d{4}年\d{1,2}月\d{1,2}日|第[一二三四五六七八九十百零\d]+[场期次])/u', $line)) { + return true; + } + + if (preg_match('/^(说明|备注|要点)[::]/u', $line)) { + return true; + } + + if (preg_match('/[预约成团寒暑假节假说明要点场次时间共]/u', $line)) { + $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0; + $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&""《》—–,.\:]/u', $line) ?: 0; + + return $cjk >= 3 && ($readable / max(mb_strlen($line), 1)) >= 0.6; + } + + $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0; + $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&""《》—–,.\:]/u', $line) ?: 0; + $len = mb_strlen($line); + + return $cjk >= 10 + && ($readable / max($len, 1)) >= 0.75 + && ! preg_match('/[A-Za-z]/', $line); + } + + private static function isImplementationStopLine(string $line): bool + { + if (self::isGarbledLine($line) || self::isWordMetadataLine($line)) { + return true; + } + + if (preg_match('/[￿Ç]/u', $line)) { + return true; + } + + if (preg_match('/^(一、|二、|三、|四、|五、|六、|、线路)/u', $line)) { + return true; + } + + if (preg_match('/^(线路规划|线路简介|线路名称|线路联络人|组织单位名称|咨询电话|研学课程|线路收费标准|线路点位|适配学段|研学时长|适宜人数)/u', $line)) { + return true; + } + + if (preg_match('/^(日期|时间|行程安排|地点|以下为示例|(可多选)|熊猫饲养员|幼儿园|微软)/u', $line)) { + return true; + } + + if (preg_match('/^第[一二三四五六七八九十百零\d]+天$/u', $line)) { + return true; + } + + if (preg_match('/^(苏州市|常熟市|吴江区|太仓市)$/u', $line)) { + return true; + } + + if (preg_match('/^\d{1,2}$/u', $line)) { + return true; + } + + if (preg_match('/^1\d{10}$/u', $line)) { + return true; + } + + if (preg_match('/^\d{1,2}:\d{2}x$/u', $line)) { + return true; + } + + if (preg_match('/[\x{E000}-\x{F8FF}]/u', $line)) { + return true; + } + + if (preg_match('/[\x{AC00}-\x{D7AF}\x{1100}-\x{11FF}]/u', $line)) { + return true; + } + + if (preg_match('/[鐁耠耀謀卛鄬铐]/u', $line)) { + return true; + } + + if (preg_match('/[°éêÈĨ༪]/u', $line)) { + return true; + } + + if (preg_match('/[A-Za-z]/', $line) && ! preg_match('/(colour|run|DIY)/iu', $line)) { + return true; + } + + return false; + } + private static function isValidImplementationLine(string $line): bool { if (self::isGarbledLine($line) || self::isWordMetadataLine($line)) { @@ -873,7 +990,7 @@ class StudyTourDeclarationParser { static $needles = [ '正文', '默认段落字体', '普通表格', '无列表', '页脚', '页眉', '网格型', - '微软用户', 'WordDocument', 'DocumentSummary', 'SummaryInformation', + '微软用户', '微软中国', 'WordDocument', 'DocumentSummary', 'SummaryInformation', 'CompObj', 'ProductBuild', 'KSOProduct', 'Normal.dotm', 'Microsoft Office', 'xmlns', 'ContentTypes', 'theme/theme', 'themeManager', 'fontTable', 'Root Entry', 'SMWordDoc', 'Word.Document', @@ -929,6 +1046,18 @@ class StudyTourDeclarationParser return true; } + if (preg_match('/[\x{E000}-\x{F8FF}]/u', $line)) { + return true; + } + + if (preg_match('/[鐁耠耀謀卛鄬铐]/u', $line)) { + return true; + } + + if (str_contains($line, 'Ç')) { + return true; + } + if (preg_match('/[￿]/u', $line)) { return true; } @@ -958,8 +1087,16 @@ class StudyTourDeclarationParser private static function containsGarbledText(string $text): bool { + if (preg_match('/[鐁耠耀謀卛鄬铐]/u', $text)) { + return true; + } + + if (preg_match('/(?:线路规划|、线路|微软中国|WordDocument)/u', $text)) { + return true; + } + foreach (preg_split('/\R/u', $text) ?: [] as $line) { - if (self::isGarbledLine(trim((string) $line))) { + if (self::isGarbledLine(trim((string) $line)) || self::isImplementationStopLine(trim((string) $line))) { return true; } }