diff --git a/app/Support/DocTextExtractor.php b/app/Support/DocTextExtractor.php index d90fe9c..1cc3e93 100644 --- a/app/Support/DocTextExtractor.php +++ b/app/Support/DocTextExtractor.php @@ -278,19 +278,22 @@ class DocTextExtractor $chunk = mb_substr($raw, $start); $parts = preg_split('/[\x{0000}-\x{001F}\x{007F}-\x{009F}]+/u', $chunk) ?: []; $lines = []; - $seen = []; + $previous = null; foreach ($parts as $part) { $line = self::normalizeLine((string) $part); if ($line === '' || mb_strlen($line) < 2) { continue; } + if (self::isGarbledDeclarationLine($line)) { + continue; + } if (! preg_match('/[\x{4e00}-\x{9fff}0-9A-Za-z]/u', $line)) { continue; } - if (isset($seen[$line])) { + if ($line === $previous) { continue; } - $seen[$line] = true; + $previous = $line; $lines[] = $line; } @@ -483,4 +486,41 @@ class DocTextExtractor return is_string($output) && $output !== '' ? $output : null; } + + private static function isGarbledDeclarationLine(string $line): bool + { + if (preg_match('/^\d+$/', $line)) { + return false; + } + + if (preg_match('/^\d{1,2}:\d{2}/', $line)) { + return false; + } + + if (preg_match('/[\x{0080}-\x{009F}]/u', $line)) { + return true; + } + + if (preg_match('/[ᘀ-᛿]/u', $line)) { + return true; + } + + if (preg_match('/[漀愀脈摫欀䡒⡯␖]/u', $line)) { + return true; + } + + $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0; + if ($cjk === 0 && mb_strlen($line) >= 3) { + return true; + } + + $len = mb_strlen($line); + if ($len < 6) { + return $cjk === 0; + } + + $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&]/u', $line) ?: 0; + + return ($readable / max($len, 1)) < 0.45; + } } diff --git a/app/Support/StudyTourDeclarationParser.php b/app/Support/StudyTourDeclarationParser.php index 498ac3a..7c2bf09 100644 --- a/app/Support/StudyTourDeclarationParser.php +++ b/app/Support/StudyTourDeclarationParser.php @@ -83,17 +83,63 @@ class StudyTourDeclarationParser $score += count($parsed['venue_items'] ?? []) * 8; foreach ($parsed['route_plans'] ?? [] as $group) { - $score += 10; - $score += count($group['items'] ?? []) * 6; + $validItems = 0; + foreach ($group['items'] ?? [] as $item) { + $time = trim((string) ($item['time'] ?? '')); + $activity = trim((string) ($item['activity'] ?? '')); + $location = trim((string) ($item['location'] ?? '')); + + if ($activity === '' && $time === '') { + continue; + } + if (self::isGarbledLine($activity) || self::isGarbledLine($time) || self::isGarbledLine($location)) { + $score -= 30; + + continue; + } + + $validItems++; + } + + if ($validItems > 0) { + $score += 10; + $score += $validItems * 6; + } } - $score += count($parsed['courses'] ?? []) * 8; + $score += count($parsed['courses'] ?? []) * 5; + foreach ($parsed['courses'] ?? [] as $course) { + $name = trim((string) ($course['name'] ?? '')); + $content = trim((string) ($course['content'] ?? '')); + if ($name === '' || $name === $content) { + $score -= 20; + + continue; + } + if (preg_match('/^(课程\d+|运用|文化馆|走进|寻找|领略|窑烤|泡泡剧场|萤火虫知识)/u', $name)) { + $score -= 20; + + continue; + } + if (mb_strlen($name) > 24) { + $score -= 15; + + continue; + } + $score += 18; + } foreach (['intro_html', 'fee_html', 'implementation_html'] as $key) { $plain = trim(strip_tags((string) ($parsed[$key] ?? ''))); - if ($plain !== '') { - $score += 12 + min(mb_strlen($plain), 120) / 20; + if ($plain === '') { + continue; } + if (self::containsGarbledText($plain)) { + $score -= 120; + + continue; + } + $score += 12 + min(mb_strlen($plain), 120) / 20; } return (int) $score; @@ -116,7 +162,7 @@ class StudyTourDeclarationParser $implText = self::joinSectionLines($sections['impl'] ?? []); $routePlans = self::parseRouteSection($sections['route'] ?? []); - $courses = self::parseCoursesSection($sections['courses'] ?? []); + $courses = self::parseCoursesSection($sections['courses'] ?? [], $routePlans); $venueResult = self::matchVenueItems((string) ($basic['venue_raw'] ?? ''), $warnings); unset($basic['venue_raw']); @@ -162,8 +208,17 @@ class StudyTourDeclarationParser { $text = str_replace(["\r\n", "\r", "\f"], "\n", $text); $parts = preg_split('/\n/u', $text) ?: []; + $lines = []; + + foreach ($parts as $part) { + $line = trim((string) $part); + if ($line === '' || self::isGarbledLine($line)) { + continue; + } + $lines[] = $line; + } - return array_map(fn ($line) => trim((string) $line), $parts); + return $lines; } /** @@ -405,7 +460,31 @@ class StudyTourDeclarationParser if ($currentIndex === null) { continue; } + if (self::isGarbledLine($line) || preg_match('/^[四五六]、/u', $line)) { + break; + } if (! self::isTimeLine($line)) { + if (self::isRouteTableHeaderLine($line)) { + continue; + } + + $itemCount = count($groups[$currentIndex]['items']); + if ($itemCount > 0 && self::isLikelyRouteLocation($line)) { + $lastIndex = $itemCount - 1; + if ($groups[$currentIndex]['items'][$lastIndex]['location'] === '') { + $groups[$currentIndex]['items'][$lastIndex]['location'] = StudyTourPayload::compactText($line); + continue; + } + } + + if (! self::isLikelyRouteLocation($line)) { + $groups[$currentIndex]['items'][] = [ + 'time' => '', + 'activity' => StudyTourPayload::compactText($line), + 'location' => '', + ]; + } + continue; } @@ -413,7 +492,7 @@ class StudyTourDeclarationParser $activity = ''; $location = ''; - if ($i + 1 < count($lines) && ($lines[$i + 1] ?? '') !== '' && ! self::isTimeLine($lines[$i + 1]) && ! self::isRouteDateLabel($lines[$i + 1])) { + if ($i + 1 < count($lines) && ($lines[$i + 1] ?? '') !== '' && ! self::isTimeLine($lines[$i + 1]) && ! self::isRouteDateLabel($lines[$i + 1]) && ! self::isRouteTableHeaderLine($lines[$i + 1])) { $activity = $lines[++$i]; } @@ -421,8 +500,11 @@ class StudyTourDeclarationParser $i++; } - if ($i + 1 < count($lines) && ($lines[$i + 1] ?? '') !== '' && ! self::isTimeLine($lines[$i + 1]) && ! self::isRouteDateLabel($lines[$i + 1])) { - $location = $lines[++$i]; + if ($i + 1 < count($lines) && ($lines[$i + 1] ?? '') !== '' && ! self::isTimeLine($lines[$i + 1]) && ! self::isRouteDateLabel($lines[$i + 1]) && ! self::isRouteTableHeaderLine($lines[$i + 1])) { + $candidate = $lines[$i + 1]; + if (self::isLikelyRouteLocation($candidate) || ! self::looksLikeRouteActivity($candidate)) { + $location = $lines[++$i]; + } } $groups[$currentIndex]['items'][] = [ @@ -432,14 +514,30 @@ class StudyTourDeclarationParser ]; } + foreach ($groups as &$group) { + $lastLocation = ''; + foreach ($group['items'] as &$item) { + if ($item['location'] !== '') { + $lastLocation = $item['location']; + continue; + } + if ($lastLocation !== '') { + $item['location'] = $lastLocation; + } + } + unset($item); + } + unset($group); + return StudyTourPayload::normalizeRoutePlans($groups); } /** * @param array $lines + * @param array}> $routePlans * @return array */ - private static function parseCoursesSection(array $lines): array + private static function parseCoursesSection(array $lines, array $routePlans = []): array { $start = 0; foreach ($lines as $idx => $line) { @@ -469,9 +567,127 @@ class StudyTourDeclarationParser $i += 2; } + if ($courses === []) { + $courses = self::parseCoursesFromContentLines($lines, $start, $routePlans); + } + return StudyTourPayload::normalizeCourses($courses); } + /** + * @param array $lines + * @param array}> $routePlans + * @return array + */ + private static function parseCoursesFromContentLines(array $lines, int $start, array $routePlans): array + { + $contentLines = []; + for ($i = $start; $i < count($lines); $i++) { + $line = $lines[$i]; + if ($line === '' || in_array($line, ['序号', '课程名称', '课程内容'], true)) { + continue; + } + if (preg_match('/^[五六]、/u', $line)) { + break; + } + $contentLines[] = StudyTourPayload::compactText($line); + } + + if ($contentLines === []) { + return []; + } + + $pairedCourses = self::parseCoursesFromNameContentPairs($contentLines); + if ($pairedCourses !== []) { + return $pairedCourses; + } + + $activityNames = self::candidateCourseNamesFromRoutes($routePlans); + if (count($activityNames) === count($contentLines)) { + $courses = []; + foreach ($contentLines as $idx => $content) { + $courses[] = [ + 'sort' => $idx + 1, + 'name' => $activityNames[$idx], + 'content' => $content, + ]; + } + + return $courses; + } + + $courses = []; + foreach ($contentLines as $idx => $line) { + if (preg_match('/^(.{2,30}?)[::]\s*(.+)$/u', $line, $matches)) { + $courses[] = [ + 'sort' => $idx + 1, + 'name' => StudyTourPayload::compactText($matches[1]), + 'content' => StudyTourPayload::compactText($matches[2]), + ]; + continue; + } + $courses[] = [ + 'sort' => $idx + 1, + 'name' => mb_strlen($line) <= 20 ? $line : ('课程'.($idx + 1)), + 'content' => $line, + ]; + } + + return $courses; + } + + /** + * @param array $lines + * @return array + */ + private static function parseCoursesFromNameContentPairs(array $lines): array + { + if (count($lines) < 2 || count($lines) % 2 !== 0) { + return []; + } + + $courses = []; + for ($i = 0; $i < count($lines); $i += 2) { + $name = StudyTourPayload::compactText($lines[$i]); + $content = StudyTourPayload::compactText($lines[$i + 1]); + if ($name === '' || $content === '') { + return []; + } + if (mb_strlen($name) > 30 || mb_strlen($content) < mb_strlen($name)) { + return []; + } + $courses[] = [ + 'sort' => count($courses) + 1, + 'name' => $name, + 'content' => $content, + ]; + } + + return count($courses) >= 1 ? $courses : []; + } + + /** + * @param array}> $routePlans + * @return array + */ + private static function candidateCourseNamesFromRoutes(array $routePlans): array + { + $names = []; + foreach ($routePlans as $group) { + foreach ($group['items'] ?? [] as $item) { + $activity = StudyTourPayload::compactText((string) ($item['activity'] ?? '')); + if ($activity === '' || self::isRouteMealOrCeremony($activity)) { + continue; + } + if (! in_array($activity, $names, true)) { + $names[] = $activity; + } + } + } + + return $names; + } + /** * @return array{items: array>, warnings: array} */ @@ -575,6 +791,82 @@ class StudyTourDeclarationParser return (bool) preg_match('/^\d{1,2}:\d{2}/', $line); } + private static function isRouteTableHeaderLine(string $line): bool + { + return in_array($line, ['日期', '时间', '行程安排', '地点'], true); + } + + private static function isRouteMealOrCeremony(string $activity): bool + { + return (bool) preg_match('/(办理入住|民宿早餐|午休|结营仪式|领取伴手礼|欢迎晚宴|夜探|午餐|晚餐|野火饭|鸡汤|江村饭店|自由活动)/u', $activity); + } + + private static function isLikelyRouteLocation(string $line): bool + { + if (self::isRouteMealOrCeremony($line)) { + return false; + } + + return (bool) preg_match('/(博物馆|文化园|科技馆|湿地|故居|纪念馆|风情园|蚕桑|丝绸|活动中心|营地|基地|有限公司|酒店|民宿)/u', $line); + } + + private static function looksLikeRouteActivity(string $line): bool + { + return (bool) preg_match('/(体验|探秘|制作|参观|采摘|仪式|晚宴|午餐|晚餐|早餐|午休|活动|课程|王国|有趣|小夜灯|缫丝|挂件|面包)/u', $line); + } + + private static function isGarbledLine(string $line): bool + { + if ($line === '') { + return false; + } + + if (preg_match('/^\d+$/', $line)) { + return false; + } + + if (preg_match('/^\d{1,2}:\d{2}/', $line)) { + return false; + } + + if (preg_match('/[\x{0080}-\x{009F}]/u', $line)) { + return true; + } + + if (preg_match('/[ᘀ-᛿]/u', $line)) { + return true; + } + + if (preg_match('/[漀愀脈摫欀䡒⡯␖]/u', $line)) { + return true; + } + + $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0; + if ($cjk === 0 && mb_strlen($line) >= 3) { + return true; + } + + $len = mb_strlen($line); + if ($len < 6) { + return $cjk === 0; + } + + $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&]/u', $line) ?: 0; + + return ($readable / max($len, 1)) < 0.45; + } + + private static function containsGarbledText(string $text): bool + { + foreach (preg_split('/\R/u', $text) ?: [] as $line) { + if (self::isGarbledLine(trim((string) $line))) { + return true; + } + } + + return false; + } + /** * @param array $lines */ @@ -583,11 +875,14 @@ class StudyTourDeclarationParser $chunks = []; $buf = []; foreach ($lines as $line) { - if ($line === '') { + if ($line === '' || self::isGarbledLine($line)) { if ($buf !== []) { $chunks[] = trim(implode("\n", $buf)); $buf = []; } + if (self::isGarbledLine($line)) { + break; + } continue; } $buf[] = $line;