研学线路

master
lion 2 days ago
parent a5c2da0a88
commit 2e15299690

@ -278,19 +278,22 @@ class DocTextExtractor
$chunk = mb_substr($raw, $start);
$parts = preg_split('/[\x{0000}-\x{001F}\x{007F}-\x{009F}]+/u', $chunk) ?: [];
$lines = [];
$seen = [];
$previous = null;
foreach ($parts as $part) {
$line = self::normalizeLine((string) $part);
if ($line === '' || mb_strlen($line) < 2) {
continue;
}
if (self::isGarbledDeclarationLine($line)) {
continue;
}
if (! preg_match('/[\x{4e00}-\x{9fff}0-9A-Za-z]/u', $line)) {
continue;
}
if (isset($seen[$line])) {
if ($line === $previous) {
continue;
}
$seen[$line] = true;
$previous = $line;
$lines[] = $line;
}
@ -483,4 +486,41 @@ class DocTextExtractor
return is_string($output) && $output !== '' ? $output : null;
}
/**
 * Decide whether an extracted line is unreadable residue rather than document text.
 *
 * Checks run in this order:
 *  1. Lines made only of ASCII digits, or starting with an "H:MM" / "HH:MM"
 *     time, are always accepted.
 *  2. Lines containing a C1 control character (U+0080-U+009F), a character in
 *     U+1600-U+16FF, or one of a fixed list of marker characters are rejected.
 *     NOTE(review): the marker list looks like typical artefacts of text decoded
 *     with the wrong encoding — confirm against real samples.
 *  3. Any remaining line without a single character in U+4E00-U+9FFF is rejected.
 *  4. Lines shorter than six characters that passed step 3 are accepted.
 *  5. Longer lines are rejected when fewer than 45% of their characters are
 *     CJK ideographs, ASCII letters/digits, whitespace or one of 。、-/&.
 *
 * @param string $line Line to classify.
 * @return bool True when the line should be treated as garbled.
 */
private static function isGarbledDeclarationLine(string $line): bool
{
    // Pure numbers and time-of-day prefixes are legitimate even without CJK text.
    $isNumeric = preg_match('/^\d+$/', $line);
    if ($isNumeric) {
        return false;
    }
    $startsWithTime = preg_match('/^\d{1,2}:\d{2}/', $line);
    if ($startsWithTime) {
        return false;
    }

    // A single marker character is enough to reject the whole line.
    $markerPatterns = [
        '/[\x{0080}-\x{009F}]/u',
        '/[ᘀ-᛿]/u',
        '/[漀愀脈摫欀䡒⡯␖]/u',
    ];
    foreach ($markerPatterns as $markerPattern) {
        if (preg_match($markerPattern, $line)) {
            return true;
        }
    }

    // A failed or empty match counts as zero ideographs.
    $ideographs = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
    if ($ideographs === 0) {
        return true;
    }

    $length = mb_strlen($line);
    if ($length < 6) {
        // Short line that contains at least one ideograph.
        return false;
    }

    $readableChars = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z。、\-\s\/&]/u', $line) ?: 0;

    // $length is at least 6 here, so the division is always defined.
    return ($readableChars / $length) < 0.45;
}
}

@ -83,17 +83,63 @@ class StudyTourDeclarationParser
$score += count($parsed['venue_items'] ?? []) * 8;
foreach ($parsed['route_plans'] ?? [] as $group) {
$score += 10;
$score += count($group['items'] ?? []) * 6;
$validItems = 0;
foreach ($group['items'] ?? [] as $item) {
$time = trim((string) ($item['time'] ?? ''));
$activity = trim((string) ($item['activity'] ?? ''));
$location = trim((string) ($item['location'] ?? ''));
if ($activity === '' && $time === '') {
continue;
}
if (self::isGarbledLine($activity) || self::isGarbledLine($time) || self::isGarbledLine($location)) {
$score -= 30;
continue;
}
$validItems++;
}
if ($validItems > 0) {
$score += 10;
$score += $validItems * 6;
}
}
$score += count($parsed['courses'] ?? []) * 8;
$score += count($parsed['courses'] ?? []) * 5;
foreach ($parsed['courses'] ?? [] as $course) {
$name = trim((string) ($course['name'] ?? ''));
$content = trim((string) ($course['content'] ?? ''));
if ($name === '' || $name === $content) {
$score -= 20;
continue;
}
if (preg_match('/^(课程\d+|运用|文化馆|走进|寻找|领略|窑烤|泡泡剧场|萤火虫知识)/u', $name)) {
$score -= 20;
continue;
}
if (mb_strlen($name) > 24) {
$score -= 15;
continue;
}
$score += 18;
}
foreach (['intro_html', 'fee_html', 'implementation_html'] as $key) {
$plain = trim(strip_tags((string) ($parsed[$key] ?? '')));
if ($plain !== '') {
$score += 12 + min(mb_strlen($plain), 120) / 20;
if ($plain === '') {
continue;
}
if (self::containsGarbledText($plain)) {
$score -= 120;
continue;
}
$score += 12 + min(mb_strlen($plain), 120) / 20;
}
return (int) $score;
@ -116,7 +162,7 @@ class StudyTourDeclarationParser
$implText = self::joinSectionLines($sections['impl'] ?? []);
$routePlans = self::parseRouteSection($sections['route'] ?? []);
$courses = self::parseCoursesSection($sections['courses'] ?? []);
$courses = self::parseCoursesSection($sections['courses'] ?? [], $routePlans);
$venueResult = self::matchVenueItems((string) ($basic['venue_raw'] ?? ''), $warnings);
unset($basic['venue_raw']);
@ -162,8 +208,17 @@ class StudyTourDeclarationParser
{
$text = str_replace(["\r\n", "\r", "\f"], "\n", $text);
$parts = preg_split('/\n/u', $text) ?: [];
$lines = [];
foreach ($parts as $part) {
$line = trim((string) $part);
if ($line === '' || self::isGarbledLine($line)) {
continue;
}
$lines[] = $line;
}
return array_map(fn ($line) => trim((string) $line), $parts);
return $lines;
}
/**
@ -405,7 +460,31 @@ class StudyTourDeclarationParser
if ($currentIndex === null) {
continue;
}
if (self::isGarbledLine($line) || preg_match('/^[四五六]、/u', $line)) {
break;
}
if (! self::isTimeLine($line)) {
if (self::isRouteTableHeaderLine($line)) {
continue;
}
$itemCount = count($groups[$currentIndex]['items']);
if ($itemCount > 0 && self::isLikelyRouteLocation($line)) {
$lastIndex = $itemCount - 1;
if ($groups[$currentIndex]['items'][$lastIndex]['location'] === '') {
$groups[$currentIndex]['items'][$lastIndex]['location'] = StudyTourPayload::compactText($line);
continue;
}
}
if (! self::isLikelyRouteLocation($line)) {
$groups[$currentIndex]['items'][] = [
'time' => '',
'activity' => StudyTourPayload::compactText($line),
'location' => '',
];
}
continue;
}
@ -413,7 +492,7 @@ class StudyTourDeclarationParser
$activity = '';
$location = '';
if ($i + 1 < count($lines) && ($lines[$i + 1] ?? '') !== '' && ! self::isTimeLine($lines[$i + 1]) && ! self::isRouteDateLabel($lines[$i + 1])) {
if ($i + 1 < count($lines) && ($lines[$i + 1] ?? '') !== '' && ! self::isTimeLine($lines[$i + 1]) && ! self::isRouteDateLabel($lines[$i + 1]) && ! self::isRouteTableHeaderLine($lines[$i + 1])) {
$activity = $lines[++$i];
}
@ -421,8 +500,11 @@ class StudyTourDeclarationParser
$i++;
}
if ($i + 1 < count($lines) && ($lines[$i + 1] ?? '') !== '' && ! self::isTimeLine($lines[$i + 1]) && ! self::isRouteDateLabel($lines[$i + 1])) {
$location = $lines[++$i];
if ($i + 1 < count($lines) && ($lines[$i + 1] ?? '') !== '' && ! self::isTimeLine($lines[$i + 1]) && ! self::isRouteDateLabel($lines[$i + 1]) && ! self::isRouteTableHeaderLine($lines[$i + 1])) {
$candidate = $lines[$i + 1];
if (self::isLikelyRouteLocation($candidate) || ! self::looksLikeRouteActivity($candidate)) {
$location = $lines[++$i];
}
}
$groups[$currentIndex]['items'][] = [
@ -432,14 +514,30 @@ class StudyTourDeclarationParser
];
}
foreach ($groups as &$group) {
$lastLocation = '';
foreach ($group['items'] as &$item) {
if ($item['location'] !== '') {
$lastLocation = $item['location'];
continue;
}
if ($lastLocation !== '') {
$item['location'] = $lastLocation;
}
}
unset($item);
}
unset($group);
return StudyTourPayload::normalizeRoutePlans($groups);
}
/**
* @param array<int, string> $lines
* @param array<int, array{date_label: string, items: array<int, array{time: string, activity: string, location: string}>}> $routePlans
* @return array<int, array{sort: int, name: string, content: string}>
*/
private static function parseCoursesSection(array $lines): array
private static function parseCoursesSection(array $lines, array $routePlans = []): array
{
$start = 0;
foreach ($lines as $idx => $line) {
@ -469,9 +567,127 @@ class StudyTourDeclarationParser
$i += 2;
}
if ($courses === []) {
$courses = self::parseCoursesFromContentLines($lines, $start, $routePlans);
}
return StudyTourPayload::normalizeCourses($courses);
}
/**
 * Fallback course parser, used when the regular course-table parsing produced nothing.
 *
 * Collects the section's lines from $start onward (skipping empty lines and the
 * table header cells, stopping at a heading that starts with "五、" or "六、")
 * and then tries, in order:
 *  1. reading the lines as alternating name / content pairs;
 *  2. if the number of candidate activity names taken from the route plans
 *     equals the number of lines, pairing them by position;
 *  3. per line: splitting "name:content" on the first colon, otherwise using
 *     the line itself as the name when it is at most 20 characters long and
 *     a generated "课程N" name when it is longer.
 *
 * @param array<int, string> $lines
 * @param array<int, array{date_label: string, items: array<int, array{time: string, activity: string, location: string}>}> $routePlans
 * @return array<int, array{sort: int, name: string, content: string}>
 */
private static function parseCoursesFromContentLines(array $lines, int $start, array $routePlans): array
{
    $tableHeaders = ['序号', '课程名称', '课程内容'];
    $total = count($lines);

    $body = [];
    for ($cursor = $start; $cursor < $total; $cursor++) {
        $raw = $lines[$cursor];
        if ($raw === '' || in_array($raw, $tableHeaders, true)) {
            continue;
        }
        // A heading that starts with "五、" or "六、" ends the scan.
        if (preg_match('/^[五六]、/u', $raw)) {
            break;
        }
        $body[] = StudyTourPayload::compactText($raw);
    }

    if ($body === []) {
        return [];
    }

    $paired = self::parseCoursesFromNameContentPairs($body);
    if ($paired !== []) {
        return $paired;
    }

    $routeNames = self::candidateCourseNamesFromRoutes($routePlans);
    $useRouteNames = count($routeNames) === count($body);

    $courses = [];
    foreach ($body as $position => $text) {
        $sort = $position + 1;

        if ($useRouteNames) {
            // Same number of route activities and content lines: pair them by position.
            $courses[] = [
                'sort' => $sort,
                'name' => $routeNames[$position],
                'content' => $text,
            ];
            continue;
        }

        // NOTE(review): only the ASCII colon is matched here — confirm whether
        // full-width colons are normalised before this point.
        if (preg_match('/^(.{2,30}?)[:]\s*(.+)$/u', $text, $pieces)) {
            $courses[] = [
                'sort' => $sort,
                'name' => StudyTourPayload::compactText($pieces[1]),
                'content' => StudyTourPayload::compactText($pieces[2]),
            ];
            continue;
        }

        $fallbackName = '课程'.$sort;
        $courses[] = [
            'sort' => $sort,
            'name' => mb_strlen($text) <= 20 ? $text : $fallbackName,
            'content' => $text,
        ];
    }

    return $courses;
}
/**
 * Interpret the lines as strictly alternating "name, content" rows.
 *
 * The whole attempt is abandoned (empty array returned) when the line count is
 * odd or below two, when either half of a pair is empty after compaction, when
 * a name is longer than 30 characters, or when a content line is shorter than
 * its name.
 *
 * @param array<int, string> $lines
 * @return array<int, array{sort: int, name: string, content: string}>
 */
private static function parseCoursesFromNameContentPairs(array $lines): array
{
    $total = count($lines);
    if ($total < 2 || $total % 2 !== 0) {
        return [];
    }

    $courses = [];
    $sort = 0;
    for ($offset = 0; $offset < $total; $offset += 2) {
        $courseName = StudyTourPayload::compactText($lines[$offset]);
        $courseContent = StudyTourPayload::compactText($lines[$offset + 1]);

        if ($courseName === '' || $courseContent === '') {
            return [];
        }

        $nameLength = mb_strlen($courseName);
        if ($nameLength > 30 || mb_strlen($courseContent) < $nameLength) {
            return [];
        }

        $courses[] = [
            'sort' => ++$sort,
            'name' => $courseName,
            'content' => $courseContent,
        ];
    }

    // At least one pair was collected, because $total is even and >= 2.
    return $courses;
}
/**
 * Collect the distinct activity names of all route items, in first-seen order.
 *
 * Activities that are empty after compaction, or that match the
 * meal/ceremony keyword list, are left out.
 *
 * @param array<int, array{date_label: string, items: array<int, array{time: string, activity: string, location: string}>}> $routePlans
 * @return array<int, string>
 */
private static function candidateCourseNamesFromRoutes(array $routePlans): array
{
    $collected = [];

    foreach ($routePlans as $plan) {
        $entries = $plan['items'] ?? [];

        foreach ($entries as $entry) {
            $label = StudyTourPayload::compactText((string) ($entry['activity'] ?? ''));

            if ($label === '') {
                continue;
            }
            if (self::isRouteMealOrCeremony($label)) {
                continue;
            }
            if (in_array($label, $collected, true)) {
                // Already recorded; keep only the first occurrence.
                continue;
            }

            $collected[] = $label;
        }
    }

    return $collected;
}
/**
* @return array{items: array<int, array<string, mixed>>, warnings: array<int, string>}
*/
@ -575,6 +791,82 @@ class StudyTourDeclarationParser
return (bool) preg_match('/^\d{1,2}:\d{2}/', $line);
}
/**
 * Tell whether the line is exactly one of the route table's column headers.
 *
 * @param string $line Line to test; compared as-is, without trimming.
 * @return bool True for "日期", "时间", "行程安排" or "地点".
 */
private static function isRouteTableHeaderLine(string $line): bool
{
    $headers = [
        '日期' => true,
        '时间' => true,
        '行程安排' => true,
        '地点' => true,
    ];

    return isset($headers[$line]);
}
/**
 * Tell whether an activity text contains one of the fixed meal, rest,
 * check-in or ceremony keywords anywhere in the string.
 *
 * @param string $activity Activity text to inspect.
 * @return bool True when any keyword occurs in the text.
 */
private static function isRouteMealOrCeremony(string $activity): bool
{
    $keywordPattern = '/(办理入住|民宿早餐|午休|结营仪式|领取伴手礼|欢迎晚宴|夜探|午餐|晚餐|野火饭|鸡汤|江村饭店|自由活动)/u';

    // preg_match yields 1 on a hit, 0 or false otherwise.
    return preg_match($keywordPattern, $activity) === 1;
}
/**
 * Heuristic: does the line look like a venue or place name?
 *
 * Meal/ceremony lines are ruled out first, so a line such as "民宿早餐" is not
 * reported as a location even though it contains the venue keyword "民宿".
 *
 * @param string $line Line to inspect.
 * @return bool True when a venue keyword occurs and no meal/ceremony keyword does.
 */
private static function isLikelyRouteLocation(string $line): bool
{
    $venuePattern = '/(博物馆|文化园|科技馆|湿地|故居|纪念馆|风情园|蚕桑|丝绸|活动中心|营地|基地|有限公司|酒店|民宿)/u';

    return ! self::isRouteMealOrCeremony($line)
        && preg_match($venuePattern, $line) === 1;
}
/**
 * Heuristic: does the line contain one of the fixed activity keywords?
 *
 * @param string $line Line to inspect.
 * @return bool True when any activity keyword occurs anywhere in the line.
 */
private static function looksLikeRouteActivity(string $line): bool
{
    $activityPattern = '/(体验|探秘|制作|参观|采摘|仪式|晚宴|午餐|晚餐|早餐|午休|活动|课程|王国|有趣|小夜灯|缫丝|挂件|面包)/u';

    // preg_match yields 1 on a hit, 0 or false otherwise.
    return preg_match($activityPattern, $line) === 1;
}
/**
 * Decide whether a line is unreadable residue rather than document text.
 *
 * Checks run in this order:
 *  1. The empty string is never garbled.
 *  2. Lines made only of ASCII digits, or starting with an "H:MM" / "HH:MM"
 *     time, are accepted.
 *  3. Lines containing a C1 control character (U+0080-U+009F), a character in
 *     U+1600-U+16FF, or one of a fixed list of marker characters are rejected.
 *     NOTE(review): the marker list looks like typical artefacts of text decoded
 *     with the wrong encoding — confirm against real samples.
 *  4. Any remaining line without a character in U+4E00-U+9FFF is rejected.
 *  5. Lines shorter than six characters that passed step 4 are accepted.
 *  6. Longer lines are rejected when fewer than 45% of their characters are
 *     CJK ideographs, ASCII letters/digits, whitespace or one of 。、-/&.
 *
 * @param string $line Line to classify.
 * @return bool True when the line should be treated as garbled.
 */
private static function isGarbledLine(string $line): bool
{
    if ($line === '') {
        return false;
    }

    // Numbers and time-of-day prefixes are valid even without any CJK text.
    foreach (['/^\d+$/', '/^\d{1,2}:\d{2}/'] as $trustedPattern) {
        if (preg_match($trustedPattern, $line)) {
            return false;
        }
    }

    // One marker character is enough to reject the line.
    foreach (['/[\x{0080}-\x{009F}]/u', '/[ᘀ-᛿]/u', '/[漀愀脈摫欀䡒⡯␖]/u'] as $markerPattern) {
        if (preg_match($markerPattern, $line)) {
            return true;
        }
    }

    // A failed or empty match counts as zero ideographs.
    $cjkCount = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
    if ($cjkCount === 0) {
        return true;
    }

    $length = mb_strlen($line);
    if ($length < 6) {
        // Short line with at least one ideograph.
        return false;
    }

    $readableCount = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z。、\-\s\/&]/u', $line) ?: 0;

    // $length is at least 6 here, so the division is always defined.
    return ($readableCount / $length) < 0.45;
}
/**
 * Tell whether any line of a multi-line text is classified as garbled.
 *
 * The text is split on any line-break sequence and every piece is trimmed
 * before being checked.
 *
 * @param string $text Plain text, possibly spanning several lines.
 * @return bool True as soon as one line is garbled, false otherwise.
 */
private static function containsGarbledText(string $text): bool
{
    $segments = preg_split('/\R/u', $text);
    if (! $segments) {
        // Split failure (or nothing to inspect) is treated as "not garbled".
        return false;
    }

    foreach ($segments as $segment) {
        $candidate = trim((string) $segment);
        if (self::isGarbledLine($candidate)) {
            return true;
        }
    }

    return false;
}
/**
* @param array<int, string> $lines
*/
@ -583,11 +875,14 @@ class StudyTourDeclarationParser
$chunks = [];
$buf = [];
foreach ($lines as $line) {
if ($line === '') {
if ($line === '' || self::isGarbledLine($line)) {
if ($buf !== []) {
$chunks[] = trim(implode("\n", $buf));
$buf = [];
}
if (self::isGarbledLine($line)) {
break;
}
continue;
}
$buf[] = $line;

Loading…
Cancel
Save