'一、线路基本情况',
        'intro' => '二、线路简介',
        'route' => '三、线路规划',
        'courses' => '四、研学课程',
        'fee' => '五、线路收费标准',
        'impl' => '六、线路计划实施情况',
    ];

    /**
     * Extract the text of a document file and parse it.
     *
     * The file is handed to DocTextExtractor::extract() and the resulting text to parseText().
     *
     * @param string $path      Passed through to DocTextExtractor::extract().
     * @param string $extension Passed through to DocTextExtractor::extract().
     * @return array{parsed: array, warnings: array<int, string>}
     */
    public static function parseFile(string $path, string $extension): array
    {
        return self::parseText(DocTextExtractor::extract($path, $extension));
    }

    /**
     * Parse the plain text of a study-tour document.
     *
     * The text is split into the sections named by SECTION_MARKERS, each section is parsed by its
     * dedicated helper, and the combined result is passed through StudyTourPayload::normalizeIncoming().
     *
     * @param string $text Plain text of the document.
     * @return array{parsed: array, warnings: array<int, string>} Normalized payload plus de-duplicated warnings.
     */
    public static function parseText(string $text): array
    {
        $warnings = [];
        $sections = self::splitSections(self::splitLines($text));

        $basic = self::parseBasicSection($sections['basic'] ?? []);
        $introText = self::joinSectionLines($sections['intro'] ?? []);
        $feeText = self::joinSectionLines($sections['fee'] ?? []);
        $implText = self::joinSectionLines($sections['impl'] ?? []);
        $routePlans = self::parseRouteSection($sections['route'] ?? []);
        $courses = self::parseCoursesSection($sections['courses'] ?? []);

        // matchVenueItems() appends its "unmatched venue" messages to $warnings by reference.
        $venueResult = self::matchVenueItems((string) ($basic['venue_raw'] ?? ''), $warnings);
        unset($basic['venue_raw']);

        if ($basic['name'] === '') {
            $warnings[] = '未识别到线路名称,请手动填写';
        }
        if ($venueResult['items'] === []) {
            $warnings[] = '未识别到线路点位/场馆,请手动添加';
        }

        $parsed = StudyTourPayload::normalizeIncoming([
            'name' => $basic['name'],
            'org_name' => $basic['org_name'],
            'seasons' => $basic['seasons'],
            'suitable_count' => $basic['suitable_count'],
            'grade_levels' => $basic['grade_levels'],
            'duration' => $basic['duration'],
            'contact_person' => $basic['contact_person'],
            'contact_phones' => $basic['contact_phones'],
            'venue_items' => $venueResult['items'],
            'intro_html' => self::plainTextToHtml($introText),
            'route_plans' => $routePlans,
            'courses' => $courses,
            'fee_html' => self::plainTextToHtml($feeText),
            'implementation_html' => self::plainTextToHtml($implText),
            'tags' => [],
            'cover_image' => '',
            'sort' => 0,
            'is_on_shelf' => true,
        ]);

        return [
            'parsed' => $parsed,
            'warnings' => array_values(array_unique($warnings)),
        ];
    }

    /**
     * Split text into trimmed lines, treating CRLF, CR and form feed as line breaks.
     *
     * @return array<int, string>
     */
    private static function splitLines(string $text): array
    {
        $normalized = str_replace(["\r\n", "\r", "\f"], "\n", $text);

        // explode() cannot fail, whereas a /u regex split returns false on invalid UTF-8
        // and would silently discard the whole document.
        return array_map(
            static fn (string $line): string => trim($line),
            explode("\n", $normalized)
        );
    }

    /**
     * Distribute lines over the sections declared in SECTION_MARKERS.
     *
     * A line that starts with a marker opens that section and is itself dropped; every other
     * line is appended to the section currently open. Lines before the first marker are ignored.
     *
     * @param array<int, string> $lines
     * @return array<string, array<int, string>>
     */
    private static function splitSections(array $lines): array
    {
        $sections = array_fill_keys(array_keys(self::SECTION_MARKERS), []);
        $current = null;

        foreach ($lines as $line) {
            $opened = null;
            foreach (self::SECTION_MARKERS as $key => $marker) {
                if (str_starts_with($line, $marker)) {
                    $opened = $key;
                    break;
                }
            }

            if ($opened !== null) {
                $current = $opened;

                continue;
            }

            if ($current !== null) {
                $sections[$current][] = $line;
            }
        }

        return $sections;
    }

    /**
     * Labels of the single-value rows of the basic-information section, mapped to their field keys.
     *
     * @return array<string, string>
     */
    private static function basicFieldLabels(): array
    {
        return [
            '组织单位名称' => 'org_name',
            '线路名称' => 'name',
            '线路点位' => 'venue_raw',
            '适宜人数' => 'suitable_count',
            '研学时长' => 'duration',
            '线路联络人' => 'contact_person',
            '咨询电话' => 'contact_phones',
        ];
    }

    /**
     * Parse the basic-information section into its fields.
     *
     * A line equal to one of the known labels starts a field whose value is the following run of
     * lines up to the next boundary (blank line, label, multi-select hint, or option heading).
     * Lines starting with 对应季节 / 适配学段 start a checkbox run parsed by parseSeasons() /
     * parseGrades().
     *
     * @param array<int, string> $lines
     * @return array<string, mixed>
     */
    private static function parseBasicSection(array $lines): array
    {
        $fields = [
            'org_name' => '',
            'name' => '',
            'seasons' => [],
            'venue_raw' => '',
            'suitable_count' => '',
            'grade_levels' => [],
            'duration' => '',
            'contact_person' => '',
            'contact_phones' => '',
        ];

        $labels = self::basicFieldLabels();
        $seasonBuffer = [];
        $gradeBuffer = [];
        $total = count($lines);

        for ($i = 0; $i < $total; $i++) {
            $line = $lines[$i];
            if ($line === '') {
                continue;
            }

            if (str_starts_with($line, '对应季节')) {
                [$chunk, $i] = self::collectOptionLines($lines, $i);
                $seasonBuffer = array_merge($seasonBuffer, $chunk);
                $fields['seasons'] = self::parseSeasons(implode(' ', $seasonBuffer));

                continue;
            }

            if (str_starts_with($line, '适配学段')) {
                [$chunk, $i] = self::collectOptionLines($lines, $i);
                $gradeBuffer = array_merge($gradeBuffer, $chunk);
                $fields['grade_levels'] = self::parseGrades(implode(' ', $gradeBuffer));

                continue;
            }

            if (! array_key_exists($line, $labels)) {
                continue;
            }

            $valueLines = [];
            while (! self::isBasicBoundaryLine($lines[$i + 1] ?? '')) {
                $valueLines[] = $lines[++$i];
            }
            $fields[$labels[$line]] = trim(implode("\n", $valueLines));
        }

        $fields['suitable_count'] = self::normalizeBlankPlaceholder($fields['suitable_count']);
        $fields['duration'] = self::normalizeDuration($fields['duration']);
        $fields['contact_phones'] = StudyTourPayload::normalizeContactPhones($fields['contact_phones']);

        foreach (['org_name', 'name', 'contact_person', 'venue_raw'] as $key) {
            $fields[$key] = StudyTourPayload::compactText((string) $fields[$key]);
        }

        return $fields;
    }

    /**
     * Collect the lines of a checkbox run starting at $index.
     *
     * The run ends at the first boundary line. Multi-select hint lines carry no options, so they
     * are skipped wherever they appear instead of terminating the run.
     *
     * @param array<int, string> $lines
     * @return array{0: array<int, string>, 1: int} Collected lines and the index of the last consumed line.
     */
    private static function collectOptionLines(array $lines, int $index): array
    {
        $collected = [$lines[$index]];

        while (true) {
            $next = $lines[$index + 1] ?? '';

            if (self::isMultiSelectHint($next)) {
                $index++;

                continue;
            }

            // Stopping at the other option heading keeps a directly following
            // 适配学段 / 对应季节 row from being swallowed by this run.
            if (self::isBasicBoundaryLine($next)) {
                break;
            }

            $collected[] = $next;
            $index++;
        }

        return [$collected, $index];
    }

    /**
     * Whether the line is the "(multiple choice)" hint, in ASCII or full-width brackets.
     */
    private static function isMultiSelectHint(string $line): bool
    {
        return $line === '(可多选)' || $line === '（可多选）';
    }

    /**
     * Whether the line ends the value of the current basic-section row.
     */
    private static function isBasicBoundaryLine(string $line): bool
    {
        return $line === ''
            || self::isBasicLabelLine($line)
            || str_starts_with($line, '对应季节')
            || str_starts_with($line, '适配学段');
    }

    /**
     * Whether the line is a single-value field label or the multi-select hint.
     */
    private static function isBasicLabelLine(string $line): bool
    {
        return self::isMultiSelectHint($line)
            || array_key_exists($line, self::basicFieldLabels());
    }

    /**
     * Map ticked season labels to their values, keeping only values active in the dictionary.
     *
     * @return array<int, string>
     */
    private static function parseSeasons(string $raw): array
    {
        $selected = self::pickCheckedOptions($raw, [
            '春季' => 'spring',
            '夏季' => 'summer',
            '秋季' => 'autumn',
            '冬季' => 'winter',
        ]);

        return self::filterDictValues('study_tour_season', $selected);
    }

    /**
     * Map ticked grade-level labels to their values, keeping only values active in the dictionary.
     *
     * @return array<int, string>
     */
    private static function parseGrades(string $raw): array
    {
        $selected = self::pickCheckedOptions($raw, [
            '幼儿园' => 'kindergarten',
            '小学' => 'primary',
            '初中' => 'junior',
            '高中' => 'high',
            '全学段' => 'all',
        ]);

        return self::filterDictValues('study_tour_grade_level', $selected);
    }

    /**
     * Return the values whose label is preceded by a "checked" mark in $raw.
     *
     * @param array<string, string> $map Label => value, in the order results should be returned.
     * @return array<int, string>
     */
    private static function pickCheckedOptions(string $raw, array $map): array
    {
        $selected = [];

        foreach ($map as $label => $value) {
            $pattern = '/(?:[☑✅✔]|■)\s*'.preg_quote((string) $label, '/').'/u';
            if (preg_match($pattern, $raw) === 1) {
                $selected[] = $value;
            }
        }

        return $selected;
    }

    /**
     * Keep only the values that exist as active items of the given dictionary type.
     *
     * @param array<int, string> $values
     * @return array<int, string>
     */
    private static function filterDictValues(string $dictType, array $values): array
    {
        // Nothing selected: the intersection is empty regardless, so skip the query.
        if ($values === []) {
            return [];
        }

        $allowed = DictItem::query()
            ->where('dict_type', $dictType)
            ->where('is_active', true)
            ->pluck('item_value')
            ->all();

        return array_values(array_intersect($values, $allowed));
    }

    /**
     * Compact the text and strip runs of underscores (fill-in blanks left in the template).
     */
    private static function normalizeBlankPlaceholder(string $raw): string
    {
        $text = StudyTourPayload::compactText($raw);
        $text = preg_replace('/_+/u', '', $text) ?? $text;

        return StudyTourPayload::compactText($text);
    }

    /**
     * Strip underscore blanks from a duration value and flatten it onto one line.
     */
    private static function normalizeDuration(string $raw): string
    {
        $text = StudyTourPayload::compactMultilineText($raw);
        if ($text === '') {
            return '';
        }

        $text = preg_replace('/_+/u', '', $text) ?? $text;

        return StudyTourPayload::compactText(str_replace("\n", ' ', $text));
    }

    /**
     * Parse the itinerary section into date groups with timed items.
     *
     * Parsing starts after the LAST line that equals one of the table header cells, so anything
     * before it is ignored. A date-label line opens a group; within a group each time line starts
     * an item whose activity is the directly following cell line (if any) and whose location is
     * the next cell line after optional blank lines (if any).
     *
     * @param array<int, string> $lines
     * @return array Result of StudyTourPayload::normalizeRoutePlans().
     */
    private static function parseRouteSection(array $lines): array
    {
        $start = 0;
        foreach ($lines as $idx => $line) {
            if (in_array($line, ['日期', '时间', '行程安排', '地点'], true)) {
                $start = $idx + 1;
            }
        }

        $groups = [];
        $currentIndex = null;
        $total = count($lines);

        for ($i = $start; $i < $total; $i++) {
            $line = $lines[$i];
            if ($line === '') {
                continue;
            }

            if (self::isRouteDateLabel($line)) {
                $groups[] = [
                    'date_label' => StudyTourPayload::compactText($line),
                    'items' => [],
                ];
                $currentIndex = count($groups) - 1;

                continue;
            }

            // Rows are only meaningful inside a date group and must begin with a time.
            if ($currentIndex === null || ! self::isTimeLine($line)) {
                continue;
            }

            $activity = '';
            $location = '';

            if (self::isRouteCellLine($lines[$i + 1] ?? '')) {
                $activity = $lines[++$i];
            }

            while ($i + 1 < $total && $lines[$i + 1] === '') {
                $i++;
            }

            if (self::isRouteCellLine($lines[$i + 1] ?? '')) {
                $location = $lines[++$i];
            }

            $groups[$currentIndex]['items'][] = [
                'time' => StudyTourPayload::compactText($line),
                'activity' => StudyTourPayload::compactText($activity),
                'location' => StudyTourPayload::compactText($location),
            ];
        }

        return StudyTourPayload::normalizeRoutePlans($groups);
    }

    /**
     * Whether the line can be an activity/location cell: non-empty, not a time, not a date label.
     */
    private static function isRouteCellLine(string $line): bool
    {
        return $line !== ''
            && ! self::isTimeLine($line)
            && ! self::isRouteDateLabel($line);
    }

    /**
     * Parse the courses section.
     *
     * Parsing starts after the last line that equals one of the table header cells. Every
     * all-digit line is treated as a row number; the two lines after it are taken as the course
     * name and content. Rows where both are empty are dropped.
     *
     * @param array<int, string> $lines
     * @return array Result of StudyTourPayload::normalizeCourses().
     */
    private static function parseCoursesSection(array $lines): array
    {
        $start = 0;
        foreach ($lines as $idx => $line) {
            if (in_array($line, ['序号', '课程名称', '课程内容'], true)) {
                $start = $idx + 1;
            }
        }

        $courses = [];
        $sort = 1;
        $total = count($lines);

        for ($i = $start; $i < $total; $i++) {
            if (preg_match('/^\d+$/', $lines[$i]) !== 1) {
                continue;
            }

            $name = StudyTourPayload::compactText((string) ($lines[$i + 1] ?? ''));
            $content = StudyTourPayload::compactText((string) ($lines[$i + 2] ?? ''));

            // The name and content lines are consumed whether or not the row is kept.
            $i += 2;

            if ($name === '' && $content === '') {
                continue;
            }

            $courses[] = [
                'sort' => $sort++,
                'name' => $name,
                'content' => $content,
            ];
        }

        return StudyTourPayload::normalizeCourses($courses);
    }

    /**
     * Split the raw venue text into tokens and resolve each against the Venue table.
     *
     * Tokens are separated by +, 、, commas, /, |, semicolons or newlines (ASCII and full-width
     * forms). A matched token becomes a "system" item, each venue being used at most once; an
     * unmatched token becomes a "custom" item and produces a warning.
     *
     * @param array<int, string> $warnings Receives the unmatched-venue warnings (appended in place).
     * @return array{items: array<int, array<string, mixed>>, warnings: array<int, string>}
     *         The returned warnings are the same messages that were appended to $warnings.
     */
    private static function matchVenueItems(string $raw, array &$warnings): array
    {
        $raw = trim($raw);
        if ($raw === '') {
            return ['items' => [], 'warnings' => []];
        }

        $parts = preg_split('#[+＋、,，/|;；\n]+#u', $raw) ?: [];
        $parts = array_values(array_filter(
            array_map(static fn (string $part): string => trim($part), $parts),
            static fn (string $part): bool => $part !== ''
        ));
        if ($parts === []) {
            $parts = [$raw];
        }

        /** @var Collection $venues */
        $venues = Venue::query()->orderBy('sort')->orderBy('id')->get(['id', 'name']);

        $items = [];
        $usedVenueIds = [];
        $unmatched = [];

        foreach ($parts as $part) {
            $part = self::cleanVenueToken($part);
            if ($part === '') {
                continue;
            }

            $match = self::findVenueMatch($part, $venues, $usedVenueIds);
            if ($match !== null) {
                $items[] = ['type' => 'system', 'venue_id' => $match->id];
                $usedVenueIds[] = $match->id;

                continue;
            }

            $items[] = ['type' => 'custom', 'name' => $part];
            $message = "场馆「{$part}」未在系统中匹配,已作为自定义场馆添加";
            $warnings[] = $message;
            $unmatched[] = $message;
        }

        return ['items' => $items, 'warnings' => $unmatched];
    }

    /**
     * Find the venue that best matches a token, skipping venues already used.
     *
     * An exact match on the normalized name wins. Otherwise, among venues whose normalized name
     * contains the token or is contained in it, the one with the longest name is returned.
     *
     * @param Collection $venues
     * @param array<int, mixed> $usedVenueIds
     */
    private static function findVenueMatch(string $token, Collection $venues, array $usedVenueIds): ?Venue
    {
        $tokenNorm = self::normalizeVenueName($token);

        // An empty needle is "contained" in every string, so it would match an arbitrary venue.
        if ($tokenNorm === '') {
            return null;
        }

        $exact = $venues->first(function (Venue $v) use ($tokenNorm, $usedVenueIds) {
            if (in_array($v->id, $usedVenueIds, true)) {
                return false;
            }

            return self::normalizeVenueName((string) $v->name) === $tokenNorm;
        });
        if ($exact !== null) {
            return $exact;
        }

        return $venues
            ->filter(function (Venue $v) use ($tokenNorm, $usedVenueIds) {
                if (in_array($v->id, $usedVenueIds, true)) {
                    return false;
                }

                $nameNorm = self::normalizeVenueName((string) $v->name);

                return $nameNorm !== ''
                    && (str_contains($nameNorm, $tokenNorm) || str_contains($tokenNorm, $nameNorm));
            })
            ->sortByDesc(fn (Venue $v) => mb_strlen((string) $v->name))
            ->first();
    }

    /**
     * Strip a bracketed note (ASCII or full-width brackets) and a trailing 等 from a venue token.
     */
    private static function cleanVenueToken(string $token): string
    {
        $token = StudyTourPayload::compactText($token);

        // Remove the note first so a 等 that precedes it ("X等(备注)") becomes trailing and is stripped too.
        $token = preg_replace('/[(（].*[)）]/u', '', $token) ?? $token;
        $token = StudyTourPayload::compactText($token);
        $token = preg_replace('/等$/u', '', $token) ?? $token;

        return StudyTourPayload::compactText($token);
    }

    /**
     * Lower-case a venue name and remove spaces (ASCII, ideographic, no-break) and middle dots.
     */
    private static function normalizeVenueName(string $name): string
    {
        $name = mb_strtolower(trim($name));

        return str_replace([' ', "\u{3000}", "\u{00A0}", '·', '•'], '', $name);
    }

    /**
     * Whether the line opens an itinerary group (route N, day N, or a part of the day).
     *
     * Time lines are never date labels.
     */
    private static function isRouteDateLabel(string $line): bool
    {
        if (self::isTimeLine($line)) {
            return false;
        }

        return (bool) preg_match('/^(线路[一二三四五六七八九十百零\d]+|第[一二三四五六七八九十百零\d]+天|上午|中午|下午|晚上)/u', $line);
    }

    /**
     * Whether the line starts with a clock time such as 9:00 or 14:30.
     */
    private static function isTimeLine(string $line): bool
    {
        return (bool) preg_match('/^\d{1,2}:\d{2}/', $line);
    }

    /**
     * Join section lines into text: consecutive non-blank lines form a paragraph (joined by a
     * single newline) and paragraphs are separated by one blank line.
     *
     * @param array<int, string> $lines
     */
    private static function joinSectionLines(array $lines): string
    {
        $chunks = [];
        $buffer = [];

        foreach ($lines as $line) {
            if ($line !== '') {
                $buffer[] = $line;

                continue;
            }

            if ($buffer !== []) {
                $chunks[] = trim(implode("\n", $buffer));
                $buffer = [];
            }
        }

        if ($buffer !== []) {
            $chunks[] = trim(implode("\n", $buffer));
        }

        $chunks = array_filter($chunks, static fn (string $chunk): bool => $chunk !== '');

        return trim(implode("\n\n", $chunks));
    }

    /**
     * Convert plain text to HTML: one escaped <p> per blank-line-separated paragraph, with single
     * line breaks inside a paragraph rendered as <br>.
     */
    private static function plainTextToHtml(string $text): string
    {
        $text = StudyTourPayload::compactMultilineText($text);
        if ($text === '') {
            return '';
        }

        $paragraphs = preg_split("/\n{2,}/u", $text) ?: [$text];
        $html = [];

        foreach ($paragraphs as $paragraph) {
            $paragraph = StudyTourPayload::compactMultilineText($paragraph);
            if ($paragraph === '') {
                continue;
            }

            $escaped = htmlspecialchars($paragraph, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            $escaped = nl2br($escaped, false);

            // Whitespace between paragraphs collapses when rendered; an explicit
            // element is what keeps paragraphs apart in the HTML output.
            $html[] = '<p>'.$escaped.'</p>';
        }

        return implode('', $html);
    }
}