You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
szkp-map-service/app/Support/StudyTourDeclarationParser.php

552 lines
18 KiB

3 days ago
<?php
namespace App\Support;
use App\Models\DictItem;
use App\Models\Venue;
use Illuminate\Support\Collection;
class StudyTourDeclarationParser
{
/**
 * Heading text that opens each section of the declaration document, keyed by
 * the internal section name. splitSections() treats any line that starts with
 * one of these headings as the beginning of that section.
 */
private const SECTION_MARKERS = [
'basic' => '一、线路基本情况',
'intro' => '二、线路简介',
'route' => '三、线路规划',
'courses' => '四、研学课程',
'fee' => '五、线路收费标准',
'impl' => '六、线路计划实施情况',
];
/**
 * Extract the text of a declaration document and parse it.
 *
 * Both arguments are forwarded unchanged to DocTextExtractor::extract(); the
 * resulting text is handed to parseText().
 *
 * @return array{parsed: array<string, mixed>, warnings: array<int, string>}
 */
public static function parseFile(string $path, string $extension): array
{
    return self::parseText(DocTextExtractor::extract($path, $extension));
}
/**
 * Parse the extracted text of a declaration document into a study-tour payload.
 *
 * The text is split into lines and grouped by section heading; each section is
 * decoded by its dedicated helper and the combined result is passed through
 * StudyTourPayload::normalizeIncoming(). Anything that needs manual attention
 * is reported as a de-duplicated list of warning messages.
 *
 * @return array{parsed: array<string, mixed>, warnings: array<int, string>}
 */
public static function parseText(string $text): array
{
    $sections = self::splitSections(self::splitLines($text));
    $linesOf = static fn (string $key): array => $sections[$key] ?? [];

    $notices = [];

    $info = self::parseBasicSection($linesOf('basic'));
    $introPlain = self::joinSectionLines($linesOf('intro'));
    $feePlain = self::joinSectionLines($linesOf('fee'));
    $implPlain = self::joinSectionLines($linesOf('impl'));
    $plans = self::parseRouteSection($linesOf('route'));
    $courseRows = self::parseCoursesSection($linesOf('courses'));

    // Venue matching appends its own "unmatched venue" messages to $notices.
    $venueRaw = (string) ($info['venue_raw'] ?? '');
    $venues = self::matchVenueItems($venueRaw, $notices);
    unset($info['venue_raw']);

    if ($info['name'] === '') {
        $notices[] = '未识别到线路名称,请手动填写';
    }
    if ($venues['items'] === []) {
        $notices[] = '未识别到线路点位/场馆,请手动添加';
    }

    $payload = [
        'name' => $info['name'],
        'org_name' => $info['org_name'],
        'seasons' => $info['seasons'],
        'suitable_count' => $info['suitable_count'],
        'grade_levels' => $info['grade_levels'],
        'duration' => $info['duration'],
        'contact_person' => $info['contact_person'],
        'contact_phones' => $info['contact_phones'],
        'venue_items' => $venues['items'],
        'intro_html' => self::plainTextToHtml($introPlain),
        'route_plans' => $plans,
        'courses' => $courseRows,
        'fee_html' => self::plainTextToHtml($feePlain),
        'implementation_html' => self::plainTextToHtml($implPlain),
        'tags' => [],
        'cover_image' => '',
        'sort' => 0,
        'is_on_shelf' => true,
    ];

    return [
        'parsed' => StudyTourPayload::normalizeIncoming($payload),
        'warnings' => array_values(array_unique($notices)),
    ];
}
/**
 * Split raw document text into trimmed lines.
 *
 * CRLF, lone CR and form-feed characters are all treated as line breaks.
 * The split is done byte-wise with explode(): the "\n" byte never occurs
 * inside a multi-byte UTF-8 sequence, and unlike a /u regex split it cannot
 * fail (and yield no lines at all) when the extracted text contains malformed
 * UTF-8.
 *
 * @return array<int, string>
 */
private static function splitLines(string $text): array
{
    $normalized = str_replace(["\r\n", "\r", "\f"], "\n", $text);

    return array_map('trim', explode("\n", $normalized));
}
/**
 * Group lines under the section heading that precedes them.
 *
 * A line that begins with one of the SECTION_MARKERS headings switches the
 * current section and is itself discarded. Every other line is appended to the
 * current section; lines that appear before the first heading are dropped.
 * Every section key is present in the result, even when it has no lines.
 *
 * @param array<int, string> $lines
 * @return array<string, array<int, string>>
 */
private static function splitSections(array $lines): array
{
    $grouped = array_map(static fn (string $heading): array => [], self::SECTION_MARKERS);
    $active = null;

    foreach ($lines as $text) {
        $hit = null;
        foreach (self::SECTION_MARKERS as $name => $heading) {
            if (str_starts_with($text, $heading)) {
                $hit = $name;
                break;
            }
        }

        if ($hit !== null) {
            $active = $hit;
        } elseif ($active !== null) {
            $grouped[$active][] = $text;
        }
    }

    return $grouped;
}
/**
 * Parse the "basic information" section into its individual fields.
 *
 * The section is read as a label line followed by one or more value lines.
 * Plain labels must match a line exactly. The season row ("对应季节") and the
 * grade row ("适配学段") are recognised by prefix; their checkbox-style options
 * are decoded by parseSeasons() / parseGrades().
 *
 * A run of value lines ends at a blank line, at any line recognised by
 * isBasicLabelLine(), or at the start of the season / grade row. The same
 * boundary rule is used for all rows, so a season row that is directly followed
 * by the grade row (or the reverse) does not swallow it.
 *
 * @param array<int, string> $lines
 * @return array<string, mixed>
 */
private static function parseBasicSection(array $lines): array
{
    $fields = [
        'org_name' => '',
        'name' => '',
        'seasons' => [],
        'venue_raw' => '',
        'suitable_count' => '',
        'grade_levels' => [],
        'duration' => '',
        'contact_person' => '',
        'contact_phones' => '',
    ];
    $labels = [
        '组织单位名称' => 'org_name',
        '线路名称' => 'name',
        '线路点位' => 'venue_raw',
        '适宜人数' => 'suitable_count',
        '研学时长' => 'duration',
        '线路联络人' => 'contact_person',
        '咨询电话' => 'contact_phones',
    ];

    // True while the line after $index still belongs to the current value run.
    $continues = static function (int $index) use ($lines): bool {
        $next = $lines[$index + 1] ?? '';

        return $next !== ''
            && ! self::isBasicLabelLine($next)
            && ! str_starts_with($next, '对应季节')
            && ! str_starts_with($next, '适配学段');
    };

    $seasonBuffer = [];
    $gradeBuffer = [];
    $total = count($lines);

    for ($i = 0; $i < $total; $i++) {
        $line = $lines[$i];
        if ($line === '') {
            continue;
        }

        if (str_starts_with($line, '对应季节')) {
            $seasonBuffer[] = $line;
            // The "(可多选)" hint counts as a label line, so step over it
            // explicitly before collecting the option lines.
            if (($lines[$i + 1] ?? '') === '(可多选)') {
                $i++;
            }
            while ($continues($i)) {
                $seasonBuffer[] = $lines[++$i];
            }
            $fields['seasons'] = self::parseSeasons(implode(' ', $seasonBuffer));
            continue;
        }

        if (str_starts_with($line, '适配学段')) {
            $gradeBuffer[] = $line;
            if (($lines[$i + 1] ?? '') === '(可多选)') {
                $i++;
            }
            while ($continues($i)) {
                $gradeBuffer[] = $lines[++$i];
            }
            $fields['grade_levels'] = self::parseGrades(implode(' ', $gradeBuffer));
            continue;
        }

        $key = $labels[$line] ?? null;
        if ($key === null) {
            // Not a label: stray text outside any value run is ignored.
            continue;
        }

        $valueLines = [];
        while ($continues($i)) {
            $valueLines[] = $lines[++$i];
        }
        $fields[$key] = trim(implode("\n", $valueLines));
    }

    $fields['suitable_count'] = self::normalizeBlankPlaceholder($fields['suitable_count']);
    $fields['duration'] = self::normalizeDuration($fields['duration']);
    $fields['contact_phones'] = StudyTourPayload::normalizeContactPhones($fields['contact_phones']);
    foreach (['org_name', 'name', 'contact_person', 'venue_raw'] as $key) {
        $fields[$key] = StudyTourPayload::compactText((string) $fields[$key]);
    }

    return $fields;
}
/**
 * Tell whether a line is exactly one of the field labels of the
 * basic-information section, or the "(可多选)" hint that accompanies some rows.
 */
private static function isBasicLabelLine(string $line): bool
{
    $knownLines = [
        '(可多选)',
        '组织单位名称',
        '线路名称',
        '线路点位',
        '适宜人数',
        '研学时长',
        '线路联络人',
        '咨询电话',
    ];

    return in_array($line, $knownLines, true);
}
/**
 * Decode the ticked season options from the raw text of the season row.
 *
 * An option counts as ticked when its label is preceded (optionally after
 * whitespace) by one of the marks ☑ ✅ ✔ ■. The resulting codes are then
 * restricted by filterDictValues() to the 'study_tour_season' dictionary.
 *
 * @return array<int, string>
 */
private static function parseSeasons(string $raw): array
{
    $codeByLabel = [
        '春季' => 'spring',
        '夏季' => 'summer',
        '秋季' => 'autumn',
        '冬季' => 'winter',
    ];

    $ticked = array_filter(
        $codeByLabel,
        static fn (string $label): bool => preg_match('/(?:[☑✅✔]|■)\s*'.preg_quote($label, '/').'/u', $raw) === 1,
        ARRAY_FILTER_USE_KEY
    );

    return self::filterDictValues('study_tour_season', array_values($ticked));
}
/**
 * Decode the ticked grade-level options from the raw text of the grade row.
 *
 * An option counts as ticked when its label is preceded (optionally after
 * whitespace) by one of the marks ☑ ✅ ✔ ■. The resulting codes are then
 * restricted by filterDictValues() to the 'study_tour_grade_level' dictionary.
 *
 * @return array<int, string>
 */
private static function parseGrades(string $raw): array
{
    $codeByLabel = [
        '幼儿园' => 'kindergarten',
        '小学' => 'primary',
        '初中' => 'junior',
        '高中' => 'high',
        '全学段' => 'all',
    ];

    $codes = [];
    foreach (array_keys($codeByLabel) as $label) {
        $pattern = '/(?:[☑✅✔]|■)\s*'.preg_quote($label, '/').'/u';
        if (preg_match($pattern, $raw) === 1) {
            $codes[] = $codeByLabel[$label];
        }
    }

    return self::filterDictValues('study_tour_grade_level', $codes);
}
/**
 * Keep only the values that exist as active items of the given dictionary.
 *
 * The allowed set is the `item_value` column of DictItem rows whose
 * `dict_type` equals $dictType and whose `is_active` flag is true. The order
 * of $values is preserved and the result is re-indexed.
 *
 * @param array<int, string> $values
 * @return array<int, string>
 */
private static function filterDictValues(string $dictType, array $values): array
{
    // With nothing to check the intersection is empty whatever the dictionary
    // holds, so avoid the database round-trip.
    if ($values === []) {
        return [];
    }

    $allowed = DictItem::query()
        ->where('dict_type', $dictType)
        ->where('is_active', true)
        ->pluck('item_value')
        ->all();

    return array_values(array_intersect($values, $allowed));
}
/**
 * Remove fill-in-the-blank underscores from a value.
 *
 * The text is passed through StudyTourPayload::compactText() before and after
 * every run of "_" is deleted.
 */
private static function normalizeBlankPlaceholder(string $raw): string
{
    $compact = StudyTourPayload::compactText($raw);
    $withoutBlanks = preg_replace('/_+/u', '', $compact);

    // preg_replace() yields null on failure; fall back to the compacted input.
    return StudyTourPayload::compactText($withoutBlanks ?? $compact);
}
/**
 * Turn the (possibly multi-line) duration value into a single line.
 *
 * Runs of "_" are deleted, line breaks become spaces, and the result is passed
 * through StudyTourPayload::compactText(). A value that is empty after
 * StudyTourPayload::compactMultilineText() yields an empty string.
 */
private static function normalizeDuration(string $raw): string
{
    $multiline = StudyTourPayload::compactMultilineText($raw);
    if ($multiline === '') {
        return '';
    }

    $withoutBlanks = preg_replace('/_+/u', '', $multiline) ?? $multiline;
    $singleLine = str_replace("\n", ' ', $withoutBlanks);

    return StudyTourPayload::compactText($singleLine);
}
/**
 * Parse the route-planning section into date groups with timed items.
 *
 * Parsing starts after the last line that is exactly one of the table header
 * cells (日期 / 时间 / 行程安排 / 地点). A line accepted by isRouteDateLabel() opens a
 * new group. Within a group, a line accepted by isTimeLine() starts an item:
 * the next line (if it is non-empty and neither a time nor a date label) is the
 * activity, and after skipping blank lines the following such line is the
 * location. Anything else is ignored. The groups are finally passed through
 * StudyTourPayload::normalizeRoutePlans().
 *
 * @param array<int, string> $lines
 * @return array<int, array{date_label: string, items: array<int, array{time: string, activity: string, location: string}>}>
 */
private static function parseRouteSection(array $lines): array
{
    $headerCells = ['日期', '时间', '行程安排', '地点'];
    $firstRow = 0;
    foreach ($lines as $position => $text) {
        if (in_array($text, $headerCells, true)) {
            $firstRow = $position + 1;
        }
    }

    $total = count($lines);

    // True when the line at $index exists and can serve as an activity/location cell.
    $isCell = static function (int $index) use ($lines, $total): bool {
        if ($index >= $total) {
            return false;
        }
        $candidate = $lines[$index] ?? '';

        return $candidate !== ''
            && ! self::isTimeLine($candidate)
            && ! self::isRouteDateLabel($candidate);
    };

    $plans = [];
    $cursor = $firstRow - 1;
    while (++$cursor < $total) {
        $text = $lines[$cursor];
        if ($text === '') {
            continue;
        }

        if (self::isRouteDateLabel($text)) {
            $plans[] = [
                'date_label' => StudyTourPayload::compactText($text),
                'items' => [],
            ];
            continue;
        }

        // Items are only meaningful once a date group has been opened.
        if ($plans === [] || ! self::isTimeLine($text)) {
            continue;
        }

        $activity = $isCell($cursor + 1) ? $lines[++$cursor] : '';

        // Blank lines between the activity and the location are skipped.
        while ($cursor + 1 < $total && ($lines[$cursor + 1] ?? '') === '') {
            $cursor++;
        }

        $location = $isCell($cursor + 1) ? $lines[++$cursor] : '';

        $plans[array_key_last($plans)]['items'][] = [
            'time' => StudyTourPayload::compactText($text),
            'activity' => StudyTourPayload::compactText($activity),
            'location' => StudyTourPayload::compactText($location),
        ];
    }

    return StudyTourPayload::normalizeRoutePlans($plans);
}
/**
 * Parse the courses section into a list of courses.
 *
 * Parsing starts after the last line that is exactly one of the table header
 * cells (序号 / 课程名称 / 课程内容). Every line made up solely of digits is taken as
 * a row number; the two lines after it are read as the course name and content
 * and are consumed whether or not the row is kept. Rows whose name and content
 * are both empty are dropped. Kept rows are numbered from 1 in order and
 * passed through StudyTourPayload::normalizeCourses().
 *
 * @param array<int, string> $lines
 * @return array<int, array{sort: int, name: string, content: string}>
 */
private static function parseCoursesSection(array $lines): array
{
    $headerCells = ['序号', '课程名称', '课程内容'];
    $firstRow = 0;
    foreach ($lines as $position => $text) {
        if (in_array($text, $headerCells, true)) {
            $firstRow = $position + 1;
        }
    }

    $rows = [];
    $nextSort = 1;
    $total = count($lines);

    for ($at = $firstRow; $at < $total; $at++) {
        $ordinal = $lines[$at];
        if ($ordinal === '' || preg_match('/^\d+$/', $ordinal) !== 1) {
            continue;
        }

        $title = StudyTourPayload::compactText((string) ($lines[$at + 1] ?? ''));
        $body = StudyTourPayload::compactText((string) ($lines[$at + 2] ?? ''));

        // The name and content cells are consumed even when the row is empty.
        $at += 2;

        if ($title === '' && $body === '') {
            continue;
        }

        $rows[] = [
            'sort' => $nextSort++,
            'name' => $title,
            'content' => $body,
        ];
    }

    return StudyTourPayload::normalizeCourses($rows);
}
/**
 * Turn the raw venue text into venue items, matching system venues by name.
 *
 * The text is split on list separators: "+", "、", ",", "/", "|", ";" (each in
 * its ASCII and full-width form where one exists) and line breaks. Each part
 * is cleaned with cleanVenueToken() and looked up with findVenueMatch(); a
 * system venue is used at most once. Parts without a match become custom
 * items and a message is appended to $warnings for each of them.
 *
 * @param array<int, string> $warnings Warning list, extended in place.
 * @return array{items: array<int, array<string, mixed>>, warnings: array<int, string>}
 *         The 'warnings' key is always an empty array; messages are reported
 *         through the $warnings reference instead.
 */
private static function matchVenueItems(string $raw, array &$warnings): array
{
    $raw = trim($raw);
    if ($raw === '') {
        return ['items' => [], 'warnings' => []];
    }

    // Chinese documents routinely use full-width punctuation, so both widths
    // are accepted as separators.
    $parts = preg_split('#[++、,,/||;;\n]+#u', $raw) ?: [];
    $parts = array_values(array_filter(array_map('trim', $parts), fn ($p) => $p !== ''));
    if ($parts === []) {
        $parts = [$raw];
    }

    /** @var Collection<int, Venue> $venues */
    $venues = Venue::query()->orderBy('sort')->orderBy('id')->get(['id', 'name']);

    $items = [];
    $usedVenueIds = [];
    foreach ($parts as $part) {
        $part = self::cleanVenueToken($part);
        if ($part === '') {
            continue;
        }

        $match = self::findVenueMatch($part, $venues, $usedVenueIds);
        if ($match !== null) {
            $items[] = ['type' => 'system', 'venue_id' => $match->id];
            $usedVenueIds[] = $match->id;
            continue;
        }

        $items[] = ['type' => 'custom', 'name' => $part];
        $warnings[] = "场馆「{$part}」未在系统中匹配,已作为自定义场馆添加";
    }

    return ['items' => $items, 'warnings' => []];
}
/**
 * Find the system venue that best matches a venue name taken from the document.
 *
 * Names are compared after normalizeVenueName(). An exact match wins; otherwise
 * the venue with the longest name among those whose normalised name contains
 * the token, or is contained in it, is returned. Venues whose id is listed in
 * $usedVenueIds are never returned.
 *
 * @param Collection<int, Venue> $venues
 * @param array<int, int> $usedVenueIds
 * @return Venue|null The matched venue, or null when nothing matches.
 */
private static function findVenueMatch(string $token, Collection $venues, array $usedVenueIds): ?Venue
{
    $tokenNorm = self::normalizeVenueName($token);

    // A token made only of characters that normalisation strips (spaces, "·",
    // "•") would otherwise be "contained" in every venue name and match an
    // arbitrary venue.
    if ($tokenNorm === '') {
        return null;
    }

    $exact = $venues->first(function (Venue $v) use ($tokenNorm, $usedVenueIds) {
        if (in_array($v->id, $usedVenueIds, true)) {
            return false;
        }

        return self::normalizeVenueName((string) $v->name) === $tokenNorm;
    });
    if ($exact !== null) {
        return $exact;
    }

    return $venues->filter(function (Venue $v) use ($tokenNorm, $usedVenueIds) {
        if (in_array($v->id, $usedVenueIds, true)) {
            return false;
        }
        $nameNorm = self::normalizeVenueName((string) $v->name);

        return $nameNorm !== '' && (str_contains($nameNorm, $tokenNorm) || str_contains($tokenNorm, $nameNorm));
    })->sortByDesc(fn (Venue $v) => mb_strlen((string) $v->name))->first();
}
/**
 * Clean one venue name taken from the venue list.
 *
 * Parenthetical remarks are removed (ASCII or full-width brackets), then a
 * trailing "等" ("etc.") is stripped. The text is passed through
 * StudyTourPayload::compactText() before and after.
 *
 * Brackets are removed pair by pair, innermost first, so text between two
 * separate remarks is kept. The trailing "等" is stripped after the brackets
 * so that it is also removed when a remark follows it, as in "A等(共3处)".
 */
private static function cleanVenueToken(string $token): string
{
    $token = StudyTourPayload::compactText($token);

    // Each pass removes bracket pairs that contain no further brackets; repeat
    // until nothing is removed so nested remarks disappear as well.
    do {
        $stripped = preg_replace('/[((][^(())]*[))]/u', '', $token, -1, $removed);
        $token = $stripped ?? $token;
    } while ($stripped !== null && $removed > 0);

    $token = preg_replace('/等\s*$/u', '', $token) ?? $token;

    return StudyTourPayload::compactText($token);
}
/**
 * Reduce a venue name to a form suitable for comparison.
 *
 * The name is trimmed and lower-cased, then spaces (ASCII, ideographic U+3000
 * and no-break U+00A0) and the separator dots "·" and "•" are removed, so that
 * names differing only in spacing or dot separators compare equal.
 */
private static function normalizeVenueName(string $name): string
{
    $name = mb_strtolower(trim($name));

    return str_replace([' ', "\u{3000}", "\u{00A0}", '·', '•'], '', $name);
}
/**
 * Tell whether a line opens a new group in the route table.
 *
 * A line qualifies when it is not a time line and starts with "线路" or "第…天"
 * followed by Chinese or Arabic numerals, or with one of 上午 / 中午 / 下午 / 晚上.
 */
private static function isRouteDateLabel(string $line): bool
{
    return ! self::isTimeLine($line)
        && preg_match('/^(线路[一二三四五六七八九十百零\d]+|第[一二三四五六七八九十百零\d]+天|上午|中午|下午|晚上)/u', $line) === 1;
}
/**
 * Tell whether a line starts with a clock time such as "9:00" or "09:30-10:00".
 *
 * Both the ASCII colon and the full-width colon (:) are accepted as the
 * hour/minute separator. The pattern is matched byte-wise (no /u modifier), so
 * it cannot fail on malformed UTF-8.
 */
private static function isTimeLine(string $line): bool
{
    return preg_match('/^\d{1,2}(?::|:)\d{2}/', $line) === 1;
}
/**
 * Join the lines of a free-text section into paragraphs.
 *
 * Consecutive non-empty lines form one paragraph (joined with "\n"); empty
 * lines separate paragraphs. Paragraphs that are empty after trimming are
 * dropped and the rest are joined with a blank line ("\n\n").
 *
 * @param array<int, string> $lines
 */
private static function joinSectionLines(array $lines): string
{
    $paragraphs = [];
    $pending = [];

    // Close the paragraph collected so far, if any.
    $flush = static function () use (&$paragraphs, &$pending): void {
        if ($pending !== []) {
            $paragraphs[] = trim(implode("\n", $pending));
            $pending = [];
        }
    };

    foreach ($lines as $text) {
        if ($text === '') {
            $flush();
        } else {
            $pending[] = $text;
        }
    }
    $flush();

    $nonEmpty = array_filter($paragraphs, static fn ($paragraph) => $paragraph !== '');

    return trim(implode("\n\n", $nonEmpty));
}
/**
 * Convert plain text into HTML paragraphs.
 *
 * The text is passed through StudyTourPayload::compactMultilineText() and
 * split on runs of two or more line breaks. Each non-empty paragraph is
 * HTML-escaped, its remaining line breaks are turned into <br> by nl2br(),
 * and it is wrapped in <p>…</p>. Empty input yields an empty string.
 */
private static function plainTextToHtml(string $text): string
{
    $clean = StudyTourPayload::compactMultilineText($text);
    if ($clean === '') {
        return '';
    }

    $blocks = preg_split("/\n{2,}/u", $clean) ?: [$clean];

    $markup = '';
    foreach ($blocks as $block) {
        $block = StudyTourPayload::compactMultilineText($block);
        if ($block === '') {
            continue;
        }

        $safe = htmlspecialchars($block, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $markup .= '<p>'.nl2br($safe, false).'</p>';
    }

    return $markup;
}
}