<?php
namespace App\Support;
use App\Models\DictItem;
use App\Models\Venue;
use Illuminate\Support\Collection;
class StudyTourDeclarationParser
{
/**
 * Section headings of the declaration form, keyed by the section identifier
 * under which splitSections() collects the lines that follow each heading.
 */
private const SECTION_MARKERS = [
'basic' => '一、线路基本情况',
'intro' => '二、线路简介',
'route' => '三、线路规划',
'courses' => '四、研学课程',
'fee' => '五、线路收费标准',
'impl' => '六、线路计划实施情况',
];
/**
 * Parse a declaration file into a normalized payload plus warnings.
 *
 * `.doc` files go through the multi-candidate path; every other extension is
 * converted to plain text by DocTextExtractor and parsed directly.
 *
 * @param string $path      Location of the declaration file.
 * @param string $extension File extension, any case, with or without a leading dot.
 *
 * @return array{parsed: array<string, mixed>, warnings: array<int, string>}
 */
public static function parseFile(string $path, string $extension): array
{
    $normalizedExtension = strtolower(ltrim($extension, '.'));

    return $normalizedExtension === 'doc'
        ? self::parseDocFile($path)
        : self::parseText(DocTextExtractor::extract($path, $normalizedExtension));
}
/**
 * Parse a legacy `.doc` declaration by trying every text candidate the
 * extractor offers and keeping the one whose parse scores highest.
 *
 * @param string $path Location of the `.doc` file.
 *
 * @return array{parsed: array<string, mixed>, warnings: array<int, string>}
 *
 * @throws \RuntimeException When the extractor yields no candidate text at all.
 */
private static function parseDocFile(string $path): array
{
    $candidates = DocTextExtractor::extractDocCandidates($path);
    if ($candidates === []) {
        throw new \RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试');
    }

    $bestResult = null;
    $bestScore = PHP_INT_MIN;
    foreach ($candidates as $text) {
        $result = self::parseText($text);
        $score = self::scoreParsedResult($result);

        // Scores can be negative (garbled-text penalties), so the first
        // candidate must always be accepted; afterwards only a strictly
        // better score replaces the current best.
        if ($bestResult === null || $score > $bestScore) {
            $bestScore = $score;
            $bestResult = $result;
        }
    }

    // $candidates is non-empty, so the loop ran at least once.
    return $bestResult;
}
/**
 * Rate how plausible a parse result looks; higher is better.
 *
 * Populated fields, venue items, clean route items, well-formed courses and
 * readable rich-text sections add points; garbled route cells, suspicious
 * course names and garbled or over-long rich text subtract points.
 *
 * @param array{parsed: array<string, mixed>, warnings: array<int, string>} $result
 *
 * @return int The accumulated score, truncated to an integer.
 */
private static function scoreParsedResult(array $result): int
{
    $parsed = $result['parsed'] ?? [];
    $score = 0;

    // Flat bonus for every populated scalar field.
    $scalarBonuses = ['name' => 50, 'org_name' => 20, 'suitable_count' => 5, 'duration' => 5];
    foreach ($scalarBonuses as $field => $bonus) {
        if (trim((string) ($parsed[$field] ?? '')) !== '') {
            $score += $bonus;
        }
    }

    // Per-entry bonus for list-valued fields.
    $listWeights = ['seasons' => 3, 'grade_levels' => 3, 'venue_items' => 8];
    foreach ($listWeights as $field => $weight) {
        $score += count($parsed[$field] ?? []) * $weight;
    }

    // Route plans: garbled rows are penalised, clean rows rewarded per group.
    foreach ($parsed['route_plans'] ?? [] as $group) {
        $usableRows = 0;
        foreach ($group['items'] ?? [] as $item) {
            $time = trim((string) ($item['time'] ?? ''));
            $activity = trim((string) ($item['activity'] ?? ''));
            $location = trim((string) ($item['location'] ?? ''));

            if ($activity === '' && $time === '') {
                continue;
            }

            $hasGarbage = self::isGarbledLine($activity)
                || self::isGarbledLine($time)
                || self::isGarbledLine($location);
            if ($hasGarbage) {
                $score -= 30;
            } else {
                $usableRows++;
            }
        }

        if ($usableRows > 0) {
            $score += 10 + $usableRows * 6;
        }
    }

    // Courses: a base bonus per entry, then a quality adjustment per entry.
    $courses = $parsed['courses'] ?? [];
    $score += count($courses) * 5;
    foreach ($courses as $course) {
        $name = trim((string) ($course['name'] ?? ''));
        $content = trim((string) ($course['content'] ?? ''));

        $score += match (true) {
            $name === '' || $name === $content => -20,
            preg_match('/^(课程\d+|运用|文化馆|走进|寻找|领略|窑烤|泡泡剧场|萤火虫知识)/u', $name) === 1 => -20,
            mb_strlen($name) > 24 => -15,
            default => 18,
        };
    }

    // Rich-text sections, judged on their tag-stripped text.
    foreach (['intro_html', 'fee_html', 'implementation_html'] as $key) {
        $plain = trim(strip_tags((string) ($parsed[$key] ?? '')));
        if ($plain === '') {
            continue;
        }
        if (self::containsGarbledText($plain)) {
            $score -= 120;
            continue;
        }
        if ($key === 'implementation_html' && mb_strlen($plain) > 150) {
            $score -= 80;
        }
        $score += 12 + min(mb_strlen($plain), 120) / 20;
    }

    return (int) $score;
}
/**
 * Parse the plain text of a declaration form.
 *
 * The text is split into cleaned lines, grouped by section heading, and each
 * section is handed to its dedicated parser. The assembled payload is passed
 * through StudyTourPayload::normalizeIncoming() before being returned.
 *
 * @param string $text Plain text extracted from the declaration document.
 *
 * @return array{parsed: array<string, mixed>, warnings: array<int, string>}
 */
public static function parseText(string $text): array
{
    $sections = self::splitSections(self::splitLines($text));
    $section = static fn (string $key): array => $sections[$key] ?? [];

    $basic = self::parseBasicSection($section('basic'));
    $introText = self::joinSectionLines($section('intro'));
    $feeText = self::joinSectionLines($section('fee'));
    $implText = self::joinSectionLines(self::filterImplementationLines($section('impl')));
    $routePlans = self::parseRouteSection($section('route'));
    $courses = self::parseCoursesSection($section('courses'), $routePlans);

    // Venue matching appends its own warnings first; the generic ones follow.
    $warnings = [];
    $venueItems = self::matchVenueItems((string) ($basic['venue_raw'] ?? ''), $warnings)['items'];
    if ($basic['name'] === '') {
        $warnings[] = '未识别到线路名称,请手动填写';
    }
    if ($venueItems === []) {
        $warnings[] = '未识别到线路点位/场馆,请手动添加';
    }

    $payload = [
        'name' => $basic['name'],
        'org_name' => $basic['org_name'],
        'seasons' => $basic['seasons'],
        'suitable_count' => $basic['suitable_count'],
        'grade_levels' => $basic['grade_levels'],
        'duration' => $basic['duration'],
        'contact_person' => $basic['contact_person'],
        'contact_phones' => $basic['contact_phones'],
        'venue_items' => $venueItems,
        'intro_html' => self::plainTextToHtml($introText),
        'route_plans' => $routePlans,
        'courses' => $courses,
        'fee_html' => self::plainTextToHtml($feeText),
        'implementation_html' => self::plainTextToHtml($implText),
        'tags' => [],
        'cover_image' => '',
        'sort' => 0,
        'is_on_shelf' => true,
    ];

    return [
        'parsed' => StudyTourPayload::normalizeIncoming($payload),
        'warnings' => array_values(array_unique($warnings)),
    ];
}
/**
 * Break raw document text into trimmed, non-empty, non-garbled lines.
 *
 * CR, CRLF and form-feed characters are treated as line breaks. Splitting is
 * done byte-wise with explode(): a UTF-8-mode preg_split() fails outright on
 * a single malformed byte sequence, which would discard every line of an
 * otherwise readable document.
 *
 * @param string $text Raw extracted text.
 *
 * @return array<int, string>
 */
private static function splitLines(string $text): array
{
    $normalized = str_replace(["\r\n", "\r", "\f"], "\n", $text);

    $lines = [];
    foreach (explode("\n", $normalized) as $part) {
        $line = trim($part);
        if ($line === '' || self::isGarbledLine($line)) {
            continue;
        }
        $lines[] = $line;
    }

    return $lines;
}
/**
 * Group lines under the section heading that precedes them.
 *
 * A line that starts with one of the SECTION_MARKERS headings switches the
 * active section and is itself dropped. Lines seen before the first heading
 * are discarded. Every known section key is present in the result.
 *
 * @param array<int, string> $lines
 *
 * @return array<string, array<int, string>>
 */
private static function splitSections(array $lines): array
{
    $sections = array_fill_keys(array_keys(self::SECTION_MARKERS), []);
    $activeKey = null;

    foreach ($lines as $line) {
        foreach (self::SECTION_MARKERS as $key => $marker) {
            // An exact match is also a prefix match, so one test covers both.
            if (str_starts_with($line, $marker)) {
                $activeKey = $key;
                continue 2;
            }
        }

        if ($activeKey === null) {
            continue;
        }
        $sections[$activeKey][] = $line;
    }

    return $sections;
}
/**
 * Parse the "basic information" section of the declaration form.
 *
 * The section is read as a sequence of label lines, each followed by one or
 * more value lines. The two checkbox fields (season, grade level) are matched
 * by prefix because the heading line may already carry checkbox text; all
 * other labels must match exactly.
 *
 * @param array<int, string> $lines Lines of the basic section.
 *
 * @return array<string, mixed> Keys: org_name, name, seasons, venue_raw,
 *                              suitable_count, grade_levels, duration,
 *                              contact_person, contact_phones.
 */
private static function parseBasicSection(array $lines): array
{
    $fields = [
        'org_name' => '',
        'name' => '',
        'seasons' => [],
        'venue_raw' => '',
        'suitable_count' => '',
        'grade_levels' => [],
        'duration' => '',
        'contact_person' => '',
        'contact_phones' => '',
    ];
    $labels = [
        '组织单位名称' => 'org_name',
        '线路名称' => 'name',
        '线路点位' => 'venue_raw',
        '适宜人数' => 'suitable_count',
        '研学时长' => 'duration',
        '线路联络人' => 'contact_person',
        '咨询电话' => 'contact_phones',
    ];
    $seasonBuffer = [];
    $gradeBuffer = [];
    $total = count($lines);

    for ($i = 0; $i < $total; $i++) {
        $line = $lines[$i];
        if ($line === '') {
            continue;
        }

        if (str_starts_with($line, '对应季节')) {
            $seasonBuffer[] = $line;
            if (($lines[$i + 1] ?? '') === '(可多选)') {
                $i++;
            }
            // Stop at ANY field boundary, including the grade-level heading,
            // so the season field cannot swallow the grade checkboxes.
            while (self::isBasicValueLine($lines[$i + 1] ?? '')) {
                $seasonBuffer[] = $lines[++$i];
            }
            $fields['seasons'] = self::parseSeasons(implode(' ', $seasonBuffer));
            continue;
        }

        if (str_starts_with($line, '适配学段')) {
            $gradeBuffer[] = $line;
            if (($lines[$i + 1] ?? '') === '(可多选)') {
                $i++;
            }
            // Same boundary rule as above, mirrored for the season heading.
            while (self::isBasicValueLine($lines[$i + 1] ?? '')) {
                $gradeBuffer[] = $lines[++$i];
            }
            $fields['grade_levels'] = self::parseGrades(implode(' ', $gradeBuffer));
            continue;
        }

        if (! isset($labels[$line])) {
            continue;
        }

        $valueLines = [];
        while (self::isBasicValueLine($lines[$i + 1] ?? '')) {
            $valueLines[] = $lines[++$i];
        }
        $fields[$labels[$line]] = trim(implode("\n", $valueLines));
    }

    $fields['suitable_count'] = self::normalizeBlankPlaceholder($fields['suitable_count']);
    $fields['duration'] = self::normalizeDuration($fields['duration']);
    $fields['contact_phones'] = StudyTourPayload::normalizeContactPhones($fields['contact_phones']);
    foreach (['org_name', 'name', 'contact_person', 'venue_raw'] as $key) {
        $fields[$key] = StudyTourPayload::compactText((string) $fields[$key]);
    }

    return $fields;
}

/**
 * Whether a line continues the value of the current basic-section field,
 * i.e. it is non-empty and does not open another field.
 */
private static function isBasicValueLine(string $line): bool
{
    return $line !== ''
        && ! self::isBasicLabelLine($line)
        && ! str_starts_with($line, '对应季节')
        && ! str_starts_with($line, '适配学段');
}
/**
 * Whether a line is exactly one of the basic-section field labels or the
 * "(multiple choice)" hint that can follow a checkbox heading.
 */
private static function isBasicLabelLine(string $line): bool
{
    $knownLabels = [
        '(可多选)',
        '组织单位名称',
        '线路名称',
        '线路点位',
        '适宜人数',
        '研学时长',
        '线路联络人',
        '咨询电话',
    ];

    return in_array($line, $knownLabels, true);
}
/**
 * Extract the ticked seasons from the raw checkbox text.
 *
 * A season counts as selected when its label is directly preceded (optional
 * whitespace allowed) by a checked-box glyph. The result is restricted to
 * values that filterDictValues() accepts for `study_tour_season`.
 *
 * @param string $raw Concatenated text of the season field.
 *
 * @return array<int, string>
 */
private static function parseSeasons(string $raw): array
{
    $valueByLabel = [
        '春季' => 'spring',
        '夏季' => 'summer',
        '秋季' => 'autumn',
        '冬季' => 'winter',
    ];

    $ticked = [];
    foreach ($valueByLabel as $label => $value) {
        $pattern = '/(?:[☑✅✔]|■)\s*'.preg_quote($label, '/').'/u';
        if (preg_match($pattern, $raw) === 1) {
            $ticked[] = $value;
        }
    }

    return self::filterDictValues('study_tour_season', $ticked);
}
/**
 * Extract the ticked grade levels from the raw checkbox text.
 *
 * A grade level counts as selected when its label is directly preceded
 * (optional whitespace allowed) by a checked-box glyph. The result is
 * restricted to values that filterDictValues() accepts for
 * `study_tour_grade_level`.
 *
 * @param string $raw Concatenated text of the grade-level field.
 *
 * @return array<int, string>
 */
private static function parseGrades(string $raw): array
{
    $valueByLabel = [
        '幼儿园' => 'kindergarten',
        '小学' => 'primary',
        '初中' => 'junior',
        '高中' => 'high',
        '全学段' => 'all',
    ];

    $ticked = [];
    foreach ($valueByLabel as $label => $value) {
        $pattern = '/(?:[☑✅✔]|■)\s*'.preg_quote($label, '/').'/u';
        if (preg_match($pattern, $raw) === 1) {
            $ticked[] = $value;
        }
    }

    return self::filterDictValues('study_tour_grade_level', $ticked);
}
/**
 * Keep only the values that exist as active dictionary items of a type.
 *
 * @param string             $dictType Value matched against `dict_type`.
 * @param array<int, string> $values   Candidate item values.
 *
 * @return array<int, string> The accepted values, in their original order, re-indexed.
 */
private static function filterDictValues(string $dictType, array $values): array
{
    // Nothing to validate: the intersection would be empty anyway, so skip
    // the database round-trip.
    if ($values === []) {
        return [];
    }

    $allowed = DictItem::query()
        ->where('dict_type', $dictType)
        ->where('is_active', true)
        ->pluck('item_value')
        ->all();

    return array_values(array_intersect($values, $allowed));
}
/**
 * Remove runs of underscores (fill-in-the-blank placeholders) from a value,
 * compacting the text before and after the removal.
 */
private static function normalizeBlankPlaceholder(string $raw): string
{
    $compacted = StudyTourPayload::compactText($raw);
    $withoutBlanks = preg_replace('/_+/u', '', $compacted);

    // preg_replace() yields null on failure; fall back to the compacted input.
    return StudyTourPayload::compactText($withoutBlanks ?? $compacted);
}
/**
 * Normalize the duration field: compact the multi-line text, remove runs of
 * underscores, and flatten the remaining line breaks into spaces.
 */
private static function normalizeDuration(string $raw): string
{
    $compacted = StudyTourPayload::compactMultilineText($raw);
    if ($compacted === '') {
        return '';
    }

    // preg_replace() yields null on failure; fall back to the compacted input.
    $withoutBlanks = preg_replace('/_+/u', '', $compacted) ?? $compacted;
    $singleLine = str_replace("\n", ' ', $withoutBlanks);

    return StudyTourPayload::compactText($singleLine);
}
/**
 * Parse the route-planning table into groups of timed items.
 *
 * Parsing starts after the last table-header cell. A date label opens a new
 * group; a time line opens an item and may be followed by an activity line
 * and a location line; free-text lines either fill the empty location of the
 * previous item or become untimed activities. Within a group, items without
 * a location inherit the most recent location seen before them.
 *
 * @param array<int, string> $lines Lines of the route section.
 *
 * @return array<int, array{date_label: string, items: array<int, array{time: string, activity: string, location: string}>}>
 */
private static function parseRouteSection(array $lines): array
{
    $total = count($lines);

    // Skip everything up to and including the last header cell.
    $cursor = 0;
    foreach ($lines as $position => $text) {
        if (in_array($text, ['日期', '时间', '行程安排', '地点'], true)) {
            $cursor = $position + 1;
        }
    }

    // True when the line at $index exists and is plain text, i.e. neither a
    // time, a date label nor a table-header cell.
    $isFreeText = static function (int $index) use ($lines, $total): bool {
        if ($index >= $total) {
            return false;
        }
        $next = $lines[$index] ?? '';

        return $next !== ''
            && ! self::isTimeLine($next)
            && ! self::isRouteDateLabel($next)
            && ! self::isRouteTableHeaderLine($next);
    };

    $groups = [];
    for (; $cursor < $total; $cursor++) {
        $line = $lines[$cursor];
        if ($line === '') {
            continue;
        }

        if (self::isRouteDateLabel($line)) {
            $groups[] = [
                'date_label' => StudyTourPayload::compactText($line),
                'items' => [],
            ];
            continue;
        }

        // Ignore anything that appears before the first date label.
        if ($groups === []) {
            continue;
        }

        // Garbage or the heading of a later section ends the table.
        if (self::isGarbledLine($line) || preg_match('/^[四五六]、/u', $line)) {
            break;
        }

        $current = array_key_last($groups);

        if (! self::isTimeLine($line)) {
            if (self::isRouteTableHeaderLine($line)) {
                continue;
            }

            $looksLikePlace = self::isLikelyRouteLocation($line);
            $lastItem = array_key_last($groups[$current]['items']);

            // A place name completes the previous item when that item still
            // lacks a location.
            if (
                $looksLikePlace
                && $lastItem !== null
                && $groups[$current]['items'][$lastItem]['location'] === ''
            ) {
                $groups[$current]['items'][$lastItem]['location'] = StudyTourPayload::compactText($line);
                continue;
            }

            // Other non-place text becomes an untimed activity; a surplus
            // place name is dropped.
            if (! $looksLikePlace) {
                $groups[$current]['items'][] = [
                    'time' => '',
                    'activity' => StudyTourPayload::compactText($line),
                    'location' => '',
                ];
            }
            continue;
        }

        $time = $line;
        $activity = '';
        $location = '';

        if ($isFreeText($cursor + 1)) {
            $activity = $lines[++$cursor];
        }

        while ($cursor + 1 < $total && ($lines[$cursor + 1] ?? '') === '') {
            $cursor++;
        }

        if ($isFreeText($cursor + 1)) {
            $candidate = $lines[$cursor + 1];
            if (self::isLikelyRouteLocation($candidate) || ! self::looksLikeRouteActivity($candidate)) {
                $location = $lines[++$cursor];
            }
        }

        $groups[$current]['items'][] = [
            'time' => StudyTourPayload::compactText($time),
            'activity' => StudyTourPayload::compactText($activity),
            'location' => StudyTourPayload::compactText($location),
        ];
    }

    // Carry the last known location forward inside each group.
    foreach ($groups as $groupIndex => $group) {
        $carried = '';
        foreach ($group['items'] as $itemIndex => $item) {
            if ($item['location'] !== '') {
                $carried = $item['location'];
                continue;
            }
            if ($carried !== '') {
                $groups[$groupIndex]['items'][$itemIndex]['location'] = $carried;
            }
        }
    }

    return StudyTourPayload::normalizeRoutePlans($groups);
}
/**
 * Parse the course table.
 *
 * Primary strategy: a digits-only line is a row number, and the two lines
 * after it are the course name and content. When no row is found this way,
 * the content-line fallback parser is used instead.
 *
 * @param array<int, string> $lines Lines of the course section.
 * @param array<int, array{date_label: string, items: array<int, array{time: string, activity: string, location: string}>}> $routePlans
 *        Parsed route plans, forwarded to the fallback parser.
 *
 * @return array<int, array{sort: int, name: string, content: string}>
 */
private static function parseCoursesSection(array $lines, array $routePlans = []): array
{
    // Begin after the last header cell of the course table.
    $start = 0;
    foreach ($lines as $position => $text) {
        if (in_array($text, ['序号', '课程名称', '课程内容'], true)) {
            $start = $position + 1;
        }
    }

    $courses = [];
    $total = count($lines);
    $cursor = $start;
    while ($cursor < $total) {
        $line = $lines[$cursor];
        if ($line === '' || preg_match('/^\d+$/', $line) !== 1) {
            $cursor++;
            continue;
        }

        $name = StudyTourPayload::compactText((string) ($lines[$cursor + 1] ?? ''));
        $content = StudyTourPayload::compactText((string) ($lines[$cursor + 2] ?? ''));
        if ($name !== '' || $content !== '') {
            $courses[] = [
                'sort' => count($courses) + 1,
                'name' => $name,
                'content' => $content,
            ];
        }

        // Move past the row number and its two cells.
        $cursor += 3;
    }

    if ($courses === []) {
        $courses = self::parseCoursesFromContentLines($lines, $start, $routePlans);
    }

    return StudyTourPayload::normalizeCourses($courses);
}
/**
 * Fallback course parser for tables without row numbers.
 *
 * Strategies, in order: (1) alternating name/content lines; (2) one content
 * line per candidate route activity, when the counts match; (3) per line,
 * split "name<separator>content", otherwise use the line as its own name
 * when it is short or a generated "课程N" name when it is long.
 *
 * @param array<int, string> $lines Lines of the course section.
 * @param int                $start Index of the first line to consider.
 * @param array<int, array{date_label: string, items: array<int, array{time: string, activity: string, location: string}>}> $routePlans
 *
 * @return array<int, array{sort: int, name: string, content: string}>
 */
private static function parseCoursesFromContentLines(array $lines, int $start, array $routePlans): array
{
    $contentLines = [];
    $total = count($lines);
    for ($i = $start; $i < $total; $i++) {
        $line = $lines[$i];
        if ($line === '' || in_array($line, ['序号', '课程名称', '课程内容'], true)) {
            continue;
        }
        // The heading of a later section ends the course table.
        if (preg_match('/^[五六]、/u', $line)) {
            break;
        }
        $contentLines[] = StudyTourPayload::compactText($line);
    }
    if ($contentLines === []) {
        return [];
    }

    $pairedCourses = self::parseCoursesFromNameContentPairs($contentLines);
    if ($pairedCourses !== []) {
        return $pairedCourses;
    }

    $activityNames = self::candidateCourseNamesFromRoutes($routePlans);
    if (count($activityNames) === count($contentLines)) {
        $courses = [];
        foreach ($contentLines as $idx => $content) {
            $courses[] = [
                'sort' => $idx + 1,
                'name' => $activityNames[$idx],
                'content' => $content,
            ];
        }

        return $courses;
    }

    $courses = [];
    foreach ($contentLines as $idx => $line) {
        // Separator: ASCII colon, full-width colon (U+FF1A, written as an
        // escape so it survives re-encoding), or a space.
        if (preg_match('/^(.{2,30}?)[:\x{FF1A} ]\s*(.+)$/u', $line, $matches)) {
            $courses[] = [
                'sort' => $idx + 1,
                'name' => StudyTourPayload::compactText($matches[1]),
                'content' => StudyTourPayload::compactText($matches[2]),
            ];
            continue;
        }

        $courses[] = [
            'sort' => $idx + 1,
            'name' => mb_strlen($line) <= 20 ? $line : ('课程'.($idx + 1)),
            'content' => $line,
        ];
    }

    return $courses;
}
/**
 * Interpret the lines as alternating course name / course content.
 *
 * The whole interpretation is rejected (empty result) when the line count is
 * odd or below two, or when any pair has an empty part, a name longer than 30
 * characters, or a content shorter than its name.
 *
 * @param array<int, string> $lines
 *
 * @return array<int, array{sort: int, name: string, content: string}>
 */
private static function parseCoursesFromNameContentPairs(array $lines): array
{
    $total = count($lines);
    if ($total < 2 || $total % 2 !== 0) {
        return [];
    }

    $courses = [];
    for ($offset = 0; $offset < $total; $offset += 2) {
        $name = StudyTourPayload::compactText($lines[$offset]);
        $content = StudyTourPayload::compactText($lines[$offset + 1]);

        $isPlausiblePair = $name !== ''
            && $content !== ''
            && mb_strlen($name) <= 30
            && mb_strlen($content) >= mb_strlen($name);
        if (! $isPlausiblePair) {
            return [];
        }

        $courses[] = [
            'sort' => count($courses) + 1,
            'name' => $name,
            'content' => $content,
        ];
    }

    return $courses;
}
/**
 * Collect the distinct route activities that could serve as course names,
 * in order of first appearance, leaving out empty activities and anything
 * recognised as a meal or ceremony.
 *
 * @param array<int, array{date_label: string, items: array<int, array{time: string, activity: string, location: string}>}> $routePlans
 *
 * @return array<int, string>
 */
private static function candidateCourseNamesFromRoutes(array $routePlans): array
{
    $uniqueActivities = [];

    foreach ($routePlans as $plan) {
        foreach ($plan['items'] ?? [] as $entry) {
            $label = StudyTourPayload::compactText((string) ($entry['activity'] ?? ''));

            $isCandidate = $label !== '' && ! self::isRouteMealOrCeremony($label);
            if ($isCandidate && ! in_array($label, $uniqueActivities, true)) {
                $uniqueActivities[] = $label;
            }
        }
    }

    return $uniqueActivities;
}
/**
 * Turn the raw venue text into venue items.
 *
 * The text is split on common list separators, each token is cleaned and
 * matched against the stored venues. Matched tokens become `system` items
 * and unmatched ones `custom` items. A venue is matched at most once.
 *
 * @param string             $raw      Raw venue text from the basic section.
 * @param array<int, string> $warnings Receives one warning per unmatched token (by reference).
 *
 * @return array{items: array<int, array<string, mixed>>, warnings: array<int, string>}
 *         The `warnings` element is always empty; warnings are reported through $warnings.
 */
private static function matchVenueItems(string $raw, array &$warnings): array
{
    $raw = trim($raw);
    if ($raw === '') {
        return ['items' => [], 'warnings' => []];
    }

    // ASCII separators plus their full-width forms (U+FF0B plus, U+FF0C comma,
    // U+FF1B semicolon) and the ideographic space (U+3000). The full-width
    // characters are written as escapes so they survive re-encoding.
    $parts = preg_split('#[+\x{FF0B}、,\x{FF0C} \x{3000}/|;\x{FF1B}\n]+#u', $raw) ?: [];
    $parts = array_values(array_filter(
        array_map(static fn (string $p): string => trim($p), $parts),
        static fn (string $p): bool => $p !== '',
    ));
    if ($parts === []) {
        $parts = [$raw];
    }

    /** @var Collection<int, Venue> $venues */
    $venues = Venue::query()->orderBy('sort')->orderBy('id')->get(['id', 'name']);

    $items = [];
    $usedVenueIds = [];
    foreach ($parts as $part) {
        $part = self::cleanVenueToken($part);
        if ($part === '') {
            continue;
        }

        $match = self::findVenueMatch($part, $venues, $usedVenueIds);
        if ($match !== null) {
            $items[] = ['type' => 'system', 'venue_id' => $match->id];
            $usedVenueIds[] = $match->id;
            continue;
        }

        $items[] = ['type' => 'custom', 'name' => $part];
        $warnings[] = "场馆「{$part}」未在系统中匹配,已作为自定义场馆添加";
    }

    return ['items' => $items, 'warnings' => []];
}
/**
 * Find the stored venue that best matches a token.
 *
 * An exact match on the normalized name wins. Otherwise the venue with the
 * longest name is chosen among those whose normalized name contains the token
 * or is contained in it. Venues listed in $usedVenueIds are never returned.
 *
 * @param string                 $token        Cleaned venue token.
 * @param Collection<int, Venue> $venues       Candidate venues.
 * @param array<int, int>        $usedVenueIds Ids of venues already matched.
 *
 * @return Venue|null The matched venue, or null when nothing matches.
 */
private static function findVenueMatch(string $token, Collection $venues, array $usedVenueIds): ?Venue
{
    $tokenNorm = self::normalizeVenueName($token);

    // A token made only of stripped characters normalizes to ''. Because
    // str_contains($name, '') is always true, such a token would otherwise
    // "match" an arbitrary venue.
    if ($tokenNorm === '') {
        return null;
    }

    $exact = $venues->first(function (Venue $v) use ($tokenNorm, $usedVenueIds) {
        if (in_array($v->id, $usedVenueIds, true)) {
            return false;
        }

        return self::normalizeVenueName((string) $v->name) === $tokenNorm;
    });
    if ($exact !== null) {
        return $exact;
    }

    return $venues->filter(function (Venue $v) use ($tokenNorm, $usedVenueIds) {
        if (in_array($v->id, $usedVenueIds, true)) {
            return false;
        }
        $nameNorm = self::normalizeVenueName((string) $v->name);

        return $nameNorm !== ''
            && (str_contains($nameNorm, $tokenNorm) || str_contains($tokenNorm, $nameNorm));
    })->sortByDesc(fn (Venue $v) => mb_strlen((string) $v->name))->first();
}
/**
 * Clean a single venue token: compact it, drop a trailing "等" ("etc."), and
 * remove a parenthesised remark.
 *
 * Only ASCII and full-width parentheses (U+FF08 / U+FF09) delimit a remark.
 * A plain space must not act as a bracket, otherwise any text between two
 * spaces inside a venue name would be deleted. The match is greedy: it runs
 * from the first opening bracket to the last closing one.
 */
private static function cleanVenueToken(string $token): string
{
    $token = StudyTourPayload::compactText($token);
    $token = preg_replace('/等$/u', '', $token) ?? $token;
    $token = preg_replace('/[(\x{FF08}].*[)\x{FF09}]/u', '', $token) ?? $token;

    return StudyTourPayload::compactText($token);
}
/**
 * Normalize a venue name for comparison: trim, lower-case, and remove spaces
 * and dot separators.
 *
 * Besides the ASCII space, the ideographic space (U+3000) and the no-break
 * space (U+00A0) are removed, so names typed with full-width spacing compare
 * equal to their plain form.
 */
private static function normalizeVenueName(string $name): string
{
    $name = mb_strtolower(trim($name));

    return str_replace([' ', "\u{3000}", "\u{00A0}", '·', '•'], '', $name);
}
/**
 * Whether a line opens a new route group: it is not a time line and starts
 * with a "线路N" / "第N天" label or a part-of-day word.
 */
private static function isRouteDateLabel(string $line): bool
{
    $pattern = '/^(线路[一二三四五六七八九十百零\d]+|第[一二三四五六七八九十百零\d]+天|上午|中午|下午|晚上)/u';

    return ! self::isTimeLine($line) && preg_match($pattern, $line) === 1;
}
/**
 * Whether a line begins with a clock time: one or two digits, an ASCII colon,
 * then two digits.
 */
private static function isTimeLine(string $line): bool
{
    $startsWithClockTime = preg_match('/^\d{1,2}:\d{2}/', $line);

    return $startsWithClockTime === 1;
}
/**
 * Whether a line is exactly one of the route table's column headers.
 */
private static function isRouteTableHeaderLine(string $line): bool
{
    return match ($line) {
        '日期', '时间', '行程安排', '地点' => true,
        default => false,
    };
}
/**
 * Whether an activity text contains one of the meal, lodging, rest or
 * ceremony keywords.
 */
private static function isRouteMealOrCeremony(string $activity): bool
{
    $keywordPattern = '/(办理入住|民宿早餐|午休|结营仪式|领取伴手礼|欢迎晚宴|夜探|午餐|晚餐|野火饭|鸡汤|江村饭店|自由活动)/u';

    return preg_match($keywordPattern, $activity) === 1;
}
/**
 * Whether a line looks like a place name: it contains a venue-type keyword
 * and is not recognised as a meal or ceremony entry.
 */
private static function isLikelyRouteLocation(string $line): bool
{
    $placePattern = '/(博物馆|文化园|科技馆|湿地|故居|纪念馆|风情园|蚕桑|丝绸|活动中心|营地|基地|有限公司|酒店|民宿)/u';

    return ! self::isRouteMealOrCeremony($line) && preg_match($placePattern, $line) === 1;
}
/**
 * Whether a line contains one of the keywords that indicate an activity
 * description.
 */
private static function looksLikeRouteActivity(string $line): bool
{
    $activityPattern = '/(体验|探秘|制作|参观|采摘|仪式|晚宴|午餐|晚餐|早餐|午休|活动|课程|王国|有趣|小夜灯|缫丝|挂件|面包)/u';

    return preg_match($activityPattern, $line) === 1;
}
/**
 * Keep the leading run of valid implementation-section lines.
 *
 * Empty lines are skipped. The first non-empty line that fails validation
 * ends the section, and everything after it is dropped.
 *
 * @param array<int, string> $lines
 *
 * @return array<int, string>
 */
private static function filterImplementationLines(array $lines): array
{
    $accepted = [];

    foreach ($lines as $candidate) {
        if ($candidate !== '') {
            if (! self::isValidImplementationLine($candidate)) {
                return $accepted;
            }
            $accepted[] = $candidate;
        }
    }

    return $accepted;
}
/**
 * Decide whether a line belongs to the implementation section.
 *
 * Returns false for garbled lines, Word metadata and XML/theme residue.
 * Returns true for lines that start with a "M月D日" date. Any other line must
 * contain at least one of the listed schedule-related characters, at least
 * four CJK ideographs, and a CJK share of at least 45%.
 */
private static function isValidImplementationLine(string $line): bool
{
if (self::isGarbledLine($line) || self::isWordMetadataLine($line)) {
return false;
}
// NOTE(review): this character class appears empty here; its original members
// may be invisible or private-use characters. Verify against the raw file.
if (preg_match('/[]/u', $line)) {
return false;
}
// Residue of the document's XML packaging / theme data.
if (preg_match('/(?:xmlns|http:|xml |ContentTypes|accent[0-9]|folHlink|theme\/|drawingml)/iu', $line)) {
return false;
}
// Lines that start with a month/day date are accepted without further checks.
if (preg_match('/^\d{1,2}月\d{1,2}日/u', $line)) {
return true;
}
// Otherwise at least one of these schedule-related characters is required.
if (! preg_match('/[月日场活动安排如下夏令营研学实施时间共]/u', $line)) {
return false;
}
// Count CJK ideographs (U+4E00-U+9FFF); require at least 4 and a share of at least 45%.
$cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
return $cjk >= 4 & & ($cjk / max(mb_strlen($line), 1)) >= 0.45;
}
/**
 * Whether a line is Word / WPS document metadata rather than content: it
 * contains a known style, stream or packaging marker, or consists solely of
 * a font name.
 */
private static function isWordMetadataLine(string $line): bool
{
    $markers = [
        '正文', '默认段落字体', '普通表格', '无列表', '页脚', '页眉', '网格型',
        '微软用户', 'WordDocument', 'DocumentSummary', 'SummaryInformation',
        'CompObj', 'ProductBuild', 'KSOProduct', 'Normal.dotm', 'Microsoft Office',
        'xmlns', 'ContentTypes', 'theme/theme', 'themeManager', 'fontTable',
        'Root Entry', 'SMWordDoc', 'Word.Document',
    ];

    foreach ($markers as $marker) {
        if (str_contains($line, $marker)) {
            return true;
        }
    }

    $fontNamePattern = '/^(黑体|宋体|微软雅黑|等线(?: Light)?|Arial|Symbol|Tahoma|Times New Roman|Calibri|Cambria Math|Segoe UI Emoji|SimHei|SimSun|DengXian)$/u';

    return preg_match($fontNamePattern, $line) === 1;
}
/**
 * Heuristically decide whether a line is extraction garbage rather than text.
 *
 * Empty lines, digits-only lines and lines starting with a clock time are
 * never garbled. Word metadata, lines containing characters from the listed
 * control / script ranges or the listed mojibake characters, and XML
 * packaging residue are always garbled. The remaining lines are judged by
 * length and by their share of CJK / readable characters.
 */
private static function isGarbledLine(string $line): bool
{
if ($line === '') {
return false;
}
// Row numbers (digits only) are legitimate content.
if (preg_match('/^\d+$/', $line)) {
return false;
}
// So are lines that begin with a clock time such as "9:30".
if (preg_match('/^\d{1,2}:\d{2}/', $line)) {
return false;
}
if (self::isWordMetadataLine($line)) {
return true;
}
// C1 control characters (U+0080-U+009F).
if (preg_match('/[\x{0080}-\x{009F}]/u', $line)) {
return true;
}
// Code points in U+0200-U+024F, U+0400-U+052F (Cyrillic) and U+0600-U+06FF (Arabic).
if (preg_match('/[\x{0200}-\x{024F}\x{0400}-\x{04FF}\x{0500}-\x{052F}\x{0600}-\x{06FF}]/u', $line)) {
return true;
}
// NOTE(review): the upper bound of this range is not visible here; as written the class
// would match the first character or a literal '-'. Verify against the raw file.
if (preg_match('/[ᘀ-]/u', $line)) {
return true;
}
// Specific characters treated as mojibake markers.
if (preg_match('/[漀愀脈摫欀䡒⡯␖]/u', $line)) {
return true;
}
// NOTE(review): this character class appears empty here; its original members
// may be invisible or private-use characters. Verify against the raw file.
if (preg_match('/[]/u', $line)) {
return true;
}
// Residue of the document's XML packaging.
if (preg_match('/(?:xmlns|http:|ContentTypes|Normal\.dotm|Word\.Document)/iu', $line)) {
return true;
}
// Short lines (8 characters or fewer) containing any character outside the allowed set.
if (preg_match('/[^\x{4e00}-\x{9fff}0-9A-Za-z, 。、: ; ( ) ""\'\-\s\/& “”]/u', $line) & & mb_strlen($line) < = 8) {
return true;
}
// Lines of 3+ characters without a single CJK ideograph.
$cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
if ($cjk === 0 & & mb_strlen($line) >= 3) {
return true;
}
// Very short lines (5 characters or fewer) need at least two CJK ideographs.
$len = mb_strlen($line);
if ($len < = 5) {
return $cjk < 2 ;
}
// Longer lines are garbled when fewer than 45% of their characters are readable.
$readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z, 。、: ; ( ) \-\s\/& “”]/u', $line) ?: 0;
return ($readable / max($len, 1)) < 0.45 ;
}
/**
 * Whether any line of a multi-line text is classified as garbled.
 */
private static function containsGarbledText(string $text): bool
{
    $segments = preg_split('/\R/u', $text) ?: [];

    foreach ($segments as $segment) {
        $trimmed = trim((string) $segment);
        if (self::isGarbledLine($trimmed)) {
            return true;
        }
    }

    return false;
}
/**
 * Join section lines into paragraphs.
 *
 * Consecutive lines form one paragraph (joined by a newline); an empty line
 * closes the current paragraph. The first garbled line ends processing, and
 * everything after it is ignored. Paragraphs are separated by a blank line.
 *
 * @param array<int, string> $lines
 */
private static function joinSectionLines(array $lines): string
{
    $paragraphs = [];
    $pending = [];

    foreach ($lines as $line) {
        $garbled = self::isGarbledLine($line);

        if ($line !== '' && ! $garbled) {
            $pending[] = $line;
            continue;
        }

        // Paragraph boundary: flush whatever has accumulated so far.
        if ($pending !== []) {
            $paragraphs[] = trim(implode("\n", $pending));
            $pending = [];
        }

        if ($garbled) {
            break;
        }
    }

    if ($pending !== []) {
        $paragraphs[] = trim(implode("\n", $pending));
    }

    $nonEmpty = array_filter($paragraphs, fn ($paragraph) => $paragraph !== '');

    return trim(implode("\n\n", $nonEmpty));
}
/**
 * Convert plain text into minimal HTML.
 *
 * Blank-line-separated blocks become `<p>` elements, single line breaks
 * inside a block become `<br>`, and all text is HTML-escaped. The paragraph
 * tags are emitted without embedded spaces so the output is well-formed HTML.
 *
 * @param string $text Plain text, possibly spanning several paragraphs.
 *
 * @return string HTML fragment, or '' when the text is empty.
 */
private static function plainTextToHtml(string $text): string
{
    $text = StudyTourPayload::compactMultilineText($text);
    if ($text === '') {
        return '';
    }

    $paragraphs = preg_split("/\n{2,}/u", $text) ?: [$text];

    $html = [];
    foreach ($paragraphs as $paragraph) {
        $paragraph = StudyTourPayload::compactMultilineText($paragraph);
        if ($paragraph === '') {
            continue;
        }

        $escaped = htmlspecialchars($paragraph, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $html[] = '<p>'.nl2br($escaped, false).'</p>';
    }

    return implode('', $html);
}
}