|
|
|
|
@ -162,7 +162,7 @@ class StudyTourDeclarationParser
|
|
|
|
|
|
|
|
|
|
$introText = self::joinSectionLines($sections['intro'] ?? []);
|
|
|
|
|
$feeText = self::joinSectionLines($sections['fee'] ?? []);
|
|
|
|
|
$implText = self::joinSectionLines(self::filterImplementationLines($sections['impl'] ?? []));
|
|
|
|
|
$implText = self::sanitizeImplementationText(self::joinSectionLines(self::filterImplementationLines($sections['impl'] ?? [])));
|
|
|
|
|
|
|
|
|
|
$routePlans = self::parseRouteSection($sections['route'] ?? []);
|
|
|
|
|
$courses = self::parseCoursesSection($sections['courses'] ?? [], $routePlans);
|
|
|
|
|
@ -814,18 +814,135 @@ class StudyTourDeclarationParser
|
|
|
|
|
if ($line === '') {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (self::isGarbledLine($line) || self::isWordMetadataLine($line)) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
if (preg_match('/^\(?实施场次[、,].*等\)?$/u', $line)) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (! self::isPlausibleImplementationLine($line)) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
$kept[] = $line;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $kept;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reduce implementation text to its leading run of plausible lines.
 *
 * The input is first normalised with StudyTourPayload::compactMultilineText() and then
 * split on line breaks. Blank lines are skipped. The first non-blank line that
 * isPlausibleImplementationLine() rejects ends the scan: that line and everything
 * after it are discarded.
 *
 * @return string The retained (trimmed) lines joined with "\n"; '' when nothing survives.
 */
private static function sanitizeImplementationText(string $text): string
{
    $compacted = StudyTourPayload::compactMultilineText($text);
    if ($compacted === '') {
        return '';
    }

    // preg_split() yields false on a PCRE failure; treat that the same as "no lines".
    $rows = preg_split('/\R/u', $compacted);
    if (! $rows) {
        $rows = [];
    }

    $accepted = [];
    foreach ($rows as $row) {
        $candidate = trim((string) $row);

        if ($candidate !== '') {
            if (self::isPlausibleImplementationLine($candidate)) {
                $accepted[] = $candidate;
            } else {
                // Stop at the first implausible line rather than skipping it.
                break;
            }
        }
    }

    return implode("\n", $accepted);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Heuristic test for whether a single line may be kept as implementation text.
 *
 * Decision order:
 *  1. Anything isImplementationStopLine() flags is rejected.
 *  2. A line containing a date / date-range / ordinal-session expression, or starting
 *     with one of the note prefixes (说明 / 备注 / 要点 followed by a colon), is accepted.
 *  3. A line containing any single character from the keyword set is accepted when it
 *     has at least 3 CJK characters and at least 60% "readable" characters.
 *  4. Every other line needs at least 10 CJK characters, at least 75% readable
 *     characters, and no ASCII letters.
 *
 * NOTE(review): the keyword gate in step 3 is a character class, so it fires on any one
 * of those characters rather than on whole words — confirm that this is intended.
 */
private static function isPlausibleImplementationLine(string $line): bool
{
    if (self::isImplementationStopLine($line)) {
        return false;
    }

    // Patterns that accept the line outright.
    $datePattern = '/(?:\d{1,2}月\d{1,2}(?:-\d{1,2})?日|\d{1,2}-\d{1,2}月|\d{4}年\d{1,2}月\d{1,2}日|第[一二三四五六七八九十百零\d]+[场期次])/u';
    $notePrefixPattern = '/^(说明|备注|要点)[::]/u';
    if (preg_match($datePattern, $line) || preg_match($notePrefixPattern, $line)) {
        return true;
    }

    // Character statistics shared by both remaining branches.
    $cjkCount = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
    $readableCount = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&""《》—–,.\:]/u', $line) ?: 0;
    // max(..., 1) guards the division for an empty line.
    $readableRatio = $readableCount / max(mb_strlen($line), 1);

    // Lines carrying a keyword character get the looser thresholds.
    if (preg_match('/[预约成团寒暑假节假说明要点场次时间共]/u', $line)) {
        return $cjkCount >= 3 && $readableRatio >= 0.6;
    }

    // Everything else must be long, mostly readable, and free of ASCII letters.
    return $cjkCount >= 10
        && $readableRatio >= 0.75
        && ! preg_match('/[A-Za-z]/', $line);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tell whether a line marks the end of usable implementation text.
 *
 * Returns true when the line is flagged by isGarbledLine() or isWordMetadataLine(),
 * when it matches any entry of the stop-pattern table below, or when it contains an
 * ASCII letter without also containing one of the tolerated words
 * (colour / run / DIY, matched case-insensitively).
 */
private static function isImplementationStopLine(string $line): bool
{
    if (self::isGarbledLine($line) || self::isWordMetadataLine($line)) {
        return true;
    }

    $stopPatterns = [
        // Stray "Ç" anywhere in the line.
        '/[Ç]/u',
        // Line starts with a numbered section marker (or the truncated "、线路").
        '/^(一、|二、|三、|四、|五、|六、|、线路)/u',
        // Line starts with one of the form's field labels.
        '/^(线路规划|线路简介|线路名称|线路联络人|组织单位名称|咨询电话|研学课程|线路收费标准|线路点位|适配学段|研学时长|适宜人数)/u',
        // Line starts with a table header, template hint, or other known non-content text.
        '/^(日期|时间|行程安排|地点|以下为示例|(可多选)|熊猫饲养员|幼儿园|微软)/u',
        // The whole line is "第N天".
        '/^第[一二三四五六七八九十百零\d]+天$/u',
        // The whole line is one of these place names.
        '/^(苏州市|常熟市|吴江区|太仓市)$/u',
        // The whole line is a one- or two-digit number.
        '/^\d{1,2}$/u',
        // The whole line is 11 digits starting with 1 (presumably a mobile number).
        '/^1\d{10}$/u',
        // NOTE(review): requires a literal trailing "x" after H:MM — confirm this is intended.
        '/^\d{1,2}:\d{2}x$/u',
        // Any code point in the Private Use Area U+E000..U+F8FF.
        '/[\x{E000}-\x{F8FF}]/u',
        // Any Hangul syllable (U+AC00..U+D7AF) or Hangul Jamo (U+1100..U+11FF).
        '/[\x{AC00}-\x{D7AF}\x{1100}-\x{11FF}]/u',
        // Specific characters treated as stop markers (presumably mis-decoding residue).
        '/[鐁耠耀謀卛鄬铐]/u',
        '/[°éêÈĨ༪]/u',
    ];

    foreach ($stopPatterns as $stopPattern) {
        if (preg_match($stopPattern, $line)) {
            return true;
        }
    }

    // ASCII letters stop the scan unless the line contains a tolerated word.
    $hasAsciiLetter = preg_match('/[A-Za-z]/', $line);

    return $hasAsciiLetter && ! preg_match('/(colour|run|DIY)/iu', $line);
}
|
|
|
|
|
|
|
|
|
|
private static function isValidImplementationLine(string $line): bool
|
|
|
|
|
{
|
|
|
|
|
if (self::isGarbledLine($line) || self::isWordMetadataLine($line)) {
|
|
|
|
|
@ -873,7 +990,7 @@ class StudyTourDeclarationParser
|
|
|
|
|
{
|
|
|
|
|
static $needles = [
|
|
|
|
|
'正文', '默认段落字体', '普通表格', '无列表', '页脚', '页眉', '网格型',
|
|
|
|
|
'微软用户', 'WordDocument', 'DocumentSummary', 'SummaryInformation',
|
|
|
|
|
'微软用户', '微软中国', 'WordDocument', 'DocumentSummary', 'SummaryInformation',
|
|
|
|
|
'CompObj', 'ProductBuild', 'KSOProduct', 'Normal.dotm', 'Microsoft Office',
|
|
|
|
|
'xmlns', 'ContentTypes', 'theme/theme', 'themeManager', 'fontTable',
|
|
|
|
|
'Root Entry', 'SMWordDoc', 'Word.Document',
|
|
|
|
|
@ -929,6 +1046,18 @@ class StudyTourDeclarationParser
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('/[\x{E000}-\x{F8FF}]/u', $line)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('/[鐁耠耀謀卛鄬铐]/u', $line)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (str_contains($line, 'Ç')) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('/[]/u', $line)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
@ -958,8 +1087,16 @@ class StudyTourDeclarationParser
|
|
|
|
|
|
|
|
|
|
private static function containsGarbledText(string $text): bool
|
|
|
|
|
{
|
|
|
|
|
if (preg_match('/[鐁耠耀謀卛鄬铐]/u', $text)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preg_match('/(?:线路规划|、线路|微软中国|WordDocument)/u', $text)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
foreach (preg_split('/\R/u', $text) ?: [] as $line) {
|
|
|
|
|
if (self::isGarbledLine(trim((string) $line))) {
|
|
|
|
|
if (self::isGarbledLine(trim((string) $line)) || self::isImplementationStopLine(trim((string) $line))) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|