From a5c2da0a8815986a2fdf34816939ed0c667ed77f Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Tue, 30 Jun 2026 14:41:49 +0800 Subject: [PATCH] =?UTF-8?q?=E7=A0=94=E5=AD=A6=E7=BA=BF=E8=B7=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Controllers/Api/StudyTourController.php | 14 +- app/Support/DocTextExtractor.php | 225 +++++++++++++++--- app/Support/StudyTourDeclarationParser.php | 72 ++++++ 3 files changed, 270 insertions(+), 41 deletions(-) diff --git a/app/Http/Controllers/Api/StudyTourController.php b/app/Http/Controllers/Api/StudyTourController.php index a0c5758..1bf9eee 100644 --- a/app/Http/Controllers/Api/StudyTourController.php +++ b/app/Http/Controllers/Api/StudyTourController.php @@ -8,6 +8,7 @@ use App\Support\StudyTourDeclarationParser; use App\Support\StudyTourPayload; use Illuminate\Http\JsonResponse; use Illuminate\Http\Request; +use Illuminate\Support\Str; class StudyTourController extends Controller { @@ -75,15 +76,18 @@ class StudyTourController extends Controller if (! in_array($extension, ['doc', 'docx'], true)) { return response()->json(['message' => '仅支持 .doc / .docx 申报表'], 422); } - $tmpPath = $uploaded->getRealPath(); - if (! is_string($tmpPath) || $tmpPath === '') { - return response()->json(['message' => '无法读取上传文件'], 422); - } + $path = null; try { - $result = StudyTourDeclarationParser::parseFile($tmpPath, $extension); + $storedPath = $uploaded->storeAs('tmp/declaration-import', Str::uuid()->toString().'.'.$extension); + $path = storage_path('app/'.$storedPath); + $result = StudyTourDeclarationParser::parseFile($path, $extension); } catch (\Throwable $e) { return response()->json(['message' => '申报表解析失败:'.$e->getMessage()], 422); + } finally { + if (isset($path) && is_file($path)) { + @unlink($path); + } } return response()->json($result); diff --git a/app/Support/DocTextExtractor.php b/app/Support/DocTextExtractor.php index ef4dee7..d90fe9c 100644 --- a/app/Support/DocTextExtractor.php +++ b/app/Support/DocTextExtractor.php @@ -10,6 +10,7 @@ use PhpOffice\PhpWord\Element\Table; use PhpOffice\PhpWord\Element\Text; use PhpOffice\PhpWord\Element\TextBreak; use PhpOffice\PhpWord\IOFactory; +use PhpOffice\PhpWord\Shared\OLERead; use RuntimeException; use Symfony\Component\Process\Process; use ZipArchive; @@ -90,54 +91,132 @@ class DocTextExtractor public static function extractDoc(string $path): string { - $phpWordText = self::extractDocViaPhpWord($path); - if (is_string($phpWordText) && trim($phpWordText) !== '') { - return $phpWordText; + $candidates = self::extractDocCandidates($path); + if ($candidates === []) { + throw new RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试'); } - $textutil = self::resolveBinary('textutil'); - if ($textutil !== null) { - $out = self::runCommand([$textutil, '-convert', 'txt', '-stdout', $path], 45); - if (is_string($out) && trim($out) !== '') { - return self::normalizeText($out); + $bestText = null; + $bestScore = -1; + foreach ($candidates as $text) { + $score = self::scoreDeclarationText($text); + if ($score > $bestScore) { + $bestScore = $score; + $bestText = $text; } } - $soffice = self::resolveBinary('soffice'); - if ($soffice !== null) { - $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8); - if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) { - throw new RuntimeException('无法创建临时目录'); + return $bestText ?? $candidates[0]; + } + + /** + * @return array + */ + public static function extractDocCandidates(string $path): array + { + $candidates = []; + + foreach ([ + self::extractDocViaBinaryUtf16($path), + self::extractDocViaPhpWord($path), + self::extractDocViaTextutil($path), + self::extractDocViaSoffice($path), + ] as $text) { + if (! is_string($text) || trim($text) === '') { + continue; } - self::runCommand([ - $soffice, - '--headless', - '--convert-to', - 'txt:Text', - '--outdir', - $tmpDir, - $path, - ], 120); - $base = pathinfo($path, PATHINFO_FILENAME); - $txtPath = $tmpDir.'/'.$base.'.txt'; - if (is_file($txtPath)) { - $text = file_get_contents($txtPath) ?: ''; - @unlink($txtPath); - @rmdir($tmpDir); - - if (trim($text) !== '') { - return self::normalizeText($text); - } + $normalized = self::normalizeText($text); + if ($normalized === '') { + continue; + } + $candidates[$normalized] = $normalized; + } + + return array_values($candidates); + } + + private static function extractDocViaTextutil(string $path): ?string + { + $textutil = self::resolveBinary('textutil'); + if ($textutil === null) { + return null; + } + + $out = self::runCommand([$textutil, '-convert', 'txt', '-stdout', $path], 45); + + return is_string($out) && trim($out) !== '' ? $out : null; + } + + private static function extractDocViaSoffice(string $path): ?string + { + $soffice = self::resolveBinary('soffice'); + if ($soffice === null) { + return null; + } + + $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8); + if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) { + return null; + } + + self::runCommand([ + $soffice, + '--headless', + '--convert-to', + 'txt:Text', + '--outdir', + $tmpDir, + $path, + ], 120); + + $base = pathinfo($path, PATHINFO_FILENAME); + $txtPath = $tmpDir.'/'.$base.'.txt'; + $text = is_file($txtPath) ? (file_get_contents($txtPath) ?: '') : ''; + @array_map('unlink', glob($tmpDir.'/*') ?: []); + @rmdir($tmpDir); + + return trim($text) !== '' ? $text : null; + } + + private static function scoreDeclarationText(string $text): int + { + $score = 0; + $markers = [ + '一、线路基本情况' => 100, + '三、线路规划' => 80, + '二、线路简介' => 40, + '四、研学课程' => 40, + '五、线路收费标准' => 40, + '六、线路计划实施情况' => 40, + '线路名称' => 30, + '组织单位名称' => 20, + '序号' => 25, + '研学点名称' => 25, + '研学活动内容' => 15, + '研学时长' => 10, + ]; + + foreach ($markers as $marker => $weight) { + if (str_contains($text, $marker)) { + $score += $weight; } - @array_map('unlink', glob($tmpDir.'/*') ?: []); - @rmdir($tmpDir); } - throw new RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试'); + $score += min(substr_count($text, "\n"), 80); + + if (strlen($text) > 6000) { + $score -= (int) ((strlen($text) - 6000) / 800); + } + + return $score; } private static function extractDocViaPhpWord(string $path): ?string { + if (! class_exists(IOFactory::class)) { + return null; + } + try { $phpWord = IOFactory::load($path, 'MsDoc'); } catch (\Throwable) { @@ -161,8 +240,82 @@ class DocTextExtractor } $text = self::joinLines($lines); + if ($text === '' || (! str_contains($text, '线路基本情况') && ! str_contains($text, '线路名称'))) { + return null; + } - return trim($text) !== '' ? $text : null; + return $text; + } + + private static function extractDocViaBinaryUtf16(string $path): ?string + { + if (! is_readable($path)) { + return null; + } + + $data = @file_get_contents($path); + if (! is_string($data) || strlen($data) < 8) { + return null; + } + + if (substr($data, 0, 8) !== OLERead::IDENTIFIER_OLE) { + return null; + } + + $raw = self::convertUtf16LeToUtf8($data); + if (! is_string($raw) || $raw === '') { + return null; + } + + $start = mb_strpos($raw, '一、线路基本情况'); + if ($start === false) { + $start = mb_strpos($raw, '苏州市'); + } + if ($start === false) { + $start = 0; + } + + $chunk = mb_substr($raw, $start); + $parts = preg_split('/[\x{0000}-\x{001F}\x{007F}-\x{009F}]+/u', $chunk) ?: []; + $lines = []; + $seen = []; + foreach ($parts as $part) { + $line = self::normalizeLine((string) $part); + if ($line === '' || mb_strlen($line) < 2) { + continue; + } + if (! preg_match('/[\x{4e00}-\x{9fff}0-9A-Za-z]/u', $line)) { + continue; + } + if (isset($seen[$line])) { + continue; + } + $seen[$line] = true; + $lines[] = $line; + } + + $text = self::joinLines($lines); + + return str_contains($text, '线路基本情况') || str_contains($text, '线路名称') ? $text : null; + } + + private static function convertUtf16LeToUtf8(string $data): ?string + { + if (function_exists('iconv')) { + $text = @iconv('UTF-16LE', 'UTF-8//IGNORE', $data); + if (is_string($text) && $text !== '') { + return $text; + } + } + + if (function_exists('mb_convert_encoding')) { + $text = @mb_convert_encoding($data, 'UTF-8', 'UTF-16LE'); + if (is_string($text) && $text !== '') { + return $text; + } + } + + return null; } private static function extractPhpWordElementText(object $element): string diff --git a/app/Support/StudyTourDeclarationParser.php b/app/Support/StudyTourDeclarationParser.php index 2b13f60..498ac3a 100644 --- a/app/Support/StudyTourDeclarationParser.php +++ b/app/Support/StudyTourDeclarationParser.php @@ -22,11 +22,83 @@ class StudyTourDeclarationParser */ public static function parseFile(string $path, string $extension): array { + $extension = strtolower(ltrim($extension, '.')); + + if ($extension === 'doc') { + return self::parseDocFile($path); + } + $text = DocTextExtractor::extract($path, $extension); return self::parseText($text); } + /** + * @return array{parsed: array, warnings: array} + */ + private static function parseDocFile(string $path): array + { + $candidates = DocTextExtractor::extractDocCandidates($path); + if ($candidates === []) { + throw new \RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试'); + } + + $bestResult = null; + $bestScore = -1; + foreach ($candidates as $text) { + $result = self::parseText($text); + $score = self::scoreParsedResult($result); + if ($score > $bestScore) { + $bestScore = $score; + $bestResult = $result; + } + } + + return $bestResult ?? self::parseText($candidates[0]); + } + + /** + * @param array{parsed: array, warnings: array} $result + */ + private static function scoreParsedResult(array $result): int + { + $parsed = $result['parsed'] ?? []; + $score = 0; + + if (trim((string) ($parsed['name'] ?? '')) !== '') { + $score += 50; + } + if (trim((string) ($parsed['org_name'] ?? '')) !== '') { + $score += 20; + } + if (trim((string) ($parsed['suitable_count'] ?? '')) !== '') { + $score += 5; + } + if (trim((string) ($parsed['duration'] ?? '')) !== '') { + $score += 5; + } + + $score += count($parsed['seasons'] ?? []) * 3; + $score += count($parsed['grade_levels'] ?? []) * 3; + $score += count($parsed['venue_items'] ?? []) * 8; + + foreach ($parsed['route_plans'] ?? [] as $group) { + $score += 10; + $score += count($group['items'] ?? []) * 6; + } + + $score += count($parsed['courses'] ?? []) * 8; + + foreach (['intro_html', 'fee_html', 'implementation_html'] as $key) { + $plain = trim(strip_tags((string) ($parsed[$key] ?? ''))); + if ($plain !== '') { + $score += 12 + min(mb_strlen($plain), 120) / 20; + } + } + + return (int) $score; + } + /** * @return array{parsed: array, warnings: array} */