self::extractDocx($path), 'doc' => self::extractDoc($path), default => throw new RuntimeException('仅支持 .doc / .docx 申报表'), }; }

    /**
     * Extract the text of a .docx file as newline-separated lines.
     *
     * Reads word/document.xml from the archive and walks the direct children of
     * w:body in document order: every paragraph contributes one line and every
     * table cell contributes one line.
     *
     * @throws RuntimeException when the archive cannot be opened, or when the main
     *                          document part is missing, blank or not well-formed XML
     */
    public static function extractDocx(string $path): string
    {
        $zip = new ZipArchive();
        if ($zip->open($path) !== true) {
            throw new RuntimeException('无法读取 docx 文件');
        }

        $xml = $zip->getFromName('word/document.xml');
        $zip->close();
        if (! is_string($xml) || trim($xml) === '') {
            throw new RuntimeException('docx 文档内容为空');
        }

        $dom = new \DOMDocument();
        $dom->preserveWhiteSpace = false;
        // loadXML() reports malformed XML by returning false. Surface that as an
        // error instead of silently continuing with an empty document.
        $loaded = @$dom->loadXML($xml);
        if ($loaded === false) {
            throw new RuntimeException('无法解析 docx 文档内容');
        }

        $xpath = new \DOMXPath($dom);
        $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

        $lines = [];
        $bodyNodes = $xpath->query('//w:body/*');
        if ($bodyNodes === false) {
            return '';
        }

        foreach ($bodyNodes as $node) {
            $local = $node->localName ?? $node->nodeName;

            if ($local === 'p') {
                // Empty paragraphs are kept as blank lines; joinLines() collapses long runs.
                $text = self::textFromNode($xpath, $node);
                $lines[] = self::normalizeLine($text);

                continue;
            }

            if ($local === 'tbl') {
                // NOTE(review): './/w:tr' also matches rows of nested tables, whose text is
                // already part of the enclosing cell - confirm the duplication is acceptable.
                $rowNodes = $xpath->query('.//w:tr', $node);
                if ($rowNodes === false) {
                    continue;
                }

                foreach ($rowNodes as $row) {
                    $cellNodes = $xpath->query('./w:tc', $row);
                    if ($cellNodes === false) {
                        continue;
                    }

                    // One output line per cell, in row order.
                    foreach ($cellNodes as $cell) {
                        $lines[] = self::normalizeLine(self::textFromNode($xpath, $cell));
                    }
                }
            }
        }

        return self::joinLines($lines);
    }

    /**
     * Extract the text of a legacy .doc file.
     *
     * Collects the output of every available extraction strategy and returns the
     * candidate that scoreDeclarationText() rates highest (the earliest candidate
     * wins ties).
     *
     * @throws RuntimeException when no strategy produced any text
     */
    public static function extractDoc(string $path): string
    {
        $candidates = self::extractDocCandidates($path);
        if ($candidates === []) {
            throw new RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试');
        }

        // Scores can be negative (see the length penalty in scoreDeclarationText()),
        // so start below every possible score rather than at -1.
        $bestText = $candidates[0];
        $bestScore = PHP_INT_MIN;
        foreach ($candidates as $text) {
            $score = self::scoreDeclarationText($text);
            if ($score > $bestScore) {
                $bestScore = $score;
                $bestText = $text;
            }
        }

        return $bestText;
    }

    /**
     * Run every .doc extraction strategy and return the distinct, non-empty results.
     *
     * Results are normalized with normalizeText() and de-duplicated by their
     * normalized content; strategy order is preserved.
     *
     * @return list<string>
     */
    public static function extractDocCandidates(string $path): array
    {
        $candidates = [];
        foreach ([
            self::extractDocViaBinaryUtf16($path),
            self::extractDocViaPhpWord($path),
            self::extractDocViaTextutil($path),
            self::extractDocViaSoffice($path),
        ] as $text) {
            if (! is_string($text) || trim($text) === '') {
                continue;
            }

            $normalized = self::normalizeText($text);
            if ($normalized === '') {
                continue;
            }

            // Keyed by content so identical outputs from different strategies collapse.
            $candidates[$normalized] = $normalized;
        }

        return array_values($candidates);
    }

    /**
     * Convert the file to plain text with the `textutil` binary, if it is available.
     *
     * @return string|null the command's stdout, or null when the binary is missing,
     *                     the command fails, or the output is blank
     */
    private static function extractDocViaTextutil(string $path): ?string
    {
        $textutil = self::resolveBinary('textutil');
        if ($textutil === null) {
            return null;
        }

        $out = self::runCommand([$textutil, '-convert', 'txt', '-stdout', $path], 45);

        return is_string($out) && trim($out) !== '' ? $out : null;
    }

    /**
     * Convert the file to plain text with a headless `soffice` run, if it is available.
     *
     * The converter writes "<basename>.txt" into a private temporary directory, which
     * is read back and then removed.
     *
     * @return string|null the converted text, or null when the binary is missing, the
     *                     temporary directory cannot be created, or no text was produced
     */
    private static function extractDocViaSoffice(string $path): ?string
    {
        $soffice = self::resolveBinary('soffice');
        if ($soffice === null) {
            return null;
        }

        $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8);
        if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) {
            return null;
        }

        // The command's own result is ignored; success is judged by the output file below.
        self::runCommand([
            $soffice,
            '--headless',
            '--convert-to',
            'txt:Text',
            '--outdir',
            $tmpDir,
            $path,
        ], 120);

        $base = pathinfo($path, PATHINFO_FILENAME);
        $txtPath = $tmpDir.'/'.$base.'.txt';
        $text = is_file($txtPath) ? (file_get_contents($txtPath) ?: '') : '';

        // Best-effort cleanup of the temporary directory.
        @array_map('unlink', glob($tmpDir.'/*') ?: []);
        @rmdir($tmpDir);

        return trim($text) !== '' ? $text : null;
    }

    /**
     * Rate how much a candidate text looks like a complete declaration form.
     *
     * Adds a fixed weight for every known heading/label found, adds the number of
     * line breaks (capped at 80), and subtracts one point per 800 bytes beyond
     * 6000 bytes. The result may be negative for long texts.
     */
    private static function scoreDeclarationText(string $text): int
    {
        $score = 0;
        $markers = [
            '一、线路基本情况' => 100,
            '三、线路规划' => 80,
            '二、线路简介' => 40,
            '四、研学课程' => 40,
            '五、线路收费标准' => 40,
            '六、线路计划实施情况' => 40,
            '线路名称' => 30,
            '组织单位名称' => 20,
            '序号' => 25,
            '研学点名称' => 25,
            '研学活动内容' => 15,
            '研学时长' => 10,
        ];

        foreach ($markers as $marker => $weight) {
            if (str_contains($text, $marker)) {
                $score += $weight;
            }
        }

        $score += min(substr_count($text, "\n"), 80);

        // strlen() counts bytes, not characters; the thresholds are byte-based.
        if (strlen($text) > 6000) {
            $score -= (int) ((strlen($text) - 6000) / 800);
        }

        return $score;
    }

    /**
     * Extract text by loading the file through IOFactory with the 'MsDoc' reader.
     *
     * @return string|null the joined non-empty lines, or null when IOFactory is not
     *                     available, loading throws, or the text contains neither
     *                     '线路基本情况' nor '线路名称'
     */
    private static function extractDocViaPhpWord(string $path): ?string
    {
        if (! class_exists(IOFactory::class)) {
            return null;
        }

        try {
            $phpWord = IOFactory::load($path, 'MsDoc');
        } catch (\Throwable) {
            // Any reader failure simply disqualifies this strategy.
            return null;
        }

        $lines = [];
        foreach ($phpWord->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                $chunk = self::extractPhpWordElementText($element);
                if ($chunk === '') {
                    continue;
                }

                foreach (preg_split('/\R/u', $chunk) ?: [] as $line) {
                    $line = self::normalizeLine((string) $line);
                    if ($line !== '') {
                        $lines[] = $line;
                    }
                }
            }
        }

        $text = self::joinLines($lines);
        if ($text === '' || (! str_contains($text, '线路基本情况') && ! str_contains($text, '线路名称'))) {
            return null;
        }

        return $text;
    }

    /**
     * Extract text by decoding the raw file bytes as UTF-16LE.
     *
     * Only files whose first 8 bytes equal OLERead::IDENTIFIER_OLE are accepted. The
     * decoded text is cut at the first known heading ('一、线路基本情况', else '苏州市',
     * else the beginning), split on control characters, and filtered down to lines
     * that look readable; a line equal to the previously kept line is dropped.
     *
     * @return string|null the joined lines, or null when the file is unreadable, is
     *                     not an OLE file, cannot be decoded, or the result contains
     *                     neither '线路基本情况' nor '线路名称'
     */
    private static function extractDocViaBinaryUtf16(string $path): ?string
    {
        if (! is_readable($path)) {
            return null;
        }

        $data = @file_get_contents($path);
        if (! is_string($data) || strlen($data) < 8) {
            return null;
        }

        if (substr($data, 0, 8) !== OLERead::IDENTIFIER_OLE) {
            return null;
        }

        $raw = self::convertUtf16LeToUtf8($data);
        if (! is_string($raw) || $raw === '') {
            return null;
        }

        $start = mb_strpos($raw, '一、线路基本情况');
        if ($start === false) {
            $start = mb_strpos($raw, '苏州市');
        }
        if ($start === false) {
            $start = 0;
        }

        $chunk = mb_substr($raw, $start);
        // C0/C1 control characters act as line separators in the decoded stream.
        $parts = preg_split('/[\x{0000}-\x{001F}\x{007F}-\x{009F}]+/u', $chunk) ?: [];

        $lines = [];
        $previous = null;
        foreach ($parts as $part) {
            $line = self::normalizeLine((string) $part);
            if ($line === '' || mb_strlen($line) < 2) {
                continue;
            }
            if (self::isGarbledDeclarationLine($line)) {
                continue;
            }
            // Require at least one CJK ideograph, digit or ASCII letter.
            if (! preg_match('/[\x{4e00}-\x{9fff}0-9A-Za-z]/u', $line)) {
                continue;
            }
            if ($line === $previous) {
                continue;
            }

            $previous = $line;
            $lines[] = $line;
        }

        $text = self::joinLines($lines);

        return str_contains($text, '线路基本情况') || str_contains($text, '线路名称') ? $text : null;
    }

    /**
     * Decode a byte string as UTF-16LE into UTF-8.
     *
     * Tries iconv first (invalid sequences ignored), then mb_convert_encoding.
     *
     * @return string|null the decoded text, or null when neither function is
     *                     available or both yield no text
     */
    private static function convertUtf16LeToUtf8(string $data): ?string
    {
        if (function_exists('iconv')) {
            $text = @iconv('UTF-16LE', 'UTF-8//IGNORE', $data);
            if (is_string($text) && $text !== '') {
                return $text;
            }
        }

        if (function_exists('mb_convert_encoding')) {
            $text = @mb_convert_encoding($data, 'UTF-8', 'UTF-16LE');
            if (is_string($text) && $text !== '') {
                return $text;
            }
        }

        return null;
    }

    /**
     * Recursively collect the text of a document element.
     *
     * Text elements yield their text, TextBreak yields a newline, tables yield their
     * non-empty rows joined by newlines, containers yield the concatenation of their
     * children, and any other object exposing getText() yields that text when it is
     * a string. Everything else yields ''.
     */
    private static function extractPhpWordElementText(object $element): string
    {
        if ($element instanceof Text) {
            return (string) $element->getText();
        }

        if ($element instanceof TextBreak) {
            return "\n";
        }

        if ($element instanceof Table) {
            $parts = [];
            foreach ($element->getRows() as $row) {
                if ($row instanceof Row) {
                    $parts[] = self::extractPhpWordRowText($row);
                }
            }

            return implode("\n", array_filter($parts, fn ($part) => $part !== ''));
        }

        if ($element instanceof AbstractContainer) {
            $parts = [];
            foreach ($element->getElements() as $child) {
                $parts[] = self::extractPhpWordElementText($child);
            }

            return implode('', $parts);
        }

        if (method_exists($element, 'getText')) {
            $text = $element->getText();

            return is_string($text) ? $text : '';
        }

        return '';
    }

    /**
     * Return the non-empty, normalized cell texts of a table row, one per line.
     */
    private static function extractPhpWordRowText(Row $row): string
    {
        $cells = [];
        foreach ($row->getCells() as $cell) {
            if ($cell instanceof Cell) {
                $cells[] = self::extractPhpWordCellText($cell);
            }
        }

        return implode("\n", array_values(array_filter(
            array_map(fn ($cellText) => self::normalizeLine($cellText), $cells),
            fn ($cellText) => $cellText !== ''
        )));
    }

    /**
     * Return the concatenated text of all elements in a table cell as a single
     * normalized line.
     */
    private static function extractPhpWordCellText(Cell $cell): string
    {
        $parts = [];
        foreach ($cell->getElements() as $element) {
            $parts[] = self::extractPhpWordElementText($element);
        }

        return self::normalizeLine(implode('', $parts));
    }

    /**
     * Concatenate the content of every w:t descendant of the given node, without
     * separators, and trim the result.
     */
    private static function textFromNode(\DOMXPath $xpath, \DOMNode $node): string
    {
        $parts = [];
        $textNodes = $xpath->query('.//w:t', $node);
        if ($textNodes !== false) {
            foreach ($textNodes as $textNode) {
                $parts[] = $textNode->textContent ?? '';
            }
        }

        return trim(implode('', $parts));
    }

    /**
     * Normalize a single line: replace the UTF-8 encoded no-break space and BOM with
     * a space, collapse every whitespace run to one space, and trim.
     */
    private static function normalizeLine(string $text): string
    {
        $text = str_replace(["\xc2\xa0", "\xef\xbb\xbf"], ' ', $text);
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Normalize each line, join the lines with newlines, and normalize the result
     * as a whole text.
     */
    private static function joinLines(array $lines): string
    {
        $normalized = array_map(fn ($line) => self::normalizeLine((string) $line), $lines);

        return self::normalizeText(implode("\n", $normalized));
    }

    /**
     * Normalize a multi-line text: convert CRLF, CR and form feed to LF, collapse
     * runs of three or more newlines to two, and trim.
     */
    private static function normalizeText(string $text): string
    {
        $text = str_replace(["\r\n", "\r", "\f"], "\n", $text);
        $text = preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Locate an executable for the given command.
     *
     * The bare command name is returned when it is found on PATH; otherwise the
     * first existing, executable entry among a list of well-known absolute paths
     * for 'soffice' and 'textutil' is returned.
     *
     * @return string|null the command name or absolute path, or null when nothing
     *                     executable was found
     */
    private static function resolveBinary(string $command): ?string
    {
        $candidates = [$command];
        if ($command === 'soffice') {
            $candidates = array_merge($candidates, [
                '/usr/bin/soffice',
                '/usr/local/bin/soffice',
                '/opt/libreoffice/program/soffice',
                '/usr/lib/libreoffice/program/soffice',
            ]);
        }
        if ($command === 'textutil') {
            $candidates[] = '/usr/bin/textutil';
        }

        foreach ($candidates as $candidate) {
            if ($candidate === $command) {
                // The bare name is checked against PATH rather than the filesystem root.
                if (self::isExecutableOnPath($command)) {
                    return $command;
                }

                continue;
            }

            if (is_file($candidate) && is_executable($candidate)) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Tell whether the command exists as an executable file in any directory of the
     * PATH environment variable (falling back to '/usr/local/bin:/usr/bin:/bin'
     * when PATH is unset or empty).
     */
    private static function isExecutableOnPath(string $command): bool
    {
        $pathEnv = getenv('PATH');
        if (! is_string($pathEnv) || $pathEnv === '') {
            $pathEnv = '/usr/local/bin:/usr/bin:/bin';
        }

        foreach (explode(':', $pathEnv) as $dir) {
            $dir = trim($dir);
            if ($dir === '') {
                continue;
            }

            $full = rtrim($dir, '/').'/'.$command;
            if (is_file($full) && is_executable($full)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Run an external command and return its standard output.
     *
     * @param list<string> $command        the executable followed by its arguments
     * @param int          $timeoutSeconds timeout passed to the process
     *
     * @return string|null the output, or null when mustRun() throws or the output
     *                     is empty
     */
    private static function runCommand(array $command, int $timeoutSeconds = 30): ?string
    {
        $process = new Process($command);
        $process->setTimeout($timeoutSeconds);

        try {
            $process->mustRun();
        } catch (\Throwable) {
            return null;
        }

        $output = $process->getOutput();

        return is_string($output) && $output !== '' ? $output : null;
    }

    /**
     * Heuristically decide whether a line from the raw UTF-16 scan is garbage rather
     * than declaration text.
     *
     * Purely numeric lines and lines starting like a time ("9:30") are always kept.
     * A line is rejected when it contains C1 control characters or characters from
     * the hard-coded lists below, when it has no CJK ideograph and is at least three
     * characters long, or when fewer than 45% of its characters are CJK ideographs,
     * ASCII alphanumerics, whitespace or the listed punctuation.
     */
    private static function isGarbledDeclarationLine(string $line): bool
    {
        if (preg_match('/^\d+$/', $line)) {
            return false;
        }
        if (preg_match('/^\d{1,2}:\d{2}/', $line)) {
            return false;
        }

        if (preg_match('/[\x{0080}-\x{009F}]/u', $line)) {
            return true;
        }
        // NOTE(review): these two character lists look like typical artefacts of decoding
        // non-text bytes as UTF-16 - confirm against real sample files before editing.
        if (preg_match('/[ᘀ-᛿]/u', $line)) {
            return true;
        }
        if (preg_match('/[漀愀脈摫欀䡒⡯␖]/u', $line)) {
            return true;
        }

        $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
        if ($cjk === 0 && mb_strlen($line) >= 3) {
            return true;
        }

        $len = mb_strlen($line);
        if ($len < 6) {
            // Short lines are accepted as soon as they contain any CJK ideograph.
            return $cjk === 0;
        }

        $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&]/u', $line) ?: 0;

        return ($readable / max($len, 1)) < 0.45;
    }
}