self::extractDocx($path), 'doc' => self::extractDoc($path), default => throw new RuntimeException('仅支持 .doc / .docx 申报表'), }; }

    /**
     * Extract the text of a .docx file as newline-separated lines.
     *
     * Reads word/document.xml from the archive and walks the direct children
     * of w:body in document order: each paragraph becomes one line, and each
     * table cell becomes one line (row by row, cell by cell).
     *
     * @param string $path Path to the .docx file.
     *
     * @return string Normalized text; empty string when the body query fails.
     *
     * @throws RuntimeException When the archive cannot be opened, or when
     *                          word/document.xml is missing, blank, or not well-formed XML.
     */
    public static function extractDocx(string $path): string
    {
        $zip = new ZipArchive();
        if ($zip->open($path) !== true) {
            throw new RuntimeException('无法读取 docx 文件');
        }

        $xml = $zip->getFromName('word/document.xml');
        $zip->close();

        if (! is_string($xml) || trim($xml) === '') {
            throw new RuntimeException('docx 文档内容为空');
        }

        $dom = new \DOMDocument();
        $dom->preserveWhiteSpace = false;

        // Collect libxml errors internally instead of silencing them with "@",
        // and restore the previous global setting afterwards.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            // LIBXML_NONET: never fetch network resources while parsing an uploaded document.
            $loaded = $dom->loadXML($xml, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        // A parse failure used to be ignored, which silently produced an empty result.
        if (! $loaded) {
            throw new RuntimeException('docx 文档内容无法解析');
        }

        $xpath = new \DOMXPath($dom);
        $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

        $bodyNodes = $xpath->query('//w:body/*');
        if ($bodyNodes === false) {
            return '';
        }

        $lines = [];
        foreach ($bodyNodes as $node) {
            $local = $node->localName ?? $node->nodeName;

            if ($local === 'p') {
                $lines[] = self::normalizeLine(self::textFromNode($xpath, $node));
                continue;
            }

            if ($local !== 'tbl') {
                continue;
            }

            // NOTE(review): './/w:tr' matches rows at any depth, so rows of nested
            // tables are visited here too, while textFromNode() on the enclosing
            // cell already includes their text. Nested-table text is therefore
            // emitted twice — confirm whether downstream parsing relies on this.
            $rowNodes = $xpath->query('.//w:tr', $node);
            if ($rowNodes === false) {
                continue;
            }

            foreach ($rowNodes as $row) {
                $cellNodes = $xpath->query('./w:tc', $row);
                if ($cellNodes === false) {
                    continue;
                }

                // Every cell contributes exactly one line, regardless of how many cells the row has.
                foreach ($cellNodes as $cell) {
                    $lines[] = self::normalizeLine(self::textFromNode($xpath, $cell));
                }
            }
        }

        return self::joinLines($lines);
    }

    /**
     * Extract the text of a legacy .doc file by delegating to an external converter.
     *
     * Tries `textutil` first, then `soffice`; the first converter that yields
     * non-blank text wins.
     *
     * @param string $path Path to the .doc file.
     *
     * @return string Normalized text.
     *
     * @throws RuntimeException When the temporary directory cannot be created, or when
     *                          no converter is available / none produced any text.
     */
    public static function extractDoc(string $path): string
    {
        if (self::commandExists('textutil')) {
            $out = self::runCommand(['textutil', '-convert', 'txt', '-stdout', $path], 45);
            if (is_string($out) && trim($out) !== '') {
                return self::normalizeText($out);
            }
        }

        if (self::commandExists('soffice')) {
            $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8);
            if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) {
                throw new RuntimeException('无法创建临时目录');
            }

            try {
                // Request UTF-8 explicitly: the plain "txt:Text" filter writes in the
                // locale's encoding, which can corrupt non-ASCII text on servers
                // that do not run under a UTF-8 locale.
                self::runCommand([
                    'soffice',
                    '--headless',
                    '--convert-to',
                    'txt:Text (encoded):UTF8',
                    '--outdir',
                    $tmpDir,
                    $path,
                ], 120);

                $txtPath = $tmpDir.'/'.pathinfo($path, PATHINFO_FILENAME).'.txt';
                if (! is_file($txtPath)) {
                    // The directory is private to this call, so any .txt inside it is the
                    // converter's output even if its name differs from what pathinfo() derived.
                    $txtPath = (glob($tmpDir.'/*.txt') ?: [])[0] ?? null;
                }

                if (is_string($txtPath) && is_file($txtPath)) {
                    $text = file_get_contents($txtPath) ?: '';
                    if (trim($text) !== '') {
                        return self::normalizeText($text);
                    }
                }
            } finally {
                // Always remove the scratch directory, on success and on failure alike.
                foreach (glob($tmpDir.'/*') ?: [] as $leftover) {
                    @unlink($leftover);
                }
                @rmdir($tmpDir);
            }
        }

        throw new RuntimeException('无法解析 .doc 文件,请安装 textutil 或 LibreOffice,或改用 .docx 格式');
    }

    /**
     * Concatenate the content of every w:t descendant of a node.
     *
     * Pieces are joined without a separator, so text from separate runs or
     * paragraphs under the same node ends up adjacent. Only w:t elements are
     * read; other elements contribute nothing.
     *
     * @param \DOMXPath $xpath XPath evaluator with the "w" prefix registered.
     * @param \DOMNode  $node  Context node to search beneath.
     *
     * @return string Trimmed concatenated text; empty when nothing matched.
     */
    private static function textFromNode(\DOMXPath $xpath, \DOMNode $node): string
    {
        $textNodes = $xpath->query('.//w:t', $node);
        if ($textNodes === false) {
            return '';
        }

        $buffer = '';
        foreach ($textNodes as $textNode) {
            $buffer .= $textNode->textContent ?? '';
        }

        return trim($buffer);
    }

    /**
     * Normalize a single line: turn NBSP and BOM byte sequences into spaces,
     * collapse every whitespace run into one space, and trim the ends.
     *
     * If the regex replacement fails, the text is kept as it was before that step.
     */
    private static function normalizeLine(string $text): string
    {
        $text = str_replace(["\xc2\xa0", "\xef\xbb\xbf"], ' ', $text);
        $collapsed = preg_replace('/\s+/u', ' ', $text);

        return trim($collapsed ?? $text);
    }

    /**
     * Normalize each entry as a line, join them with "\n", and normalize the whole text.
     *
     * @param array $lines Values are cast to string before normalization.
     */
    private static function joinLines(array $lines): string
    {
        $normalized = array_map(
            static fn ($line): string => self::normalizeLine((string) $line),
            $lines,
        );

        return self::normalizeText(implode("\n", $normalized));
    }

    /**
     * Normalize a multi-line text: drop a leading UTF-8 BOM, convert CRLF, CR and
     * form feed to "\n", squeeze three or more consecutive newlines down to two,
     * and trim the ends.
     */
    private static function normalizeText(string $text): string
    {
        // Converter output may start with a BOM, which trim() would not remove.
        if (str_starts_with($text, "\xef\xbb\xbf")) {
            $text = substr($text, 3);
        }

        $text = str_replace(["\r\n", "\r", "\f"], "\n", $text);
        $text = preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Report whether a command can be resolved by the shell.
     *
     * Runs `command -v <name>` through shell_exec() and treats any non-blank
     * output as "found".
     */
    private static function commandExists(string $command): bool
    {
        $resolved = shell_exec('command -v '.escapeshellarg($command).' 2>/dev/null');

        return trim((string) $resolved) !== '';
    }

    /**
     * Run an external command and return its standard output.
     *
     * Best-effort by design: anything thrown while running the process is
     * swallowed and reported as null, so callers can fall through to the next
     * strategy.
     *
     * @param list<string> $command        Program followed by its arguments.
     * @param int          $timeoutSeconds Timeout handed to the process.
     *
     * @return string|null Output, or null when the run failed or printed nothing.
     */
    private static function runCommand(array $command, int $timeoutSeconds = 30): ?string
    {
        $process = new Process($command);
        $process->setTimeout($timeoutSeconds);

        try {
            $process->mustRun();
        } catch (\Throwable) {
            return null;
        }

        $output = $process->getOutput();

        return $output !== '' ? $output : null;
    }
}