self::extractDocx($path), 'doc' => self::extractDoc($path), default => throw new RuntimeException('仅支持 .doc / .docx 申报表'), }; }

    /**
     * Extract the plain text of a .docx file.
     *
     * Reads word/document.xml from the archive and walks the direct children
     * of w:body in document order. Each paragraph contributes one line; each
     * table contributes one line per cell, row by row. Other body-level
     * elements are skipped.
     *
     * @throws RuntimeException when the archive cannot be opened, when
     *                          word/document.xml is missing or blank, or when
     *                          its XML cannot be parsed
     */
    public static function extractDocx(string $path): string
    {
        $zip = new ZipArchive();
        if ($zip->open($path) !== true) {
            throw new RuntimeException('无法读取 docx 文件');
        }
        $xml = $zip->getFromName('word/document.xml');
        $zip->close();

        if (! is_string($xml) || trim($xml) === '') {
            throw new RuntimeException('docx 文档内容为空');
        }

        $dom = new \DOMDocument();
        $dom->preserveWhiteSpace = false;

        // Collect libxml errors internally instead of silencing them with "@",
        // so a malformed document is reported rather than treated as empty.
        // LIBXML_NONET forbids network access while parsing.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            $loaded = $dom->loadXML($xml, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }
        if ($loaded === false) {
            throw new RuntimeException('docx 文档内容无法解析');
        }

        $xpath = new \DOMXPath($dom);
        $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

        $bodyNodes = $xpath->query('//w:body/*');
        if ($bodyNodes === false) {
            return '';
        }

        $lines = [];
        foreach ($bodyNodes as $node) {
            $local = $node->localName ?? $node->nodeName;

            if ($local === 'p') {
                $lines[] = self::normalizeLine(self::textFromNode($xpath, $node));
                continue;
            }
            if ($local !== 'tbl') {
                continue;
            }

            // ".//w:tr" also matches rows of nested tables, so nested table
            // text appears both in its enclosing cell and as its own rows.
            $rowNodes = $xpath->query('.//w:tr', $node);
            if ($rowNodes === false) {
                continue;
            }
            foreach ($rowNodes as $row) {
                $cellNodes = $xpath->query('./w:tc', $row);
                if ($cellNodes === false) {
                    continue;
                }
                foreach ($cellNodes as $cell) {
                    $lines[] = self::normalizeLine(self::textFromNode($xpath, $cell));
                }
            }
        }

        return self::joinLines($lines);
    }

    /**
     * Extract the plain text of a legacy .doc file using external converters.
     *
     * Tries `textutil` first and returns its stdout when non-blank. Otherwise
     * tries `soffice` in headless mode, converting into a private temporary
     * directory that is always removed afterwards.
     *
     * @throws RuntimeException when the temporary directory cannot be created
     *                          or when no converter produced any text
     */
    public static function extractDoc(string $path): string
    {
        $textutil = self::resolveBinary('textutil');
        if ($textutil !== null) {
            $out = self::runCommand([$textutil, '-convert', 'txt', '-stdout', $path], 45);
            if (is_string($out) && trim($out) !== '') {
                return self::normalizeText($out);
            }
        }

        $soffice = self::resolveBinary('soffice');
        if ($soffice !== null) {
            $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8);
            if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) {
                throw new RuntimeException('无法创建临时目录');
            }

            try {
                self::runCommand([
                    $soffice,
                    '--headless',
                    '--convert-to',
                    'txt:Text',
                    '--outdir',
                    $tmpDir,
                    $path,
                ], 120);

                // The converted file is expected under the input's base name.
                $txtPath = $tmpDir.'/'.pathinfo($path, PATHINFO_FILENAME).'.txt';
                $text = '';
                if (is_file($txtPath)) {
                    $raw = file_get_contents($txtPath);
                    // Compare against false explicitly: "?:" would also drop
                    // a document whose entire text is the string "0".
                    $text = $raw === false ? '' : $raw;
                }
            } finally {
                // Remove everything the converter left behind, on every exit
                // path, so the temporary directory never leaks.
                foreach (glob($tmpDir.'/*') ?: [] as $leftover) {
                    @unlink($leftover);
                }
                @rmdir($tmpDir);
            }

            if (trim($text) !== '') {
                return self::normalizeText($text);
            }
        }

        throw new RuntimeException('无法解析 .doc 文件,请安装 LibreOffice(soffice),或改用 .docx 格式');
    }

    /**
     * Concatenate the text of every w:t descendant of $node, in document order.
     *
     * Only w:t elements contribute; no separator is inserted between runs or
     * between paragraphs nested inside the node. The result is trimmed.
     */
    private static function textFromNode(\DOMXPath $xpath, \DOMNode $node): string
    {
        $parts = [];
        $textNodes = $xpath->query('.//w:t', $node);
        if ($textNodes !== false) {
            foreach ($textNodes as $textNode) {
                $parts[] = $textNode->textContent ?? '';
            }
        }

        return trim(implode('', $parts));
    }

    /**
     * Normalize a single line: non-breaking spaces and byte-order marks become
     * spaces, whitespace runs collapse to one space, and the ends are trimmed.
     */
    private static function normalizeLine(string $text): string
    {
        $text = str_replace(["\xc2\xa0", "\xef\xbb\xbf"], ' ', $text);
        // preg_replace() returns null on failure (e.g. invalid UTF-8 with the
        // "u" modifier); fall back to the unmodified text in that case.
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Normalize each entry as a line, join them with "\n", and normalize the
     * resulting text as a whole.
     *
     * @param array<int, mixed> $lines entries are cast to string before use
     */
    private static function joinLines(array $lines): string
    {
        $normalized = array_map(
            static fn (mixed $line): string => self::normalizeLine((string) $line),
            $lines,
        );

        return self::normalizeText(implode("\n", $normalized));
    }

    /**
     * Normalize multi-line text: CRLF, CR and form feed become "\n", runs of
     * three or more newlines collapse to two, and the ends are trimmed.
     */
    private static function normalizeText(string $text): string
    {
        $text = str_replace(["\r\n", "\r", "\f"], "\n", $text);
        $text = preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Locate an executable for $command.
     *
     * Returns the bare command name when it is found on PATH; otherwise the
     * first existing, executable entry from a list of well-known absolute
     * locations for `soffice` or `textutil`; otherwise null.
     */
    private static function resolveBinary(string $command): ?string
    {
        if (self::isExecutableOnPath($command)) {
            return $command;
        }

        $fallbacks = match ($command) {
            'soffice' => [
                '/usr/bin/soffice',
                '/usr/local/bin/soffice',
                '/opt/libreoffice/program/soffice',
                '/usr/lib/libreoffice/program/soffice',
            ],
            'textutil' => ['/usr/bin/textutil'],
            default => [],
        };

        foreach ($fallbacks as $candidate) {
            if (is_file($candidate) && is_executable($candidate)) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Report whether $command exists as an executable file in any PATH entry.
     *
     * Falls back to "/usr/local/bin:/usr/bin:/bin" when PATH is unset or empty.
     */
    private static function isExecutableOnPath(string $command): bool
    {
        $pathEnv = getenv('PATH');
        if (! is_string($pathEnv) || $pathEnv === '') {
            $pathEnv = '/usr/local/bin:/usr/bin:/bin';
        }

        foreach (explode(':', $pathEnv) as $dir) {
            $dir = trim($dir);
            if ($dir === '') {
                continue;
            }
            $full = rtrim($dir, '/').'/'.$command;
            if (is_file($full) && is_executable($full)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Run an external command and return its standard output.
     *
     * This is deliberately best-effort: any failure (non-zero exit, timeout,
     * missing binary) yields null so the caller can try the next converter.
     *
     * @param list<string> $command program followed by its arguments
     *
     * @return string|null the non-empty stdout, or null on failure or when
     *                     the command printed nothing
     */
    private static function runCommand(array $command, int $timeoutSeconds = 30): ?string
    {
        $process = new Process($command);
        $process->setTimeout($timeoutSeconds);

        try {
            $process->mustRun();
        } catch (\Throwable) {
            return null;
        }

        $output = $process->getOutput();

        return is_string($output) && $output !== '' ? $output : null;
    }
}