You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

233 lines
7.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Support;
use Illuminate\Support\Str;
use RuntimeException;
use Symfony\Component\Process\Process;
use ZipArchive;
/**
 * Extracts plain text from Word documents.
 *
 * - .docx files are read directly: word/document.xml is pulled out of the zip
 *   container and walked with XPath.
 * - .doc files are converted by an external program: `textutil` is tried first,
 *   then LibreOffice's `soffice` in headless mode.
 *
 * All failures surface as RuntimeException.
 */
class DocTextExtractor
{
    /** WordprocessingML main namespace, bound to the "w" prefix in XPath queries. */
    private const WORD_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    /**
     * Extracts text from the file at $path according to its extension.
     *
     * @param string $path      Path of the document on disk.
     * @param string $extension File extension, with or without a leading dot, any case.
     *
     * @return string Normalized plain text.
     *
     * @throws RuntimeException When the extension is unsupported or extraction fails.
     */
    public static function extract(string $path, string $extension): string
    {
        $extension = strtolower(ltrim($extension, '.'));

        return match ($extension) {
            'docx' => self::extractDocx($path),
            'doc' => self::extractDoc($path),
            default => throw new RuntimeException('仅支持 .doc / .docx 申报表'),
        };
    }

    /**
     * Extracts text from a .docx file.
     *
     * Every body-level paragraph becomes one line and every table cell becomes
     * one line (rows in document order, nested tables included). Block-level
     * content controls (w:sdt) are transparent: their content is treated as if
     * it appeared directly in the body or row.
     *
     * @return string Lines joined with "\n"; runs of 3+ newlines are collapsed.
     *
     * @throws RuntimeException When the archive cannot be opened, has no
     *                          document.xml content, or the XML cannot be parsed.
     */
    public static function extractDocx(string $path): string
    {
        $zip = new ZipArchive();
        if ($zip->open($path) !== true) {
            throw new RuntimeException('无法读取 docx 文件');
        }
        $xml = $zip->getFromName('word/document.xml');
        $zip->close();

        if (! is_string($xml) || trim($xml) === '') {
            throw new RuntimeException('docx 文档内容为空');
        }

        $xpath = new \DOMXPath(self::parseDocumentXml($xml));
        $xpath->registerNamespace('w', self::WORD_NS);

        $bodies = $xpath->query('//w:body');
        if ($bodies === false) {
            return '';
        }

        $lines = [];
        foreach ($bodies as $body) {
            foreach (self::childBlocks($xpath, $body) as $block) {
                $local = $block->localName ?? $block->nodeName;
                if ($local === 'p') {
                    $lines[] = self::normalizeLine(self::textFromNode($xpath, $block));
                } elseif ($local === 'tbl') {
                    foreach (self::tableLines($xpath, $block) as $cellLine) {
                        $lines[] = $cellLine;
                    }
                }
            }
        }

        return self::joinLines($lines);
    }

    /**
     * Extracts text from a legacy .doc file using an external converter.
     *
     * `textutil` is tried first; if it is missing or yields nothing, `soffice`
     * is used to convert the file to text inside a throw-away temp directory.
     *
     * @throws RuntimeException When no converter produced any text, or the
     *                          temp directory could not be created.
     */
    public static function extractDoc(string $path): string
    {
        $textutil = self::resolveBinary('textutil');
        if ($textutil !== null) {
            $out = self::runCommand([$textutil, '-convert', 'txt', '-stdout', $path], 45);
            if ($out !== null && trim($out) !== '') {
                return self::normalizeText($out);
            }
        }

        $soffice = self::resolveBinary('soffice');
        if ($soffice !== null) {
            $text = self::convertWithSoffice($soffice, $path);
            if (trim($text) !== '') {
                return self::normalizeText($text);
            }
        }

        throw new RuntimeException('无法解析 .doc 文件,请安装 LibreOfficesoffice或改用 .docx 格式');
    }

    /**
     * Parses document.xml into a DOM, throwing instead of silently yielding an empty tree.
     *
     * @throws RuntimeException When the XML is not well-formed.
     */
    private static function parseDocumentXml(string $xml): \DOMDocument
    {
        $dom = new \DOMDocument();
        $dom->preserveWhiteSpace = false;

        // Collect libxml diagnostics internally rather than suppressing with "@",
        // and restore the caller's setting afterwards.
        $previous = libxml_use_internal_errors(true);
        try {
            // LIBXML_NONET: the document may be an untrusted upload; never fetch over the network.
            $loaded = $dom->loadXML($xml, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if ($loaded === false) {
            throw new RuntimeException('docx 文档内容无法解析');
        }

        return $dom;
    }

    /**
     * Returns the element children of $container, replacing every w:sdt wrapper
     * by the children of its w:sdtContent (recursively), in document order.
     *
     * @return list<\DOMNode>
     */
    private static function childBlocks(\DOMXPath $xpath, \DOMNode $container): array
    {
        $children = $xpath->query('./*', $container);
        if ($children === false) {
            return [];
        }

        $blocks = [];
        foreach ($children as $child) {
            if (($child->localName ?? $child->nodeName) !== 'sdt') {
                $blocks[] = $child;
                continue;
            }

            $contents = $xpath->query('./w:sdtContent', $child);
            if ($contents === false) {
                continue;
            }
            foreach ($contents as $content) {
                foreach (self::childBlocks($xpath, $content) as $inner) {
                    $blocks[] = $inner;
                }
            }
        }

        return $blocks;
    }

    /**
     * Returns one normalized line per cell for every row of $table, including
     * rows of nested tables (visited in document order).
     *
     * @return list<string>
     */
    private static function tableLines(\DOMXPath $xpath, \DOMNode $table): array
    {
        $rows = $xpath->query('.//w:tr', $table);
        if ($rows === false) {
            return [];
        }

        $lines = [];
        foreach ($rows as $row) {
            foreach (self::childBlocks($xpath, $row) as $cell) {
                if (($cell->localName ?? $cell->nodeName) !== 'tc') {
                    continue;
                }
                // Nested rows are visited by the outer loop, so each cell only
                // contributes text at its own table depth; otherwise nested
                // text would be emitted twice.
                $lines[] = self::normalizeLine(self::textFromNode($xpath, $cell, true));
            }
        }

        return $lines;
    }

    /**
     * Runs soffice to convert $path to text and returns the raw converted text
     * ('' when nothing was produced). The temp directory is always removed.
     *
     * @throws RuntimeException When the temp directory cannot be created.
     */
    private static function convertWithSoffice(string $soffice, string $path): string
    {
        $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8);
        if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) {
            throw new RuntimeException('无法创建临时目录');
        }

        try {
            self::runCommand([
                $soffice,
                '--headless',
                '--convert-to',
                'txt:Text',
                '--outdir',
                $tmpDir,
                $path,
            ], 120);

            $txtPath = $tmpDir.'/'.pathinfo($path, PATHINFO_FILENAME).'.txt';
            if (! is_file($txtPath)) {
                // The directory is fresh, so any .txt in it is the conversion result
                // even if its name differs from what pathinfo() derived.
                $txtPath = (glob($tmpDir.'/*.txt') ?: [])[0] ?? null;
            }
            if ($txtPath === null) {
                return '';
            }

            $text = file_get_contents($txtPath);

            return $text === false ? '' : $text;
        } finally {
            // Best-effort cleanup on every path, including exceptions.
            foreach (glob($tmpDir.'/*') ?: [] as $leftover) {
                @unlink($leftover);
            }
            @rmdir($tmpDir);
        }
    }

    /**
     * Concatenates the content of the w:t elements below $node.
     *
     * @param bool $ownTableLevelOnly When true, text that sits inside a table
     *                                nested deeper than $node is skipped.
     */
    private static function textFromNode(\DOMXPath $xpath, \DOMNode $node, bool $ownTableLevelOnly = false): string
    {
        $query = './/w:t';
        if ($ownTableLevelOnly) {
            $depth = (int) $xpath->evaluate('count(ancestor::w:tbl)', $node);
            $query = sprintf('.//w:t[count(ancestor::w:tbl) = %d]', $depth);
        }

        $parts = [];
        $textNodes = $xpath->query($query, $node);
        if ($textNodes !== false) {
            foreach ($textNodes as $textNode) {
                $parts[] = $textNode->textContent ?? '';
            }
        }

        return trim(implode('', $parts));
    }

    /**
     * Turns NBSP and BOM bytes into spaces, collapses whitespace runs to a
     * single space and trims the result.
     */
    private static function normalizeLine(string $text): string
    {
        $text = str_replace(["\xc2\xa0", "\xef\xbb\xbf"], ' ', $text);
        // preg_replace returns null on error (e.g. invalid UTF-8); keep the input then.
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Normalizes each line and joins them with "\n" into normalized text.
     *
     * @param array<int, mixed> $lines
     */
    private static function joinLines(array $lines): string
    {
        $normalized = array_map(static fn ($line): string => self::normalizeLine((string) $line), $lines);

        return self::normalizeText(implode("\n", $normalized));
    }

    /**
     * Converts CRLF, CR and form feeds to "\n", collapses three or more
     * consecutive newlines into two, and trims the result.
     */
    private static function normalizeText(string $text): string
    {
        $text = str_replace(["\r\n", "\r", "\f"], "\n", $text);
        $text = preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Locates an executable: the bare command name if it is found on PATH,
     * otherwise the first existing executable among known install locations.
     *
     * @return string|null Command name or absolute path; null when not found.
     */
    private static function resolveBinary(string $command): ?string
    {
        if (self::isExecutableOnPath($command)) {
            return $command;
        }

        $knownLocations = match ($command) {
            'soffice' => [
                '/usr/bin/soffice',
                '/usr/local/bin/soffice',
                '/opt/libreoffice/program/soffice',
                '/usr/lib/libreoffice/program/soffice',
            ],
            'textutil' => ['/usr/bin/textutil'],
            default => [],
        };

        foreach ($knownLocations as $candidate) {
            if (is_file($candidate) && is_executable($candidate)) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Reports whether $command is an executable file in one of the PATH
     * directories (a fixed default list is used when PATH is unset or empty).
     */
    private static function isExecutableOnPath(string $command): bool
    {
        $pathEnv = getenv('PATH');
        if (! is_string($pathEnv) || $pathEnv === '') {
            $pathEnv = '/usr/local/bin:/usr/bin:/bin';
        }

        foreach (explode(':', $pathEnv) as $dir) {
            $dir = trim($dir);
            if ($dir === '') {
                continue;
            }
            $full = rtrim($dir, '/').'/'.$command;
            if (is_file($full) && is_executable($full)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Runs a command and returns its standard output.
     *
     * Failures are deliberately swallowed: callers treat null as "this
     * converter did not work" and fall back to the next option.
     *
     * @param array<int, string> $command
     *
     * @return string|null Non-empty stdout, or null on failure, timeout or empty output.
     */
    private static function runCommand(array $command, int $timeoutSeconds = 30): ?string
    {
        $process = new Process($command);
        $process->setTimeout($timeoutSeconds);

        try {
            $process->mustRun();
        } catch (\Throwable) {
            return null;
        }

        $output = $process->getOutput();

        return $output !== '' ? $output : null;
    }
}