You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

334 lines
10 KiB

<?php
namespace App\Support;
use Illuminate\Support\Str;
use PhpOffice\PhpWord\Element\AbstractContainer;
use PhpOffice\PhpWord\Element\Cell;
use PhpOffice\PhpWord\Element\Row;
use PhpOffice\PhpWord\Element\Table;
use PhpOffice\PhpWord\Element\Text;
use PhpOffice\PhpWord\Element\TextBreak;
use PhpOffice\PhpWord\IOFactory;
use RuntimeException;
use Symfony\Component\Process\Process;
use ZipArchive;
/**
 * Plain-text extraction for Word application forms (.doc / .docx).
 *
 * .docx files are read straight from the zipped WordprocessingML part
 * `word/document.xml`. Legacy .doc files go through a chain of converters
 * (PhpWord's MsDoc reader, then the `textutil` binary, then LibreOffice's
 * `soffice`) until one of them yields non-empty text.
 */
class DocTextExtractor
{
    /** Namespace URI bound to the `w:` prefix in the XPath queries below. */
    private const WORD_NAMESPACE = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    /** Directories searched when the PATH environment variable is missing or empty. */
    private const DEFAULT_PATH_DIRS = ['/usr/local/bin', '/usr/bin', '/bin'];

    /**
     * Extract the text of a Word document, dispatching on its extension.
     *
     * @param string $path      Path of the document on disk.
     * @param string $extension File extension, with or without a leading dot, any case.
     *
     * @return string Normalised text, one paragraph / table cell per line.
     *
     * @throws RuntimeException When the extension is unsupported or extraction fails.
     */
    public static function extract(string $path, string $extension): string
    {
        $extension = strtolower(ltrim($extension, '.'));

        return match ($extension) {
            'docx' => self::extractDocx($path),
            'doc' => self::extractDoc($path),
            default => throw new RuntimeException('仅支持 .doc / .docx 申报表'),
        };
    }

    /**
     * Extract text from a .docx file.
     *
     * Every direct child paragraph of `w:body` becomes one line and every
     * table cell becomes one line. Other body children are ignored.
     *
     * @param string $path Path of the .docx archive.
     *
     * @return string Normalised text (may be empty when the body has no text).
     *
     * @throws RuntimeException When the archive cannot be opened, the main
     *                          document part is missing/empty, or its XML is malformed.
     */
    public static function extractDocx(string $path): string
    {
        $zip = new ZipArchive();
        if ($zip->open($path) !== true) {
            throw new RuntimeException('无法读取 docx 文件');
        }
        $xml = $zip->getFromName('word/document.xml');
        $zip->close();

        if (! is_string($xml) || trim($xml) === '') {
            throw new RuntimeException('docx 文档内容为空');
        }

        $dom = new \DOMDocument();
        $dom->preserveWhiteSpace = false;

        // Collect libxml errors internally instead of silencing them with `@`,
        // so a parse failure can be detected and reported.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            // LIBXML_NONET: never fetch external resources while parsing an uploaded file.
            $loaded = $dom->loadXML($xml, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }
        if ($loaded !== true) {
            throw new RuntimeException('docx 文档内容无法解析');
        }

        $xpath = new \DOMXPath($dom);
        $xpath->registerNamespace('w', self::WORD_NAMESPACE);

        $bodyNodes = $xpath->query('//w:body/*');
        if ($bodyNodes === false) {
            return '';
        }

        $lines = [];
        foreach ($bodyNodes as $node) {
            $local = $node->localName ?? $node->nodeName;

            if ($local === 'p') {
                $lines[] = self::normalizeLine(self::textFromNode($xpath, $node));
            } elseif ($local === 'tbl') {
                foreach (self::tableCellLines($xpath, $node) as $cellLine) {
                    $lines[] = $cellLine;
                }
            }
        }

        return self::joinLines($lines);
    }

    /**
     * Extract text from a legacy .doc file.
     *
     * Tries PhpWord first, then `textutil`, then `soffice`; the first
     * converter that produces non-blank text wins.
     *
     * @param string $path Path of the .doc file.
     *
     * @return string Normalised text.
     *
     * @throws RuntimeException When no converter produced text, or the
     *                          temporary directory for `soffice` cannot be created.
     */
    public static function extractDoc(string $path): string
    {
        $phpWordText = self::extractDocViaPhpWord($path);
        if (is_string($phpWordText) && trim($phpWordText) !== '') {
            return $phpWordText;
        }

        $text = self::extractDocViaTextutil($path) ?? self::extractDocViaSoffice($path);
        if ($text !== null) {
            return $text;
        }

        throw new RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试');
    }

    /**
     * Flatten one top-level table into a list of cell texts in document order.
     *
     * Rows of nested tables are visited too (`.//w:tr`), so each cell only
     * contributes the text that is not inside a deeper cell; otherwise
     * nested-table text would be emitted twice.
     *
     * @return list<string>
     */
    private static function tableCellLines(\DOMXPath $xpath, \DOMNode $table): array
    {
        $rowNodes = $xpath->query('.//w:tr', $table);
        if ($rowNodes === false) {
            return [];
        }

        $cellLines = [];
        foreach ($rowNodes as $row) {
            $cellNodes = $xpath->query('./w:tc', $row);
            if ($cellNodes === false) {
                continue;
            }
            foreach ($cellNodes as $cell) {
                $cellLines[] = self::normalizeLine(self::ownCellText($xpath, $cell));
            }
        }

        return $cellLines;
    }

    /**
     * Concatenate the `w:t` runs that belong to this cell itself, skipping
     * runs that sit inside a cell of a nested table.
     */
    private static function ownCellText(\DOMXPath $xpath, \DOMNode $cell): string
    {
        // A run belongs to this cell when it has exactly as many `w:tc`
        // ancestors as the cell has (counting the cell itself).
        $depth = (int) $xpath->evaluate('count(ancestor-or-self::w:tc)', $cell);
        $textNodes = $xpath->query(".//w:t[count(ancestor::w:tc) = {$depth}]", $cell);
        if ($textNodes === false) {
            return '';
        }

        $text = '';
        foreach ($textNodes as $textNode) {
            $text .= $textNode->textContent ?? '';
        }

        return trim($text);
    }

    /**
     * Convert a .doc file with the `textutil` binary, if one is available.
     *
     * @return string|null Normalised text, or null when the binary is missing,
     *                     the command fails, or the output is blank.
     */
    private static function extractDocViaTextutil(string $path): ?string
    {
        $textutil = self::resolveBinary('textutil');
        if ($textutil === null) {
            return null;
        }

        $output = self::runCommand([$textutil, '-convert', 'txt', '-stdout', $path], 45);
        if (! is_string($output) || trim($output) === '') {
            return null;
        }

        return self::normalizeText($output);
    }

    /**
     * Convert a .doc file with headless LibreOffice, if `soffice` is available.
     *
     * The conversion writes into a private temporary directory which is
     * always removed again, whatever the outcome.
     *
     * @return string|null Normalised text, or null when the binary is missing
     *                     or no non-blank text file was produced.
     *
     * @throws RuntimeException When the temporary directory cannot be created.
     */
    private static function extractDocViaSoffice(string $path): ?string
    {
        $soffice = self::resolveBinary('soffice');
        if ($soffice === null) {
            return null;
        }

        $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8);
        if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) {
            throw new RuntimeException('无法创建临时目录');
        }

        try {
            // Request UTF-8 explicitly: the plain `Text` filter encodes with the
            // process locale, which can mangle non-ASCII text on servers that
            // run without a UTF-8 locale.
            self::runCommand([
                $soffice,
                '--headless',
                '--convert-to',
                'txt:Text (encoded):UTF8',
                '--outdir',
                $tmpDir,
                $path,
            ], 120);

            $txtPath = $tmpDir.'/'.pathinfo($path, PATHINFO_FILENAME).'.txt';
            if (! is_file($txtPath)) {
                // The directory is ours alone, so any .txt in it is the result
                // even if the base name was derived differently.
                $txtPath = (glob($tmpDir.'/*.txt') ?: [])[0] ?? null;
            }
            if ($txtPath === null) {
                return null;
            }

            $raw = file_get_contents($txtPath);
            if (! is_string($raw)) {
                return null;
            }
            if (str_starts_with($raw, "\xef\xbb\xbf")) {
                // Drop a leading UTF-8 byte-order mark so it does not end up in the text.
                $raw = substr($raw, 3);
            }

            return trim($raw) !== '' ? self::normalizeText($raw) : null;
        } finally {
            // Best-effort cleanup; failures here must not mask the result.
            foreach (glob($tmpDir.'/*') ?: [] as $leftover) {
                if (is_file($leftover)) {
                    @unlink($leftover);
                }
            }
            @rmdir($tmpDir);
        }
    }

    /**
     * Read a .doc file through PhpWord's `MsDoc` reader.
     *
     * Any failure inside PhpWord (loading or walking the document) is treated
     * as "this converter cannot handle the file" so the caller can fall back
     * to the external converters.
     *
     * @return string|null Normalised text, or null when PhpWord failed or found no text.
     */
    private static function extractDocViaPhpWord(string $path): ?string
    {
        $lines = [];

        try {
            $phpWord = IOFactory::load($path, 'MsDoc');

            foreach ($phpWord->getSections() as $section) {
                foreach ($section->getElements() as $element) {
                    $chunk = self::extractPhpWordElementText($element);
                    if ($chunk === '') {
                        continue;
                    }
                    foreach (preg_split('/\R/u', $chunk) ?: [] as $line) {
                        $line = self::normalizeLine((string) $line);
                        if ($line !== '') {
                            $lines[] = $line;
                        }
                    }
                }
            }
        } catch (\Throwable) {
            return null;
        }

        $text = self::joinLines($lines);

        return trim($text) !== '' ? $text : null;
    }

    /**
     * Recursively collect the text of a PhpWord element.
     *
     * Text elements yield their text, breaks yield a newline, tables yield
     * one line per non-empty cell, containers yield the concatenation of
     * their children. Anything else is asked for `getText()` when it has one.
     */
    private static function extractPhpWordElementText(object $element): string
    {
        if ($element instanceof Text) {
            return (string) $element->getText();
        }

        if ($element instanceof TextBreak) {
            return "\n";
        }

        if ($element instanceof Table) {
            $rowTexts = [];
            foreach ($element->getRows() as $row) {
                if ($row instanceof Row) {
                    $rowTexts[] = self::extractPhpWordRowText($row);
                }
            }

            return implode("\n", array_filter($rowTexts, static fn ($rowText) => $rowText !== ''));
        }

        if ($element instanceof AbstractContainer) {
            $text = '';
            foreach ($element->getElements() as $child) {
                $text .= self::extractPhpWordElementText($child);
            }

            return $text;
        }

        if (method_exists($element, 'getText')) {
            $text = $element->getText();
            if (is_string($text)) {
                return $text;
            }
            // getText() may hand back a nested element instead of a string
            // (e.g. a title wrapping a text run); descend into it rather
            // than dropping its text.
            if (is_object($text)) {
                return self::extractPhpWordElementText($text);
            }
        }

        return '';
    }

    /**
     * Text of one PhpWord table row: its non-empty cell texts, one per line.
     */
    private static function extractPhpWordRowText(Row $row): string
    {
        $cellTexts = [];
        foreach ($row->getCells() as $cell) {
            if (! $cell instanceof Cell) {
                continue;
            }
            $cellText = self::normalizeLine(self::extractPhpWordCellText($cell));
            if ($cellText !== '') {
                $cellTexts[] = $cellText;
            }
        }

        return implode("\n", $cellTexts);
    }

    /**
     * Text of one PhpWord table cell, collapsed to a single normalised line.
     */
    private static function extractPhpWordCellText(Cell $cell): string
    {
        $text = '';
        foreach ($cell->getElements() as $element) {
            $text .= self::extractPhpWordElementText($element);
        }

        return self::normalizeLine($text);
    }

    /**
     * Concatenate the content of every `w:t` descendant of a node, trimmed.
     */
    private static function textFromNode(\DOMXPath $xpath, \DOMNode $node): string
    {
        $textNodes = $xpath->query('.//w:t', $node);
        if ($textNodes === false) {
            return '';
        }

        $text = '';
        foreach ($textNodes as $textNode) {
            $text .= $textNode->textContent ?? '';
        }

        return trim($text);
    }

    /**
     * Collapse a piece of text to one line: no-break spaces and byte-order
     * marks become spaces, whitespace runs become a single space, ends are trimmed.
     */
    private static function normalizeLine(string $text): string
    {
        $text = str_replace(["\xc2\xa0", "\xef\xbb\xbf"], ' ', $text);
        // preg_replace() returns null on failure (e.g. invalid UTF-8 with /u); keep the input then.
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Normalise each line and join them into one newline-separated text.
     *
     * @param array<int, mixed> $lines Values are cast to string.
     */
    private static function joinLines(array $lines): string
    {
        $normalized = array_map(static fn ($line) => self::normalizeLine((string) $line), $lines);

        return self::normalizeText(implode("\n", $normalized));
    }

    /**
     * Unify line endings (CRLF, CR and form feed become LF), cap consecutive
     * newlines at two, and trim the result.
     */
    private static function normalizeText(string $text): string
    {
        $text = str_replace(["\r\n", "\r", "\f"], "\n", $text);
        $text = preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Locate an external converter binary.
     *
     * The PATH is searched first; for `soffice` and `textutil` a few
     * well-known install locations are tried afterwards.
     *
     * @return string|null Absolute path of the executable, or null when none was found.
     */
    private static function resolveBinary(string $command): ?string
    {
        // Return the resolved location rather than the bare name, so the
        // command that is run is exactly the file that was checked.
        $onPath = self::locateOnPath($command);
        if ($onPath !== null) {
            return $onPath;
        }

        $fallbacks = match ($command) {
            'soffice' => [
                '/usr/bin/soffice',
                '/usr/local/bin/soffice',
                '/opt/libreoffice/program/soffice',
                '/usr/lib/libreoffice/program/soffice',
            ],
            'textutil' => ['/usr/bin/textutil'],
            default => [],
        };

        foreach ($fallbacks as $candidate) {
            if (is_file($candidate) && is_executable($candidate)) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Whether an executable file with this name exists in one of the PATH directories.
     */
    private static function isExecutableOnPath(string $command): bool
    {
        return self::locateOnPath($command) !== null;
    }

    /**
     * Find the first executable file named $command in the PATH directories.
     *
     * Falls back to a small default directory list when the PATH environment
     * variable is missing or empty.
     */
    private static function locateOnPath(string $command): ?string
    {
        $pathEnv = getenv('PATH');
        $dirs = is_string($pathEnv) && $pathEnv !== ''
            ? explode(PATH_SEPARATOR, $pathEnv)
            : self::DEFAULT_PATH_DIRS;

        foreach ($dirs as $dir) {
            $dir = trim($dir);
            if ($dir === '') {
                continue;
            }
            $full = rtrim($dir, '/').'/'.$command;
            if (is_file($full) && is_executable($full)) {
                return $full;
            }
        }

        return null;
    }

    /**
     * Run an external command and return its standard output.
     *
     * Failures (non-zero exit, timeout, missing binary, ...) are deliberately
     * swallowed: callers treat null as "this converter did not work".
     *
     * @param array<int, string> $command        Program and arguments, passed without a shell.
     * @param int                $timeoutSeconds Maximum run time in seconds.
     *
     * @return string|null Standard output, or null when the command failed or printed nothing.
     */
    private static function runCommand(array $command, int $timeoutSeconds = 30): ?string
    {
        $process = new Process($command);
        $process->setTimeout($timeoutSeconds);

        try {
            $process->mustRun();
        } catch (\Throwable) {
            return null;
        }

        $output = $process->getOutput();

        return $output !== '' ? $output : null;
    }
}