From 4e18dedf8b66c41dfd905534827680f951575d74 Mon Sep 17 00:00:00 2001
From: lion <120344285@qq.com>
Date: Tue, 30 Jun 2026 14:31:08 +0800
Subject: [PATCH] =?UTF-8?q?=E7=A0=94=E5=AD=A6=E7=BA=BF=E8=B7=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/Support/DocTextExtractor.php | 122 +++++++++++++++++-
 composer.json                    |   3 +-
 composer.lock                    | 175 ++++++++++++++++++++++++++++++-
 3 files changed, 297 insertions(+), 3 deletions(-)

diff --git a/app/Support/DocTextExtractor.php b/app/Support/DocTextExtractor.php
index d4ee71b..ef4dee7 100644
--- a/app/Support/DocTextExtractor.php
+++ b/app/Support/DocTextExtractor.php
@@ -3,6 +3,13 @@
 namespace App\Support;
 
 use Illuminate\Support\Str;
+use PhpOffice\PhpWord\Element\AbstractContainer;
+use PhpOffice\PhpWord\Element\Cell;
+use PhpOffice\PhpWord\Element\Row;
+use PhpOffice\PhpWord\Element\Table;
+use PhpOffice\PhpWord\Element\Text;
+use PhpOffice\PhpWord\Element\TextBreak;
+use PhpOffice\PhpWord\IOFactory;
 use RuntimeException;
 use Symfony\Component\Process\Process;
 use ZipArchive;
@@ -83,6 +90,12 @@ class DocTextExtractor
 
     public static function extractDoc(string $path): string
     {
+        // Prefer the pure-PHP reader so parsing does not depend on external binaries.
+        $phpWordText = self::extractDocViaPhpWord($path);
+        if (is_string($phpWordText) && trim($phpWordText) !== '') {
+            return $phpWordText;
+        }
+
         $textutil = self::resolveBinary('textutil');
         if ($textutil !== null) {
             $out = self::runCommand([$textutil, '-convert', 'txt', '-stdout', $path], 45);
@@ -121,7 +134,114 @@ class DocTextExtractor
             @rmdir($tmpDir);
         }
 
-        throw new RuntimeException('无法解析 .doc 文件,请安装 LibreOffice(soffice),或改用 .docx 格式');
+        throw new RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试');
+    }
+
+    /**
+     * Extract plain text from a legacy .doc file with PHPWord's MsDoc reader.
+     *
+     * @return string|null Extracted text, or null when the file cannot be loaded or yields no text,
+     *                     so the caller can fall back to another extraction strategy.
+     */
+    private static function extractDocViaPhpWord(string $path): ?string
+    {
+        try {
+            $phpWord = IOFactory::load($path, 'MsDoc');
+        } catch (\Throwable) {
+            // Best effort: any reader failure just means "try the next strategy".
+            return null;
+        }
+
+        $lines = [];
+        foreach ($phpWord->getSections() as $section) {
+            foreach ($section->getElements() as $element) {
+                $chunk = self::extractPhpWordElementText($element);
+                if ($chunk === '') {
+                    continue;
+                }
+                // An element may yield several lines (tables, breaks); normalize each one separately.
+                foreach (preg_split('/\R/u', $chunk) ?: [] as $line) {
+                    $line = self::normalizeLine((string) $line);
+                    if ($line !== '') {
+                        $lines[] = $line;
+                    }
+                }
+            }
+        }
+
+        $text = self::joinLines($lines);
+
+        return trim($text) !== '' ? $text : null;
+    }
+
+    /**
+     * Recursively collect the text of a PHPWord element.
+     *
+     * Tables produce one line per non-empty cell, containers concatenate their children,
+     * and any other element contributes getText() when that returns a string.
+     */
+    private static function extractPhpWordElementText(object $element): string
+    {
+        // TextBreak extends Text in PHPWord, so it must be tested first; otherwise the
+        // Text branch swallows it and the line break is lost.
+        if ($element instanceof TextBreak) {
+            return "\n";
+        }
+        if ($element instanceof Text) {
+            return (string) $element->getText();
+        }
+        if ($element instanceof Table) {
+            $parts = [];
+            foreach ($element->getRows() as $row) {
+                if ($row instanceof Row) {
+                    $parts[] = self::extractPhpWordRowText($row);
+                }
+            }
+
+            return implode("\n", array_filter($parts, fn ($part) => $part !== ''));
+        }
+        if ($element instanceof AbstractContainer) {
+            $parts = [];
+            foreach ($element->getElements() as $child) {
+                $parts[] = self::extractPhpWordElementText($child);
+            }
+
+            return implode('', $parts);
+        }
+        if (method_exists($element, 'getText')) {
+            $text = $element->getText();
+
+            return is_string($text) ? $text : '';
+        }
+
+        return '';
+    }
+
+    /** Join the normalized, non-empty cell texts of a table row, one per line. */
+    private static function extractPhpWordRowText(Row $row): string
+    {
+        $cells = [];
+        foreach ($row->getCells() as $cell) {
+            if ($cell instanceof Cell) {
+                $cells[] = self::extractPhpWordCellText($cell);
+            }
+        }
+
+        return implode("\n", array_values(array_filter(
+            array_map(fn ($cellText) => self::normalizeLine($cellText), $cells),
+            fn ($cellText) => $cellText !== ''
+        )));
+    }
+
+    /** Concatenate the text of every element in a cell and pass it through normalizeLine(). */
+    private static function extractPhpWordCellText(Cell $cell): string
+    {
+        $parts = [];
+        foreach ($cell->getElements() as $element) {
+            $parts[] = self::extractPhpWordElementText($element);
+        }
+
+        return self::normalizeLine(implode('', $parts));
     }
 
     private static function textFromNode(\DOMXPath $xpath, \DOMNode $node): string
diff --git a/composer.json b/composer.json
index 7eb8c07..3c7a70a 100644
--- a/composer.json
+++ b/composer.json
@@ -11,7 +11,8 @@
         "laravel/sanctum": "^3.3",
         "laravel/tinker": "^2.8",
         "overtrue/pinyin": "^5.0",
-        "phpoffice/phpspreadsheet": "^5.3"
+        "phpoffice/phpspreadsheet": "^5.3",
+        "phpoffice/phpword": "^1.3"
     },
     "require-dev": {
         "fakerphp/faker": "^1.9.1",
diff --git a/composer.lock b/composer.lock
index 7694c05..e05b13e 100644
--- a/composer.lock
+++ b/composer.lock
@@ -4,7 +4,7 @@
         "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
         "This file is @generated automatically"
     ],
-    "content-hash": "2fd1262c3117ca0367f247bc6998ea9b",
+    "content-hash": "1a85fb70206d510a41d0911211e84521",
     "packages": [
         {
             "name": "brick/math",
@@ -2938,6 +2938,64 @@
             ],
             "time": "2025-03-16T02:16:27+00:00"
         },
+        {
+            "name": "phpoffice/math",
+            "version": "0.2.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/PHPOffice/Math.git",
+                "reference": "fc2eb6d1a61b058d5dac77197059db30ee3c8329"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/PHPOffice/Math/zipball/fc2eb6d1a61b058d5dac77197059db30ee3c8329",
+                "reference": "fc2eb6d1a61b058d5dac77197059db30ee3c8329",
+                "shasum": "",
+                "mirrors": [
+                    {
+                        "url": "https://mirrors.aliyun.com/composer/dists/%package%/%reference%.%type%",
+                        "preferred": true
+                    }
+                ]
+            },
+            "require": {
+                "ext-dom": "*",
+                "ext-xml": "*",
+                "php": "^7.1|^8.0"
+            },
+            "require-dev": {
+                "phpstan/phpstan": "^0.12.88 || ^1.0.0",
+                "phpunit/phpunit": "^7.0 || ^9.0"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "PhpOffice\\Math\\": "src/Math/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "Progi1984",
+                    "homepage": "https://lefevre.dev"
+                }
+            ],
+            "description": "Math - Manipulate Math Formula",
+            "homepage": "https://phpoffice.github.io/Math/",
+            "keywords": [
+                "MathML",
+                "officemathml",
+                "php"
+            ],
+            "support": {
+                "issues": "https://github.com/PHPOffice/Math/issues",
+                "source": "https://github.com/PHPOffice/Math/tree/0.2.0"
+            },
+            "time": "2024-08-12T07:30:45+00:00"
+        },
         {
             "name": "phpoffice/phpspreadsheet",
             "version": "5.3.0",
@@ -3050,6 +3108,121 @@
             },
             "time": "2025-11-24T15:47:10+00:00"
         },
+        {
+            "name": "phpoffice/phpword",
+            "version": "1.3.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/PHPOffice/PHPWord.git",
+                "reference": "8392134ce4b5dba65130ba956231a1602b848b7f"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/PHPOffice/PHPWord/zipball/8392134ce4b5dba65130ba956231a1602b848b7f",
+                "reference": "8392134ce4b5dba65130ba956231a1602b848b7f",
+                "shasum": "",
+                "mirrors": [
+                    {
+                        "url": "https://mirrors.aliyun.com/composer/dists/%package%/%reference%.%type%",
+                        "preferred": true
+                    }
+                ]
+            },
+            "require": {
+                "ext-dom": "*",
+                "ext-json": "*",
+                "ext-xml": "*",
+                "php": "^7.1|^8.0",
+                "phpoffice/math": "^0.2"
+            },
+            "require-dev": {
+                "dompdf/dompdf": "^2.0",
+                "ext-gd": "*",
+                "ext-libxml": "*",
+                "ext-zip": "*",
+                "friendsofphp/php-cs-fixer": "^3.3",
+                "mpdf/mpdf": "^8.1",
+                "phpmd/phpmd": "^2.13",
+                "phpstan/phpstan-phpunit": "@stable",
+                "phpunit/phpunit": ">=7.0",
+                "symfony/process": "^4.4 || ^5.0",
+                "tecnickcom/tcpdf": "^6.5"
+            },
+            "suggest": {
+                "dompdf/dompdf": "Allows writing PDF",
+                "ext-gd2": "Allows adding images",
+                "ext-xmlwriter": "Allows writing OOXML and ODF",
+                "ext-xsl": "Allows applying XSL style sheet to headers, to main document part, and to footers of an OOXML template",
+                "ext-zip": "Allows writing OOXML and ODF"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "PhpOffice\\PhpWord\\": "src/PhpWord"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "LGPL-3.0"
+            ],
+            "authors": [
+                {
+                    "name": "Mark Baker"
+                },
+                {
+                    "name": "Gabriel Bull",
+                    "email": "me@gabrielbull.com",
+                    "homepage": "http://gabrielbull.com/"
+                },
+                {
+                    "name": "Franck Lefevre",
+                    "homepage": "https://rootslabs.net/blog/"
+                },
+                {
+                    "name": "Ivan Lanin",
+                    "homepage": "http://ivan.lanin.org"
+                },
+                {
+                    "name": "Roman Syroeshko",
+                    "homepage": "http://ru.linkedin.com/pub/roman-syroeshko/34/a53/994/"
+                },
+                {
+                    "name": "Antoine de Troostembergh"
+                }
+            ],
+            "description": "PHPWord - A pure PHP library for reading and writing word processing documents (OOXML, ODF, RTF, HTML, PDF)",
+            "homepage": "https://phpoffice.github.io/PHPWord/",
+            "keywords": [
+                "ISO IEC 29500",
+                "OOXML",
+                "Office Open XML",
+                "OpenDocument",
+                "OpenXML",
+                "PhpOffice",
+                "PhpWord",
+                "Rich Text Format",
+                "WordprocessingML",
+                "doc",
+                "docx",
+                "html",
+                "odf",
+                "odt",
+                "office",
+                "pdf",
+                "php",
+                "reader",
+                "rtf",
+                "template",
+                "template processor",
+                "word",
+                "writer"
+            ],
+            "support": {
+                "issues": "https://github.com/PHPOffice/PHPWord/issues",
+                "source": "https://github.com/PHPOffice/PHPWord/tree/1.3.0"
+            },
+            "time": "2024-08-30T18:03:42+00:00"
+        },
         {
             "name": "phpoption/phpoption",
             "version": "1.9.3",