You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

535 lines
16 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Support;
use Illuminate\Support\Str;
use PhpOffice\PhpWord\Element\AbstractContainer;
use PhpOffice\PhpWord\Element\Cell;
use PhpOffice\PhpWord\Element\Row;
use PhpOffice\PhpWord\Element\Table;
use PhpOffice\PhpWord\Element\Text;
use PhpOffice\PhpWord\Element\TextBreak;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\Shared\OLERead;
use RuntimeException;
use Symfony\Component\Process\Process;
use ZipArchive;
class DocTextExtractor
{
/**
 * Extract plain text from a declaration form, dispatching on the file extension.
 *
 * The extension is lower-cased and any leading dots are stripped before it is
 * compared, so "docx", ".docx" and ".DOCX" are all accepted.
 *
 * @param string $path      Path of the document to read.
 * @param string $extension File extension, with or without a leading dot.
 *
 * @return string The extracted text.
 *
 * @throws RuntimeException When the extension is neither "doc" nor "docx",
 *                          or when the selected extractor throws.
 */
public static function extract(string $path, string $extension): string
{
    $kind = strtolower(ltrim($extension, '.'));

    if ($kind === 'docx') {
        return self::extractDocx($path);
    }

    if ($kind === 'doc') {
        return self::extractDoc($path);
    }

    throw new RuntimeException('仅支持 .doc / .docx 申报表');
}
/**
 * Extract the text of a .docx file by reading word/document.xml directly.
 *
 * The direct children of <w:body> are visited in document order. Each
 * paragraph contributes one line; each table contributes one line per cell,
 * row by row. Lines are whitespace-normalised and joined with newlines.
 *
 * @param string $path Path of the .docx archive.
 *
 * @return string Newline-joined text of the document body.
 *
 * @throws RuntimeException When the archive cannot be opened, when
 *                          word/document.xml is missing or blank, or when
 *                          its XML cannot be parsed.
 */
public static function extractDocx(string $path): string
{
    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new RuntimeException('无法读取 docx 文件');
    }
    $xml = $zip->getFromName('word/document.xml');
    $zip->close();
    if (! is_string($xml) || trim($xml) === '') {
        throw new RuntimeException('docx 文档内容为空');
    }

    $dom = new \DOMDocument();
    $dom->preserveWhiteSpace = false;

    // Collect libxml diagnostics internally instead of silencing them with
    // "@", so that a parse failure can be detected and reported like the
    // other unreadable-file cases rather than yielding an empty string.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        $loaded = $dom->loadXML($xml);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }
    if ($loaded === false) {
        throw new RuntimeException('无法读取 docx 文件');
    }

    $xpath = new \DOMXPath($dom);
    $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

    $bodyNodes = $xpath->query('//w:body/*');
    if ($bodyNodes === false) {
        return '';
    }

    $lines = [];
    foreach ($bodyNodes as $node) {
        $local = $node->localName ?? $node->nodeName;

        if ($local === 'p') {
            $lines[] = self::normalizeLine(self::textFromNode($xpath, $node));
            continue;
        }

        if ($local !== 'tbl') {
            // Only paragraphs and tables are extracted; other body children are skipped.
            continue;
        }

        // NOTE(review): ".//w:tr" also matches rows of tables nested inside a
        // cell, whose text is additionally part of the enclosing cell's text.
        $rowNodes = $xpath->query('.//w:tr', $node);
        if ($rowNodes === false) {
            continue;
        }

        foreach ($rowNodes as $row) {
            $cellNodes = $xpath->query('./w:tc', $row);
            if ($cellNodes === false) {
                continue;
            }
            // Every cell becomes its own line, whether the row has one cell or many.
            foreach ($cellNodes as $cell) {
                $lines[] = self::normalizeLine(self::textFromNode($xpath, $cell));
            }
        }
    }

    return self::joinLines($lines);
}
/**
 * Extract the text of a legacy .doc file.
 *
 * Every available extraction strategy is run and the candidate with the
 * highest declaration score wins. On a tie the earlier candidate is kept.
 *
 * @param string $path Path of the .doc file.
 *
 * @return string The best-scoring extracted text.
 *
 * @throws RuntimeException When no strategy produced any text.
 */
public static function extractDoc(string $path): string
{
    $candidates = self::extractDocCandidates($path);
    if ($candidates === []) {
        throw new RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试');
    }

    // Seed the search with the first candidate rather than a fixed sentinel:
    // the length penalty can push scores below zero, and the best candidate
    // must still be chosen when every score is negative.
    $bestText = $candidates[0];
    $bestScore = self::scoreDeclarationText($bestText);

    foreach (array_slice($candidates, 1) as $text) {
        $score = self::scoreDeclarationText($text);
        if ($score > $bestScore) {
            $bestScore = $score;
            $bestText = $text;
        }
    }

    return $bestText;
}
/**
 * Run every .doc extraction strategy and collect the distinct, non-empty results.
 *
 * Strategies are tried in a fixed order (raw UTF-16 scan, PhpWord, textutil,
 * soffice). Each result is passed through normalizeText(); results that are
 * blank or identical to an earlier one are dropped.
 *
 * @return array<int, string> Normalised candidate texts in strategy order.
 */
public static function extractDocCandidates(string $path): array
{
    $outputs = [
        self::extractDocViaBinaryUtf16($path),
        self::extractDocViaPhpWord($path),
        self::extractDocViaTextutil($path),
        self::extractDocViaSoffice($path),
    ];

    $unique = [];
    foreach ($outputs as $output) {
        if (is_string($output) && trim($output) !== '') {
            $clean = self::normalizeText($output);
            // Keep only the first occurrence of each distinct text.
            if ($clean !== '' && ! in_array($clean, $unique, true)) {
                $unique[] = $clean;
            }
        }
    }

    return $unique;
}
/**
 * Convert the document to text with the `textutil` command-line tool.
 *
 * @param string $path Path of the document.
 *
 * @return string|null The tool's standard output, or null when the tool is
 *                     not available, the command fails, or the output is blank.
 */
private static function extractDocViaTextutil(string $path): ?string
{
    $binary = self::resolveBinary('textutil');
    if ($binary === null) {
        return null;
    }

    $stdout = self::runCommand([$binary, '-convert', 'txt', '-stdout', $path], 45);
    if (! is_string($stdout)) {
        return null;
    }

    return trim($stdout) === '' ? null : $stdout;
}
/**
 * Convert the document to text with a headless `soffice` (LibreOffice) run.
 *
 * The conversion writes "<basename>.txt" into a private temporary directory,
 * which is removed again before returning.
 *
 * @param string $path Path of the document.
 *
 * @return string|null The converted text, or null when soffice is not
 *                     available, the temporary directory cannot be created,
 *                     or no non-blank output was produced.
 */
private static function extractDocViaSoffice(string $path): ?string
{
    $soffice = self::resolveBinary('soffice');
    if ($soffice === null) {
        return null;
    }

    $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8);
    if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) {
        return null;
    }

    try {
        self::runCommand([
            $soffice,
            '--headless',
            '--convert-to',
            // Request UTF-8 explicitly: the plain "Text" filter writes in the
            // process locale's encoding, which can mangle CJK text when the
            // service runs under a C/POSIX locale.
            'txt:Text (encoded):UTF8',
            '--outdir',
            $tmpDir,
            $path,
        ], 120);

        $txtPath = $tmpDir.'/'.pathinfo($path, PATHINFO_FILENAME).'.txt';
        $text = is_file($txtPath) ? (file_get_contents($txtPath) ?: '') : '';
    } finally {
        // Best-effort cleanup; failures to delete are deliberately ignored.
        foreach (glob($tmpDir.'/*') ?: [] as $leftover) {
            @unlink($leftover);
        }
        @rmdir($tmpDir);
    }

    // Drop a leading UTF-8 byte-order mark so it does not end up in the text.
    if (str_starts_with($text, "\xef\xbb\xbf")) {
        $text = substr($text, 3);
    }

    return trim($text) !== '' ? $text : null;
}
/**
 * Score how much a text looks like a well-extracted declaration form.
 *
 * The score is the sum of the weights of all marker phrases found in the
 * text, plus one point per newline (capped at 80), minus one point for every
 * full 800 bytes beyond 6000 bytes.
 *
 * @param string $text Candidate text.
 *
 * @return int The score; it can be negative for long texts with few markers.
 */
private static function scoreDeclarationText(string $text): int
{
    $weights = [
        '一、线路基本情况' => 100,
        '三、线路规划' => 80,
        '二、线路简介' => 40,
        '四、研学课程' => 40,
        '五、线路收费标准' => 40,
        '六、线路计划实施情况' => 40,
        '线路名称' => 30,
        '组织单位名称' => 20,
        '序号' => 25,
        '研学点名称' => 25,
        '研学活动内容' => 15,
        '研学时长' => 10,
    ];

    // Keep only the markers that occur in the text, then add up their weights.
    $hits = array_filter(
        $weights,
        static fn ($marker): bool => str_contains($text, (string) $marker),
        ARRAY_FILTER_USE_KEY
    );
    $total = array_sum($hits) + min(substr_count($text, "\n"), 80);

    // The length penalty is measured in bytes, not characters.
    $bytes = strlen($text);
    if ($bytes > 6000) {
        $total -= (int) (($bytes - 6000) / 800);
    }

    return $total;
}
/**
 * Extract text from a .doc file with PhpWord's MsDoc reader.
 *
 * Every element of every section is flattened to text, split on line breaks
 * and normalised line by line; blank lines are dropped.
 *
 * @param string $path Path of the .doc file.
 *
 * @return string|null The extracted text, or null when PhpWord is not
 *                     installed, loading fails, or the result contains
 *                     neither of the expected declaration markers.
 */
private static function extractDocViaPhpWord(string $path): ?string
{
    if (! class_exists(IOFactory::class)) {
        return null;
    }

    try {
        $document = IOFactory::load($path, 'MsDoc');
    } catch (\Throwable) {
        return null;
    }

    $collected = [];
    foreach ($document->getSections() as $section) {
        foreach ($section->getElements() as $element) {
            $raw = self::extractPhpWordElementText($element);
            if ($raw === '') {
                continue;
            }

            $pieces = preg_split('/\R/u', $raw) ?: [];
            foreach ($pieces as $piece) {
                $clean = self::normalizeLine((string) $piece);
                if ($clean === '') {
                    continue;
                }
                $collected[] = $clean;
            }
        }
    }

    $text = self::joinLines($collected);

    // Accept the result only if it contains at least one declaration marker.
    $hasMarker = str_contains($text, '线路基本情况') || str_contains($text, '线路名称');

    return ($text !== '' && $hasMarker) ? $text : null;
}
/**
 * Recover text from an OLE .doc file by decoding its raw bytes as UTF-16LE.
 *
 * The decoded text is cut at the first known anchor phrase (or kept whole when
 * none is found), split on control characters, and filtered: lines that are
 * shorter than two characters, look garbled, contain no CJK/alphanumeric
 * character, or repeat the previously kept line are discarded.
 *
 * @param string $path Path of the .doc file.
 *
 * @return string|null The recovered text, or null when the file is not a
 *                     readable OLE document, cannot be decoded, or the result
 *                     contains neither of the expected declaration markers.
 */
private static function extractDocViaBinaryUtf16(string $path): ?string
{
    if (! is_readable($path)) {
        return null;
    }

    $bytes = @file_get_contents($path);
    if (
        ! is_string($bytes)
        || strlen($bytes) < 8
        || substr($bytes, 0, 8) !== OLERead::IDENTIFIER_OLE
    ) {
        return null;
    }

    $decoded = self::convertUtf16LeToUtf8($bytes);
    if (! is_string($decoded) || $decoded === '') {
        return null;
    }

    // Start at the first anchor that occurs; fall back to the very beginning.
    $offset = 0;
    foreach (['一、线路基本情况', '苏州市'] as $anchor) {
        $found = mb_strpos($decoded, $anchor);
        if ($found !== false) {
            $offset = $found;
            break;
        }
    }

    $tail = mb_substr($decoded, $offset);
    $segments = preg_split('/[\x{0000}-\x{001F}\x{007F}-\x{009F}]+/u', $tail) ?: [];

    $kept = [];
    $last = null;
    foreach ($segments as $segment) {
        $line = self::normalizeLine((string) $segment);

        $usable = mb_strlen($line) >= 2
            && ! self::isGarbledDeclarationLine($line)
            && preg_match('/[\x{4e00}-\x{9fff}0-9A-Za-z]/u', $line)
            && $line !== $last;
        if (! $usable) {
            continue;
        }

        $last = $line;
        $kept[] = $line;
    }

    $text = self::joinLines($kept);
    if (str_contains($text, '线路基本情况') || str_contains($text, '线路名称')) {
        return $text;
    }

    return null;
}
/**
 * Convert UTF-16LE bytes to UTF-8, trying iconv first and mbstring second.
 *
 * @param string $data Raw UTF-16LE bytes.
 *
 * @return string|null The UTF-8 text from the first converter that yields a
 *                     non-empty string, or null when none does.
 */
private static function convertUtf16LeToUtf8(string $data): ?string
{
    $converters = [
        'iconv' => static fn (string $bytes) => @iconv('UTF-16LE', 'UTF-8//IGNORE', $bytes),
        'mb_convert_encoding' => static fn (string $bytes) => @mb_convert_encoding($bytes, 'UTF-8', 'UTF-16LE'),
    ];

    foreach ($converters as $function => $convert) {
        // Skip converters whose extension is not loaded.
        if (! function_exists($function)) {
            continue;
        }

        $converted = $convert($data);
        if (is_string($converted) && $converted !== '') {
            return $converted;
        }
    }

    return null;
}
/**
 * Flatten a PhpWord element to plain text.
 *
 * Text elements yield their text, text breaks yield a newline, tables yield
 * their non-empty rows joined by newlines, and containers yield the
 * concatenation of their children. Any other object is used only if it
 * exposes a getText() method that returns a string.
 *
 * @param object $element The element to flatten.
 *
 * @return string The element's text, or an empty string when it has none.
 */
private static function extractPhpWordElementText(object $element): string
{
    if ($element instanceof Text) {
        return (string) $element->getText();
    }

    if ($element instanceof TextBreak) {
        return "\n";
    }

    if ($element instanceof Table) {
        $rowTexts = [];
        foreach ($element->getRows() as $row) {
            if (! $row instanceof Row) {
                continue;
            }
            $rowText = self::extractPhpWordRowText($row);
            if ($rowText !== '') {
                $rowTexts[] = $rowText;
            }
        }

        return implode("\n", $rowTexts);
    }

    if ($element instanceof AbstractContainer) {
        $buffer = '';
        foreach ($element->getElements() as $child) {
            $buffer .= self::extractPhpWordElementText($child);
        }

        return $buffer;
    }

    // Fallback for any other object that can report text.
    if (! method_exists($element, 'getText')) {
        return '';
    }
    $value = $element->getText();

    return is_string($value) ? $value : '';
}
/**
 * Flatten a PhpWord table row to text, one non-empty cell per line.
 *
 * @param Row $row The table row.
 *
 * @return string Normalised cell texts joined by newlines.
 */
private static function extractPhpWordRowText(Row $row): string
{
    $texts = [];
    foreach ($row->getCells() as $cell) {
        if (! $cell instanceof Cell) {
            continue;
        }

        $clean = self::normalizeLine(self::extractPhpWordCellText($cell));
        if ($clean !== '') {
            $texts[] = $clean;
        }
    }

    return implode("\n", $texts);
}
/**
 * Flatten a PhpWord table cell to a single normalised line.
 *
 * @param Cell $cell The table cell.
 *
 * @return string The concatenated text of the cell's elements, whitespace-normalised.
 */
private static function extractPhpWordCellText(Cell $cell): string
{
    $joined = '';
    foreach ($cell->getElements() as $child) {
        $joined .= self::extractPhpWordElementText($child);
    }

    return self::normalizeLine($joined);
}
/**
 * Concatenate the text of every <w:t> descendant of a node.
 *
 * @param \DOMXPath $xpath XPath evaluator with the "w" prefix registered.
 * @param \DOMNode  $node  Context node to search below.
 *
 * @return string The concatenated run text with outer whitespace trimmed.
 */
private static function textFromNode(\DOMXPath $xpath, \DOMNode $node): string
{
    $runs = $xpath->query('.//w:t', $node);
    if ($runs === false) {
        return '';
    }

    $buffer = '';
    foreach ($runs as $run) {
        $buffer .= $run->textContent ?? '';
    }

    return trim($buffer);
}
/**
 * Normalise a single line of text.
 *
 * Non-breaking spaces and byte-order marks become ordinary spaces, every run
 * of whitespace collapses to one space, and outer whitespace is trimmed.
 *
 * @param string $text Raw line.
 *
 * @return string The normalised line.
 */
private static function normalizeLine(string $text): string
{
    $text = strtr($text, [
        "\xc2\xa0" => ' ',
        "\xef\xbb\xbf" => ' ',
    ]);

    // preg_replace yields null on failure (e.g. invalid UTF-8); keep the text as is then.
    $collapsed = preg_replace('/\s+/u', ' ', $text);

    return trim($collapsed ?? $text);
}
/**
 * Normalise each line, join the lines with newlines, and normalise the result as a whole.
 *
 * @param array $lines Lines to join; each entry is cast to string.
 *
 * @return string The joined, normalised text.
 */
private static function joinLines(array $lines): string
{
    $clean = [];
    foreach ($lines as $line) {
        $clean[] = self::normalizeLine((string) $line);
    }

    return self::normalizeText(implode("\n", $clean));
}
/**
 * Normalise a multi-line text.
 *
 * CRLF, lone CR and form feed all become LF, runs of three or more newlines
 * shrink to two, and outer whitespace is trimmed.
 *
 * @param string $text Raw text.
 *
 * @return string The normalised text.
 */
private static function normalizeText(string $text): string
{
    // strtr prefers the longest match, so "\r\n" becomes a single newline.
    $unified = strtr($text, [
        "\r\n" => "\n",
        "\r" => "\n",
        "\f" => "\n",
    ]);

    $squeezed = preg_replace("/\n{3,}/", "\n\n", $unified);

    return trim($squeezed ?? $unified);
}
/**
 * Locate an external command.
 *
 * The command is first looked up on PATH; if found, its bare name is
 * returned. Otherwise a fixed list of install locations for that command is
 * probed and the first executable file wins.
 *
 * @param string $command Command name, e.g. "soffice" or "textutil".
 *
 * @return string|null The bare name, an absolute path, or null when not found.
 */
private static function resolveBinary(string $command): ?string
{
    if (self::isExecutableOnPath($command)) {
        return $command;
    }

    $knownLocations = match ($command) {
        'soffice' => [
            '/usr/bin/soffice',
            '/usr/local/bin/soffice',
            '/opt/libreoffice/program/soffice',
            '/usr/lib/libreoffice/program/soffice',
        ],
        'textutil' => ['/usr/bin/textutil'],
        default => [],
    };

    foreach ($knownLocations as $location) {
        if (is_file($location) && is_executable($location)) {
            return $location;
        }
    }

    return null;
}
/**
 * Check whether a command exists as an executable file in any PATH directory.
 *
 * PATH is split on ":"; when the variable is unset or empty, the fallback
 * "/usr/local/bin:/usr/bin:/bin" is searched instead.
 *
 * @param string $command Command name to look for.
 *
 * @return bool True when an executable file of that name is found.
 */
private static function isExecutableOnPath(string $command): bool
{
    $configured = getenv('PATH');
    $searchPath = (is_string($configured) && $configured !== '')
        ? $configured
        : '/usr/local/bin:/usr/bin:/bin';

    foreach (explode(':', $searchPath) as $entry) {
        $entry = trim($entry);
        if ($entry !== '') {
            $candidate = rtrim($entry, '/').'/'.$command;
            if (is_file($candidate) && is_executable($candidate)) {
                return true;
            }
        }
    }

    return false;
}
/**
 * Run an external command and return its standard output.
 *
 * @param array<int, string> $command        Program and arguments.
 * @param int                $timeoutSeconds Timeout handed to the process.
 *
 * @return string|null The captured output, or null when running the command
 *                     throws or the output is empty.
 */
private static function runCommand(array $command, int $timeoutSeconds = 30): ?string
{
    $proc = new Process($command);
    $proc->setTimeout($timeoutSeconds);

    try {
        $proc->mustRun();
    } catch (\Throwable) {
        // Any failure to run the command is reported as "no output".
        return null;
    }

    $stdout = $proc->getOutput();
    if (! is_string($stdout) || $stdout === '') {
        return null;
    }

    return $stdout;
}
/**
 * Decide whether a line recovered from raw .doc bytes is binary noise.
 *
 * A few numeric shapes are always accepted: pure digits, lines starting with
 * a clock time, head counts ending in "人", and digit/dash/space sequences
 * containing at least three consecutive digits. Otherwise a line is rejected
 * when it contains C1 control characters, characters from U+1600..U+16FF, or
 * one of a fixed set of characters treated as noise; when it has no CJK
 * ideograph and is at least three characters long; or when fewer than 45% of
 * its characters are readable.
 *
 * @param string $line A normalised line.
 *
 * @return bool True when the line should be discarded as garbled.
 */
private static function isGarbledDeclarationLine(string $line): bool
{
    if (preg_match('/^\d+$/', $line)) {
        return false;
    }
    if (preg_match('/^\d{1,2}:\d{2}/', $line)) {
        return false;
    }
    if (preg_match('/^[\d\s]+人$/u', $line)) {
        return false;
    }
    // The en dash (U+2013) and em dash (U+2014) are written as code-point
    // escapes next to an escaped hyphen, so no "-" sits between two literals
    // where it would be read as a character range.
    if (preg_match('/^[\d\s\-\x{2013}\x{2014}]+$/u', $line) && preg_match('/\d{3,}/', $line)) {
        return false;
    }

    if (preg_match('/[\x{0080}-\x{009F}]/u', $line)) {
        return true;
    }
    if (preg_match('/[ᘀ-᛿]/u', $line)) {
        return true;
    }
    if (preg_match('/[漀愀脈摫欀䡒⡯␖]/u', $line)) {
        return true;
    }

    $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
    $len = mb_strlen($line);

    // Lines without any CJK ideograph are rejected once they reach three characters.
    if ($cjk === 0 && $len >= 3) {
        return true;
    }
    if ($len < 6) {
        return $cjk === 0;
    }

    // Longer lines must be at least 45% readable characters.
    $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z。、\-\s\/&]/u', $line) ?: 0;

    return ($readable / max($len, 1)) < 0.45;
}
}