|
|
<?php
|
|
|
|
|
|
namespace App\Support;
|
|
|
|
|
|
use Illuminate\Support\Str;
|
|
|
use PhpOffice\PhpWord\Element\AbstractContainer;
|
|
|
use PhpOffice\PhpWord\Element\Cell;
|
|
|
use PhpOffice\PhpWord\Element\Row;
|
|
|
use PhpOffice\PhpWord\Element\Table;
|
|
|
use PhpOffice\PhpWord\Element\Text;
|
|
|
use PhpOffice\PhpWord\Element\TextBreak;
|
|
|
use PhpOffice\PhpWord\IOFactory;
|
|
|
use PhpOffice\PhpWord\Shared\OLERead;
|
|
|
use RuntimeException;
|
|
|
use Symfony\Component\Process\Process;
|
|
|
use ZipArchive;
|
|
|
|
|
|
/**
 * Extracts plain text from Word declaration forms (.doc / .docx).
 *
 * .docx files are read directly from the `word/document.xml` part of the zip
 * package. Legacy .doc files are run through several independent extractors
 * (raw UTF-16 scan, PhpWord, `textutil`, `soffice`); each result is scored
 * against the section headings expected in a declaration form and the
 * highest-scoring text is returned.
 */
class DocTextExtractor
{
    /**
     * Extract text from a Word file, dispatching on the file extension.
     *
     * @param  string  $path  Path of the file to read.
     * @param  string  $extension  Extension with or without a leading dot; case-insensitive.
     *
     * @throws RuntimeException When the extension is neither doc nor docx, or extraction fails.
     */
    public static function extract(string $path, string $extension): string
    {
        $extension = strtolower(ltrim($extension, '.'));

        return match ($extension) {
            'docx' => self::extractDocx($path),
            'doc' => self::extractDoc($path),
            default => throw new RuntimeException('仅支持 .doc / .docx 申报表'),
        };
    }

    /**
     * Extract text from a .docx package.
     *
     * Emits one line per top-level body paragraph and one line per table cell,
     * in document order. Only direct children of `w:body` are visited.
     *
     * @throws RuntimeException When the archive cannot be opened or has no document part.
     */
    public static function extractDocx(string $path): string
    {
        $zip = new ZipArchive();
        if ($zip->open($path) !== true) {
            throw new RuntimeException('无法读取 docx 文件');
        }

        $xml = $zip->getFromName('word/document.xml');
        $zip->close();

        if (! is_string($xml) || trim($xml) === '') {
            throw new RuntimeException('docx 文档内容为空');
        }

        $dom = new \DOMDocument();
        $dom->preserveWhiteSpace = false;

        // Collect libxml parse errors internally instead of emitting warnings.
        // Malformed XML leaves the document empty, so the queries below find
        // nothing and an empty string is returned.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            $dom->loadXML($xml);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        $xpath = new \DOMXPath($dom);
        $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

        $bodyNodes = $xpath->query('//w:body/*');
        if ($bodyNodes === false) {
            return '';
        }

        $lines = [];
        foreach ($bodyNodes as $node) {
            $local = $node->localName ?? $node->nodeName;

            if ($local === 'p') {
                $lines[] = self::normalizeLine(self::textFromNode($xpath, $node));

                continue;
            }

            if ($local !== 'tbl') {
                continue;
            }

            $rowNodes = $xpath->query('.//w:tr', $node);
            if ($rowNodes === false) {
                continue;
            }

            foreach ($rowNodes as $row) {
                $cellNodes = $xpath->query('./w:tc', $row);
                if ($cellNodes === false) {
                    continue;
                }

                // Every cell becomes its own output line, whatever the column count.
                foreach ($cellNodes as $cell) {
                    $lines[] = self::normalizeLine(self::textFromNode($xpath, $cell));
                }
            }
        }

        return self::joinLines($lines);
    }

    /**
     * Extract text from a legacy .doc file.
     *
     * Runs every available extractor and returns the candidate with the
     * highest declaration-form score; ties go to the earliest candidate.
     *
     * @throws RuntimeException When no extractor produced any text.
     */
    public static function extractDoc(string $path): string
    {
        $candidates = self::extractDocCandidates($path);
        if ($candidates === []) {
            throw new RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试');
        }

        // Seed with the first candidate's real score. Scores can be negative
        // (length penalty), so a fixed sentinel such as -1 could reject every
        // candidate and skip the comparison between them.
        $bestText = $candidates[0];
        $bestScore = self::scoreDeclarationText($bestText);

        foreach (array_slice($candidates, 1) as $text) {
            $score = self::scoreDeclarationText($text);
            if ($score > $bestScore) {
                $bestScore = $score;
                $bestText = $text;
            }
        }

        return $bestText;
    }

    /**
     * Run all .doc extractors and return their distinct, non-empty outputs.
     *
     * All four extractors are invoked unconditionally; results are normalized
     * and de-duplicated by exact text, preserving extractor order.
     *
     * @return array<int, string>
     */
    public static function extractDocCandidates(string $path): array
    {
        $candidates = [];

        foreach ([
            self::extractDocViaBinaryUtf16($path),
            self::extractDocViaPhpWord($path),
            self::extractDocViaTextutil($path),
            self::extractDocViaSoffice($path),
        ] as $text) {
            if (! is_string($text) || trim($text) === '') {
                continue;
            }

            $normalized = self::normalizeText($text);
            if ($normalized === '') {
                continue;
            }

            // Keyed by content so identical outputs collapse into one entry.
            $candidates[$normalized] = $normalized;
        }

        return array_values($candidates);
    }

    /**
     * Convert the file to text with the `textutil` binary, if one is found.
     *
     * @return string|null Converter stdout, or null when the binary is missing,
     *                     the command fails, or the output is blank.
     */
    private static function extractDocViaTextutil(string $path): ?string
    {
        $textutil = self::resolveBinary('textutil');
        if ($textutil === null) {
            return null;
        }

        $out = self::runCommand([$textutil, '-convert', 'txt', '-stdout', $path], 45);

        return is_string($out) && trim($out) !== '' ? $out : null;
    }

    /**
     * Convert the file to text with headless `soffice`, if one is found.
     *
     * The converter writes `<basename>.txt` into a private temporary directory
     * which is removed again before returning.
     *
     * @return string|null File contents, or null when the binary is missing,
     *                     the temp directory cannot be created, or no text was produced.
     */
    private static function extractDocViaSoffice(string $path): ?string
    {
        $soffice = self::resolveBinary('soffice');
        if ($soffice === null) {
            return null;
        }

        $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8);
        if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) {
            return null;
        }

        try {
            // The command's stdout is irrelevant; success is judged by the output file.
            self::runCommand([
                $soffice,
                '--headless',
                '--convert-to',
                'txt:Text',
                '--outdir',
                $tmpDir,
                $path,
            ], 120);

            $base = pathinfo($path, PATHINFO_FILENAME);
            $txtPath = $tmpDir.'/'.$base.'.txt';
            $text = is_file($txtPath) ? (file_get_contents($txtPath) ?: '') : '';
        } finally {
            // Best-effort cleanup on every exit path.
            @array_map('unlink', glob($tmpDir.'/*') ?: []);
            @rmdir($tmpDir);
        }

        return trim($text) !== '' ? $text : null;
    }

    /**
     * Score how much a text looks like a complete declaration form.
     *
     * Adds a fixed weight for each expected heading/label present, adds one
     * point per newline (capped at 80), and subtracts one point per 800 bytes
     * beyond 6000 bytes. The result may be negative.
     */
    private static function scoreDeclarationText(string $text): int
    {
        $score = 0;
        $markers = [
            '一、线路基本情况' => 100,
            '三、线路规划' => 80,
            '二、线路简介' => 40,
            '四、研学课程' => 40,
            '五、线路收费标准' => 40,
            '六、线路计划实施情况' => 40,
            '线路名称' => 30,
            '组织单位名称' => 20,
            '序号' => 25,
            '研学点名称' => 25,
            '研学活动内容' => 15,
            '研学时长' => 10,
        ];

        foreach ($markers as $marker => $weight) {
            if (str_contains($text, $marker)) {
                $score += $weight;
            }
        }

        $score += min(substr_count($text, "\n"), 80);

        // Length is measured in bytes here, not characters.
        if (strlen($text) > 6000) {
            $score -= (int) ((strlen($text) - 6000) / 800);
        }

        return $score;
    }

    /**
     * Extract text with PhpWord's MsDoc reader.
     *
     * @return string|null Joined non-empty lines, or null when PhpWord is not
     *                     installed, loading fails, or the text lacks both
     *                     expected form labels.
     */
    private static function extractDocViaPhpWord(string $path): ?string
    {
        if (! class_exists(IOFactory::class)) {
            return null;
        }

        try {
            $phpWord = IOFactory::load($path, 'MsDoc');
        } catch (\Throwable) {
            // Deliberate best-effort: other extractors may still succeed.
            return null;
        }

        $lines = [];
        foreach ($phpWord->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                $chunk = self::extractPhpWordElementText($element);
                if ($chunk === '') {
                    continue;
                }

                foreach (preg_split('/\R/u', $chunk) ?: [] as $line) {
                    $line = self::normalizeLine((string) $line);
                    if ($line !== '') {
                        $lines[] = $line;
                    }
                }
            }
        }

        $text = self::joinLines($lines);
        if ($text === '' || (! str_contains($text, '线路基本情况') && ! str_contains($text, '线路名称'))) {
            return null;
        }

        return $text;
    }

    /**
     * Extract text by decoding the whole OLE file as UTF-16LE and filtering.
     *
     * The decoded stream is cut at the first known heading (if any), split on
     * control characters, and each fragment is kept only if it is at least two
     * characters long, not judged garbled, contains a CJK/alphanumeric
     * character, and differs from the previously kept line.
     *
     * @return string|null Filtered text, or null when the file is unreadable,
     *                     not an OLE container, or lacks both expected labels.
     */
    private static function extractDocViaBinaryUtf16(string $path): ?string
    {
        if (! is_readable($path)) {
            return null;
        }

        $data = @file_get_contents($path);
        if (! is_string($data) || strlen($data) < 8) {
            return null;
        }

        // The first 8 bytes must be the OLE compound-file signature.
        if (substr($data, 0, 8) !== OLERead::IDENTIFIER_OLE) {
            return null;
        }

        $raw = self::convertUtf16LeToUtf8($data);
        if (! is_string($raw) || $raw === '') {
            return null;
        }

        // Skip leading binary noise by starting at the first recognizable anchor.
        $start = mb_strpos($raw, '一、线路基本情况');
        if ($start === false) {
            $start = mb_strpos($raw, '苏州市');
        }
        if ($start === false) {
            $start = 0;
        }

        $chunk = mb_substr($raw, $start);
        $parts = preg_split('/[\x{0000}-\x{001F}\x{007F}-\x{009F}]+/u', $chunk) ?: [];

        $lines = [];
        $previous = null;
        foreach ($parts as $part) {
            $line = self::normalizeLine((string) $part);

            if ($line === '' || mb_strlen($line) < 2) {
                continue;
            }
            if (self::isGarbledDeclarationLine($line)) {
                continue;
            }
            if (! preg_match('/[\x{4e00}-\x{9fff}0-9A-Za-z]/u', $line)) {
                continue;
            }
            if ($line === $previous) {
                continue;
            }

            $previous = $line;
            $lines[] = $line;
        }

        $text = self::joinLines($lines);

        return str_contains($text, '线路基本情况') || str_contains($text, '线路名称') ? $text : null;
    }

    /**
     * Decode a UTF-16LE byte string to UTF-8.
     *
     * Tries iconv (ignoring undecodable sequences) first, then mbstring.
     *
     * @return string|null Decoded text, or null when neither converter yields output.
     */
    private static function convertUtf16LeToUtf8(string $data): ?string
    {
        if (function_exists('iconv')) {
            $text = @iconv('UTF-16LE', 'UTF-8//IGNORE', $data);
            if (is_string($text) && $text !== '') {
                return $text;
            }
        }

        if (function_exists('mb_convert_encoding')) {
            $text = @mb_convert_encoding($data, 'UTF-8', 'UTF-16LE');
            if (is_string($text) && $text !== '') {
                return $text;
            }
        }

        return null;
    }

    /**
     * Recursively collect the text of a PhpWord element.
     *
     * Text elements yield their text, text breaks yield a newline, tables
     * yield their non-empty rows joined by newlines, containers yield the
     * concatenation of their children, and anything else yields the result
     * of `getText()` when that exists and returns a string.
     */
    private static function extractPhpWordElementText(object $element): string
    {
        if ($element instanceof Text) {
            return (string) $element->getText();
        }

        if ($element instanceof TextBreak) {
            return "\n";
        }

        if ($element instanceof Table) {
            $parts = [];
            foreach ($element->getRows() as $row) {
                if ($row instanceof Row) {
                    $parts[] = self::extractPhpWordRowText($row);
                }
            }

            return implode("\n", array_filter($parts, fn ($part) => $part !== ''));
        }

        if ($element instanceof AbstractContainer) {
            $parts = [];
            foreach ($element->getElements() as $child) {
                $parts[] = self::extractPhpWordElementText($child);
            }

            return implode('', $parts);
        }

        if (method_exists($element, 'getText')) {
            $text = $element->getText();

            return is_string($text) ? $text : '';
        }

        return '';
    }

    /**
     * Return the non-empty cell texts of a table row, one per line.
     */
    private static function extractPhpWordRowText(Row $row): string
    {
        $cells = [];
        foreach ($row->getCells() as $cell) {
            if ($cell instanceof Cell) {
                $cells[] = self::extractPhpWordCellText($cell);
            }
        }

        return implode("\n", array_values(array_filter(
            array_map(fn ($cellText) => self::normalizeLine($cellText), $cells),
            fn ($cellText) => $cellText !== ''
        )));
    }

    /**
     * Return the concatenated text of a table cell as a single normalized line.
     */
    private static function extractPhpWordCellText(Cell $cell): string
    {
        $parts = [];
        foreach ($cell->getElements() as $element) {
            $parts[] = self::extractPhpWordElementText($element);
        }

        return self::normalizeLine(implode('', $parts));
    }

    /**
     * Concatenate the content of every `w:t` descendant of a node.
     *
     * Runs are joined without a separator, and the result is trimmed.
     */
    private static function textFromNode(\DOMXPath $xpath, \DOMNode $node): string
    {
        $parts = [];
        $textNodes = $xpath->query('.//w:t', $node);
        if ($textNodes !== false) {
            foreach ($textNodes as $textNode) {
                $parts[] = $textNode->textContent ?? '';
            }
        }

        return trim(implode('', $parts));
    }

    /**
     * Collapse a string to a single trimmed line.
     *
     * Non-breaking spaces and UTF-8 BOMs become spaces, then every whitespace
     * run (including newlines) is reduced to one space.
     */
    private static function normalizeLine(string $text): string
    {
        $text = str_replace(["\xc2\xa0", "\xef\xbb\xbf"], ' ', $text);
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Normalize each line and join them with newlines into one text.
     *
     * @param  array<int, mixed>  $lines  Values are cast to string.
     */
    private static function joinLines(array $lines): string
    {
        $normalized = array_map(fn ($line) => self::normalizeLine((string) $line), $lines);

        return self::normalizeText(implode("\n", $normalized));
    }

    /**
     * Unify line endings to "\n", cap blank-line runs at one blank line, and trim.
     */
    private static function normalizeText(string $text): string
    {
        $text = str_replace(["\r\n", "\r", "\f"], "\n", $text);
        $text = preg_replace("/\n{3,}/", "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Locate an external converter binary.
     *
     * The bare command name is returned when it is found on PATH; otherwise
     * the first existing, executable well-known absolute path is returned.
     *
     * @return string|null Command name or absolute path, or null when not found.
     */
    private static function resolveBinary(string $command): ?string
    {
        $candidates = [$command];

        if ($command === 'soffice') {
            $candidates = array_merge($candidates, [
                '/usr/bin/soffice',
                '/usr/local/bin/soffice',
                '/opt/libreoffice/program/soffice',
                '/usr/lib/libreoffice/program/soffice',
            ]);
        }

        if ($command === 'textutil') {
            $candidates[] = '/usr/bin/textutil';
        }

        foreach ($candidates as $candidate) {
            if ($candidate === $command) {
                if (self::isExecutableOnPath($command)) {
                    return $command;
                }

                continue;
            }

            if (is_file($candidate) && is_executable($candidate)) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Tell whether a command name resolves to an executable file in a PATH directory.
     *
     * Falls back to a small default directory list when PATH is unset or empty.
     */
    private static function isExecutableOnPath(string $command): bool
    {
        $pathEnv = getenv('PATH');
        if (! is_string($pathEnv) || $pathEnv === '') {
            $pathEnv = implode(PATH_SEPARATOR, ['/usr/local/bin', '/usr/bin', '/bin']);
        }

        // PATH_SEPARATOR is ':' on Unix-like systems and ';' on Windows.
        foreach (explode(PATH_SEPARATOR, $pathEnv) as $dir) {
            $dir = trim($dir);
            if ($dir === '') {
                continue;
            }

            $full = rtrim($dir, '/').'/'.$command;
            if (is_file($full) && is_executable($full)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Run an external command and return its stdout.
     *
     * @param  array<int, string>  $command  Program and arguments (no shell involved).
     * @param  int  $timeoutSeconds  Process timeout in seconds.
     * @return string|null Stdout, or null when the process fails, times out,
     *                     or prints nothing.
     */
    private static function runCommand(array $command, int $timeoutSeconds = 30): ?string
    {
        $process = new Process($command);
        $process->setTimeout($timeoutSeconds);

        try {
            $process->mustRun();
        } catch (\Throwable) {
            // Deliberate best-effort: a failed converter just yields no candidate.
            return null;
        }

        $output = $process->getOutput();

        return is_string($output) && $output !== '' ? $output : null;
    }

    /**
     * Heuristically decide whether a line is binary residue rather than form text.
     *
     * Whitelisted shapes (pure digits, leading clock time, head-count such as
     * "30 人", dash-separated numbers with a run of 3+ digits) are never
     * garbled. Otherwise a line is garbled when it contains C1 control
     * characters or known residue characters, has no CJK character while being
     * three or more characters long, or has fewer than 45% readable characters.
     */
    private static function isGarbledDeclarationLine(string $line): bool
    {
        if (preg_match('/^\d+$/', $line)) {
            return false;
        }

        if (preg_match('/^\d{1,2}:\d{2}/', $line)) {
            return false;
        }

        if (preg_match('/^[\d\s]+人$/u', $line)) {
            return false;
        }

        // Digits with hyphen, em dash (U+2014) or en dash (U+2013), e.g. phone
        // numbers. Each dash is listed on its own: written as a range from
        // U+2014 to U+2013 the class is out of order and fails to compile.
        if (preg_match('/^[\d\-\x{2014}\x{2013}\s]+$/u', $line) && preg_match('/\d{3,}/', $line)) {
            return false;
        }

        if (preg_match('/[\x{0080}-\x{009F}]/u', $line)) {
            return true;
        }

        // Canadian Syllabics code points show up when Word binary structures are
        // decoded as UTF-16. The class must not end in a bare "-", which would
        // match every ASCII hyphen and discard dates and ranges.
        // NOTE(review): the upper bound of the intended range is an assumption
        // (whole U+1400-U+167F block) - confirm against real samples.
        if (preg_match('/[ᘀ\x{1400}-\x{167F}]/u', $line)) {
            return true;
        }

        // Specific characters observed as decoding residue.
        if (preg_match('/[漀愀脈摫欀䡒⡯␖]/u', $line)) {
            return true;
        }

        $cjk = preg_match_all('/[\x{4e00}-\x{9fff}]/u', $line) ?: 0;
        if ($cjk === 0 && mb_strlen($line) >= 3) {
            return true;
        }

        $len = mb_strlen($line);
        if ($len < 6) {
            return $cjk === 0;
        }

        $readable = preg_match_all('/[\x{4e00}-\x{9fff}0-9A-Za-z,。、:;()\-\s\/&]/u', $line) ?: 0;

        return ($readable / max($len, 1)) < 0.45;
    }
}
|