研学线路

master
lion 4 days ago
parent 4e18dedf8b
commit a5c2da0a88

@ -8,6 +8,7 @@ use App\Support\StudyTourDeclarationParser;
use App\Support\StudyTourPayload;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Request;
use Illuminate\Support\Str;
class StudyTourController extends Controller
{
@ -75,15 +76,18 @@ class StudyTourController extends Controller
if (! in_array($extension, ['doc', 'docx'], true)) {
return response()->json(['message' => '仅支持 .doc / .docx 申报表'], 422);
}
$tmpPath = $uploaded->getRealPath();
if (! is_string($tmpPath) || $tmpPath === '') {
return response()->json(['message' => '无法读取上传文件'], 422);
}
$path = null;
try {
$result = StudyTourDeclarationParser::parseFile($tmpPath, $extension);
$storedPath = $uploaded->storeAs('tmp/declaration-import', Str::uuid()->toString().'.'.$extension);
$path = storage_path('app/'.$storedPath);
$result = StudyTourDeclarationParser::parseFile($path, $extension);
} catch (\Throwable $e) {
return response()->json(['message' => '申报表解析失败:'.$e->getMessage()], 422);
} finally {
if (isset($path) && is_file($path)) {
@unlink($path);
}
}
return response()->json($result);

@ -10,6 +10,7 @@ use PhpOffice\PhpWord\Element\Table;
use PhpOffice\PhpWord\Element\Text;
use PhpOffice\PhpWord\Element\TextBreak;
use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\Shared\OLERead;
use RuntimeException;
use Symfony\Component\Process\Process;
use ZipArchive;
@ -90,54 +91,132 @@ class DocTextExtractor
/**
 * Extract the text of a legacy .doc declaration form.
 *
 * Collects every candidate text via extractDocCandidates() and returns the one
 * that scoreDeclarationText() ranks highest. On equal scores the earliest
 * candidate wins because only a strictly greater score replaces the current best.
 *
 * @throws RuntimeException when no extraction strategy produced any text
 */
public static function extractDoc(string $path): string
{
    $candidates = self::extractDocCandidates($path);
    if ($candidates === []) {
        throw new RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试');
    }

    $bestText = null;
    $bestScore = -1;
    foreach ($candidates as $text) {
        $score = self::scoreDeclarationText($text);
        if ($score > $bestScore) {
            $bestScore = $score;
            $bestText = $text;
        }
    }

    // scoreDeclarationText() can go negative for very long texts; if no candidate
    // beats the initial -1, fall back to the first candidate instead of failing.
    return $bestText ?? $candidates[0];
}
/**
 * Run every .doc extraction strategy and collect the distinct results.
 *
 * Strategies run in a fixed order: raw UTF-16 scan, PhpWord, textutil, soffice.
 * Non-string or blank results are dropped, the rest are passed through
 * normalizeText(), and identical normalized texts are collapsed so each
 * candidate appears once, in first-seen order.
 *
 * @return array<int, string> distinct normalized texts; empty when nothing could be extracted
 */
public static function extractDocCandidates(string $path): array
{
    $candidates = [];
    foreach ([
        self::extractDocViaBinaryUtf16($path),
        self::extractDocViaPhpWord($path),
        self::extractDocViaTextutil($path),
        self::extractDocViaSoffice($path),
    ] as $text) {
        if (! is_string($text) || trim($text) === '') {
            continue;
        }

        $normalized = self::normalizeText($text);
        if ($normalized === '') {
            continue;
        }

        // Keyed by the text itself so duplicate extractions overwrite each other.
        $candidates[$normalized] = $normalized;
    }

    return array_values($candidates);
}
/**
 * Convert the document to plain text with the `textutil` binary.
 *
 * @return string|null the raw command output, or null when the binary cannot
 *                     be resolved or the command yields no non-blank string
 */
private static function extractDocViaTextutil(string $path): ?string
{
    $binary = self::resolveBinary('textutil');
    if ($binary === null) {
        return null;
    }

    // The second argument (45) is presumably a timeout in seconds — confirm against runCommand().
    $output = self::runCommand([$binary, '-convert', 'txt', '-stdout', $path], 45);
    if (! is_string($output)) {
        return null;
    }

    return trim($output) === '' ? null : $output;
}
/**
 * Convert the document to plain text with a headless `soffice` run.
 *
 * The converter writes "<basename>.txt" into a freshly created temporary
 * directory; that file is read back and the directory is always removed,
 * even when the conversion step throws.
 *
 * @return string|null the converted text, or null when the binary cannot be
 *                     resolved, the temp directory cannot be created, or no
 *                     non-blank output was produced
 */
private static function extractDocViaSoffice(string $path): ?string
{
    $soffice = self::resolveBinary('soffice');
    if ($soffice === null) {
        return null;
    }

    $tmpDir = sys_get_temp_dir().'/study-tour-doc-'.Str::random(8);
    if (! @mkdir($tmpDir, 0700, true) && ! is_dir($tmpDir)) {
        return null;
    }

    try {
        // The second argument (120) is presumably a timeout in seconds — confirm against runCommand().
        self::runCommand([
            $soffice,
            '--headless',
            '--convert-to',
            'txt:Text',
            '--outdir',
            $tmpDir,
            $path,
        ], 120);

        $base = pathinfo($path, PATHINFO_FILENAME);
        $txtPath = $tmpDir.'/'.$base.'.txt';
        $text = is_file($txtPath) ? (file_get_contents($txtPath) ?: '') : '';
    } finally {
        // Cleanup lives in finally so a failing command cannot leave the temp directory behind.
        @array_map('unlink', glob($tmpDir.'/*') ?: []);
        @rmdir($tmpDir);
    }

    return trim($text) !== '' ? $text : null;
}
/**
 * Heuristically score how much an extracted text looks like a declaration form.
 *
 * Each known section heading or table label found in the text adds its weight
 * once, line breaks add up to 80 points, and texts longer than 6000 bytes lose
 * one point per additional 800 bytes. The result can therefore be negative.
 *
 * @return int the score; higher means a more plausible extraction
 */
private static function scoreDeclarationText(string $text): int
{
    $score = 0;
    $markers = [
        '一、线路基本情况' => 100,
        '三、线路规划' => 80,
        '二、线路简介' => 40,
        '四、研学课程' => 40,
        '五、线路收费标准' => 40,
        '六、线路计划实施情况' => 40,
        '线路名称' => 30,
        '组织单位名称' => 20,
        '序号' => 25,
        '研学点名称' => 25,
        '研学活动内容' => 15,
        '研学时长' => 10,
    ];

    foreach ($markers as $marker => $weight) {
        if (str_contains($text, $marker)) {
            $score += $weight;
        }
    }

    // Reward line structure, capped so a huge dump cannot win on newlines alone.
    $score += min(substr_count($text, "\n"), 80);

    // strlen() counts bytes, so the length penalty is byte-based, not character-based.
    if (strlen($text) > 6000) {
        $score -= (int) ((strlen($text) - 6000) / 800);
    }

    return $score;
}
private static function extractDocViaPhpWord(string $path): ?string
{
if (! class_exists(IOFactory::class)) {
return null;
}
try {
$phpWord = IOFactory::load($path, 'MsDoc');
} catch (\Throwable) {
@ -161,8 +240,82 @@ class DocTextExtractor
}
$text = self::joinLines($lines);
if ($text === '' || (! str_contains($text, '线路基本情况') && ! str_contains($text, '线路名称'))) {
return null;
}
return trim($text) !== '' ? $text : null;
return $text;
}
/**
 * Recover text by decoding the whole .doc file as UTF-16LE and keeping the
 * readable fragments.
 *
 * The file must start with the OLE identifier. Decoding starts at the first
 * known heading (or at offset 0 when none is found), the result is split on
 * control characters, and each fragment is normalized, filtered and
 * de-duplicated before being joined.
 *
 * @return string|null the joined text, or null when the file is unreadable,
 *                     is not an OLE container, cannot be decoded, or the result
 *                     lacks the expected declaration headings
 */
private static function extractDocViaBinaryUtf16(string $path): ?string
{
    $bytes = is_readable($path) ? @file_get_contents($path) : false;
    if (! is_string($bytes) || strlen($bytes) < 8 || substr($bytes, 0, 8) !== OLERead::IDENTIFIER_OLE) {
        return null;
    }

    $decoded = self::convertUtf16LeToUtf8($bytes);
    if (! is_string($decoded) || $decoded === '') {
        return null;
    }

    // Start at the first anchor that occurs; otherwise keep the whole decoded text.
    $offset = 0;
    foreach (['一、线路基本情况', '苏州市'] as $anchor) {
        $found = mb_strpos($decoded, $anchor);
        if ($found !== false) {
            $offset = $found;
            break;
        }
    }

    $fragments = preg_split('/[\x{0000}-\x{001F}\x{007F}-\x{009F}]+/u', mb_substr($decoded, $offset)) ?: [];

    // Keyed by line text: keeps first-seen order and drops repeated lines.
    $unique = [];
    foreach ($fragments as $fragment) {
        $candidate = self::normalizeLine((string) $fragment);
        if (
            mb_strlen($candidate) < 2
            || ! preg_match('/[\x{4e00}-\x{9fff}0-9A-Za-z]/u', $candidate)
            || isset($unique[$candidate])
        ) {
            continue;
        }
        $unique[$candidate] = $candidate;
    }

    $joined = self::joinLines(array_values($unique));
    if (str_contains($joined, '线路基本情况') || str_contains($joined, '线路名称')) {
        return $joined;
    }

    return null;
}
/**
 * Decode a UTF-16LE byte string to UTF-8 with whichever converter is available.
 *
 * iconv is tried first, then mb_convert_encoding. A converter's output is
 * accepted only when it is a non-empty string.
 *
 * @return string|null the UTF-8 text, or null when no converter produced output
 */
private static function convertUtf16LeToUtf8(string $data): ?string
{
    $converters = [
        'iconv' => static fn (string $bytes) => @iconv('UTF-16LE', 'UTF-8//IGNORE', $bytes),
        'mb_convert_encoding' => static fn (string $bytes) => @mb_convert_encoding($bytes, 'UTF-8', 'UTF-16LE'),
    ];

    foreach ($converters as $function => $convert) {
        if (! function_exists($function)) {
            continue;
        }

        $converted = $convert($data);
        if (is_string($converted) && $converted !== '') {
            return $converted;
        }
    }

    return null;
}
private static function extractPhpWordElementText(object $element): string

@ -22,11 +22,83 @@ class StudyTourDeclarationParser
*/
public static function parseFile(string $path, string $extension): array
{
    // Accept the extension with or without a leading dot, in any letter case.
    $normalizedExtension = strtolower(ltrim($extension, '.'));

    // Legacy .doc files go through the multi-candidate path; every other
    // extension is extracted once and parsed directly.
    return $normalizedExtension === 'doc'
        ? self::parseDocFile($path)
        : self::parseText(DocTextExtractor::extract($path, $normalizedExtension));
}
/**
 * Parse a legacy .doc file by parsing every extracted candidate text and
 * keeping the result that scoreParsedResult() rates highest.
 *
 * On equal scores the earliest candidate is kept.
 *
 * @return array{parsed: array<string, mixed>, warnings: array<int, string>}
 *
 * @throws \RuntimeException when no candidate text could be extracted
 */
private static function parseDocFile(string $path): array
{
    $texts = DocTextExtractor::extractDocCandidates($path);
    if ($texts === []) {
        throw new \RuntimeException('无法解析 .doc 申报表,请检查文件是否损坏,或另存为 .docx 后重试');
    }

    $winner = null;
    $winningScore = -1;
    foreach ($texts as $candidateText) {
        $parsedCandidate = self::parseText($candidateText);
        $candidateScore = self::scoreParsedResult($parsedCandidate);
        if ($candidateScore <= $winningScore) {
            continue;
        }

        $winningScore = $candidateScore;
        $winner = $parsedCandidate;
    }

    if ($winner === null) {
        return self::parseText($texts[0]);
    }

    return $winner;
}
/**
 * Rate how complete a parsed declaration is, so competing parses can be compared.
 *
 * Non-blank scalar fields, list entries, route-plan groups with their items, and
 * non-empty rich-text sections each add points. Fractional contributions from
 * the rich-text sections are summed first and truncated once at the end.
 *
 * @param array{parsed: array<string, mixed>, warnings: array<int, string>} $result
 */
private static function scoreParsedResult(array $result): int
{
    $fields = $result['parsed'] ?? [];
    $total = 0;

    // Scalar fields: fixed points when the trimmed value is non-empty.
    $scalarWeights = [
        'name' => 50,
        'org_name' => 20,
        'suitable_count' => 5,
        'duration' => 5,
    ];
    foreach ($scalarWeights as $field => $weight) {
        if (trim((string) ($fields[$field] ?? '')) !== '') {
            $total += $weight;
        }
    }

    // List fields: points per entry.
    $total += 3 * count($fields['seasons'] ?? []);
    $total += 3 * count($fields['grade_levels'] ?? []);
    $total += 8 * count($fields['venue_items'] ?? []);

    // Route plans: points per group plus points per item inside the group.
    foreach ($fields['route_plans'] ?? [] as $plan) {
        $total += 10 + 6 * count($plan['items'] ?? []);
    }

    $total += 8 * count($fields['courses'] ?? []);

    // Rich-text sections: base points plus a length bonus capped at 120 characters.
    foreach (['intro_html', 'fee_html', 'implementation_html'] as $htmlField) {
        $plainText = trim(strip_tags((string) ($fields[$htmlField] ?? '')));
        if ($plainText === '') {
            continue;
        }

        $total += 12 + min(mb_strlen($plainText), 120) / 20;
    }

    return (int) $total;
}
/**
* @return array{parsed: array<string, mixed>, warnings: array<int, string>}
*/

Loading…
Cancel
Save