|
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
namespace Tests\Unit;
|
|
|
|
|
|
|
|
|
|
use App\Services\Crawl\NewsContentHtml;
|
|
|
|
|
use PHPUnit\Framework\TestCase;
|
|
|
|
|
|
|
|
|
|
/**
 * Unit tests for the static helpers NewsContentHtml::extractBody() and
 * NewsContentHtml::normalize(), driven by small inline HTML fixtures.
 */
class NewsContentHtmlTest extends TestCase
{
    /**
     * extractBody() must return a non-null result that still contains the
     * first paragraph and the image path found inside the `Article_content`
     * container of the fixture.
     */
    public function test_extracts_sjtu_article_content(): void
    {
        // The second paragraph's own text says it is padded so the fixture
        // passes an extraction length threshold; the threshold itself lives
        // in NewsContentHtml and is not visible from this test.
        $markup = <<<'HTML'
        <div class="Article_content">
        <p>正文段落一</p>
        <p><img src="/resource/upload/202604/a.png" alt="图"/></p>
        <p>正文段落二,需要足够长的文字才能通过提取阈值校验。</p>
        </div>
        <div class="Article-source">来源</div>
        HTML;

        $extracted = NewsContentHtml::extractBody($markup);

        self::assertNotNull($extracted);

        // Checked in the same order as before: paragraph text, then image path.
        foreach (['正文段落一', '/resource/upload/'] as $expectedFragment) {
            self::assertStringContainsString($expectedFragment, $extracted);
        }
    }

    /**
     * normalize() must turn a root-relative image `src` into an absolute URL
     * on the host of the page URL passed as the second argument.
     */
    public function test_resolves_relative_image_src(): void
    {
        $fragment = '<img src="/resource/upload/a.png">';
        $pageUrl = 'https://news.sjtu.edu.cn/jdyw/20260408/221279.html';

        $result = NewsContentHtml::normalize($fragment, $pageUrl);

        self::assertStringContainsString(
            'https://news.sjtu.edu.cn/resource/upload/a.png',
            $result
        );
    }

    /**
     * extractBody() must return a non-null result containing the first
     * paragraph of an `article_main` block nested inside `article_con`.
     */
    public function test_extracts_pedata_article_main(): void
    {
        // Both paragraphs are deliberately long; the second one states that
        // the text should exceed thirty characters after strip_tags.
        $markup = <<<'HTML'
        <div class="article_con">
        <div class="article_main">
        <p>清科研究中心正文段落一,包含足够长的文字用于通过正文提取阈值校验。</p>
        <p>第二段正文内容继续补充长度,确保 strip_tags 后超过三十个字符。</p>
        </div>
        </div>
        <div class="news_all_text">版权声明</div>
        HTML;

        $extracted = NewsContentHtml::extractBody($markup);

        self::assertNotNull($extracted);
        self::assertStringContainsString('清科研究中心正文段落一', $extracted);
    }
}
|