You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
37 lines
1.1 KiB
37 lines
1.1 KiB
|
6 days ago
|
<?php
|
||
|
|
|
||
|
|
namespace Tests\Unit;
|
||
|
|
|
||
|
|
use App\Services\Crawl\NewsContentHtml;
|
||
|
|
use PHPUnit\Framework\TestCase;
|
||
|
|
|
||
|
|
/**
 * Unit tests for the static helpers on NewsContentHtml.
 *
 * Both tests feed small inline HTML fixtures to the helper and check the
 * returned string for expected substrings; no network or filesystem access
 * is performed by the tests themselves.
 */
class NewsContentHtmlTest extends TestCase
{
    /**
     * extractBody() must return a non-null result for markup wrapped in a
     * `div.Article_content` container, and that result must still contain
     * the first paragraph's text and the image's upload path.
     *
     * NOTE(review): the fixture also includes a trailing `div.Article-source`
     * element, but nothing here asserts whether it is kept or stripped.
     */
    public function test_extracts_sjtu_article_content(): void
    {
        // Nowdoc: the fixture is taken literally, with no variable interpolation.
        $html = <<<'HTML'
        <div class="Article_content">
        <p>正文段落一</p>
        <p><img src="/resource/upload/202604/a.png" alt="图"/></p>
        <p>正文段落二,需要足够长的文字才能通过提取阈值校验。</p>
        </div>
        <div class="Article-source">来源</div>
        HTML;

        $body = NewsContentHtml::extractBody($html);

        // Guard first so the substring assertions below never receive null.
        $this->assertNotNull($body);
        $this->assertStringContainsString('正文段落一', $body);
        $this->assertStringContainsString('/resource/upload/', $body);
    }

    /**
     * normalize() must rewrite a root-relative image `src` into an absolute
     * URL on the host of the page URL passed as the second argument.
     */
    public function test_resolves_relative_image_src(): void
    {
        $normalized = NewsContentHtml::normalize(
            '<img src="/resource/upload/a.png">',
            'https://news.sjtu.edu.cn/jdyw/20260408/221279.html'
        );

        $this->assertStringContainsString('https://news.sjtu.edu.cn/resource/upload/a.png', $normalized);
    }
}
|