You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

641 lines
19 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<template>
<view class="page-wrap profile-bg crawler-page">
<view class="card form-card">
<!-- Target-type selector: the picker lists typeLabels and reports changes to onTypeChange. -->
<view class="field">
<text class="form-label">入库类型 *</text>
<picker class="picker-field" :range="typeLabels" :value="typeIndex" @change="onTypeChange">
<view class="select">{{ typeLabels[typeIndex] }}</view>
</picker>
</view>
<!-- Crawl-address selector: the picker lists addressLabels (entry 0 is the placeholder) and reports changes to onAddressPick. -->
<view class="field">
<text class="form-label">爬虫地址选填</text>
<picker
class="picker-field"
:range="addressLabels"
:value="addressIndex"
@change="onAddressPick"
>
<view class="select">{{ addressLabels[addressIndex] }}</view>
</picker>
</view>
<!--
  Target URL field. Leaving the input triggers onUrlBlur, which resolves the
  crawl source for the URL. While resolving, a progress hint is shown; once a
  source is resolved its name is shown, followed by the adapter label in
  parentheses so the two texts do not run together. The defaults carried by the
  selected crawl address are shown on a separate hint line.
-->
<view class="field">
  <text class="form-label">目标地址 *</text>
  <input
    v-model="form.request_url"
    class="input"
    placeholder="https:// 列表页或详情页"
    placeholder-class="input-placeholder"
    @blur="onUrlBlur"
  />
  <text v-if="resolving" class="hint">正在识别采集源…</text>
  <text v-else-if="resolvedName" class="hint">已识别:{{ resolvedName }}{{ resolvedAdapter ? `(${formatAdapterLabel(resolvedAdapter)})` : '' }}</text>
  <text v-if="selectedAddressHint" class="hint">{{ selectedAddressHint }}</text>
</view>
<!-- Optional search keywords (at most 500 characters); the placeholder text comes from keywordPlaceholder. -->
<view class="field">
<text class="form-label">搜索关键词(选填)</text>
<textarea
v-model="keyword"
class="textarea"
maxlength="500"
:placeholder="keywordPlaceholder"
placeholder-class="input-placeholder"
/>
</view>
<!--
  Page-count field, bound to maxPages. The input is disabled when the resolved
  adapter is the SJTU AI research-center API.
  Hints: the first paper hint stands alone; the second paper hint starts a
  v-if / v-else-if chain (paper -> industry news -> research-center API ->
  teacher). The final "large batch" teacher hint is an independent v-if: as a
  trailing v-else-if it could never render, because the generic teacher branch
  before it always matched first.
-->
<view v-if="form.target_type === 'paper' || form.target_type === 'industry_news' || form.target_type === 'teacher'" class="field">
  <text class="form-label">抓取页数</text>
  <input v-model.number="maxPages" class="input" type="number" :disabled="isAiSjtuResearchCenter" />
  <text v-if="form.target_type === 'paper'" class="hint">arXiv 按提交时间倒序,每页约 50 条;增大页数可抓取更早论文</text>
  <text v-if="form.target_type === 'paper'" class="hint">已入库论文自动跳过,不计入条数上限</text>
  <text v-else-if="form.target_type === 'industry_news'" class="hint">虎嗅、投资界、清科等列表页建议 35 页;正文将自动补全入库</text>
  <text v-else-if="isAiSjtuResearchCenter" class="hint">交大 AI 研究院研究中心为 API 一次性拉取,无需分页</text>
  <text v-else-if="form.target_type === 'teacher'" class="hint">多页列表(如 Sudy CMS、博山 CMS、交大 tsites请适当增大页数</text>
  <text v-if="form.target_type === 'teacher' && !isAiSjtuResearchCenter" class="hint">大批量抓取时仅部分老师会访问主页补邮箱,避免请求超时</text>
</view>
<!-- Result-limit field, bound to maxResults. Which hints appear depends on the target type and on whether the research-center adapter was resolved. -->
<view class="field">
<text class="form-label">条数上限</text>
<input v-model.number="maxResults" class="input" type="number" />
<text v-if="form.target_type === 'paper'" class="hint">论文最多 200 条</text>
<text v-else-if="form.target_type === 'teacher'" class="hint">师资列表最多 500 条</text>
<text v-if="isAiSjtuResearchCenter" class="hint">将抓取各研究中心「研究团队」成员,含邮箱、电话、研究方向</text>
<text v-else-if="form.target_type === 'industry_news'" class="hint">资讯最多 50 条;同 URL 已入库将跳过(不重写正文,空正文需先删旧记录再重抓)</text>
<text v-if="form.target_type === 'industry_news'" class="hint">来源将使用爬虫地址名称(如交大要闻),不会填「通用资讯 HTML」</text>
</view>
<!-- Starts the crawl via submit(); the loading state follows the submitting flag. -->
<button class="btn btn-primary submit-btn" :loading="submitting" @tap="submit">开始抓取</button>
</view>
<!-- Result card, rendered once lastResult is set. Shows the server summary when present, otherwise the imported count. -->
<view v-if="lastResult" class="card result-card">
<text class="result-title">抓取结果</text>
<text class="result-line">状态:{{ lastResult.status === 'completed' ? '已完成' : lastResult.status }}</text>
<text v-if="lastResult.source_name" class="result-line">采集源:{{ lastResult.source_name }}</text>
<text v-if="lastResult.adapter_code" class="result-line">适配器:{{ formatAdapterLabel(lastResult.adapter_code) }}</text>
<text v-if="lastResult.result_summary" class="result-line">{{ lastResult.result_summary }}</text>
<text v-else class="result-line">已入库 {{ lastResult.items_imported ?? 0 }} 条</text>
<text v-if="lastResult.teacher_duplicates_skipped" class="result-line hint-line">
跳过 {{ lastResult.teacher_duplicates_skipped }} 位:老师库中已有相同邮箱,或同校同院系同名老师
</text>
<text v-if="lastResult.items_fetched" class="result-line">共抓取 {{ lastResult.items_fetched }} 条</text>
</view>
</view>
</template>
<script lang="ts">
import { getShareAppMessage, getShareTimelineMessage } from '@/utils/page-share-handlers'
// Options-style export that attaches the shared share handlers to this page.
// NOTE(review): presumably these are the uni-app share hooks for "send to chat"
// and "share to timeline" — confirm against '@/utils/page-share-handlers'.
export default {
onShareAppMessage: getShareAppMessage,
onShareTimeline: getShareTimelineMessage,
}
</script>
<script setup lang="ts">
import { computed, reactive, ref } from 'vue'
import { onShow } from '@dcloudio/uni-app'
import { crawlerApi, type CrawlTargetType } from '@/api/crawler'
import { crawlAddressApi, type CrawlAddressOption } from '@/api/crawl-addresses'
import { useUserStore } from '@/stores/user'
// User store; onShow reads its login and staff flags to gate access to this page.
const userStore = useUserStore()
// Selectable crawl target types, in the order shown by the type picker.
const typeOptions: { label: string; value: CrawlTargetType }[] = [
{ label: '论文 → 论文库', value: 'paper' },
{ label: '行业资讯 → 资讯管理', value: 'industry_news' },
{ label: '老师库 → 老师库', value: 'teacher' },
]
// Picker labels, index-aligned with typeOptions.
const typeLabels = typeOptions.map((item) => item.label)
// Index into typeOptions of the currently selected type.
const typeIndex = ref(0)
// Raw keyword text from the textarea; trimmed when the request params are built.
const keyword = ref('')
// Page and result limits as entered; clampParams bounds them before submission.
const maxPages = ref(1)
const maxResults = ref(50)
// True while a resolveUrl request is in flight.
const resolving = ref(false)
// True while a crawl submission is in flight.
const submitting = ref(false)
// Outcome of the last successful URL resolution: source name, adapter code,
// and the normalized URL those two values belong to.
const resolvedName = ref('')
const resolvedAdapter = ref('')
const resolvedUrl = ref('')
/** Display labels for the known crawler adapter codes, keyed by adapter code. */
const ADAPTER_LABELS: Record<string, string> = {
  huxiu_html: '虎嗅 API',
  pedaily_html: '投资界',
  faculty_list_html: '师资 HTML',
  generic_news_html: '通用资讯',
  arxiv_api: 'arXiv API',
  ai_sjtu_research_center_api: '交大 AI 研究中心',
}

/**
 * Returns the display label for an adapter code.
 *
 * @param code Adapter code as reported by the backend.
 * @returns The mapped label, or `code` itself when no label is registered.
 */
function formatAdapterLabel(code: string) {
  // Only consult the table's own keys: a bare `ADAPTER_LABELS[code]` would
  // resolve codes such as "toString" or "constructor" to Object.prototype
  // members and return a function instead of a string.
  const label = Object.prototype.hasOwnProperty.call(ADAPTER_LABELS, code)
    ? ADAPTER_LABELS[code]
    : undefined
  return label || code
}
// True when the resolved adapter is the SJTU AI research-center API adapter.
const isAiSjtuResearchCenter = computed(
() => resolvedAdapter.value === 'ai_sjtu_research_center_api',
)
// Response of the most recent successful crawlerApi.submit call; null until one completes.
const lastResult = ref<Awaited<ReturnType<typeof crawlerApi.submit>> | null>(null)
// Crawl addresses loaded for the current target type.
const addressOptions = ref<CrawlAddressOption[]>([])
// Index in the address picker; 0 is the placeholder entry, so option i sits at index i + 1.
const addressIndex = ref(0)
// Id of the selected crawl address, or null when none is selected.
const selectedCrawlAddressId = ref<number | null>(null)
// Defaults copied from the selected crawl address; an empty object when none is selected.
const crawlDefaults = ref<{
category_dict_item_id?: number
category_label?: string
source_name?: string
university_id?: number
university_name?: string
department?: string
adapter_code?: string
}>({})
/** Labels for the address picker: a placeholder entry followed by each crawl address name. */
const addressLabels = computed(() => {
  const names = addressOptions.value.map((option) => option.name)
  return ['请选择爬虫地址', ...names]
})
/** Placeholder text for the keyword textarea, chosen by the current target type. */
const keywordPlaceholder = computed(() => {
  switch (form.target_type) {
    case 'paper':
      return '多个关键词用逗号或换行分隔graph neural, AI'
    case 'industry_news':
      return '多个关键词用空格、逗号或换行分隔,如:融资 科创板 AI'
    default:
      return '多个关键词用空格、逗号或换行分隔'
  }
})
/**
 * One-line summary of the defaults carried by the selected crawl address
 * (news category, department, university, adapter). Empty string when the
 * selected address carries none of them.
 */
const selectedAddressHint = computed(() => {
  const defaults = crawlDefaults.value
  const parts: string[] = []
  if (defaults.category_label) {
    parts.push(`资讯分类:${defaults.category_label}`)
  }
  if (defaults.department) {
    parts.push(`默认院系:${defaults.department}`)
  }
  if (defaults.university_name) {
    parts.push(`默认高校:${defaults.university_name}`)
  }
  if (defaults.adapter_code) {
    parts.push(`采集适配器:${formatAdapterLabel(defaults.adapter_code)}`)
  }
  // Separate the entries so they do not run together on screen; joining an
  // empty list already yields '', so no length check is needed.
  return parts.join(';')
})
// Reactive form state: the selected target type and the URL typed or picked by the user.
// The initial values match the first type option and defaultUrl('paper').
const form = reactive({
target_type: 'paper' as CrawlTargetType,
request_url: 'https://arxiv.org/',
})
/**
 * Starting URL pre-filled for a target type.
 *
 * @param type Selected crawl target type.
 * @returns The arXiv URL for papers, an empty string for teachers, and the
 *          pedaily listing URL for every other type.
 */
function defaultUrl(type: CrawlTargetType) {
  switch (type) {
    case 'paper':
      return 'https://arxiv.org/'
    case 'teacher':
      return ''
    default:
      return 'https://www.pedaily.cn/all/'
  }
}
/**
 * Resets the result and page limits to the defaults of the given target type:
 * teacher 200 / 5, industry news 30 / 5, anything else 50 / 1.
 */
function applyTypeDefaults(type: CrawlTargetType) {
  let defaultResults = 50
  let defaultPages = 1
  if (type === 'teacher') {
    defaultResults = 200
    defaultPages = 5
  } else if (type === 'industry_news') {
    defaultResults = 30
    defaultPages = 5
  }
  maxResults.value = defaultResults
  maxPages.value = defaultPages
}
/**
 * Forces maxResults and maxPages into the range allowed for the target type.
 * A value that is not a usable number (NaN, 0, empty) is replaced by the
 * type's fallback before the value is bounded to [1, cap].
 */
function clampParams(type: CrawlTargetType) {
  const bound = (raw: unknown, cap: number, fallback: number) =>
    Math.min(cap, Math.max(1, Number(raw) || fallback))

  let limits = { resultCap: 50, resultFallback: 30, pageCap: 50, pageFallback: 5 }
  if (type === 'paper') {
    limits = { resultCap: 200, resultFallback: 50, pageCap: 20, pageFallback: 1 }
  } else if (type === 'teacher') {
    limits = { resultCap: 500, resultFallback: 200, pageCap: 50, pageFallback: 5 }
  }

  maxResults.value = bound(maxResults.value, limits.resultCap, limits.resultFallback)
  maxPages.value = bound(maxPages.value, limits.pageCap, limits.pageFallback)
}
/**
 * Trims a user-entered URL and makes sure it carries an http(s) scheme.
 *
 * @param url Raw text from the URL input or from a crawl address.
 * @returns '' for blank input; the trimmed text unchanged when it already
 *          starts with `http://` or `https://` (any letter case); otherwise
 *          the trimmed text prefixed with `https://`.
 */
function normalizeUrl(url: string) {
  const trimmed = url.trim()
  if (!trimmed) return ''
  // Match the full scheme, case-insensitively: a bare "http" prefix test would
  // leave hosts like "httpbin.org" without a scheme and would double-prefix
  // "HTTPS://…".
  return /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`
}
/**
 * Loads the crawl addresses for the current target type and clears the
 * address selection.
 *
 * A failed request is treated as "no addresses": the form stays usable with a
 * manually typed URL. If the target type changes while the request is in
 * flight, the response is discarded so an older, slower response cannot
 * replace the list belonging to the newer type.
 */
async function loadAddresses() {
  const requestedType = form.target_type
  let loaded: CrawlAddressOption[] = []
  try {
    loaded = await crawlAddressApi.options(requestedType)
  } catch {
    // Best effort by design: fall back to an empty list.
    loaded = []
  }
  if (form.target_type !== requestedType) {
    // The type changed while waiting; the load started for the new type owns the state.
    return
  }
  addressOptions.value = loaded
  addressIndex.value = 0
  selectedCrawlAddressId.value = null
  crawlDefaults.value = {}
}
/**
 * Replaces crawlDefaults with the defaults carried by a crawl address.
 * `source_name` is always set from the address name; every other field is
 * copied only when it is truthy on the address.
 */
function applyCrawlDefaults(addr: CrawlAddressOption) {
  const next: typeof crawlDefaults.value = {}
  if (addr.category_dict_item_id) next.category_dict_item_id = addr.category_dict_item_id
  if (addr.category_label) next.category_label = addr.category_label
  next.source_name = addr.name
  if (addr.university_id) next.university_id = addr.university_id
  if (addr.university_name) next.university_name = addr.university_name
  if (addr.department) next.department = addr.department
  if (addr.adapter_code) next.adapter_code = addr.adapter_code
  crawlDefaults.value = next
}
/**
 * Aligns the address picker and crawl defaults with a URL.
 *
 * When no crawl address has this URL (after normalization) the selection and
 * defaults are cleared. When several addresses share the URL, the address that
 * is already selected is kept; only if the current selection does not match is
 * the first matching address chosen. Always taking the first match would
 * silently replace the user's pick (and its keyword and defaults) with another
 * address that has the same URL.
 *
 * @param url URL to match against the loaded crawl addresses.
 * @param options.fillKeyword When true, the keyword field is overwritten with
 *        the matched address's keyword, if it has one.
 */
function syncFromCrawlAddress(url: string, options?: { fillKeyword?: boolean }) {
  const normalized = normalizeUrl(url)
  const candidates = addressOptions.value.filter(
    (item) => normalizeUrl(item.request_url) === normalized,
  )
  const matched =
    candidates.find((item) => item.id === selectedCrawlAddressId.value) ?? candidates[0]
  if (!matched) {
    addressIndex.value = 0
    selectedCrawlAddressId.value = null
    crawlDefaults.value = {}
    return
  }
  // Picker index 0 is the placeholder entry, hence the +1 offset.
  addressIndex.value = addressOptions.value.indexOf(matched) + 1
  selectedCrawlAddressId.value = matched.id
  if (options?.fillKeyword && matched.keyword) {
    keyword.value = matched.keyword
  }
  applyCrawlDefaults(matched)
}
// Runs each time the page is shown: gates access, then (re)initialises the form.
onShow(async () => {
// Not logged in: send the user to the login page and stop.
if (!userStore.isLoggedIn) {
uni.navigateTo({ url: '/subpkg/login/index' })
return
}
try {
await userStore.fetchMe()
} catch {
// A failed refresh is ignored; the staff check below uses whatever the store currently holds.
}
// Non-staff users get a toast and are sent back after 1.2 s.
if (!userStore.isStaff) {
uni.showToast({ title: '无权使用数据爬虫', icon: 'none' })
setTimeout(() => uni.navigateBack(), 1200)
return
}
// Reset the limits for the current type, reload addresses, then resolve the
// pre-filled URL without awaiting the result.
applyTypeDefaults(form.target_type)
await loadAddresses()
if (form.request_url.trim()) {
void onUrlBlur()
}
})
/**
 * Handles a change of the target-type picker: stores the new type, discards
 * everything resolved or typed for the previous type, applies the new type's
 * default URL and limits, and reloads the crawl addresses (not awaited).
 */
function onTypeChange(event: UniHelper.PickerChangeEvent) {
  const pickedIndex = Number(event.detail.value)
  typeIndex.value = pickedIndex
  const nextType = typeOptions[pickedIndex]?.value || 'paper'
  form.target_type = nextType

  // State tied to the previous type no longer applies.
  for (const stale of [resolvedName, resolvedAdapter, resolvedUrl, keyword]) {
    stale.value = ''
  }

  form.request_url = defaultUrl(nextType)
  applyTypeDefaults(nextType)
  void loadAddresses()
}
/**
 * Handles a change of the crawl-address picker. Choosing the placeholder
 * (index 0) clears the selection; choosing an address copies its URL, keyword
 * (when it has one) and defaults into the form and starts URL resolution.
 */
function onAddressPick(event: UniHelper.PickerChangeEvent) {
  const pickedIndex = Number(event.detail.value)
  addressIndex.value = pickedIndex

  if (pickedIndex <= 0) {
    selectedCrawlAddressId.value = null
    crawlDefaults.value = {}
    return
  }

  // Picker index 0 is the placeholder, so options start at index 1.
  const picked = addressOptions.value[pickedIndex - 1]
  if (!picked) return

  selectedCrawlAddressId.value = picked.id
  form.request_url = picked.request_url
  if (picked.keyword) keyword.value = picked.keyword
  applyCrawlDefaults(picked)
  void onUrlBlur()
}
/** Sequence number of the latest resolve request; older requests use it to detect that they were superseded. */
let urlResolveSeq = 0

/**
 * Normalizes the URL in the form, syncs the address picker with it, and asks
 * the backend which source and adapter handle it.
 *
 * An empty URL clears all resolved state and the address selection. On success
 * the resolved name, adapter and URL are stored; for the research-center API
 * adapter the page count is forced to 1 and the result limit raised to at
 * least 200. On failure the resolved state is cleared and a toast is shown.
 *
 * A response is ignored when it is no longer current, i.e. a newer resolve has
 * started, or the target type or URL in the form changed while waiting.
 * Without this check a slow response could overwrite state that belongs to a
 * different URL or type.
 */
async function onUrlBlur() {
  const url = form.request_url.trim()
  if (!url) {
    // Invalidate any in-flight request so it cannot repopulate the cleared state.
    urlResolveSeq += 1
    resolving.value = false
    resolvedName.value = ''
    resolvedAdapter.value = ''
    resolvedUrl.value = ''
    addressIndex.value = 0
    selectedCrawlAddressId.value = null
    crawlDefaults.value = {}
    return
  }

  const normalized = normalizeUrl(url)
  if (normalized !== url) {
    form.request_url = normalized
  }
  // syncFromCrawlAddress does its own matching and is a no-op for the keyword
  // when nothing matches, so no separate pre-scan is needed.
  syncFromCrawlAddress(normalized, { fillKeyword: true })

  urlResolveSeq += 1
  const requestId = urlResolveSeq
  const requestedType = form.target_type
  const isStale = () =>
    requestId !== urlResolveSeq ||
    form.target_type !== requestedType ||
    normalizeUrl(form.request_url) !== normalized

  resolving.value = true
  try {
    const res = await crawlerApi.resolveUrl({
      request_url: normalized,
      target_type: requestedType,
      crawl_address_id: selectedCrawlAddressId.value ?? undefined,
    })
    if (isStale()) return
    resolvedName.value = res.source_name
    resolvedAdapter.value = res.adapter_code || ''
    resolvedUrl.value = normalized
    if (res.adapter_code === 'ai_sjtu_research_center_api') {
      maxPages.value = 1
      maxResults.value = Math.max(maxResults.value, 200)
    }
  } catch (error) {
    if (isStale()) return
    resolvedName.value = ''
    resolvedAdapter.value = ''
    resolvedUrl.value = ''
    uni.showToast({
      title: error instanceof Error ? error.message : '无法识别该地址',
      icon: 'none',
    })
  } finally {
    // Only the latest request owns the spinner; a superseded one must not hide it.
    if (requestId === urlResolveSeq) {
      resolving.value = false
    }
  }
}
/**
 * Makes sure the URL currently in the form has been resolved to a source.
 *
 * Shows a toast and returns false when the URL is empty. Otherwise normalizes
 * the URL in the form and re-runs onUrlBlur unless a resolved name already
 * exists for exactly this normalized URL.
 *
 * @returns true when a resolved source name is available afterwards.
 */
async function ensureResolved(): Promise<boolean> {
  const entered = form.request_url.trim()
  if (entered === '') {
    uni.showToast({ title: '请填写目标地址', icon: 'none' })
    return false
  }

  const normalized = normalizeUrl(entered)
  if (entered !== normalized) {
    form.request_url = normalized
  }

  const alreadyResolved = Boolean(resolvedName.value) && resolvedUrl.value === normalized
  if (!alreadyResolved) {
    await onUrlBlur()
  }
  return Boolean(resolvedName.value)
}
/**
 * Builds the `params` object of a crawl request from the form.
 * The limits are clamped first, so the values sent are always within range.
 * `max_pages` is included for the paper, industry-news and teacher types.
 */
function buildParams(): Record<string, unknown> {
  clampParams(form.target_type)
  const params: Record<string, unknown> = {
    keyword: keyword.value.trim(),
    max_results: maxResults.value,
  }
  const pagedTypes: CrawlTargetType[] = ['paper', 'industry_news', 'teacher']
  if (pagedTypes.includes(form.target_type)) {
    params.max_pages = maxPages.value
  }
  return params
}
/**
 * Builds the toast text shown after a crawl finishes.
 *
 * @param result Response of crawlerApi.submit.
 * @returns A per-type summary of what was imported (missing counts read as 0),
 *          or a generic completion message for any other target type.
 */
function buildToastMessage(result: Awaited<ReturnType<typeof crawlerApi.submit>>): string {
  switch (result.target_type) {
    case 'teacher': {
      const importedCount = result.items_imported ?? 0
      const skippedCount = result.teacher_duplicates_skipped ?? 0
      return skippedCount > 0
        ? `已入库${importedCount}位,跳过${skippedCount}位`
        : `已入库${importedCount}位老师`
    }
    case 'paper': {
      // Prefer the paper-specific counter, then the generic one.
      const paperCount = result.papers_imported ?? result.items_imported ?? 0
      return `已入库${paperCount}篇论文`
    }
    case 'industry_news': {
      const newsCount = result.items_imported ?? 0
      return `已入库${newsCount}条资讯`
    }
    default:
      return '抓取完成'
  }
}
/**
 * Determines the source name to store with crawled news.
 *
 * @param url Request URL of the crawl.
 * @returns The source name from the selected crawl address defaults when set;
 *          otherwise the name of the first crawl address whose normalized URL
 *          equals the normalized `url`; otherwise ''.
 */
function resolveNewsSourceName(url: string): string {
  const fromDefaults = crawlDefaults.value.source_name
  if (fromDefaults) return fromDefaults

  const target = normalizeUrl(url)
  for (const option of addressOptions.value) {
    if (normalizeUrl(option.request_url) === target) {
      return option.name || ''
    }
  }
  return ''
}
/**
 * Builds the `news_defaults` part of a crawl request.
 *
 * @param url Request URL of the crawl, used to look up the source name.
 * @returns An object holding the category id and/or source name that are
 *          available, or undefined when neither is.
 */
function buildNewsDefaults(url: string) {
  const categoryId = crawlDefaults.value.category_dict_item_id
  const sourceName = resolveNewsSourceName(url)
  if (!categoryId && !sourceName) return undefined

  const defaults: NonNullable<Parameters<typeof crawlerApi.submit>[0]['news_defaults']> = {}
  if (categoryId) defaults.category_dict_item_id = categoryId
  if (sourceName) defaults.source = sourceName
  return defaults
}
/**
 * Validates the form, builds the crawl request and submits it.
 *
 * - Ignores the tap while a submission is already running, so a double tap
 *   cannot start two crawls.
 * - Reports an empty URL with its own message. The check lives here because
 *   the toast shown by ensureResolved for that case would be replaced at once
 *   by the generic "cannot resolve" toast below.
 * - On success stores the response in lastResult and shows a summary toast;
 *   on failure shows the error message.
 */
async function submit() {
  if (submitting.value) return

  if (!form.request_url.trim()) {
    uni.showToast({ title: '请填写目标地址', icon: 'none' })
    return
  }

  // Set before resolving so the button is busy for the whole operation.
  submitting.value = true
  try {
    if (!(await ensureResolved())) {
      uni.showToast({ title: '无法识别该地址,请检查入库类型与 URL', icon: 'none' })
      return
    }

    // Re-sync the selection with the final URL, keeping the keyword the user typed.
    syncFromCrawlAddress(form.request_url, { fillKeyword: false })
    const normalizedUrl = normalizeUrl(form.request_url)
    const payload: Parameters<typeof crawlerApi.submit>[0] = {
      target_type: form.target_type,
      request_url: normalizedUrl,
      params: buildParams(),
    }
    if (selectedCrawlAddressId.value) {
      payload.crawl_address_id = selectedCrawlAddressId.value
    }
    if (form.target_type === 'industry_news') {
      const newsDefaults = buildNewsDefaults(normalizedUrl)
      if (newsDefaults) {
        payload.news_defaults = newsDefaults
      }
    }
    if (form.target_type === 'teacher') {
      const teacherDefaults: NonNullable<Parameters<typeof crawlerApi.submit>[0]['teacher_defaults']> = {}
      if (crawlDefaults.value.university_id) {
        teacherDefaults.university_id = crawlDefaults.value.university_id
      }
      if (crawlDefaults.value.department) {
        teacherDefaults.department = crawlDefaults.value.department
      }
      // Omit the field entirely when the address carries no teacher defaults.
      if (Object.keys(teacherDefaults).length > 0) {
        payload.teacher_defaults = teacherDefaults
      }
    }

    lastResult.value = await crawlerApi.submit(payload)
    uni.showToast({
      title: buildToastMessage(lastResult.value),
      icon: 'none',
      duration: 2500,
    })
  } catch (error) {
    uni.showToast({
      title: error instanceof Error ? error.message : '抓取失败',
      icon: 'none',
    })
  } finally {
    submitting.value = false
  }
}
</script>
<style scoped lang="scss">
@import '@/styles/page.scss';
@import '@/styles/card.scss';
@import '@/styles/tokens.scss';
// Page root.
.crawler-page {
box-sizing: border-box;
}
// Shared layout of the form card and the result card.
.form-card,
.result-card {
box-sizing: border-box;
width: 100%;
margin-bottom: $section-gap;
overflow: hidden;
}
.form-card {
padding: 32rpx;
}
.result-card {
padding: 32rpx;
}
// One labelled form row.
.field {
box-sizing: border-box;
width: 100%;
margin-bottom: 24rpx;
}
// No gap below the final row.
.field:last-of-type {
margin-bottom: 0;
}
.form-label {
display: block;
margin-bottom: 12rpx;
color: #374151;
font-size: 28rpx;
}
.picker-field {
display: block;
width: 100%;
}
// Common box model for every control so they all span the full row width.
.select,
.input,
.textarea {
box-sizing: border-box;
display: block;
width: 100%;
max-width: 100%;
}
// Visible part of a picker, styled to look like the text inputs.
.select {
display: flex;
align-items: center;
min-height: 84rpx;
padding: 0 24rpx;
border: 1px solid #d6dde8;
border-radius: 16rpx;
background: #fff;
font-size: 28rpx;
line-height: 1.4;
color: #111827;
}
.input {
height: 84rpx;
padding: 0 24rpx;
border: 1px solid #d6dde8;
border-radius: 16rpx;
background: #fff;
font-size: 28rpx;
}
// Greyed-out look for a disabled input.
.input[disabled] {
background: #f3f4f6;
color: #9ca3af;
}
.textarea {
min-height: 240rpx;
padding: 24rpx;
border: 1px solid #d6dde8;
border-radius: 16rpx;
background: #fff;
font-size: 28rpx;
line-height: 1.6;
}
// Referenced through the placeholder-class attribute of the inputs and textarea.
.input-placeholder {
color: #9ca3af;
}
// Small helper text below a control.
.hint {
display: block;
margin-top: 12rpx;
color: #6b7280;
font-size: 24rpx;
}
.submit-btn {
width: 100%;
min-height: 88rpx;
margin-top: 32rpx;
border-radius: 16rpx;
font-size: 30rpx;
font-weight: 500;
}
// NOTE(review): presumably removes the platform's default button border drawn via ::after — confirm.
.submit-btn::after {
border: none;
}
.result-title {
display: block;
margin-bottom: 16rpx;
color: #111827;
font-size: 30rpx;
font-weight: 500;
}
.result-line {
display: block;
margin-top: 8rpx;
color: #4b5563;
font-size: 26rpx;
line-height: 1.5;
}
// Muted variant of a result line.
.hint-line {
color: #6b7280;
font-size: 24rpx;
}
// NOTE(review): no element in the visible template uses this class — confirm whether it is still needed.
.result-summary {
margin-top: 8rpx;
line-height: 1.6;
}
</style>