From 745faa0ecc7076ac6fe8ea4ef31f2587a7be855e Mon Sep 17 00:00:00 2001 From: zhaojunlong <5482498@qq.com> Date: Sun, 14 Dec 2025 19:21:19 +0800 Subject: [PATCH] =?UTF-8?q?```=20feat(public):=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E6=8C=89=E6=95=B0=E9=87=8F=E6=8A=93=E5=8F=96=E5=A4=9A=E9=A1=B5?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在普通模式下,支持根据用户指定的数量抓取多页列表数据,直到满足所需数量或达到最大页数限制。增加分页请求逻辑与延时控制,提升数据获取稳定性。 feat(server): 改进详情页解析与预算金额提取逻辑 增强标题、发布时间和正文内容的选择器容错能力,支持多种页面结构。优化预算金额提取规则,引入优先级匹配机制,并支持元转万元计算,提高数据准确性。 ``` --- public/app.js | 38 ++++++++++++-- src/server.js | 135 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 146 insertions(+), 27 deletions(-) diff --git a/public/app.js b/public/app.js index ff51546..9294624 100644 --- a/public/app.js +++ b/public/app.js @@ -134,10 +134,42 @@ async function fetchDetails() { listData = await dateRangeResponse.json(); } else { - // 普通模式 + // 普通模式 - 按数量抓取多页 const url = document.getElementById('detailUrl').value; - const listResponse = await fetch(`${API_BASE}/list?url=${encodeURIComponent(url)}`); - listData = await listResponse.json(); + const limit = parseInt(document.getElementById('detailLimit').value); + + // 抓取多页直到获得足够数量 + const allItems = []; + let page = 1; + const maxPagesToFetch = Math.ceil(limit / 10) + 1; // 假设每页约10条 + + while (allItems.length < limit && page <= maxPagesToFetch) { + const listResponse = await fetch(`${API_BASE}/list?url=${encodeURIComponent(url)}&page=${page}`); + const pageData = await listResponse.json(); + + if (!pageData.success) { + if (allItems.length === 0) { + results.innerHTML = `
错误: ${pageData.error}
`; + loading.classList.remove('active'); + return; + } + break; + } + + if (pageData.data.length === 0) { + break; + } + + allItems.push(...pageData.data); + page++; + + // 如果还需要更多数据且未到达上限,稍作延迟 + if (allItems.length < limit && page <= maxPagesToFetch) { + await new Promise(resolve => setTimeout(resolve, 500)); + } + } + + listData = { success: true, data: allItems }; } if (!listData.success) { diff --git a/src/server.js b/src/server.js index a8f4d26..cd74722 100644 --- a/src/server.js +++ b/src/server.js @@ -14,11 +14,12 @@ app.use(express.static('public')); const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/'; // 获取分页URL -function getPageUrl(pageIndex) { +function getPageUrl(pageIndex, baseUrl = BASE_URL) { if (pageIndex === 0) { - return BASE_URL; + return baseUrl; } - return `${BASE_URL}index_${pageIndex}.html`; + const cleanBaseUrl = baseUrl.replace(/\/$/, ''); + return `${cleanBaseUrl}/index_${pageIndex}.html`; } // 检查日期是否在范围内 @@ -151,18 +152,49 @@ function parseList(html) { function parseDetail(html) { const $ = cheerio.load(html); - const title = $('.title18').text().trim(); + // 尝试多种标题选择器 + let title = $('.title18').text().trim(); + if (!title) { + title = $('.article-info h1').text().trim(); + } + if (!title) { + title = $('h1').first().text().trim(); + } + // 尝试提取发布时间 const publishTd = $('td:contains("发布部门")').filter((_, el) => { return $(el).text().includes('发布时间'); }); const publishText = publishTd.text().trim(); + let timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/); + let publishTime = timeMatch ? timeMatch[1] : ''; - const timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/); - const publishTime = timeMatch ? timeMatch[1] : ''; + // 如果第一种方式没找到,尝试其他方式 + if (!publishTime) { + const infoText = $('.info-sources').text() || $('body').text(); + timeMatch = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/); + publishTime = timeMatch ? timeMatch[1] : ''; + } - const contentTd = $('.zhenwen td').first(); - const content = contentTd.text().trim(); + // 尝试多种内容选择器 + let content = ''; + const contentSelectors = [ + '.zhenwen td', // 原有格式 + '.con', // 新格式(宁易新系统) + '.article-content', // 通用格式 + '.ewb-article-content', + 'body' // 兜底方案 + ]; + + for (const selector of contentSelectors) { + const el = $(selector).first(); + if (el.length > 0) { + const text = el.text().trim(); + if (text.length > content.length) { + content = text; + } + } + } const budget = extractBudget(content); @@ -175,27 +207,54 @@ function parseDetail(html) { } function extractBudget(content) { + // 直接定义金额匹配模式(从高优先级到低优先级) const patterns = [ - /预算金额[::]\s*(\d+(?:\.\d+)?)\s*万元/, - /最高限价[::]\s*(\d+(?:\.\d+)?)\s*万元/, - /预算[::]\s*(\d+(?:\.\d+)?)\s*万元/, - /金额[::]\s*(\d+(?:\.\d+)?)\s*万元/, - /(\d+(?:\.\d+)?)\s*万元/, + // 优先级1: 带货币符号的万元 + { regex: /(?:¥|¥|人民币)\s*([\d,,]+(?:\.\d+)?)\s*万元/i, priority: 1 }, + + // 优先级2: 括号内的金额(元) + { regex: /[((][¥¥]([\d,,]+(?:\.\d+)?)[))]/i, priority: 2, divider: 10000 }, + + // 优先级3: 普通万元格式 + { regex: /([\d,,]+(?:\.\d+)?)\s*万元/i, priority: 3 }, + + // 优先级4: 带货币符号的元(转万元) + { regex: /(?:¥|¥|人民币)\s*([\d,,]+(?:\.\d+)?)\s*元/i, priority: 4, divider: 10000 }, + + // 优先级5: 普通元格式(转万元,最低优先级,排除"元整"避免误匹配中文大写) + { regex: /([\d,,]+(?:\.\d+)?)\s*元(?!整)/i, priority: 5, divider: 10000 } ]; + let bestMatch = null; + let bestPriority = Infinity; + + // 遍历所有模式,找到优先级最高的匹配 for (const pattern of patterns) { - const match = content.match(pattern); - if (match) { - const amount = parseFloat(match[1]); - return { - amount, - unit: '万元', - text: match[0], - }; + const match = content.match(pattern.regex); + if (match && pattern.priority < bestPriority) { + // 清理数字中的逗号并转换 + const numberStr = match[1].replace(/[,,]/g, ''); + let amount = parseFloat(numberStr); + + // 如果是元单位,转换为万元 + if (pattern.divider) { + amount = amount / pattern.divider; + } + + // 验证金额合理性(0.01万元到1000000万元之间) + if (!isNaN(amount) && amount >= 0.01 && amount <= 1000000) { + bestMatch = { + amount, + unit: '万元', + text: match[0], + originalUnit: pattern.divider ? '元' : '万元' + }; + bestPriority = pattern.priority; + } } } - return null; + return bestMatch; } // API 路由 @@ -270,8 +329,36 @@ app.post('/api/report', async (req, res) => { const { limit = 15, threshold = 50, url } = req.body; const targetUrl = url && url.trim() !== '' ? url : BASE_URL; - const listHtml = await fetchHtml(targetUrl); - const items = parseList(listHtml); + // 按需抓取多页以获取足够的数据 + const items = []; + let pageIndex = 0; + const maxPagesToFetch = Math.ceil(limit / 10) + 1; // 假设每页约10条,多抓一页保险 + + while (items.length < limit && pageIndex < maxPagesToFetch) { + const pageUrl = getPageUrl(pageIndex, targetUrl); + console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`); + + try { + const html = await fetchHtml(pageUrl); + const pageItems = parseList(html); + + if (pageItems.length === 0) { + console.log(`第 ${pageIndex + 1} 页没有数据,停止抓取`); + break; + } + + items.push(...pageItems); + pageIndex++; + + if (items.length < limit && pageIndex < maxPagesToFetch) { + await new Promise(resolve => setTimeout(resolve, 500)); + } + } catch (err) { + console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`); + break; + } + } + const results = []; const toFetch = items.slice(0, limit);