feat(public): 实现按数量抓取多页数据功能

在普通模式下,支持根据用户指定的数量抓取多页列表数据,直到满足所需数量或达到最大页数限制。增加分页请求逻辑与延时控制,提升数据获取稳定性。

feat(server): 改进详情页解析与预算金额提取逻辑

增强标题、发布时间和正文内容的选择器容错能力,支持多种页面结构。优化预算金额提取规则,引入优先级匹配机制,并支持元转万元计算,提高数据准确性。
```
This commit is contained in:
2025-12-14 19:21:19 +08:00
parent 83a8a3bb9a
commit 745faa0ecc
2 changed files with 146 additions and 27 deletions

View File

@@ -14,11 +14,12 @@ app.use(express.static('public'));
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';
/**
 * Builds the URL of one page of a paginated list.
 *
 * Page 0 is the base URL itself, returned untouched. Every later page is
 * addressed as `index_<pageIndex>.html` inside the same directory.
 *
 * @param {number} pageIndex - Zero-based page number.
 * @param {string} [baseUrl=BASE_URL] - List URL to paginate. It may end with
 *   any number of slashes, or with an `index.html` / `index_<n>.html` file
 *   name; both are stripped before the page file name is appended.
 * @returns {string} The URL of the requested page.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  if (pageIndex === 0) {
    return baseUrl;
  }

  // Reduce the base to its directory so the result never contains
  // "index.html/index_2.html" or a doubled slash.
  const directoryUrl = baseUrl
    .replace(/\/index(?:_\d+)?\.html$/, '')
    .replace(/\/+$/, '');

  return `${directoryUrl}/index_${pageIndex}.html`;
}
// 检查日期是否在范围内
@@ -151,18 +152,49 @@ function parseList(html) {
function parseDetail(html) {
const $ = cheerio.load(html);
const title = $('.title18').text().trim();
// 尝试多种标题选择器
let title = $('.title18').text().trim();
if (!title) {
title = $('.article-info h1').text().trim();
}
if (!title) {
title = $('h1').first().text().trim();
}
// 尝试提取发布时间
const publishTd = $('td:contains("发布部门")').filter((_, el) => {
return $(el).text().includes('发布时间');
});
const publishText = publishTd.text().trim();
let timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
let publishTime = timeMatch ? timeMatch[1] : '';
const timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
const publishTime = timeMatch ? timeMatch[1] : '';
// 如果第一种方式没找到,尝试其他方式
if (!publishTime) {
const infoText = $('.info-sources').text() || $('body').text();
timeMatch = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
publishTime = timeMatch ? timeMatch[1] : '';
}
const contentTd = $('.zhenwen td').first();
const content = contentTd.text().trim();
// 尝试多种内容选择器
let content = '';
const contentSelectors = [
'.zhenwen td', // 原有格式
'.con', // 新格式(宁易新系统)
'.article-content', // 通用格式
'.ewb-article-content',
'body' // 兜底方案
];
for (const selector of contentSelectors) {
const el = $(selector).first();
if (el.length > 0) {
const text = el.text().trim();
if (text.length > content.length) {
content = text;
}
}
}
const budget = extractBudget(content);
@@ -175,27 +207,54 @@ function parseDetail(html) {
}
/**
 * Extracts a budget amount from free text and normalises it to 万元
 * (units of 10,000 yuan).
 *
 * Patterns are tried from highest to lowest priority. Within one pattern every
 * occurrence is examined, and the first amount that falls inside the accepted
 * range (0.01 to 1,000,000 万元) is returned. Amounts written in 元 are divided
 * by 10,000.
 *
 * @param {string} content - Text to search.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   The matched amount in 万元, the matched text and the unit it was written
 *   in, or `null` when no plausible amount is found or `content` is not a
 *   non-empty string.
 */
function extractBudget(content) {
  if (typeof content !== 'string' || content === '') {
    return null;
  }

  const MIN_AMOUNT = 0.01;
  const MAX_AMOUNT = 1000000;
  const YUAN_PER_WAN = 10000;

  // Listed from highest to lowest priority. \uFFE5, \uFF08 and \uFF09 are the
  // full-width yen sign and parentheses, accepted next to the half-width forms.
  const patterns = [
    // 1: currency-marked amount in 万元
    { regex: /(?:¥|\uFFE5|人民币)\s*([\d,]+(?:\.\d+)?)\s*万元/g },
    // 2: parenthesised amount in 元
    { regex: /[(\uFF08][¥\uFFE5]([\d,]+(?:\.\d+)?)[)\uFF09]/g, divider: YUAN_PER_WAN },
    // 3: plain amount in 万元
    { regex: /([\d,]+(?:\.\d+)?)\s*万元/g },
    // 4: currency-marked amount in 元
    { regex: /(?:¥|\uFFE5|人民币)\s*([\d,]+(?:\.\d+)?)\s*元/g, divider: YUAN_PER_WAN },
    // 5: plain amount in 元; "元整" is excluded by the lookahead
    { regex: /([\d,]+(?:\.\d+)?)\s*元(?!整)/g, divider: YUAN_PER_WAN },
  ];

  for (const { regex, divider } of patterns) {
    // matchAll works on a copy of the regex, so the /g flag leaves no
    // lastIndex state behind between calls.
    for (const match of content.matchAll(regex)) {
      const parsed = Number.parseFloat(match[1].replace(/,/g, ''));
      const amount = divider ? parsed / divider : parsed;

      // Skip implausible hits and keep scanning instead of giving up on
      // this pattern after its first occurrence.
      if (Number.isNaN(amount) || amount < MIN_AMOUNT || amount > MAX_AMOUNT) {
        continue;
      }

      return {
        amount,
        unit: '万元',
        text: match[0],
        originalUnit: divider ? '元' : '万元',
      };
    }
  }

  return null;
}
// API 路由
@@ -270,8 +329,36 @@ app.post('/api/report', async (req, res) => {
const { limit = 15, threshold = 50, url } = req.body;
const targetUrl = url && url.trim() !== '' ? url : BASE_URL;
const listHtml = await fetchHtml(targetUrl);
const items = parseList(listHtml);
// 按需抓取多页以获取足够的数据
const items = [];
let pageIndex = 0;
const maxPagesToFetch = Math.ceil(limit / 10) + 1; // 假设每页约10条多抓一页保险
while (items.length < limit && pageIndex < maxPagesToFetch) {
const pageUrl = getPageUrl(pageIndex, targetUrl);
console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);
try {
const html = await fetchHtml(pageUrl);
const pageItems = parseList(html);
if (pageItems.length === 0) {
console.log(`${pageIndex + 1} 页没有数据,停止抓取`);
break;
}
items.push(...pageItems);
pageIndex++;
if (items.length < limit && pageIndex < maxPagesToFetch) {
await new Promise(resolve => setTimeout(resolve, 500));
}
} catch (err) {
console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
break;
}
}
const results = [];
const toFetch = items.slice(0, limit);