```
feat(public): 实现按数量抓取多页数据功能

在普通模式下,支持根据用户指定的数量抓取多页列表数据,直到满足所需数量或达到最大页数限制。
增加分页请求逻辑与延时控制,提升数据获取稳定性。

feat(server): 改进详情页解析与预算金额提取逻辑

增强标题、发布时间和正文内容的选择器容错能力,支持多种页面结构。
优化预算金额提取规则,引入优先级匹配机制,并支持元转万元计算,提高数据准确性。
```
This commit is contained in:
@@ -134,10 +134,42 @@ async function fetchDetails() {
|
|||||||
|
|
||||||
listData = await dateRangeResponse.json();
|
listData = await dateRangeResponse.json();
|
||||||
} else {
|
} else {
|
||||||
// 普通模式
|
// 普通模式 - 按数量抓取多页
|
||||||
const url = document.getElementById('detailUrl').value;
|
const url = document.getElementById('detailUrl').value;
|
||||||
const listResponse = await fetch(`${API_BASE}/list?url=${encodeURIComponent(url)}`);
|
const limit = parseInt(document.getElementById('detailLimit').value);
|
||||||
listData = await listResponse.json();
|
|
||||||
|
// 抓取多页直到获得足够数量
|
||||||
|
const allItems = [];
|
||||||
|
let page = 1;
|
||||||
|
const maxPagesToFetch = Math.ceil(limit / 10) + 1; // 假设每页约10条
|
||||||
|
|
||||||
|
while (allItems.length < limit && page <= maxPagesToFetch) {
|
||||||
|
const listResponse = await fetch(`${API_BASE}/list?url=${encodeURIComponent(url)}&page=${page}`);
|
||||||
|
const pageData = await listResponse.json();
|
||||||
|
|
||||||
|
if (!pageData.success) {
|
||||||
|
if (allItems.length === 0) {
|
||||||
|
results.innerHTML = `<div class="error">错误: ${pageData.error}</div>`;
|
||||||
|
loading.classList.remove('active');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pageData.data.length === 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
allItems.push(...pageData.data);
|
||||||
|
page++;
|
||||||
|
|
||||||
|
// 如果还需要更多数据且未到达上限,稍作延迟
|
||||||
|
if (allItems.length < limit && page <= maxPagesToFetch) {
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 500));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
listData = { success: true, data: allItems };
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!listData.success) {
|
if (!listData.success) {
|
||||||
|
|||||||
135
src/server.js
135
src/server.js
@@ -14,11 +14,12 @@ app.use(express.static('public'));
|
|||||||
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';
|
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';
|
||||||
|
|
||||||
// 获取分页URL
|
// 获取分页URL
|
||||||
/**
 * Build the URL of one page of a paginated list.
 *
 * Page 0 is the list URL itself; later pages are addressed as
 * `index_<pageIndex>.html` inside the same directory.
 *
 * @param {number} pageIndex - Zero-based page index.
 * @param {string} [baseUrl=BASE_URL] - List URL. May be a directory URL
 *   (with or without trailing slashes) or point at an `index.html` /
 *   `index_N.html` file inside that directory.
 * @returns {string} `baseUrl` unchanged for page 0, otherwise the URL of the
 *   requested page.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  if (pageIndex === 0) {
    return baseUrl;
  }

  // Reduce the base to its directory: a caller-supplied URL may end in a page
  // file name (e.g. ".../index.html") or in several slashes, and appending
  // "/index_N.html" to either would produce a broken address.
  const directoryUrl = baseUrl
    .trim()
    .replace(/\/index(?:_\d+)?\.html?$/i, '')
    .replace(/\/+$/, '');

  return `${directoryUrl}/index_${pageIndex}.html`;
}
|
||||||
|
|
||||||
// 检查日期是否在范围内
|
// 检查日期是否在范围内
|
||||||
@@ -151,18 +152,49 @@ function parseList(html) {
|
|||||||
function parseDetail(html) {
|
function parseDetail(html) {
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
const title = $('.title18').text().trim();
|
// 尝试多种标题选择器
|
||||||
|
let title = $('.title18').text().trim();
|
||||||
|
if (!title) {
|
||||||
|
title = $('.article-info h1').text().trim();
|
||||||
|
}
|
||||||
|
if (!title) {
|
||||||
|
title = $('h1').first().text().trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 尝试提取发布时间
|
||||||
const publishTd = $('td:contains("发布部门")').filter((_, el) => {
|
const publishTd = $('td:contains("发布部门")').filter((_, el) => {
|
||||||
return $(el).text().includes('发布时间');
|
return $(el).text().includes('发布时间');
|
||||||
});
|
});
|
||||||
const publishText = publishTd.text().trim();
|
const publishText = publishTd.text().trim();
|
||||||
|
let timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
|
||||||
|
let publishTime = timeMatch ? timeMatch[1] : '';
|
||||||
|
|
||||||
const timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
|
// 如果第一种方式没找到,尝试其他方式
|
||||||
const publishTime = timeMatch ? timeMatch[1] : '';
|
if (!publishTime) {
|
||||||
|
const infoText = $('.info-sources').text() || $('body').text();
|
||||||
|
timeMatch = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
|
||||||
|
publishTime = timeMatch ? timeMatch[1] : '';
|
||||||
|
}
|
||||||
|
|
||||||
const contentTd = $('.zhenwen td').first();
|
// 尝试多种内容选择器
|
||||||
const content = contentTd.text().trim();
|
let content = '';
|
||||||
|
const contentSelectors = [
|
||||||
|
'.zhenwen td', // 原有格式
|
||||||
|
'.con', // 新格式(宁易新系统)
|
||||||
|
'.article-content', // 通用格式
|
||||||
|
'.ewb-article-content',
|
||||||
|
'body' // 兜底方案
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of contentSelectors) {
|
||||||
|
const el = $(selector).first();
|
||||||
|
if (el.length > 0) {
|
||||||
|
const text = el.text().trim();
|
||||||
|
if (text.length > content.length) {
|
||||||
|
content = text;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const budget = extractBudget(content);
|
const budget = extractBudget(content);
|
||||||
|
|
||||||
@@ -175,27 +207,54 @@ function parseDetail(html) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Extract a monetary amount from announcement text, normalized to 万元
 * (units of 10,000 yuan).
 *
 * Patterns are tried from most to least specific; the first pattern whose
 * first occurrence yields an amount inside the accepted range wins. Amounts
 * written in 元 are divided by 10,000.
 *
 * NOTE(review): labelled forms such as "预算金额:" are not given any
 * preference here — the first matching figure in the text is used.
 *
 * @param {string} content - Plain text to search.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string} | null}
 *   The normalized amount, the matched source text and the unit it was
 *   written in, or `null` when `content` is not a non-empty string or no
 *   acceptable amount is found.
 */
function extractBudget(content) {
  // Callers may hand over a missing body; treat that as "no budget" instead
  // of throwing on `.match`.
  if (typeof content !== 'string' || content === '') {
    return null;
  }

  // Ordered from highest to lowest priority; array order IS the priority.
  const patterns = [
    // 1: currency marker followed by an amount in 万元
    { regex: /(?:¥|¥|人民币)\s*([\d,,]+(?:\.\d+)?)\s*万元/i },
    // 2: bracketed currency amount, taken to be in 元
    { regex: /[((][¥¥]([\d,,]+(?:\.\d+)?)[))]/i, divider: 10000 },
    // 3: bare amount in 万元
    { regex: /([\d,,]+(?:\.\d+)?)\s*万元/i },
    // 4: currency marker followed by an amount in 元
    { regex: /(?:¥|¥|人民币)\s*([\d,,]+(?:\.\d+)?)\s*元/i, divider: 10000 },
    // 5: bare amount in 元; "元整" is excluded to avoid matching the
    //    written-out (Chinese capital numeral) form of an amount
    { regex: /([\d,,]+(?:\.\d+)?)\s*元(?!整)/i, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    const match = content.match(regex);
    if (!match) {
      continue;
    }

    // Drop ASCII and full-width thousands separators before parsing.
    const parsed = parseFloat(match[1].replace(/[,,]/g, ''));
    const amount = divider ? parsed / divider : parsed;

    // Sanity range: 0.01 万元 to 1,000,000 万元. A capture made only of
    // separators parses to NaN and is rejected here as well.
    if (Number.isFinite(amount) && amount >= 0.01 && amount <= 1000000) {
      return {
        amount,
        unit: '万元',
        text: match[0],
        originalUnit: divider ? '元' : '万元',
      };
    }
  }

  return null;
}
|
||||||
|
|
||||||
// API 路由
|
// API 路由
|
||||||
@@ -270,8 +329,36 @@ app.post('/api/report', async (req, res) => {
|
|||||||
const { limit = 15, threshold = 50, url } = req.body;
|
const { limit = 15, threshold = 50, url } = req.body;
|
||||||
const targetUrl = url && url.trim() !== '' ? url : BASE_URL;
|
const targetUrl = url && url.trim() !== '' ? url : BASE_URL;
|
||||||
|
|
||||||
const listHtml = await fetchHtml(targetUrl);
|
// 按需抓取多页以获取足够的数据
|
||||||
const items = parseList(listHtml);
|
const items = [];
|
||||||
|
let pageIndex = 0;
|
||||||
|
const maxPagesToFetch = Math.ceil(limit / 10) + 1; // 假设每页约10条,多抓一页保险
|
||||||
|
|
||||||
|
while (items.length < limit && pageIndex < maxPagesToFetch) {
|
||||||
|
const pageUrl = getPageUrl(pageIndex, targetUrl);
|
||||||
|
console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const html = await fetchHtml(pageUrl);
|
||||||
|
const pageItems = parseList(html);
|
||||||
|
|
||||||
|
if (pageItems.length === 0) {
|
||||||
|
console.log(`第 ${pageIndex + 1} 页没有数据,停止抓取`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
items.push(...pageItems);
|
||||||
|
pageIndex++;
|
||||||
|
|
||||||
|
if (items.length < limit && pageIndex < maxPagesToFetch) {
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 500));
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const results = [];
|
const results = [];
|
||||||
const toFetch = items.slice(0, limit);
|
const toFetch = items.slice(0, limit);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user