diff --git a/public/app.js b/public/app.js
index ff51546..9294624 100644
--- a/public/app.js
+++ b/public/app.js
@@ -134,10 +134,42 @@ async function fetchDetails() {
listData = await dateRangeResponse.json();
} else {
- // 普通模式
+ // Normal mode: fetch successive list pages until `limit` items have been collected
const url = document.getElementById('detailUrl').value;
- const listResponse = await fetch(`${API_BASE}/list?url=${encodeURIComponent(url)}`);
- listData = await listResponse.json();
+ const limit = parseInt(document.getElementById('detailLimit').value); // NOTE(review): no radix and no NaN guard; a non-numeric value makes the loop condition below false, so no page is fetched and an empty success result is produced - confirm the input is always numeric
+
+ // Accumulate items page by page until enough have been gathered
+ const allItems = [];
+ let page = 1; // NOTE(review): paging starts at 1 here, while getPageUrl in src/server.js treats index 0 as the first page - confirm how /list maps `page`
+ const maxPagesToFetch = Math.ceil(limit / 10) + 1; // assumes roughly 10 items per page; the +1 allows one extra page
+
+ while (allItems.length < limit && page <= maxPagesToFetch) {
+ const listResponse = await fetch(`${API_BASE}/list?url=${encodeURIComponent(url)}&page=${page}`);
+ const pageData = await listResponse.json();
+
+ if (!pageData.success) {
+ if (allItems.length === 0) { // nothing collected yet: show the error and stop. NOTE(review): pageData.error is interpolated into innerHTML below - confirm it cannot contain markup
+ results.innerHTML = `
错误: ${pageData.error}
`;
+ loading.classList.remove('active');
+ return;
+ }
+ break; // a later page failed: keep the items already collected
+ }
+
+ if (pageData.data.length === 0) { // an empty page ends paging
+ break;
+ }
+
+ allItems.push(...pageData.data);
+ page++;
+
+ // If more items are still needed and the page cap has not been reached, pause briefly before the next request
+ if (allItems.length < limit && page <= maxPagesToFetch) {
+ await new Promise(resolve => setTimeout(resolve, 500));
+ }
+ }
+
+ listData = { success: true, data: allItems }; // merged pages, in the same { success, data } shape as a single page response
}
if (!listData.success) {
diff --git a/src/server.js b/src/server.js
index a8f4d26..cd74722 100644
--- a/src/server.js
+++ b/src/server.js
@@ -14,11 +14,12 @@ app.use(express.static('public'));
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';
// 获取分页URL
-function getPageUrl(pageIndex) {
+function getPageUrl(pageIndex, baseUrl = BASE_URL) { // builds the URL of list page `pageIndex` under `baseUrl` (defaults to BASE_URL)
if (pageIndex === 0) {
- return BASE_URL;
+ return baseUrl; // page 0 is the base URL itself, returned unchanged
}
- return `${BASE_URL}index_${pageIndex}.html`;
+ const cleanBaseUrl = baseUrl.replace(/\/$/, ''); // drop a single trailing slash so the join below does not produce '//'
+ return `${cleanBaseUrl}/index_${pageIndex}.html`; // NOTE(review): if baseUrl ends in a file name (e.g. .../index.html) the page name is appended after it - confirm callers only pass directory URLs
}
// 检查日期是否在范围内
@@ -151,18 +152,49 @@ function parseList(html) {
function parseDetail(html) {
const $ = cheerio.load(html);
- const title = $('.title18').text().trim();
+ // Title: try the original selector first, then fall back to alternatives used by other page layouts
+ let title = $('.title18').text().trim();
+ if (!title) {
+ title = $('.article-info h1').text().trim();
+ }
+ if (!title) {
+ title = $('h1').first().text().trim();
+ }
+ // Publish time: first look in the table cell that contains both the 发布部门 and 发布时间 labels
const publishTd = $('td:contains("发布部门")').filter((_, el) => {
return $(el).text().includes('发布时间');
});
const publishText = publishTd.text().trim();
+ let timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
+ let publishTime = timeMatch ? timeMatch[1] : '';
- const timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
- const publishTime = timeMatch ? timeMatch[1] : '';
+ // Fallback: search .info-sources (or, failing that, the whole body) for a timestamp without seconds
+ if (!publishTime) {
+ const infoText = $('.info-sources').text() || $('body').text();
+ timeMatch = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
+ publishTime = timeMatch ? timeMatch[1] : '';
+ }
- const contentTd = $('.zhenwen td').first();
- const content = contentTd.text().trim();
+ // Content: candidate selectors ordered from most specific to the catch-all
+ let content = '';
+ const contentSelectors = [
+ '.zhenwen td', // original layout
+ '.con', // new layout (宁易新 system)
+ '.article-content', // generic layout
+ '.ewb-article-content',
+ 'body' // last-resort fallback
+ ];
+
+ for (const selector of contentSelectors) {
+ const text = $(selector).first().text().trim();
+ // Take the first selector that yields text. Keeping the longest text instead would
+ // always select 'body', whose text contains that of every other candidate, so the
+ // ordering above (and the "fallback" role of 'body') would have no effect.
+ if (text) {
+ content = text;
+ break;
+ }
+ }
const budget = extractBudget(content);
@@ -175,27 +207,54 @@ function parseDetail(html) {
}
function extractBudget(content) {
+ // Amount patterns, listed from highest to lowest priority. NOTE(review): unlike the removed patterns, none of these is anchored to a label such as 预算金额 or 最高限价, so the first amount found anywhere in the text can be returned - confirm this is intended
const patterns = [
- /预算金额[::]\s*(\d+(?:\.\d+)?)\s*万元/,
- /最高限价[::]\s*(\d+(?:\.\d+)?)\s*万元/,
- /预算[::]\s*(\d+(?:\.\d+)?)\s*万元/,
- /金额[::]\s*(\d+(?:\.\d+)?)\s*万元/,
- /(\d+(?:\.\d+)?)\s*万元/,
+ // Priority 1: amount in 万元 (ten-thousand yuan) preceded by a currency marker
+ { regex: /(?:¥|¥|人民币)\s*([\d,,]+(?:\.\d+)?)\s*万元/i, priority: 1 }, // NOTE(review): as shown, "¥|¥" and "[,,]" repeat the same character - presumably one of each pair was the full-width form; verify against the repository file
+
+ // Priority 2: amount in parentheses with a currency sign, treated as 元 (yuan) and divided by 10000
+ { regex: /[((][¥¥]([\d,,]+(?:\.\d+)?)[))]/i, priority: 2, divider: 10000 },
+
+ // Priority 3: plain amount followed by 万元
+ { regex: /([\d,,]+(?:\.\d+)?)\s*万元/i, priority: 3 },
+
+ // Priority 4: amount in 元 preceded by a currency marker (converted to 万元)
+ { regex: /(?:¥|¥|人民币)\s*([\d,,]+(?:\.\d+)?)\s*元/i, priority: 4, divider: 10000 },
+
+ // Priority 5: plain amount in 元 (converted to 万元); lowest priority. The (?!整) lookahead skips "...元整", which the original comment says avoids mis-matching amounts written in Chinese capital numerals
+ { regex: /([\d,,]+(?:\.\d+)?)\s*元(?!整)/i, priority: 5, divider: 10000 }
];
+ let bestMatch = null;
+ let bestPriority = Infinity;
+
+ // Walk every pattern and keep the valid match with the best (lowest) priority number
for (const pattern of patterns) {
- const match = content.match(pattern);
- if (match) {
- const amount = parseFloat(match[1]);
- return {
- amount,
- unit: '万元',
- text: match[0],
- };
+ const match = content.match(pattern.regex);
+ if (match && pattern.priority < bestPriority) { // patterns are already in ascending priority order, so once bestMatch is set no later pattern passes this test
+ // Strip thousands separators from the captured number before parsing it
+ const numberStr = match[1].replace(/[,,]/g, '');
+ let amount = parseFloat(numberStr);
+
+ // Patterns that capture 元 carry a divider; apply it to convert the amount to 万元
+ if (pattern.divider) {
+ amount = amount / pattern.divider;
+ }
+
+ // Sanity check: accept only amounts between 0.01 and 1000000 (in 万元); a rejected match lets lower-priority patterns be tried
+ if (!isNaN(amount) && amount >= 0.01 && amount <= 1000000) {
+ bestMatch = {
+ amount,
+ unit: '万元',
+ text: match[0],
+ originalUnit: pattern.divider ? '元' : '万元'
+ };
+ bestPriority = pattern.priority;
+ }
}
}
- return null;
+ return bestMatch; // null when no pattern produced an in-range amount
}
// API 路由
@@ -270,8 +329,36 @@ app.post('/api/report', async (req, res) => {
const { limit = 15, threshold = 50, url } = req.body;
const targetUrl = url && url.trim() !== '' ? url : BASE_URL;
- const listHtml = await fetchHtml(targetUrl);
- const items = parseList(listHtml);
+ // Fetch as many list pages as needed to collect `limit` items
+ const items = [];
+ let pageIndex = 0;
+ const maxPagesToFetch = Math.ceil(limit / 10) + 1; // assumes about 10 items per page, plus one extra page as a safety margin. NOTE(review): limit comes straight from req.body and is not bounded or type-checked here - confirm validation happens elsewhere
+
+ while (items.length < limit && pageIndex < maxPagesToFetch) {
+ const pageUrl = getPageUrl(pageIndex, targetUrl);
+ console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);
+
+ try {
+ const html = await fetchHtml(pageUrl);
+ const pageItems = parseList(html);
+
+ if (pageItems.length === 0) { // an empty page ends paging
+ console.log(`第 ${pageIndex + 1} 页没有数据,停止抓取`);
+ break;
+ }
+
+ items.push(...pageItems);
+ pageIndex++;
+
+ if (items.length < limit && pageIndex < maxPagesToFetch) { // pause 500 ms only when another page will be requested
+ await new Promise(resolve => setTimeout(resolve, 500));
+ }
+ } catch (err) { // a failed page is logged and ends paging; items collected so far are still used below
+ console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
+ break;
+ }
+ }
+
const results = [];
const toFetch = items.slice(0, limit);