Files
tool-node/src/server.js
zhaojunlong 745faa0ecc ```
feat(public): 实现按数量抓取多页数据功能

在普通模式下,支持根据用户指定的数量抓取多页列表数据,直到满足所需数量或达到最大页数限制。增加分页请求逻辑与延时控制,提升数据获取稳定性。

feat(server): 改进详情页解析与预算金额提取逻辑

增强标题、发布时间和正文内容的选择器容错能力,支持多种页面结构。优化预算金额提取规则,引入优先级匹配机制,并支持元转万元计算,提高数据准确性。
```
2025-12-14 19:21:19 +08:00

497 lines
14 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import express from 'express';
import cors from 'cors';
import axios from 'axios';
import * as cheerio from 'cheerio';
import iconv from 'iconv-lite';
// Express application instance and the TCP port it listens on.
const app = express();
const PORT = 3000;
// Middleware, registered in this order: CORS headers, JSON body parsing,
// then static files served from the `public` directory.
app.use(cors());
app.use(express.json());
app.use(express.static('public'));
// Default list URL used when a caller does not supply its own.
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';
/**
 * Build the URL of one page of the paginated list.
 *
 * Page index 0 is the base URL itself; any other index maps to
 * `<base>/index_<pageIndex>.html`, with a single trailing slash of the base
 * removed first.
 *
 * @param {number} pageIndex - Zero-based page index.
 * @param {string} [baseUrl=BASE_URL] - Root URL of the list.
 * @returns {string} URL of the requested page.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  const isFirstPage = pageIndex === 0;
  if (isFirstPage) return baseUrl;

  const root = baseUrl.replace(/\/$/, '');
  return `${root}/index_${pageIndex}.html`;
}
/**
 * Check whether a date string lies inside an optional, inclusive date range.
 *
 * @param {string} dateStr - Date to test; empty or unparsable values yield false.
 * @param {string} [startDate] - Lower bound; ignored when falsy.
 * @param {string} [endDate] - Upper bound; ignored when falsy.
 * @returns {boolean} True when the date parses and violates neither bound.
 */
function isDateInRange(dateStr, startDate, endDate) {
  if (!dateStr) return false;

  const candidate = new Date(dateStr);
  if (Number.isNaN(candidate.getTime())) return false;

  const isBeforeStart = Boolean(startDate) && candidate < new Date(startDate);
  const isAfterEnd = Boolean(endDate) && candidate > new Date(endDate);
  return !(isBeforeStart || isAfterEnd);
}
/**
 * Crawl list pages in order and collect the entries whose date is in range.
 *
 * Crawling stops when `maxPages` pages have been processed, when a page
 * yields no entries, when a page request or parse fails, or when a start date
 * is given and every entry on a page is older than it. Consecutive page
 * requests are separated by a 500 ms pause.
 *
 * @param {string} [startDate] - Lower bound passed to isDateInRange.
 * @param {string} [endDate] - Upper bound passed to isDateInRange.
 * @param {number} [maxPages=23] - Maximum number of list pages to request.
 * @returns {Promise<Array<{title: string, href: string, date: string}>>}
 *   Entries in range, in the order they were encountered.
 */
async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
  const collected = [];
  const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
  let pageIndex = 0;
  let keepGoing = true;

  console.log(`开始按时间范围抓取: ${startDate || '不限'}${endDate || '不限'}`);

  while (keepGoing && pageIndex < maxPages) {
    const pageUrl = getPageUrl(pageIndex);
    console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);
    try {
      const pageItems = parseList(await fetchHtml(pageUrl));
      if (pageItems.length === 0) {
        console.log(`${pageIndex + 1} 页没有数据,停止抓取`);
        break;
      }

      // Count in-range entries and entries strictly older than the start
      // date; anything else (newer than the end date, unparsable) counts as
      // neither.
      let inRangeCount = 0;
      let olderCount = 0;
      for (const entry of pageItems) {
        if (isDateInRange(entry.date, startDate, endDate)) {
          collected.push(entry);
          inRangeCount += 1;
        } else if (startDate && new Date(entry.date) < new Date(startDate)) {
          olderCount += 1;
        }
      }

      // A page made up solely of older entries ends the crawl.
      if (olderCount === pageItems.length && startDate) {
        console.log(`${pageIndex + 1} 页所有项目都早于起始日期,停止抓取`);
        keepGoing = false;
      }
      console.log(`${pageIndex + 1} 页找到 ${pageItems.length} 条,符合条件 ${inRangeCount > 0 ? '有' : '无'}`);

      pageIndex += 1;
      if (keepGoing && pageIndex < maxPages) {
        await pause(500);
      }
    } catch (err) {
      // A failed page ends the crawl; whatever was collected so far is returned.
      console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
      break;
    }
  }

  console.log(`总共抓取了 ${pageIndex} 页,找到 ${collected.length} 条符合条件的公告`);
  return collected;
}
// Shared axios client used by fetchHtml for every outbound page request.
const http = axios.create({
// Receive the body as raw bytes so it can be decoded with iconv-lite afterwards.
responseType: 'arraybuffer',
// Request timeout in milliseconds.
timeout: 10000,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; gjzx-scraper/1.0)',
},
});
/**
 * Derive a text encoding name from a Content-Type header value.
 *
 * The charset parameter may be wrapped in quotes (`charset="utf-8"`); the
 * quotes are stripped so the result is a bare encoding name. Any charset
 * containing "gb" (gb2312, gbk, gb18030, ...) is mapped to `gbk`.
 *
 * @param {string} [contentType=''] - Raw Content-Type header value.
 * @returns {string} Lower-cased encoding name; `utf-8` when no usable charset
 *   is present.
 */
function pickEncoding(contentType = '') {
  const match = /charset=([^;]+)/i.exec(contentType);
  if (!match) return 'utf-8';

  const charset = match[1]
    .trim()
    .replace(/^["']|["']$/g, '') // drop surrounding quotes from a quoted parameter value
    .trim()
    .toLowerCase();

  if (charset.includes('gb')) return 'gbk';
  // An empty value (e.g. `charset=""`) carries no information; use the default.
  return charset || 'utf-8';
}
/**
 * Download a page and decode its bytes into a string.
 *
 * The encoding is taken from the response's Content-Type header via
 * pickEncoding, falling back to UTF-8 when that yields an empty value.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} Decoded page text.
 */
async function fetchHtml(url) {
  const { data, headers } = await http.get(url);
  const charset = pickEncoding(headers['content-type']) || 'utf-8';
  return iconv.decode(data, charset);
}
/**
 * Extract announcement entries from the HTML of a list page.
 *
 * Each table row contributes one entry when its first cell holds a link and
 * its second cell holds a `YYYY-MM-DD` date. Rows with an empty href, a title
 * shorter than 5 characters, a `./` or `../` href, a malformed date, or an
 * href that cannot be resolved against BASE_URL are skipped.
 *
 * @param {string} html - Markup of a list page.
 * @returns {Array<{title: string, href: string, date: string}>} Parsed
 *   entries with absolute URLs.
 */
function parseList(html) {
  const $ = cheerio.load(html);
  const datePattern = /^\d{4}-\d{2}-\d{2}$/;
  const entries = [];

  for (const row of $('table tr').toArray()) {
    const $row = $(row);
    const anchor = $row.find('td:first-child a').first();
    const dateCell = $row.find('td:nth-child(2)');
    if (anchor.length === 0 || dateCell.length === 0) continue;

    const title = anchor.attr('title') || anchor.text().trim();
    const rawHref = anchor.attr('href') || '';
    const dateText = dateCell.text().trim();

    // Skip navigation links and empty links.
    if (!rawHref || !title || title.length < 5) continue;
    if (rawHref === './' || rawHref === '../') continue;
    // Only rows with a well-formed YYYY-MM-DD date are kept.
    if (!datePattern.test(dateText)) continue;

    try {
      const href = new URL(rawHref, BASE_URL).toString();
      entries.push({ title, href, date: dateText });
    } catch {
      // Unresolvable URL: skip this row.
      continue;
    }
  }

  return entries;
}
/**
 * Extract title, publish time, body text and budget from a detail page.
 *
 * @param {string} html - Markup of an announcement detail page.
 * @returns {{title: string, publishTime: string, content: string, budget: (object|null)}}
 *   Parsed fields; `publishTime` is '' when no timestamp is found and
 *   `budget` is whatever extractBudget returns for the chosen content.
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  // Title: first non-empty result among the known layouts.
  const title =
    $('.title18').text().trim() ||
    $('.article-info h1').text().trim() ||
    $('h1').first().text().trim();

  // Publish time: prefer the cell that mentions both labels and carries a
  // full timestamp with seconds.
  const publishText = $('td:contains("发布部门")')
    .filter((_, el) => $(el).text().includes('发布时间'))
    .text()
    .trim();
  let publishTime = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/)?.[1] ?? '';

  // Otherwise accept a minute-precision timestamp from the info block or,
  // failing that, from the whole body.
  if (!publishTime) {
    const infoText = $('.info-sources').text() || $('body').text();
    publishTime = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/)?.[1] ?? '';
  }

  // Content: the longest trimmed text among the first match of each selector.
  // NOTE(review): `body` is itself a candidate and its text includes the text
  // of everything inside it, so it will normally be the longest - confirm
  // that this is the intended outcome.
  const contentSelectors = [
    '.zhenwen td', // original layout
    '.con', // newer layout
    '.article-content', // generic layout
    '.ewb-article-content',
    'body', // last resort
  ];
  const content = contentSelectors.reduce((longest, selector) => {
    const el = $(selector).first();
    if (el.length === 0) return longest;
    const text = el.text().trim();
    return text.length > longest.length ? text : longest;
  }, '');

  return {
    title,
    publishTime,
    content,
    budget: extractBudget(content),
  };
}
/**
 * Extract a budget amount from announcement text, expressed in 万元
 * (units of 10,000 yuan).
 *
 * Patterns are tried from highest to lowest priority and the first one that
 * yields a plausible amount (0.01 to 1,000,000 万元) wins:
 *   1. currency marker + number + 万元
 *   2. parenthesised currency sign + number (yuan, converted)
 *   3. number + 万元
 *   4. currency marker + number + 元 (yuan, converted)
 *   5. number + 元, not followed by 整 (yuan, converted)
 * The yen sign and the parentheses are accepted in both their halfwidth and
 * fullwidth forms.
 *
 * @param {string} content - Text to search.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   The match, or null when the input is not a non-empty string or no pattern
 *   produces a plausible amount.
 */
function extractBudget(content) {
  if (typeof content !== 'string' || content === '') return null;

  // \u00A5 is the halfwidth yen sign, \uFFE5 the fullwidth one;
  // \uFF08 and \uFF09 are the fullwidth parentheses.
  // Array order is priority order. `divider` marks amounts given in yuan.
  const patterns = [
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,]+(?:\.\d+)?)\s*万元/ },
    { regex: /[(\uFF08][\u00A5\uFFE5]([\d,]+(?:\.\d+)?)[)\uFF09]/, divider: 10000 },
    { regex: /([\d,]+(?:\.\d+)?)\s*万元/ },
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,]+(?:\.\d+)?)\s*元/, divider: 10000 },
    { regex: /([\d,]+(?:\.\d+)?)\s*元(?!整)/, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    const match = content.match(regex);
    if (!match) continue;

    // Strip thousands separators before parsing.
    const parsed = Number.parseFloat(match[1].replace(/,/g, ''));
    const amount = divider ? parsed / divider : parsed;

    // An implausible amount does not block lower-priority patterns.
    if (Number.isNaN(amount) || amount < 0.01 || amount > 1000000) continue;

    return {
      amount,
      unit: '万元',
      text: match[0],
      originalUnit: divider ? '元' : '万元',
    };
  }

  return null;
}
// API routes

/**
 * GET /api/list - return the parsed entries of one list page.
 *
 * Query parameters:
 *   url  - list root URL (defaults to BASE_URL)
 *   page - 1-based page number; anything that is not an integer >= 1 is
 *          treated as page 1
 *
 * Responds with `{ success, data, page }`, or HTTP 500 and
 * `{ success: false, error }` when fetching or parsing fails.
 */
app.get('/api/list', async (req, res) => {
  try {
    const baseUrl = req.query.url || BASE_URL;

    // Parse as base 10 and clamp, so the reported page always matches the
    // page that was actually fetched.
    const requestedPage = Number.parseInt(req.query.page, 10);
    const page = Number.isInteger(requestedPage) && requestedPage >= 1 ? requestedPage : 1;

    // Pages are 1-based here and 0-based in getPageUrl.
    const url = getPageUrl(page - 1, baseUrl);

    const html = await fetchHtml(url);
    const items = parseList(html);
    res.json({ success: true, data: items, page });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * POST /api/list-daterange - list entries whose date falls inside a range.
 *
 * Body: `{ startDate, endDate, maxPages = 23 }`.
 * Responds with `{ success, data }`, or HTTP 500 and
 * `{ success: false, error }` on failure.
 */
app.post('/api/list-daterange', async (req, res) => {
  try {
    const { startDate, endDate, maxPages = 23 } = req.body;
    const data = await fetchListByDateRange(startDate, endDate, maxPages);
    res.json({ success: true, data });
  } catch (error) {
    const { message } = error;
    res.status(500).json({ success: false, error: message });
  }
});
/**
 * POST /api/details - fetch and parse the detail page of each given entry.
 *
 * Body: `{ items, limit = 10 }`, where `items` is an array of entries with
 * an `href`. Only the first `limit` entries are processed, one after another,
 * with a 500 ms pause after each successful fetch.
 *
 * Responds with `{ success, data }`, where each element is the entry plus
 * `detail` (or `detail: null` and `error` when that entry failed).
 * Responds with HTTP 400 when `items` is not an array and HTTP 500 on
 * unexpected failures.
 */
app.post('/api/details', async (req, res) => {
  try {
    const { items, limit = 10 } = req.body ?? {};

    // Reject malformed input as a client error rather than letting
    // `items.slice` throw and surface as a 500.
    if (!Array.isArray(items)) {
      return res.status(400).json({ success: false, error: 'items must be an array' });
    }

    const results = [];
    const toFetch = items.slice(0, limit);
    for (const item of toFetch) {
      try {
        const html = await fetchHtml(item.href);
        const detail = parseDetail(html);
        results.push({
          ...item,
          detail,
        });
        // Pause between consecutive requests.
        await new Promise((resolve) => setTimeout(resolve, 500));
      } catch (err) {
        // A failing entry is reported in place; the remaining entries still run.
        results.push({
          ...item,
          detail: null,
          error: err.message,
        });
      }
    }
    res.json({ success: true, data: results });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * POST /api/report - build a budget report from the newest list entries.
 *
 * Body: `{ limit = 15, threshold = 50, url }`. List pages are fetched until
 * `limit` entries are available or the page cap is reached, then the detail
 * page of each of the first `limit` entries is fetched and parsed. The report
 * keeps the entries whose extracted budget amount is strictly greater than
 * `threshold`.
 *
 * Responds with `{ success, data: { summary, projects } }`, or HTTP 500 and
 * `{ success: false, error }` on failure.
 */
app.post('/api/report', async (req, res) => {
  try {
    const { limit = 15, threshold = 50, url } = req.body;
    const hasCustomUrl = url && url.trim() !== '';
    const targetUrl = hasCustomUrl ? url : BASE_URL;
    const pause = () => new Promise((resolve) => setTimeout(resolve, 500));

    // Fetch as many list pages as needed: roughly 10 entries per page is
    // assumed, plus one spare page.
    const listing = [];
    const pageCap = Math.ceil(limit / 10) + 1;
    let pageIndex = 0;
    while (listing.length < limit && pageIndex < pageCap) {
      const pageUrl = getPageUrl(pageIndex, targetUrl);
      console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);
      try {
        const pageItems = parseList(await fetchHtml(pageUrl));
        if (pageItems.length === 0) {
          console.log(`${pageIndex + 1} 页没有数据,停止抓取`);
          break;
        }
        listing.push(...pageItems);
        pageIndex += 1;
        if (listing.length < limit && pageIndex < pageCap) {
          await pause();
        }
      } catch (err) {
        console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
        break;
      }
    }

    // Fetch the detail page of each selected entry; failures are recorded
    // per entry.
    const results = [];
    for (const entry of listing.slice(0, limit)) {
      try {
        const detail = parseDetail(await fetchHtml(entry.href));
        results.push({ ...entry, detail });
        await pause();
      } catch (err) {
        results.push({ ...entry, detail: null, error: err.message });
      }
    }

    const overThreshold = results.filter(
      ({ detail }) => detail?.budget && detail.budget.amount > threshold
    );

    let total = 0;
    for (const { detail } of overThreshold) {
      total += detail.budget?.amount || 0;
    }

    const toProject = (entry) => ({
      title: entry.title,
      date: entry.date,
      publish_time: entry.detail.publishTime,
      budget: entry.detail.budget,
      url: entry.href,
    });

    const report = {
      summary: {
        total_count: results.length,
        filtered_count: overThreshold.length,
        threshold: `${threshold}万元`,
        total_amount: `${total.toFixed(2)}万元`,
        generated_at: new Date().toISOString(),
      },
      projects: overThreshold.map(toProject),
    };
    res.json({ success: true, data: report });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * POST /api/report-daterange - build a budget report for a date range.
 *
 * Body: `{ startDate, endDate, threshold = 50, maxPages = 23 }`. Entries in
 * range are collected with fetchListByDateRange, the detail page of each one
 * is fetched and parsed, and the report keeps the entries whose extracted
 * budget amount is strictly greater than `threshold`.
 *
 * Responds with `{ success, data: { summary, projects } }`, or HTTP 500 and
 * `{ success: false, error }` on failure.
 */
app.post('/api/report-daterange', async (req, res) => {
  try {
    const { startDate, endDate, threshold = 50, maxPages = 23 } = req.body;

    // Summary fields shared by the empty and the populated report.
    const buildSummary = (totalCount, filteredCount, totalAmount) => ({
      total_count: totalCount,
      filtered_count: filteredCount,
      threshold: `${threshold}万元`,
      total_amount: totalAmount,
      generated_at: new Date().toISOString(),
      date_range: { startDate, endDate },
    });

    // Collect the list entries that fall inside the date range.
    const listing = await fetchListByDateRange(startDate, endDate, maxPages);
    if (listing.length === 0) {
      return res.json({
        success: true,
        data: {
          summary: buildSummary(0, 0, '0.00万元'),
          projects: [],
        },
      });
    }

    // Fetch the detail page of each entry; failures are recorded per entry.
    const results = [];
    for (const entry of listing) {
      try {
        const detail = parseDetail(await fetchHtml(entry.href));
        results.push({ ...entry, detail });
        await new Promise((resolve) => setTimeout(resolve, 500));
      } catch (err) {
        results.push({ ...entry, detail: null, error: err.message });
      }
    }

    // Build the report.
    const overThreshold = results.filter(
      ({ detail }) => detail?.budget && detail.budget.amount > threshold
    );

    let total = 0;
    for (const { detail } of overThreshold) {
      total += detail.budget?.amount || 0;
    }

    const report = {
      summary: buildSummary(results.length, overThreshold.length, `${total.toFixed(2)}万元`),
      projects: overThreshold.map((entry) => ({
        title: entry.title,
        date: entry.date,
        publish_time: entry.detail.publishTime,
        budget: entry.detail.budget,
        url: entry.href,
      })),
    };
    res.json({ success: true, data: report });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
// Start the HTTP server on PORT and log the local address once it is listening.
app.listen(PORT, () => {
console.log(`Server running at http://localhost:${PORT}`);
});