feat(public): 实现按数量抓取多页数据功能 在普通模式下,支持根据用户指定的数量抓取多页列表数据,直到满足所需数量或达到最大页数限制。增加分页请求逻辑与延时控制,提升数据获取稳定性。 feat(server): 改进详情页解析与预算金额提取逻辑 增强标题、发布时间和正文内容的选择器容错能力,支持多种页面结构。优化预算金额提取规则,引入优先级匹配机制,并支持元转万元计算,提高数据准确性。
497 lines
14 KiB
JavaScript
497 lines
14 KiB
JavaScript
import express from 'express';
|
||
import cors from 'cors';
|
||
import axios from 'axios';
|
||
import * as cheerio from 'cheerio';
|
||
import iconv from 'iconv-lite';
|
||
|
||
// Express application that serves the static front-end from ./public and
// exposes the scraping API defined further down in this file.
const app = express();

// Listening port. Defaults to 3000; a numeric PORT environment variable
// overrides it so the server can run behind a process manager.
const PORT = Number.parseInt(process.env.PORT ?? '', 10) || 3000;

app.use(cors());
app.use(express.json());
app.use(express.static('public'));

// Root URL of the announcement list. The first list page is this URL itself;
// later pages are addressed as `index_N.html` next to it (see getPageUrl).
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';
|
||
|
||
/**
 * Build the URL of one page of the announcement list.
 *
 * The first page (index 0) is the base URL itself; page N (N >= 1) is
 * `<base>/index_N.html`.
 *
 * @param {number|string} pageIndex - Zero-based page index. Anything that is
 *   not a positive number (0, negatives, NaN) selects the first page.
 * @param {string} [baseUrl=BASE_URL] - List root URL, with or without
 *   trailing slashes.
 * @returns {string} URL of the requested list page.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  const index = Math.trunc(Number(pageIndex));

  // `!(index > 0)` also catches NaN, which a plain `index <= 0` would not.
  if (!(index > 0)) {
    return baseUrl;
  }

  // Drop every trailing slash so the result never contains `//index_N.html`.
  const cleanBaseUrl = baseUrl.replace(/\/+$/, '');
  return `${cleanBaseUrl}/index_${index}.html`;
}
|
||
|
||
/**
 * Check whether a date string falls inside an optional, inclusive range.
 *
 * All three values are parsed with `new Date(...)`. Date-only ISO strings
 * (`YYYY-MM-DD`) are therefore interpreted as UTC midnight, so comparing
 * date-only values against date-only bounds is inclusive on both ends.
 *
 * @param {string} dateStr - Date to test. Empty or unparseable values are
 *   never in range.
 * @param {string} [startDate] - Lower bound; omitted or unparseable means
 *   "no lower bound".
 * @param {string} [endDate] - Upper bound; omitted or unparseable means
 *   "no upper bound".
 * @returns {boolean} `true` when the date is valid and within the range.
 */
function isDateInRange(dateStr, startDate, endDate) {
  if (!dateStr) return false;

  const time = new Date(dateStr).getTime();
  if (Number.isNaN(time)) return false;

  // Parse each bound once. NaN marks a missing or unparseable bound.
  const startTime = startDate ? new Date(startDate).getTime() : Number.NaN;
  const endTime = endDate ? new Date(endDate).getTime() : Number.NaN;

  if (!Number.isNaN(startTime) && time < startTime) return false;
  if (!Number.isNaN(endTime) && time > endTime) return false;
  return true;
}
|
||
|
||
/**
 * Crawl the announcement list page by page and collect the items whose date
 * lies within the given range.
 *
 * Crawling stops when any of the following happens:
 *   - `maxPages` pages have been processed;
 *   - a page yields no items;
 *   - a page request or parse fails (the items collected so far are returned);
 *   - every item on a page is older than `startDate`.
 *
 * NOTE(review): the last rule only makes sense if the list is ordered newest
 * first — presumably it is; verify against the site.
 *
 * A 500 ms pause is inserted between page requests.
 *
 * @param {string} [startDate] - Inclusive lower bound; falsy means unbounded.
 * @param {string} [endDate] - Inclusive upper bound; falsy means unbounded.
 * @param {number} [maxPages=23] - Maximum number of list pages to request.
 * @returns {Promise<Array<{title: string, href: string, date: string}>>}
 *   Items in range, in crawl order.
 */
async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
  const allItems = [];
  // Parsed once. NaN (no or invalid start date) disables the early stop,
  // because every `< NaN` comparison is false.
  const startTime = startDate ? new Date(startDate).getTime() : Number.NaN;
  let pageIndex = 0;

  console.log(`开始按时间范围抓取: ${startDate || '不限'} 至 ${endDate || '不限'}`);

  while (pageIndex < maxPages) {
    const pageUrl = getPageUrl(pageIndex);
    console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);

    try {
      const html = await fetchHtml(pageUrl);
      const items = parseList(html);

      if (items.length === 0) {
        console.log(`第 ${pageIndex + 1} 页没有数据,停止抓取`);
        break;
      }

      const matching = items.filter((item) => isDateInRange(item.date, startDate, endDate));
      allItems.push(...matching);

      // True only when a start date was given and every item on the page
      // predates it; later pages can then only be older still.
      const allItemsBeforeRange =
        Boolean(startDate) &&
        items.every((item) => new Date(item.date).getTime() < startTime);

      if (allItemsBeforeRange) {
        console.log(`第 ${pageIndex + 1} 页所有项目都早于起始日期,停止抓取`);
      }

      console.log(`第 ${pageIndex + 1} 页找到 ${items.length} 条,符合条件 ${matching.length > 0 ? '有' : '无'}`);

      pageIndex++;

      if (allItemsBeforeRange) {
        break;
      }

      // Pause before the next request, but not after the final page.
      if (pageIndex < maxPages) {
        await new Promise((resolve) => setTimeout(resolve, 500));
      }
    } catch (err) {
      // Best effort: keep what has been collected and stop crawling.
      console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
      break;
    }
  }

  console.log(`总共抓取了 ${pageIndex} 页,找到 ${allItems.length} 条符合条件的公告`);
  return allItems;
}
|
||
|
||
// Shared axios client for every outbound request. Responses are kept as raw
// bytes (`arraybuffer`) so the caller can decode them with the charset the
// page declares instead of letting axios assume UTF-8.
const http = axios.create({
  responseType: 'arraybuffer',
  timeout: 10000, // milliseconds
  headers: {
    'User-Agent': 'Mozilla/5.0 (compatible; gjzx-scraper/1.0)',
  },
});
|
||
|
||
/**
 * Derive a decoder name from a Content-Type header value.
 *
 * The `charset` parameter may be bare or quoted (`charset="utf-8"`), and may
 * have whitespace around `=`. Any charset containing "gb" (gb2312, gbk,
 * gb18030, ...) is mapped to `gbk`.
 *
 * @param {string} [contentType=''] - Raw Content-Type header value.
 * @returns {string} Lower-cased charset name, or `'utf-8'` when the header
 *   carries no charset.
 */
function pickEncoding(contentType = '') {
  // Quotes are excluded from the capture so `charset="utf-8"` yields `utf-8`.
  const match = /charset\s*=\s*["']?([^"';\s]+)/i.exec(contentType);
  if (!match) return 'utf-8';

  const charset = match[1].toLowerCase();
  return charset.includes('gb') ? 'gbk' : charset;
}
|
||
|
||
/**
 * Download a page and decode it to a string.
 *
 * The charset is taken from the Content-Type response header. When the
 * header names no charset, the first bytes of the document are scanned for a
 * `<meta ... charset=...>` declaration. If the resulting encoding is unknown
 * to iconv-lite, UTF-8 is used instead of throwing.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Decoded HTML.
 * @throws Propagates any error raised by the HTTP request (network failure,
 *   timeout, non-2xx status).
 */
async function fetchHtml(url) {
  const res = await http.get(url);
  const contentType = String(res.headers['content-type'] ?? '');
  const body = Buffer.from(res.data);

  let encoding = pickEncoding(contentType);

  if (!/charset\s*=/i.test(contentType)) {
    // latin1 maps every byte to one character, so the ASCII markup of the
    // head section can be searched before the real encoding is known.
    const head = body.subarray(0, 2048).toString('latin1');
    const meta = /<meta[^>]+charset\s*=\s*["']?([\w-]+)/i.exec(head);
    if (meta) {
      encoding = pickEncoding(`charset=${meta[1]}`);
    }
  }

  if (!iconv.encodingExists(encoding)) {
    encoding = 'utf-8';
  }

  return iconv.decode(body, encoding);
}
|
||
|
||
/**
 * Extract announcement entries from a list page.
 *
 * Every `table tr` whose first cell contains a link and whose second cell
 * contains a `YYYY-MM-DD` date becomes one entry. Rows are skipped when the
 * link has no href, the title is shorter than 5 characters, the href is a
 * bare `./` or `../` navigation link, the href cannot be resolved, or the
 * resolved URL is not http(s).
 *
 * @param {string} html - Decoded HTML of a list page.
 * @param {string} [baseUrl=BASE_URL] - URL that relative hrefs are resolved
 *   against.
 * @returns {Array<{title: string, href: string, date: string}>} Entries in
 *   document order, with absolute `href`s.
 */
function parseList(html, baseUrl = BASE_URL) {
  const $ = cheerio.load(html);
  const items = [];

  $('table tr').each((_, row) => {
    const $row = $(row);
    const link = $row.find('td:first-child a').first();
    const dateCell = $row.find('td:nth-child(2)');
    if (link.length === 0 || dateCell.length === 0) return;

    const title = (link.attr('title') || link.text()).trim();
    const rawHref = link.attr('href') || '';
    const dateText = dateCell.text().trim();

    // Filter out navigation links and rows without a usable title.
    if (!rawHref || title.length < 5) return;
    if (rawHref === './' || rawHref === '../') return;

    // Only rows carrying a YYYY-MM-DD date are announcements.
    if (!/^\d{4}-\d{2}-\d{2}$/.test(dateText)) return;

    let resolved;
    try {
      resolved = new URL(rawHref, baseUrl);
    } catch {
      // Malformed href: skip this row rather than failing the whole page.
      return;
    }

    // `javascript:` / `mailto:` links resolve fine but cannot be fetched.
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') return;

    items.push({ title, href: resolved.toString(), date: dateText });
  });

  return items;
}
|
||
|
||
/**
 * Extract title, publish time, body text and budget from a detail page.
 *
 * Several selectors are tried for each field because the detail pages do not
 * share one layout.
 *
 * @param {string} html - Decoded HTML of a detail page.
 * @returns {{title: string, publishTime: string, content: string, budget: (object|null)}}
 *   `title` / `publishTime` / `content` are empty strings when nothing was
 *   found; `budget` is whatever extractBudget returns for the text.
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  // Title: try the known layouts first, then fall back to the first <h1>.
  let title = $('.title18').text().trim();
  if (!title) {
    title = $('.article-info h1').text().trim();
  }
  if (!title) {
    title = $('h1').first().text().trim();
  }

  // Publish time, first attempt: the cell that names both the publishing
  // department and the publish time, with a seconds-precision timestamp.
  const publishText = $('td:contains("发布部门")')
    .filter((_, el) => $(el).text().includes('发布时间'))
    .text()
    .trim();
  let timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
  let publishTime = timeMatch ? timeMatch[1] : '';

  // Second attempt: a minute-precision timestamp in the info bar, or
  // anywhere in the page when there is no info bar.
  if (!publishTime) {
    const infoText = $('.info-sources').text() || $('body').text();
    timeMatch = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
    publishTime = timeMatch ? timeMatch[1] : '';
  }

  // Body text: the longest text among the known article containers.
  const contentSelectors = [
    '.zhenwen td', // legacy layout
    '.con', // newer layout
    '.article-content', // generic layout
    '.ewb-article-content',
  ];

  let content = '';
  for (const selector of contentSelectors) {
    const text = $(selector).first().text().trim();
    if (text.length > content.length) {
      content = text;
    }
  }

  // <body> is a true fallback. It contains every other container, so letting
  // it compete on length would make it win on every page.
  const bodyText = $('body').text().trim();
  if (!content) {
    content = bodyText;
  }

  // Prefer an amount found in the article itself; if there is none, search
  // the whole page so no amount that was previously detected is lost.
  const budget = extractBudget(content) ?? extractBudget(bodyText);

  return {
    title,
    publishTime,
    content,
    budget,
  };
}
|
||
|
||
/**
 * Find a budget amount in free text and normalise it to 万元 (10,000 yuan).
 *
 * Patterns are tried from highest to lowest priority. Within one pattern
 * every occurrence is examined, and the first one whose normalised amount
 * lies in [0.01, 1000000] 万元 is returned. Both ASCII and full-width forms
 * of the currency sign, thousands separator and parentheses are recognised.
 *
 * @param {string} content - Text to search.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   `amount` in 万元, `text` the matched snippet, `originalUnit` the unit the
 *   amount was written in ('万元' or '元'); `null` when nothing plausible is
 *   found or `content` is not a non-empty string.
 */
function extractBudget(content) {
  if (typeof content !== 'string' || content === '') return null;

  // \u00A5 / \uFFE5: half- and full-width yen sign. \uFF0C: full-width comma.
  // \uFF08 / \uFF09: full-width parentheses. Written as escapes so the two
  // widths cannot be collapsed into one by text normalisation.
  const patterns = [
    // Priority 1: currency marker followed by an amount in 万元.
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*(\d[\d,\uFF0C]*(?:\.\d+)?)\s*万元/g },

    // Priority 2: parenthesised amount with a currency sign, in yuan.
    { regex: /[(\uFF08][\u00A5\uFFE5](\d[\d,\uFF0C]*(?:\.\d+)?)[)\uFF09]/g, divider: 10000 },

    // Priority 3: plain amount in 万元.
    { regex: /(\d[\d,\uFF0C]*(?:\.\d+)?)\s*万元/g },

    // Priority 4: currency marker followed by an amount in yuan.
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*(\d[\d,\uFF0C]*(?:\.\d+)?)\s*元/g, divider: 10000 },

    // Priority 5: plain amount in yuan. "元整" is excluded so the tail of an
    // amount spelled out in Chinese numerals is not picked up.
    { regex: /(\d[\d,\uFF0C]*(?:\.\d+)?)\s*元(?!整)/g, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    for (const match of content.matchAll(regex)) {
      const value = Number.parseFloat(match[1].replace(/[,\uFF0C]/g, ''));
      // Amounts written in yuan are converted to 万元.
      const amount = divider ? value / divider : value;

      // Plausibility window: 0.01 万元 to 1,000,000 万元.
      if (Number.isFinite(amount) && amount >= 0.01 && amount <= 1000000) {
        return {
          amount,
          unit: '万元',
          text: match[0],
          originalUnit: divider ? '元' : '万元',
        };
      }
    }
  }

  return null;
}
|
||
|
||
// API routes

/**
 * GET /api/list — return the entries of one list page.
 *
 * Query parameters:
 *   - `url`  (optional): list root to scrape instead of BASE_URL; must be an
 *     http(s) URL, otherwise the request is rejected with 400.
 *   - `page` (optional): 1-based page number; missing, non-numeric or < 1
 *     values are treated as 1.
 *
 * Responds with `{ success, data, page }`, or `{ success: false, error }`
 * with status 500 when fetching or parsing fails.
 *
 * NOTE(review): `url` makes the server fetch a caller-chosen address (SSRF
 * risk). Only the scheme is checked here; restrict the host if this service
 * is reachable by untrusted clients.
 */
app.get('/api/list', async (req, res) => {
  try {
    let baseUrl = BASE_URL;
    const requestedUrl = req.query.url;
    if (requestedUrl) {
      // A repeated `url` parameter arrives as an array; reject it as well.
      if (typeof requestedUrl !== 'string' || !/^https?:\/\//i.test(requestedUrl)) {
        return res.status(400).json({ success: false, error: 'url must be an http(s) URL' });
      }
      baseUrl = requestedUrl;
    }

    const requestedPage = Number.parseInt(req.query.page, 10);
    const page = requestedPage >= 1 ? requestedPage : 1;

    // Pages are 1-based in the API and 0-based in getPageUrl.
    const url = getPageUrl(page - 1, baseUrl);

    const html = await fetchHtml(url);
    const items = parseList(html);
    res.json({ success: true, data: items, page });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
|
||
|
||
/**
 * POST /api/list-daterange — list the announcements within a date range.
 *
 * JSON body: `{ startDate?, endDate?, maxPages? }` (`maxPages` defaults to
 * 23). Responds with `{ success, data }`, or `{ success: false, error }`
 * with status 500 on failure.
 */
app.post('/api/list-daterange', async (req, res) => {
  try {
    // `req.body` can be undefined when the request carries no JSON body.
    const { startDate, endDate, maxPages = 23 } = req.body ?? {};
    const items = await fetchListByDateRange(startDate, endDate, maxPages);
    res.json({ success: true, data: items });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
|
||
|
||
/**
 * POST /api/details — fetch and parse the detail page of each given item.
 *
 * JSON body: `{ items, limit? }`, where `items` is an array of objects with
 * an `href`; only the first `limit` (default 10) are processed. A missing or
 * non-array `items` is rejected with 400.
 *
 * Each result is the input item plus `detail` (the parsed page) or, when
 * that item failed, `detail: null` and `error`. One failing item does not
 * abort the others.
 */
app.post('/api/details', async (req, res) => {
  try {
    const { items, limit = 10 } = req.body ?? {};
    if (!Array.isArray(items)) {
      return res.status(400).json({ success: false, error: 'items must be an array' });
    }

    const results = [];
    const toFetch = items.slice(0, limit);

    for (const [index, item] of toFetch.entries()) {
      // Pause between requests; nothing to wait for before the first one.
      if (index > 0) {
        await new Promise((resolve) => setTimeout(resolve, 500));
      }

      try {
        const html = await fetchHtml(item.href);
        const detail = parseDetail(html);
        results.push({ ...item, detail });
      } catch (err) {
        results.push({ ...item, detail: null, error: err.message });
      }
    }

    res.json({ success: true, data: results });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
|
||
|
||
/**
 * POST /api/report — build a budget report from the latest announcements.
 *
 * JSON body: `{ limit?, threshold?, url? }`
 *   - `limit` (default 15): number of announcements to inspect;
 *   - `threshold` (default 50): only projects whose budget exceeds this many
 *     万元 are reported;
 *   - `url`: list root to scrape instead of BASE_URL (blank or non-string
 *     values are ignored).
 *
 * List pages are fetched until `limit` items are available, a page is empty,
 * a page fails, or the page cap is reached. Each item's detail page is then
 * fetched; items whose detail page fails are counted in `total_count` but
 * cannot pass the budget filter.
 */
app.post('/api/report', async (req, res) => {
  try {
    const { limit = 15, threshold = 50, url } = req.body ?? {};

    // Coerce the numeric inputs once; non-numeric values fall back to the
    // defaults instead of silently producing an empty report.
    const parsedLimit = Number(limit);
    const maxItems = Number.isFinite(parsedLimit) ? Math.max(0, Math.floor(parsedLimit)) : 15;
    const parsedThreshold = Number(threshold);
    const minAmount = Number.isFinite(parsedThreshold) ? parsedThreshold : 50;
    const targetUrl = typeof url === 'string' && url.trim() !== '' ? url.trim() : BASE_URL;

    // Fetch as many list pages as needed to reach `maxItems`.
    const items = [];
    let pageIndex = 0;
    // Assumes roughly 10 items per page, plus one spare page for safety.
    const maxPagesToFetch = Math.ceil(maxItems / 10) + 1;

    while (items.length < maxItems && pageIndex < maxPagesToFetch) {
      const pageUrl = getPageUrl(pageIndex, targetUrl);
      console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);

      try {
        const html = await fetchHtml(pageUrl);
        const pageItems = parseList(html);

        if (pageItems.length === 0) {
          console.log(`第 ${pageIndex + 1} 页没有数据,停止抓取`);
          break;
        }

        items.push(...pageItems);
        pageIndex++;

        if (items.length < maxItems && pageIndex < maxPagesToFetch) {
          await new Promise((resolve) => setTimeout(resolve, 500));
        }
      } catch (err) {
        // Best effort: report on whatever was collected before the failure.
        console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
        break;
      }
    }

    // Fetch the detail page of every selected item.
    const results = [];
    const toFetch = items.slice(0, maxItems);

    for (const [index, item] of toFetch.entries()) {
      // Pause between requests; nothing to wait for before the first one.
      if (index > 0) {
        await new Promise((resolve) => setTimeout(resolve, 500));
      }

      try {
        const html = await fetchHtml(item.href);
        const detail = parseDetail(html);
        results.push({ ...item, detail });
      } catch (err) {
        results.push({ ...item, detail: null, error: err.message });
      }
    }

    // `undefined > minAmount` is false, so items without a detail or a
    // budget are dropped here.
    const filtered = results.filter((item) => item.detail?.budget?.amount > minAmount);
    const total = filtered.reduce((sum, item) => sum + item.detail.budget.amount, 0);

    const report = {
      summary: {
        total_count: results.length,
        filtered_count: filtered.length,
        threshold: `${minAmount}万元`,
        total_amount: `${total.toFixed(2)}万元`,
        generated_at: new Date().toISOString(),
      },
      projects: filtered.map((item) => ({
        title: item.title,
        date: item.date,
        publish_time: item.detail.publishTime,
        budget: item.detail.budget,
        url: item.href,
      })),
    };

    res.json({ success: true, data: report });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
|
||
|
||
/**
 * POST /api/report-daterange — build a budget report for a date range.
 *
 * JSON body: `{ startDate?, endDate?, threshold?, maxPages? }`
 *   - `threshold` (default 50): only projects whose budget exceeds this many
 *     万元 are reported;
 *   - `maxPages` (default 23): cap on the number of list pages crawled.
 *
 * Every announcement found in the range has its detail page fetched; items
 * whose detail page fails are counted in `total_count` but cannot pass the
 * budget filter. An empty range yields a report with zero counts and
 * `total_amount` of `0.00万元`.
 */
app.post('/api/report-daterange', async (req, res) => {
  try {
    const { startDate, endDate, threshold = 50, maxPages = 23 } = req.body ?? {};
    const parsedThreshold = Number(threshold);
    const minAmount = Number.isFinite(parsedThreshold) ? parsedThreshold : 50;

    // Collect the list entries inside the range.
    const items = await fetchListByDateRange(startDate, endDate, maxPages);

    // Fetch the detail page of every entry. With no entries the loop is
    // skipped and the code below produces the empty report on its own.
    const results = [];
    for (const [index, item] of items.entries()) {
      // Pause between requests; nothing to wait for before the first one.
      if (index > 0) {
        await new Promise((resolve) => setTimeout(resolve, 500));
      }

      try {
        const html = await fetchHtml(item.href);
        const detail = parseDetail(html);
        results.push({ ...item, detail });
      } catch (err) {
        results.push({ ...item, detail: null, error: err.message });
      }
    }

    // `undefined > minAmount` is false, so items without a detail or a
    // budget are dropped here.
    const filtered = results.filter((item) => item.detail?.budget?.amount > minAmount);
    const total = filtered.reduce((sum, item) => sum + item.detail.budget.amount, 0);

    const report = {
      summary: {
        total_count: results.length,
        filtered_count: filtered.length,
        threshold: `${minAmount}万元`,
        total_amount: `${total.toFixed(2)}万元`,
        generated_at: new Date().toISOString(),
        date_range: { startDate, endDate },
      },
      projects: filtered.map((item) => ({
        title: item.title,
        date: item.date,
        publish_time: item.detail.publishTime,
        budget: item.detail.budget,
        url: item.href,
      })),
    };

    res.json({ success: true, data: report });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
|
||
|
||
// Start the HTTP server and log the local address once it is listening.
app.listen(PORT, () => {
  console.log(`Server running at http://localhost:${PORT}`);
});
|