初始化

This commit is contained in:
2025-12-12 15:54:04 +08:00
commit 83a8a3bb9a
2404 changed files with 508391 additions and 0 deletions

409
src/server.js Normal file
View File

@@ -0,0 +1,409 @@
import express from 'express';
import cors from 'cors';
import axios from 'axios';
import * as cheerio from 'cheerio';
import iconv from 'iconv-lite';
// Create the Express application and fix the port the server listens on.
const app = express();
const PORT = 3000;
// Enable CORS using the cors middleware with its default options.
app.use(cors());
// Parse JSON request bodies so the POST routes can read req.body.
app.use(express.json());
// Serve static files from the 'public' directory.
app.use(express.static('public'));
// List page of the scraped site; page URLs and relative links are resolved against it.
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';
/**
 * Build the URL of one page of the paginated list.
 *
 * @param {number} pageIndex - Zero-based page index; 0 is the landing page.
 * @returns {string} BASE_URL itself for page 0, otherwise
 *   `${BASE_URL}index_<pageIndex>.html`.
 */
function getPageUrl(pageIndex) {
  const isLandingPage = pageIndex === 0;
  return isLandingPage ? BASE_URL : `${BASE_URL}index_${pageIndex}.html`;
}
/**
 * Check whether a date string falls inside an optional, inclusive range.
 *
 * @param {string} dateStr - Date to test; empty or unparseable values yield false.
 * @param {string} [startDate] - Lower bound; ignored when falsy.
 * @param {string} [endDate] - Upper bound; ignored when falsy.
 * @returns {boolean} true when dateStr parses and violates neither bound.
 */
function isDateInRange(dateStr, startDate, endDate) {
  if (!dateStr) {
    return false;
  }

  const candidate = new Date(dateStr);
  if (Number.isNaN(candidate.getTime())) {
    return false;
  }

  const isTooEarly = Boolean(startDate) && candidate < new Date(startDate);
  if (isTooEarly) {
    return false;
  }

  const isTooLate = Boolean(endDate) && candidate > new Date(endDate);
  return !isTooLate;
}
/**
 * Walk the paginated list starting at page 0 and collect every entry whose
 * date passes isDateInRange for the given bounds.
 *
 * Crawling stops when a page yields no entries, when every entry on a page is
 * older than startDate, when maxPages pages have been read, or when fetching
 * or parsing a page throws (the error is logged and whatever was collected so
 * far is returned). A 500 ms pause separates consecutive page requests.
 *
 * @param {string} [startDate] - Lower bound of the range; falsy means unbounded.
 * @param {string} [endDate] - Upper bound of the range; falsy means unbounded.
 * @param {number} [maxPages=23] - Maximum number of pages to read.
 * @returns {Promise<Array<object>>} Entries from parseList that are in range.
 */
async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const collected = [];
  let pageIndex = 0;
  let keepGoing = true;

  console.log(`开始按时间范围抓取: ${startDate || '不限'}${endDate || '不限'}`);

  while (keepGoing && pageIndex < maxPages) {
    const pageUrl = getPageUrl(pageIndex);
    console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);

    try {
      const pageItems = parseList(await fetchHtml(pageUrl));
      if (pageItems.length === 0) {
        console.log(`${pageIndex + 1} 页没有数据,停止抓取`);
        break;
      }

      // Classify each entry once: in range, older than the start, or neither.
      let inRangeCount = 0;
      let tooOldCount = 0;
      for (const entry of pageItems) {
        if (isDateInRange(entry.date, startDate, endDate)) {
          collected.push(entry);
          inRangeCount += 1;
          continue;
        }
        if (startDate && new Date(entry.date) < new Date(startDate)) {
          tooOldCount += 1;
        }
      }

      // A page made up solely of too-old entries ends the crawl.
      if (startDate && tooOldCount === pageItems.length) {
        console.log(`${pageIndex + 1} 页所有项目都早于起始日期,停止抓取`);
        keepGoing = false;
      }

      console.log(`${pageIndex + 1} 页找到 ${pageItems.length} 条,符合条件 ${inRangeCount > 0 ? '有' : '无'}`);
      pageIndex += 1;

      // Only wait when another page is actually going to be requested.
      if (keepGoing && pageIndex < maxPages) {
        await pause(500);
      }
    } catch (err) {
      console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
      break;
    }
  }

  console.log(`总共抓取了 ${pageIndex} 页,找到 ${collected.length} 条符合条件的公告`);
  return collected;
}
// Preconfigured axios instance that fetchHtml uses for its GET requests.
const http = axios.create({
// Request the raw bytes: fetchHtml decodes res.data itself with iconv.
responseType: 'arraybuffer',
// Request timeout in milliseconds.
timeout: 10000,
headers: {
// User-Agent header sent with every request made through this instance.
'User-Agent': 'Mozilla/5.0 (compatible; gjzx-scraper/1.0)',
},
});
/**
 * Derive the character-encoding name to decode a response with from its
 * Content-Type header.
 *
 * The charset parameter may legally be a quoted string (`charset="utf-8"`),
 * so surrounding quotes are stripped before the value is used; otherwise the
 * quotes would end up inside the returned encoding name.
 *
 * @param {string} [contentType=''] - Raw Content-Type header value.
 * @returns {string} 'utf-8' when no usable charset is present, 'gbk' for any
 *   charset containing "gb", otherwise the lower-cased charset.
 */
function pickEncoding(contentType = '') {
  const match = /charset=([^;]+)/i.exec(contentType);
  if (!match) return 'utf-8';

  const charset = match[1]
    .trim()
    .replace(/^["']+|["']+$/g, '')
    .trim()
    .toLowerCase();

  // `charset=""` or a whitespace-only value carries no information.
  if (charset === '') return 'utf-8';
  // Every "gb*" label is decoded as gbk.
  if (charset.includes('gb')) return 'gbk';
  return charset;
}
/**
 * Download a page and decode its bytes into a string.
 *
 * The encoding is chosen by pickEncoding from the response's Content-Type
 * header, with 'utf-8' used when that yields an empty value.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} The decoded response body.
 */
async function fetchHtml(url) {
  const { headers, data } = await http.get(url);
  const charset = pickEncoding(headers['content-type']) || 'utf-8';
  return iconv.decode(data, charset);
}
/**
 * Extract list entries from the HTML of a list page.
 *
 * Each table row contributes one entry when its first cell holds a link and
 * its second cell holds a YYYY-MM-DD date. Rows with an empty href, a title
 * shorter than 5 characters, a './' or '../' href, or an href that cannot be
 * resolved against BASE_URL are skipped.
 *
 * @param {string} html - Markup of a list page.
 * @returns {Array<{title: string, href: string, date: string}>} Entries in
 *   document order, with href resolved to an absolute URL.
 */
function parseList(html) {
  const $ = cheerio.load(html);
  const datePattern = /^\d{4}-\d{2}-\d{2}$/;
  const entries = [];

  // Inspect the link in every table row.
  $('table tr').each((_, rowElement) => {
    const $row = $(rowElement);
    const anchor = $row.find('td:first-child a').first();
    const dateCell = $row.find('td:nth-child(2)');
    if (anchor.length === 0 || dateCell.length === 0) return;

    const title = anchor.attr('title') || anchor.text().trim();
    const rawHref = anchor.attr('href') || '';
    const dateText = dateCell.text().trim();

    // Drop navigation links and empty links.
    const hasUsableLink = Boolean(rawHref) && Boolean(title) && title.length >= 5;
    if (!hasUsableLink) return;
    if (rawHref === './' || rawHref === '../') return;

    // Require a YYYY-MM-DD date.
    if (!datePattern.test(dateText)) return;

    let href;
    try {
      href = new URL(rawHref, BASE_URL).toString();
    } catch {
      // Skip rows whose href is not a valid URL.
      return;
    }
    entries.push({ title, href, date: dateText });
  });

  return entries;
}
/**
 * Extract the fields of interest from the HTML of a detail page.
 *
 * @param {string} html - Markup of a detail page.
 * @returns {{title: string, publishTime: string, content: string, budget: (object|null)}}
 *   title: trimmed text of `.title18`;
 *   publishTime: first "YYYY-MM-DD HH:MM:SS" found in the table cell(s) that
 *   mention both 发布部门 and 发布时间, or '' when none is found;
 *   content: trimmed text of the first `.zhenwen td`;
 *   budget: result of extractBudget(content).
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  const publishCells = $('td:contains("发布部门")').filter((_, cell) =>
    $(cell).text().includes('发布时间')
  );
  const timestampPattern = /(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/;
  const [, publishTime = ''] = publishCells.text().trim().match(timestampPattern) ?? [];

  const content = $('.zhenwen td').first().text().trim();

  return {
    title: $('.title18').text().trim(),
    publishTime,
    content,
    budget: extractBudget(content),
  };
}
/**
 * Find the first budget-like amount, expressed in 万元, in a block of text.
 *
 * Labelled amounts are tried from most to least specific (预算金额, 最高限价,
 * 预算, 金额) before falling back to the first bare "<number>万元". The label
 * may be followed by either an ASCII colon or a full-width colon (U+FF1A);
 * without the latter, text using the full-width colon would skip every
 * labelled pattern and the fallback could return an unrelated amount that
 * merely appears earlier in the text.
 *
 * @param {string} content - Text to search.
 * @returns {{amount: number, unit: string, text: string}|null} The parsed
 *   amount, its unit ('万元') and the matched text, or null when content is
 *   not a string or contains no amount.
 */
function extractBudget(content) {
  if (typeof content !== 'string') return null;

  const patterns = [
    /预算金额[:\uFF1A]\s*(\d+(?:\.\d+)?)\s*万元/,
    /最高限价[:\uFF1A]\s*(\d+(?:\.\d+)?)\s*万元/,
    /预算[:\uFF1A]\s*(\d+(?:\.\d+)?)\s*万元/,
    /金额[:\uFF1A]\s*(\d+(?:\.\d+)?)\s*万元/,
    /(\d+(?:\.\d+)?)\s*万元/,
  ];

  for (const pattern of patterns) {
    const match = pattern.exec(content);
    if (match) {
      return {
        amount: Number.parseFloat(match[1]),
        unit: '万元',
        text: match[0],
      };
    }
  }
  return null;
}
// API routes

/**
 * GET /api/list — fetch one page of the list and return its parsed entries.
 *
 * Query parameters:
 *   url  - optional list URL, defaults to BASE_URL. It must be an http(s) URL
 *          on the same host as BASE_URL.
 *   page - optional 1-based page number, defaults to 1; missing, non-numeric
 *          or non-positive values are treated as 1.
 *
 * Responds with { success: true, data, page }; with status 400 when url is
 * rejected; with status 500 and the error message when fetching or parsing
 * throws.
 */
app.get('/api/list', async (req, res) => {
  try {
    const requestedUrl = req.query.url || BASE_URL;

    // The server downloads this URL itself, so anything outside the scraped
    // site is refused; otherwise the endpoint could be used to make the
    // server request arbitrary hosts (SSRF).
    let listUrl = null;
    if (typeof requestedUrl === 'string') {
      try {
        listUrl = new URL(requestedUrl);
      } catch {
        listUrl = null;
      }
    }
    const allowedHost = new URL(BASE_URL).hostname;
    const isAllowed =
      listUrl !== null &&
      (listUrl.protocol === 'http:' || listUrl.protocol === 'https:') &&
      listUrl.hostname === allowedHost;
    if (!isAllowed) {
      return res.status(400).json({
        success: false,
        error: `url must be an http(s) URL on ${allowedHost}`,
      });
    }

    // Always parse with an explicit radix and never report a page below 1.
    const parsedPage = Number.parseInt(req.query.page, 10);
    const page = Number.isInteger(parsedPage) && parsedPage > 0 ? parsedPage : 1;

    // Page 1 is the list URL itself; page N > 1 lives at index_<N-1>.html.
    let url = requestedUrl;
    if (page > 1) {
      // Drop a trailing slash so the path is not doubled.
      const cleanBaseUrl = requestedUrl.replace(/\/$/, '');
      url = `${cleanBaseUrl}/index_${page - 1}.html`;
    }

    const html = await fetchHtml(url);
    const items = parseList(html);
    res.json({ success: true, data: items, page });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * POST /api/list-daterange — list entries whose date falls in a range.
 *
 * Body: { startDate, endDate, maxPages = 23 }, passed straight through to
 * fetchListByDateRange. Responds with { success: true, data }, or with
 * status 500 and the error message when anything throws.
 */
app.post('/api/list-daterange', async (req, res) => {
  const sendFailure = (error) =>
    res.status(500).json({ success: false, error: error.message });

  try {
    const { startDate, endDate, maxPages = 23 } = req.body;
    const data = await fetchListByDateRange(startDate, endDate, maxPages);
    res.json({ success: true, data });
  } catch (error) {
    sendFailure(error);
  }
});
/**
 * POST /api/details — fetch and parse the detail page of each supplied item.
 *
 * Body: { items, limit = 10 }. `items` must be an array of objects carrying
 * an `href`; only the first `limit` are processed. Each result is the item
 * plus `detail` (from parseDetail), or `detail: null` and `error` when that
 * item's page could not be fetched or parsed.
 *
 * Responds with { success: true, data }; with status 400 when `items` is not
 * an array; with status 500 and the error message on any other failure.
 */
app.post('/api/details', async (req, res) => {
  try {
    const { items, limit = 10 } = req.body ?? {};

    // Reject a malformed body up front instead of failing later with an
    // opaque TypeError reported as a server error.
    if (!Array.isArray(items)) {
      return res
        .status(400)
        .json({ success: false, error: 'items must be an array' });
    }

    const results = [];
    const toFetch = items.slice(0, limit);

    for (const [index, item] of toFetch.entries()) {
      // Pause between consecutive requests (also after a failed one), but do
      // not make the client wait once the last item is done.
      if (index > 0) {
        await new Promise((resolve) => setTimeout(resolve, 500));
      }
      try {
        // NOTE(review): item.href is supplied by the client and fetched
        // as-is; consider restricting it to known hosts.
        const html = await fetchHtml(item.href);
        const detail = parseDetail(html);
        results.push({ ...item, detail });
      } catch (err) {
        // Keep going: one broken page must not fail the whole batch.
        results.push({ ...item, detail: null, error: err.message });
      }
    }

    res.json({ success: true, data: results });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * Tell whether a list URL may be fetched for a report: it must parse as an
 * http(s) URL on the same host as BASE_URL. The server downloads the URL
 * itself, so other hosts are refused to avoid server-side request forgery.
 *
 * @param {string} candidate - URL to check.
 * @returns {boolean} true when the URL is acceptable.
 */
function isAllowedReportUrl(candidate) {
  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    return false;
  }
  const isHttp = parsed.protocol === 'http:' || parsed.protocol === 'https:';
  return isHttp && parsed.hostname === new URL(BASE_URL).hostname;
}

/**
 * Fetch and parse the detail page of every item, one request at a time with
 * a 500 ms pause between consecutive requests.
 *
 * @param {Array<object>} items - List entries carrying an `href`.
 * @returns {Promise<Array<object>>} Each item plus `detail` (from
 *   parseDetail), or `detail: null` and `error` when its page failed.
 */
async function fetchDetailsForReport(items) {
  const results = [];
  for (const [index, item] of items.entries()) {
    if (index > 0) {
      await new Promise((resolve) => setTimeout(resolve, 500));
    }
    try {
      const html = await fetchHtml(item.href);
      results.push({ ...item, detail: parseDetail(html) });
    } catch (err) {
      // One broken page must not abort the whole report.
      results.push({ ...item, detail: null, error: err.message });
    }
  }
  return results;
}

/**
 * Assemble the report payload from fetched details.
 *
 * @param {Array<object>} results - Output of fetchDetailsForReport.
 * @param {number} threshold - Only projects whose budget amount is strictly
 *   greater than this value are listed and summed.
 * @param {object} [extraSummary={}] - Extra fields appended to the summary.
 * @returns {{summary: object, projects: Array<object>}} The report.
 */
function buildBudgetReport(results, threshold, extraSummary = {}) {
  const filtered = results.filter(
    (item) => item.detail?.budget && item.detail.budget.amount > threshold
  );
  const total = filtered.reduce(
    (sum, item) => sum + item.detail.budget.amount,
    0
  );

  return {
    summary: {
      total_count: results.length,
      filtered_count: filtered.length,
      threshold: `${threshold}万元`,
      total_amount: `${total.toFixed(2)}万元`,
      generated_at: new Date().toISOString(),
      ...extraSummary,
    },
    projects: filtered.map((item) => ({
      title: item.title,
      date: item.date,
      publish_time: item.detail.publishTime,
      budget: item.detail.budget,
      url: item.href,
    })),
  };
}

/**
 * POST /api/report — build a budget report from one list page.
 *
 * Body: { limit = 15, threshold = 50, url }. `url` defaults to BASE_URL when
 * missing, blank or not a string, and must otherwise be an http(s) URL on the
 * same host as BASE_URL. The first `limit` entries of the page are inspected.
 *
 * Responds with { success: true, data: report }; with status 400 when url is
 * rejected; with status 500 and the error message on any other failure.
 */
app.post('/api/report', async (req, res) => {
  try {
    const { limit = 15, threshold = 50, url } = req.body;
    const hasCustomUrl = typeof url === 'string' && url.trim() !== '';
    const targetUrl = hasCustomUrl ? url.trim() : BASE_URL;

    if (!isAllowedReportUrl(targetUrl)) {
      return res.status(400).json({
        success: false,
        error: `url must be an http(s) URL on ${new URL(BASE_URL).hostname}`,
      });
    }

    const listHtml = await fetchHtml(targetUrl);
    const items = parseList(listHtml);
    const results = await fetchDetailsForReport(items.slice(0, limit));

    res.json({ success: true, data: buildBudgetReport(results, threshold) });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});

/**
 * POST /api/report-daterange — build a budget report from every list entry
 * dated within a range.
 *
 * Body: { startDate, endDate, threshold = 50, maxPages = 23 }. The entries
 * come from fetchListByDateRange; the summary additionally carries
 * `date_range: { startDate, endDate }`. When no entry matches, the report has
 * zero counts, a total of '0.00万元' and an empty project list.
 *
 * Responds with { success: true, data: report }, or with status 500 and the
 * error message when anything throws.
 */
app.post('/api/report-daterange', async (req, res) => {
  try {
    const { startDate, endDate, threshold = 50, maxPages = 23 } = req.body;

    const items = await fetchListByDateRange(startDate, endDate, maxPages);
    const results = await fetchDetailsForReport(items);
    const report = buildBudgetReport(results, threshold, {
      date_range: { startDate, endDate },
    });

    res.json({ success: true, data: report });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
// Start the HTTP server on PORT and log its local URL from the listen callback.
app.listen(PORT, () => {
console.log(`Server running at http://localhost:${PORT}`);
});