import express from 'express';
import cors from 'cors';
import axios from 'axios';
import * as cheerio from 'cheerio';
import iconv from 'iconv-lite';

const app = express();
const PORT = 3000;

app.use(cors());
app.use(express.json());
app.use(express.static('public'));

const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';

// Default number of list pages scanned by the date-range endpoints.
const DEFAULT_MAX_PAGES = 23;

// Pause between consecutive outbound requests, in milliseconds.
const REQUEST_DELAY_MS = 500;

// Selectors tried for the article body; the longest non-empty text wins.
const CONTENT_SELECTORS = [
  '.zhenwen td', // legacy layout
  '.con', // newer layout
  '.article-content', // generic layout
  '.ewb-article-content',
];

/**
 * Error raised for invalid client input; `asyncRoute` maps it to HTTP 400.
 */
class BadRequestError extends Error {
  constructor(message) {
    super(message);
    this.name = 'BadRequestError';
  }
}

/**
 * Resolve after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Coerce a request value to a finite number.
 *
 * @param {unknown} value - Raw value from the request.
 * @param {number} fallback - Returned when `value` is missing, empty or not numeric.
 * @returns {number}
 */
function toFiniteNumber(value, fallback) {
  if (value == null || value === '') return fallback;
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Coerce a request value to a non-negative integer (fractions are floored).
 *
 * @param {unknown} value - Raw value from the request.
 * @param {number} fallback - Returned when `value` is missing, not numeric or negative.
 * @returns {number}
 */
function toNonNegativeInt(value, fallback) {
  const parsed = toFiniteNumber(value, fallback);
  return parsed >= 0 ? Math.floor(parsed) : fallback;
}

/**
 * Tell whether `value` parses as an absolute http(s) URL.
 *
 * @param {unknown} value - Candidate URL.
 * @returns {boolean}
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') return false;
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * Pick the list URL for a request: the caller-supplied one when present,
 * otherwise `fallback`.
 *
 * NOTE(review): the server fetches this URL on behalf of the client. Only the
 * scheme is restricted here; if the service is reachable by untrusted clients,
 * consider a host allow-list as well (SSRF).
 *
 * @param {unknown} value - Raw `url` value from the request.
 * @param {string} [fallback=BASE_URL] - Used when `value` is missing or blank.
 * @returns {string}
 * @throws {BadRequestError} When a URL is supplied but is not http(s).
 */
function resolveHttpUrl(value, fallback = BASE_URL) {
  if (typeof value !== 'string' || value.trim() === '') return fallback;
  const trimmed = value.trim();
  if (!isHttpUrl(trimmed)) {
    throw new BadRequestError(`Invalid URL: ${trimmed}`);
  }
  return trimmed;
}

/**
 * Reject a date parameter that is present but cannot be parsed, so that a typo
 * does not silently turn into an unbounded crawl.
 *
 * @param {unknown} value - Raw date value; null, undefined and '' mean "no bound".
 * @param {string} name - Parameter name used in the error message.
 * @throws {BadRequestError} When the value is present and unparsable.
 */
function assertValidDateParam(value, name) {
  if (value == null || value === '') return;
  if (Number.isNaN(new Date(value).getTime())) {
    throw new BadRequestError(`Invalid ${name}: ${value}`);
  }
}

/**
 * Wrap an async route handler so that failures become a JSON error response:
 * 400 for `BadRequestError`, 500 for anything else.
 *
 * @param {(req: object, res: object) => Promise<void>} handler
 * @returns {(req: object, res: object) => Promise<void>}
 */
function asyncRoute(handler) {
  return async (req, res) => {
    try {
      await handler(req, res);
    } catch (error) {
      const status = error instanceof BadRequestError ? 400 : 500;
      res.status(status).json({ success: false, error: error.message });
    }
  };
}

/**
 * Build the URL of a list page.
 *
 * @param {number} pageIndex - Zero-based page index; 0 is the base URL itself.
 * @param {string} [baseUrl=BASE_URL] - List root URL.
 * @returns {string} `baseUrl` for page 0, otherwise `<baseUrl>/index_<pageIndex>.html`.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  if (pageIndex === 0) {
    return baseUrl;
  }
  const cleanBaseUrl = baseUrl.replace(/\/$/, '');
  return `${cleanBaseUrl}/index_${pageIndex}.html`;
}

/**
 * Check whether a date string falls inside an optional, inclusive range.
 *
 * @param {string} dateStr - Date to test.
 * @param {string} [startDate] - Lower bound; falsy means unbounded.
 * @param {string} [endDate] - Upper bound; falsy means unbounded.
 * @returns {boolean} False when `dateStr` is empty or unparsable.
 */
function isDateInRange(dateStr, startDate, endDate) {
  if (!dateStr) return false;
  const date = new Date(dateStr);
  if (Number.isNaN(date.getTime())) return false;
  if (startDate && date < new Date(startDate)) return false;
  if (endDate && date > new Date(endDate)) return false;
  return true;
}

/**
 * Walk the list pages under BASE_URL and collect the items whose date is
 * inside the given range.
 *
 * Stops at `maxPages`, at the first empty page, at the first page whose items
 * are all older than `startDate`, or at the first fetch failure (in which case
 * the items collected so far are returned).
 *
 * @param {string} [startDate] - Lower bound; falsy means unbounded.
 * @param {string} [endDate] - Upper bound; falsy means unbounded.
 * @param {number} [maxPages=23] - Maximum number of pages to fetch.
 * @returns {Promise<Array<{title: string, href: string, date: string}>>}
 */
async function fetchListByDateRange(startDate, endDate, maxPages = DEFAULT_MAX_PAGES) {
  const allItems = [];
  const rangeStart = startDate ? new Date(startDate) : null;
  let shouldContinue = true;
  let pageIndex = 0;

  console.log(`开始按时间范围抓取: ${startDate || '不限'} 至 ${endDate || '不限'}`);

  while (shouldContinue && pageIndex < maxPages) {
    const pageUrl = getPageUrl(pageIndex);
    console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);

    try {
      const html = await fetchHtml(pageUrl);
      const items = parseList(html, pageUrl);

      if (items.length === 0) {
        console.log(`第 ${pageIndex + 1} 页没有数据,停止抓取`);
        break;
      }

      const inRange = items.filter((item) => isDateInRange(item.date, startDate, endDate));
      allItems.push(...inRange);

      // Once a whole page is older than the start date there is no point in
      // fetching further pages.
      const allItemsBeforeRange =
        rangeStart !== null && items.every((item) => new Date(item.date) < rangeStart);
      if (allItemsBeforeRange) {
        console.log(`第 ${pageIndex + 1} 页所有项目都早于起始日期,停止抓取`);
        shouldContinue = false;
      }

      console.log(`第 ${pageIndex + 1} 页找到 ${items.length} 条,符合条件 ${inRange.length > 0 ? '有' : '无'}`);
      pageIndex++;

      if (shouldContinue && pageIndex < maxPages) {
        await sleep(REQUEST_DELAY_MS);
      }
    } catch (err) {
      console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
      break;
    }
  }

  console.log(`总共抓取了 ${pageIndex} 页,找到 ${allItems.length} 条符合条件的公告`);
  return allItems;
}

// Shared axios instance; responses are kept as raw bytes so the charset can be
// chosen before decoding.
const http = axios.create({
  responseType: 'arraybuffer',
  timeout: 10000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (compatible; gjzx-scraper/1.0)',
  },
});

/**
 * Derive an iconv-lite encoding name from a Content-Type value.
 *
 * @param {string} [contentType=''] - Content-Type header value (or any text containing `charset=`).
 * @returns {string} 'gbk' for any GB* charset, the declared charset when
 *   iconv-lite supports it, otherwise 'utf-8'.
 */
function pickEncoding(contentType = '') {
  // The charset value may be quoted, e.g. charset="utf-8".
  const match = /charset=["']?([^;"'\s]+)/i.exec(contentType);
  if (!match) return 'utf-8';
  const charset = match[1].trim().toLowerCase();
  if (charset.includes('gb')) return 'gbk';
  // An unsupported label would make iconv.decode throw; fall back instead.
  return iconv.encodingExists(charset) ? charset : 'utf-8';
}

/**
 * Download a page and decode it to a string.
 *
 * The charset comes from the Content-Type header; when the header does not
 * declare one, a `<meta ... charset=...>` tag near the start of the document
 * is used if present.
 *
 * @param {string} url - Page URL.
 * @returns {Promise<string>} Decoded HTML.
 * @throws Rejects with the axios error when the request fails.
 */
async function fetchHtml(url) {
  const res = await http.get(url);
  const contentType = res.headers['content-type'] ?? '';
  let encoding = pickEncoding(contentType);

  if (!/charset=/i.test(contentType)) {
    // latin1 maps every byte to one character, so ASCII markup can be scanned
    // without knowing the real encoding yet.
    const head = Buffer.from(res.data).subarray(0, 2048).toString('latin1');
    const meta = /<meta[^>]+charset=["']?([\w-]+)/i.exec(head);
    if (meta) {
      encoding = pickEncoding(`charset=${meta[1]}`);
    }
  }

  return iconv.decode(res.data, encoding);
}

/**
 * Extract announcement entries from a list page.
 *
 * A table row is kept when its first cell holds a link with a title of at
 * least 5 characters, its second cell holds a YYYY-MM-DD date, and the link
 * resolves to an http(s) URL.
 *
 * @param {string} html - List page HTML.
 * @param {string} [baseUrl=BASE_URL] - URL the page was fetched from; relative links are resolved against it.
 * @returns {Array<{title: string, href: string, date: string}>}
 */
function parseList(html, baseUrl = BASE_URL) {
  const $ = cheerio.load(html);
  const items = [];

  $('table tr').each((_, row) => {
    const $row = $(row);
    const link = $row.find('td:first-child a').first();
    const dateCell = $row.find('td:nth-child(2)');
    if (!link.length || !dateCell.length) return;

    const title = link.attr('title') || link.text().trim();
    const rawHref = link.attr('href') || '';
    const dateText = dateCell.text().trim();

    // Skip navigation links and empty links.
    if (!rawHref || !title || title.length < 5) return;
    if (rawHref === './' || rawHref === '../') return;

    // Require a YYYY-MM-DD date.
    if (!/^\d{4}-\d{2}-\d{2}$/.test(dateText)) return;

    let href;
    try {
      href = new URL(rawHref, baseUrl).toString();
    } catch {
      // Unresolvable link: skip the row.
      return;
    }
    // javascript:, mailto: and similar links cannot be fetched as detail pages.
    if (!isHttpUrl(href)) return;

    items.push({ title, href, date: dateText });
  });

  return items;
}

/**
 * Extract title, publish time, body text and budget from a detail page.
 *
 * @param {string} html - Detail page HTML.
 * @returns {{title: string, publishTime: string, content: string, budget: (object|null)}}
 *   `content` is the longest text found by CONTENT_SELECTORS, or the whole
 *   body text when none of them matches. `budget` is the result of
 *   `extractBudget` on `content`, retried on the whole body text when the
 *   article text yields nothing.
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  // Try several title selectors in turn.
  let title = $('.title18').text().trim();
  if (!title) {
    title = $('.article-info h1').text().trim();
  }
  if (!title) {
    title = $('h1').first().text().trim();
  }

  // Publish time: first look in the cell that carries both labels.
  const publishTd = $('td:contains("发布部门")').filter((_, el) => {
    return $(el).text().includes('发布时间');
  });
  const publishText = publishTd.text().trim();
  let timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
  let publishTime = timeMatch ? timeMatch[1] : '';

  // Otherwise accept a minute-precision timestamp from the info block or body.
  if (!publishTime) {
    const infoText = $('.info-sources').text() || $('body').text();
    timeMatch = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
    publishTime = timeMatch ? timeMatch[1] : '';
  }

  let content = '';
  for (const selector of CONTENT_SELECTORS) {
    const text = $(selector).first().text().trim();
    if (text.length > content.length) {
      content = text;
    }
  }

  // The body is a true fallback: comparing it by length with the specific
  // selectors would always pick it, because it contains all of them.
  const bodyText = $('body').first().text().trim();
  if (!content) {
    content = bodyText;
  }

  const budget = extractBudget(content) ?? extractBudget(bodyText);

  return {
    title,
    publishTime,
    content,
    budget,
  };
}

/**
 * Find a budget amount in free text and normalise it to 万元.
 *
 * Patterns are tried from most to least specific; for each pattern only its
 * first occurrence in the text is considered, and the first one whose amount
 * lies between 0.01 and 1000000 万元 is returned.
 *
 * NOTE(review): the original patterns listed the same ASCII character twice
 * (`¥|¥`, `[\d,,]`, `[((]`), which looks like full-width variants lost in an
 * encoding round-trip. They are spelled out with \u escapes below (U+FFE5,
 * U+FF0C, U+FF08, U+FF09) — confirm against real pages.
 *
 * @param {string} content - Text to search.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   Null when `content` is not a non-empty string or no valid amount is found.
 */
function extractBudget(content) {
  if (typeof content !== 'string' || content === '') return null;

  // Ordered from highest to lowest priority. `divider` converts 元 to 万元.
  const patterns = [
    // Currency marker followed by an amount in 万元.
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*万元/ },
    // Parenthesised amount with a currency sign, in 元.
    { regex: /[(\uFF08][\u00A5\uFFE5]([\d,\uFF0C]+(?:\.\d+)?)[)\uFF09]/, divider: 10000 },
    // Plain amount in 万元.
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*万元/ },
    // Currency marker followed by an amount in 元.
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*元/, divider: 10000 },
    // Plain amount in 元; "元整" is excluded.
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*元(?!整)/, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    const match = content.match(regex);
    if (!match) continue;

    // Drop thousands separators before parsing.
    const parsed = Number.parseFloat(match[1].replace(/[,\uFF0C]/g, ''));
    const amount = divider ? parsed / divider : parsed;

    if (!Number.isNaN(amount) && amount >= 0.01 && amount <= 1000000) {
      return {
        amount,
        unit: '万元',
        text: match[0],
        originalUnit: divider ? '元' : '万元',
      };
    }
  }

  return null;
}

/**
 * Collect list items starting from the first page of `baseUrl` until at least
 * `limit` items are available.
 *
 * Fetches at most ceil(limit / 10) + 1 pages (assuming roughly 10 items per
 * page) and stops early on an empty page or a fetch failure.
 *
 * @param {number} limit - Number of items wanted.
 * @param {string} [baseUrl=BASE_URL] - List root URL.
 * @returns {Promise<Array<{title: string, href: string, date: string}>>}
 *   May hold more than `limit` items; callers slice as needed.
 */
async function fetchLatestItems(limit, baseUrl = BASE_URL) {
  const items = [];
  const maxPagesToFetch = Math.ceil(limit / 10) + 1;
  let pageIndex = 0;

  while (items.length < limit && pageIndex < maxPagesToFetch) {
    const pageUrl = getPageUrl(pageIndex, baseUrl);
    console.log(`正在抓取第 ${pageIndex + 1} 页: ${pageUrl}`);

    try {
      const html = await fetchHtml(pageUrl);
      const pageItems = parseList(html, pageUrl);

      if (pageItems.length === 0) {
        console.log(`第 ${pageIndex + 1} 页没有数据,停止抓取`);
        break;
      }

      items.push(...pageItems);
      pageIndex++;

      if (items.length < limit && pageIndex < maxPagesToFetch) {
        await sleep(REQUEST_DELAY_MS);
      }
    } catch (err) {
      console.error(`抓取第 ${pageIndex + 1} 页失败: ${err.message}`);
      break;
    }
  }

  return items;
}

/**
 * Fetch and parse the detail page of each item, one at a time.
 *
 * A failure on one item does not abort the run: that item is returned with
 * `detail: null` and an `error` message.
 *
 * @param {Array<{href: string}>} items - Items to enrich.
 * @returns {Promise<Array<object>>} Each input item plus `detail` (and `error` on failure).
 */
async function fetchDetails(items) {
  const results = [];

  for (const [index, item] of items.entries()) {
    try {
      if (!isHttpUrl(item?.href)) {
        throw new Error(`Invalid detail URL: ${item?.href}`);
      }
      const html = await fetchHtml(item.href);
      results.push({ ...item, detail: parseDetail(html) });
    } catch (err) {
      results.push({ ...item, detail: null, error: err.message });
    }

    // Pause between requests, but not after the last one.
    if (index < items.length - 1) {
      await sleep(REQUEST_DELAY_MS);
    }
  }

  return results;
}

/**
 * Build the report payload from detail results.
 *
 * @param {Array<object>} results - Output of `fetchDetails`.
 * @param {number} threshold - Only budgets strictly above this amount (万元) are listed.
 * @param {object} [extraSummary={}] - Extra fields appended to `summary`.
 * @returns {{summary: object, projects: Array<object>}}
 */
function buildReport(results, threshold, extraSummary = {}) {
  const filtered = results.filter((item) => {
    return item.detail?.budget && item.detail.budget.amount > threshold;
  });

  const total = filtered.reduce((sum, item) => sum + item.detail.budget.amount, 0);

  return {
    summary: {
      total_count: results.length,
      filtered_count: filtered.length,
      threshold: `${threshold}万元`,
      total_amount: `${total.toFixed(2)}万元`,
      generated_at: new Date().toISOString(),
      ...extraSummary,
    },
    projects: filtered.map((item) => ({
      title: item.title,
      date: item.date,
      publish_time: item.detail.publishTime,
      budget: item.detail.budget,
      url: item.href,
    })),
  };
}

// API routes

// List one page of announcements.
app.get('/api/list', asyncRoute(async (req, res) => {
  const baseUrl = resolveHttpUrl(req.query.url);
  const parsedPage = Number.parseInt(req.query.page, 10);
  const page = parsedPage >= 1 ? parsedPage : 1;

  // Pages are 1-based in the API and 0-based in the site's URL scheme.
  const url = getPageUrl(page - 1, baseUrl);
  const html = await fetchHtml(url);
  const items = parseList(html, url);

  res.json({ success: true, data: items, page });
}));

// List announcements inside a date range.
app.post('/api/list-daterange', asyncRoute(async (req, res) => {
  const { startDate, endDate, maxPages } = req.body ?? {};
  assertValidDateParam(startDate, 'startDate');
  assertValidDateParam(endDate, 'endDate');

  const items = await fetchListByDateRange(
    startDate,
    endDate,
    toNonNegativeInt(maxPages, DEFAULT_MAX_PAGES)
  );

  res.json({ success: true, data: items });
}));

// Fetch details for client-supplied items.
app.post('/api/details', asyncRoute(async (req, res) => {
  const { items, limit } = req.body ?? {};
  if (!Array.isArray(items)) {
    throw new BadRequestError('items must be an array');
  }

  const toFetch = items.slice(0, toNonNegativeInt(limit, 10));
  const results = await fetchDetails(toFetch);

  res.json({ success: true, data: results });
}));

// Build a report from the most recent announcements.
app.post('/api/report', asyncRoute(async (req, res) => {
  const body = req.body ?? {};
  const limit = toNonNegativeInt(body.limit, 15);
  const threshold = toFiniteNumber(body.threshold, 50);
  const targetUrl = resolveHttpUrl(body.url);

  const items = await fetchLatestItems(limit, targetUrl);
  const results = await fetchDetails(items.slice(0, limit));

  res.json({ success: true, data: buildReport(results, threshold) });
}));

// Build a report from the announcements inside a date range.
app.post('/api/report-daterange', asyncRoute(async (req, res) => {
  const body = req.body ?? {};
  const { startDate, endDate } = body;
  assertValidDateParam(startDate, 'startDate');
  assertValidDateParam(endDate, 'endDate');
  const threshold = toFiniteNumber(body.threshold, 50);
  const maxPages = toNonNegativeInt(body.maxPages, DEFAULT_MAX_PAGES);

  const items = await fetchListByDateRange(startDate, endDate, maxPages);
  const results = await fetchDetails(items);
  const report = buildReport(results, threshold, {
    date_range: { startDate, endDate },
  });

  res.json({ success: true, data: report });
}));

app.listen(PORT, () => {
  console.log(`Server running at http://localhost:${PORT}`);
});