import cron from 'node-cron';
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
import axios from 'axios';
import * as cheerio from 'cheerio';
import iconv from 'iconv-lite';
import { sendReportEmail } from './emailService.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/** Time zone used both for the cron schedule and for computing date ranges. */
const SCHEDULER_TIMEZONE = 'Asia/Shanghai';

/** Cron expression used when the configuration does not provide one. */
const DEFAULT_CRON_TIME = '0 9 * * *';

/** Default budget threshold in units of 万元 (100000 万元 = 10 亿元). */
const DEFAULT_THRESHOLD = 100000;

/** Pause between consecutive HTTP requests, in milliseconds. */
const REQUEST_DELAY_MS = 500;

/**
 * Resolve after the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Load and parse `config.json` from the directory above this module.
 *
 * @returns {object|null} The parsed configuration, or `null` when the file
 *   cannot be read or is not valid JSON (the error is logged).
 */
function loadConfig() {
  try {
    const configPath = join(__dirname, '..', 'config.json');
    const configContent = readFileSync(configPath, 'utf-8');
    return JSON.parse(configContent);
  } catch (error) {
    console.error('加载配置文件失败:', error.message);
    console.error('请确保 config.json 文件存在并配置正确');
    return null;
  }
}

/**
 * Compute the start and end dates (`YYYY-MM-DD`) for a named time range.
 *
 * The calendar date is evaluated in SCHEDULER_TIMEZONE rather than the
 * server's local time zone, so that "today" agrees with the time zone the
 * cron job is scheduled in.
 *
 * @param {string} timeRange - 'today', 'thisWeek' (weeks start on Monday) or
 *   'thisMonth'; any other value is treated as 'thisMonth'.
 * @param {Date} [now] - Reference instant; defaults to the current time.
 * @returns {{startDate: string, endDate: string}} The end date is always the
 *   current date.
 */
function getDateRangeByType(timeRange, now = new Date()) {
  const parts = new Intl.DateTimeFormat('en-US', {
    timeZone: SCHEDULER_TIMEZONE,
    year: 'numeric',
    month: '2-digit',
    day: '2-digit',
  }).formatToParts(now);
  const pick = (type) => Number(parts.find((part) => part.type === type).value);

  // Represent the calendar date as UTC midnight so that day arithmetic is not
  // affected by the server's own time zone.
  const today = new Date(Date.UTC(pick('year'), pick('month') - 1, pick('day')));
  const toIsoDate = (date) => date.toISOString().slice(0, 10);

  const endDate = toIsoDate(today);
  let startDate;

  switch (timeRange) {
    case 'today':
      startDate = endDate;
      break;
    case 'thisWeek': {
      // getUTCDay(): 0 is Sunday, 1 is Monday; map to days elapsed since Monday.
      const daysSinceMonday = (today.getUTCDay() + 6) % 7;
      const monday = new Date(today);
      monday.setUTCDate(today.getUTCDate() - daysSinceMonday);
      startDate = toIsoDate(monday);
      break;
    }
    case 'thisMonth':
    default:
      // Keep the "YYYY-MM-" prefix and pin the day to the 1st.
      startDate = `${endDate.slice(0, 8)}01`;
      break;
  }

  return { startDate, endDate };
}

/**
 * Date range for the current month (kept for compatibility with older code).
 *
 * @returns {{startDate: string, endDate: string}}
 */
function getCurrentMonthDateRange() {
  return getDateRangeByType('thisMonth');
}

// Helper functions copied from server.js
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';

// Shared HTTP client: responses are fetched as raw bytes so they can be
// decoded with the charset announced by the server.
const http = axios.create({
  responseType: 'arraybuffer',
  timeout: 10000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (compatible; gjzx-scraper/1.0)',
  },
});

/**
 * Choose the text encoding from a Content-Type header value.
 *
 * @param {string} [contentType] - Value of the Content-Type header.
 * @returns {string} 'gbk' for any charset containing "gb", the declared
 *   charset (lower-cased, surrounding quotes removed) otherwise, or 'utf-8'
 *   when no charset is declared.
 */
function pickEncoding(contentType = '') {
  const match = /charset=([^;]+)/i.exec(contentType);
  if (!match) return 'utf-8';
  // The charset parameter may legally be quoted: charset="gbk".
  const charset = match[1]
    .trim()
    .replace(/^["']|["']$/g, '')
    .toLowerCase();
  if (!charset) return 'utf-8';
  if (charset.includes('gb')) return 'gbk';
  return charset;
}

/**
 * Download a page and decode it to a string.
 *
 * @param {string} url - Page URL.
 * @returns {Promise<string>} The decoded HTML.
 * @throws Propagates request errors from the HTTP client.
 */
async function fetchHtml(url) {
  const res = await http.get(url);
  const declared = pickEncoding(res.headers['content-type']);
  // iconv.decode throws on unknown encodings, so fall back to UTF-8.
  const encoding = iconv.encodingExists(declared) ? declared : 'utf-8';
  return iconv.decode(res.data, encoding);
}

/**
 * Build the URL of a list page.
 *
 * @param {number} pageIndex - Zero-based page index.
 * @param {string} [baseUrl] - Base URL of the list.
 * @returns {string} The base URL for page 0, `<base>/index_<n>.html` otherwise.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  if (pageIndex === 0) {
    return baseUrl;
  }
  const cleanBaseUrl = baseUrl.replace(/\/$/, '');
  return `${cleanBaseUrl}/index_${pageIndex}.html`;
}

/**
 * Extract announcement entries from a list page.
 *
 * A table row is accepted when its first cell contains a link and its second
 * cell contains a `YYYY-MM-DD` date. Rows with a title shorter than five
 * characters or with a './' or '../' link are skipped.
 *
 * @param {string} html - List page HTML.
 * @returns {Array<{title: string, href: string, date: string}>} Entries with
 *   `href` resolved against BASE_URL.
 */
function parseList(html) {
  const $ = cheerio.load(html);
  const items = [];

  $('table tr').each((_, row) => {
    const $row = $(row);
    const link = $row.find('td:first-child a').first();
    const dateCell = $row.find('td:nth-child(2)');
    if (!link.length || !dateCell.length) return;

    const title = link.attr('title') || link.text().trim();
    const rawHref = link.attr('href') || '';
    const dateText = dateCell.text().trim();

    if (!rawHref || !title || title.length < 5) return;
    if (rawHref === './' || rawHref === '../') return;
    if (!/^\d{4}-\d{2}-\d{2}$/.test(dateText)) return;

    try {
      const href = new URL(rawHref, BASE_URL).toString();
      items.push({ title, href, date: dateText });
    } catch (err) {
      // The link cannot be resolved to a URL; skip this row.
    }
  });

  return items;
}

/**
 * Check whether a date string lies within an inclusive range.
 *
 * @param {string} dateStr - Date to test.
 * @param {string} [startDate] - Lower bound; no lower bound when falsy.
 * @param {string} [endDate] - Upper bound; no upper bound when falsy.
 * @returns {boolean} `false` for empty or unparseable dates.
 */
function isDateInRange(dateStr, startDate, endDate) {
  if (!dateStr) return false;
  const date = new Date(dateStr);
  if (Number.isNaN(date.getTime())) return false;
  if (startDate && date < new Date(startDate)) return false;
  if (endDate && date > new Date(endDate)) return false;
  return true;
}

/**
 * Walk the list pages and collect the entries dated within the given range.
 *
 * Paging stops when a page yields no entries, when every entry on a page is
 * older than `startDate`, when a request fails, or after `maxPages` pages.
 *
 * @param {string} [startDate] - Inclusive lower bound (`YYYY-MM-DD`).
 * @param {string} [endDate] - Inclusive upper bound (`YYYY-MM-DD`).
 * @param {number} [maxPages] - Maximum number of pages to fetch.
 * @returns {Promise<Array<{title: string, href: string, date: string}>>}
 */
async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
  const allItems = [];
  let shouldContinue = true;
  let pageIndex = 0;

  console.log(`开始按时间范围采集: ${startDate || '不限'} 至 ${endDate || '不限'}`);

  while (shouldContinue && pageIndex < maxPages) {
    const pageUrl = getPageUrl(pageIndex);
    console.log(`正在采集第 ${pageIndex + 1} 页: ${pageUrl}`);

    try {
      const html = await fetchHtml(pageUrl);
      const items = parseList(html);

      if (items.length === 0) {
        console.log(`第 ${pageIndex + 1} 页没有数据,停止采集`);
        break;
      }

      const inRange = items.filter((item) => isDateInRange(item.date, startDate, endDate));
      allItems.push(...inRange);
      const hasItemsInRange = inRange.length > 0;

      // Stop paging once a whole page predates the start of the range.
      const allItemsBeforeRange =
        Boolean(startDate) &&
        items.every((item) => new Date(item.date) < new Date(startDate));

      if (allItemsBeforeRange) {
        console.log(`第 ${pageIndex + 1} 页所有项目都早于起始日期,停止采集`);
        shouldContinue = false;
      }

      console.log(`第 ${pageIndex + 1} 页找到 ${items.length} 条,符合条件 ${hasItemsInRange ? '有' : '无'}`);

      pageIndex++;

      if (shouldContinue && pageIndex < maxPages) {
        await sleep(REQUEST_DELAY_MS);
      }
    } catch (err) {
      console.error(`采集第 ${pageIndex + 1} 页失败: ${err.message}`);
      break;
    }
  }

  console.log(`总共采集了 ${pageIndex} 页,找到 ${allItems.length} 条符合条件的公告`);
  return allItems;
}

// parseDetail and related helpers, taken from server.js

/**
 * Extract title, publish time, text content and budget from a detail page.
 *
 * @param {string} html - Detail page HTML.
 * @returns {{title: string, publishTime: string, content: string, budget: object|null}}
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  // Try progressively more generic title selectors.
  let title = $('.title18').text().trim();
  if (!title) {
    title = $('.article-info h1').text().trim();
  }
  if (!title) {
    title = $('h1').first().text().trim();
  }

  // Cells that mention both the publishing department and the publish time.
  const publishTd = $('td:contains("发布部门")').filter((_, el) => {
    return $(el).text().includes('发布时间');
  });
  const publishText = publishTd.text().trim();
  let timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
  let publishTime = timeMatch ? timeMatch[1] : '';

  if (!publishTime) {
    // Fallback: a timestamp without seconds anywhere in the info block or page.
    const infoText = $('.info-sources').text() || $('body').text();
    timeMatch = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
    publishTime = timeMatch ? timeMatch[1] : '';
  }

  // Keep the longest text among the first match of each candidate selector.
  // NOTE(review): 'body' is one of the candidates and contains the text of the
  // others, so it will normally be the one selected — confirm this is intended.
  let content = '';
  const contentSelectors = [
    '.zhenwen td',
    '.con',
    '.article-content',
    '.ewb-article-content',
    'body',
  ];
  for (const selector of contentSelectors) {
    const el = $(selector).first();
    if (el.length > 0) {
      const text = el.text().trim();
      if (text.length > content.length) {
        content = text;
      }
    }
  }

  const budget = extractBudget(content);

  return {
    title,
    publishTime,
    content,
    budget,
  };
}

/**
 * Find a monetary amount in announcement text and normalise it to 万元.
 *
 * Patterns are tried in priority order and the first one that yields an
 * amount between 0.01 and 100000000 万元 wins. Both half-width and full-width
 * currency signs, thousands separators and parentheses are recognised.
 *
 * @param {string} content - Announcement text; nullish input yields `null`.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   `amount` is in 万元, `text` is the matched source fragment and
 *   `originalUnit` is the unit the amount was written in.
 */
function extractBudget(content) {
  // Re-join numbers that were split across line breaks.
  const cleanedContent = String(content ?? '').replace(/(\d)\s*[\n\r]\s*(?=\d)/g, '$1');

  // Ordered from most to least specific; `divider` converts 元 to 万元.
  // \u00A5 / \uFFE5 are the half- and full-width yen signs, \uFF0C the
  // full-width comma, \uFF08 / \uFF09 the full-width parentheses.
  const patterns = [
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*万元/i },
    { regex: /[(\uFF08][\u00A5\uFFE5]([\d,\uFF0C]+(?:\.\d+)?)[)\uFF09]/i, divider: 10000 },
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*万元/i },
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*元/i, divider: 10000 },
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*元(?!整)/i, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    const match = cleanedContent.match(regex);
    if (!match) continue;

    const numberStr = match[1].replace(/[,\uFF0C]/g, '');
    let amount = Number.parseFloat(numberStr);
    if (divider) {
      amount /= divider;
    }

    if (!Number.isNaN(amount) && amount >= 0.01 && amount <= 100000000) {
      return {
        amount,
        unit: '万元',
        text: match[0],
        originalUnit: divider ? '元' : '万元',
      };
    }
  }

  return null;
}

/**
 * Look up the signed PDF URL of a bulletin through the jszbtb API.
 *
 * The bulletin id and type are taken from the page URL; the type defaults to
 * '1' when the URL has no `bulletinType` parameter.
 *
 * @param {string} pageUrl - Bulletin detail page URL.
 * @returns {Promise<string|null>} The signed PDF URL, or `null` when the URL
 *   has no bulletin id, the API reports no PDF, or the request fails.
 */
async function fetchPdfUrlFromApi(pageUrl) {
  try {
    const bulletinIdMatch = pageUrl.match(/bulletinDetails\/[^\/]+\/([a-f0-9]+)/i);
    const bulletinTypeMatch = pageUrl.match(/bulletinType=(\d+)/);

    if (!bulletinIdMatch) {
      return null;
    }

    const bulletinId = bulletinIdMatch[1];
    const bulletinType = bulletinTypeMatch ? bulletinTypeMatch[1] : '1';
    const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;

    const response = await http.get(apiUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
        'Referer': 'https://www.jszbcg.com/',
      },
      responseType: 'arraybuffer',
    });

    const responseText = iconv.decode(response.data, 'utf-8');
    const data = JSON.parse(responseText);

    if (data.success && data.data && data.data.signedPdfUrl) {
      return data.data.signedPdfUrl;
    }
    return null;
  } catch (err) {
    // Best effort: the caller falls back to scanning the page for an iframe.
    console.warn(`获取PDF地址失败: ${err.message}`);
    return null;
  }
}

/**
 * Find the PDF URL embedded in a page through a viewer iframe.
 *
 * Every iframe is inspected and the first one whose `src` carries a `file=`
 * query parameter provides the URL.
 *
 * @param {string} html - Page HTML.
 * @param {string} pageUrl - Page URL, used to resolve relative PDF paths.
 * @returns {string|null} The PDF URL (absolute values are returned as found,
 *   relative ones are resolved against `pageUrl`), or `null` if none is found.
 */
function extractPdfUrl(html, pageUrl) {
  const $ = cheerio.load(html);

  for (const el of $('iframe').toArray()) {
    const src = $(el).attr('src');
    if (!src) continue;

    const match = src.match(/[?&]file=([^&]+)/);
    if (!match) continue;

    try {
      // decodeURIComponent throws on malformed escapes, new URL on bad input.
      const pdfUrl = decodeURIComponent(match[1]);
      if (pdfUrl.startsWith('http://') || pdfUrl.startsWith('https://')) {
        return pdfUrl;
      }
      return new URL(pdfUrl, pageUrl).toString();
    } catch (err) {
      // Unusable value; try the next iframe.
    }
  }

  return null;
}

/**
 * Download a PDF and extract its text.
 *
 * @param {string} pdfUrl - URL of the PDF document.
 * @returns {Promise<string>} The text reported by the parser.
 * @throws Propagates download and parsing errors to the caller.
 */
async function fetchPdfContent(pdfUrl) {
  // Loaded on demand, so the dependency is only needed when a PDF is found.
  const { PDFParse } = await import('pdf-parse');
  const response = await http.get(pdfUrl, {
    responseType: 'arraybuffer',
    timeout: 30000,
  });

  const parser = new PDFParse({ data: response.data });
  try {
    const result = await parser.getText();
    return result.text;
  } finally {
    // Release the parser even when text extraction fails.
    await parser.destroy();
  }
}

/**
 * Parse a detail page, preferring the text of an attached PDF when available.
 *
 * For jszbcg.com pages the PDF URL is requested from the API first; otherwise,
 * or when that yields nothing, the page is scanned for a viewer iframe. If no
 * PDF is found or it cannot be read, the HTML text is used instead.
 *
 * @param {string} html - Detail page HTML.
 * @param {string} pageUrl - Detail page URL.
 * @returns {Promise<object>} The fields of `parseDetail`, with `content` and
 *   `budget` taken from the PDF when it was parsed, plus `hasPdf` and `pdfUrl`
 *   (`null` unless the PDF was parsed).
 */
async function parseDetailEnhanced(html, pageUrl) {
  const basicInfo = parseDetail(html);

  let pdfUrl = null;
  if (pageUrl.includes('jszbcg.com')) {
    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
  }
  if (!pdfUrl) {
    pdfUrl = extractPdfUrl(html, pageUrl);
  }

  let content = basicInfo.content;
  let pdfParsed = false;

  if (pdfUrl) {
    try {
      content = (await fetchPdfContent(pdfUrl)) ?? '';
      pdfParsed = true;
    } catch (err) {
      // Keep the HTML content when the PDF cannot be downloaded or parsed.
      console.warn(`PDF解析失败,使用网页内容: ${err.message}`);
    }
  }

  return {
    ...basicInfo,
    content,
    budget: extractBudget(content),
    hasPdf: pdfParsed,
    pdfUrl: pdfParsed ? pdfUrl : null,
  };
}

/**
 * Fetch and parse the detail page of every list entry, one at a time.
 *
 * @param {Array<{title: string, href: string, date: string}>} items - List entries.
 * @returns {Promise<Array<object>>} Each entry extended with `detail`; entries
 *   that failed carry `detail: null` and an `error` message.
 */
async function fetchDetails(items) {
  const results = [];

  for (let i = 0; i < items.length; i++) {
    const item = items[i];
    try {
      console.log(`[${i + 1}/${items.length}] 正在采集: ${item.title}`);
      const html = await fetchHtml(item.href);
      const detail = await parseDetailEnhanced(html, item.href);
      results.push({ ...item, detail });
      await sleep(REQUEST_DELAY_MS);
    } catch (err) {
      console.error(`采集失败: ${err.message}`);
      results.push({ ...item, detail: null, error: err.message });
    }
  }

  return results;
}

/**
 * Assemble the report object passed to the e-mail service.
 *
 * @param {Array<object>} results - All collected entries.
 * @param {Array<object>} filtered - Entries whose budget exceeds the threshold.
 * @param {number} threshold - Threshold in 万元.
 * @param {{startDate: string, endDate: string}} dateRange - Collected date range.
 * @returns {{summary: object, projects: Array<object>}}
 */
function buildReport(results, filtered, threshold, dateRange) {
  const total = filtered.reduce(
    (sum, item) => sum + (item.detail.budget?.amount || 0),
    0
  );

  return {
    summary: {
      total_count: results.length,
      filtered_count: filtered.length,
      threshold: `${threshold}万元`,
      total_amount: `${total.toFixed(2)}万元`,
      generated_at: new Date().toISOString(),
      date_range: dateRange,
    },
    projects: filtered.map((item) => ({
      title: item.title,
      date: item.date,
      publish_time: item.detail.publishTime,
      budget: item.detail.budget,
      url: item.href,
    })),
  };
}

/**
 * Run one collection cycle: list, details, threshold filter, e-mail report.
 *
 * Errors are logged and not rethrown, so the returned promise never rejects.
 *
 * @param {object} config - Parsed configuration; `config.scheduler` supplies
 *   `timeRange` and `threshold`, `config.email` is passed to the mailer.
 * @returns {Promise<void>}
 */
async function executeScheduledTask(config) {
  try {
    console.log('========================================');
    console.log('定时任务开始执行');
    console.log('执行时间:', new Date().toLocaleString('zh-CN'));
    console.log('========================================');

    // `scheduler` may be absent when the task is triggered manually.
    const schedulerConfig = config.scheduler ?? {};
    const timeRange = schedulerConfig.timeRange || 'thisMonth';
    const { startDate, endDate } = getDateRangeByType(timeRange);
    // `??` rather than `||` so that an explicit threshold of 0 is honoured.
    const threshold = schedulerConfig.threshold ?? DEFAULT_THRESHOLD;

    const timeRangeNames = {
      'today': '今日',
      'thisWeek': '本周',
      'thisMonth': '本月',
    };

    console.log(`采集时间段: ${timeRangeNames[timeRange] || '本月'}`);
    console.log(`采集时间范围: ${startDate} 至 ${endDate}`);
    console.log(`金额阈值: ${threshold}万元 (${threshold / 10000}亿元)`);

    // Collect the list
    const items = await fetchListByDateRange(startDate, endDate, 23);

    if (items.length === 0) {
      console.log('本月暂无公告数据');
      return;
    }

    // Collect the details
    console.log('========================================');
    console.log(`开始采集 ${items.length} 条公告的详情...`);
    const results = await fetchDetails(items);

    // Keep the projects whose budget exceeds the threshold
    const filtered = results.filter(
      (item) => item.detail?.budget?.amount > threshold
    );

    console.log('========================================');
    console.log(`筛选结果: 找到 ${filtered.length} 个大于 ${threshold}万元 的项目`);

    if (filtered.length === 0) {
      console.log('本月暂无符合条件的大额项目');
      return;
    }

    // Build the report (including the total amount)
    const report = buildReport(results, filtered, threshold, { startDate, endDate });

    // Send the e-mail
    console.log('========================================');
    console.log('正在发送邮件报告...');

    const emailConfig = config.email;
    const result = await sendReportEmail(emailConfig, report);

    console.log('邮件发送成功!');
    console.log('收件人:', emailConfig.recipients);
    console.log('MessageId:', result.messageId);
    console.log('========================================');
    console.log('定时任务执行完成');
    console.log('========================================');
  } catch (error) {
    console.error('========================================');
    console.error('定时任务执行失败:', error.message);
    console.error(error.stack);
    console.error('========================================');
  }
}

// The currently scheduled task, or null when none is scheduled
let currentScheduledTask = null;

/**
 * Stop and forget the current scheduled task, if there is one.
 *
 * @returns {boolean} `true` when a task was stopped.
 */
function stopCurrentTask() {
  if (!currentScheduledTask) return false;
  currentScheduledTask.stop();
  currentScheduledTask = null;
  return true;
}

/**
 * Initialise the scheduled task from `config.json`.
 *
 * Nothing is scheduled when the configuration cannot be loaded, the scheduler
 * is disabled, the e-mail settings are incomplete or the cron expression is
 * invalid. Once the configuration has been loaded, a previously scheduled
 * task is stopped so that it cannot keep running with outdated settings.
 */
export function initScheduler() {
  const config = loadConfig();

  if (!config) {
    console.error('无法启动定时任务: 配置文件加载失败');
    return;
  }

  // If a task is already scheduled, stop it first
  if (stopCurrentTask()) {
    console.log('已停止旧的定时任务');
  }

  if (!config.scheduler || !config.scheduler.enabled) {
    console.log('定时任务已禁用');
    return;
  }

  if (!config.email || !config.email.smtpHost || !config.email.smtpUser) {
    console.error('无法启动定时任务: 邮件配置不完整');
    console.error('请在 config.json 中配置邮件信息');
    return;
  }

  const cronTime = config.scheduler.cronTime || DEFAULT_CRON_TIME;

  if (!cron.validate(cronTime)) {
    console.error('无法启动定时任务: cron 表达式无效:', cronTime);
    return;
  }

  // Create the scheduled task
  currentScheduledTask = cron.schedule(
    cronTime,
    () => {
      // executeScheduledTask handles its own errors and never rejects.
      executeScheduledTask(config);
    },
    {
      timezone: SCHEDULER_TIMEZONE,
    }
  );

  console.log('========================================');
  console.log('定时任务已启动');
  console.log('执行计划:', cronTime);
  console.log('金额阈值:', config.scheduler.threshold ?? DEFAULT_THRESHOLD, '万元');
  console.log('收件人:', config.email.recipients);
  console.log('========================================');
}

/**
 * Reload the configuration and restart the scheduled task.
 */
export function reloadScheduler() {
  console.log('重新加载定时任务配置...');

  // Stop the current task
  if (stopCurrentTask()) {
    console.log('已停止当前定时任务');
  }

  // Initialise again
  initScheduler();
}

/**
 * Stop the scheduled task.
 *
 * @returns {boolean} `true` if a task was running and has been stopped.
 */
export function stopScheduler() {
  if (stopCurrentTask()) {
    console.log('定时任务已停止');
    return true;
  }
  return false;
}

/**
 * Report whether a task is scheduled, together with the effective settings.
 *
 * @returns {{isRunning: boolean, config: {enabled: boolean, cronTime: string, threshold: number}|null}}
 *   `config` is `null` when the configuration file cannot be loaded.
 */
export function getSchedulerStatus() {
  const config = loadConfig();
  return {
    isRunning: currentScheduledTask !== null,
    config: config
      ? {
          enabled: config.scheduler?.enabled || false,
          cronTime: config.scheduler?.cronTime || DEFAULT_CRON_TIME,
          threshold: config.scheduler?.threshold ?? DEFAULT_THRESHOLD,
        }
      : null,
  };
}

/**
 * Run the task immediately (intended for testing).
 *
 * @returns {Promise<void>}
 * @throws {Error} When the configuration file cannot be loaded.
 */
export async function runTaskNow() {
  const config = loadConfig();
  if (!config) {
    throw new Error('配置文件加载失败');
  }
  await executeScheduledTask(config);
}