feat(scheduler): 添加定时任务功能并集成前端配置界面

- 引入 node-cron 依赖以支持定时任务调度
- 新增定时任务相关 API 接口:获取配置、更新配置、查询状态、手动触发任务
- 前端新增“定时任务”标签页,支持 Cron 表达式配置与友好时间展示
- 支持通过 Web 界面启用/禁用定时任务、设置执行计划和金额阈值
- 定时任务可自动采集数据并发送邮件报告,无需重启服务即可生效新配置
- 优化配置保存逻辑,避免敏感信息泄露
632 lines
17 KiB
JavaScript
632 lines
17 KiB
JavaScript
import cron from 'node-cron';
|
||
import { readFileSync } from 'fs';
|
||
import { fileURLToPath } from 'url';
|
||
import { dirname, join } from 'path';
|
||
import axios from 'axios';
|
||
import * as cheerio from 'cheerio';
|
||
import iconv from 'iconv-lite';
|
||
import { sendReportEmail } from './emailService.js';
|
||
|
||
// ES modules do not provide the CommonJS `__filename` / `__dirname` globals,
// so derive them from this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
|
||
|
||
// 加载配置文件
|
||
/**
 * Read and parse `config.json` from the parent directory of this module.
 *
 * Any failure (missing/unreadable file, invalid JSON) is logged to stderr and
 * reported as `null` instead of being thrown, so callers must null-check.
 *
 * @returns {*} The parsed JSON value, or `null` when loading failed.
 */
function loadConfig() {
  try {
    const configPath = join(__dirname, '..', 'config.json');
    return JSON.parse(readFileSync(configPath, 'utf-8'));
  } catch (error) {
    console.error('加载配置文件失败:', error.message);
    console.error('请确保 config.json 文件存在并配置正确');
    return null;
  }
}
|
||
|
||
// 根据时间范围类型获取开始和结束日期
|
||
/**
 * Compute the inclusive `YYYY-MM-DD` date window for a named time range.
 *
 * The window always ends on the reference day. Dates are derived from the
 * local-time fields of `now` (getFullYear/getMonth/getDate/getDay).
 *
 * @param {string} timeRange - `'today'`, `'thisWeek'` (week starts on Monday)
 *   or `'thisMonth'`; any other value is treated as `'thisMonth'`.
 * @param {Date} [now=new Date()] - Reference day; defaults to the current
 *   time. It is never mutated.
 * @returns {{startDate: string, endDate: string}} Start and end of the window.
 */
function getDateRangeByType(timeRange, now = new Date()) {
  const pad2 = (value) => String(value).padStart(2, '0');
  const formatDate = (date) =>
    `${date.getFullYear()}-${pad2(date.getMonth() + 1)}-${pad2(date.getDate())}`;

  const endDate = formatDate(now);
  let startDate;

  switch (timeRange) {
    case 'today':
      startDate = endDate;
      break;

    case 'thisWeek': {
      // getDay(): 0 = Sunday … 6 = Saturday. Shift so Monday = 0 … Sunday = 6.
      const daysSinceMonday = (now.getDay() + 6) % 7;
      const monday = new Date(now);
      // setDate() rolls over month/year boundaries for non-positive values.
      monday.setDate(now.getDate() - daysSinceMonday);
      startDate = formatDate(monday);
      break;
    }

    case 'thisMonth':
    default:
      startDate = `${now.getFullYear()}-${pad2(now.getMonth() + 1)}-01`;
      break;
  }

  return { startDate, endDate };
}
|
||
|
||
// 获取本月的开始和结束日期 (兼容旧代码)
|
||
/**
 * Date window from the first day of the current month through today.
 * Thin wrapper over `getDateRangeByType('thisMonth')`, kept for older callers.
 *
 * @returns {{startDate: string, endDate: string}} `YYYY-MM-DD` start and end.
 */
function getCurrentMonthDateRange() {
  return getDateRangeByType('thisMonth');
}
|
||
|
||
// 从server.js复制的辅助函数
|
||
// Root URL of the announcement list; page 0 of the listing lives here.
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';

// Shared axios instance. Responses are requested as raw bytes
// (`arraybuffer`) so callers can decode them with the charset the server
// declares rather than assuming UTF-8.
const http = axios.create({
  responseType: 'arraybuffer',
  timeout: 10000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (compatible; gjzx-scraper/1.0)',
  },
});
|
||
|
||
/**
 * Choose a decoder name from a `Content-Type` header value.
 *
 * @param {string} [contentType=''] - Raw header value, e.g.
 *   `text/html; charset=GB2312`. The charset may be wrapped in single or
 *   double quotes.
 * @returns {string} `'utf-8'` when no charset is declared, `'gbk'` for any
 *   charset whose name contains `gb`, otherwise the declared charset
 *   lower-cased.
 */
function pickEncoding(contentType = '') {
  // Tolerate optional whitespace around `=` and optional quotes around the
  // value; without this a quoted charset would be returned with its quotes.
  const match = /charset\s*=\s*["']?([^;"'\s]+)/i.exec(contentType);
  if (!match) return 'utf-8';

  const charset = match[1].toLowerCase();
  return charset.includes('gb') ? 'gbk' : charset;
}
|
||
|
||
/**
 * Download a page and decode it to a string using the charset declared in
 * the response's `Content-Type` header.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} The decoded response body.
 * @throws Rejects with whatever error the HTTP request raises.
 */
async function fetchHtml(url) {
  const res = await http.get(url);
  const declared = pickEncoding(res.headers['content-type']);
  // A server may declare a charset iconv-lite does not know; decoding with it
  // would throw, so fall back to UTF-8 in that case.
  const encoding = iconv.encodingExists(declared) ? declared : 'utf-8';
  return iconv.decode(res.data, encoding);
}
|
||
|
||
/**
 * Build the URL of one page of the paginated list.
 *
 * @param {number} pageIndex - Zero-based page number. Page 0 is the base URL
 *   itself; page N (N ≠ 0) is `<base>/index_N.html`.
 * @param {string} [baseUrl=BASE_URL] - List root; one trailing slash is
 *   ignored when building the `index_N.html` URL.
 * @returns {string} The URL of the requested page.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  if (pageIndex === 0) {
    return baseUrl;
  }
  const trimmedBase = baseUrl.replace(/\/$/, '');
  return `${trimmedBase}/index_${pageIndex}.html`;
}
|
||
|
||
/**
 * Extract announcement entries from a list page.
 *
 * A table row is kept only when its first cell holds a link, its second cell
 * holds a `YYYY-MM-DD` date, the title is at least 5 characters long, and the
 * link is not a bare `./` or `../` navigation link.
 *
 * @param {string} html - Markup of a list page.
 * @returns {Array<{title: string, href: string, date: string}>} Entries in
 *   document order; `href` is resolved against `BASE_URL`.
 */
function parseList(html) {
  const $ = cheerio.load(html);
  const isoDate = /^\d{4}-\d{2}-\d{2}$/;
  const items = [];

  $('table tr').each((_, row) => {
    const $row = $(row);
    const link = $row.find('td:first-child a').first();
    const dateCell = $row.find('td:nth-child(2)');
    if (!link.length || !dateCell.length) return;

    // Prefer the `title` attribute over the link text when it is present.
    const title = link.attr('title') || link.text().trim();
    const rawHref = link.attr('href') || '';
    const dateText = dateCell.text().trim();

    if (!rawHref || !title || title.length < 5) return;
    if (rawHref === './' || rawHref === '../') return;
    if (!isoDate.test(dateText)) return;

    try {
      const href = new URL(rawHref, BASE_URL).toString();
      items.push({ title, href, date: dateText });
    } catch {
      // An href that cannot be resolved to a URL is skipped.
    }
  });

  return items;
}
|
||
|
||
/**
 * Test whether a date falls inside an inclusive, optionally open-ended range.
 *
 * All three arguments are parsed with `new Date(...)`.
 *
 * @param {string} dateStr - Date to test; empty or unparsable values yield
 *   `false`.
 * @param {string} [startDate] - Lower bound; a falsy value means no lower
 *   bound.
 * @param {string} [endDate] - Upper bound; a falsy value means no upper bound.
 * @returns {boolean} `true` when `dateStr` is a valid date within the bounds.
 */
function isDateInRange(dateStr, startDate, endDate) {
  if (!dateStr) return false;

  const time = new Date(dateStr).getTime();
  if (Number.isNaN(time)) return false;

  // An unparsable bound compares as NaN, which never excludes the date.
  if (startDate && time < new Date(startDate).getTime()) return false;
  if (endDate && time > new Date(endDate).getTime()) return false;
  return true;
}
|
||
|
||
/**
 * Walk the list pages from page 0 onward and collect every entry whose date
 * lies within `[startDate, endDate]`.
 *
 * Paging stops at the first of: `maxPages` pages fetched, a page with no
 * entries, a page whose entries are all older than `startDate`, or a page
 * that fails to download or parse. A failure is logged and the entries
 * gathered so far are returned; it is not rethrown.
 *
 * @param {string} [startDate] - Inclusive lower bound; falsy = unbounded.
 * @param {string} [endDate] - Inclusive upper bound; falsy = unbounded.
 * @param {number} [maxPages=23] - Maximum number of list pages to fetch.
 * @returns {Promise<Array<{title: string, href: string, date: string}>>}
 *   Matching entries in the order they were encountered.
 */
async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
  const allItems = [];
  let pageIndex = 0;

  console.log(`开始按时间范围采集: ${startDate || '不限'} 至 ${endDate || '不限'}`);

  while (pageIndex < maxPages) {
    const pageUrl = getPageUrl(pageIndex);
    console.log(`正在采集第 ${pageIndex + 1} 页: ${pageUrl}`);

    try {
      const html = await fetchHtml(pageUrl);
      const items = parseList(html);

      if (items.length === 0) {
        console.log(`第 ${pageIndex + 1} 页没有数据,停止采集`);
        break;
      }

      const matching = items.filter((item) => isDateInRange(item.date, startDate, endDate));
      allItems.push(...matching);

      // Once a whole page predates the window there is no point paging on.
      // NOTE(review): this presumes the list is ordered newest first — confirm.
      const pageIsBeforeRange =
        Boolean(startDate) &&
        items.every((item) => new Date(item.date) < new Date(startDate));

      if (pageIsBeforeRange) {
        console.log(`第 ${pageIndex + 1} 页所有项目都早于起始日期,停止采集`);
      }

      console.log(`第 ${pageIndex + 1} 页找到 ${items.length} 条,符合条件 ${matching.length > 0 ? '有' : '无'}`);

      pageIndex++;

      if (pageIsBeforeRange) break;

      // Pause between page requests (skipped after the final page).
      if (pageIndex < maxPages) {
        await new Promise((resolve) => setTimeout(resolve, 500));
      }
    } catch (err) {
      console.error(`采集第 ${pageIndex + 1} 页失败: ${err.message}`);
      break;
    }
  }

  console.log(`总共采集了 ${pageIndex} 页,找到 ${allItems.length} 条符合条件的公告`);
  return allItems;
}
|
||
|
||
// 从server.js导入parseDetail相关函数
|
||
/**
 * Extract title, publish time, body text and budget from a detail page.
 *
 * @param {string} html - Markup of an announcement detail page.
 * @returns {{title: string, publishTime: string, content: string, budget: (object|null)}}
 *   `title` and `publishTime` are empty strings when nothing matched;
 *   `budget` is the result of `extractBudget(content)`.
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  // Title: try the known layouts in turn, falling back to the first <h1>.
  const title =
    $('.title18').text().trim() ||
    $('.article-info h1').text().trim() ||
    $('h1').first().text().trim();

  // Publish time: first look in the table cell that mentions both
  // "发布部门" and "发布时间" for a timestamp with seconds.
  const publishText = $('td:contains("发布部门")')
    .filter((_, el) => $(el).text().includes('发布时间'))
    .text()
    .trim();
  let publishTime = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/)?.[1] ?? '';

  if (!publishTime) {
    // Fallback: a minute-precision timestamp in `.info-sources`, or anywhere
    // in the page when that element has no text.
    const infoText = $('.info-sources').text() || $('body').text();
    publishTime = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/)?.[1] ?? '';
  }

  // Content: the longest trimmed text among the candidate containers.
  // NOTE(review): `body` encloses every other candidate, so it looks like it
  // will win this comparison almost always — confirm that is intended.
  const contentSelectors = [
    '.zhenwen td',
    '.con',
    '.article-content',
    '.ewb-article-content',
    'body',
  ];

  let content = '';
  for (const selector of contentSelectors) {
    const candidate = $(selector).first();
    if (candidate.length === 0) continue;

    const text = candidate.text().trim();
    if (text.length > content.length) {
      content = text;
    }
  }

  return {
    title,
    publishTime,
    content,
    budget: extractBudget(content),
  };
}
|
||
|
||
/**
 * Find a monetary amount in announcement text and normalise it to 万元
 * (units of 10,000 yuan).
 *
 * Patterns are tried from most to least specific; the first one whose first
 * match yields an amount within [0.01, 100000000] 万元 wins. Half-width and
 * full-width forms of the yen sign, comma and parentheses are all accepted.
 *
 * @param {string} content - Text to search. `null`/`undefined` are treated as
 *   empty text.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   `amount` in 万元, `unit` always `'万元'`, `text` the matched substring,
 *   `originalUnit` the unit found in the text (`'元'` or `'万元'`); `null`
 *   when no pattern produced a valid amount.
 */
function extractBudget(content) {
  // Re-join numbers that were split across lines (common in extracted PDF text).
  const cleanedContent = String(content ?? '').replace(/(\d)\s*[\n\r]\s*(?=\d)/g, '$1');

  // \u00A5 / \uFFE5: half-/full-width yen sign; \uFF0C: full-width comma;
  // \uFF08 / \uFF09: full-width parentheses. Written as escapes so the
  // variants survive text normalisation of this source file.
  // `divider` converts an amount expressed in 元 to 万元.
  const patterns = [
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*万元/ },
    { regex: /[(\uFF08][\u00A5\uFFE5]([\d,\uFF0C]+(?:\.\d+)?)[)\uFF09]/, divider: 10000 },
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*万元/ },
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*元/, divider: 10000 },
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*元(?!整)/, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    const match = cleanedContent.match(regex);
    if (!match) continue;

    const parsed = Number.parseFloat(match[1].replace(/[,\uFF0C]/g, ''));
    const amount = divider ? parsed / divider : parsed;

    // Reject unparsable numbers and implausible magnitudes, then fall
    // through to the next, less specific pattern.
    if (Number.isNaN(amount) || amount < 0.01 || amount > 100000000) continue;

    return {
      amount,
      unit: '万元',
      text: match[0],
      originalUnit: divider ? '元' : '万元',
    };
  }

  return null;
}
|
||
|
||
// 从API获取PDF URL
|
||
/**
 * Ask the jszbtb.com bulletin API for the signed PDF URL of an announcement.
 *
 * The bulletin id is taken from the `bulletinDetails/<segment>/<hex-id>` part
 * of `pageUrl`, and the bulletin type from its `bulletinType=<digits>` query
 * parameter (defaulting to `'1'`).
 *
 * Best-effort: any failure (no id in the URL, network error, invalid JSON,
 * unsuccessful response) yields `null`; nothing is thrown.
 *
 * @param {string} pageUrl - URL of the announcement detail page.
 * @returns {Promise<string|null>} The signed PDF URL, or `null`.
 */
async function fetchPdfUrlFromApi(pageUrl) {
  try {
    const bulletinIdMatch = pageUrl.match(/bulletinDetails\/[^\/]+\/([a-f0-9]+)/i);
    if (!bulletinIdMatch) {
      return null;
    }

    const bulletinId = bulletinIdMatch[1];
    const bulletinType = pageUrl.match(/bulletinType=(\d+)/)?.[1] ?? '1';
    const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;

    const response = await http.get(apiUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
        'Referer': 'https://www.jszbcg.com/',
      },
      responseType: 'arraybuffer',
    });

    // The body arrives as raw bytes, so decode it before parsing the JSON.
    const data = JSON.parse(iconv.decode(response.data, 'utf-8'));

    if (data?.success && data.data?.signedPdfUrl) {
      return data.data.signedPdfUrl;
    }
    return null;
  } catch (err) {
    // Still best-effort, but leave a trace so API failures are diagnosable.
    console.warn(`从API获取PDF地址失败: ${err.message}`);
    return null;
  }
}
|
||
|
||
/**
 * Find the PDF referenced by an embedded viewer `<iframe>`.
 *
 * Iframes are scanned in document order; the first one whose `src` carries a
 * usable `file=` query parameter wins. The parameter is URL-decoded and, if
 * it is not already an absolute http(s) URL, resolved against `pageUrl`.
 *
 * @param {string} html - Markup of the detail page.
 * @param {string} pageUrl - URL of that page, used to resolve relative paths.
 * @returns {string|null} Absolute PDF URL, or `null` when no iframe yields one.
 */
function extractPdfUrl(html, pageUrl) {
  const $ = cheerio.load(html);

  for (const frame of $('iframe').toArray()) {
    const src = $(frame).attr('src');
    if (!src) continue;

    const match = /[?&]file=([^&]+)/.exec(src);
    if (!match) continue;

    try {
      // decodeURIComponent throws on malformed escapes and `new URL` on
      // unresolvable input; either way this iframe is skipped.
      const pdfUrl = decodeURIComponent(match[1]);
      const isAbsolute = pdfUrl.startsWith('http://') || pdfUrl.startsWith('https://');
      return isAbsolute ? pdfUrl : new URL(pdfUrl, pageUrl).toString();
    } catch {
      continue;
    }
  }

  return null;
}
|
||
|
||
/**
 * Download a PDF and return its extracted text.
 *
 * `pdf-parse` is imported lazily, on first use of this function.
 *
 * @param {string} pdfUrl - Absolute URL of the PDF.
 * @returns {Promise<string>} The `text` field of the parser's result.
 * @throws Rejects when the import, the download or the text extraction fails.
 */
async function fetchPdfContent(pdfUrl) {
  const { PDFParse } = await import('pdf-parse');

  const response = await http.get(pdfUrl, {
    responseType: 'arraybuffer',
    timeout: 30000, // longer than the shared client's timeout
  });

  const parser = new PDFParse({ data: response.data });
  try {
    const result = await parser.getText();
    return result.text;
  } finally {
    // Destroy the parser even when extraction throws.
    await parser.destroy();
  }
}
|
||
|
||
/**
 * Parse a detail page, preferring the text of an attached PDF when one can be
 * located and read.
 *
 * PDF lookup order: the bulletin API (only for URLs containing `jszbcg.com`),
 * then an embedded viewer iframe. If no PDF is found, or downloading/parsing
 * it fails, the HTML-derived content and budget are used instead.
 *
 * @param {string} html - Markup of the detail page.
 * @param {string} pageUrl - URL of that page.
 * @returns {Promise<object>} The fields of `parseDetail(html)`, with `content`
 *   and `budget` taken from the PDF when it was parsed, plus `hasPdf`
 *   (boolean) and `pdfUrl` (the PDF URL when parsed, otherwise `null`).
 */
async function parseDetailEnhanced(html, pageUrl) {
  // Parsed once and reused both for the base fields and as the fallback.
  const basicInfo = parseDetail(html);

  let pdfUrl = null;
  if (pageUrl.includes('jszbcg.com')) {
    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
  }
  if (!pdfUrl) {
    pdfUrl = extractPdfUrl(html, pageUrl);
  }

  let content = basicInfo.content;
  let pdfParsed = false;

  if (pdfUrl) {
    try {
      content = await fetchPdfContent(pdfUrl);
      pdfParsed = true;
    } catch (err) {
      // Keep the HTML content, but record why the PDF path was abandoned.
      console.warn(`PDF解析失败,改用网页内容: ${err.message}`);
    }
  }

  return {
    ...basicInfo,
    content,
    budget: pdfParsed ? extractBudget(content) : basicInfo.budget,
    hasPdf: pdfParsed,
    pdfUrl: pdfParsed ? pdfUrl : null,
  };
}
|
||
|
||
// 定时任务执行函数
|
||
/**
 * Run one full collection cycle: scrape the announcement list for the
 * configured time range, fetch each announcement's details, keep those whose
 * budget exceeds the threshold, and email a report.
 *
 * Reads `config.scheduler.timeRange` (default `'thisMonth'`),
 * `config.scheduler.threshold` (万元, default 100000) and `config.email`.
 * No email is sent when there are no announcements or none above threshold.
 *
 * Errors never propagate: any failure is logged to stderr and the returned
 * promise still resolves.
 *
 * @param {object} config - Parsed configuration as returned by `loadConfig()`.
 * @returns {Promise<void>}
 */
async function executeScheduledTask(config) {
  const DEFAULT_THRESHOLD = 100000; // 万元, i.e. 1 billion yuan
  const timeRangeNames = {
    today: '今日',
    thisWeek: '本周',
    thisMonth: '本月',
  };

  // Unset/blank/invalid thresholds fall back to the default, but a configured
  // 0 is honoured (a plain `||` would silently turn 0 into the default).
  const normalizeThreshold = (raw) => {
    if (raw === null || raw === undefined || raw === '') return DEFAULT_THRESHOLD;
    const value = Number(raw);
    return Number.isFinite(value) && value >= 0 ? value : DEFAULT_THRESHOLD;
  };

  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  try {
    console.log('========================================');
    console.log('定时任务开始执行');
    console.log('执行时间:', new Date().toLocaleString('zh-CN'));
    console.log('========================================');

    // `scheduler` may be absent when the task is triggered manually.
    const schedulerConfig = config.scheduler ?? {};
    const timeRange = schedulerConfig.timeRange || 'thisMonth';
    const { startDate, endDate } = getDateRangeByType(timeRange);
    const threshold = normalizeThreshold(schedulerConfig.threshold);
    const rangeLabel = timeRangeNames[timeRange] || '本月';

    console.log(`采集时间段: ${rangeLabel}`);
    console.log(`采集时间范围: ${startDate} 至 ${endDate}`);
    console.log(`金额阈值: ${threshold}万元 (${threshold / 10000}亿元)`);

    // Step 1: list pages.
    const items = await fetchListByDateRange(startDate, endDate, 23);

    if (items.length === 0) {
      console.log(`${rangeLabel}暂无公告数据`);
      return;
    }

    // Step 2: detail pages, one at a time with a pause between requests.
    console.log('========================================');
    console.log(`开始采集 ${items.length} 条公告的详情...`);
    const results = [];
    for (const [index, item] of items.entries()) {
      try {
        console.log(`[${index + 1}/${items.length}] 正在采集: ${item.title}`);
        const html = await fetchHtml(item.href);
        const detail = await parseDetailEnhanced(html, item.href);
        results.push({ ...item, detail });
        await pause(500);
      } catch (err) {
        // One failed announcement must not abort the run; record and move on.
        console.error(`采集失败: ${err.message}`);
        results.push({ ...item, detail: null, error: err.message });
      }
    }

    // Step 3: keep only announcements whose budget exceeds the threshold.
    const filtered = results.filter(
      (item) => item.detail?.budget && item.detail.budget.amount > threshold
    );

    console.log('========================================');
    console.log(`筛选结果: 找到 ${filtered.length} 个大于 ${threshold}万元 的项目`);

    if (filtered.length === 0) {
      console.log(`${rangeLabel}暂无符合条件的大额项目`);
      return;
    }

    const total = filtered.reduce((sum, item) => sum + item.detail.budget.amount, 0);

    // Step 4: build the report.
    const report = {
      summary: {
        total_count: results.length,
        filtered_count: filtered.length,
        threshold: `${threshold}万元`,
        total_amount: `${total.toFixed(2)}万元`,
        generated_at: new Date().toISOString(),
        date_range: { startDate, endDate },
      },
      projects: filtered.map((item) => ({
        title: item.title,
        date: item.date,
        publish_time: item.detail.publishTime,
        budget: item.detail.budget,
        url: item.href,
      })),
    };

    // Step 5: send it.
    console.log('========================================');
    console.log('正在发送邮件报告...');
    const emailConfig = config.email;

    const result = await sendReportEmail(emailConfig, report);

    console.log('邮件发送成功!');
    console.log('收件人:', emailConfig.recipients);
    console.log('MessageId:', result.messageId);
    console.log('========================================');
    console.log('定时任务执行完成');
    console.log('========================================');
  } catch (error) {
    console.error('========================================');
    console.error('定时任务执行失败:', error.message);
    console.error(error.stack);
    console.error('========================================');
  }
}
|
||
|
||
// 存储当前的定时任务
|
||
// Handle of the cron task created by initScheduler(); reset to null by
// stopScheduler() and reloadScheduler() when they stop the task.
let currentScheduledTask = null;
|
||
|
||
// 初始化定时任务
|
||
/**
 * Load the configuration and (re)start the cron task accordingly.
 *
 * Nothing is scheduled when the configuration cannot be loaded, the scheduler
 * is disabled, or the email settings lack `smtpHost`/`smtpUser`; the reason
 * is logged. When the scheduler is disabled, an already running task is
 * stopped as well. Otherwise any previous task is stopped and a new one is
 * scheduled with `config.scheduler.cronTime` (default `'0 9 * * *'`) in the
 * `Asia/Shanghai` timezone.
 *
 * The configuration is captured when scheduling; later edits to config.json
 * take effect only after this function (or `reloadScheduler`) runs again.
 *
 * @returns {void}
 * @throws Whatever `cron.schedule` throws for the configured expression.
 */
export function initScheduler() {
  // Stop and forget the current task, if any.
  const stopCurrentTask = () => {
    if (!currentScheduledTask) return;
    currentScheduledTask.stop();
    currentScheduledTask = null;
    console.log('已停止旧的定时任务');
  };

  const config = loadConfig();

  if (!config) {
    console.error('无法启动定时任务: 配置文件加载失败');
    return;
  }

  if (!config.scheduler?.enabled) {
    // A disabled scheduler must not leave an earlier task firing.
    stopCurrentTask();
    console.log('定时任务已禁用');
    return;
  }

  if (!config.email?.smtpHost || !config.email.smtpUser) {
    console.error('无法启动定时任务: 邮件配置不完整');
    console.error('请在 config.json 中配置邮件信息');
    return;
  }

  const cronTime = config.scheduler.cronTime || '0 9 * * *';

  // Clear the handle before scheduling so that, if scheduling throws, it does
  // not keep pointing at an already-stopped task.
  stopCurrentTask();

  currentScheduledTask = cron.schedule(
    cronTime,
    () => {
      // executeScheduledTask handles its own errors, so it is not awaited.
      void executeScheduledTask(config);
    },
    { timezone: 'Asia/Shanghai' }
  );

  // Announce the start only once the task really exists.
  console.log('========================================');
  console.log('定时任务已启动');
  console.log('执行计划:', cronTime);
  console.log('金额阈值:', config.scheduler.threshold, '万元');
  console.log('收件人:', config.email.recipients);
  console.log('========================================');
}
|
||
|
||
// 重新加载配置并重启定时任务
|
||
/**
 * Stop the current cron task (if any) and start again from a freshly loaded
 * configuration, so configuration changes apply without restarting the
 * process.
 *
 * @returns {void}
 */
export function reloadScheduler() {
  console.log('重新加载定时任务配置...');

  if (currentScheduledTask) {
    currentScheduledTask.stop();
    currentScheduledTask = null;
    console.log('已停止当前定时任务');
  }

  // initScheduler re-reads config.json and decides whether to schedule.
  initScheduler();
}
|
||
|
||
// 停止定时任务
|
||
/**
 * Stop the running cron task, if there is one.
 *
 * @returns {boolean} `true` when a task was stopped, `false` when none was
 *   running.
 */
export function stopScheduler() {
  if (!currentScheduledTask) {
    return false;
  }

  currentScheduledTask.stop();
  currentScheduledTask = null;
  console.log('定时任务已停止');
  return true;
}
|
||
|
||
// 获取定时任务状态
|
||
/**
 * Report whether a cron task is currently registered, together with the
 * scheduler settings read fresh from config.json.
 *
 * @returns {{isRunning: boolean, config: ({enabled: *, cronTime: string, threshold: *}|null)}}
 *   `config` is `null` when the configuration file could not be loaded.
 *   Missing settings are reported with their defaults (`false`,
 *   `'0 9 * * *'`, `100000`).
 */
export function getSchedulerStatus() {
  const config = loadConfig();
  const scheduler = config?.scheduler;

  return {
    isRunning: currentScheduledTask !== null,
    config: config
      ? {
          enabled: scheduler?.enabled || false,
          cronTime: scheduler?.cronTime || '0 9 * * *',
          // NOTE(review): `||` reports a configured threshold of 0 as the
          // default 100000 — confirm whether 0 should be shown as-is.
          threshold: scheduler?.threshold || 100000,
        }
      : null,
  };
}
|
||
|
||
// 手动执行任务(用于测试)
|
||
/**
 * Run the collection-and-report task immediately with a freshly loaded
 * configuration (manual trigger, e.g. for testing).
 *
 * @returns {Promise<void>} Resolves once the run has finished.
 *   `executeScheduledTask` logs its own failures rather than throwing them.
 * @throws {Error} When the configuration file cannot be loaded.
 */
export async function runTaskNow() {
  const config = loadConfig();
  if (!config) {
    throw new Error('配置文件加载失败');
  }
  await executeScheduledTask(config);
}
|