feat: 使用 Firecrawl 实现公告抓取与分析工具的网页界面,包括报告生成、导出和邮件发送功能。
This commit is contained in:
@@ -730,3 +730,166 @@ function generateReportHtml(report) {
|
||||
</html>
|
||||
`;
|
||||
}
|
||||
|
||||
// ========== 通用抓取结果邮件(定时任务使用) ==========
|
||||
|
||||
/**
 * Email the combined results of a scraping run (used by the scheduled task).
 *
 * @param {object} emailConfig - SMTP settings read here: smtpHost, smtpPort,
 *   smtpUser, smtpPass and recipients.
 * @param {Array<object>} results - Per-source scrape records; a record with a
 *   truthy `error` is counted as failed.
 * @returns {Promise<{success: boolean, messageId: string}>} Resolves once the mail is handed to the transport.
 * @throws {Error} Wraps any failure; the original error is attached as `cause`.
 */
export async function sendScraperResultsEmail(emailConfig, results) {
  try {
    // The port may arrive as a string (JSON edited by hand, env vars); coerce
    // it so that "465" still switches the transport to implicit TLS.
    const port = Number(emailConfig.smtpPort) || 587;

    const transporter = nodemailer.createTransport({
      host: emailConfig.smtpHost,
      port,
      secure: port === 465,
      auth: {
        user: emailConfig.smtpUser,
        pass: emailConfig.smtpPass,
      },
    });

    const htmlContent = generateScraperResultsHtml(results);
    // Number of sources that were scraped without an error.
    const successCount = results.filter((r) => !r.error).length;

    const info = await transporter.sendMail({
      from: `"公告采集系统" <${emailConfig.smtpUser}>`,
      to: emailConfig.recipients,
      subject: `公告采集结果报告(${successCount}条) - ${new Date().toLocaleDateString('zh-CN')}`,
      html: htmlContent,
    });

    return { success: true, messageId: info.messageId };
  } catch (error) {
    console.error('发送抓取结果邮件失败:', error);
    // Keep the original error reachable for callers that want the full stack.
    throw new Error(`邮件发送失败: ${error.message}`, { cause: error });
  }
}
|
||||
|
||||
/**
 * Build the HTML body of the scraper-results email.
 *
 * Successful records contribute one table row per item in `data.result`;
 * failed records (truthy `error`) are listed in a separate block below the
 * table. Rows are ordered by their date string, newest first.
 *
 * All scraped text is HTML-escaped before interpolation, and a detail link is
 * only rendered for http(s) URLs, because the content originates from
 * third-party pages.
 *
 * @param {Array<object>} results - Scrape records (non-array input is treated as empty).
 * @returns {string} A complete HTML document.
 */
function generateScraperResultsHtml(results) {
  const escapeHtml = (value) =>
    String(value ?? '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  const isHttpUrl = (value) => /^https?:\/\//i.test(String(value ?? ''));

  const records = Array.isArray(results) ? results.filter(Boolean) : [];
  const successResults = records.filter((r) => !r.error);
  const failResults = records.filter((r) => r.error);
  const generatedAt = new Date().toLocaleString('zh-CN');

  // Flatten the items of every successful source, tagging each with its origin.
  const allRows = successResults.flatMap((r) => {
    const items = Array.isArray(r.data?.result) ? r.data.result : [];
    const section = [r.section, r.subsection].filter(Boolean).join(' · ') || r.city || '-';
    return items.map((item) => ({
      section,
      type: r.type || '-',
      title: item?.title || '-',
      date: item?.date || '-',
      amount: item?.amount || '未公开',
      url: item?.url || '',
    }));
  });

  // Newest first (plain string comparison of the date field).
  allRows.sort((a, b) => {
    if (a.date === b.date) return 0;
    return a.date > b.date ? -1 : 1;
  });

  const totalItems = allRows.length;
  const withAmountCount = allRows.filter((r) => r.amount && r.amount !== '未公开').length;
  const cellStyle = 'padding:9px 12px;border-bottom:1px solid #eaecf5;';

  // Table body with alternating row colours.
  const rowHtml = allRows.length === 0
    ? `<tr><td colspan="6" style="text-align:center;color:#999;padding:30px;font-size:14px;">暂无数据</td></tr>`
    : allRows.map((row, i) => `
      <tr style="background:${i % 2 === 0 ? '#fff' : '#f7f8ff'};">
        <td style="${cellStyle}white-space:nowrap;color:#555;font-size:13px;">${escapeHtml(row.section)}</td>
        <td style="${cellStyle}white-space:nowrap;">
          <span style="display:inline-block;padding:2px 8px;background:#e8f4fd;color:#1a73c8;border-radius:10px;font-size:11px;font-weight:600;">${escapeHtml(row.type)}</span>
        </td>
        <td style="${cellStyle}font-size:13px;max-width:320px;">${escapeHtml(row.title)}</td>
        <td style="${cellStyle}white-space:nowrap;font-size:13px;color:#555;">${escapeHtml(row.date)}</td>
        <td style="${cellStyle}white-space:nowrap;font-size:13px;font-weight:600;color:${row.amount === '未公开' ? '#aaa' : '#e67e22'};">${escapeHtml(row.amount)}</td>
        <td style="${cellStyle}text-align:center;">
          ${isHttpUrl(row.url)
            ? `<a href="${escapeHtml(row.url)}" target="_blank" style="color:#667eea;font-size:12px;text-decoration:none;white-space:nowrap;">查看 →</a>`
            : '<span style="color:#ccc;font-size:12px;">-</span>'
          }
        </td>
      </tr>`).join('');

  // Sources that failed to scrape.
  const failHtml = failResults.length === 0 ? '' : `
    <div style="margin-top:24px;">
      <div style="font-size:14px;font-weight:600;color:#c0392b;margin-bottom:10px;">⚠️ 抓取失败的来源(${failResults.length} 个)</div>
      ${failResults.map((r) => `
      <div style="background:#fdeaea;border-left:3px solid #e74c3c;padding:10px 14px;border-radius:4px;margin-bottom:8px;font-size:13px;">
        <strong>${escapeHtml([r.city, r.section, r.type].filter(Boolean).join(' · '))}</strong>
        <div style="color:#999;font-size:12px;margin-top:4px;">${escapeHtml(r.url)}</div>
        <div style="color:#c0392b;margin-top:4px;">❌ ${escapeHtml(r.error)}</div>
      </div>`).join('')}
    </div>`;

  return `
<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>公告采集结果报告</title>
</head>
<body style="font-family:'PingFang SC','Microsoft YaHei',Arial,sans-serif;line-height:1.6;color:#333;margin:0;padding:20px;background:#f0f2f8;">
  <div style="max-width:960px;margin:0 auto;background:white;border-radius:10px;overflow:hidden;box-shadow:0 4px 20px rgba(0,0,0,.1);">

    <!-- 标题栏 -->
    <div style="background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);padding:24px 30px;color:white;">
      <h1 style="margin:0;font-size:20px;font-weight:700;">📋 公告采集结果报告</h1>
      <div style="margin-top:6px;opacity:.85;font-size:13px;">生成时间:${generatedAt}</div>
    </div>

    <!-- 统计栏 -->
    <div style="display:flex;gap:0;border-bottom:1px solid #eaecf5;">
      <div style="flex:1;padding:16px 24px;text-align:center;border-right:1px solid #eaecf5;">
        <div style="font-size:28px;font-weight:700;color:#667eea;">${totalItems}</div>
        <div style="font-size:12px;color:#888;margin-top:2px;">公告总数</div>
      </div>
      <div style="flex:1;padding:16px 24px;text-align:center;border-right:1px solid #eaecf5;">
        <div style="font-size:28px;font-weight:700;color:#1a8a4a;">${successResults.length}</div>
        <div style="font-size:12px;color:#888;margin-top:2px;">成功来源</div>
      </div>
      <div style="flex:1;padding:16px 24px;text-align:center;border-right:1px solid #eaecf5;">
        <div style="font-size:28px;font-weight:700;color:#e67e22;">${withAmountCount}</div>
        <div style="font-size:12px;color:#888;margin-top:2px;">有金额</div>
      </div>
      <div style="flex:1;padding:16px 24px;text-align:center;">
        <div style="font-size:28px;font-weight:700;color:${failResults.length > 0 ? '#c0392b' : '#aaa'};">${failResults.length}</div>
        <div style="font-size:12px;color:#888;margin-top:2px;">失败来源</div>
      </div>
    </div>

    <!-- 公告汇总表格 -->
    <div style="padding:24px 30px;">
      <div style="font-size:15px;font-weight:600;color:#333;margin-bottom:14px;">公告汇总(共 ${totalItems} 条)</div>
      <div style="overflow-x:auto;">
        <table style="width:100%;border-collapse:collapse;font-size:13px;">
          <thead>
            <tr style="background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);color:white;">
              <th style="padding:10px 12px;text-align:left;font-weight:600;white-space:nowrap;">板块</th>
              <th style="padding:10px 12px;text-align:left;font-weight:600;white-space:nowrap;">类型</th>
              <th style="padding:10px 12px;text-align:left;font-weight:600;">公告标题</th>
              <th style="padding:10px 12px;text-align:left;font-weight:600;white-space:nowrap;">发布日期</th>
              <th style="padding:10px 12px;text-align:left;font-weight:600;white-space:nowrap;">项目金额</th>
              <th style="padding:10px 12px;text-align:center;font-weight:600;white-space:nowrap;">详情</th>
            </tr>
          </thead>
          <tbody>
            ${rowHtml}
          </tbody>
        </table>
      </div>

      ${failHtml}

      <div style="margin-top:24px;padding-top:16px;border-top:1px solid #eaecf5;color:#aaa;font-size:12px;text-align:center;">
        本报告由公告采集系统自动生成 · ${generatedAt}
      </div>
    </div>

  </div>
</body>
</html>
  `;
}
|
||||
|
||||
669
src/scheduler.js
669
src/scheduler.js
@@ -1,503 +1,194 @@
|
||||
import 'dotenv/config';
|
||||
import cron from 'node-cron';
|
||||
import { readFileSync } from 'fs';
|
||||
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
||||
import { fileURLToPath } from 'url';
|
||||
import { dirname, join } from 'path';
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import iconv from 'iconv-lite';
|
||||
import { sendCombinedReportEmail } from './emailService.js';
|
||||
import Firecrawl from '@mendable/firecrawl-js';
|
||||
import { z } from 'zod';
|
||||
import { sendScraperResultsEmail } from './emailService.js';
|
||||
|
||||
// ES modules have no built-in __dirname; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Initialise the Firecrawl client; the API key comes from the FIRECRAWL_API_KEY environment variable.
const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY });

// Scrape results are persisted in results.json, one directory above this file.
const RESULTS_PATH = join(__dirname, '..', 'results.json');
|
||||
|
||||
/**
 * Load and parse config.json from the directory above this module.
 *
 * @returns {object|null} The parsed configuration, or null when the file
 *   cannot be read or parsed (the failure is logged).
 */
function loadConfig() {
  try {
    const configPath = join(__dirname, '..', 'config.json');
    return JSON.parse(readFileSync(configPath, 'utf-8'));
  } catch (error) {
    console.error('加载配置文件失败:', error.message);
    console.error('请确保 config.json 文件存在并配置正确');
    return null;
  }
}
|
||||
|
||||
/**
 * Compute the start and end dates (local time, YYYY-MM-DD) for a named range.
 *
 * @param {string} timeRange - 'today', 'thisWeek' (week starts on Monday) or
 *   'thisMonth'; any other value falls back to 'thisMonth'.
 * @returns {{startDate: string, endDate: string}} The end date is always today.
 */
function getDateRangeByType(timeRange) {
  const formatDate = (d) => {
    const year = d.getFullYear();
    const month = String(d.getMonth() + 1).padStart(2, '0');
    const day = String(d.getDate()).padStart(2, '0');
    return `${year}-${month}-${day}`;
  };

  const now = new Date();
  const endDate = formatDate(now);
  let startDate;

  switch (timeRange) {
    case 'today':
      startDate = endDate;
      break;

    case 'thisWeek': {
      // getDay(): 0 is Sunday, 1 is Monday; step back to the most recent Monday.
      const dayOfWeek = now.getDay();
      const diff = dayOfWeek === 0 ? 6 : dayOfWeek - 1;
      const monday = new Date(now);
      monday.setDate(now.getDate() - diff);
      startDate = formatDate(monday);
      break;
    }

    case 'thisMonth':
    default:
      startDate = `${endDate.slice(0, 7)}-01`;
      break;
  }

  return { startDate, endDate };
}

// ========== Result storage (the original comment says this mirrors server.js) ==========

/**
 * Read the persisted scrape results.
 *
 * @returns {Array<object>} The stored records, or an empty array when the file
 *   is missing, unreadable, or does not contain a JSON array.
 */
function readResults() {
  if (!existsSync(RESULTS_PATH)) return [];
  try {
    const parsed = JSON.parse(readFileSync(RESULTS_PATH, 'utf-8'));
    // Callers mutate the value as an array, so never hand back anything else.
    return Array.isArray(parsed) ? parsed : [];
  } catch (e) {
    // Best effort: a corrupt file must not stop the run, but leave a trace.
    console.warn('读取 results.json 失败:', e.message);
    return [];
  }
}
|
||||
|
||||
// Nanjing public resource trading platform: transport & water "winning bid results" list (per the original comment).
const BASE_URL = 'https://njggzy.nanjing.gov.cn/njweb/jtsw/069008/';
|
||||
/**
 * Overwrite results.json with the given records, pretty-printed as JSON.
 *
 * @param {Array<object>} results - The complete list of records to persist.
 */
function saveResults(results) {
  writeFileSync(RESULTS_PATH, JSON.stringify(results, null, 2), 'utf-8');
}
|
||||
|
||||
// Nanjing public resource trading platform: transport & water "tender announcements" list (per the original comment).
const BID_ANNOUNCE_BASE_URL = 'https://njggzy.nanjing.gov.cn/njweb/jtsw/069001/';
|
||||
/**
 * Prepend one scrape record to the persisted results and save them.
 *
 * The record is stored with a generated `id` (timestamp plus a short random
 * suffix). Only the 500 newest records are kept.
 *
 * @param {object} result - The scrape record to store.
 */
function appendResult(result) {
  const MAX_STORED_RESULTS = 500;
  const results = readResults();
  const id = `result-${Date.now()}-${Math.random().toString(36).slice(2, 7)}`;

  results.unshift({ ...result, id });
  if (results.length > MAX_STORED_RESULTS) results.splice(MAX_STORED_RESULTS);
  saveResults(results);
}
|
||||
|
||||
// Shared axios instance: responses are requested as raw bytes (arraybuffer),
// with a 15 s timeout and a desktop-browser User-Agent header.
const http = axios.create({
  responseType: 'arraybuffer',
  timeout: 15000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  },
});

// ========== Unified announcement-scraping schema ==========

// Announcement schema: an object whose `result` property wraps the array of entries.
const announcementSchema = z.object({
  result: z.array(z.object({
    title: z.string().describe('公告标题'),
    amount: z.string().nullable().describe('项目金额(合同预估价/最高投标限价等),没有则为null'),
    date: z.string().describe('发布日期,YYYY-MM-DD格式'),
    url: z.string().describe('详情页完整URL,以https://开头'),
  })).describe('页面上提取到的所有公告条目'),
});
|
||||
|
||||
/**
 * Derive a decoder name from a Content-Type header value.
 *
 * @param {string} [contentType=''] - Raw header value, e.g. 'text/html; charset=GB2312'.
 * @returns {string} 'utf-8' when no charset is declared, 'gbk' for any charset
 *   containing 'gb', otherwise the declared charset lower-cased with
 *   surrounding quotes removed.
 */
function pickEncoding(contentType = '') {
  const match = /charset=([^;]+)/i.exec(contentType ?? '');
  if (!match) return 'utf-8';

  // The charset parameter may legally be quoted (charset="utf-8"); drop the quotes.
  const charset = match[1].trim().replace(/^["']|["']$/g, '').trim().toLowerCase();
  if (!charset) return 'utf-8';
  if (charset.includes('gb')) return 'gbk';
  return charset;
}
|
||||
|
||||
/**
 * Download a page and decode it using the charset from its Content-Type header.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<string>} The decoded HTML.
 */
async function fetchHtml(url) {
  const res = await http.get(url);
  const declared = pickEncoding(res.headers['content-type']);
  // iconv.decode throws on labels it does not recognise; fall back to UTF-8
  // instead of failing the whole fetch on a malformed header.
  const encoding = iconv.encodingExists(declared) ? declared : 'utf-8';
  return iconv.decode(res.data, encoding);
}
|
||||
|
||||
/**
 * Build the URL of a list page under BASE_URL.
 *
 * @param {number} pageIndex - 1-based page number.
 * @returns {string} Page 1 is 'moreinfosl3.html'; other pages are '<pageIndex>.html'.
 */
function getPageUrl(pageIndex) {
  if (pageIndex === 1) {
    return `${BASE_URL}moreinfosl3.html`;
  }
  return `${BASE_URL}${pageIndex}.html`;
}

/**
 * Extract the array of items from a Firecrawl response.
 *
 * Shapes accepted, checked in this order on `raw.data` (when it is an object)
 * or else on `raw` itself: a `result` array; a `result` object with numeric
 * keys; a bare array; an object with numeric keys.
 *
 * @param {*} raw - The value returned by Firecrawl.
 * @returns {Array} The extracted items, or an empty array when none are found.
 */
function extractItems(raw) {
  // Strings and other primitives carry no items; without this guard a string
  // would be expanded into its characters by the numeric-key fallback.
  if (!raw || typeof raw !== 'object') return [];

  // Turn { '0': a, '1': b } into [a, b]; null when there are no numeric keys.
  const fromNumericKeys = (obj) => {
    const keys = Object.keys(obj).filter((k) => /^\d+$/.test(k));
    if (keys.length === 0) return null;
    return keys.sort((a, b) => Number(a) - Number(b)).map((k) => obj[k]);
  };

  const root = (raw.data && typeof raw.data === 'object') ? raw.data : raw;

  if (Array.isArray(root.result)) return root.result;
  if (root.result && typeof root.result === 'object') {
    const wrapped = fromNumericKeys(root.result);
    if (wrapped) return wrapped;
  }
  if (Array.isArray(root)) return root;
  return fromNumericKeys(root) ?? [];
}
|
||||
|
||||
/**
 * Parse a winning-bid list page and extract its rows.
 *
 * Rows with fewer than five cells, a date not in YYYY-MM-DD form, or a
 * non-numeric price are skipped.
 *
 * @param {string} html - List page HTML.
 * @returns {Array<object>} Items with bidNo, title, bidName, winningBid, date and href.
 */
function parseList(html) {
  const $ = cheerio.load(html);
  const items = [];

  $('li.ewb-info-item2').each((_, row) => {
    const $row = $(row);
    const cells = $row.find('div.ewb-info-num2');
    if (cells.length < 5) return;

    // Prefer the title attribute over the visible text of the cell.
    const cellTitle = (index) => {
      const $p = $(cells[index]).find('p');
      return $p.attr('title') || $p.text().trim();
    };
    const bidNo = cellTitle(0);
    const projectName = cellTitle(1);
    const bidName = cellTitle(2);
    const winningPrice = $(cells[3]).find('p').text().trim(); // winning price
    const winningDate = $(cells[4]).find('p').text().trim(); // winning date

    // The detail link lives in the row's onclick="window.open('...')" handler.
    const onclick = $row.attr('onclick') || '';
    const hrefMatch = onclick.match(/window\.open\(['"]([^'"]+)['"]\)/);
    let href = '';
    if (hrefMatch) {
      href = hrefMatch[1];
      if (href.startsWith('/')) {
        href = `https://njggzy.nanjing.gov.cn${href}`;
      }
    }

    if (!/^\d{4}-\d{2}-\d{2}$/.test(winningDate)) return;

    const price = parseFloat(winningPrice);
    if (Number.isNaN(price)) return;

    items.push({
      bidNo,
      title: projectName,
      bidName,
      winningBid: { // winning amount
        amount: price,
        unit: '万元',
      },
      date: winningDate,
      href,
    });
  });

  return items;
}

// ========== Scrape execution (the original comment says this reuses runScraper's logic from server.js) ==========

/**
 * Run one configured scraper through the Firecrawl agent and persist the outcome.
 *
 * @param {object} scraper - Scraper config; reads id, city, section, subsection,
 *   type, url and the optional prompt and model.
 * @returns {Promise<object>} The stored record; `data.result` holds the
 *   normalised items and `data.total` their count.
 */
async function runScraper(scraper) {
  console.log(`[定时任务] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`);

  const fullPrompt = `访问这个URL: ${scraper.url}
【目标区域】:${scraper.section || ''} - ${scraper.subsection || ''}
【公告类型】:${scraper.type || ''}

${scraper.prompt || '提取页面上今日的招标公告信息,包括:标题、项目金额(可能为合同预估价/最高投标限价等等)、发布日期(YYYY-MM-DD格式)、详情页完整URL'}

请严格按照定义的 JSON 格式返回,每条公告包含 title、amount、date、url 四个字段。`;

  const result = await firecrawl.agent({
    prompt: fullPrompt,
    schema: announcementSchema,
    model: scraper.model || 'spark-1-mini',
  });

  // JSON.stringify(undefined) is undefined, so guard before slicing.
  console.log('[定时任务] 原始返回结果:', (JSON.stringify(result) ?? 'undefined').slice(0, 500));

  // Normalise every item to the four expected fields.
  const items = extractItems(result).map((item) => ({
    title: item?.title || '',
    amount: item?.amount || null,
    date: item?.date || '',
    url: item?.url || '',
  }));

  console.log(`[定时任务] 提取到 ${items.length} 条公告`);

  const record = {
    scraperId: scraper.id,
    city: scraper.city,
    section: scraper.section,
    subsection: scraper.subsection,
    type: scraper.type,
    url: scraper.url,
    scrapedAt: new Date().toISOString(),
    data: { result: items, total: items.length },
  };
  appendResult(record);
  return record;
}
|
||||
|
||||
/**
 * Check whether a date string falls inside an inclusive date range.
 *
 * @param {string} dateStr - Date to test; empty or unparseable values yield false.
 * @param {string} [startDate] - Lower bound; omitted means unbounded.
 * @param {string} [endDate] - Upper bound; omitted means unbounded.
 * @returns {boolean} True when dateStr parses and lies within both bounds.
 */
function isDateInRange(dateStr, startDate, endDate) {
  if (!dateStr) return false;

  const date = new Date(dateStr);
  if (Number.isNaN(date.getTime())) return false;

  if (startDate && date < new Date(startDate)) return false;
  if (endDate && date > new Date(endDate)) return false;
  return true;
}
|
||||
|
||||
/**
 * Walk the winning-bid list pages and collect the items dated within a range.
 *
 * Paging stops at the first empty page, the first page that fails to load,
 * the first page on which every item is older than startDate, or after
 * maxPages pages. There is a 500 ms pause between page requests.
 *
 * @param {string} [startDate] - Inclusive lower bound; falsy means unbounded.
 * @param {string} [endDate] - Inclusive upper bound; falsy means unbounded.
 * @param {number} [maxPages=50] - Maximum number of pages to request.
 * @returns {Promise<Array<object>>} Items whose date is within the range.
 */
async function fetchListByDateRange(startDate, endDate, maxPages = 50) {
  const allItems = [];
  const startBoundary = startDate ? new Date(startDate) : null;
  let pageIndex = 1;

  console.log(`开始按时间范围采集: ${startDate || '不限'} 至 ${endDate || '不限'}`);

  while (pageIndex <= maxPages) {
    const pageUrl = getPageUrl(pageIndex);
    console.log(`正在采集第 ${pageIndex} 页: ${pageUrl}`);

    let items;
    try {
      items = parseList(await fetchHtml(pageUrl));
    } catch (err) {
      console.error(`采集第 ${pageIndex} 页失败: ${err.message}`);
      break;
    }

    if (items.length === 0) {
      console.log(`第 ${pageIndex} 页没有数据,停止采集`);
      break;
    }

    const matched = items.filter((item) => isDateInRange(item.date, startDate, endDate));
    allItems.push(...matched);

    // A page made up solely of items older than the start date means every
    // following page is older too, so paging can stop after this one.
    const pageIsTooOld = startBoundary !== null
      && items.every((item) => new Date(item.date) < startBoundary);
    if (pageIsTooOld) {
      console.log(`第 ${pageIndex} 页所有项目都早于起始日期,停止采集`);
    }

    console.log(`第 ${pageIndex} 页找到 ${items.length} 条,符合条件 ${matched.length > 0 ? '有' : '无'}`);

    pageIndex++;
    if (pageIsTooOld) break;

    if (pageIndex <= maxPages) {
      await new Promise((resolve) => setTimeout(resolve, 500));
    }
  }

  console.log(`总共采集了 ${pageIndex - 1} 页,找到 ${allItems.length} 条符合条件的公告`);
  return allItems;
}
|
||||
|
||||
// ========== 招标公告采集函数 ==========
|
||||
|
||||
/**
 * Build the URL of a tender-announcement list page under BID_ANNOUNCE_BASE_URL.
 *
 * @param {number} pageIndex - 1-based page number.
 * @returns {string} Page 1 is 'moreinfo5dc.html'; other pages are '<pageIndex>.html'.
 */
function getBidAnnouncePageUrl(pageIndex) {
  const fileName = pageIndex === 1 ? 'moreinfo5dc.html' : `${pageIndex}.html`;
  return `${BID_ANNOUNCE_BASE_URL}${fileName}`;
}
|
||||
|
||||
/**
 * Parse a tender-announcement list page.
 *
 * Rows without a window.open(...) onclick handler, a title, or a YYYY-MM-DD
 * date are skipped.
 *
 * @param {string} html - List page HTML.
 * @returns {Array<{title: string, date: string, href: string, estimatedAmount: null}>}
 *   One entry per usable row; estimatedAmount is always null here.
 */
function parseBidAnnounceList(html) {
  const $ = cheerio.load(html);
  const items = [];

  $('li.ewb-info-item2').each((_, row) => {
    const $row = $(row);

    // The detail link lives in the row's onclick="window.open('...')" handler.
    const onclick = $row.attr('onclick') || '';
    const hrefMatch = onclick.match(/window\.open\(['"]([^'"]+)['"]\)/);
    if (!hrefMatch) return;

    let href = hrefMatch[1];
    if (href.startsWith('/')) {
      href = `https://njggzy.nanjing.gov.cn${href}`;
    }

    // First cell holds the title, last cell holds the date.
    const $cells = $row.find('.ewb-info-num2');
    const $titleP = $cells.first().find('p');
    const title = $titleP.attr('title') || $titleP.text().trim();

    const dateText = $cells.last().find('p').text().trim();
    const dateMatch = dateText.match(/\d{4}-\d{2}-\d{2}/);
    const date = dateMatch ? dateMatch[0] : '';

    if (title && date) {
      items.push({ title, date, href, estimatedAmount: null });
    }
  });

  return items;
}
|
||||
|
||||
/**
 * Fetch a tender-announcement detail page and pull key fields from its text.
 *
 * Every field is extracted with a regular expression over the page body and
 * is null when its pattern does not match. A failed request is logged and
 * produces the same object shape with every extracted field set to null.
 *
 * @param {string} url - Detail page URL.
 * @returns {Promise<{estimatedAmount: (number|null), bidCode: (string|null),
 *   tenderee: (string|null), duration: (number|null), url: string}>}
 */
async function fetchBidAnnounceDetail(url) {
  try {
    const html = await fetchHtml(url);
    const $ = cheerio.load(html);
    const bodyText = $('body').text();

    // Estimated contract price; thousands separators are removed before parsing.
    const amountMatch = bodyText.match(/合同估算价[::]\s*([\d,]+\.?\d*)\s*元/);
    const estimatedAmount = amountMatch
      ? parseFloat(amountMatch[1].replace(/,/g, ''))
      : null;

    const bidCodeMatch = bodyText.match(/标段编码[::]\s*([A-Za-z0-9\-]+)/);
    const bidCode = bidCodeMatch ? bidCodeMatch[1] : null;

    const tendereeMatch = bodyText.match(/招标人[为是][::]?\s*([^\s,,。]+)/);
    const tenderee = tendereeMatch ? tendereeMatch[1] : null;

    // Planned duration, matched as a number of calendar days.
    const durationMatch = bodyText.match(/计划工期[::]\s*(\d+)\s*日历天/);
    const duration = durationMatch ? Number.parseInt(durationMatch[1], 10) : null;

    return { estimatedAmount, bidCode, tenderee, duration, url };
  } catch (error) {
    console.error(`获取招标详情失败 ${url}: ${error.message}`);
    // Same shape as the success path so callers never copy `undefined` fields.
    return { estimatedAmount: null, bidCode: null, tenderee: null, duration: null, url };
  }
}
|
||||
|
||||
/**
 * Collect the tender announcements dated within a range, then enrich each one
 * with fields from its detail page.
 *
 * Paging stops at the first empty page, the first page that fails to load,
 * the first page on which every item is older than startDate, or after
 * maxPages pages. Requests are spaced out: 500 ms between list pages and
 * 300 ms between detail pages.
 *
 * @param {string} [startDate] - Inclusive lower bound; falsy means unbounded.
 * @param {string} [endDate] - Inclusive upper bound; falsy means unbounded.
 * @param {number} [maxPages=20] - Maximum number of list pages to request.
 * @returns {Promise<Array<object>>} Matching items with estimatedAmount,
 *   bidCode, tenderee and duration filled in from the detail page.
 */
async function fetchBidAnnounceByDateRange(startDate, endDate, maxPages = 20) {
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const allItems = [];
  const startBoundary = startDate ? new Date(startDate) : null;
  let pageIndex = 1;

  console.log(`开始采集招标公告: ${startDate || '不限'} 至 ${endDate || '不限'}`);

  while (pageIndex <= maxPages) {
    const pageUrl = getBidAnnouncePageUrl(pageIndex);
    console.log(`正在采集招标公告第 ${pageIndex} 页: ${pageUrl}`);

    let items;
    try {
      items = parseBidAnnounceList(await fetchHtml(pageUrl));
    } catch (err) {
      console.error(`采集第 ${pageIndex} 页失败: ${err.message}`);
      break;
    }

    if (items.length === 0) {
      console.log(`第 ${pageIndex} 页没有数据,停止采集`);
      break;
    }

    const matched = items.filter((item) => isDateInRange(item.date, startDate, endDate));
    allItems.push(...matched);

    // A page made up solely of items older than the start date means every
    // following page is older too, so paging can stop after this one.
    const pageIsTooOld = startBoundary !== null
      && items.every((item) => new Date(item.date) < startBoundary);
    if (pageIsTooOld) {
      console.log(`第 ${pageIndex} 页所有项目都早于起始日期,停止采集`);
    }

    console.log(`第 ${pageIndex} 页找到 ${items.length} 条,符合条件 ${matched.length > 0 ? '有' : '无'}`);

    pageIndex++;
    if (pageIsTooOld) break;

    if (pageIndex <= maxPages) {
      await pause(500);
    }
  }

  console.log(`总共采集了 ${pageIndex - 1} 页,找到 ${allItems.length} 条符合条件的招标公告`);

  // Fetch the detail page of every match (estimated contract price etc.).
  if (allItems.length > 0) {
    console.log(`开始获取 ${allItems.length} 条招标公告的详情...`);

    for (let i = 0; i < allItems.length; i++) {
      const item = allItems[i];
      console.log(`获取详情 ${i + 1}/${allItems.length}: ${item.title.substring(0, 30)}...`);

      const detail = await fetchBidAnnounceDetail(item.href);
      item.estimatedAmount = detail.estimatedAmount;
      item.bidCode = detail.bidCode;
      item.tenderee = detail.tenderee;
      item.duration = detail.duration;

      if (i < allItems.length - 1) {
        await pause(300);
      }
    }

    console.log('招标公告详情获取完成');
  }

  return allItems;
}
|
||||
|
||||
// 定时任务执行函数
|
||||
async function executeScheduledTask(config) {
|
||||
try {
|
||||
console.log('========================================');
|
||||
console.log('定时任务开始执行(综合采集)');
|
||||
console.log('定时任务开始执行');
|
||||
console.log('执行时间:', new Date().toLocaleString('zh-CN'));
|
||||
console.log('========================================');
|
||||
|
||||
const timeRange = config.scheduler.timeRange || 'thisMonth';
|
||||
const { startDate, endDate } = getDateRangeByType(timeRange);
|
||||
const winningThreshold = config.scheduler.winningThreshold !== undefined ? config.scheduler.winningThreshold : 10000; // 中标阈值,默认1亿(10000万元)
|
||||
const bidThreshold = config.scheduler.bidThreshold !== undefined ? config.scheduler.bidThreshold : 0; // 招标阈值,默认0(不筛选)
|
||||
// 获取所有已启用的抓取来源
|
||||
const scrapers = (config.scrapers || []).filter(s => s.enabled);
|
||||
|
||||
const timeRangeNames = {
|
||||
'today': '今日',
|
||||
'thisWeek': '本周',
|
||||
'thisMonth': '本月'
|
||||
};
|
||||
console.log(`采集时间段: ${timeRangeNames[timeRange] || '本月'}`);
|
||||
console.log(`采集时间范围: ${startDate} 至 ${endDate}`);
|
||||
console.log(`中标金额阈值: ${winningThreshold}万元 (${(winningThreshold / 10000).toFixed(2)}亿元)`);
|
||||
console.log(`招标金额阈值: ${bidThreshold}万元 ${bidThreshold === 0 ? '(不筛选)' : `(${(bidThreshold / 10000).toFixed(2)}亿元)`}`);
|
||||
|
||||
// ========== 1. 采集中标公示 ==========
|
||||
console.log('\n========== 采集中标公示 ==========');
|
||||
const winningItems = await fetchListByDateRange(startDate, endDate, 50);
|
||||
|
||||
// 筛选大于阈值的中标项目
|
||||
const winningFiltered = winningItems.filter((item) => {
|
||||
return item.winningBid && item.winningBid.amount > winningThreshold;
|
||||
});
|
||||
|
||||
const winningTotal = winningFiltered.reduce(
|
||||
(sum, item) => sum + (item.winningBid?.amount || 0),
|
||||
0
|
||||
);
|
||||
|
||||
console.log(`中标公示: 采集 ${winningItems.length} 条,符合阈值 ${winningFiltered.length} 条`);
|
||||
|
||||
// 生成中标报告
|
||||
const winningReport = {
|
||||
summary: {
|
||||
total_count: winningItems.length,
|
||||
filtered_count: winningFiltered.length,
|
||||
threshold: `${winningThreshold}万元`,
|
||||
total_amount: `${winningTotal.toFixed(2)}万元`,
|
||||
generated_at: new Date().toISOString(),
|
||||
date_range: { startDate, endDate },
|
||||
},
|
||||
projects: winningFiltered.map((item) => ({
|
||||
bidNo: item.bidNo,
|
||||
title: item.title,
|
||||
bidName: item.bidName,
|
||||
date: item.date,
|
||||
winningBid: item.winningBid,
|
||||
url: item.href,
|
||||
})),
|
||||
};
|
||||
|
||||
// ========== 2. 采集招标公告 ==========
|
||||
console.log('\n========== 采集招标公告 ==========');
|
||||
const bidItems = await fetchBidAnnounceByDateRange(startDate, endDate, 20);
|
||||
|
||||
// 筛选招标项目(根据阈值筛选,阈值为0时不筛选只要求有金额)
|
||||
const bidFiltered = bidItems.filter(item => {
|
||||
if (!item.estimatedAmount) return false;
|
||||
if (bidThreshold === 0) return true; // 阈值为0时不筛选
|
||||
return item.estimatedAmount / 10000 > bidThreshold; // 估算价是元,阈值是万元,需要转换
|
||||
});
|
||||
|
||||
const bidTotal = bidFiltered.reduce(
|
||||
(sum, item) => sum + (item.estimatedAmount || 0),
|
||||
0
|
||||
);
|
||||
|
||||
console.log(`招标公告: 采集 ${bidItems.length} 条,有金额 ${bidFiltered.length} 条`);
|
||||
|
||||
// 生成招标报告
|
||||
const bidReport = {
|
||||
summary: {
|
||||
total_count: bidItems.length,
|
||||
filtered_count: bidFiltered.length,
|
||||
has_amount_count: bidFiltered.length,
|
||||
threshold: bidThreshold === 0 ? '无' : `${bidThreshold}万元`,
|
||||
total_amount: `${(bidTotal / 10000).toFixed(2)}万元`,
|
||||
total_amount_yuan: bidTotal,
|
||||
generated_at: new Date().toISOString(),
|
||||
date_range: { startDate, endDate },
|
||||
report_type: '招标公告'
|
||||
},
|
||||
projects: bidFiltered.map((item) => ({
|
||||
title: item.title,
|
||||
bidCode: item.bidCode,
|
||||
tenderee: item.tenderee,
|
||||
date: item.date,
|
||||
duration: item.duration,
|
||||
estimatedAmount: item.estimatedAmount ? {
|
||||
amount: item.estimatedAmount,
|
||||
amountWan: (item.estimatedAmount / 10000).toFixed(2),
|
||||
unit: '元'
|
||||
} : null,
|
||||
url: item.href,
|
||||
})),
|
||||
};
|
||||
|
||||
// ========== 3. 检查是否有数据需要发送 ==========
|
||||
if (winningFiltered.length === 0 && bidFiltered.length === 0) {
|
||||
console.log('\n========================================');
|
||||
console.log('暂无符合条件的项目,不发送邮件');
|
||||
console.log('========================================');
|
||||
if (scrapers.length === 0) {
|
||||
console.log('没有已启用的抓取来源,跳过');
|
||||
return;
|
||||
}
|
||||
|
||||
// ========== 4. 发送综合邮件 ==========
|
||||
console.log('\n========================================');
|
||||
console.log('正在发送综合报告邮件...');
|
||||
const emailConfig = config.email;
|
||||
console.log(`共 ${scrapers.length} 个已启用的抓取来源`);
|
||||
|
||||
const result = await sendCombinedReportEmail(emailConfig, winningReport, bidReport);
|
||||
// 逐个运行抓取任务
|
||||
const results = [];
|
||||
for (const scraper of scrapers) {
|
||||
try {
|
||||
console.log(`\n---------- 抓取: ${scraper.city} - ${scraper.section} ${scraper.type} ----------`);
|
||||
const r = await runScraper(scraper);
|
||||
results.push(r);
|
||||
console.log(`✓ 抓取成功`);
|
||||
} catch (err) {
|
||||
console.error(`✗ 抓取失败: ${err.message}`);
|
||||
const errRecord = {
|
||||
scraperId: scraper.id,
|
||||
city: scraper.city,
|
||||
section: scraper.section,
|
||||
subsection: scraper.subsection,
|
||||
type: scraper.type,
|
||||
url: scraper.url,
|
||||
scrapedAt: new Date().toISOString(),
|
||||
error: err.message,
|
||||
data: null,
|
||||
};
|
||||
appendResult(errRecord);
|
||||
results.push(errRecord);
|
||||
}
|
||||
}
|
||||
|
||||
const successCount = results.filter(r => !r.error).length;
|
||||
const failCount = results.filter(r => r.error).length;
|
||||
console.log(`\n========== 抓取完成 ==========`);
|
||||
console.log(`成功: ${successCount} 条,失败: ${failCount} 条`);
|
||||
|
||||
// 检查是否需要发送邮件
|
||||
if (successCount === 0) {
|
||||
console.log('没有成功的抓取结果,不发送邮件');
|
||||
return;
|
||||
}
|
||||
|
||||
// 发送邮件报告
|
||||
if (config.email?.smtpHost && config.email?.smtpUser) {
|
||||
console.log('\n正在发送抓取结果邮件...');
|
||||
try {
|
||||
const emailResult = await sendScraperResultsEmail(config.email, results);
|
||||
console.log('邮件发送成功! MessageId:', emailResult.messageId);
|
||||
} catch (emailErr) {
|
||||
console.error('邮件发送失败:', emailErr.message);
|
||||
}
|
||||
} else {
|
||||
console.log('邮件配置不完整,跳过邮件发送');
|
||||
}
|
||||
|
||||
console.log('邮件发送成功!');
|
||||
console.log('收件人:', emailConfig.recipients);
|
||||
console.log('MessageId:', result.messageId);
|
||||
console.log(`内容: 中标公示 ${winningFiltered.length} 条,招标公告 ${bidFiltered.length} 条`);
|
||||
console.log('========================================');
|
||||
console.log('定时任务执行完成');
|
||||
console.log('========================================');
|
||||
|
||||
} catch (error) {
|
||||
@@ -511,96 +202,60 @@ async function executeScheduledTask(config) {
|
||||
// Handle of the currently scheduled cron task; null when nothing is scheduled.
let currentScheduledTask = null;
|
||||
|
||||
/**
 * Start the cron job described by config.json.
 *
 * Does nothing when the configuration cannot be loaded, when the scheduler is
 * disabled, or when the cron expression is invalid. A previously started task
 * is stopped before the new one is created. The job runs in the
 * Asia/Shanghai timezone; the default schedule is '0 9 * * *'.
 */
export function initScheduler() {
  const config = loadConfig();
  if (!config) { console.error('无法启动定时任务: 配置文件加载失败'); return; }
  if (!config.scheduler?.enabled) { console.log('定时任务已禁用'); return; }

  const cronTime = config.scheduler.cronTime || '0 9 * * *';
  // Reject a malformed expression with a clear message instead of letting
  // cron.schedule throw from inside server start-up.
  if (!cron.validate(cronTime)) {
    console.error('无法启动定时任务: cron 表达式无效:', cronTime);
    return;
  }

  const enabledCount = (config.scrapers || []).filter((s) => s.enabled).length;
  console.log('========================================');
  console.log('定时任务已启动,执行计划:', cronTime);
  console.log(`已启用的抓取来源: ${enabledCount} 个`);
  if (config.email?.recipients) console.log('收件人:', config.email.recipients);
  console.log('========================================');

  // Stop any task that is still scheduled before replacing it.
  if (currentScheduledTask) { currentScheduledTask.stop(); }

  currentScheduledTask = cron.schedule(cronTime, () => {
    // Reload the configuration on every run so the latest scrapers are used.
    const latestConfig = loadConfig();
    if (!latestConfig) return;
    // Do not leave the promise floating: surface any rejection in the log.
    Promise.resolve(executeScheduledTask(latestConfig)).catch((err) => {
      console.error('定时任务执行失败:', err?.message ?? err);
    });
  }, { timezone: 'Asia/Shanghai' });
}
|
||||
|
||||
/**
 * Stop the current cron task (if any) and start again from a freshly loaded
 * configuration.
 */
export function reloadScheduler() {
  console.log('重新加载定时任务配置...');

  if (currentScheduledTask) {
    currentScheduledTask.stop();
    currentScheduledTask = null;
  }

  initScheduler();
}
|
||||
|
||||
/**
 * Stop the scheduled cron task.
 *
 * @returns {boolean} True when a task was running and has been stopped, false otherwise.
 */
export function stopScheduler() {
  if (!currentScheduledTask) return false;

  currentScheduledTask.stop();
  currentScheduledTask = null;
  console.log('定时任务已停止');
  return true;
}
|
||||
|
||||
/**
 * Report whether the scheduler is running, together with its effective settings.
 *
 * @returns {{isRunning: boolean, enabledScrapers: number, config: (object|null)}}
 *   `config` is null when config.json cannot be loaded; otherwise it holds the
 *   scheduler settings with defaults applied for missing values.
 */
export function getSchedulerStatus() {
  const config = loadConfig();
  const enabledScrapers = (config?.scrapers || []).filter((s) => s.enabled).length;

  let schedulerConfig = null;
  if (config) {
    // Destructuring defaults apply only to `undefined`, matching the original
    // `!== undefined` checks on the two thresholds.
    const {
      enabled,
      cronTime,
      winningThreshold = 10000,
      bidThreshold = 0,
      timeRange,
      description,
    } = config.scheduler ?? {};

    schedulerConfig = {
      enabled: enabled || false,
      cronTime: cronTime || '0 9 * * *',
      winningThreshold,
      bidThreshold,
      timeRange: timeRange || 'thisMonth',
      description: description || '',
    };
  }

  return {
    isRunning: currentScheduledTask !== null,
    enabledScrapers,
    config: schedulerConfig,
  };
}
|
||||
|
||||
/**
 * Run the scheduled task immediately (intended for manual testing).
 *
 * @returns {Promise<void>} Resolves when the task has finished.
 * @throws {Error} When config.json cannot be loaded.
 */
export async function runTaskNow() {
  const config = loadConfig();
  if (!config) throw new Error('配置文件加载失败');
  await executeScheduledTask(config);
}
|
||||
|
||||
1066
src/server.js
1066
src/server.js
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user