feat(config): 南京公共资源交易中心
This commit is contained in:
@@ -1,237 +0,0 @@
|
||||
/**
 * LLM service module - extracts tender budget amounts via the Alibaba Cloud Tongyi Qianwen (Qwen) API.
 */
|
||||
|
||||
import { readFileSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
/**
 * Load the `llm` section of the project's config.json (one directory above this module).
 *
 * The file is read synchronously on every call.
 *
 * @returns {object|null} The `llm` config object, or null when the file cannot be
 *   read or parsed, or when it has no truthy `llm` entry.
 */
function getLLMConfig() {
  let llmSection = null;

  try {
    const rawJson = readFileSync(join(__dirname, '..', 'config.json'), 'utf-8');
    const parsed = JSON.parse(rawJson);
    if (parsed.llm) {
      llmSection = parsed.llm;
    }
  } catch (err) {
    // Best effort: a missing or broken config simply means "LLM not configured".
    console.error('读取 LLM 配置失败:', err.message);
  }

  return llmSection;
}
|
||||
|
||||
/**
 * Report whether LLM-based extraction is switched on and usable.
 *
 * @returns {boolean} true only when an `llm` config exists, its `enabled` flag is
 *   truthy and an `apiKey` is present.
 */
export function isLLMEnabled() {
  const config = getLLMConfig();
  // Coerce explicitly: the bare `&&` chain would hand the API key string (or
  // null/undefined) back to callers instead of a boolean.
  return Boolean(config?.enabled && config?.apiKey);
}
|
||||
|
||||
/**
 * Shrink announcement text so it fits the prompt budget.
 *
 * Text within the limit is returned unchanged. Longer text is reduced to its first
 * 1500 characters plus a 200-character window around every budget keyword hit;
 * when no keyword is found, the text is simply cut at the limit.
 *
 * @param {string} content - Full announcement text.
 * @param {number} [maxContentLength=4000] - Maximum length before the truncation marker.
 * @returns {string} Text to embed in the prompt.
 */
function condenseAnnouncementForLLM(content, maxContentLength = 4000) {
  if (content.length <= maxContentLength) {
    return content;
  }

  const budgetKeywords = ['预算金额', '项目预算', '采购预算', '控制价', '最高限价', '招标金额', '项目金额', '合同金额', '投标报价', '中标金额', '成交金额', '中标价', '成交价'];
  const contextRadius = 200; // characters kept on each side of a keyword
  const extractedContexts = [];

  for (const keyword of budgetKeywords) {
    let pos = content.indexOf(keyword);
    while (pos !== -1) {
      const start = Math.max(0, pos - contextRadius);
      const end = Math.min(content.length, pos + keyword.length + contextRadius);
      extractedContexts.push(content.substring(start, end));
      pos = content.indexOf(keyword, pos + 1);
    }
  }

  if (extractedContexts.length === 0) {
    // No keyword anywhere: fall back to plain head truncation.
    return `${content.substring(0, maxContentLength)}...(内容已截断)`;
  }

  const headerContent = content.substring(0, 1500);
  const relevantContent = [...new Set(extractedContexts)].join('\n---\n'); // drop exact duplicates
  const combined = `${headerContent}\n\n【以下为金额相关内容】\n${relevantContent}`;

  return combined.length > maxContentLength
    ? `${combined.substring(0, maxContentLength)}...(内容已截断)`
    : combined;
}

/**
 * Build the extraction prompt sent to the model.
 *
 * @param {string} truncatedContent - Announcement text, already size-limited.
 * @returns {string} The complete user prompt.
 */
function buildBudgetPrompt(truncatedContent) {
  return `你是一个专业的招标文件分析助手。请从以下招标公告内容中提取预算金额信息。

要求:
1. 优先查找以下字段对应的金额:预算金额、项目预算、采购预算、预算、控制价、最高限价、招标金额、项目金额、合同金额、投标报价、中标金额、成交金额、中标价、成交价
2. 如果有多个金额,优先选择"预算金额"或"项目预算"
3. 金额统一转换为万元单位(如 70万元 = 70,700000元 = 70)
4. 严格按照 JSON 格式返回,不要添加任何其他文字

常见格式示例:
- "预算金额:70万元" → amount: 70
- "预算金额:700000元" → amount: 70
- "项目预算:70.00万元" → amount: 70

返回格式(必须是合法的 JSON):
{"amount": 数值, "unit": "万元", "text": "原文中的金额描述"}

如果没有找到金额,返回:
{"amount": null, "unit": null, "text": null}

公告内容:
${truncatedContent}`;
}

/**
 * Convert the model's `amount` field to a number.
 *
 * Thousands separators (ASCII or full-width comma) and whitespace are stripped
 * first, so a string such as "1,200" is read as 1200 rather than 1.
 *
 * @param {number|string} rawAmount - Value of `amount` in the model's JSON reply.
 * @returns {number} Parsed amount, or NaN when it is not numeric.
 */
function parseLLMAmount(rawAmount) {
  if (typeof rawAmount === 'number') {
    return rawAmount;
  }
  return Number.parseFloat(String(rawAmount).replace(/[,\uFF0C\s]/g, ''));
}

/**
 * Ask the configured LLM to extract the tender budget from announcement text.
 *
 * Every failure path (LLM disabled, empty input, HTTP error, timeout, malformed
 * or implausible reply) resolves to null; the function does not reject.
 *
 * @param {string} content - Plain text of the tender announcement.
 * @returns {Promise<{amount: number, unit: string, text: string, source: string}|null>}
 *   The amount in 万元 with the quoted source text, or null when nothing usable
 *   was extracted.
 */
export async function extractBudgetWithLLM(content) {
  const config = getLLMConfig();

  if (!config || !config.enabled || !config.apiKey) {
    return null;
  }

  // Without text there is nothing to analyse; returning here also keeps a
  // null/undefined argument from throwing before the try block below.
  if (typeof content !== 'string' || content.trim() === '') {
    return null;
  }

  const prompt = buildBudgetPrompt(condenseAnnouncementForLLM(content));

  try {
    const response = await fetch(`${config.baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${config.apiKey}`,
      },
      body: JSON.stringify({
        model: config.model || 'qwen-turbo',
        messages: [
          {
            role: 'user',
            content: prompt,
          },
        ],
        temperature: 0.1, // low temperature keeps the output stable
        max_tokens: 200,
      }),
      signal: AbortSignal.timeout(15000), // 15 s timeout
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error('LLM API 错误:', response.status, errorText);
      return null;
    }

    const data = await response.json();
    const assistantMessage = data.choices?.[0]?.message?.content;

    if (!assistantMessage) {
      console.error('LLM 返回内容为空');
      return null;
    }

    // The model may wrap the JSON in extra prose; take the outermost {...} span.
    const jsonMatch = assistantMessage.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      console.error('LLM 返回格式异常:', assistantMessage);
      return null;
    }

    const result = JSON.parse(jsonMatch[0]);

    if (result.amount === null || result.amount === undefined) {
      return null;
    }

    // Sanity-check the amount (in 万元) before trusting it.
    const amount = parseLLMAmount(result.amount);
    if (Number.isNaN(amount) || amount < 0.01 || amount > 100000000) {
      console.error('LLM 提取的金额不合理:', result.amount);
      return null;
    }

    console.log(`LLM 提取金额成功: ${amount} 万元`);

    return {
      amount,
      unit: '万元',
      text: result.text || `${amount}万元`,
      source: 'llm', // marks where the value came from
    };
  } catch (err) {
    if (err.name === 'TimeoutError') {
      console.error('LLM API 超时');
    } else {
      console.error('LLM 提取金额失败:', err.message);
    }
    return null;
  }
}
|
||||
|
||||
/**
 * Send a minimal chat request to verify that the configured LLM endpoint is reachable.
 *
 * Unlike extraction, this check does not require `enabled` to be set; only an API
 * key and a base URL are needed.
 *
 * @returns {Promise<{success: boolean, error?: string, message?: string, model?: string, reply?: string}>}
 *   `success: true` with the model name and its reply, or `success: false` with an
 *   error description. The promise does not reject.
 */
export async function testLLMConnection() {
  const config = getLLMConfig();

  if (!config || !config.apiKey) {
    return { success: false, error: '未配置 API Key' };
  }

  // Without this guard the request would target the literal string
  // "undefined/chat/completions" and fail with an opaque URL-parse error.
  if (!config.baseUrl) {
    return { success: false, error: '未配置 baseUrl' };
  }

  const model = config.model || 'qwen-turbo';

  try {
    const response = await fetch(`${config.baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${config.apiKey}`,
      },
      body: JSON.stringify({
        model,
        messages: [
          {
            role: 'user',
            content: '请回复"连接成功"',
          },
        ],
        max_tokens: 10,
      }),
      signal: AbortSignal.timeout(10000), // 10 s timeout
    });

    if (!response.ok) {
      const errorText = await response.text();
      return { success: false, error: `API 错误: ${response.status} - ${errorText}` };
    }

    const data = await response.json();

    return {
      success: true,
      message: '连接成功',
      model,
      reply: data.choices?.[0]?.message?.content,
    };
  } catch (err) {
    if (err.name === 'TimeoutError') {
      return { success: false, error: '连接超时' };
    }
    return { success: false, error: err.message };
  }
}
|
||||
|
||||
/**
 * Summarise the LLM configuration for status displays.
 *
 * The API key itself is never included in the result.
 *
 * @returns {{configured: boolean, enabled: boolean, model: (string|null), baseUrl?: string}}
 *   `configured` is true when an API key is present; `enabled` additionally
 *   requires a truthy `enabled` flag.
 */
export function getLLMStatus() {
  const config = getLLMConfig();

  if (!config) {
    return {
      configured: false,
      enabled: false,
      model: null,
    };
  }

  const configured = Boolean(config.apiKey);

  return {
    configured,
    // Coerced so the field is always a boolean; the raw `&&` leaked `undefined`
    // (or whatever falsy value `enabled` held) into the status object.
    enabled: Boolean(config.enabled) && configured,
    model: config.model || 'qwen-turbo',
    baseUrl: config.baseUrl,
  };
}
|
||||
369
src/scheduler.js
369
src/scheduler.js
@@ -62,19 +62,14 @@ function getDateRangeByType(timeRange) {
|
||||
return { startDate, endDate };
|
||||
}
|
||||
|
||||
/**
 * Start and end date of the current month.
 *
 * Kept for compatibility with older call sites; it simply delegates to
 * getDateRangeByType with the 'thisMonth' range type.
 *
 * @returns {*} Whatever getDateRangeByType('thisMonth') returns.
 */
function getCurrentMonthDateRange() {
  const thisMonthRange = getDateRangeByType('thisMonth');
  return thisMonthRange;
}
|
||||
|
||||
// Helper definitions below were copied from server.js.
// Nanjing public resource trading platform - housing & municipal construction
// tender announcements (房建市政招标公告). Only the post-change URL is kept: declaring
// BASE_URL twice is a redeclaration error.
const BASE_URL = 'https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/';
|
||||
|
||||
// Shared axios client. Responses are requested as raw bytes (arraybuffer) rather
// than pre-decoded text. The superseded 10 s timeout and scraper User-Agent
// entries were duplicate keys and have been dropped in favour of the new values.
const http = axios.create({
  responseType: 'arraybuffer',
  timeout: 15000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  },
});
|
||||
|
||||
@@ -93,38 +88,55 @@ async function fetchHtml(url) {
|
||||
return html;
|
||||
}
|
||||
|
||||
/**
 * Build the URL of one page of the announcement list.
 *
 * Page 1 is served as `moreinfo.html`; every other page as `<pageIndex>.html`.
 *
 * @param {number} pageIndex - 1-based page number.
 * @param {string} [baseUrl=BASE_URL] - Directory URL of the list; a trailing slash is
 *   added when missing. Optional, kept so older two-argument callers still work.
 * @returns {string} Absolute URL of the requested list page.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  const root = baseUrl.endsWith('/') ? baseUrl : `${baseUrl}/`;
  return pageIndex === 1 ? `${root}moreinfo.html` : `${root}${pageIndex}.html`;
}
|
||||
|
||||
// 解析列表页HTML,提取招标信息
|
||||
function parseList(html) {
|
||||
const $ = cheerio.load(html);
|
||||
const items = [];
|
||||
|
||||
$('table tr').each((_, row) => {
|
||||
$('li.ewb-info-item2').each((_, row) => {
|
||||
const $row = $(row);
|
||||
const link = $row.find('td:first-child a').first();
|
||||
const dateCell = $row.find('td:nth-child(2)');
|
||||
const cells = $row.find('div.ewb-info-num2');
|
||||
|
||||
if (link.length && dateCell.length) {
|
||||
const title = link.attr('title') || link.text().trim();
|
||||
const rawHref = link.attr('href') || '';
|
||||
const dateText = dateCell.text().trim();
|
||||
if (cells.length >= 5) {
|
||||
const bidNo = $(cells[0]).find('p').attr('title') || $(cells[0]).find('p').text().trim();
|
||||
const projectName = $(cells[1]).find('p').attr('title') || $(cells[1]).find('p').text().trim();
|
||||
const bidName = $(cells[2]).find('p').attr('title') || $(cells[2]).find('p').text().trim();
|
||||
const estimatedPrice = $(cells[3]).find('p').text().trim();
|
||||
const publishDate = $(cells[4]).find('p').text().trim();
|
||||
|
||||
if (!rawHref || !title || title.length < 5) return;
|
||||
if (rawHref === './' || rawHref === '../') return;
|
||||
if (!/^\d{4}-\d{2}-\d{2}$/.test(dateText)) return;
|
||||
|
||||
try {
|
||||
const href = new URL(rawHref, BASE_URL).toString();
|
||||
items.push({ title, href, date: dateText });
|
||||
} catch (err) {
|
||||
return;
|
||||
const onclick = $row.attr('onclick') || '';
|
||||
const hrefMatch = onclick.match(/window\.open\(['"]([^'"]+)['"]\)/);
|
||||
let href = '';
|
||||
if (hrefMatch) {
|
||||
href = hrefMatch[1];
|
||||
if (href.startsWith('/')) {
|
||||
href = `https://njggzy.nanjing.gov.cn${href}`;
|
||||
}
|
||||
}
|
||||
|
||||
if (!/^\d{4}-\d{2}-\d{2}$/.test(publishDate)) return;
|
||||
|
||||
const price = parseFloat(estimatedPrice);
|
||||
if (isNaN(price)) return;
|
||||
|
||||
items.push({
|
||||
bidNo,
|
||||
title: projectName,
|
||||
bidName,
|
||||
budget: {
|
||||
amount: price,
|
||||
unit: '万元'
|
||||
},
|
||||
date: publishDate,
|
||||
href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
@@ -141,23 +153,23 @@ function isDateInRange(dateStr, startDate, endDate) {
|
||||
return true;
|
||||
}
|
||||
|
||||
async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
|
||||
async function fetchListByDateRange(startDate, endDate, maxPages = 50) {
|
||||
const allItems = [];
|
||||
let shouldContinue = true;
|
||||
let pageIndex = 0;
|
||||
let pageIndex = 1;
|
||||
|
||||
console.log(`开始按时间范围采集: ${startDate || '不限'} 至 ${endDate || '不限'}`);
|
||||
|
||||
while (shouldContinue && pageIndex < maxPages) {
|
||||
while (shouldContinue && pageIndex <= maxPages) {
|
||||
const pageUrl = getPageUrl(pageIndex);
|
||||
console.log(`正在采集第 ${pageIndex + 1} 页: ${pageUrl}`);
|
||||
console.log(`正在采集第 ${pageIndex} 页: ${pageUrl}`);
|
||||
|
||||
try {
|
||||
const html = await fetchHtml(pageUrl);
|
||||
const items = parseList(html);
|
||||
|
||||
if (items.length === 0) {
|
||||
console.log(`第 ${pageIndex + 1} 页没有数据,停止采集`);
|
||||
console.log(`第 ${pageIndex} 页没有数据,停止采集`);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -177,251 +189,27 @@ async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
|
||||
}
|
||||
|
||||
if (allItemsBeforeRange && startDate) {
|
||||
console.log(`第 ${pageIndex + 1} 页所有项目都早于起始日期,停止采集`);
|
||||
console.log(`第 ${pageIndex} 页所有项目都早于起始日期,停止采集`);
|
||||
shouldContinue = false;
|
||||
}
|
||||
|
||||
console.log(`第 ${pageIndex + 1} 页找到 ${items.length} 条,符合条件 ${hasItemsInRange ? '有' : '无'}`);
|
||||
console.log(`第 ${pageIndex} 页找到 ${items.length} 条,符合条件 ${hasItemsInRange ? '有' : '无'}`);
|
||||
|
||||
pageIndex++;
|
||||
|
||||
if (shouldContinue && pageIndex < maxPages) {
|
||||
if (shouldContinue && pageIndex <= maxPages) {
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`采集第 ${pageIndex + 1} 页失败: ${err.message}`);
|
||||
console.error(`采集第 ${pageIndex} 页失败: ${err.message}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`总共采集了 ${pageIndex} 页,找到 ${allItems.length} 条符合条件的公告`);
|
||||
console.log(`总共采集了 ${pageIndex - 1} 页,找到 ${allItems.length} 条符合条件的公告`);
|
||||
return allItems;
|
||||
}
|
||||
|
||||
/**
 * Parse an announcement detail page (logic mirrored from server.js).
 *
 * @param {string} html - Raw HTML of the detail page.
 * @returns {{title: string, publishTime: string, content: string, budget: *}}
 *   Title, publish time ('' when not found), the longest text among the candidate
 *   content containers, and the result of extractBudget on that text.
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  // Title: first non-empty hit among the known layouts.
  const title =
    $('.title18').text().trim() ||
    $('.article-info h1').text().trim() ||
    $('h1').first().text().trim();

  // Publish time: prefer the table cell that carries both labels (with seconds),
  // then fall back to a minute-precision timestamp elsewhere on the page.
  const publishText = $('td:contains("发布部门")')
    .filter((_, el) => $(el).text().includes('发布时间'))
    .text()
    .trim();
  let publishTime = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/)?.[1] ?? '';

  if (!publishTime) {
    const infoText = $('.info-sources').text() || $('body').text();
    publishTime = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/)?.[1] ?? '';
  }

  // Content: the longest trimmed text among the first match of each selector.
  // NOTE(review): `body` is one of the candidates, so it will presumably win
  // whenever it exists - confirm this is intended.
  const content = ['.zhenwen td', '.con', '.article-content', '.ewb-article-content', 'body']
    .map((selector) => $(selector).first())
    .filter((el) => el.length > 0)
    .map((el) => el.text().trim())
    .reduce((longest, text) => (text.length > longest.length ? text : longest), '');

  return {
    title,
    publishTime,
    content,
    budget: extractBudget(content),
  };
}
|
||||
|
||||
/**
 * Extract a budget amount from free text using unit-aware patterns.
 *
 * Patterns are tried from most to least specific: currency-prefixed 万元,
 * parenthesised ¥ amount in 元, bare 万元, currency-prefixed 元, bare 元. Within
 * each pattern every occurrence is examined in order, so an unusable first hit
 * (e.g. "0万元") no longer hides a later valid one. Amounts in 元 are converted
 * to 万元.
 *
 * The character classes accept both ASCII and full-width forms of the comma,
 * parentheses and yen sign, written as escapes so they survive re-encoding.
 *
 * @param {string} content - Text to scan; non-string or empty input yields null.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   The first plausible amount (0.01 to 100000000 万元) with the matched text and
 *   the unit it was written in, or null when none is found.
 */
function extractBudget(content) {
  if (typeof content !== 'string' || content.length === 0) {
    return null;
  }

  // Re-join numbers that were split across a line break.
  const cleanedContent = content.replace(/(\d)\s*[\n\r]\s*(?=\d)/g, '$1');

  // Ordered by priority; `divider` converts 元 to 万元.
  const patterns = [
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*万元/g },
    { regex: /[(\uFF08][\u00A5\uFFE5]([\d,\uFF0C]+(?:\.\d+)?)[)\uFF09]/g, divider: 10000 },
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*万元/g },
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*元/g, divider: 10000 },
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*元(?!整)/g, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    for (const match of cleanedContent.matchAll(regex)) {
      const parsed = Number.parseFloat(match[1].replace(/[,\uFF0C]/g, ''));
      const amount = divider ? parsed / divider : parsed;

      if (!Number.isNaN(amount) && amount >= 0.01 && amount <= 100000000) {
        return {
          amount,
          unit: '万元',
          text: match[0],
          originalUnit: divider ? '元' : '万元',
        };
      }
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Resolve the signed PDF URL of a jszbcg.com bulletin through the jszbtb.com data API.
 *
 * The bulletin id (hex) and optional `bulletinType` (default '1') are taken from
 * the page URL.
 *
 * @param {string} pageUrl - Bulletin detail page URL.
 * @returns {Promise<string|null>} The signed PDF URL, or null when the URL carries no
 *   bulletin id, the API reports no PDF, or the request/parse fails.
 */
async function fetchPdfUrlFromApi(pageUrl) {
  try {
    const bulletinIdMatch = pageUrl.match(/bulletinDetails\/[^\/]+\/([a-f0-9]+)/i);
    const bulletinTypeMatch = pageUrl.match(/bulletinType=(\d+)/);

    if (!bulletinIdMatch) {
      return null;
    }

    const bulletinId = bulletinIdMatch[1];
    const bulletinType = bulletinTypeMatch ? bulletinTypeMatch[1] : '1';

    const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;

    const response = await http.get(apiUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
        'Referer': 'https://www.jszbcg.com/'
      },
      responseType: 'arraybuffer'
    });

    // The client returns raw bytes, so decode before parsing.
    const data = JSON.parse(iconv.decode(response.data, 'utf-8'));

    if (data.success && data.data && data.data.signedPdfUrl) {
      return data.data.signedPdfUrl;
    }

    return null;
  } catch (err) {
    // Still best effort (callers fall back to iframe extraction), but record
    // the reason instead of discarding it silently.
    console.error(`从API获取PDF URL失败: ${err.message}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Find the PDF URL embedded in a viewer iframe (`...viewer.html?file=<pdf>`).
 *
 * Every iframe on the page is inspected in document order and the first one whose
 * `src` carries a usable `file=` parameter wins. The previous version looked only
 * at the first iframe; its narrower fallback selectors could never match, because
 * they ran only when the page had no iframe at all.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} pageUrl - URL of the page, used to resolve relative PDF paths.
 * @returns {string|null} PDF URL (absolute http/https values are returned as
 *   written), or null when no iframe yields one.
 */
function extractPdfUrl(html, pageUrl) {
  const $ = cheerio.load(html);

  for (const frame of $('iframe').toArray()) {
    const src = $(frame).attr('src');
    const match = src ? src.match(/[?&]file=([^&]+)/) : null;
    if (!match) {
      continue;
    }

    try {
      // decodeURIComponent throws on malformed escapes; treat that like an
      // unresolvable URL and move on to the next iframe.
      const pdfUrl = decodeURIComponent(match[1]);
      if (pdfUrl.startsWith('http://') || pdfUrl.startsWith('https://')) {
        return pdfUrl;
      }
      return new URL(pdfUrl, pageUrl).toString();
    } catch (err) {
      continue;
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Download a PDF and return its extracted text.
 *
 * @param {string} pdfUrl - Absolute URL of the PDF.
 * @returns {Promise<string>} Text content reported by pdf-parse.
 * @throws Propagates download and parse errors to the caller.
 */
async function fetchPdfContent(pdfUrl) {
  const { PDFParse } = await import('pdf-parse');

  const response = await http.get(pdfUrl, {
    responseType: 'arraybuffer',
    timeout: 30000, // longer than the client default for this download
  });

  const parser = new PDFParse({ data: response.data });
  try {
    const result = await parser.getText();
    return result.text;
  } finally {
    // Release the parser even when text extraction fails.
    await parser.destroy();
  }
}
|
||||
|
||||
/**
 * Parse a detail page, preferring the text of an attached PDF over the HTML body.
 *
 * The PDF URL comes from the jszbcg.com API when the page is on that site, and
 * otherwise (or when the API yields nothing) from an embedded viewer iframe. If no
 * PDF is found or it cannot be read, the HTML content is used.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} pageUrl - URL the HTML was fetched from.
 * @returns {Promise<object>} The fields of parseDetail(html), with `content` and
 *   `budget` taken from the PDF when it was parsed, plus `hasPdf` and `pdfUrl`
 *   (null unless the PDF was parsed).
 */
async function parseDetailEnhanced(html, pageUrl) {
  // Parse the HTML once; it supplies both the base fields and the fallback content.
  const basicInfo = parseDetail(html);

  let pdfUrl = null;
  if (pageUrl.includes('jszbcg.com')) {
    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
  }
  if (!pdfUrl) {
    pdfUrl = extractPdfUrl(html, pageUrl);
  }

  let content = basicInfo.content;
  let pdfParsed = false;

  if (pdfUrl) {
    try {
      content = await fetchPdfContent(pdfUrl);
      pdfParsed = true;
    } catch (err) {
      // Fall back to the HTML text, but leave a trace of why the PDF was skipped.
      console.error(`PDF解析失败,改用HTML正文: ${err.message}`);
    }
  }

  return {
    ...basicInfo,
    content,
    budget: extractBudget(content),
    hasPdf: pdfParsed,
    pdfUrl: pdfParsed ? pdfUrl : null,
  };
}
|
||||
|
||||
// 定时任务执行函数
|
||||
async function executeScheduledTask(config) {
|
||||
try {
|
||||
@@ -432,7 +220,7 @@ async function executeScheduledTask(config) {
|
||||
|
||||
const timeRange = config.scheduler.timeRange || 'thisMonth';
|
||||
const { startDate, endDate } = getDateRangeByType(timeRange);
|
||||
const threshold = config.scheduler.threshold || 100000; // 默认10亿(100000万元)
|
||||
const threshold = config.scheduler.threshold || 10000; // 默认1亿(10000万元)
|
||||
|
||||
const timeRangeNames = {
|
||||
'today': '今日',
|
||||
@@ -441,64 +229,39 @@ async function executeScheduledTask(config) {
|
||||
};
|
||||
console.log(`采集时间段: ${timeRangeNames[timeRange] || '本月'}`);
|
||||
console.log(`采集时间范围: ${startDate} 至 ${endDate}`);
|
||||
console.log(`金额阈值: ${threshold}万元 (${threshold / 10000}亿元)`);
|
||||
console.log(`金额阈值: ${threshold}万元 (${(threshold / 10000).toFixed(2)}亿元)`);
|
||||
|
||||
// 采集列表
|
||||
const items = await fetchListByDateRange(startDate, endDate, 23);
|
||||
// 采集列表(直接包含合同估算价)
|
||||
const items = await fetchListByDateRange(startDate, endDate, 50);
|
||||
|
||||
if (items.length === 0) {
|
||||
console.log('本月暂无公告数据');
|
||||
console.log('暂无公告数据');
|
||||
return;
|
||||
}
|
||||
|
||||
// 采集详情
|
||||
console.log('========================================');
|
||||
console.log(`开始采集 ${items.length} 条公告的详情...`);
|
||||
const results = [];
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i];
|
||||
try {
|
||||
console.log(`[${i + 1}/${items.length}] 正在采集: ${item.title}`);
|
||||
const html = await fetchHtml(item.href);
|
||||
const detail = await parseDetailEnhanced(html, item.href);
|
||||
results.push({
|
||||
...item,
|
||||
detail,
|
||||
});
|
||||
await new Promise((resolve) => setTimeout(resolve, 500));
|
||||
} catch (err) {
|
||||
console.error(`采集失败: ${err.message}`);
|
||||
results.push({
|
||||
...item,
|
||||
detail: null,
|
||||
error: err.message,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// 筛选大于阈值的项目
|
||||
const filtered = results.filter((item) => {
|
||||
return item.detail?.budget && item.detail.budget.amount > threshold;
|
||||
const filtered = items.filter((item) => {
|
||||
return item.budget && item.budget.amount > threshold;
|
||||
});
|
||||
|
||||
console.log('========================================');
|
||||
console.log(`筛选结果: 找到 ${filtered.length} 个大于 ${threshold}万元 的项目`);
|
||||
|
||||
if (filtered.length === 0) {
|
||||
console.log('本月暂无符合条件的大额项目');
|
||||
console.log('暂无符合条件的大额项目');
|
||||
return;
|
||||
}
|
||||
|
||||
// 计算总金额
|
||||
const total = filtered.reduce(
|
||||
(sum, item) => sum + (item.detail.budget?.amount || 0),
|
||||
(sum, item) => sum + (item.budget?.amount || 0),
|
||||
0
|
||||
);
|
||||
|
||||
// 生成报告
|
||||
const report = {
|
||||
summary: {
|
||||
total_count: results.length,
|
||||
total_count: items.length,
|
||||
filtered_count: filtered.length,
|
||||
threshold: `${threshold}万元`,
|
||||
total_amount: `${total.toFixed(2)}万元`,
|
||||
@@ -506,10 +269,11 @@ async function executeScheduledTask(config) {
|
||||
date_range: { startDate, endDate },
|
||||
},
|
||||
projects: filtered.map((item) => ({
|
||||
bidNo: item.bidNo,
|
||||
title: item.title,
|
||||
bidName: item.bidName,
|
||||
date: item.date,
|
||||
publish_time: item.detail.publishTime,
|
||||
budget: item.detail.budget,
|
||||
budget: item.budget,
|
||||
url: item.href,
|
||||
})),
|
||||
};
|
||||
@@ -616,7 +380,8 @@ export function getSchedulerStatus() {
|
||||
config: config ? {
|
||||
enabled: config.scheduler?.enabled || false,
|
||||
cronTime: config.scheduler?.cronTime || '0 9 * * *',
|
||||
threshold: config.scheduler?.threshold || 100000,
|
||||
threshold: config.scheduler?.threshold || 10000,
|
||||
timeRange: config.scheduler?.timeRange || 'thisMonth',
|
||||
} : null,
|
||||
};
|
||||
}
|
||||
|
||||
580
src/server.js
580
src/server.js
@@ -6,7 +6,6 @@ import * as cheerio from 'cheerio';
|
||||
import iconv from 'iconv-lite';
|
||||
import { sendReportEmail } from './emailService.js';
|
||||
import { initScheduler, runTaskNow, reloadScheduler, getSchedulerStatus } from './scheduler.js';
|
||||
import { extractBudgetWithLLM, testLLMConnection, getLLMStatus, isLLMEnabled } from './llmService.js';
|
||||
|
||||
const app = express();
|
||||
const PORT = process.env.PORT || 5000;
|
||||
@@ -15,15 +14,15 @@ app.use(cors());
|
||||
app.use(express.json());
|
||||
app.use(express.static('public'));
|
||||
|
||||
// Nanjing public resource trading platform - housing & municipal construction
// tender announcements (房建市政招标公告). Only the post-change URL is kept: declaring
// BASE_URL twice is a redeclaration error.
const BASE_URL = 'https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/';
|
||||
|
||||
/**
 * Build the URL of one page of the announcement list.
 *
 * Page 1 is served as `moreinfo.html`; every other page as `<pageIndex>.html`.
 *
 * @param {number} pageIndex - 1-based page number.
 * @param {string} [baseUrl=BASE_URL] - Directory URL of the list; a trailing slash is
 *   added when missing. Optional, kept so older two-argument callers still work.
 * @returns {string} Absolute URL of the requested list page.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
  const root = baseUrl.endsWith('/') ? baseUrl : `${baseUrl}/`;
  return pageIndex === 1 ? `${root}moreinfo.html` : `${root}${pageIndex}.html`;
}
|
||||
|
||||
// 检查日期是否在范围内
|
||||
@@ -38,23 +37,23 @@ function isDateInRange(dateStr, startDate, endDate) {
|
||||
}
|
||||
|
||||
// 按时间范围采集多页列表
|
||||
async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
|
||||
async function fetchListByDateRange(startDate, endDate, maxPages = 50) {
|
||||
const allItems = [];
|
||||
let shouldContinue = true;
|
||||
let pageIndex = 0;
|
||||
let pageIndex = 1;
|
||||
|
||||
console.log(`开始按时间范围采集: ${startDate || '不限'} 至 ${endDate || '不限'}`);
|
||||
|
||||
while (shouldContinue && pageIndex < maxPages) {
|
||||
while (shouldContinue && pageIndex <= maxPages) {
|
||||
const pageUrl = getPageUrl(pageIndex);
|
||||
console.log(`正在采集第 ${pageIndex + 1} 页: ${pageUrl}`);
|
||||
console.log(`正在采集第 ${pageIndex} 页: ${pageUrl}`);
|
||||
|
||||
try {
|
||||
const html = await fetchHtml(pageUrl);
|
||||
const items = parseList(html);
|
||||
|
||||
if (items.length === 0) {
|
||||
console.log(`第 ${pageIndex + 1} 页没有数据,停止采集`);
|
||||
console.log(`第 ${pageIndex} 页没有数据,停止采集`);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -74,32 +73,32 @@ async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
|
||||
}
|
||||
|
||||
if (allItemsBeforeRange && startDate) {
|
||||
console.log(`第 ${pageIndex + 1} 页所有项目都早于起始日期,停止采集`);
|
||||
console.log(`第 ${pageIndex} 页所有项目都早于起始日期,停止采集`);
|
||||
shouldContinue = false;
|
||||
}
|
||||
|
||||
console.log(`第 ${pageIndex + 1} 页找到 ${items.length} 条,符合条件 ${hasItemsInRange ? '有' : '无'}`);
|
||||
console.log(`第 ${pageIndex} 页找到 ${items.length} 条,符合条件 ${hasItemsInRange ? '有' : '无'}`);
|
||||
|
||||
pageIndex++;
|
||||
|
||||
if (shouldContinue && pageIndex < maxPages) {
|
||||
if (shouldContinue && pageIndex <= maxPages) {
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`采集第 ${pageIndex + 1} 页失败: ${err.message}`);
|
||||
console.error(`采集第 ${pageIndex} 页失败: ${err.message}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`总共采集了 ${pageIndex} 页,找到 ${allItems.length} 条符合条件的公告`);
|
||||
console.log(`总共采集了 ${pageIndex - 1} 页,找到 ${allItems.length} 条符合条件的公告`);
|
||||
return allItems;
|
||||
}
|
||||
|
||||
// Shared axios client. Responses are requested as raw bytes (arraybuffer) rather
// than pre-decoded text. The superseded 10 s timeout and scraper User-Agent
// entries were duplicate keys and have been dropped in favour of the new values.
const http = axios.create({
  responseType: 'arraybuffer',
  timeout: 15000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  },
});
|
||||
|
||||
@@ -118,352 +117,70 @@ async function fetchHtml(url) {
|
||||
return html;
|
||||
}
|
||||
|
||||
/**
 * Resolve the signed PDF URL of a jszbcg.com bulletin through the jszbtb.com data API.
 *
 * Expected page URL shape:
 *   https://www.jszbcg.com/#/bulletinDetails/<category>/<hex id>?bulletinType=<n>
 *
 * @param {string} pageUrl - Bulletin detail page URL.
 * @returns {Promise<string|null>} The signed PDF URL, or null when the id cannot be
 *   extracted, the API reports no PDF, or the request/parse fails (all logged).
 */
async function fetchPdfUrlFromApi(pageUrl) {
  try {
    const idMatch = pageUrl.match(/bulletinDetails\/[^\/]+\/([a-f0-9]+)/i);
    const typeMatch = pageUrl.match(/bulletinType=(\d+)/);

    if (idMatch === null) {
      console.log('无法从URL中提取公告ID');
      return null;
    }

    const [, bulletinId] = idMatch;
    const bulletinType = typeMatch === null ? '1' : typeMatch[1];

    // Fetch the bulletin record from the data gateway.
    const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;
    console.log(`调用API获取公告详情: ${apiUrl}`);

    const requestOptions = {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
        'Referer': 'https://www.jszbcg.com/'
      },
      responseType: 'arraybuffer'
    };
    const response = await http.get(apiUrl, requestOptions);

    // The client returns raw bytes; decode and parse them.
    const data = JSON.parse(iconv.decode(response.data, 'utf-8'));

    const signedPdfUrl = data.success && data.data && data.data.signedPdfUrl;
    if (!signedPdfUrl) {
      console.log('API返回数据中没有PDF URL');
      return null;
    }

    console.log(`成功从API获取PDF URL: ${signedPdfUrl}`);
    return signedPdfUrl;
  } catch (err) {
    console.error(`从API获取PDF URL失败: ${err.message}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Fallback: find the PDF URL embedded in a viewer iframe (`...viewer.html?file=<pdf>`).
 *
 * Every iframe on the page is inspected in document order and the first one whose
 * `src` carries a usable `file=` parameter wins. The previous version looked only
 * at the first iframe; its narrower fallback selectors could never match, because
 * they ran only when the page had no iframe at all.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} pageUrl - URL of the page, used to resolve relative PDF paths.
 * @returns {string|null} PDF URL (absolute http/https values are returned as
 *   written), or null when no iframe yields one.
 */
function extractPdfUrl(html, pageUrl) {
  const $ = cheerio.load(html);

  for (const frame of $('iframe').toArray()) {
    const src = $(frame).attr('src');
    // Pull the PDF location out of `viewer.html?file=xxx.pdf`.
    const match = src ? src.match(/[?&]file=([^&]+)/) : null;
    if (!match) {
      continue;
    }

    try {
      // decodeURIComponent throws on malformed escapes; handle that together
      // with URL resolution failures and try the next iframe.
      const pdfUrl = decodeURIComponent(match[1]);
      if (pdfUrl.startsWith('http://') || pdfUrl.startsWith('https://')) {
        return pdfUrl;
      }
      return new URL(pdfUrl, pageUrl).toString();
    } catch (err) {
      console.error(`URL拼接失败: ${err.message}`);
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Download a PDF and return its extracted text (pdf-parse v2 API).
 *
 * @param {string} pdfUrl - Absolute URL of the PDF.
 * @returns {Promise<string>} Text content reported by pdf-parse.
 * @throws Rethrows download and parse errors after logging them.
 */
async function fetchPdfContent(pdfUrl) {
  console.log(`正在下载PDF: ${pdfUrl}`);
  let parser = null;

  try {
    const { PDFParse } = await import('pdf-parse');

    const response = await http.get(pdfUrl, {
      responseType: 'arraybuffer',
      timeout: 30000, // PDFs can be large, so allow more time than the client default
    });

    parser = new PDFParse({ data: response.data });
    const result = await parser.getText();

    console.log(`PDF解析成功,文本长度: ${result.text.length}`);
    return result.text;
  } catch (err) {
    console.error(`PDF下载或解析失败: ${err.message}`);
    console.error(err.stack);
    throw err;
  } finally {
    // Release the parser even when text extraction fails.
    if (parser) {
      await parser.destroy();
    }
  }
}
|
||||
|
||||
/**
 * Parse a list page of the Nanjing public resource trading platform.
 *
 * Each entry is an `<li class="ewb-info-item2" onclick="window.open('<detail URL>');">`
 * with at least five `div.ewb-info-num2` cells: section number, project name,
 * section name, estimated contract price and publish date. Rows with fewer cells,
 * a date that is not YYYY-MM-DD, or a non-numeric price are skipped.
 *
 * @param {string} html - Raw HTML of the list page.
 * @returns {Array<{bidNo: string, title: string, bidName: string, budget: {amount: number, unit: string}, date: string, href: string}>}
 *   One item per valid row; `href` is '' when the row has no recognisable onclick link.
 */
function parseList(html) {
  const $ = cheerio.load(html);
  const items = [];

  // Text of the <p> inside a cell, optionally preferring its `title` attribute.
  const cellText = (cell, preferTitle) => {
    const paragraph = $(cell).find('p');
    return (preferTitle && paragraph.attr('title')) || paragraph.text().trim();
  };

  $('li.ewb-info-item2').each((_, row) => {
    const $row = $(row);
    const cells = $row.find('div.ewb-info-num2');
    if (cells.length < 5) {
      return;
    }

    const bidNo = cellText(cells[0], true);
    const projectName = cellText(cells[1], true);
    const bidName = cellText(cells[2], true);
    const estimatedPrice = cellText(cells[3], false);
    const publishDate = cellText(cells[4], false);

    // The detail link lives in the row's onclick handler.
    const onclick = $row.attr('onclick') || '';
    const hrefMatch = onclick.match(/window\.open\(['"]([^'"]+)['"]\)/);
    let href = '';
    if (hrefMatch) {
      try {
        // Resolves host-relative and document-relative links alike.
        href = new URL(hrefMatch[1], BASE_URL).toString();
      } catch {
        href = hrefMatch[1];
      }
    }

    // Publish date must be YYYY-MM-DD.
    if (!/^\d{4}-\d{2}-\d{2}$/.test(publishDate)) {
      return;
    }

    // Estimated contract price; thousands separators are stripped before parsing.
    // NOTE(review): the unit is recorded as 万元 as in the original - confirm
    // against the site's column header.
    const price = Number.parseFloat(estimatedPrice.replace(/,/g, ''));
    if (Number.isNaN(price)) {
      return;
    }

    items.push({
      bidNo, // section number
      title: projectName, // project name
      bidName, // section name
      budget: {
        amount: price,
        unit: '万元'
      },
      date: publishDate,
      href
    });
  });

  return items;
}
|
||||
|
||||
/**
 * Extract the title, publish time, main text and budget from a detail page.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @returns {{title: string, publishTime: string, content: string, budget: (Object|null)}}
 *   `title` and `publishTime` are empty strings when nothing matched;
 *   `budget` is whatever `extractBudget(content)` returns.
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  // Title: the first selector that yields non-empty text wins.
  const title =
    $('.title18').text().trim() ||
    $('.article-info h1').text().trim() ||
    $('h1').first().text().trim();

  // Publish time, first attempt: a table cell mentioning both the publishing
  // department and the publish time, carrying a full timestamp with seconds.
  const publishTd = $('td:contains("发布部门")').filter((_, el) => {
    return $(el).text().includes('发布时间');
  });
  const publishText = publishTd.text().trim();
  let timeMatch = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
  let publishTime = timeMatch ? timeMatch[1] : '';

  // Second attempt: a minute-precision timestamp in the info bar, or anywhere
  // in the page when the info bar has no text.
  if (!publishTime) {
    const infoText = $('.info-sources').text() || $('body').text();
    timeMatch = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
    publishTime = timeMatch ? timeMatch[1] : '';
  }

  // Main text: keep the longest text found among the dedicated content
  // containers. <body> is deliberately NOT part of this comparison: its text
  // contains every other candidate, so it would always win and make the
  // specific selectors pointless.
  const contentSelectors = [
    '.zhenwen td', // legacy layout
    '.con', // newer layout
    '.article-content', // generic layout
    '.ewb-article-content',
  ];

  let content = '';
  for (const selector of contentSelectors) {
    const text = $(selector).first().text().trim();
    if (text.length > content.length) {
      content = text;
    }
  }

  // Last resort only: fall back to the whole page text.
  if (!content) {
    content = $('body').first().text().trim();
  }

  const budget = extractBudget(content);

  return {
    title,
    publishTime,
    content,
    budget,
  };
}
|
||||
|
||||
/**
 * Enhanced detail parser: prefers text extracted from an attached PDF and
 * prefers LLM-based budget extraction, degrading to plain HTML text and the
 * regex extractor respectively.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} pageUrl - URL the HTML was fetched from.
 * @returns {Promise<Object>} The fields of `parseDetail(html)` with `content`
 *   and `budget` overridden, plus `hasPdf` (true only when PDF text was
 *   actually extracted) and `pdfUrl` (null unless `hasPdf`).
 */
async function parseDetailEnhanced(html, pageUrl) {
  // Parse the HTML once; the result serves both as the fallback content and as
  // the source of the basic fields (title, publish time, ...).
  const basicInfo = parseDetail(html);

  let pdfUrl = null;

  // For jszbcg.com pages, ask the API for the PDF URL first.
  if (pageUrl.includes('jszbcg.com')) {
    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
  }

  // Otherwise, or when the API gave nothing, look for a PDF link in the HTML.
  if (!pdfUrl) {
    pdfUrl = extractPdfUrl(html, pageUrl);
  }

  let content = basicInfo.content;
  let pdfParsed = false;

  if (pdfUrl) {
    console.log(`发现PDF: ${pdfUrl}`);
    try {
      content = await fetchPdfContent(pdfUrl);
      pdfParsed = true;
      console.log(`成功从PDF提取内容,长度: ${content.length}`);
    } catch (err) {
      console.error('PDF解析失败,回退到HTML解析:', err.message);
      // Fall back to the text taken from the HTML page.
      content = basicInfo.content;
    }
  }

  // Budget: LLM first, regex when the LLM is disabled, returns nothing or throws.
  let budget = null;
  if (isLLMEnabled()) {
    console.log('使用 LLM 提取金额...');
    try {
      budget = await extractBudgetWithLLM(content);
    } catch (err) {
      // An LLM error must not abort the whole detail; degrade to regex instead.
      console.error('LLM 提取异常:', err.message);
      budget = null;
    }
    if (budget) {
      console.log(`LLM 提取成功: ${budget.amount} ${budget.unit}`);
    } else {
      console.log('LLM 提取失败,降级到正则表达式');
    }
  }

  if (!budget) {
    budget = extractBudget(content);
    if (budget) {
      budget.source = 'regex'; // record where the value came from
    }
  }

  return {
    ...basicInfo,
    content,
    budget,
    hasPdf: pdfParsed,
    pdfUrl: pdfParsed ? pdfUrl : null,
  };
}
|
||||
|
||||
/**
 * Extract a monetary amount from free text using prioritized regex patterns.
 *
 * Patterns are tried from highest to lowest priority; for each pattern only
 * its first occurrence in the text is considered. The first pattern whose
 * occurrence yields an amount inside the accepted range wins. Amounts written
 * in 元 are converted to 万元.
 *
 * Both half-width and full-width forms of the yen sign, the thousands
 * separator and parentheses are recognized.
 *
 * @param {string} content - Text to search.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   `amount` is expressed in 万元, `text` is the matched snippet and
 *   `originalUnit` is the unit found in the text. Returns null when nothing
 *   usable is found or when `content` is not a non-empty string.
 */
function extractBudget(content) {
  if (typeof content !== 'string' || content.length === 0) {
    return null;
  }

  // Join digits that were split across lines (e.g. "1\n1\n0\n9\n0\n0" ->
  // "110900").
  const cleanedContent = content.replace(/(\d)\s*[\n\r]\s*(?=\d)/g, '$1');

  // \u00A5 / \uFFE5: half-/full-width yen sign; \uFF0C: full-width comma;
  // \uFF08 / \uFF09: full-width parentheses.
  // Ordered from highest to lowest priority. `divider` marks amounts given in
  // 元 that must be converted to 万元.
  const patterns = [
    // 1: currency marker followed by an amount in 万元
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*万元/ },
    // 2: parenthesized amount with a yen sign (元)
    { regex: /[(\uFF08][\u00A5\uFFE5]([\d,\uFF0C]+(?:\.\d+)?)[)\uFF09]/, divider: 10000 },
    // 3: plain amount in 万元
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*万元/ },
    // 4: currency marker followed by an amount in 元
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*元/, divider: 10000 },
    // 5: plain amount in 元; "元整" is excluded to avoid matching the tail of a
    //    spelled-out amount
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*元(?!整)/, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    const match = cleanedContent.match(regex);
    if (!match) continue;

    // Strip thousands separators before parsing.
    const parsed = Number.parseFloat(match[1].replace(/[,\uFF0C]/g, ''));
    const amount = divider ? parsed / divider : parsed;

    // Plausibility check: between 0.01 and 100000000 万元.
    if (!Number.isNaN(amount) && amount >= 0.01 && amount <= 100000000) {
      return {
        amount,
        unit: '万元',
        text: match[0],
        originalUnit: divider ? '元' : '万元',
      };
    }
  }

  return null;
}
|
||||
|
||||
// API 路由
|
||||
|
||||
// 获取列表
|
||||
app.get('/api/list', async (req, res) => {
|
||||
try {
|
||||
const baseUrl = req.query.url || BASE_URL;
|
||||
const page = parseInt(req.query.page) || 1;
|
||||
const pageUrl = getPageUrl(page);
|
||||
|
||||
// 根据页码构建URL
|
||||
let url = baseUrl;
|
||||
if (page > 1) {
|
||||
// 移除baseUrl末尾的斜杠
|
||||
const cleanBaseUrl = baseUrl.replace(/\/$/, '');
|
||||
url = `${cleanBaseUrl}/index_${page - 1}.html`;
|
||||
}
|
||||
|
||||
const html = await fetchHtml(url);
|
||||
const html = await fetchHtml(pageUrl);
|
||||
const items = parseList(html);
|
||||
res.json({ success: true, data: items, page });
|
||||
} catch (error) {
|
||||
@@ -474,7 +191,7 @@ app.get('/api/list', async (req, res) => {
|
||||
// 按时间范围获取列表
|
||||
app.post('/api/list-daterange', async (req, res) => {
|
||||
try {
|
||||
const { startDate, endDate, maxPages = 23 } = req.body;
|
||||
const { startDate, endDate, maxPages = 50 } = req.body;
|
||||
const items = await fetchListByDateRange(startDate, endDate, maxPages);
|
||||
res.json({ success: true, data: items });
|
||||
} catch (error) {
|
||||
@@ -482,100 +199,50 @@ app.post('/api/list-daterange', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * POST /api/details
 *
 * Body: `{ items: Array<{href: string, ...}>, limit?: number }` (limit
 * defaults to 10). Fetches and parses the detail page of the first `limit`
 * items, one after another.
 *
 * Responds with `{ success: true, data }` where every entry is the original
 * item plus `detail`; when an item fails, its `detail` is null and `error`
 * holds the failure message, so one bad page never fails the whole request.
 * Responds with 400 when `items` is not an array.
 */
app.post('/api/details', async (req, res) => {
  try {
    const { items, limit = 10 } = req.body ?? {};

    // A malformed request is a client error, not a server failure.
    if (!Array.isArray(items)) {
      return res.status(400).json({ success: false, error: '请提供 items 数组' });
    }

    const results = [];
    const toFetch = items.slice(0, limit);

    for (const item of toFetch) {
      try {
        const html = await fetchHtml(item.href);
        const detail = await parseDetailEnhanced(html, item.href);
        results.push({
          ...item,
          detail,
        });
        // Pause between successful fetches to stay gentle on the remote site.
        await new Promise((resolve) => setTimeout(resolve, 500));
      } catch (err) {
        results.push({
          ...item,
          detail: null,
          error: err.message,
        });
      }
    }

    res.json({ success: true, data: results });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  }
});
|
||||
|
||||
// 生成报告
|
||||
app.post('/api/report', async (req, res) => {
|
||||
try {
|
||||
const { limit = 15, threshold = 50, url } = req.body;
|
||||
const targetUrl = url && url.trim() !== '' ? url : BASE_URL;
|
||||
const { limit = 50, threshold = 50 } = req.body;
|
||||
|
||||
// 按需采集多页以获取足够的数据
|
||||
// 采集列表
|
||||
const items = [];
|
||||
let pageIndex = 0;
|
||||
const maxPagesToFetch = Math.ceil(limit / 10) + 1; // 假设每页约10条,多抓一页保险
|
||||
let pageIndex = 1;
|
||||
const maxPagesToFetch = Math.ceil(limit / 10) + 1;
|
||||
|
||||
while (items.length < limit && pageIndex < maxPagesToFetch) {
|
||||
const pageUrl = getPageUrl(pageIndex, targetUrl);
|
||||
console.log(`正在采集第 ${pageIndex + 1} 页: ${pageUrl}`);
|
||||
while (items.length < limit && pageIndex <= maxPagesToFetch) {
|
||||
const pageUrl = getPageUrl(pageIndex);
|
||||
console.log(`正在采集第 ${pageIndex} 页: ${pageUrl}`);
|
||||
|
||||
try {
|
||||
const html = await fetchHtml(pageUrl);
|
||||
const pageItems = parseList(html);
|
||||
|
||||
if (pageItems.length === 0) {
|
||||
console.log(`第 ${pageIndex + 1} 页没有数据,停止采集`);
|
||||
console.log(`第 ${pageIndex} 页没有数据,停止采集`);
|
||||
break;
|
||||
}
|
||||
|
||||
items.push(...pageItems);
|
||||
pageIndex++;
|
||||
|
||||
if (items.length < limit && pageIndex < maxPagesToFetch) {
|
||||
if (items.length < limit && pageIndex <= maxPagesToFetch) {
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`采集第 ${pageIndex + 1} 页失败: ${err.message}`);
|
||||
console.error(`采集第 ${pageIndex} 页失败: ${err.message}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const results = [];
|
||||
const toFetch = items.slice(0, limit);
|
||||
|
||||
for (const item of toFetch) {
|
||||
try {
|
||||
const html = await fetchHtml(item.href);
|
||||
const detail = await parseDetailEnhanced(html, item.href);
|
||||
results.push({
|
||||
...item,
|
||||
detail,
|
||||
});
|
||||
await new Promise((resolve) => setTimeout(resolve, 500));
|
||||
} catch (err) {
|
||||
results.push({
|
||||
...item,
|
||||
detail: null,
|
||||
error: err.message,
|
||||
});
|
||||
}
|
||||
}
|
||||
const results = items.slice(0, limit);
|
||||
|
||||
// 按阈值筛选
|
||||
const filtered = results.filter((item) => {
|
||||
return item.detail?.budget && item.detail.budget.amount > threshold;
|
||||
return item.budget && item.budget.amount > threshold;
|
||||
});
|
||||
|
||||
const total = filtered.reduce(
|
||||
(sum, item) => sum + (item.detail.budget?.amount || 0),
|
||||
(sum, item) => sum + (item.budget?.amount || 0),
|
||||
0
|
||||
);
|
||||
|
||||
@@ -588,10 +255,11 @@ app.post('/api/report', async (req, res) => {
|
||||
generated_at: new Date().toISOString(),
|
||||
},
|
||||
projects: filtered.map((item) => ({
|
||||
bidNo: item.bidNo,
|
||||
title: item.title,
|
||||
bidName: item.bidName,
|
||||
date: item.date,
|
||||
publish_time: item.detail.publishTime,
|
||||
budget: item.detail.budget,
|
||||
budget: item.budget,
|
||||
url: item.href,
|
||||
})),
|
||||
};
|
||||
@@ -605,7 +273,7 @@ app.post('/api/report', async (req, res) => {
|
||||
// 按时间范围生成报告
|
||||
app.post('/api/report-daterange', async (req, res) => {
|
||||
try {
|
||||
const { startDate, endDate, threshold = 50, maxPages = 23 } = req.body;
|
||||
const { startDate, endDate, threshold = 50, maxPages = 50 } = req.body;
|
||||
|
||||
// 按时间范围采集列表
|
||||
const items = await fetchListByDateRange(startDate, endDate, maxPages);
|
||||
@@ -627,39 +295,19 @@ app.post('/api/report-daterange', async (req, res) => {
|
||||
});
|
||||
}
|
||||
|
||||
// 采集详情
|
||||
const results = [];
|
||||
for (const item of items) {
|
||||
try {
|
||||
const html = await fetchHtml(item.href);
|
||||
const detail = await parseDetailEnhanced(html, item.href);
|
||||
results.push({
|
||||
...item,
|
||||
detail,
|
||||
});
|
||||
await new Promise((resolve) => setTimeout(resolve, 500));
|
||||
} catch (err) {
|
||||
results.push({
|
||||
...item,
|
||||
detail: null,
|
||||
error: err.message,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// 生成报告
|
||||
const filtered = results.filter((item) => {
|
||||
return item.detail?.budget && item.detail.budget.amount > threshold;
|
||||
// 按阈值筛选
|
||||
const filtered = items.filter((item) => {
|
||||
return item.budget && item.budget.amount > threshold;
|
||||
});
|
||||
|
||||
const total = filtered.reduce(
|
||||
(sum, item) => sum + (item.detail.budget?.amount || 0),
|
||||
(sum, item) => sum + (item.budget?.amount || 0),
|
||||
0
|
||||
);
|
||||
|
||||
const report = {
|
||||
summary: {
|
||||
total_count: results.length,
|
||||
total_count: items.length,
|
||||
filtered_count: filtered.length,
|
||||
threshold: `${threshold}万元`,
|
||||
total_amount: `${total.toFixed(2)}万元`,
|
||||
@@ -667,10 +315,11 @@ app.post('/api/report-daterange', async (req, res) => {
|
||||
date_range: { startDate, endDate },
|
||||
},
|
||||
projects: filtered.map((item) => ({
|
||||
bidNo: item.bidNo,
|
||||
title: item.title,
|
||||
bidName: item.bidName,
|
||||
date: item.date,
|
||||
publish_time: item.detail.publishTime,
|
||||
budget: item.detail.budget,
|
||||
budget: item.budget,
|
||||
url: item.href,
|
||||
})),
|
||||
};
|
||||
@@ -725,33 +374,6 @@ app.post('/api/send-email', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * POST /api/test-pdf — diagnostic endpoint for PDF parsing.
 *
 * Body: `{ pdfUrl: string }`. Downloads and parses the PDF, then runs the
 * regex budget extractor on its text.
 *
 * Responds with `{ success: true, data: { contentLength, contentPreview,
 * budget } }`, where `contentPreview` is the first 500 characters of the text.
 * Responds with 400 when `pdfUrl` is missing or is not an absolute http(s)
 * URL, and with 500 when downloading or parsing fails.
 */
app.post('/api/test-pdf', async (req, res) => {
  try {
    const { pdfUrl } = req.body ?? {};

    if (!pdfUrl) {
      return res.status(400).json({ success: false, error: '请提供PDF URL' });
    }

    // The server fetches this caller-supplied URL itself, so accept only
    // absolute http/https URLs.
    let parsedUrl = null;
    try {
      parsedUrl = new URL(pdfUrl);
    } catch {
      parsedUrl = null;
    }
    if (!parsedUrl || !['http:', 'https:'].includes(parsedUrl.protocol)) {
      return res
        .status(400)
        .json({ success: false, error: 'PDF URL 无效,仅支持 http/https' });
    }

    console.log(`测试PDF URL: ${pdfUrl}`);
    const content = await fetchPdfContent(pdfUrl);
    const budget = extractBudget(content);

    res.json({
      success: true,
      data: {
        contentLength: content.length,
        contentPreview: content.substring(0, 500),
        budget,
      },
    });
  } catch (error) {
    console.error('测试PDF失败:', error);
    res.status(500).json({ success: false, error: error.message });
  }
});
|
||||
|
||||
// 获取配置
|
||||
app.get('/api/config', async (req, res) => {
|
||||
try {
|
||||
@@ -767,13 +389,10 @@ app.get('/api/config', async (req, res) => {
|
||||
const configContent = readFileSync(configPath, 'utf-8');
|
||||
const config = JSON.parse(configContent);
|
||||
|
||||
// 不返回敏感信息(密码和API Key)
|
||||
// 不返回敏感信息(密码)
|
||||
if (config.email && config.email.smtpPass) {
|
||||
config.email.smtpPass = '***已配置***';
|
||||
}
|
||||
if (config.llm && config.llm.apiKey) {
|
||||
config.llm.apiKey = '***已配置***';
|
||||
}
|
||||
|
||||
res.json({ success: true, data: config });
|
||||
} catch (error) {
|
||||
@@ -804,11 +423,6 @@ app.post('/api/config', async (req, res) => {
|
||||
newConfig.email.smtpPass = oldConfig.email?.smtpPass || '';
|
||||
}
|
||||
|
||||
// 如果 LLM API Key 是占位符,保留原 API Key
|
||||
if (newConfig.llm && newConfig.llm.apiKey === '***已配置***') {
|
||||
newConfig.llm.apiKey = oldConfig.llm?.apiKey || '';
|
||||
}
|
||||
|
||||
// 保存配置
|
||||
writeFileSync(configPath, JSON.stringify(newConfig, null, 2), 'utf-8');
|
||||
|
||||
@@ -821,26 +435,6 @@ app.post('/api/config', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * GET /api/llm/status
 *
 * Responds with `{ success: true, data }`, where `data` is the value returned
 * by `getLLMStatus()`, or with a 500 and the error message when it throws.
 */
app.get('/api/llm/status', async (req, res) => {
  try {
    res.json({ success: true, data: getLLMStatus() });
  } catch (failure) {
    const payload = { success: false, error: failure.message };
    res.status(500).json(payload);
  }
});
|
||||
|
||||
/**
 * POST /api/llm/test
 *
 * Runs `testLLMConnection()` and mirrors its `success` flag at the top level
 * of the response, with the whole result under `data`. Responds with a 500 and
 * the error message when the call throws.
 */
app.post('/api/llm/test', async (req, res) => {
  try {
    const outcome = await testLLMConnection();
    const payload = { success: outcome.success, data: outcome };
    res.json(payload);
  } catch (failure) {
    res.status(500).json({ success: false, error: failure.message });
  }
});
|
||||
|
||||
// 获取定时任务状态
|
||||
app.get('/api/scheduler/status', async (req, res) => {
|
||||
try {
|
||||
|
||||
Reference in New Issue
Block a user