feat(config): 南京公共资源交易中心

This commit is contained in:
2025-12-15 18:15:05 +08:00
parent 6fc9748009
commit 02e3728c5e
373 changed files with 227 additions and 216925 deletions

View File

@@ -62,19 +62,14 @@ function getDateRangeByType(timeRange) {
return { startDate, endDate };
}
/**
 * Returns the start and end dates of the current month.
 *
 * Kept for compatibility with older code; it simply delegates to
 * getDateRangeByType with the 'thisMonth' range type.
 */
function getCurrentMonthDateRange() {
return getDateRangeByType('thisMonth');
}
// Helper functions copied from server.js
// NOTE(review): BASE_URL is declared twice in this span, and `timeout` and
// 'User-Agent' each appear twice inside the axios options. This looks like
// both sides of a change kept together - confirm which lines are live.
const BASE_URL = 'https://gjzx.nanjing.gov.cn/gggs/';
// Nanjing Public Resource Trading Platform - housing construction and
// municipal engineering tender announcements
const BASE_URL = 'https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/';
// Shared axios client: response bodies are requested as raw bytes
// (arraybuffer) and every request carries the User-Agent header set below.
const http = axios.create({
responseType: 'arraybuffer',
timeout: 10000,
timeout: 15000,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; gjzx-scraper/1.0)',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
},
});
@@ -93,38 +88,55 @@ async function fetchHtml(url) {
return html;
}
/**
 * Builds the URL of a listing page from its page index.
 *
 * NOTE(review): two variants of this function are interleaved in this span.
 * One takes (pageIndex, baseUrl), treats index 0 as the base URL itself and
 * maps other indexes to `<base>/index_<n>.html`. The other takes only
 * pageIndex, maps index 1 to `moreinfo.html` and other indexes to `<n>.html`
 * under BASE_URL. Confirm which variant is live before relying on either.
 */
function getPageUrl(pageIndex, baseUrl = BASE_URL) {
if (pageIndex === 0) {
return baseUrl;
function getPageUrl(pageIndex) {
if (pageIndex === 1) {
return `${BASE_URL}moreinfo.html`;
}
// Drop a single trailing slash so the joined path has exactly one separator.
const cleanBaseUrl = baseUrl.replace(/\/$/, '');
return `${cleanBaseUrl}/index_${pageIndex}.html`;
return `${BASE_URL}${pageIndex}.html`;
}
// 解析列表页HTML提取招标信息
function parseList(html) {
const $ = cheerio.load(html);
const items = [];
$('table tr').each((_, row) => {
$('li.ewb-info-item2').each((_, row) => {
const $row = $(row);
const link = $row.find('td:first-child a').first();
const dateCell = $row.find('td:nth-child(2)');
const cells = $row.find('div.ewb-info-num2');
if (link.length && dateCell.length) {
const title = link.attr('title') || link.text().trim();
const rawHref = link.attr('href') || '';
const dateText = dateCell.text().trim();
if (cells.length >= 5) {
const bidNo = $(cells[0]).find('p').attr('title') || $(cells[0]).find('p').text().trim();
const projectName = $(cells[1]).find('p').attr('title') || $(cells[1]).find('p').text().trim();
const bidName = $(cells[2]).find('p').attr('title') || $(cells[2]).find('p').text().trim();
const estimatedPrice = $(cells[3]).find('p').text().trim();
const publishDate = $(cells[4]).find('p').text().trim();
if (!rawHref || !title || title.length < 5) return;
if (rawHref === './' || rawHref === '../') return;
if (!/^\d{4}-\d{2}-\d{2}$/.test(dateText)) return;
try {
const href = new URL(rawHref, BASE_URL).toString();
items.push({ title, href, date: dateText });
} catch (err) {
return;
const onclick = $row.attr('onclick') || '';
const hrefMatch = onclick.match(/window\.open\(['"]([^'"]+)['"]\)/);
let href = '';
if (hrefMatch) {
href = hrefMatch[1];
if (href.startsWith('/')) {
href = `https://njggzy.nanjing.gov.cn${href}`;
}
}
if (!/^\d{4}-\d{2}-\d{2}$/.test(publishDate)) return;
const price = parseFloat(estimatedPrice);
if (isNaN(price)) return;
items.push({
bidNo,
title: projectName,
bidName,
budget: {
amount: price,
unit: '万元'
},
date: publishDate,
href
});
}
});
@@ -141,23 +153,23 @@ function isDateInRange(dateStr, startDate, endDate) {
return true;
}
async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
async function fetchListByDateRange(startDate, endDate, maxPages = 50) {
const allItems = [];
let shouldContinue = true;
let pageIndex = 0;
let pageIndex = 1;
console.log(`开始按时间范围采集: ${startDate || '不限'}${endDate || '不限'}`);
while (shouldContinue && pageIndex < maxPages) {
while (shouldContinue && pageIndex <= maxPages) {
const pageUrl = getPageUrl(pageIndex);
console.log(`正在采集第 ${pageIndex + 1} 页: ${pageUrl}`);
console.log(`正在采集第 ${pageIndex} 页: ${pageUrl}`);
try {
const html = await fetchHtml(pageUrl);
const items = parseList(html);
if (items.length === 0) {
console.log(`${pageIndex + 1} 页没有数据,停止采集`);
console.log(`${pageIndex} 页没有数据,停止采集`);
break;
}
@@ -177,251 +189,27 @@ async function fetchListByDateRange(startDate, endDate, maxPages = 23) {
}
if (allItemsBeforeRange && startDate) {
console.log(`${pageIndex + 1} 页所有项目都早于起始日期,停止采集`);
console.log(`${pageIndex} 页所有项目都早于起始日期,停止采集`);
shouldContinue = false;
}
console.log(`${pageIndex + 1} 页找到 ${items.length} 条,符合条件 ${hasItemsInRange ? '有' : '无'}`);
console.log(`${pageIndex} 页找到 ${items.length} 条,符合条件 ${hasItemsInRange ? '有' : '无'}`);
pageIndex++;
if (shouldContinue && pageIndex < maxPages) {
if (shouldContinue && pageIndex <= maxPages) {
await new Promise(resolve => setTimeout(resolve, 500));
}
} catch (err) {
console.error(`采集第 ${pageIndex + 1} 页失败: ${err.message}`);
console.error(`采集第 ${pageIndex} 页失败: ${err.message}`);
break;
}
}
console.log(`总共采集了 ${pageIndex} 页,找到 ${allItems.length} 条符合条件的公告`);
console.log(`总共采集了 ${pageIndex - 1} 页,找到 ${allItems.length} 条符合条件的公告`);
return allItems;
}
// parseDetail and its related helpers, brought over from server.js

/**
 * Parses an announcement detail page.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @returns {{title: string, publishTime: string, content: string, budget: *}}
 *   `title` and `publishTime` are '' when nothing matched; `content` is the
 *   longest trimmed text found among the candidate containers; `budget` is
 *   whatever extractBudget(content) returns.
 */
function parseDetail(html) {
  const $ = cheerio.load(html);

  // Title: the first non-empty text among the known title locations.
  const titleLookups = [
    () => $('.title18').text(),
    () => $('.article-info h1').text(),
    () => $('h1').first().text(),
  ];
  let title = '';
  for (const lookup of titleLookups) {
    title = lookup().trim();
    if (title) {
      break;
    }
  }

  // Publish time, preferred source: the table cell that mentions both the
  // publishing department and the publish time (timestamp with seconds).
  const publishText = $('td:contains("发布部门")')
    .filter((_, cell) => $(cell).text().includes('发布时间'))
    .text()
    .trim();
  const fullStamp = publishText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
  let publishTime = fullStamp ? fullStamp[1] : '';

  // Fallback: a minute-precision timestamp from the info block, or from the
  // whole body when the info block has no text.
  if (!publishTime) {
    const infoText = $('.info-sources').text() || $('body').text();
    const shortStamp = infoText.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
    publishTime = shortStamp ? shortStamp[1] : '';
  }

  // Content: keep the longest trimmed text among the candidate containers
  // (ties keep the earlier selector).
  const contentSelectors = [
    '.zhenwen td',
    '.con',
    '.article-content',
    '.ewb-article-content',
    'body',
  ];
  const content = contentSelectors.reduce((longest, selector) => {
    const node = $(selector).first();
    if (node.length === 0) {
      return longest;
    }
    const text = node.text().trim();
    return text.length > longest.length ? text : longest;
  }, '');

  return {
    title,
    publishTime,
    content,
    budget: extractBudget(content),
  };
}
/**
 * Extracts a budget amount from free text and normalises it to 万元
 * (units of ten thousand yuan).
 *
 * Patterns are tried in priority order and the first one that yields a valid
 * amount wins:
 *   1. currency marker (¥ / 人民币) + number + 万元
 *   2. a parenthesised "¥number", interpreted as yuan
 *   3. bare number + 万元
 *   4. currency marker + number + 元
 *   5. bare number + 元 (not followed by 整)
 *
 * Half-width and full-width forms of ¥, parentheses and the thousands comma
 * are all accepted. The previous patterns listed the same half-width
 * character twice, so full-width text was never matched.
 *
 * @param {string} content - Text to scan. Non-string or empty input yields null.
 * @returns {{amount: number, unit: string, text: string, originalUnit: string}|null}
 *   `amount` is expressed in 万元, `text` is the matched snippet and
 *   `originalUnit` is the unit found in the text ('元' or '万元').
 *   Returns null when no amount within [0.01, 100000000] 万元 is found.
 */
function extractBudget(content) {
  if (typeof content !== 'string' || content.length === 0) {
    return null;
  }

  // Re-join numbers that were split across a line break (e.g. "12\n34").
  const cleanedContent = content.replace(/(\d)\s*[\n\r]\s*(?=\d)/g, '$1');

  // Ordered from highest to lowest priority. `divider` converts yuan to 万元.
  // \u00A5 = ¥, \uFFE5 = full-width ¥, \uFF08/\uFF09 = full-width parentheses,
  // \uFF0C = full-width comma.
  const patterns = [
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*万元/i },
    { regex: /[(\uFF08][\u00A5\uFFE5]([\d,\uFF0C]+(?:\.\d+)?)[)\uFF09]/i, divider: 10000 },
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*万元/i },
    { regex: /(?:[\u00A5\uFFE5]|人民币)\s*([\d,\uFF0C]+(?:\.\d+)?)\s*元/i, divider: 10000 },
    { regex: /([\d,\uFF0C]+(?:\.\d+)?)\s*元(?!整)/i, divider: 10000 },
  ];

  for (const { regex, divider } of patterns) {
    const match = cleanedContent.match(regex);
    if (!match) {
      continue;
    }

    const numberStr = match[1].replace(/[,\uFF0C]/g, '');
    const parsed = Number.parseFloat(numberStr);
    const amount = divider ? parsed / divider : parsed;

    // A match with an unusable number does not block lower-priority patterns.
    if (Number.isNaN(amount) || amount < 0.01 || amount > 100000000) {
      continue;
    }

    return {
      amount,
      unit: '万元',
      text: match[0],
      originalUnit: divider ? '元' : '万元',
    };
  }

  return null;
}
/**
 * Looks up the signed PDF URL of a bulletin through the jszbtb gateway API.
 *
 * The bulletin id is taken from the `bulletinDetails/<segment>/<hex id>` part
 * of the page URL and the bulletin type from its `bulletinType=<n>` query
 * parameter (defaulting to '1').
 *
 * This is a best-effort lookup: any failure (network, decoding, JSON) is
 * logged and reported as null so the caller can fall back to other sources.
 *
 * @param {string} pageUrl - URL of the bulletin detail page.
 * @returns {Promise<string|null>} The signed PDF URL, or null when the URL
 *   has no bulletin id, the API reports no PDF, or the request fails.
 */
async function fetchPdfUrlFromApi(pageUrl) {
  const bulletinIdMatch = pageUrl.match(/bulletinDetails\/[^\/]+\/([a-f0-9]+)/i);
  if (!bulletinIdMatch) {
    return null;
  }
  const bulletinTypeMatch = pageUrl.match(/bulletinType=(\d+)/);

  const bulletinId = bulletinIdMatch[1];
  const bulletinType = bulletinTypeMatch ? bulletinTypeMatch[1] : '1';
  const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;

  try {
    const response = await http.get(apiUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
        'Referer': 'https://www.jszbcg.com/',
      },
      responseType: 'arraybuffer',
    });

    // The body arrives as raw bytes; decode it before parsing the JSON.
    const responseText = iconv.decode(response.data, 'utf-8');
    const data = JSON.parse(responseText);

    if (data.success && data.data && data.data.signedPdfUrl) {
      return data.data.signedPdfUrl;
    }
    return null;
  } catch (err) {
    // Still best-effort, but leave a trace instead of failing silently.
    console.warn(`获取PDF地址失败 (${apiUrl}): ${err.message}`);
    return null;
  }
}
/**
 * Finds the PDF that a detail page embeds through a viewer iframe.
 *
 * Every <iframe> is inspected in document order; the first one whose `src`
 * carries a usable `file=<url>` query parameter wins. Previously only the
 * very first iframe was examined, and the "pdf"/"viewer" fallbacks could
 * never run because they were only tried when the page had no iframe at all.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} pageUrl - URL of the page, used to resolve relative PDF paths.
 * @returns {string|null} Absolute PDF URL, or null when none can be derived.
 */
function extractPdfUrl(html, pageUrl) {
  const $ = cheerio.load(html);
  const sources = $('iframe')
    .toArray()
    .map((frame) => $(frame).attr('src'));

  for (const src of sources) {
    if (!src) {
      continue;
    }
    const match = src.match(/[?&]file=([^&]+)/);
    if (!match) {
      continue;
    }

    try {
      // decodeURIComponent throws URIError on malformed escapes and
      // new URL throws TypeError on unresolvable input; both mean this
      // iframe is unusable, so move on to the next one.
      const pdfUrl = decodeURIComponent(match[1]);
      if (pdfUrl.startsWith('http://') || pdfUrl.startsWith('https://')) {
        return pdfUrl;
      }
      return new URL(pdfUrl, pageUrl).toString();
    } catch (err) {
      continue;
    }
  }

  return null;
}
/**
 * Downloads a PDF and returns its extracted text.
 *
 * The pdf-parse module is loaded lazily on first use. The parser is always
 * destroyed, including when text extraction fails, so its resources are not
 * left behind on error.
 *
 * @param {string} pdfUrl - Absolute URL of the PDF document.
 * @returns {Promise<string>} The text extracted from the PDF.
 * @throws Propagates any download, module-loading or parsing error unchanged.
 */
async function fetchPdfContent(pdfUrl) {
  const { PDFParse } = await import('pdf-parse');

  // PDFs can be large, so allow more time than the client's default timeout.
  const response = await http.get(pdfUrl, {
    responseType: 'arraybuffer',
    timeout: 30000,
  });

  const parser = new PDFParse({ data: response.data });
  try {
    const result = await parser.getText();
    return result.text;
  } finally {
    await parser.destroy();
  }
}
/**
 * Parses a detail page, preferring the text of an attached PDF over the HTML
 * body when a PDF can be located and read.
 *
 * PDF lookup order: the gateway API (only for jszbcg.com pages), then a
 * viewer iframe in the HTML. If no PDF is found, or reading it fails, the
 * HTML text from parseDetail is used instead.
 *
 * The HTML is parsed once and reused both for the basic fields and for the
 * fallback content.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} pageUrl - URL of the detail page.
 * @returns {Promise<Object>} The fields returned by parseDetail, with
 *   `content` and `budget` taken from the chosen text, plus `hasPdf`
 *   (whether PDF text was used) and `pdfUrl` (the PDF URL when it was used,
 *   otherwise null).
 */
async function parseDetailEnhanced(html, pageUrl) {
  const basicInfo = parseDetail(html);

  let pdfUrl = null;
  if (pageUrl.includes('jszbcg.com')) {
    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
  }
  if (!pdfUrl) {
    pdfUrl = extractPdfUrl(html, pageUrl);
  }

  // Start from the HTML text and upgrade to the PDF text when available.
  let content = basicInfo.content;
  let pdfParsed = false;
  if (pdfUrl) {
    try {
      content = await fetchPdfContent(pdfUrl);
      pdfParsed = true;
    } catch (err) {
      // Deliberate fallback to the HTML text, but record why it happened.
      console.warn(`PDF解析失败,改用HTML正文 (${pdfUrl}): ${err.message}`);
    }
  }

  return {
    ...basicInfo,
    content,
    budget: extractBudget(content),
    hasPdf: pdfParsed,
    pdfUrl: pdfParsed ? pdfUrl : null,
  };
}
// 定时任务执行函数
async function executeScheduledTask(config) {
try {
@@ -432,7 +220,7 @@ async function executeScheduledTask(config) {
const timeRange = config.scheduler.timeRange || 'thisMonth';
const { startDate, endDate } = getDateRangeByType(timeRange);
const threshold = config.scheduler.threshold || 100000; // 默认10亿(100000万元)
const threshold = config.scheduler.threshold || 10000; // 默认1亿(10000万元)
const timeRangeNames = {
'today': '今日',
@@ -441,64 +229,39 @@ async function executeScheduledTask(config) {
};
console.log(`采集时间段: ${timeRangeNames[timeRange] || '本月'}`);
console.log(`采集时间范围: ${startDate}${endDate}`);
console.log(`金额阈值: ${threshold}万元 (${threshold / 10000}亿元)`);
console.log(`金额阈值: ${threshold}万元 (${(threshold / 10000).toFixed(2)}亿元)`);
// 采集列表
const items = await fetchListByDateRange(startDate, endDate, 23);
// 采集列表(直接包含合同估算价)
const items = await fetchListByDateRange(startDate, endDate, 50);
if (items.length === 0) {
console.log('本月暂无公告数据');
console.log('暂无公告数据');
return;
}
// 采集详情
console.log('========================================');
console.log(`开始采集 ${items.length} 条公告的详情...`);
const results = [];
for (let i = 0; i < items.length; i++) {
const item = items[i];
try {
console.log(`[${i + 1}/${items.length}] 正在采集: ${item.title}`);
const html = await fetchHtml(item.href);
const detail = await parseDetailEnhanced(html, item.href);
results.push({
...item,
detail,
});
await new Promise((resolve) => setTimeout(resolve, 500));
} catch (err) {
console.error(`采集失败: ${err.message}`);
results.push({
...item,
detail: null,
error: err.message,
});
}
}
// 筛选大于阈值的项目
const filtered = results.filter((item) => {
return item.detail?.budget && item.detail.budget.amount > threshold;
const filtered = items.filter((item) => {
return item.budget && item.budget.amount > threshold;
});
console.log('========================================');
console.log(`筛选结果: 找到 ${filtered.length} 个大于 ${threshold}万元 的项目`);
if (filtered.length === 0) {
console.log('本月暂无符合条件的大额项目');
console.log('暂无符合条件的大额项目');
return;
}
// 计算总金额
const total = filtered.reduce(
(sum, item) => sum + (item.detail.budget?.amount || 0),
(sum, item) => sum + (item.budget?.amount || 0),
0
);
// 生成报告
const report = {
summary: {
total_count: results.length,
total_count: items.length,
filtered_count: filtered.length,
threshold: `${threshold}万元`,
total_amount: `${total.toFixed(2)}万元`,
@@ -506,10 +269,11 @@ async function executeScheduledTask(config) {
date_range: { startDate, endDate },
},
projects: filtered.map((item) => ({
bidNo: item.bidNo,
title: item.title,
bidName: item.bidName,
date: item.date,
publish_time: item.detail.publishTime,
budget: item.detail.budget,
budget: item.budget,
url: item.href,
})),
};
@@ -616,7 +380,8 @@ export function getSchedulerStatus() {
config: config ? {
enabled: config.scheduler?.enabled || false,
cronTime: config.scheduler?.cronTime || '0 9 * * *',
threshold: config.scheduler?.threshold || 100000,
threshold: config.scheduler?.threshold || 10000,
timeRange: config.scheduler?.timeRange || 'thisMonth',
} : null,
};
}