```
feat(server): 增加对PDF公告内容的解析支持 - 新增 `fetchPdfUrlFromApi` 函数,用于从 jszbcg.com 的 API 接口获取 PDF 文件链接 - 新增 `extractPdfUrl` 函数,作为备选方案从 HTML 页面中提取 PDF 地址 - 新增 `fetchPdfContent` 函数,使用 pdf-parse 库下载并解析 PDF 内容 - 新增 `parseDetailEnhanced` 函数,整合 HTML 和 PDF 解析逻辑,优先使用 PDF 内容 - 修改预算金额验证范围上限,从 1000000 万元提升至 100000000 万元 - 在 /api/details、/api/report、/api/report-daterange 接口中启用增强解析逻辑 - 新增 /api/test-pdf 接口用于测试 PDF 解析功能 - 添加 pdf-parse 依赖到 package.json ```
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
"docx": "^9.5.1",
|
||||
"express": "^5.2.1",
|
||||
"iconv-lite": "^0.6.3",
|
||||
"nodemailer": "^7.0.11"
|
||||
"nodemailer": "^7.0.11",
|
||||
"pdf-parse": "^2.4.5"
|
||||
}
|
||||
}
|
||||
|
||||
202
src/server.js
202
src/server.js
@@ -115,6 +115,118 @@ async function fetchHtml(url) {
|
||||
return html;
|
||||
}
|
||||
|
||||
/**
 * Resolve the PDF link of a jszbcg.com bulletin page through the site's JSON API.
 *
 * The bulletin ID and the `bulletinType` query value are read from `pageUrl`,
 * which is expected to look like
 * `https://www.jszbcg.com/#/bulletinDetails/<category>/<hex id>?bulletinType=1`.
 * When `bulletinType` is absent it defaults to '1'.
 *
 * @param {string} pageUrl - URL of the bulletin detail page.
 * @returns {Promise<string|null>} The `signedPdfUrl` field of the API payload,
 *   or null when the ID cannot be extracted, the payload carries no PDF link,
 *   or any step throws (failures are logged, never propagated).
 */
async function fetchPdfUrlFromApi(pageUrl) {
  try {
    const idMatch = pageUrl.match(/bulletinDetails\/[^\/]+\/([a-f0-9]+)/i);
    const typeMatch = pageUrl.match(/bulletinType=(\d+)/);

    if (!idMatch) {
      console.log('无法从URL中提取公告ID');
      return null;
    }

    const [, bulletinId] = idMatch;
    const bulletinType = typeMatch ? typeMatch[1] : '1';

    // Query the bulletin-detail endpoint for this type/ID pair.
    const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;
    console.log(`调用API获取公告详情: ${apiUrl}`);

    const requestOptions = {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
        'Referer': 'https://www.jszbcg.com/',
      },
      responseType: 'arraybuffer',
    };
    const response = await http.get(apiUrl, requestOptions);

    // The body arrives as raw bytes; decode it as UTF-8 before parsing the JSON.
    const payload = JSON.parse(iconv.decode(response.data, 'utf-8'));

    const hasPdfLink = payload.success && payload.data && payload.data.signedPdfUrl;
    if (!hasPdfLink) {
      console.log('API返回数据中没有PDF URL');
      return null;
    }

    const pdfUrl = payload.data.signedPdfUrl;
    console.log(`成功从API获取PDF URL: ${pdfUrl}`);
    return pdfUrl;
  } catch (err) {
    console.error(`从API获取PDF URL失败: ${err.message}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Fallback: locate a PDF URL embedded through an <iframe> viewer in the page HTML.
 *
 * Every iframe is inspected in document order; the first one whose `src`
 * carries a `file=` query parameter (the `viewer.html?file=xxx.pdf` pattern)
 * wins. The parameter value is URL-decoded and, when it is not already an
 * absolute http(s) URL, resolved against `pageUrl`.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} pageUrl - URL of the page, used as the base for relative paths.
 * @returns {string|null} Absolute PDF URL, or null when no iframe yields a
 *   usable `file=` value.
 */
function extractPdfUrl(html, pageUrl) {
  const $ = cheerio.load(html);

  // Iterate over all iframes instead of only the first one: an unrelated
  // iframe placed before the PDF viewer must not hide the viewer.
  for (const element of $('iframe').toArray()) {
    const src = $(element).attr('src');
    if (!src) continue;

    // Pull the PDF location out of viewer.html?file=xxx.pdf
    const match = src.match(/[?&]file=([^&]+)/);
    if (!match) continue;

    let pdfUrl;
    try {
      // decodeURIComponent throws URIError on malformed percent-escapes.
      pdfUrl = decodeURIComponent(match[1]);
    } catch (err) {
      console.error(`PDF地址解码失败: ${err.message}`);
      continue;
    }

    // Relative paths are resolved against the page URL.
    if (!pdfUrl.startsWith('http://') && !pdfUrl.startsWith('https://')) {
      try {
        pdfUrl = new URL(pdfUrl, pageUrl).toString();
      } catch (err) {
        console.error(`URL拼接失败: ${err.message}`);
        continue;
      }
    }

    return pdfUrl;
  }

  return null;
}
|
||||
|
||||
/**
 * Download a PDF and extract its text with pdf-parse v2.
 *
 * @param {string} pdfUrl - Absolute URL of the PDF file.
 * @returns {Promise<string>} The text extracted from the document.
 * @throws Re-throws any download or parsing error after logging it, so the
 *   caller can decide how to fall back.
 */
async function fetchPdfContent(pdfUrl) {
  console.log(`正在下载PDF: ${pdfUrl}`);
  try {
    // pdf-parse v2 is loaded lazily through a dynamic import.
    const { PDFParse } = await import('pdf-parse');

    const response = await http.get(pdfUrl, {
      responseType: 'arraybuffer',
      timeout: 30000, // PDF files can be large, so allow a longer timeout.
    });

    const parser = new PDFParse({ data: response.data });
    let text;
    try {
      ({ text } = await parser.getText());
    } finally {
      // Release the parser even when text extraction fails.
      await parser.destroy();
    }

    console.log(`PDF解析成功,文本长度: ${text.length}`);
    return text;
  } catch (err) {
    console.error(`PDF下载或解析失败: ${err.message}`);
    console.error(err.stack);
    throw err;
  }
}
|
||||
|
||||
function parseList(html) {
|
||||
const $ = cheerio.load(html);
|
||||
const items = [];
|
||||
@@ -207,6 +319,59 @@ function parseDetail(html) {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * PDF-aware variant of `parseDetail`.
 *
 * Looks for a PDF attached to the bulletin (through the jszbcg.com API first,
 * then through an iframe in the HTML). When the PDF can be downloaded and
 * parsed, its text is used as the content for budget extraction; otherwise
 * the content produced by `parseDetail(html)` is used.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} pageUrl - URL the HTML was fetched from.
 * @returns {Promise<object>} The fields of `parseDetail(html)` overridden with
 *   `content`, `budget`, `hasPdf` (true only when PDF text was extracted) and
 *   `pdfUrl` (null unless `hasPdf` is true).
 */
async function parseDetailEnhanced(html, pageUrl) {
  let pdfUrl = null;

  // Prefer the API lookup, which applies to jszbcg.com pages.
  if (pageUrl.includes('jszbcg.com')) {
    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
  }

  // If the API yields nothing, fall back to scanning the HTML.
  if (!pdfUrl) {
    pdfUrl = extractPdfUrl(html, pageUrl);
  }

  let pdfText = null;
  if (pdfUrl) {
    console.log(`发现PDF: ${pdfUrl}`);
    try {
      const text = await fetchPdfContent(pdfUrl);
      console.log(`成功从PDF提取内容,长度: ${text.length}`);
      pdfText = text;
    } catch (err) {
      // Best effort: a broken PDF must not fail the whole detail parse.
      console.error('PDF解析失败,回退到HTML解析:', err.message);
    }
  }
  const pdfParsed = pdfText !== null;

  // Parse the HTML once: it supplies the basic fields (title, publish time,
  // ...) and doubles as the content fallback when no PDF text is available.
  const basicInfo = parseDetail(html);
  const content = pdfParsed ? pdfText : basicInfo.content;

  // Reuse the existing budget extractor on whichever content was chosen.
  const budget = extractBudget(content);

  return {
    ...basicInfo,
    content,
    budget,
    hasPdf: pdfParsed,
    pdfUrl: pdfParsed ? pdfUrl : null,
  };
}
|
||||
|
||||
function extractBudget(content) {
|
||||
// 预处理内容:去除数字之间的换行符和空白字符
|
||||
// 这样可以匹配被换行符分隔的数字,例如 "1\n1\n0\n9\n0\n0" -> "110900"
|
||||
@@ -246,8 +411,8 @@ function extractBudget(content) {
|
||||
amount = amount / pattern.divider;
|
||||
}
|
||||
|
||||
// 验证金额合理性(0.01万元到1000000万元之间)
|
||||
if (!isNaN(amount) && amount >= 0.01 && amount <= 1000000) {
|
||||
// 验证金额合理性(0.01万元到100000000万元之间)
|
||||
if (!isNaN(amount) && amount >= 0.01 && amount <= 100000000) {
|
||||
bestMatch = {
|
||||
amount,
|
||||
unit: '万元',
|
||||
@@ -307,7 +472,7 @@ app.post('/api/details', async (req, res) => {
|
||||
for (const item of toFetch) {
|
||||
try {
|
||||
const html = await fetchHtml(item.href);
|
||||
const detail = parseDetail(html);
|
||||
const detail = await parseDetailEnhanced(html, item.href);
|
||||
results.push({
|
||||
...item,
|
||||
detail,
|
||||
@@ -370,7 +535,7 @@ app.post('/api/report', async (req, res) => {
|
||||
for (const item of toFetch) {
|
||||
try {
|
||||
const html = await fetchHtml(item.href);
|
||||
const detail = parseDetail(html);
|
||||
const detail = await parseDetailEnhanced(html, item.href);
|
||||
results.push({
|
||||
...item,
|
||||
detail,
|
||||
@@ -447,7 +612,7 @@ app.post('/api/report-daterange', async (req, res) => {
|
||||
for (const item of items) {
|
||||
try {
|
||||
const html = await fetchHtml(item.href);
|
||||
const detail = parseDetail(html);
|
||||
const detail = await parseDetailEnhanced(html, item.href);
|
||||
results.push({
|
||||
...item,
|
||||
detail,
|
||||
@@ -540,6 +705,33 @@ app.post('/api/send-email', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
// Diagnostic endpoint: downloads the PDF at `pdfUrl` (JSON body field), runs
// the text extraction and budget detection on it, and returns the content
// length, a 500-character preview and the detected budget.
// Responds 400 for a missing or non-http(s) URL and 500 when download or
// parsing fails.
// NOTE(review): the server fetches a caller-supplied URL, so this route can
// still be pointed at internal hosts (SSRF). Consider restricting it to
// trusted callers or disabling it outside development.
app.post('/api/test-pdf', async (req, res) => {
  try {
    // req.body is undefined when no body parser handled the request; treat
    // that as a missing URL instead of letting the destructuring throw.
    const { pdfUrl } = req.body ?? {};

    if (!pdfUrl) {
      return res.status(400).json({ success: false, error: '请提供PDF URL' });
    }

    // Only plain http(s) URLs are accepted; anything else is a client error.
    let parsedUrl = null;
    if (typeof pdfUrl === 'string') {
      try {
        parsedUrl = new URL(pdfUrl);
      } catch {
        parsedUrl = null;
      }
    }
    if (!parsedUrl || !['http:', 'https:'].includes(parsedUrl.protocol)) {
      return res
        .status(400)
        .json({ success: false, error: 'PDF URL必须是有效的http(s)地址' });
    }

    console.log(`测试PDF URL: ${pdfUrl}`);
    const content = await fetchPdfContent(pdfUrl);
    const budget = extractBudget(content);

    res.json({
      success: true,
      data: {
        contentLength: content.length,
        contentPreview: content.substring(0, 500),
        budget,
      },
    });
  } catch (error) {
    console.error('测试PDF失败:', error);
    res.status(500).json({ success: false, error: error.message });
  }
});
|
||||
|
||||
// Start the HTTP server on PORT and log the local address once it is listening.
app.listen(PORT, function onListening() {
  console.log(`Server running at http://localhost:${PORT}`);
});
|
||||
|
||||
Reference in New Issue
Block a user