diff --git a/package.json b/package.json
index 3a4a6a8..9f895ed 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
     "docx": "^9.5.1",
     "express": "^5.2.1",
     "iconv-lite": "^0.6.3",
-    "nodemailer": "^7.0.11"
+    "nodemailer": "^7.0.11",
+    "pdf-parse": "^2.4.5"
   }
 }
diff --git a/src/server.js b/src/server.js
index 680e249..2afd464 100644
--- a/src/server.js
+++ b/src/server.js
@@ -115,6 +115,139 @@ async function fetchHtml(url) {
   return html;
 }
 
+/**
+ * Resolve the signed PDF URL of a jszbcg.com bulletin through the JSON API.
+ *
+ * @param {string} pageUrl - Bulletin detail page URL.
+ * @returns {Promise<string|null>} The signed PDF URL, or null when the ID
+ *   cannot be extracted, the API returns no PDF URL, or the request fails.
+ */
+async function fetchPdfUrlFromApi(pageUrl) {
+  try {
+    // Extract the bulletin ID and bulletinType from the page URL.
+    // URL format: https://www.jszbcg.com/#/bulletinDetails/招标公告/2c9180899a7e34d2019a95630c931a8e?bulletinType=1
+    const bulletinIdMatch = pageUrl.match(/bulletinDetails\/[^\/]+\/([a-f0-9]+)/i);
+    const bulletinTypeMatch = pageUrl.match(/bulletinType=(\d+)/);
+
+    if (!bulletinIdMatch) {
+      console.log('无法从URL中提取公告ID');
+      return null;
+    }
+
+    const bulletinId = bulletinIdMatch[1];
+    const bulletinType = bulletinTypeMatch ? bulletinTypeMatch[1] : '1';
+
+    // Query the API for the bulletin details.
+    const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;
+    console.log(`调用API获取公告详情: ${apiUrl}`);
+
+    const response = await http.get(apiUrl, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+        'Accept': 'application/json',
+        'Referer': 'https://www.jszbcg.com/'
+      },
+      responseType: 'arraybuffer'
+    });
+
+    // Decode the raw response bytes as UTF-8, then parse the JSON payload.
+    const responseText = iconv.decode(response.data, 'utf-8');
+    const data = JSON.parse(responseText);
+
+    if (data.success && data.data && data.data.signedPdfUrl) {
+      const pdfUrl = data.data.signedPdfUrl;
+      console.log(`成功从API获取PDF URL: ${pdfUrl}`);
+      return pdfUrl;
+    }
+
+    console.log('API返回数据中没有PDF URL');
+    return null;
+  } catch (err) {
+    console.error(`从API获取PDF URL失败: ${err.message}`);
+    return null;
+  }
+}
+
+/**
+ * Extract a PDF URL from an iframe-embedded viewer page (fallback path).
+ *
+ * Scans every iframe and returns the target of the first `file=` query
+ * parameter found in a `src` (e.g. `viewer.html?file=xxx.pdf`).
+ *
+ * @param {string} html - Page HTML.
+ * @param {string} pageUrl - Page URL, used as the base for relative PDF paths.
+ * @returns {string|null} PDF URL, or null when no usable iframe is found.
+ */
+function extractPdfUrl(html, pageUrl) {
+  const $ = cheerio.load(html);
+
+  // Check every iframe: the first one on the page is not necessarily the
+  // PDF viewer, so stopping at it would miss viewers that appear later.
+  for (const el of $('iframe').toArray()) {
+    const src = $(el).attr('src');
+    const match = src ? src.match(/[?&]file=([^&]+)/) : null;
+    if (!match) continue;
+
+    let pdfUrl;
+    try {
+      // decodeURIComponent throws URIError on malformed percent-escapes.
+      pdfUrl = decodeURIComponent(match[1]);
+    } catch (err) {
+      console.error(`URL解码失败: ${err.message}`);
+      continue;
+    }
+
+    // Absolute URLs are returned untouched; relative ones resolve against the page.
+    if (pdfUrl.startsWith('http://') || pdfUrl.startsWith('https://')) {
+      return pdfUrl;
+    }
+    try {
+      return new URL(pdfUrl, pageUrl).toString();
+    } catch (err) {
+      console.error(`URL拼接失败: ${err.message}`);
+      return null;
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Download a PDF and extract its text with pdf-parse v2.
+ *
+ * @param {string} pdfUrl - URL of the PDF to download.
+ * @returns {Promise<string>} Text extracted from the PDF.
+ * @throws Rethrows any download or parsing error after logging it.
+ */
+async function fetchPdfContent(pdfUrl) {
+  console.log(`正在下载PDF: ${pdfUrl}`);
+  try {
+    // pdf-parse v2 is loaded on demand through a dynamic import.
+    const { PDFParse } = await import('pdf-parse');
+
+    const response = await http.get(pdfUrl, {
+      responseType: 'arraybuffer',
+      timeout: 30000, // PDFs can be large, so allow a longer timeout
+    });
+
+    const parser = new PDFParse({ data: response.data });
+    let result;
+    try {
+      result = await parser.getText();
+    } finally {
+      // Always release the parser, even when text extraction fails.
+      await parser.destroy();
+    }
+
+    console.log(`PDF解析成功,文本长度: ${result.text.length}`);
+    return result.text;
+  } catch (err) {
+    console.error(`PDF下载或解析失败: ${err.message}`);
+    console.error(err.stack);
+    throw err;
+  }
+}
+
 function parseList(html) {
   const $ = cheerio.load(html);
   const items = [];
@@ -207,6 +340,63 @@ function parseDetail(html) {
   };
 }
 
+/**
+ * Enhanced parseDetail that prefers PDF text over the HTML body when available.
+ *
+ * The PDF URL is looked up through the jszbcg.com API first (for pages on
+ * that site) and then through iframe extraction. When the PDF cannot be
+ * downloaded or parsed, the content falls back to the HTML detail content.
+ *
+ * @param {string} html - Detail page HTML.
+ * @param {string} pageUrl - Detail page URL.
+ * @returns {Promise<object>} The parseDetail fields with `content` and
+ *   `budget` overridden, plus `hasPdf` and `pdfUrl` (null unless the PDF
+ *   was parsed successfully).
+ */
+async function parseDetailEnhanced(html, pageUrl) {
+  // Parse the HTML once: it supplies the basic fields (title, publish time,
+  // ...) and doubles as the content fallback when no PDF text is available.
+  const basicInfo = parseDetail(html);
+
+  let pdfUrl = null;
+
+  // Prefer the API lookup for jszbcg.com pages.
+  if (pageUrl.includes('jszbcg.com')) {
+    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
+  }
+
+  // Fall back to extracting the PDF URL from the HTML.
+  if (!pdfUrl) {
+    pdfUrl = extractPdfUrl(html, pageUrl);
+  }
+
+  let content = basicInfo.content;
+  let pdfParsed = false;
+
+  if (pdfUrl) {
+    console.log(`发现PDF: ${pdfUrl}`);
+    try {
+      content = await fetchPdfContent(pdfUrl);
+      pdfParsed = true;
+      console.log(`成功从PDF提取内容,长度: ${content.length}`);
+    } catch (err) {
+      // Keep the HTML content already assigned above.
+      console.error('PDF解析失败,回退到HTML解析:', err.message);
+    }
+  }
+
+  // Extract the amount with the existing extractBudget helper.
+  const budget = extractBudget(content);
+
+  return {
+    ...basicInfo,
+    content,
+    budget,
+    hasPdf: pdfParsed,
+    pdfUrl: pdfParsed ? pdfUrl : null,
+  };
+}
+
 function extractBudget(content) {
   // 预处理内容:去除数字之间的换行符和空白字符
   // 这样可以匹配被换行符分隔的数字,例如 "1\n1\n0\n9\n0\n0" -> "110900"
@@ -246,8 +436,8 @@ function extractBudget(content) {
         amount = amount / pattern.divider;
       }
 
-      // 验证金额合理性(0.01万元到1000000万元之间)
-      if (!isNaN(amount) && amount >= 0.01 && amount <= 1000000) {
+      // Sanity-check the amount (between 0.01 and 100000000, in units of 万元)
+      if (!isNaN(amount) && amount >= 0.01 && amount <= 100000000) {
         bestMatch = {
           amount,
           unit: '万元',
@@ -307,7 +497,7 @@ app.post('/api/details', async (req, res) => {
     for (const item of toFetch) {
       try {
         const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
         results.push({
           ...item,
           detail,
@@ -370,7 +560,7 @@ app.post('/api/report', async (req, res) => {
     for (const item of toFetch) {
       try {
         const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
         results.push({
           ...item,
           detail,
@@ -447,7 +637,7 @@ app.post('/api/report-daterange', async (req, res) => {
     for (const item of items) {
       try {
         const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
         results.push({
           ...item,
           detail,
@@ -540,6 +730,53 @@ app.post('/api/send-email', async (req, res) => {
   }
 });
 
+/**
+ * Diagnostic endpoint: download the PDF at `pdfUrl` and report the extracted
+ * text length, a 500-character preview and the detected budget.
+ *
+ * NOTE(review): this endpoint makes the server fetch a caller-supplied URL
+ * (SSRF risk). Only http(s) URLs are accepted here; consider restricting it
+ * to trusted hosts or disabling it outside development.
+ */
+app.post('/api/test-pdf', async (req, res) => {
+  try {
+    // req.body is undefined when no body parser handled the request.
+    const { pdfUrl } = req.body ?? {};
+
+    if (!pdfUrl) {
+      return res.status(400).json({ success: false, error: '请提供PDF URL' });
+    }
+
+    // Reject anything that is not a well-formed http(s) URL with a 400
+    // instead of letting the download fail with a 500.
+    let parsedUrl = null;
+    try {
+      parsedUrl = typeof pdfUrl === 'string' ? new URL(pdfUrl) : null;
+    } catch {
+      // Malformed URL: parsedUrl stays null and is rejected below.
+    }
+    if (!parsedUrl || !['http:', 'https:'].includes(parsedUrl.protocol)) {
+      return res.status(400).json({ success: false, error: '无效的PDF URL' });
+    }
+
+    console.log(`测试PDF URL: ${pdfUrl}`);
+    const content = await fetchPdfContent(pdfUrl);
+    const budget = extractBudget(content);
+
+    res.json({
+      success: true,
+      data: {
+        contentLength: content.length,
+        contentPreview: content.substring(0, 500),
+        budget,
+      },
+    });
+  } catch (error) {
+    console.error('测试PDF失败:', error);
+    res.status(500).json({ success: false, error: error.message });
+  }
+});
+
 app.listen(PORT, () => {
   console.log(`Server running at http://localhost:${PORT}`);
 });