```
feat(server): 增加对PDF公告内容的解析支持

- 新增 `fetchPdfUrlFromApi` 函数，用于从 jszbcg.com 的 API 接口获取 PDF 文件链接
- 新增 `extractPdfUrl` 函数，作为备选方案从 HTML 页面中提取 PDF 地址
- 新增 `fetchPdfContent` 函数，使用 pdf-parse 库下载并解析 PDF 内容
- 新增 `parseDetailEnhanced` 函数，整合 HTML 和 PDF 解析逻辑，优先使用 PDF 内容
- 修改预算金额验证范围上限，从 1000000 万元提升至 100000000 万元
- 在 /api/details、/api/report、/api/report-daterange 接口中启用增强解析逻辑
- 新增 /api/test-pdf 接口用于测试 PDF 解析功能
- 添加 pdf-parse 依赖到 package.json
```
2025-12-15 11:40:58 +08:00
parent b044e918aa
commit 3aee6af9ae
2 changed files with 199 additions and 6 deletions
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
    "docx": "^9.5.1",
    "express": "^5.2.1",
    "iconv-lite": "^0.6.3",
-    "nodemailer": "^7.0.11"
+    "nodemailer": "^7.0.11",
+    "pdf-parse": "^2.4.5"
  }
 }
--- a/src/server.js
+++ b/src/server.js
@@ -115,6 +115,118 @@ async function fetchHtml(url) {
  return html;
 }

+/**
+ * Resolve the PDF link of a jszbcg.com bulletin through the bulletin-detail JSON API.
+ *
+ * @param {string} pageUrl - Bulletin page URL, e.g.
+ *   https://www.jszbcg.com/#/bulletinDetails/招标公告/2c9180899a7e34d2019a95630c931a8e?bulletinType=1
+ * @returns {Promise<string|null>} The payload's `data.signedPdfUrl`, or null when the
+ *   id cannot be extracted, the payload carries no PDF link, or any step throws.
+ */
+async function fetchPdfUrlFromApi(pageUrl) {
+  try {
+    const idMatch = pageUrl.match(/bulletinDetails\/[^\/]+\/([a-f0-9]+)/i);
+    if (!idMatch) {
+      console.log('无法从URL中提取公告ID');
+      return null;
+    }
+    const [, bulletinId] = idMatch;
+
+    // bulletinType falls back to '1' when the query parameter is absent.
+    const typeMatch = pageUrl.match(/bulletinType=(\d+)/);
+    const bulletinType = typeMatch ? typeMatch[1] : '1';
+
+    const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;
+    console.log(`调用API获取公告详情: ${apiUrl}`);
+
+    const requestHeaders = {
+      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+      'Accept': 'application/json',
+      'Referer': 'https://www.jszbcg.com/'
+    };
+    const response = await http.get(apiUrl, { headers: requestHeaders, responseType: 'arraybuffer' });
+
+    // The body arrives as raw bytes; decode it as UTF-8 before parsing the JSON.
+    const payload = JSON.parse(iconv.decode(response.data, 'utf-8'));
+    const pdfUrl = payload.success && payload.data?.signedPdfUrl;
+    if (!pdfUrl) {
+      console.log('API返回数据中没有PDF URL');
+      return null;
+    }
+
+    console.log(`成功从API获取PDF URL: ${pdfUrl}`);
+    return pdfUrl;
+  } catch (err) {
+    console.error(`从API获取PDF URL失败: ${err.message}`);
+    return null;
+  }
+}
+
+/**
+ * Fallback: extract the PDF URL from an iframe-embedded viewer in the page HTML.
+ *
+ * Every iframe is scanned in document order and the first `src` carrying a
+ * `file=` query value (e.g. viewer.html?file=xxx.pdf) is used, so a viewer that
+ * follows an unrelated iframe is still found.
+ *
+ * @param {string} html - Page HTML.
+ * @param {string} pageUrl - Page URL, used as the base for a relative PDF path.
+ * @returns {string|null} The PDF URL, or null when none can be extracted.
+ */
+function extractPdfUrl(html, pageUrl) {
+  const $ = cheerio.load(html);
+
+  for (const frame of $('iframe').toArray()) {
+    const src = $(frame).attr('src');
+    const match = src?.match(/[?&]file=([^&]+)/);
+    // No src, or no file= parameter: try the next iframe.
+    if (!match) {
+      continue;
+    }
+
+    try {
+      // decodeURIComponent throws URIError on a malformed escape, so it is guarded too.
+      const pdfUrl = decodeURIComponent(match[1]);
+      if (pdfUrl.startsWith('http://') || pdfUrl.startsWith('https://')) {
+        return pdfUrl;
+      }
+      // Relative path: resolve it against the page URL.
+      return new URL(pdfUrl, pageUrl).toString();
+    } catch (err) {
+      console.error(`URL拼接失败: ${err.message}`);
+      return null;
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Download a PDF (with an extended timeout, as PDFs can be large) and return its extracted text.
+ * @param {string} pdfUrl - URL of the PDF to download.
+ * @returns {Promise<string>} The `text` field of the pdf-parse `getText()` result.
+ * @throws Rethrows any download or parse error after logging it.
+ */
+async function fetchPdfContent(pdfUrl) {
+  console.log(`正在下载PDF: ${pdfUrl}`);
+  try {
+    const { PDFParse } = await import('pdf-parse'); // pdf-parse v2 API
+    const response = await http.get(pdfUrl, { responseType: 'arraybuffer', timeout: 30000 });
+    const parser = new PDFParse({ data: response.data });
+    try {
+      const { text } = await parser.getText();
+      console.log(`PDF解析成功，文本长度: ${text.length}`);
+      return text;
+    } finally {
+      await parser.destroy(); // release the parser even when getText() throws
+    }
+  } catch (err) {
+    console.error(`PDF下载或解析失败: ${err.message}`);
+    console.error(err.stack);
+    throw err;
+  }
+}
+
 function parseList(html) {
  const $ = cheerio.load(html);
  const items = [];
@@ -207,6 +319,59 @@ function parseDetail(html) {
  };
 }

+/**
+ * Enhanced parseDetail: prefers the text of an attached PDF over the HTML body.
+ *
+ * The PDF URL comes from the bulletin API (jszbcg.com pages only) or, failing
+ * that, from an iframe in the HTML. When no PDF is found, or downloading or
+ * parsing it fails, the content from parseDetail(html) is used instead.
+ *
+ * @param {string} html - Detail page HTML.
+ * @param {string} pageUrl - Detail page URL.
+ * @returns {Promise<object>} parseDetail(html) fields plus `content`, `budget`,
+ *   `hasPdf` and `pdfUrl` (the URL only when the PDF was actually parsed, else null).
+ */
+async function parseDetailEnhanced(html, pageUrl) {
+  // Parse the HTML once: it supplies the basic fields and the fallback content.
+  const basicInfo = parseDetail(html);
+
+  // Prefer the API lookup (applies to jszbcg.com pages).
+  let pdfUrl = null;
+  if (pageUrl.includes('jszbcg.com')) {
+    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
+  }
+  // Otherwise fall back to extracting it from the HTML.
+  if (!pdfUrl) {
+    pdfUrl = extractPdfUrl(html, pageUrl);
+  }
+
+  let content = basicInfo.content;
+  let pdfParsed = false;
+
+  if (pdfUrl) {
+    console.log(`发现PDF: ${pdfUrl}`);
+    try {
+      const pdfText = await fetchPdfContent(pdfUrl);
+      console.log(`成功从PDF提取内容，长度: ${pdfText.length}`);
+      content = pdfText;
+      pdfParsed = true;
+    } catch (err) {
+      // content still holds the HTML-derived text here.
+      console.error('PDF解析失败，回退到HTML解析:', err.message);
+    }
+  }
+
+  const budget = extractBudget(content);
+
+  return {
+    ...basicInfo,
+    content,
+    budget,
+    hasPdf: pdfParsed,
+    pdfUrl: pdfParsed ? pdfUrl : null,
+  };
+}
+
 function extractBudget(content) {
  // 预处理内容：去除数字之间的换行符和空白字符
  // 这样可以匹配被换行符分隔的数字，例如 "1\n1\n0\n9\n0\n0" -> "110900"
@@ -246,8 +411,8 @@ function extractBudget(content) {
        amount = amount / pattern.divider;
      }

-      // 验证金额合理性(0.01万元到1000000万元之间)
-      if (!isNaN(amount) && amount >= 0.01 && amount <= 1000000) {
+      // 验证金额合理性(0.01万元到100000000万元之间)
+      if (!isNaN(amount) && amount >= 0.01 && amount <= 100000000) {
        bestMatch = {
          amount,
          unit: '万元',
@@ -307,7 +472,7 @@ app.post('/api/details', async (req, res) => {
    for (const item of toFetch) {
      try {
        const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
        results.push({
          ...item,
          detail,
@@ -370,7 +535,7 @@ app.post('/api/report', async (req, res) => {
    for (const item of toFetch) {
      try {
        const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
        results.push({
          ...item,
          detail,
@@ -447,7 +612,7 @@ app.post('/api/report-daterange', async (req, res) => {
    for (const item of items) {
      try {
        const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
        results.push({
          ...item,
          detail,
@@ -540,6 +705,33 @@ app.post('/api/send-email', async (req, res) => {
  }
 });

+// PDF parsing test endpoint: POST { pdfUrl } -> content length, 500-char preview, budget.
+app.post('/api/test-pdf', async (req, res) => {
+  try {
+    // In Express 5 req.body is undefined when no body parser handled the request.
+    const { pdfUrl } = req.body ?? {};
+    // Only http(s) URL strings are accepted. NOTE(review): any host is still fetched (SSRF risk).
+    if (typeof pdfUrl !== 'string' || !/^https?:\/\//i.test(pdfUrl)) {
+      return res.status(400).json({ success: false, error: '请提供PDF URL' });
+    }
+
+    console.log(`测试PDF URL: ${pdfUrl}`);
+    const content = await fetchPdfContent(pdfUrl);
+    const budget = extractBudget(content);
+    res.json({
+      success: true,
+      data: {
+        contentLength: content.length,
+        contentPreview: content.substring(0, 500),
+        budget,
+      },
+    });
+  } catch (error) {
+    console.error('测试PDF失败:', error);
+    res.status(500).json({ success: false, error: error.message });
+  }
+});
+
 app.listen(PORT, () => {
  console.log(`Server running at http://localhost:${PORT}`);
 });