```

feat(server): 增加对PDF公告内容的解析支持 - 新增 `fetchPdfUrlFromApi` 函数，用于从 jszbcg.com 的 API 接口获取 PDF 文件链接 - 新增 `extractPdfUrl` 函数，作为备选方案从 HTML 页面中提取 PDF 地址 - 新增 `fetchPdfContent` 函数，使用 pdf-parse 库下载并解析 PDF 内容 - 新增 `parseDetailEnhanced` 函数，整合 HTML 和 PDF 解析逻辑，优先使用 PDF 内容 - 修改预算金额验证范围上限，从 1000000 万元提升至 100000000 万元 - 在 /api/details、/api/report、/api/report-daterange 接口中启用增强解析逻辑 - 新增 /api/test-pdf 接口用于测试 PDF 解析功能 - 添加 pdf-parse 依赖到 package.json ```
2025-12-15 11:40:58 +08:00
parent b044e918aa
commit 3aee6af9ae
2 changed files with 199 additions and 6 deletions
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
    "docx": "^9.5.1",
    "express": "^5.2.1",
    "iconv-lite": "^0.6.3",
-    "nodemailer": "^7.0.11"
+    "nodemailer": "^7.0.11",
    "pdf-parse": "^2.4.5"
  }
 }
--- a/src/server.js
+++ b/src/server.js
@@ -115,6 +115,118 @@ async function fetchHtml(url) {
  return html;
 }
 /**
  * Looks up the signed PDF link of a jszbcg.com bulletin through the JSON API.
  *
  * The bulletin ID (hex string) and the optional `bulletinType` query value are
  * read from the page URL, e.g.
  * https://www.jszbcg.com/#/bulletinDetails/招标公告/2c9180899a7e34d2019a95630c931a8e?bulletinType=1
  * When `bulletinType` is absent, '1' is used.
  *
  * @param {string} pageUrl - Bulletin detail page URL.
  * @returns {Promise<string|null>} The `signedPdfUrl` from the API response, or
  *   null when the ID cannot be extracted, the response has no PDF link, or any
  *   error occurs (errors are logged, never thrown).
  */
 async function fetchPdfUrlFromApi(pageUrl) {
  const bulletinIdPattern = /bulletinDetails\/[^\/]+\/([a-f0-9]+)/i;
  const bulletinTypePattern = /bulletinType=(\d+)/;
  try {
    const idHit = pageUrl.match(bulletinIdPattern);
    const typeHit = pageUrl.match(bulletinTypePattern);
    if (!idHit) {
      console.log('无法从URL中提取公告ID');
      return null;
    }
    const [, bulletinId] = idHit;
    const bulletinType = typeHit?.[1] ?? '1';
    // Ask the bulletin-detail endpoint for this ID.
    const apiUrl = `https://api.jszbtb.com/DataGatewayApi/PublishBulletin/BulletinType/${bulletinType}/ID/${bulletinId}`;
    console.log(`调用API获取公告详情: ${apiUrl}`);
    const requestOptions = {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
        'Referer': 'https://www.jszbcg.com/'
      },
      responseType: 'arraybuffer'
    };
    const { data: rawBody } = await http.get(apiUrl, requestOptions);
    // The body arrives as raw bytes; decode as UTF-8 before parsing the JSON.
    const payload = JSON.parse(iconv.decode(rawBody, 'utf-8'));
    const signedPdfUrl = payload.success ? payload.data?.signedPdfUrl : undefined;
    if (!signedPdfUrl) {
      console.log('API返回数据中没有PDF URL');
      return null;
    }
    console.log(`成功从API获取PDF URL: ${signedPdfUrl}`);
    return signedPdfUrl;
  } catch (err) {
    console.error(`从API获取PDF URL失败: ${err.message}`);
    return null;
  }
 }
 /**
  * Fallback PDF discovery: finds an embedded viewer iframe in the page HTML and
  * returns the PDF address carried in its `file=` query parameter
  * (e.g. `viewer.html?file=xxx.pdf`).
  *
  * Iframes whose `src` mentions "pdf" or "viewer" are examined first, then any
  * remaining iframe, so an unrelated iframe placed earlier in the document does
  * not hide the viewer. Candidates without a usable `file=` value are skipped.
  *
  * @param {string} html - Raw HTML of the detail page.
  * @param {string} pageUrl - Page URL, used as the base for relative PDF paths.
  * @returns {string|null} Absolute PDF URL, or null when none can be derived.
  */
 function extractPdfUrl(html, pageUrl) {
  const $ = cheerio.load(html);
  // Most specific selectors first; the bare `iframe` selector is the catch-all.
  const selectors = ['iframe[src*="pdf"]', 'iframe[src*="viewer"]', 'iframe'];
  for (const selector of selectors) {
    for (const element of $(selector).toArray()) {
      const src = $(element).attr('src');
      if (!src) continue;
      // Stop at `#` too: a fragment on the viewer URL is not part of the value.
      const match = src.match(/[?&]file=([^&#]+)/);
      if (!match) continue;
      let pdfUrl;
      try {
        pdfUrl = decodeURIComponent(match[1]);
      } catch (err) {
        // Malformed percent-encoding raises URIError; try the next candidate.
        console.error(`URL解码失败: ${err.message}`);
        continue;
      }
      // Already absolute: return unchanged.
      if (pdfUrl.startsWith('http://') || pdfUrl.startsWith('https://')) {
        return pdfUrl;
      }
      try {
        return new URL(pdfUrl, pageUrl).toString();
      } catch (err) {
        console.error(`URL拼接失败: ${err.message}`);
      }
    }
  }
  return null;
 }
 /**
  * Downloads a PDF and returns its extracted text using pdf-parse v2.
  *
  * @param {string} pdfUrl - Address of the PDF to download.
  * @returns {Promise<string>} Text extracted from the PDF.
  * @throws Rethrows any import, download or parse error after logging it, so
  *   callers can fall back to another content source.
  */
 async function fetchPdfContent(pdfUrl) {
  console.log(`正在下载PDF: ${pdfUrl}`);
  try {
    // pdf-parse v2 is loaded lazily via dynamic import.
    const { PDFParse } = await import('pdf-parse');
    const response = await http.get(pdfUrl, {
      responseType: 'arraybuffer',
      timeout: 30000, // PDF files can be large, so allow a longer timeout
    });
    const parser = new PDFParse({ data: response.data });
    try {
      const result = await parser.getText();
      console.log(`PDF解析成功，文本长度: ${result.text.length}`);
      return result.text;
    } finally {
      // Release the parser even when text extraction fails.
      await parser.destroy();
    }
  } catch (err) {
    console.error(`PDF下载或解析失败: ${err.message}`);
    console.error(err.stack);
    throw err;
  }
 }
 function parseList(html) {
  const $ = cheerio.load(html);
  const items = [];
@@ -207,6 +319,59 @@ function parseDetail(html) {
  };
 }
 /**
  * Enhanced variant of parseDetail that prefers PDF text over HTML text.
  *
  * Resolution order for the PDF address: the jszbcg.com API (only when the page
  * URL contains 'jszbcg.com'), then the iframe embedded in the HTML. When a PDF
  * is found and parsed, its text becomes `content`; otherwise the HTML-derived
  * content is used. The budget is always extracted from the final `content`.
  *
  * @param {string} html - Raw HTML of the detail page.
  * @param {string} pageUrl - URL the HTML was fetched from.
  * @returns {Promise<object>} The parseDetail(html) fields, with `content` and
  *   `budget` overridden, plus `hasPdf` (true only if PDF text was extracted)
  *   and `pdfUrl` (null unless the PDF was parsed).
  */
 async function parseDetailEnhanced(html, pageUrl) {
  // Parse the HTML once: it supplies the basic fields (title, publish time,
  // ...) and also the fallback content when no PDF text is available.
  const basicInfo = parseDetail(html);
  let pdfUrl = null;
  // Prefer the API for jszbcg.com pages.
  if (pageUrl.includes('jszbcg.com')) {
    pdfUrl = await fetchPdfUrlFromApi(pageUrl);
  }
  // Fall back to the iframe embedded in the HTML.
  if (!pdfUrl) {
    pdfUrl = extractPdfUrl(html, pageUrl);
  }
  let content = basicInfo.content;
  let pdfParsed = false;
  if (pdfUrl) {
    console.log(`发现PDF: ${pdfUrl}`);
    try {
      content = await fetchPdfContent(pdfUrl);
      pdfParsed = true;
      console.log(`成功从PDF提取内容，长度: ${content.length}`);
    } catch (err) {
      // Keep the HTML-derived content already stored in `content`.
      console.error('PDF解析失败，回退到HTML解析:', err.message);
    }
  }
  // Reuse the existing extractBudget helper on whichever content won.
  const budget = extractBudget(content);
  return {
    ...basicInfo,
    content,
    budget,
    hasPdf: pdfParsed,
    pdfUrl: pdfParsed ? pdfUrl : null,
  };
 }
 function extractBudget(content) {
  // 预处理内容：去除数字之间的换行符和空白字符
  // 这样可以匹配被换行符分隔的数字，例如 "1\n1\n0\n9\n0\n0" -> "110900"
@@ -246,8 +411,8 @@ function extractBudget(content) {
        amount = amount / pattern.divider;
      }
-      // 验证金额合理性(0.01万元到1000000万元之间)
+      // 验证金额合理性(0.01万元到100000000万元之间)
-      if (!isNaN(amount) && amount >= 0.01 && amount <= 1000000) {
+      if (!isNaN(amount) && amount >= 0.01 && amount <= 100000000) {
        bestMatch = {
          amount,
          unit: '万元',
@@ -307,7 +472,7 @@ app.post('/api/details', async (req, res) => {
    for (const item of toFetch) {
      try {
        const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
        results.push({
          ...item,
          detail,
@@ -370,7 +535,7 @@ app.post('/api/report', async (req, res) => {
    for (const item of toFetch) {
      try {
        const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
        results.push({
          ...item,
          detail,
@@ -447,7 +612,7 @@ app.post('/api/report-daterange', async (req, res) => {
    for (const item of items) {
      try {
        const html = await fetchHtml(item.href);
-        const detail = parseDetail(html);
+        const detail = await parseDetailEnhanced(html, item.href);
        results.push({
          ...item,
          detail,
@@ -540,6 +705,33 @@ app.post('/api/send-email', async (req, res) => {
  }
 });
 /**
  * POST /api/test-pdf — diagnostic endpoint for the PDF pipeline.
  *
  * Body: `{ pdfUrl: string }`. Downloads and parses the PDF, then responds with
  * the text length, a 500-character preview and the extracted budget.
  * Responds 400 when `pdfUrl` is missing or is not an http/https URL, and 500
  * when download or parsing fails.
  *
  * NOTE(review): the server fetches a caller-supplied URL. Only the scheme is
  * validated here; consider restricting the allowed hosts or disabling this
  * route outside development.
  */
 app.post('/api/test-pdf', async (req, res) => {
  try {
    // Express 5 leaves req.body undefined when no body was parsed.
    const { pdfUrl } = req.body ?? {};
    if (!pdfUrl) {
      return res.status(400).json({ success: false, error: '请提供PDF URL' });
    }
    let parsedUrl = null;
    if (typeof pdfUrl === 'string') {
      try {
        parsedUrl = new URL(pdfUrl);
      } catch {
        parsedUrl = null;
      }
    }
    // Reject non-string values and non-web schemes such as file: or ftp:.
    if (!parsedUrl || !['http:', 'https:'].includes(parsedUrl.protocol)) {
      return res.status(400).json({ success: false, error: '请提供有效的 http/https PDF URL' });
    }
    console.log(`测试PDF URL: ${pdfUrl}`);
    const content = await fetchPdfContent(pdfUrl);
    const budget = extractBudget(content);
    res.json({
      success: true,
      data: {
        contentLength: content.length,
        contentPreview: content.substring(0, 500),
        budget,
      },
    });
  } catch (error) {
    console.error('测试PDF失败:', error);
    res.status(500).json({ success: false, error: error.message });
  }
 });
 // Start the HTTP server and log the local address once it is listening.
 app.listen(PORT, function onListening() {
  const localAddress = `http://localhost:${PORT}`;
  console.log(`Server running at ${localAddress}`);
 });