```
feat: 切换到Firecrawl Browser Sandbox并更新API密钥 - 将抓取功能从Firecrawl Agent切换到Firecrawl Browser Sandbox - 更新.env文件中的FIRECRAWL_API_KEY为新密钥 - 修改前端界面文本,将"Firecrawl Agent"改为"Firecrawl Browser Sandbox" - 重构runScraper函数,添加按钮状态管理和滚动定位功能 - 移除zod验证schema,简化数据处理逻辑 - 更新定时任务调度器以使用新的浏览器抓取方式 - 清空results.json历史数据 ```
This commit is contained in:
275
src/firecrawlBrowserScraper.js
Normal file
275
src/firecrawlBrowserScraper.js
Normal file
@@ -0,0 +1,275 @@
|
||||
// Default extraction instructions (Chinese on purpose — this text is sent to the
// scraper): extract today's tender announcements with title, amount
// (estimated contract price / bid ceiling, etc.), date (YYYY-MM-DD), and detail URL.
const DEFAULT_SCRAPER_PROMPT = '提取页面上今日的招标公告信息,包括:标题、项目金额(可能为合同预估价/最高投标限价等)、发布日期(YYYY-MM-DD格式)、详情页完整URL';

// Sentinel the in-browser script prints before its JSON payload so the caller
// can locate the payload inside captured stdout (see parsePayloadFromText).
const PAYLOAD_MARKER = '__FC_PAYLOAD__';
|
||||
|
||||
/** Left-pad a value to two characters with '0' (e.g. 7 -> '07'). */
function pad2(value) {
  const text = String(value);
  if (text.length >= 2) return text;
  return '0'.repeat(2 - text.length) + text;
}
|
||||
|
||||
/** Compose a 'YYYY-MM-DD' string from year/month/day parts, zero-padding month and day. */
function formatDate(year, month, day) {
  const mm = String(month).padStart(2, '0');
  const dd = String(day).padStart(2, '0');
  return [year, mm, dd].join('-');
}
|
||||
|
||||
/**
 * Today's date in the Asia/Shanghai timezone, formatted 'YYYY-MM-DD'.
 * The 'en-CA' locale is used because it natively emits ISO date order.
 */
function getTodayInShanghai() {
  const formatter = new Intl.DateTimeFormat('en-CA', {
    timeZone: 'Asia/Shanghai',
    year: 'numeric',
    month: '2-digit',
    day: '2-digit',
  });
  return formatter.format(new Date());
}
|
||||
|
||||
/**
 * Derive the announcement date a prompt asks for.
 * Returns 'YYYY-MM-DD' when the prompt names an explicit date (2024-03-09,
 * 2024年3月9日, 2024.3.9 …) or uses a "today" keyword (今天/今日/当日);
 * returns null when no date filter is requested.
 */
function parseTargetDate(prompt) {
  const text = String(prompt || '');
  if (!text) return null;

  const explicit = /(20\d{2})[-/.年](\d{1,2})[-/.月](\d{1,2})日?/.exec(text);
  if (explicit) {
    const [, year, month, day] = explicit;
    return formatDate(year, month, day);
  }

  const asksForToday = /(今天|今日|当日)/.test(text);
  return asksForToday ? getTodayInShanghai() : null;
}
|
||||
|
||||
/**
 * Normalize a free-form date string to 'YYYY-MM-DD'.
 * Accepts full dates (2024-3-9 / 2024年3月9日 / 2024.3.9) and month-day
 * forms (3月9日), assuming the current Shanghai year for the latter.
 * Returns '' when no date can be recognized.
 */
function normalizeDate(input) {
  if (!input) return '';
  const text = String(input).trim();
  if (!text) return '';

  const full = /(20\d{2})[-/.年](\d{1,2})[-/.月](\d{1,2})日?/.exec(text);
  if (full) return formatDate(full[1], full[2], full[3]);

  const monthDay = /(\d{1,2})[-/.月](\d{1,2})日?/.exec(text);
  if (!monthDay) return '';

  // Month-day only: borrow the current year in the site's timezone.
  const currentYear = Number(getTodayInShanghai().slice(0, 4));
  return formatDate(currentYear, monthDay[1], monthDay[2]);
}
|
||||
|
||||
/** Pull the first recognizable date out of arbitrary text, normalized to 'YYYY-MM-DD' ('' if none). */
function extractDateFromText(text) {
  if (!text) return '';
  const pattern = /(20\d{2}[-/.年]\d{1,2}[-/.月]\d{1,2}日?)|(\d{1,2}[-/.月]\d{1,2}日?)/;
  const match = pattern.exec(String(text));
  if (!match) return '';
  return normalizeDate(match[0]);
}
|
||||
|
||||
/**
 * Pull the first money amount (e.g. '1,200万元', '3亿元') out of text.
 * Internal whitespace is stripped from the match; returns null when no
 * amount is present.
 */
function extractAmountFromText(text) {
  if (!text) return null;
  const match = /([0-9][0-9,.\s]*(?:亿元|万元|万|元))/.exec(String(text));
  if (match === null) return null;
  const compact = match[1].replace(/\s+/g, '');
  return compact.trim();
}
|
||||
|
||||
/** Collapse whitespace runs to single spaces and trim; falsy input becomes ''. */
function cleanText(text) {
  const normalized = String(text || '');
  return normalized.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/** Coerce value to a number, falling back when the coercion is not finite (NaN/±Infinity). */
function toFiniteNumber(value, fallback) {
  const parsed = Number(value);
  if (Number.isFinite(parsed)) return parsed;
  return fallback;
}
|
||||
|
||||
/**
 * Best-effort extraction of a JSON payload from browser execution output.
 * Strategies, in order:
 *   1. the first non-empty line after the last PAYLOAD_MARKER sentinel,
 *   2. the whole text as JSON,
 *   3. each non-empty line, scanned bottom-up.
 * Returns the parsed value, or null when nothing parses.
 */
function parsePayloadFromText(rawText) {
  if (!rawText) return null;
  const text = String(rawText);

  // JSON.parse never yields undefined, so undefined is a safe "not JSON" sentinel.
  const tryParse = (candidate) => {
    try {
      return JSON.parse(candidate);
    } catch {
      return undefined;
    }
  };

  const markerIndex = text.lastIndexOf(PAYLOAD_MARKER);
  if (markerIndex >= 0) {
    const afterMarker = text.slice(markerIndex + PAYLOAD_MARKER.length);
    const firstLine = afterMarker.split(/\r?\n/).find(line => line.trim());
    if (firstLine) {
      const parsed = tryParse(firstLine.trim());
      if (parsed !== undefined) return parsed;
    }
  }

  const whole = tryParse(text.trim());
  if (whole !== undefined) return whole;

  const lines = text.split(/\r?\n/).map(line => line.trim()).filter(Boolean);
  for (let i = lines.length - 1; i >= 0; i--) {
    const parsed = tryParse(lines[i]);
    if (parsed !== undefined) return parsed;
  }

  return null;
}
|
||||
|
||||
/**
 * Parse the JSON payload from a Firecrawl browserExecute result, preferring
 * `result` over `stdout`. Falls back to `{ items: [] }` so callers never
 * have to null-check the payload shape.
 */
function parseBrowserExecutePayload(executeResult) {
  const candidates = [executeResult?.result, executeResult?.stdout];

  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    if (candidate.trim().length === 0) continue;
    const payload = parsePayloadFromText(candidate);
    if (payload && typeof payload === 'object') return payload;
  }

  return { items: [] };
}
|
||||
|
||||
/**
 * Split a keyword string on Chinese/ASCII separators (、 / , , | and
 * whitespace), keeping only trimmed entries at least two characters long.
 */
function splitKeywords(input) {
  const parts = String(input || '').split(/[、/,,|\s]+/);
  const keywords = [];
  for (const part of parts) {
    const keyword = part.trim();
    if (keyword.length >= 2) keywords.push(keyword);
  }
  return keywords;
}
|
||||
|
||||
/**
 * Keep only items whose title/context mentions a type keyword.
 * When no keywords are configured, or when filtering would drop every item,
 * the original list is returned unchanged (better too many than none).
 */
function filterByTypeIfPossible(items, type) {
  const keywords = splitKeywords(type);
  if (keywords.length === 0) return items;

  const matchesType = (item) => {
    const haystack = `${item.title} ${item.context || ''}`;
    return keywords.some(keyword => haystack.includes(keyword));
  };

  const filtered = items.filter(matchesType);
  return filtered.length === 0 ? items : filtered;
}
|
||||
|
||||
/**
 * Clean, dedupe, filter, and trim raw scraped rows.
 * - Drops rows missing a title or URL.
 * - Backfills date/amount from the row's surrounding context text.
 * - Dedupes on title+url, keeps only type-keyword matches (when possible)
 *   and, if targetDate is set, only rows with that exact date.
 * - Returns at most 100 items of shape { title, amount, date, url }.
 */
function normalizeItems(rawItems, targetDate, scraperType) {
  const byKey = new Map();

  for (const raw of rawItems) {
    const title = cleanText(raw?.title);
    const url = cleanText(raw?.url);
    if (!title || !url) continue;

    const key = `${title}@@${url}`;
    if (byKey.has(key)) continue; // First occurrence wins.

    const context = cleanText(raw?.context);
    byKey.set(key, {
      title,
      amount: cleanText(raw?.amount) || extractAmountFromText(context) || null,
      date: normalizeDate(raw?.date) || extractDateFromText(context),
      url,
      context,
    });
  }

  let items = filterByTypeIfPossible(Array.from(byKey.values()), scraperType);

  if (targetDate) {
    items = items.filter(item => item.date === targetDate);
  }

  return items
    .slice(0, 100)
    .map(({ title, amount, date, url }) => ({ title, amount, date, url }));
}
|
||||
|
||||
/**
 * Build the Node script executed inside the Firecrawl browser sandbox.
 *
 * The generated script navigates to `url`, harvests every candidate
 * announcement link on the page (title, absolute URL, and nearby date/amount
 * text from the link's container), and emits the collected payload twice:
 * once on stdout prefixed with PAYLOAD_MARKER (so parsePayloadFromText can
 * find it), and once as the script's final expression value.
 *
 * Note: regexes below are double-escaped (\\s, \\d) because this source text
 * lives inside a template literal; the browser receives single backslashes.
 */
function buildBrowserScript(url) {
  return `
const targetUrl = ${JSON.stringify(url)};
await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
await page.waitForTimeout(1500);

const payload = await page.evaluate(() => {
  const normalize = (value) => String(value || '').replace(/\\s+/g, ' ').trim();
  const blockedTitles = new Set(['首页', '尾页', '上一页', '下一页', '更多', '详情', '查看', '返回', '跳转']);

  const links = Array.from(document.querySelectorAll('a[href]'));
  const rows = [];
  const seen = new Set();

  for (const a of links) {
    const href = a.getAttribute('href') || '';
    if (!href || href.startsWith('javascript:') || href.startsWith('#')) continue;

    const title = normalize(a.textContent);
    if (!title || title.length < 6 || title.length > 180) continue;
    if (blockedTitles.has(title)) continue;

    let absoluteUrl = '';
    try {
      absoluteUrl = new URL(href, location.href).href;
    } catch {
      continue;
    }

    const container = a.closest('tr,li,article,section,div,p,dd,dt') || a.parentElement;
    const context = normalize(container ? container.textContent : title);

    const dateMatch = context.match(/(20\\d{2}[-/.年]\\d{1,2}[-/.月]\\d{1,2}日?)|(\\d{1,2}[-/.月]\\d{1,2}日?)/);
    const amountMatch = context.match(/([0-9][0-9,.\\s]*(?:亿元|万元|万|元))/);

    const key = (title + '@@' + absoluteUrl).toLowerCase();
    if (seen.has(key)) continue;
    seen.add(key);

    rows.push({
      title,
      url: absoluteUrl,
      date: dateMatch ? dateMatch[0] : '',
      amount: amountMatch ? amountMatch[0].replace(/\\s+/g, '') : null,
      context,
    });
  }

  return {
    pageUrl: location.href,
    items: rows.slice(0, 300),
  };
});

console.log('${PAYLOAD_MARKER}' + JSON.stringify(payload));
JSON.stringify(payload);
`;
}
|
||||
|
||||
/**
 * Run one scraper config through a Firecrawl Browser sandbox session.
 *
 * Creates a short-lived browser session, executes the harvesting script
 * against scraper.url, always tears the session down, and normalizes the
 * returned rows.
 *
 * @param {object} firecrawl - Firecrawl SDK client; must expose browser(),
 *   browserExecute() and deleteBrowser().
 * @param {object} scraper - Config: { url, prompt?, type?, browserTtl?,
 *   browserActivityTtl? }.
 * @param {object} [options] - { logPrefix?: string } used to tag log lines.
 * @returns {Promise<{items: Array, targetDate: string|null, pageUrl: string}>}
 * @throws {Error} when url is missing, session creation fails, or execution fails.
 */
export async function runScraperWithBrowser(firecrawl, scraper, options = {}) {
  const prefix = options.logPrefix || '[Browser]';
  if (!scraper?.url) throw new Error('抓取 URL 不能为空');

  // A date filter applies only when the prompt names an explicit date or
  // asks for "today" (今天/今日/当日); otherwise targetDate is null.
  const prompt = scraper.prompt || DEFAULT_SCRAPER_PROMPT;
  const targetDate = parseTargetDate(prompt);

  // Session lifetimes — presumably seconds; TODO confirm units against the Firecrawl SDK.
  const ttl = toFiniteNumber(scraper.browserTtl, 180);
  const activityTtl = toFiniteNumber(scraper.browserActivityTtl, 90);

  const session = await firecrawl.browser({ ttl, activityTtl });
  if (!session?.success || !session.id) {
    throw new Error(session?.error || '创建 Browser 会话失败');
  }

  let executeResult;
  try {
    executeResult = await firecrawl.browserExecute(session.id, {
      code: buildBrowserScript(scraper.url),
      language: 'node',
    });
  } finally {
    // Always release the sandbox; a failed close is logged but not fatal.
    try {
      await firecrawl.deleteBrowser(session.id);
    } catch (closeError) {
      console.warn(`${prefix} 会话关闭失败: ${closeError.message}`);
    }
  }

  if (!executeResult?.success) {
    throw new Error(executeResult?.error || executeResult?.stderr || 'Browser 执行失败');
  }

  // Parse the JSON the sandbox script printed, then clean/filter the rows.
  const payload = parseBrowserExecutePayload(executeResult);
  const rawItems = Array.isArray(payload.items) ? payload.items : [];
  const items = normalizeItems(rawItems, targetDate, scraper.type);

  console.log(`${prefix} URL=${scraper.url} raw=${rawItems.length} normalized=${items.length}${targetDate ? ` targetDate=${targetDate}` : ''}`);

  return {
    items,
    targetDate,
    pageUrl: payload.pageUrl || scraper.url,
  };
}
|
||||
@@ -4,8 +4,8 @@ import { readFileSync, writeFileSync, existsSync } from 'fs';
|
||||
import { fileURLToPath } from 'url';
|
||||
import { dirname, join } from 'path';
|
||||
import Firecrawl from '@mendable/firecrawl-js';
|
||||
import { z } from 'zod';
|
||||
import { sendScraperResultsEmail } from './emailService.js';
|
||||
import { runScraperWithBrowser } from './firecrawlBrowserScraper.js';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
@@ -48,63 +48,12 @@ function appendResult(result) {
|
||||
saveResults(results);
|
||||
}
|
||||
|
||||
// ========== 统一的公告抓取 Schema ==========
|
||||
|
||||
// 公告抓取 Schema(result 包装数组)
|
||||
// Zod schema for agent extraction: { result: [{ title, amount, date, url }] }.
// The describe() strings are Chinese on purpose — they are sent to the
// extraction model as field instructions; do not translate them.
const announcementSchema = z.object({
  result: z.array(z.object({
    title: z.string().describe('公告标题'),
    amount: z.string().nullable().describe('项目金额(合同预估价/最高投标限价等),没有则为null'),
    date: z.string().describe('发布日期,YYYY-MM-DD格式'),
    url: z.string().describe('详情页完整URL,以https://开头'),
  })).describe('页面上提取到的所有公告条目'),
});
|
||||
|
||||
/** 从 Firecrawl 返回结果中提取 result 数组 */
|
||||
/**
 * Extract the announcement array from a Firecrawl agent response.
 *
 * Handles every shape the agent has been observed to return:
 * { data: { result: [...] } }, { result: [...] }, `result` serialized as a
 * numeric-keyed object, a bare array, and a top-level numeric-keyed object.
 *
 * @param {*} raw - Raw agent response.
 * @returns {Array} The item list, or [] when nothing array-like is found.
 */
function extractItems(raw) {
  if (!raw) return [];
  const root = (raw.data && typeof raw.data === 'object') ? raw.data : raw;

  // Most common: result is a proper array.
  if (Array.isArray(root.result)) return root.result;

  // result serialized as an object with numeric keys — rebuild in key order.
  if (root.result && typeof root.result === 'object') {
    const items = _numericKeyValues(root.result);
    if (items.length > 0) return items;
  }

  // The payload itself is an array.
  if (Array.isArray(root)) return root;

  // Last resort: the whole payload is a numeric-keyed object.
  return _numericKeyValues(root);
}

/** Values of numeric-looking keys of obj, ordered numerically by key. */
function _numericKeyValues(obj) {
  return Object.keys(obj)
    // Explicit radix and Number.isNaN instead of the coercing globals.
    .filter(k => !Number.isNaN(Number.parseInt(k, 10)))
    .sort((a, b) => Number.parseInt(a, 10) - Number.parseInt(b, 10))
    .map(k => obj[k]);
}
|
||||
|
||||
// ========== 抓取执行(复用 server.js 中 runScraper 的逻辑) ==========
|
||||
|
||||
async function runScraper(scraper) {
|
||||
console.log(`[定时任务] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`);
|
||||
const fullPrompt = `访问这个URL: ${scraper.url}
|
||||
【目标区域】:${scraper.section || ''} - ${scraper.subsection || ''}
|
||||
【公告类型】:${scraper.type || ''}
|
||||
|
||||
${scraper.prompt || '提取页面上今日的招标公告信息,包括:标题、项目金额(可能为合同预估价/最高投标限价等等)、发布日期(YYYY-MM-DD格式)、详情页完整URL'}
|
||||
|
||||
请严格按照定义的 JSON 格式返回,每条公告包含 title、amount、date、url 四个字段。`;
|
||||
|
||||
const result = await firecrawl.agent({
|
||||
prompt: fullPrompt,
|
||||
schema: announcementSchema,
|
||||
model: scraper.model || 'spark-1-mini',
|
||||
});
|
||||
|
||||
console.log('[定时任务] 原始返回结果:', JSON.stringify(result).slice(0, 500));
|
||||
|
||||
// 标准化结果
|
||||
const rawItems = extractItems(result);
|
||||
const items = rawItems.map(item => ({
|
||||
title: item.title || '',
|
||||
amount: item.amount || null,
|
||||
date: item.date || '',
|
||||
url: item.url || '',
|
||||
}));
|
||||
|
||||
console.log(`[定时任务] 提取到 ${items.length} 条公告`);
|
||||
console.log(`[定时任务][Browser] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`);
|
||||
const { items } = await runScraperWithBrowser(firecrawl, scraper, { logPrefix: '[Browser][Scheduler]' });
|
||||
console.log(`[定时任务][Browser] 提取到 ${items.length} 条公告`);
|
||||
|
||||
const record = {
|
||||
scraperId: scraper.id,
|
||||
|
||||
@@ -2,12 +2,12 @@ import 'dotenv/config';
|
||||
import express from 'express';
|
||||
import cors from 'cors';
|
||||
import Firecrawl from '@mendable/firecrawl-js';
|
||||
import { z } from 'zod';
|
||||
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
||||
import { fileURLToPath } from 'url';
|
||||
import { dirname, join } from 'path';
|
||||
import { sendCombinedReportEmail } from './emailService.js';
|
||||
import { initScheduler, runTaskNow, reloadScheduler, getSchedulerStatus } from './scheduler.js';
|
||||
import { runScraperWithBrowser } from './firecrawlBrowserScraper.js';
|
||||
|
||||
const app = express();
|
||||
const PORT = process.env.PORT || 5000;
|
||||
@@ -171,68 +171,11 @@ app.delete('/api/scrapers/:id', (req, res) => {
|
||||
|
||||
// ========== 统一抓取执行 ==========
|
||||
|
||||
// 公告抓取 Schema(result 包装数组)
|
||||
// Zod schema for agent extraction: { result: [{ title, amount, date, url }] }.
// The describe() strings are Chinese on purpose — they are sent to the
// extraction model as field instructions; do not translate them.
const announcementSchema = z.object({
  result: z.array(z.object({
    title: z.string().describe('公告标题'),
    amount: z.string().nullable().describe('项目金额(合同预估价/最高投标限价等),没有则为null'),
    date: z.string().describe('发布日期,YYYY-MM-DD格式'),
    url: z.string().describe('详情页完整URL,以https://开头'),
  })).describe('页面上提取到的所有公告条目'),
});
|
||||
|
||||
|
||||
/**
|
||||
* 从 Firecrawl agent 返回结果中提取 result 数组
|
||||
* 优先取 root.result,再回退数字键处理
|
||||
*/
|
||||
/**
 * Extract the announcement array from a Firecrawl agent response.
 *
 * Handles every shape the agent has been observed to return:
 * { data: { result: [...] } }, { result: [...] }, `result` serialized as a
 * numeric-keyed object, a bare array, and a top-level numeric-keyed object.
 *
 * @param {*} raw - Raw agent response.
 * @returns {Array} The item list, or [] when nothing array-like is found.
 */
function extractItems(raw) {
  if (!raw) return [];
  const root = (raw.data && typeof raw.data === 'object') ? raw.data : raw;

  // Most common: result is a proper array.
  if (Array.isArray(root.result)) return root.result;

  // result serialized as an object with numeric keys — rebuild in key order.
  if (root.result && typeof root.result === 'object') {
    const items = _numericKeyValues(root.result);
    if (items.length > 0) return items;
  }

  // The payload itself is an array.
  if (Array.isArray(root)) return root;

  // Last resort: the whole payload is a numeric-keyed object.
  return _numericKeyValues(root);
}

/** Values of numeric-looking keys of obj, ordered numerically by key. */
function _numericKeyValues(obj) {
  return Object.keys(obj)
    // Explicit radix and Number.isNaN instead of the coercing globals.
    .filter(k => !Number.isNaN(Number.parseInt(k, 10)))
    .sort((a, b) => Number.parseInt(a, 10) - Number.parseInt(b, 10))
    .map(k => obj[k]);
}
|
||||
|
||||
// 执行单个抓取来源并保存结果
|
||||
async function runScraper(scraper) {
|
||||
console.log(`[Agent] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`);
|
||||
const fullPrompt = `访问这个URL: ${scraper.url}
|
||||
【目标区域】:${scraper.section || ''} - ${scraper.subsection || ''}
|
||||
【公告类型】:${scraper.type || ''}
|
||||
|
||||
${scraper.prompt || '提取页面上今日的招标公告信息,包括:标题、项目金额(可能为合同预估价/最高投标限价等等)、发布日期(YYYY-MM-DD格式)、详情页完整URL'}
|
||||
|
||||
请严格按照定义的 JSON 格式返回,每条公告包含 title、amount、date、url 四个字段。`;
|
||||
console.log(fullPrompt, 'fullPrompt=======');
|
||||
|
||||
const result = await firecrawl.agent({
|
||||
prompt: fullPrompt,
|
||||
schema: announcementSchema,
|
||||
model: scraper.model || 'spark-1-mini',
|
||||
});
|
||||
|
||||
console.log('[Agent] 原始返回结果:', JSON.stringify(result).slice(0, 500));
|
||||
|
||||
const rawItems = extractItems(result);
|
||||
const items = rawItems.map(item => ({
|
||||
title: item.title || '',
|
||||
amount: item.amount || null,
|
||||
date: item.date || '',
|
||||
url: item.url || '',
|
||||
}));
|
||||
|
||||
console.log(`[Agent] 提取到 ${items.length} 条公告`);
|
||||
console.log(`[Browser] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`);
|
||||
const { items } = await runScraperWithBrowser(firecrawl, scraper, { logPrefix: '[Browser][API]' });
|
||||
console.log(`[Browser] 提取到 ${items.length} 条公告`);
|
||||
|
||||
const record = {
|
||||
scraperId: scraper.id,
|
||||
|
||||
Reference in New Issue
Block a user