From a2408fa9527d4e5fc8b425648d37a2294968fe04 Mon Sep 17 00:00:00 2001 From: zhaojunlong <5482498@qq.com> Date: Tue, 10 Mar 2026 11:36:35 +0800 Subject: [PATCH] =?UTF-8?q?```=20feat:=20=E5=88=87=E6=8D=A2=E5=88=B0Firecr?= =?UTF-8?q?awl=20Browser=20Sandbox=E5=B9=B6=E6=9B=B4=E6=96=B0API=E5=AF=86?= =?UTF-8?q?=E9=92=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 将抓取功能从Firecrawl Agent切换到Firecrawl Browser Sandbox - 更新.env文件中的FIRECRAWL_API_KEY为新密钥 - 修改前端界面文本,将"Firecrawl Agent"改为"Firecrawl Browser Sandbox" - 重构runScraper函数,添加按钮状态管理和滚动定位功能 - 移除zod验证schema,简化数据处理逻辑 - 更新定时任务调度器以使用新的浏览器抓取方式 - 清空results.json历史数据 ``` --- .env | 2 +- public/index.html | 20 +- results.json | 460 +-------------------------------- src/firecrawlBrowserScraper.js | 275 ++++++++++++++++++++ src/scheduler.js | 59 +---- src/server.js | 65 +---- 6 files changed, 301 insertions(+), 580 deletions(-) create mode 100644 src/firecrawlBrowserScraper.js diff --git a/.env b/.env index c8a4096..49f019f 100644 --- a/.env +++ b/.env @@ -2,4 +2,4 @@ PORT=5000 # Firecrawl API Key(在 https://www.firecrawl.dev/app/api-keys 获取) -FIRECRAWL_API_KEY=fc-354d1bbd965d482c977796ff534e15ca \ No newline at end of file +FIRECRAWL_API_KEY=fc-595dd922780442f8a907202666a522ef \ No newline at end of file diff --git a/public/index.html b/public/index.html index 487f1be..ca6735d 100644 --- a/public/index.html +++ b/public/index.html @@ -906,7 +906,7 @@ -

通过配置 URL 和提示词,使用 Firecrawl Agent +

通过配置 URL 和提示词,使用 Firecrawl Browser Sandbox 抓取任意网页数据。结果会自动保存,可在「抓取结果」页查看历史。

@@ -1057,7 +1057,7 @@
- +
@@ -1140,12 +1140,19 @@ } } - async function runScraper(id) { + async function runScraper(id, btnEl) { const item = scrapersList.find(s => s.id === id); const resultDiv = document.getElementById('scraperRunResult'); const contentDiv = document.getElementById('scraperRunResultContent'); resultDiv.style.display = 'block'; contentDiv.textContent = `正在测试抓取「${item?.city} - ${item?.type}」,请稍候...`; + resultDiv.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); + + const originalText = btnEl ? btnEl.textContent : ''; + if (btnEl) { + btnEl.disabled = true; + btnEl.textContent = '测试中...'; + } try { const res = await fetch(`/api/scrapers/${id}/run`, { method: 'POST' }); const json = await res.json(); @@ -1153,6 +1160,11 @@ contentDiv.textContent = JSON.stringify(json.data, null, 2); } catch (err) { contentDiv.textContent = '❌ 测试失败: ' + err.message; + } finally { + if (btnEl) { + btnEl.disabled = false; + btnEl.textContent = originalText || '测试'; + } } } @@ -1213,4 +1225,4 @@ - \ No newline at end of file + diff --git a/results.json b/results.json index e08dc47..0637a08 100644 --- a/results.json +++ b/results.json @@ -1,459 +1 @@ -[ - { - "scraperId": "scraper-1772762494299", - "city": "南京市", - "section": "房建市政", - "subsection": "工程类、服务类", - "type": "招标公告", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/buildService1.html", - "scrapedAt": "2026-03-09T03:19:39.057Z", - "data": { - "result": [ - { - "title": "【澄清公告】文化谷东路 (东吉大道-创新大道)、创新大道(研发二路-文化谷东路)一期道路建设项目 施工", - "amount": "3180", - "date": "2026-03-09", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260309/d8483e91-9a7b-4425-a860-c5c9b45365f0.html" - }, - { - "title": "【澄清公告】南京大学仙林校区学生宿舍楼第28-30幢 20KV变电所工程", - "amount": "528", - "date": "2026-03-09", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260309/5a14fb16-fbd1-44d0-9f3f-90823f3639dd.html" - } - ], - "total": 2 - }, - "id": "result-1773026379058-wd4gj" - }, - { - "scraperId": "scraper-1772762354799", - "city": "无锡市", - "section": "水利工程", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/jyxx/slgc/index.shtml", - "scrapedAt": "2026-03-06T06:57:46.881Z", - "data": { - "result": [ - { - "title": "高新区三级防控系统工程周三房浜闸站工程施工招标公告", - "amount": "5,923,797元", - "date": "2026-03-05", - "url": "http://ggzyjy.wuxi.gov.cn/doc/2026/03/05/4741071.shtml" - } - ], - "total": 1 - }, - "id": "result-1772780266881-odaof" - }, - { - "scraperId": "scraper-1772762354799", - "city": "无锡市", - "section": "水利工程", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/jyxx/slgc/index.shtml", - "scrapedAt": "2026-03-06T06:42:40.619Z", - "data": { - "result": [ - { - "title": "高新区三级防控系统工程周三房浜闸站工程施工招标公告", - "amount": "5923797元", - "date": "2026-03-05", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/03/05/4741071.shtml" - } - ], - "total": 1 - }, - "id": "result-1772779360620-xr7ue" - }, - { - "scraperId": "scraper-1772762354799", - "city": "无锡市", - "section": "水利工程", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/jyxx/slgc/index.shtml", - "scrapedAt": "2026-03-06T04:02:43.530Z", - "data": { - "items": [ - { - "title": "高新区三级防控系统工程周三房浜闸站工程施工招标公告", - "amount": "5923797元", - "date": "2026-03-05", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/03/05/4741071.shtml" - } - ], - "total": 1 - }, - "id": "result-1772769763530-3axw2" - }, - { - "scraperId": "scraper-1772762354799", - "city": "无锡市", - "section": "水利工程", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/jyxx/slgc/index.shtml", - "scrapedAt": "2026-03-06T02:51:39.452Z", - "error": "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value.", - "data": null, - "id": "result-1772765499452-ynhn0" - }, - { - "scraperId": "scraper-1772762494299", - "city": "南京市", - "section": "房建市政", - "subsection": "工程类", - "type": "招标公告", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/buildService1.html", - "scrapedAt": "2026-03-06T02:32:03.818Z", - "data": { - "success": true, - "status": "completed", - "data": { - "target_date": "2026-03-06", - "notice_count": 0, - "notices": [], - "message": "截至当前时间(2026-03-06 02:19),网站尚未发布今日(2026-03-06)的招标公告。最新公告日期为2026-03-05。", - "recent_notices_fallback": [ - { - "title": "麒麟科创园具身智能训练场装修项目", - "date": "2026-03-05", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/a20ee94f-b76e-4f88-b8df-2847c2f35ce1.html", - "amount": "5660000.00" - }, - { - "title": "站东13号(MCd080-07-08)地块10kV电力杆线迁改工程", - "date": "2026-03-05", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/f0b99840-e8de-4a08-b2ba-3e57a347864c.html", - "amount": "9543100.00" - }, - { - "title": "【澄清公告】螺丝桥大街北延(月安街至应天大街段)道路工程", - "date": "2026-03-05", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/1b3da624-fe86-4755-a268-a1967cd9d489.html", - "amount": "900万元" - }, - { - "title": "建邺路150-164号等9个地块城中村改造项目", - "date": "2026-03-05", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/6f4fcf2f-d198-4814-acd8-9817ef559a0c.html", - "amount": "1,900,000.00" - }, - { - "title": "【澄清公告】南京市溧水区柘塘街道供水管网及配套设施提升改造工程", - "date": "2026-03-05", - "url": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/11ec2263-4ed1-4115-bdd1-0a6dcbf1d6c1.html", - "amount": "11320.01万元" - } - ] - }, - "model": "spark-1-mini", - "expiresAt": "2026-03-07T02:32:00.316Z", - "creditsUsed": 0 - }, - "id": "result-1772764323818-mj8km" - }, - { - "scraperId": "scraper-1772762354799", - "city": "无锡市", - "section": "水利工程", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/jyxx/slgc/zbgg/index.shtml", - "scrapedAt": "2026-03-06T02:19:27.580Z", - "data": { - "success": true, - "status": "completed", - "data": [ - { - "title": "高新区三级防控系统工程周三房浜闸站工程施工招标公告", - "project_amount": "5,923,797元 (最高投标限价)", - "publish_date": "2026-03-05", - "detail_url": "http://ggzyjy.wuxi.gov.cn/doc/2026/03/05/4741071.shtml" - } - ], - "model": "spark-1-mini", - "expiresAt": "2026-03-07T02:19:24.631Z", - "creditsUsed": 0 - }, - "id": "result-1772763567581-ahz62" - }, - { - "scraperId": "scraper-1772699302521", - "city": "无锡市", - "section": "", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/jyxx/slgc/index.shtml", - "scrapedAt": "2026-03-05T10:05:46.148Z", - "data": { - "success": true, - "status": "completed", - "data": { - "announcements": [ - { - "title": "高新区三级防控系统工程周三房浜闸站工程施工招标公告", - "project_amount": "最高投标限价:5923797元", - "publish_date": "2026-03-05", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/03/05/4741071.shtml" - } - ] - }, - "model": "spark-1-mini", - "expiresAt": "2026-03-06T10:05:45.297Z", - "creditsUsed": 180 - }, - "id": "result-1772705146148-kn0ko" - }, - { - "scraperId": "scraper-1772699302521", - "city": "无锡市", - "section": "水利工程", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/index.shtml", - "scrapedAt": "2026-03-05T10:02:01.153Z", - "data": { - "success": true, - "status": "completed", - "data": [ - { - "标题": "高新区三级防控系统工程周三房浜闸站工程施工招标公告", - "项目金额": "5,923,797元", - "发布日期": "2026-03-05", - "详情页完整URL": "http://ggzyjy.wuxi.gov.cn/doc/2026/03/05/4741071.shtml" - } - ], - "model": "spark-1-mini", - "expiresAt": "2026-03-06T10:02:00.100Z", - "creditsUsed": 769 - }, - "id": "result-1772704921153-jx48m" - }, - { - "scraperId": "scraper-1772699302521", - "city": "无锡市", - "section": "水利工程", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/", - "scrapedAt": "2026-03-05T09:23:03.452Z", - "data": { - "success": true, - "status": "completed", - "data": { - "announcements": [ - { - "title": "高新区三级防控系统工程周三房浜闸站工程施工招标公告", - "amount": "5,923,797元", - "publish_date": "2026-03-05", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/03/05/4741071.shtml" - }, - { - "title": "[WXHS202603001-X01]惠山区紧密型县域医共体服务能力提标扩能建设项目(惠山区人民医院紧密型医共体资源共享中心建设项目)勘察设计", - "amount": "570.00万元", - "publish_date": "2026-03-05", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/03/05/4741246.shtml" - } - ] - }, - "model": "spark-1-mini", - "expiresAt": "2026-03-06T09:23:01.561Z", - "creditsUsed": 0 - }, - "id": "result-1772702583452-9t3b8" - }, - { - "scraperId": "scraper-1772699302521", - "city": "无锡市", - "section": "水利工程", - "subsection": "", - "type": "招标公告", - "url": "https://ggzyjy.wuxi.gov.cn/wxsggzyjyzxzl/", - "scrapedAt": "2026-03-05T08:39:45.736Z", - "data": { - "success": true, - "status": "completed", - "data": [ - { - "title": "[WXJY202601013-X01]江阴市长泾镇蒲市村区域性综合农事服务中心江阴市", - "amount": "874.0万元", - "date": "2026-01-30", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/01/30/4726538.shtml" - }, - { - "title": "[WXXS202406006-X02]中共锡山区委党校异地新建项目施工总承包", - "amount": "10350.0万元", - "date": "2026-01-30", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/01/30/4726721.shtml" - }, - { - "title": "[WXXQ202601010-X01]无锡交响音乐厅“一厅”及“两中心”品牌商户用房", - "amount": "400.0万元", - "date": "2026-01-30", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/01/30/4726619.shtml" - }, - { - "title": "[WXXQ202601008-X01]生命园三期2号楼、3号楼改造项目工程总承包", - "amount": "3650.0万元", - "date": "2026-01-30", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/01/30/4726675.shtml" - }, - { - "title": "[WXBH202601007-X01]军嶂山显山透绿工程-吴杨路郊野覆绿工程施工", - "amount": "440.0万元", - "date": "2026-01-30", - "url": "https://ggzyjy.wuxi.gov.cn/doc/2026/01/30/4726726.shtml" - } - ], - "model": "spark-1-mini", - "expiresAt": "2026-03-06T08:39:45.265Z", - "creditsUsed": 0 - }, - "id": "result-1772699985736-b3nca" - }, - { - "scraperId": "nj-jtsw-zbgg", - "city": "南京市", - "section": "房建市政", - "subsection": "工程类", - "type": "招标公告", - "url": "https://njggzy.nanjing.gov.cn/njweb/", - "scrapedAt": "2026-03-05T08:05:33.097Z", - "data": { - "success": true, - "status": "completed", - "data": { - "招标公告": [ - { - "标题": "【澄清公告】螺丝桥大街北延(月安街至应天大街段)道路工程 - 施工", - "项目金额": "900 万元", - "发布日期": "2026-03-05", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/1b3da624-fe86-4755-a268-a1967cd9d489.html" - }, - { - "标题": "建邺路150-164号等9个地块城中村改造项目 - 施工", - "项目金额": "190 万元", - "发布日期": "2026-03-05", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/6f4fcf2f-d198-4814-acd8-9817ef559a0c.html" - }, - { - "标题": "【澄清公告】南京市溧水区柘塘街道供水管网及配套设施提升改造工程 - 施工", - "项目金额": "11320.01 万元", - "发布日期": "2026-03-05", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/11ec2263-4ed1-4115-bdd1-0a6dcbf1d6c1.html" - }, - { - "标题": "栖霞区百水芊城春水坊等5个片区排水管网改造工程 - 施工", - "项目金额": "435.86 万元", - "发布日期": "2026-03-05", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260305/d69e5640-d549-4638-a64a-d1f9df58a903.html" - }, - { - "标题": "【澄清公告】兰桥八期保障性住房项目 - 新建居住区供配电工程", - "项目金额": "6000 万元", - "发布日期": "2026-03-04", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260304/33e25a55-42c4-471e-9a3c-f8e792957141.html" - }, - { - "标题": "青云巷10号危房整治工程 - SG1施工", - "项目金额": "375 万元", - "发布日期": "2026-03-04", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260304/e821f82c-39d8-479e-9457-b6bf5d101d80.html" - }, - { - "标题": "百水工业园地块保障房一期项目 - D地块1#楼(公安编号)室内装饰工程", - "项目金额": "600 万元", - "发布日期": "2026-03-04", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260304/5f8f2183-e26f-4c03-a76a-8b4d61b0011c.html" - }, - { - "标题": "青云巷10号危房整治工程 - SG1施工", - "项目金额": "375 万元", - "发布日期": "2026-03-04", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260304/9aa2d916-c0c3-4fb6-afa4-37457f0d2ceb.html" - }, - { - "标题": "【澄清公告】全国高校区域技术转移转化中心生物药物创新平台 - 施工", - "项目金额": "11000 万元", - "发布日期": "2026-03-03", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260303/2d1fe57f-fe0e-42f9-a99a-c345683aed3f.html" - }, - { - "标题": "轻质耐热合金制造基地项目 - 施工", - "项目金额": "11000 万元", - "发布日期": "2026-03-03", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001002/20260303/78b81308-1389-42fc-a8de-23b6b2b40be1.html" - }, - { - "标题": "【澄清公告】润埠花园二期项目 - 监理", - "项目金额": "111.37 万元", - "发布日期": "2026-03-05", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260305/acb0010f-dcbc-4ea4-a988-e4dc75670999.html" - }, - { - "标题": "轻质耐热合金制造基地项目 - 监理", - "项目金额": "188 万元", - "发布日期": "2026-03-04", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260304/93ee4804-5a5e-4524-92a3-b6c367803bd1.html" - }, - { - "标题": "【澄清公告】南京江北新区无人机制造共享工厂项目 - 监理", - "项目金额": "212.44 万元", - "发布日期": "2026-03-04", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260304/e44a1d28-0f43-494e-8daf-2f81252ed06a.html" - }, - { - "标题": "2026年四项环卫设施大中修项目 - 设计", - "项目金额": "25.58 万元", - "发布日期": "2026-03-03", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260303/225961f4-08c8-4398-99c9-7777bf0d16b7.html" - }, - { - "标题": "【澄清公告】南京市溧水区柘塘街道供水管网及配套设施提升改造工程 - 监理", - "项目金额": "164.33 万元", - "发布日期": "2026-03-03", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260303/a827d48e-8e1f-42c9-bd07-09ce369c20c6.html" - }, - { - "标题": "江苏银行金融科技中心建设项目 - 勘察", - "项目金额": "170 万元", - "发布日期": "2026-03-02", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260302/0ead5303-03db-4d95-b8ea-b32070a39dfa.html" - }, - { - "标题": "【澄清公告】南京高新区溧水园和凤园区改扩建项目 - 精诚电工地块及惠诚工具地块扩建厂房设计", - "项目金额": "140.68 万元", - "发布日期": "2026-03-02", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260302/d8df73f9-88d0-4f5d-8831-f9857a1a4ebc.html" - }, - { - "标题": "【澄清公告】NO.新区2025G11房地产开发项目 - 全过程工程咨询服务", - "项目金额": "950 万元", - "发布日期": "2026-03-02", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260302/348f6add-d17e-406d-9690-b637762175d7.html" - }, - { - "标题": "江苏省六合高级中学新建食堂体育馆项目 - 渣土运输处置", - "项目金额": "242.97917 万元", - "发布日期": "2026-02-28", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260228/2099a860-b3c2-411f-8580-72cbb55fef42.html" - }, - { - "标题": "【澄清公告】药谷产业区药谷大道(华宝路-汤盘公路)建设工程 - 勘察设计", - "项目金额": "194 万元", - "发布日期": "2026-02-28", - "详情页完整URL": "https://njggzy.nanjing.gov.cn/njweb/fjsz/068001/068001001/20260228/ffee9562-374d-43fd-8829-bf51c5b3cb46.html" - } - ] - }, - "model": "spark-1-mini", - "expiresAt": "2026-03-06T08:05:31.995Z", - "creditsUsed": 0 - }, - "id": "result-1772697933097-7hm4v" - } -] \ No newline at end of file +[] \ No newline at end of file diff --git a/src/firecrawlBrowserScraper.js b/src/firecrawlBrowserScraper.js new file mode 100644 index 0000000..e70e63b --- /dev/null +++ b/src/firecrawlBrowserScraper.js @@ -0,0 +1,275 @@ +const DEFAULT_SCRAPER_PROMPT = '提取页面上今日的招标公告信息,包括:标题、项目金额(可能为合同预估价/最高投标限价等)、发布日期(YYYY-MM-DD格式)、详情页完整URL'; +const PAYLOAD_MARKER = '__FC_PAYLOAD__'; + +function pad2(value) { + return String(value).padStart(2, '0'); +} + +function formatDate(year, month, day) { + return `${year}-${pad2(month)}-${pad2(day)}`; +} + +function getTodayInShanghai() { + return new Intl.DateTimeFormat('en-CA', { + timeZone: 'Asia/Shanghai', + year: 'numeric', + month: '2-digit', + day: '2-digit', + }).format(new Date()); +} + +function parseTargetDate(prompt) { + const text = String(prompt || ''); + if (!text) return null; + + const fullDate = text.match(/(20\d{2})[-/.年](\d{1,2})[-/.月](\d{1,2})日?/); + if (fullDate) { + return formatDate(fullDate[1], fullDate[2], fullDate[3]); + } + + if (/(今天|今日|当日)/.test(text)) { + return getTodayInShanghai(); + } + + return null; +} + +function normalizeDate(input) { + if (!input) return ''; + const text = String(input).trim(); + if (!text) return ''; + + let m = text.match(/(20\d{2})[-/.年](\d{1,2})[-/.月](\d{1,2})日?/); + if (m) return formatDate(m[1], m[2], m[3]); + + m = text.match(/(\d{1,2})[-/.月](\d{1,2})日?/); + if (m) { + const currentYear = Number(getTodayInShanghai().slice(0, 4)); + return formatDate(currentYear, m[1], m[2]); + } + + return ''; +} + +function extractDateFromText(text) { + if (!text) return ''; + const m = String(text).match(/(20\d{2}[-/.年]\d{1,2}[-/.月]\d{1,2}日?)|(\d{1,2}[-/.月]\d{1,2}日?)/); + return m ? normalizeDate(m[0]) : ''; +} + +function extractAmountFromText(text) { + if (!text) return null; + const m = String(text).match(/([0-9][0-9,.\s]*(?:亿元|万元|万|元))/); + if (!m) return null; + return m[1].replace(/\s+/g, '').trim(); +} + +function cleanText(text) { + return String(text || '').replace(/\s+/g, ' ').trim(); +} + +function toFiniteNumber(value, fallback) { + const n = Number(value); + return Number.isFinite(n) ? n : fallback; +} + +function parsePayloadFromText(rawText) { + if (!rawText) return null; + const text = String(rawText); + + const markerIndex = text.lastIndexOf(PAYLOAD_MARKER); + if (markerIndex >= 0) { + const tail = text.slice(markerIndex + PAYLOAD_MARKER.length); + const firstLine = tail.split(/\r?\n/).find(line => line.trim()); + if (firstLine) { + try { + return JSON.parse(firstLine.trim()); + } catch { + // Continue fallback parsing. + } + } + } + + try { + return JSON.parse(text.trim()); + } catch { + // Continue fallback parsing. + } + + const lines = text.split(/\r?\n/).map(line => line.trim()).filter(Boolean).reverse(); + for (const line of lines) { + try { + return JSON.parse(line); + } catch { + // Try next line. + } + } + + return null; +} + +function parseBrowserExecutePayload(executeResult) { + const sources = [executeResult?.result, executeResult?.stdout] + .filter(value => typeof value === 'string' && value.trim().length > 0); + + for (const source of sources) { + const payload = parsePayloadFromText(source); + if (payload && typeof payload === 'object') return payload; + } + + return { items: [] }; +} + +function splitKeywords(input) { + return String(input || '') + .split(/[、/,,|\s]+/) + .map(item => item.trim()) + .filter(item => item.length >= 2); +} + +function filterByTypeIfPossible(items, type) { + const keywords = splitKeywords(type); + if (keywords.length === 0) return items; + + const filtered = items.filter(item => { + const haystack = `${item.title} ${item.context || ''}`; + return keywords.some(keyword => haystack.includes(keyword)); + }); + + return filtered.length > 0 ? filtered : items; +} + +function normalizeItems(rawItems, targetDate, scraperType) { + const dedup = new Map(); + + for (const raw of rawItems) { + const title = cleanText(raw?.title); + const url = cleanText(raw?.url); + if (!title || !url) continue; + + const context = cleanText(raw?.context); + const date = normalizeDate(raw?.date) || extractDateFromText(context); + const amount = cleanText(raw?.amount) || extractAmountFromText(context) || null; + const key = `${title}@@${url}`; + + if (!dedup.has(key)) { + dedup.set(key, { title, amount, date, url, context }); + } + } + + let items = Array.from(dedup.values()); + items = filterByTypeIfPossible(items, scraperType); + + if (targetDate) { + items = items.filter(item => item.date === targetDate); + } + + return items + .map(({ title, amount, date, url }) => ({ title, amount, date, url })) + .slice(0, 100); +} + +function buildBrowserScript(url) { + return ` +const targetUrl = ${JSON.stringify(url)}; +await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 60000 }); +await page.waitForTimeout(1500); + +const payload = await page.evaluate(() => { + const normalize = (value) => String(value || '').replace(/\\s+/g, ' ').trim(); + const blockedTitles = new Set(['首页', '尾页', '上一页', '下一页', '更多', '详情', '查看', '返回', '跳转']); + + const links = Array.from(document.querySelectorAll('a[href]')); + const rows = []; + const seen = new Set(); + + for (const a of links) { + const href = a.getAttribute('href') || ''; + if (!href || href.startsWith('javascript:') || href.startsWith('#')) continue; + + const title = normalize(a.textContent); + if (!title || title.length < 6 || title.length > 180) continue; + if (blockedTitles.has(title)) continue; + + let absoluteUrl = ''; + try { + absoluteUrl = new URL(href, location.href).href; + } catch { + continue; + } + + const container = a.closest('tr,li,article,section,div,p,dd,dt') || a.parentElement; + const context = normalize(container ? container.textContent : title); + + const dateMatch = context.match(/(20\\d{2}[-/.年]\\d{1,2}[-/.月]\\d{1,2}日?)|(\\d{1,2}[-/.月]\\d{1,2}日?)/); + const amountMatch = context.match(/([0-9][0-9,.\\s]*(?:亿元|万元|万|元))/); + + const key = (title + '@@' + absoluteUrl).toLowerCase(); + if (seen.has(key)) continue; + seen.add(key); + + rows.push({ + title, + url: absoluteUrl, + date: dateMatch ? dateMatch[0] : '', + amount: amountMatch ? amountMatch[0].replace(/\\s+/g, '') : null, + context, + }); + } + + return { + pageUrl: location.href, + items: rows.slice(0, 300), + }; +}); + +console.log('${PAYLOAD_MARKER}' + JSON.stringify(payload)); +JSON.stringify(payload); +`; +} + +export async function runScraperWithBrowser(firecrawl, scraper, options = {}) { + const prefix = options.logPrefix || '[Browser]'; + if (!scraper?.url) throw new Error('抓取 URL 不能为空'); + + const prompt = scraper.prompt || DEFAULT_SCRAPER_PROMPT; + const targetDate = parseTargetDate(prompt); + + const ttl = toFiniteNumber(scraper.browserTtl, 180); + const activityTtl = toFiniteNumber(scraper.browserActivityTtl, 90); + + const session = await firecrawl.browser({ ttl, activityTtl }); + if (!session?.success || !session.id) { + throw new Error(session?.error || '创建 Browser 会话失败'); + } + + let executeResult; + try { + executeResult = await firecrawl.browserExecute(session.id, { + code: buildBrowserScript(scraper.url), + language: 'node', + }); + } finally { + try { + await firecrawl.deleteBrowser(session.id); + } catch (closeError) { + console.warn(`${prefix} 会话关闭失败: ${closeError.message}`); + } + } + + if (!executeResult?.success) { + throw new Error(executeResult?.error || executeResult?.stderr || 'Browser 执行失败'); + } + + const payload = parseBrowserExecutePayload(executeResult); + const rawItems = Array.isArray(payload.items) ? payload.items : []; + const items = normalizeItems(rawItems, targetDate, scraper.type); + + console.log(`${prefix} URL=${scraper.url} raw=${rawItems.length} normalized=${items.length}${targetDate ? ` targetDate=${targetDate}` : ''}`); + + return { + items, + targetDate, + pageUrl: payload.pageUrl || scraper.url, + }; +} diff --git a/src/scheduler.js b/src/scheduler.js index 1174f41..d780ada 100644 --- a/src/scheduler.js +++ b/src/scheduler.js @@ -4,8 +4,8 @@ import { readFileSync, writeFileSync, existsSync } from 'fs'; import { fileURLToPath } from 'url'; import { dirname, join } from 'path'; import Firecrawl from '@mendable/firecrawl-js'; -import { z } from 'zod'; import { sendScraperResultsEmail } from './emailService.js'; +import { runScraperWithBrowser } from './firecrawlBrowserScraper.js'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); @@ -48,63 +48,12 @@ function appendResult(result) { saveResults(results); } -// ========== 统一的公告抓取 Schema ========== - -// 公告抓取 Schema(result 包装数组) -const announcementSchema = z.object({ - result: z.array(z.object({ - title: z.string().describe('公告标题'), - amount: z.string().nullable().describe('项目金额(合同预估价/最高投标限价等),没有则为null'), - date: z.string().describe('发布日期,YYYY-MM-DD格式'), - url: z.string().describe('详情页完整URL,以https://开头'), - })).describe('页面上提取到的所有公告条目'), -}); - -/** 从 Firecrawl 返回结果中提取 result 数组 */ -function extractItems(raw) { - if (!raw) return []; - const root = (raw.data && typeof raw.data === 'object') ? raw.data : raw; - if (Array.isArray(root.result)) return root.result; - if (root.result && typeof root.result === 'object') { - const keys = Object.keys(root.result).filter(k => !isNaN(parseInt(k))); - if (keys.length > 0) return keys.sort((a, b) => parseInt(a) - parseInt(b)).map(k => root.result[k]); - } - if (Array.isArray(root)) return root; - const numericKeys = Object.keys(root).filter(k => !isNaN(parseInt(k))); - if (numericKeys.length > 0) return numericKeys.sort((a, b) => parseInt(a) - parseInt(b)).map(k => root[k]); - return []; -} - // ========== 抓取执行(复用 server.js 中 runScraper 的逻辑) ========== async function runScraper(scraper) { - console.log(`[定时任务] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`); - const fullPrompt = `访问这个URL: ${scraper.url} -【目标区域】:${scraper.section || ''} - ${scraper.subsection || ''} -【公告类型】:${scraper.type || ''} - -${scraper.prompt || '提取页面上今日的招标公告信息,包括:标题、项目金额(可能为合同预估价/最高投标限价等等)、发布日期(YYYY-MM-DD格式)、详情页完整URL'} - -请严格按照定义的 JSON 格式返回,每条公告包含 title、amount、date、url 四个字段。`; - - const result = await firecrawl.agent({ - prompt: fullPrompt, - schema: announcementSchema, - model: scraper.model || 'spark-1-mini', - }); - - console.log('[定时任务] 原始返回结果:', JSON.stringify(result).slice(0, 500)); - - // 标准化结果 - const rawItems = extractItems(result); - const items = rawItems.map(item => ({ - title: item.title || '', - amount: item.amount || null, - date: item.date || '', - url: item.url || '', - })); - - console.log(`[定时任务] 提取到 ${items.length} 条公告`); + console.log(`[定时任务][Browser] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`); + const { items } = await runScraperWithBrowser(firecrawl, scraper, { logPrefix: '[Browser][Scheduler]' }); + console.log(`[定时任务][Browser] 提取到 ${items.length} 条公告`); const record = { scraperId: scraper.id, diff --git a/src/server.js b/src/server.js index 64f64e5..c072413 100644 --- a/src/server.js +++ b/src/server.js @@ -2,12 +2,12 @@ import 'dotenv/config'; import express from 'express'; import cors from 'cors'; import Firecrawl from '@mendable/firecrawl-js'; -import { z } from 'zod'; import { readFileSync, writeFileSync, existsSync } from 'fs'; import { fileURLToPath } from 'url'; import { dirname, join } from 'path'; import { sendCombinedReportEmail } from './emailService.js'; import { initScheduler, runTaskNow, reloadScheduler, getSchedulerStatus } from './scheduler.js'; +import { runScraperWithBrowser } from './firecrawlBrowserScraper.js'; const app = express(); const PORT = process.env.PORT || 5000; @@ -171,68 +171,11 @@ app.delete('/api/scrapers/:id', (req, res) => { // ========== 统一抓取执行 ========== -// 公告抓取 Schema(result 包装数组) -const announcementSchema = z.object({ - result: z.array(z.object({ - title: z.string().describe('公告标题'), - amount: z.string().nullable().describe('项目金额(合同预估价/最高投标限价等),没有则为null'), - date: z.string().describe('发布日期,YYYY-MM-DD格式'), - url: z.string().describe('详情页完整URL,以https://开头'), - })).describe('页面上提取到的所有公告条目'), -}); - - -/** - * 从 Firecrawl agent 返回结果中提取 result 数组 - * 优先取 root.result,再回退数字键处理 - */ -function extractItems(raw) { - if (!raw) return []; - const root = (raw.data && typeof raw.data === 'object') ? raw.data : raw; - // 最优先:result 是真正数组 - if (Array.isArray(root.result)) return root.result; - // result 是数字键对象 - if (root.result && typeof root.result === 'object') { - const keys = Object.keys(root.result).filter(k => !isNaN(parseInt(k))); - if (keys.length > 0) return keys.sort((a, b) => parseInt(a) - parseInt(b)).map(k => root.result[k]); - } - // 如果 root 本身是数组 - if (Array.isArray(root)) return root; - // 顶层数字键回退 - const numericKeys = Object.keys(root).filter(k => !isNaN(parseInt(k))); - if (numericKeys.length > 0) return numericKeys.sort((a, b) => parseInt(a) - parseInt(b)).map(k => root[k]); - return []; -} - // 执行单个抓取来源并保存结果 async function runScraper(scraper) { - console.log(`[Agent] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`); - const fullPrompt = `访问这个URL: ${scraper.url} -【目标区域】:${scraper.section || ''} - ${scraper.subsection || ''} -【公告类型】:${scraper.type || ''} - -${scraper.prompt || '提取页面上今日的招标公告信息,包括:标题、项目金额(可能为合同预估价/最高投标限价等等)、发布日期(YYYY-MM-DD格式)、详情页完整URL'} - -请严格按照定义的 JSON 格式返回,每条公告包含 title、amount、date、url 四个字段。`; - console.log(fullPrompt, 'fullPrompt======='); - - const result = await firecrawl.agent({ - prompt: fullPrompt, - schema: announcementSchema, - model: scraper.model || 'spark-1-mini', - }); - - console.log('[Agent] 原始返回结果:', JSON.stringify(result).slice(0, 500)); - - const rawItems = extractItems(result); - const items = rawItems.map(item => ({ - title: item.title || '', - amount: item.amount || null, - date: item.date || '', - url: item.url || '', - })); - - console.log(`[Agent] 提取到 ${items.length} 条公告`); + console.log(`[Browser] ${scraper.city} - ${scraper.section} ${scraper.subsection} - ${scraper.type}:${scraper.url}`); + const { items } = await runScraperWithBrowser(firecrawl, scraper, { logPrefix: '[Browser][API]' }); + console.log(`[Browser] 提取到 ${items.length} 条公告`); const record = { scraperId: scraper.id,