From 79f1733aef2f19cff8a3a606f78f298750409c5c Mon Sep 17 00:00:00 2001 From: lenafx Date: Thu, 11 Dec 2025 19:03:07 +0100 Subject: [PATCH] added requests to scraper --- src/shared/headless.js | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/shared/headless.js b/src/shared/headless.js index 0d1c1fe..9d23cfb 100644 --- a/src/shared/headless.js +++ b/src/shared/headless.js @@ -64,18 +64,26 @@ async function scrape(url, handler, options = {}) { } = options; if (!browser) await initHeadless(); const page = await context.newPage(); + let collectedRequests = []; await page.route("**/*", (route) => { const req = route.request(); - const url = req.url().toLowerCase(); + const rUrl = req.url().toLowerCase(); const type = req.resourceType(); - if ( - type === "font" || - type === "media" || - type === "manifest" - ) return route.abort(); - if (BLOCK_LIST.some(k => url.includes(k))) return route.abort(); + + collectedRequests.push({ + url: req.url(), + method: req.method(), + resourceType: type + }); + + if (type === "font" || type === "media" || type === "manifest") + return route.abort(); + + if (BLOCK_LIST.some(k => rUrl.includes(k))) + return route.abort(); + if (!loadImages && ( - type === "image" || url.match(/\.(jpg|jpeg|png|gif|webp|svg)$/) + type === "image" || rUrl.match(/\.(jpg|jpeg|png|gif|webp|svg)$/) )) return route.abort(); route.continue(); }); @@ -93,8 +101,10 @@ async function scrape(url, handler, options = {}) { } const result = await handler(page); await page.close(); - return { result , "": ""}; + + return { result, requests: collectedRequests }; } + async function closeScraper() { if (context) await context.close(); if (browser) await browser.close();