const { chromium } = require("playwright-chromium");

/** Shared browser instance, created lazily by initHeadless(). */
let browser;

/** In-flight launch promise, so concurrent callers share a single launch. */
let launching = null;

/**
 * Substrings matched against the lower-cased request URL; any request whose
 * URL contains one of them is aborted.
 */
const BLOCK_LIST = [
  "google-analytics",
  "doubleclick",
  "facebook",
  "twitter",
  "adsystem",
  "analytics",
  "tracker",
  "pixel",
  "quantserve",
  "newrelic",
];

/**
 * Matches an image file extension at the end of the URL or directly before a
 * query string / fragment, so "photo.jpg?v=2" is recognised as well.
 */
const IMAGE_URL_PATTERN = /\.(jpg|jpeg|png|gif|webp|svg)(?:[?#]|$)/;

/**
 * Launches the shared headless Chromium instance if it is not running yet.
 * Calls made while a launch is already in flight wait for that same launch
 * instead of starting a second browser.
 *
 * @returns {Promise<void>}
 */
async function initHeadless() {
  if (browser) return;

  if (!launching) {
    launching = chromium.launch({
      headless: true,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-extensions",
        "--disable-background-networking",
        "--disable-sync",
        "--disable-translate",
        "--mute-audio",
        "--no-first-run",
        "--no-zygote",
      ],
    });
  }

  try {
    browser = await launching;
  } finally {
    // Reset on success and on failure so a failed launch can be retried.
    launching = null;
  }
}

/**
 * Scrolls the page to the bottom every 20 ms until the scroll height has been
 * unchanged for five consecutive ticks, or until `maxDuration` has elapsed.
 *
 * @param {object} page - Playwright page to scroll.
 * @param {number} [maxDuration=30000] - Upper bound in milliseconds; keeps
 *   pages whose height keeps growing from blocking forever.
 * @returns {Promise<void>}
 */
async function turboScroll(page, maxDuration = 30000) {
  await page.evaluate((maxMs) => {
    return new Promise((resolve) => {
      const startedAt = Date.now();
      let last = 0;
      let same = 0;
      const timer = setInterval(() => {
        // document.body can be null (e.g. non-HTML documents); an exception
        // here would leave the promise pending forever.
        const root = document.body || document.documentElement;
        const h = root.scrollHeight;
        window.scrollTo(0, h);
        if (h === last) {
          same++;
        } else {
          same = 0;
          last = h;
        }
        if (same >= 5 || Date.now() - startedAt >= maxMs) {
          clearInterval(timer);
          resolve();
        }
      }, 20);
    });
  }, maxDuration);
}

/**
 * Reports whether `request` is the navigation of the page's main frame.
 *
 * @param {object} page - Playwright page that owns the route.
 * @param {object} request - Playwright request being routed.
 * @returns {boolean}
 */
function isMainFrameNavigation(page, request) {
  if (!request.isNavigationRequest()) return false;
  try {
    return request.frame() === page.mainFrame();
  } catch {
    // Playwright documents that frame() can throw when the frame is not
    // available; treat that as "not the main frame" and filter as usual.
    return false;
  }
}

/**
 * Opens `url` in a fresh browser context, runs `handler` against the page and
 * returns its result together with every request seen by the route filter.
 *
 * Fonts, manifests and URLs containing a BLOCK_LIST entry are aborted; images
 * are aborted as well when `loadImages` is false. The main-frame navigation
 * itself is never filtered, so a target URL that happens to contain a
 * BLOCK_LIST substring can still be loaded.
 *
 * @param {string} url - Address to navigate to.
 * @param {(page: object) => any} handler - Called with the Playwright page
 *   once loading, waiting and scrolling are done; its (awaited) value becomes
 *   `result`.
 * @param {object} [options]
 * @param {string} [options.waitUntil="domcontentloaded"] - Passed to page.goto.
 * @param {?string} [options.waitSelector=null] - Selector to wait for after
 *   navigation (best effort; a failed wait is ignored).
 * @param {number} [options.timeout=10000] - Timeout in ms for navigation and
 *   for the selector wait.
 * @param {boolean} [options.scrollToBottom=false] - Scroll to the bottom
 *   before calling the handler.
 * @param {number} [options.renderWaitTime=0] - Extra delay in ms before the
 *   handler runs.
 * @param {boolean} [options.loadImages=true] - When false, image requests are
 *   aborted.
 * @param {number} [options.scrollTimeout=30000] - Upper bound in ms for the
 *   scroll-to-bottom phase.
 * @returns {Promise<{result: any, requests: Array<{url: string, method: string, resourceType: string}>}>}
 */
async function scrape(url, handler, options = {}) {
  const {
    waitUntil = "domcontentloaded",
    waitSelector = null,
    timeout = 10000,
    scrollToBottom = false,
    renderWaitTime = 0,
    loadImages = true,
    scrollTimeout = 30000,
  } = options;

  if (!browser) await initHeadless();

  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36",
  });

  try {
    const page = await context.newPage();
    const collectedRequests = [];

    await page.route("**/*", (route) => {
      const req = route.request();
      const rUrl = req.url().toLowerCase();
      const type = req.resourceType();

      collectedRequests.push({
        url: req.url(),
        method: req.method(),
        resourceType: type,
      });

      // Never filter the page we were asked to load.
      if (isMainFrameNavigation(page, req)) return route.continue();

      if (type === "font" || type === "manifest") return route.abort();
      if (BLOCK_LIST.some((k) => rUrl.includes(k))) return route.abort();
      if (!loadImages && (type === "image" || IMAGE_URL_PATTERN.test(rUrl))) {
        return route.abort();
      }

      // Returned so the promise is not left floating.
      return route.continue();
    });

    await page.addInitScript(() => {
      Object.defineProperty(navigator, "webdriver", { get: () => false });
    });

    await page.goto(url, { waitUntil, timeout });

    if (waitSelector) {
      try {
        await page.waitForSelector(waitSelector, { timeout });
      } catch {
        // Best effort: the handler still runs if the selector never shows up.
      }
    }

    if (scrollToBottom) {
      await turboScroll(page, scrollTimeout);
    }

    if (renderWaitTime > 0) {
      await new Promise((r) => setTimeout(r, renderWaitTime));
    }

    const result = await handler(page);
    return { result, requests: collectedRequests };
  } finally {
    // Runs on success and on failure; closing the context also closes its
    // pages, so nothing leaks when navigation or the handler throws.
    await context.close();
  }
}

/**
 * Closes the shared browser, if one is running. The cached reference is
 * cleared before closing so a failed close does not leave a dead handle
 * behind for the next scrape() call.
 *
 * @returns {Promise<void>}
 */
async function closeScraper() {
  if (!browser) return;
  const closing = browser;
  browser = null;
  await closing.close();
}

module.exports = { initHeadless, scrape, closeScraper };