const { chromium } = require("playwright-chromium");

// Shared browser/context reused across scrape() calls; both are set together
// by initHeadless() and cleared by closeScraper().
let browser;
let context;

// In-flight initialisation, shared by concurrent initHeadless() callers so
// only one Chromium instance is ever launched.
let initPromise = null;

// Any request whose lower-cased URL contains one of these substrings is aborted.
const BLOCK_LIST = [
  "google-analytics",
  "doubleclick",
  "facebook",
  "twitter",
  "adsystem",
  "analytics",
  "tracker",
  "pixel",
  "quantserve",
  "newrelic",
  "hotjar",
  "yandex",
  "ads",
  "widgets",
  "gravatar",
  "fonts.googleapis",
  "map",
  "cdn.ampproject.org",
  "googletagmanager"
];

// URL substrings of scripts that are still allowed when blockScripts is on.
const ALLOWED_SCRIPTS = [];

// Resource types that are aborted regardless of the scrape options.
const ALWAYS_BLOCKED_TYPES = new Set([
  "font",
  "stylesheet",
  "media",
  "manifest",
  "other"
]);

// Image file extension at the end of the path, optionally followed by a
// query string or fragment (e.g. "photo.jpg?v=3").
const IMAGE_URL_PATTERN = /\.(jpg|jpeg|png|gif|webp|svg)(?:[?#]|$)/;

/**
 * Launches Chromium and creates the shared context, publishing both module
 * variables only once both exist.
 *
 * @returns {Promise<void>}
 */
async function launchBrowser() {
  const launched = await chromium.launch({
    headless: true,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-gpu",
      "--disable-extensions",
      "--disable-background-networking",
      "--disable-sync",
      "--disable-translate",
      "--mute-audio",
      "--no-first-run",
      "--no-zygote",
      "--single-process",
      "--disable-software-rasterizer",
      "--disable-client-side-phishing-detection",
      "--no-default-browser-check",
      "--no-experiments"
    ]
  });

  let createdContext;
  try {
    createdContext = await launched.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
    });
  } catch (error) {
    // Do not leave an orphaned browser process behind if the context fails.
    await launched.close().catch(() => {});
    throw error;
  }

  browser = launched;
  context = createdContext;
}

/**
 * Ensures the shared headless browser and context exist. Safe to call
 * repeatedly and concurrently: overlapping calls wait on the same launch.
 *
 * @returns {Promise<void>}
 */
async function initHeadless() {
  if (browser && context) return;

  if (!initPromise) {
    initPromise = launchBrowser().then(
      () => {
        initPromise = null;
      },
      (error) => {
        // Clear the memo so a later call can retry the launch.
        initPromise = null;
        throw error;
      }
    );
  }

  await initPromise;
}

/**
 * Repeatedly scrolls the page to the bottom every 20 ms until the scroll
 * height has been unchanged for 5 consecutive ticks, or until maxDuration
 * elapses (so endlessly growing pages cannot hang the caller).
 *
 * @param {import("playwright-chromium").Page} page - Page to scroll.
 * @param {number} [maxDuration=30000] - Upper bound for scrolling, in ms.
 * @returns {Promise<void>}
 */
async function turboScroll(page, maxDuration = 30000) {
  await page.evaluate((maxMs) => {
    return new Promise((resolve) => {
      const startedAt = Date.now();
      let lastHeight = 0;
      let stableTicks = 0;

      const timer = setInterval(() => {
        const root = document.body || document.documentElement;
        const height = root.scrollHeight;
        window.scrollTo(0, height);

        if (height === lastHeight) {
          stableTicks++;
        } else {
          stableTicks = 0;
          lastHeight = height;
        }

        if (stableTicks >= 5 || Date.now() - startedAt >= maxMs) {
          clearInterval(timer);
          resolve();
        }
      }, 20);
    });
  }, maxDuration);
}

/**
 * Decides whether a request should be aborted.
 *
 * @param {string} type - Playwright resource type of the request.
 * @param {string} url - Request URL.
 * @param {{ blockScripts: boolean, loadImages: boolean }} flags - Scrape options.
 * @returns {boolean} true when the request must be aborted.
 */
function shouldBlockRequest(type, url, { blockScripts, loadImages }) {
  const resUrl = url.toLowerCase();

  if (ALWAYS_BLOCKED_TYPES.has(type)) return true;

  if (
    blockScripts &&
    type === "script" &&
    !ALLOWED_SCRIPTS.some((k) => resUrl.includes(k))
  ) {
    return true;
  }

  if (BLOCK_LIST.some((k) => resUrl.includes(k))) return true;

  if (!loadImages && (type === "image" || IMAGE_URL_PATTERN.test(resUrl))) {
    return true;
  }

  return false;
}

/**
 * Opens a new page in the shared context, navigates to `url` with resource
 * blocking applied, runs `handler(page)` and closes the page.
 *
 * @param {string} url - Address to navigate to.
 * @param {(page: import("playwright-chromium").Page) => any} handler -
 *   Callback that extracts data from the loaded page; its (awaited) return
 *   value becomes `result`.
 * @param {object} [options]
 * @param {string} [options.waitUntil="domcontentloaded"] - Passed to page.goto.
 * @param {?string} [options.waitSelector=null] - Selector to wait for after
 *   navigation; failure to find it is ignored.
 * @param {number} [options.timeout=10000] - Timeout (ms) for navigation and
 *   for the selector wait.
 * @param {boolean} [options.scrollToBottom=false] - Scroll to the bottom
 *   before running the handler.
 * @param {number} [options.scrollTimeout=30000] - Upper bound (ms) for the
 *   scrolling phase.
 * @param {number} [options.renderWaitTime=0] - Extra delay (ms) before the
 *   handler runs.
 * @param {boolean} [options.loadImages=true] - When false, image requests
 *   are aborted.
 * @param {boolean} [options.blockScripts=true] - When true, script requests
 *   not matching ALLOWED_SCRIPTS are aborted.
 * @returns {Promise<?{ result: any, requests: Array<{ url: string, method: string, type: string }> }>}
 *   The handler result plus every request the page issued, or null if
 *   navigation or the handler threw (the error is logged).
 */
async function scrape(url, handler, options = {}) {
  const {
    waitUntil = "domcontentloaded",
    waitSelector = null,
    timeout = 10000,
    scrollToBottom = false,
    scrollTimeout = 30000,
    renderWaitTime = 0,
    loadImages = true,
    blockScripts = true
  } = options;

  await initHeadless();

  const page = await context.newPage();
  const requests = [];

  page.on("request", (req) => {
    requests.push({
      url: req.url(),
      method: req.method(),
      type: req.resourceType()
    });
  });

  await page.route("**/*", (route) => {
    const req = route.request();

    // Never abort the navigation of the page being scraped: BLOCK_LIST uses
    // broad substrings ("ads", "map", ...) that legitimate target URLs
    // such as "/downloads" or "/sitemap" would otherwise match.
    const isMainNavigation =
      req.isNavigationRequest() && req.frame() === page.mainFrame();

    const blocked =
      !isMainNavigation &&
      shouldBlockRequest(req.resourceType(), req.url(), {
        blockScripts,
        loadImages
      });

    const settled = blocked ? route.abort("blockedbyclient") : route.continue();

    // The page may be closed while a request is still in flight; without
    // this catch the rejection would surface as an unhandled rejection.
    return settled.catch(() => {});
  });

  try {
    await page.goto(url, { waitUntil, timeout });

    if (waitSelector) {
      try {
        await page.waitForSelector(waitSelector, { timeout });
      } catch (e) {
        // Deliberate best effort: a missing selector must not abort the
        // scrape, the handler still runs against whatever was rendered.
      }
    }

    if (scrollToBottom) {
      await turboScroll(page, scrollTimeout);
    }

    if (renderWaitTime > 0) {
      await new Promise((r) => setTimeout(r, renderWaitTime));
    }

    const result = await handler(page);
    return { result, requests };
  } catch (error) {
    console.error(`Error durante el scraping de ${url}:`, error);
    return null;
  } finally {
    await page.close();
  }
}

/**
 * Closes the shared context and browser and resets module state so that a
 * later initHeadless()/scrape() call starts a fresh browser.
 *
 * @returns {Promise<void>}
 */
async function closeScraper() {
  const openContext = context;
  const openBrowser = browser;

  // Reset first so the module is reusable even if closing throws.
  context = null;
  browser = null;

  try {
    if (openContext) await openContext.close();
  } finally {
    // Always attempt to terminate the browser process, even when closing
    // the context failed.
    if (openBrowser) await openBrowser.close();
  }
}

module.exports = { initHeadless, scrape, closeScraper };