const { chromium } = require("playwright-chromium");

// Shared browser and context. Created lazily by initHeadless() and released
// by closeScraper(); every scrape() call opens its pages on this context.
let browser;
let context;

// Lower-cased URL fragments. Any request (other than the main-frame
// navigation) whose URL contains one of them is aborted.
const BLOCK_LIST = [
  "google-analytics",
  "doubleclick",
  "facebook",
  "twitter",
  "adsystem",
  "analytics",
  "tracker",
  "pixel",
  "quantserve",
  "newrelic",
  "hotjar",
  "yandex",
  "ads",
  "widgets",
  "gravatar",
  "fonts.googleapis",
  "map",
  "cdn.ampproject.org",
  "googletagmanager"
];

// URL fragments of scripts that are still loaded when `blockScripts` is on.
const ALLOWED_SCRIPTS = [];

// Resource types that are always aborted, regardless of options.
const ALWAYS_BLOCKED_TYPES = new Set(["font", "stylesheet", "media", "manifest", "other"]);

// Image file extension at the end of the URL or right before a query/fragment.
const IMAGE_URL_PATTERN = /\.(jpg|jpeg|png|gif|webp|svg)(?:[?#]|$)/;

/**
 * Ensure the shared browser and context exist and are usable.
 *
 * Does nothing when a connected browser and a context are already in place.
 * Launches a new browser when there is none or it has disconnected, and
 * (re)creates the context whenever it is missing.
 *
 * @returns {Promise<void>}
 * @throws Re-throws whatever the launch or context creation throws, after logging it.
 */
async function initHeadless() {
  if (browser && browser.isConnected() && context) return;

  try {
    if (!browser || !browser.isConnected()) {
      browser = await chromium.launch({
        headless: true,
        args: [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--disable-dev-shm-usage",
          "--disable-gpu",
          "--disable-extensions",
          "--disable-background-networking",
          "--disable-sync",
          "--disable-translate",
          "--mute-audio",
          "--no-first-run",
          "--no-zygote",
          "--single-process",
          "--disable-software-rasterizer",
          "--disable-client-side-phishing-detection",
          "--no-default-browser-check",
          "--no-experiments"
        ]
      });
      // Any previous context belonged to the previous browser instance.
      context = null;
    }

    context = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
    });
  } catch (error) {
    console.error("Error al inicializar browser:", error);
    throw error;
  }
}

/**
 * Scroll the page to the bottom repeatedly until its height stops growing.
 *
 * Inside the page, every 20 ms it scrolls to `document.body.scrollHeight`;
 * it stops after five consecutive ticks with an unchanged height, or after
 * 10 seconds at the latest. Errors are logged and never propagated.
 *
 * @param {object} page - Playwright page to scroll.
 * @returns {Promise<void>}
 */
async function turboScroll(page) {
  try {
    // The callback is serialized and executed in the page, so it must be self-contained.
    await page.evaluate(
      () =>
        new Promise((resolve) => {
          let lastHeight = 0;
          let unchangedTicks = 0;

          const finish = () => {
            clearInterval(scrollTimer);
            clearTimeout(safetyTimer);
            resolve();
          };

          const scrollTimer = setInterval(() => {
            const height = document.body.scrollHeight;
            window.scrollTo(0, height);

            if (height !== lastHeight) {
              unchangedTicks = 0;
              lastHeight = height;
              return;
            }

            unchangedTicks++;
            if (unchangedTicks >= 5) finish();
          }, 20);

          // Safety timeout so endlessly growing pages cannot hang the scraper.
          const safetyTimer = setTimeout(finish, 10000);
        })
    );
  } catch (error) {
    // Scrolling is best-effort: log and let the caller continue.
    console.error("Error en turboScroll:", error.message);
  }
}

// Extract a printable message from anything that may have been thrown.
function describeError(error) {
  return error && typeof error.message === "string" ? error.message : String(error);
}

// True when the request is the navigation of the page's main frame.
function isMainFrameNavigation(page, request) {
  if (!request.isNavigationRequest()) return false;
  try {
    return request.frame() === page.mainFrame();
  } catch (frameError) {
    // request.frame() can throw when the frame is not available yet;
    // such a request is not the main-frame navigation of an existing page.
    return false;
  }
}

// Decide whether a sub-resource request must be aborted under the given options.
function shouldAbortRequest(rawUrl, type, { blockScripts, loadImages }) {
  const resUrl = rawUrl.toLowerCase();

  if (ALWAYS_BLOCKED_TYPES.has(type)) return true;

  if (blockScripts && type === "script" && !ALLOWED_SCRIPTS.some((k) => resUrl.includes(k))) {
    return true;
  }

  if (BLOCK_LIST.some((k) => resUrl.includes(k))) return true;

  return !loadImages && (type === "image" || IMAGE_URL_PATTERN.test(resUrl));
}

// Close a page if it is still open; failures are logged, never thrown.
async function closePageQuietly(page) {
  if (!page || page.isClosed()) return;
  try {
    await page.close();
  } catch (closeError) {
    console.error("Error al cerrar página:", closeError.message);
  }
}

/**
 * Open `url` in a fresh page, run `handler` on it and return its result
 * together with the list of requests the page issued.
 *
 * Fonts, stylesheets, media, manifests, "other" resources, block-listed URLs
 * and (optionally) scripts and images are aborted. The main-frame navigation
 * itself is never aborted, so a target URL that happens to contain a
 * block-listed fragment can still be loaded. Failed attempts are retried.
 *
 * @param {string} url - Address to navigate to.
 * @param {(page: object) => any} handler - Receives the Playwright page; its (awaited) value becomes `result`.
 * @param {object} [options]
 * @param {string} [options.waitUntil="domcontentloaded"] - Passed to `page.goto`.
 * @param {?string} [options.waitSelector=null] - Selector to wait for (at most 5 s); a miss only logs a warning.
 * @param {number} [options.timeout=15000] - Navigation timeout in milliseconds.
 * @param {boolean} [options.scrollToBottom=false] - Run turboScroll after loading.
 * @param {number} [options.renderWaitTime=0] - Extra wait in milliseconds before calling the handler.
 * @param {boolean} [options.loadImages=true] - When false, image requests are aborted.
 * @param {boolean} [options.blockScripts=true] - When true, scripts not matching ALLOWED_SCRIPTS are aborted.
 * @param {number} [options.retries=3] - Maximum number of attempts.
 * @param {number} [options.retryDelay=1000] - Base delay in milliseconds between attempts.
 * @returns {Promise<{result: any, requests: Array<{url: string, method: string, type: string}>}>}
 * @throws The error of the last failed attempt, or a generic Error when no attempt produced one.
 */
async function scrape(url, handler, options = {}) {
  const {
    waitUntil = "domcontentloaded",
    waitSelector = null,
    timeout = 15000,
    scrollToBottom = false,
    renderWaitTime = 0,
    loadImages = true,
    blockScripts = true,
    retries = 3,
    retryDelay = 1000
  } = options;

  let lastError = null;

  for (let attempt = 1; attempt <= retries; attempt++) {
    let page = null;

    try {
      // No-op when the shared browser and context are already usable.
      await initHeadless();

      page = await context.newPage();
      const activePage = page;
      const requests = [];

      // Record every request the page issues, blocked or not.
      page.on("request", (req) => {
        requests.push({ url: req.url(), method: req.method(), type: req.resourceType() });
      });

      // Abort unwanted resources; let everything else through.
      await page.route("**/*", async (route) => {
        const req = route.request();
        try {
          const blocked =
            !isMainFrameNavigation(activePage, req) &&
            shouldAbortRequest(req.url(), req.resourceType(), { blockScripts, loadImages });

          if (blocked) {
            await route.abort("blockedbyclient");
          } else {
            await route.continue();
          }
        } catch (routeError) {
          // Resolving a route fails once the page is gone; that is expected
          // during teardown and must not surface as an unhandled rejection.
          if (!activePage.isClosed()) {
            console.warn("Error al resolver request:", describeError(routeError));
          }
        }
      });

      await page.goto(url, { waitUntil, timeout });

      // Optional selector wait; a miss is not fatal.
      if (waitSelector) {
        try {
          await page.waitForSelector(waitSelector, { timeout: Math.min(timeout, 5000) });
        } catch (selectorError) {
          console.warn(`Selector '${waitSelector}' no encontrado, continuando...`);
        }
      }

      if (scrollToBottom) {
        await turboScroll(page);
      }

      // Extra time for client-side rendering.
      if (renderWaitTime > 0) {
        await page.waitForTimeout(renderWaitTime);
      }

      const result = await handler(page);
      return { result, requests };
    } catch (error) {
      lastError = error;
      const message = describeError(error);
      console.error(
        `[Intento ${attempt}/${retries}] Error durante el scraping de ${url}:`,
        message
      );

      // Release the page before waiting for the next attempt.
      await closePageQuietly(page);

      // A "closed" error means the page, context or browser went away:
      // drop the shared instances so the next attempt starts clean.
      if (message.includes("closed")) {
        console.log("Browser cerrado detectado, reiniciando...");
        await closeScraper();
      }

      if (attempt < retries) {
        // Linear backoff: the delay grows by `retryDelay` on each attempt.
        const delay = retryDelay * attempt;
        console.log(`Reintentando en ${delay}ms...`);
        await new Promise((r) => setTimeout(r, delay));
      }
    } finally {
      // Also covers the success path; a failing close never discards the result.
      await closePageQuietly(page);
    }
  }

  // Every attempt failed.
  console.error(`Todos los intentos fallaron para ${url}`);
  throw lastError || new Error("Scraping failed after all retries");
}

/**
 * Close the shared context and browser.
 *
 * The module-level references are cleared before closing, so they never keep
 * pointing at a half-closed instance. Close errors are logged, not thrown.
 *
 * @returns {Promise<void>}
 */
async function closeScraper() {
  const oldContext = context;
  const oldBrowser = browser;
  context = null;
  browser = null;

  try {
    if (oldContext) {
      await oldContext.close();
    }
  } catch (error) {
    console.error("Error cerrando context:", error.message);
  }

  try {
    if (oldBrowser) {
      await oldBrowser.close();
    }
  } catch (error) {
    console.error("Error cerrando browser:", error.message);
  }
}

module.exports = { initHeadless, scrape, closeScraper };