// File listing metadata: 238 lines, 7.3 KiB, JavaScript.
const { chromium } = require("playwright-chromium");
|
|
|
|
// Shared Playwright handles. They are (re)assigned by initHeadless() and
// cleared by closeScraper(), so they must stay `let`.
let browser;
let context;

// Lower-case substrings; scrape() aborts any request whose lower-cased URL
// contains one of them.
// NOTE(review): matching is by plain substring, so short entries such as
// "ads", "map" and "pixel" also match unrelated URLs ("downloads", "sitemap").
const BLOCK_LIST = [
  "google-analytics",
  "doubleclick",
  "facebook",
  "twitter",
  "adsystem",
  "analytics",
  "tracker",
  "pixel",
  "quantserve",
  "newrelic",
  "hotjar",
  "yandex",
  "ads",
  "widgets",
  "gravatar",
  "fonts.googleapis",
  "map",
  "cdn.ampproject.org",
  "googletagmanager"
];

// Substrings of script URLs that may load even when script blocking is on.
// Empty here, so with blocking enabled every script request is aborted.
const ALLOWED_SCRIPTS = [];
|
|
|
|
/**
 * Makes sure the shared headless Chromium browser and its browsing context
 * are available, launching or recreating whichever one is missing.
 *
 * - If the browser is connected and a context exists, nothing happens.
 * - If the browser is connected but the context is missing, only a new
 *   context is created (the browser is reused).
 * - Otherwise a new browser is launched and a context is created on it.
 *
 * @returns {Promise<void>}
 * @throws Rethrows whatever `chromium.launch` or `browser.newContext` throws.
 */
async function initHeadless() {
  const browserAlive = Boolean(browser && browser.isConnected());

  // Both handles are usable: nothing to do.
  if (browserAlive && context) return;

  let launchedHere = false;

  try {
    if (!browserAlive) {
      browser = await chromium.launch({
        headless: true,
        args: [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--disable-dev-shm-usage",
          "--disable-gpu",
          "--disable-extensions",
          "--disable-background-networking",
          "--disable-sync",
          "--disable-translate",
          "--mute-audio",
          "--no-first-run",
          "--no-zygote",
          "--single-process",
          "--disable-software-rasterizer",
          "--disable-client-side-phishing-detection",
          "--no-default-browser-check",
          "--no-experiments"
        ]
      });
      launchedHere = true;
    }

    context = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
    });
  } catch (error) {
    console.error("Error al inicializar browser:", error);

    if (!browserAlive) {
      // Any context we still hold belonged to a browser that is gone.
      context = null;
    }

    if (launchedHere && browser) {
      // The browser started but could not provide a context: close it so the
      // process is not leaked and the next call starts from scratch.
      try {
        await browser.close();
      } catch (closeError) {
        console.error("Error cerrando browser:", closeError.message);
      }
      browser = null;
    }

    throw error;
  }
}
|
|
|
|
/**
 * Scrolls the page to the bottom repeatedly until its height stops growing.
 *
 * Inside the page, every 20 ms the document is scrolled to its current
 * scroll height. Scrolling ends once the height has been unchanged for five
 * consecutive ticks, or after a 10 s safety limit, whichever comes first.
 *
 * Failures are logged and swallowed on purpose: scrolling is best-effort and
 * must not abort the caller.
 *
 * @param {object} page - Object exposing `evaluate(fn)`, which runs `fn` in
 *   the page and resolves with its result.
 * @returns {Promise<void>}
 */
async function turboScroll(page) {
  try {
    await page.evaluate(() => {
      return new Promise((resolve) => {
        let lastHeight = 0;
        let stableTicks = 0;
        let pollTimer = null;
        let safetyTimer = null;

        // Single exit point: stops BOTH timers so nothing keeps running in
        // the page after the promise has settled.
        const finish = () => {
          clearInterval(pollTimer);
          clearTimeout(safetyTimer);
          resolve();
        };

        pollTimer = setInterval(() => {
          // Documents without a <body> fall back to the root element
          // instead of throwing on every tick.
          const root = document.body || document.documentElement;
          if (!root) {
            finish();
            return;
          }

          const height = root.scrollHeight;
          window.scrollTo(0, height);

          if (height !== lastHeight) {
            // Page grew: restart the stability count.
            stableTicks = 0;
            lastHeight = height;
            return;
          }

          stableTicks++;
          if (stableTicks >= 5) {
            finish();
          }
        }, 20);

        // Safety timeout for pages that never stop growing.
        safetyTimer = setTimeout(finish, 10000);
      });
    });
  } catch (error) {
    console.error("Error en turboScroll:", error.message);
    // Deliberately not rethrown: the caller continues without scrolling.
  }
}
|
|
|
|
// Resource types that are never needed for data extraction and are always aborted.
const ALWAYS_BLOCKED_RESOURCE_TYPES = ["font", "stylesheet", "media", "manifest", "other"];

// Image file extension at the end of the path, optionally followed by a
// query string or fragment (e.g. "/logo.png?v=2").
const IMAGE_URL_PATTERN = /\.(jpg|jpeg|png|gif|webp|svg)(?:[?#]|$)/;

// Decides whether a sub-resource request should be aborted, given its
// lower-cased URL and its resource type.
function shouldAbortRequest(resourceUrl, resourceType, { blockScripts, loadImages }) {
  if (ALWAYS_BLOCKED_RESOURCE_TYPES.includes(resourceType)) return true;

  if (
    blockScripts &&
    resourceType === "script" &&
    !ALLOWED_SCRIPTS.some((allowed) => resourceUrl.includes(allowed))
  ) {
    return true;
  }

  if (BLOCK_LIST.some((blocked) => resourceUrl.includes(blocked))) return true;

  return !loadImages && (resourceType === "image" || IMAGE_URL_PATTERN.test(resourceUrl));
}

// Closes a page if it is still open; close failures are logged, never thrown.
async function closePageQuietly(page) {
  if (!page || page.isClosed()) return;

  try {
    await page.close();
  } catch (closeError) {
    console.error("Error al cerrar página:", closeError.message);
  }
}

/**
 * Opens `url` in a new page of the shared context, filters network traffic,
 * runs `handler(page)` and returns its result together with the list of
 * requests the page issued. Failed attempts are retried.
 *
 * @param {string} url - Address to navigate to.
 * @param {(page: object) => any} handler - Receives the loaded page; its
 *   (awaited) return value becomes `result`.
 * @param {object} [options]
 * @param {string} [options.waitUntil="domcontentloaded"] - Passed to `page.goto`.
 * @param {?string} [options.waitSelector=null] - Selector to wait for after
 *   navigation; a miss is only logged as a warning.
 * @param {number} [options.timeout=15000] - Navigation timeout in ms; the
 *   selector wait uses at most 5000 ms of it.
 * @param {boolean} [options.scrollToBottom=false] - Run turboScroll first.
 * @param {number} [options.renderWaitTime=0] - Extra ms to wait before the handler.
 * @param {boolean} [options.loadImages=true] - When false, image requests are aborted.
 * @param {boolean} [options.blockScripts=true] - Abort scripts not in ALLOWED_SCRIPTS.
 * @param {number} [options.retries=3] - Total number of attempts; values
 *   below 1 are treated as 1 so the scrape is always tried once.
 * @param {number} [options.retryDelay=1000] - Base delay in ms; attempt N is
 *   followed by a wait of `retryDelay * N`.
 * @returns {Promise<{result: any, requests: Array<{url: string, method: string, type: string}>}>}
 * @throws The error of the last failed attempt once all attempts are used up.
 */
async function scrape(url, handler, options = {}) {
  const {
    waitUntil = "domcontentloaded",
    waitSelector = null,
    timeout = 15000,
    scrollToBottom = false,
    renderWaitTime = 0,
    loadImages = true,
    blockScripts = true,
    retries = 3,
    retryDelay = 1000
  } = options;

  const maxAttempts = Math.max(1, Number(retries) || 1);
  let lastError = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    let page = null;

    try {
      // Make sure both the browser and its context are usable.
      if (!browser || !browser.isConnected() || !context) {
        await initHeadless();
      }

      page = await context.newPage();
      const requests = [];

      // Record every request the page issues, blocked or not.
      page.on("request", (req) => {
        requests.push({
          url: req.url(),
          method: req.method(),
          type: req.resourceType()
        });
      });

      // Abort unwanted resources.
      await page.route("**/*", (route) => {
        const req = route.request();

        // Never abort the navigation of the page itself: the block list is
        // substring based and would otherwise reject target URLs such as
        // ".../downloads" ("ads") or "maps.example.com" ("map").
        const isMainNavigation =
          req.isNavigationRequest() && req.frame() === page.mainFrame();

        if (
          !isMainNavigation &&
          shouldAbortRequest(req.url().toLowerCase(), req.resourceType(), {
            blockScripts,
            loadImages
          })
        ) {
          return route.abort("blockedbyclient");
        }

        // Returned so the promise is not left floating.
        return route.continue();
      });

      await page.goto(url, { waitUntil, timeout });

      // Optional wait for a selector; a miss is not fatal.
      if (waitSelector) {
        try {
          await page.waitForSelector(waitSelector, { timeout: Math.min(timeout, 5000) });
        } catch (e) {
          console.warn(`Selector '${waitSelector}' no encontrado, continuando...`);
        }
      }

      if (scrollToBottom) {
        await turboScroll(page);
      }

      // Extra settling time for client-side rendering.
      if (renderWaitTime > 0) {
        await page.waitForTimeout(renderWaitTime);
      }

      const result = await handler(page);

      // A failure while closing must not throw away a successful result.
      await closePageQuietly(page);

      return { result, requests };
    } catch (error) {
      lastError = error;

      // The handler may throw something that is not an Error.
      const message = error && error.message ? error.message : String(error);
      console.error(
        `[Intento ${attempt}/${maxAttempts}] Error durante el scraping de ${url}:`,
        message
      );

      await closePageQuietly(page);

      // "closed" also covers "Target closed": drop the shared handles so the
      // next attempt starts with a fresh browser.
      if (message.includes("closed")) {
        console.log("Browser cerrado detectado, reiniciando...");
        await closeScraper();
      }

      if (attempt < maxAttempts) {
        const delay = retryDelay * attempt; // linear backoff
        console.log(`Reintentando en ${delay}ms...`);
        await new Promise((r) => setTimeout(r, delay));
      }
    }
  }

  // Every attempt failed.
  console.error(`Todos los intentos fallaron para ${url}`);
  throw lastError || new Error("Scraping failed after all retries");
}
|
|
|
|
/**
 * Closes the shared browsing context and browser and clears both references.
 *
 * The references are cleared up front, so they are reset even when a
 * `close()` call fails. Close errors are logged and never rethrown. Calling
 * this when nothing is open is a no-op.
 *
 * @returns {Promise<void>}
 */
async function closeScraper() {
  const staleContext = context;
  const staleBrowser = browser;

  // Clear first: a failed close must not leave dead handles behind.
  context = null;
  browser = null;

  if (staleContext) {
    try {
      await staleContext.close();
    } catch (error) {
      console.error("Error cerrando context:", error.message);
    }
  }

  if (staleBrowser) {
    try {
      await staleBrowser.close();
    } catch (error) {
      console.error("Error cerrando browser:", error.message);
    }
  }
}
|
|
|
|
module.exports = {
|
|
initHeadless,
|
|
scrape,
|
|
closeScraper
|
|
}; |