support for headless browser & changes on book api

2025-12-03 20:40:53 +01:00
parent 8e20743e8b
commit 920ce19cc2
9 changed files with 201 additions and 22 deletions
--- a/src/shared/headless.js
+++ b/src/shared/headless.js
@@ -0,0 +1,133 @@
+const { chromium } = require("playwright-chromium");
+
+let browser;
+let context;
+
+const BLOCK_LIST = [
+    "google-analytics", "doubleclick", "facebook", "twitter",
+    "adsystem", "analytics", "tracker", "pixel", "quantserve", "newrelic"
+];
+
+async function initHeadless() {
+    if (browser) return;
+
+    browser = await chromium.launch({
+        headless: true,
+        args: [
+            "--no-sandbox",
+            "--disable-setuid-sandbox",
+            "--disable-dev-shm-usage",
+            "--disable-gpu",
+            "--disable-extensions",
+            "--disable-background-networking",
+            "--disable-sync",
+            "--disable-translate",
+            "--mute-audio",
+            "--no-first-run",
+            "--no-zygote",
+            "--single-process"
+        ]
+    });
+
+    context = await browser.newContext({
+        userAgent:
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
+    });
+}
+
+// ✅ Scroll infinito
+async function turboScroll(page) {
+    await page.evaluate(() => {
+        return new Promise((resolve) => {
+            let last = 0;
+            let same = 0;
+            const timer = setInterval(() => {
+                const h = document.body.scrollHeight;
+                window.scrollTo(0, h);
+                if (h === last) {
+                    same++;
+                    if (same >= 5) {
+                        clearInterval(timer);
+                        resolve();
+                    }
+                } else {
+                    same = 0;
+                    last = h;
+                }
+            }, 20);
+        });
+    });
+}
+
+// ✅ Scrape principal
+async function scrape(url, handler, options = {}) {
+    const {
+        waitUntil = "domcontentloaded",
+        waitSelector = null,
+        timeout = 10000,
+        scrollToBottom = false,
+        renderWaitTime = 0,
+        loadImages = true
+    } = options;
+
+    if (!browser) await init();
+
+    const page = await context.newPage();
+
+    // 🔒 Bloqueo de recursos
+    await page.route("**/*", (route) => {
+        const req = route.request();
+        const url = req.url().toLowerCase();
+        const type = req.resourceType();
+
+        if (
+            type === "font" ||
+            type === "stylesheet" ||
+            type === "media" ||
+            type === "manifest"
+        ) return route.abort();
+
+        if (BLOCK_LIST.some(k => url.includes(k))) return route.abort();
+
+        if (!loadImages && (
+            type === "image" || url.match(/\.(jpg|jpeg|png|gif|webp|svg)$/)
+        )) return route.abort();
+
+        route.continue();
+    });
+
+    await page.goto(url, { waitUntil, timeout });
+
+    if (waitSelector) {
+        try {
+            await page.waitForSelector(waitSelector, { timeout });
+        } catch {}
+    }
+
+    if (scrollToBottom) {
+        await turboScroll(page);
+    }
+
+    if (renderWaitTime > 0) {
+        await new Promise(r => setTimeout(r, renderWaitTime));
+    }
+
+    const result = await handler(page);
+
+    await page.close();
+
+    return result;
+}
+
+async function closeScraper() {
+    if (context) await context.close();
+    if (browser) await browser.close();
+    context = null;
+    browser = null;
+}
+
+// Public API of this module: browser start-up, the scrape entry point and
+// shutdown. turboScroll stays internal.
+module.exports = {
+    initHeadless,
+    scrape,
+    closeScraper
+};