support for headless browser & changes on book api

This commit is contained in:
2025-12-03 20:40:53 +01:00
parent 8e20743e8b
commit 920ce19cc2
9 changed files with 201 additions and 22 deletions

133
src/shared/headless.js Normal file
View File

@@ -0,0 +1,133 @@
const { chromium } = require("playwright-chromium");
// Shared Playwright browser instance; stays undefined until one is launched.
let browser;

// Shared browser context from which every scrape opens its page.
let context;

// Lower-case URL fragments; any request whose URL contains one of them is aborted.
const BLOCK_LIST = [
  "google-analytics",
  "doubleclick",
  "facebook",
  "twitter",
  "adsystem",
  "analytics",
  "tracker",
  "pixel",
  "quantserve",
  "newrelic"
];
// Launch currently in progress, shared by overlapping initHeadless() calls;
// null whenever no launch is running.
let pendingLaunch = null;

/**
 * Launches the shared headless Chromium instance and its browser context.
 *
 * Does nothing when a browser is already available. Calls that overlap while
 * a launch is still in progress all wait for that same launch instead of
 * starting additional browser processes.
 *
 * @returns {Promise<void>} Resolves once `browser` and `context` are set.
 * @throws Rejects with the Playwright error if the launch or the context
 *   creation fails; in that case no module state is left behind.
 */
async function initHeadless() {
  if (browser) return;
  if (!pendingLaunch) {
    pendingLaunch = launchSharedBrowser().finally(() => {
      pendingLaunch = null;
    });
  }
  await pendingLaunch;
}

// Starts Chromium plus one context and publishes both only when both succeeded.
async function launchSharedBrowser() {
  const launched = await chromium.launch({
    headless: true,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-gpu",
      "--disable-extensions",
      "--disable-background-networking",
      "--disable-sync",
      "--disable-translate",
      "--mute-audio",
      "--no-first-run",
      "--no-zygote",
      "--single-process"
    ]
  });

  let createdContext;
  try {
    createdContext = await launched.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
    });
  } catch (err) {
    // Do not leave an orphaned browser process behind. A failure while
    // closing is ignored on purpose so the original error is the one reported.
    await launched.close().catch(() => {});
    throw err;
  }

  // Publish `browser` last: callers treat a set `browser` as "fully ready".
  context = createdContext;
  browser = launched;
}
// Infinite scroll
/**
 * Repeatedly scrolls the page to the bottom until its height stops growing.
 *
 * On every tick the document body height is read and the window is scrolled
 * to it. Scrolling ends when the height was unchanged for `stableTicks`
 * consecutive ticks, or when `maxDurationMs` has elapsed, so a page that
 * keeps growing forever cannot block the caller indefinitely.
 *
 * @param {object} page - Playwright page to scroll.
 * @param {object} [options]
 * @param {number} [options.intervalMs=20] - Delay between ticks.
 * @param {number} [options.stableTicks=5] - Unchanged-height ticks required to stop.
 * @param {number} [options.maxDurationMs=30000] - Upper bound on total scrolling time.
 * @returns {Promise<void>} Resolves when scrolling has finished.
 */
async function turboScroll(page, options = {}) {
  const {
    intervalMs = 20,
    stableTicks = 5,
    maxDurationMs = 30000
  } = options;

  // The callback is executed inside the page, so it cannot close over local
  // variables; the settings are handed over as the evaluate() argument.
  await page.evaluate(
    (cfg) =>
      new Promise((resolve) => {
        const startedAt = Date.now();
        let lastHeight = 0;
        let unchangedTicks = 0;

        const timer = setInterval(() => {
          const height = document.body.scrollHeight;
          window.scrollTo(0, height);

          if (height === lastHeight) {
            unchangedTicks += 1;
          } else {
            unchangedTicks = 0;
            lastHeight = height;
          }

          const isStable = unchangedTicks >= cfg.stableTicks;
          const isExpired = Date.now() - startedAt >= cfg.maxDurationMs;
          if (isStable || isExpired) {
            clearInterval(timer);
            resolve();
          }
        }, cfg.intervalMs);
      }),
    { intervalMs, stableTicks, maxDurationMs }
  );
}
// Main scrape entry point
/**
 * Opens `url` in a fresh page of the shared context, prepares the page
 * according to `options`, and returns whatever `handler(page)` returns.
 *
 * The shared browser is initialised on first use. The page is always closed
 * afterwards, including when navigation or the handler fails.
 *
 * @param {string} url - Address to navigate to.
 * @param {function} handler - Called with the Playwright page; its (awaited)
 *   result becomes the result of `scrape`.
 * @param {object} [options]
 * @param {string} [options.waitUntil="domcontentloaded"] - Passed to `page.goto`.
 * @param {?string} [options.waitSelector=null] - Selector to wait for after
 *   navigation; a failed wait is ignored.
 * @param {number} [options.timeout=10000] - Timeout in ms for navigation and
 *   for the selector wait.
 * @param {boolean} [options.scrollToBottom=false] - Scroll to the bottom with
 *   `turboScroll` before calling the handler.
 * @param {number} [options.renderWaitTime=0] - Extra delay in ms before the
 *   handler runs.
 * @param {boolean} [options.loadImages=true] - When false, image requests are
 *   aborted.
 * @returns {Promise<*>} The handler's result.
 * @throws Rejects with any error raised by initialisation, navigation,
 *   scrolling or the handler.
 */
async function scrape(url, handler, options = {}) {
  const {
    waitUntil = "domcontentloaded",
    waitSelector = null,
    timeout = 10000,
    scrollToBottom = false,
    renderWaitTime = 0,
    loadImages = true
  } = options;

  if (!browser) await initHeadless();

  const page = await context.newPage();
  try {
    // Resource blocking.
    // NOTE(review): the pattern matches every request, so the keyword filter
    // presumably also applies to the main document itself - a target URL that
    // contains e.g. "pixel" would be aborted. Confirm whether that is intended.
    await page.route("**/*", (route) => {
      const request = route.request();
      const requestUrl = request.url().toLowerCase();
      const type = request.resourceType();

      if (
        type === "font" ||
        type === "stylesheet" ||
        type === "media" ||
        type === "manifest"
      ) {
        return route.abort();
      }

      if (BLOCK_LIST.some((keyword) => requestUrl.includes(keyword))) {
        return route.abort();
      }

      const looksLikeImage =
        type === "image" || /\.(jpg|jpeg|png|gif|webp|svg)$/.test(requestUrl);
      if (!loadImages && looksLikeImage) {
        return route.abort();
      }

      // Returned so the routing promise is not left floating.
      return route.continue();
    });

    await page.goto(url, { waitUntil, timeout });

    if (waitSelector) {
      try {
        await page.waitForSelector(waitSelector, { timeout });
      } catch {
        // Best effort: the handler still runs when the selector never shows up.
      }
    }

    if (scrollToBottom) {
      await turboScroll(page);
    }

    if (renderWaitTime > 0) {
      await new Promise((resolve) => setTimeout(resolve, renderWaitTime));
    }

    return await handler(page);
  } finally {
    // Release the page on every path so failed scrapes do not pile up tabs.
    await page.close();
  }
}
/**
 * Closes the shared browser context and browser and clears the module state.
 *
 * The state is cleared before anything is closed, and the browser is closed
 * even when closing the context fails, so a failed shutdown never leaves a
 * stale `browser`/`context` behind. Safe to call when nothing was launched.
 *
 * @returns {Promise<void>}
 * @throws Rejects with the error raised while closing, if any.
 */
async function closeScraper() {
  const activeContext = context;
  const activeBrowser = browser;
  context = null;
  browser = null;

  try {
    if (activeContext) await activeContext.close();
  } finally {
    if (activeBrowser) await activeBrowser.close();
  }
}
// Public API of this module: browser start-up, the scrape entry point, and shutdown.
module.exports = {
initHeadless,
scrape,
closeScraper
};