my list page fixed for local lists
This commit is contained in:
@@ -1,301 +1,109 @@
|
||||
const { chromium } = require("playwright-chromium");
|
||||
|
||||
// Shared Chromium browser handle: assigned by initHeadless(), reset to null by closeScraper().
let browser;

// Shared browser context: created by initHeadless(); scrape() opens its pages from it.
let context;
|
||||
|
||||
/**
 * URL substrings used for request blocking: scrape() lower-cases each request
 * URL and aborts the request when the URL contains any of these entries.
 * Matching is by plain substring, so short entries such as "ads" or "map"
 * match broadly.
 */
const BLOCK_LIST = [
  "google-analytics", "doubleclick", "facebook", "twitter",
  "adsystem", "analytics", "tracker", "pixel", "quantserve", "newrelic",
  "hotjar", "yandex", "ads", "widgets", "gravatar", "fonts.googleapis",
  "map", "cdn.ampproject.org", "googletagmanager"
];
|
||||
|
||||
/**
 * Ensures the shared headless Chromium `browser` and its `context` exist.
 *
 * Returns immediately when a connected browser and a context are already
 * available. Otherwise it launches Chromium (only if no connected browser
 * exists) and opens a context with a fixed desktop Chrome user agent.
 *
 * @returns {Promise<void>}
 * @throws Rethrows the launch/context error after logging it; in that case
 *   the shared `browser` and `context` are reset to null.
 */
async function initHeadless() {
  const isBrowserAlive = Boolean(browser && browser.isConnected());
  if (isBrowserAlive && context) return;

  try {
    if (!isBrowserAlive) {
      browser = await chromium.launch({
        headless: true,
        args: [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--disable-dev-shm-usage",
          "--disable-gpu",
          "--disable-extensions",
          "--disable-background-networking",
          "--disable-sync",
          "--disable-translate",
          "--mute-audio",
          "--no-first-run",
          "--no-zygote",
          "--disable-software-rasterizer",
          "--disable-client-side-phishing-detection",
          "--no-default-browser-check",
          "--no-experiments"
        ]
      });
    }

    context = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
    });
  } catch (error) {
    console.error("Error initializing browser:", error);

    // Do not leave a launched browser behind when context creation failed:
    // close it and reset both references so the next call starts clean.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.error("Error closing browser:", closeError.message);
      }
    }
    browser = null;
    context = null;

    throw error;
  }
}
|
||||
|
||||
/**
 * Runs `fn` inside the page through `page.evaluate`, guarding each attempt
 * with a 30 s timeout and retrying transient execution-context failures.
 *
 * Up to three attempts are made. Errors whose message mentions a closed
 * page/target/session (or a page that reports itself closed) are rethrown
 * immediately; "execution context was destroyed", "cannot find context" and
 * timeout errors are retried after a growing delay (500 ms, 1000 ms); every
 * other error is rethrown as-is.
 *
 * @param {object} page - Playwright page (must provide `isClosed` and `evaluate`).
 * @param {Function|string} fn - Function or expression evaluated in the page.
 * @param {...*} args - Arguments forwarded to `page.evaluate`.
 * @returns {Promise<*>} Whatever `page.evaluate` resolves to.
 * @throws The last evaluation error when no attempt succeeds.
 */
async function safeEvaluate(page, fn, ...args) {
  const maxAttempts = 3;
  const attemptTimeoutMs = 30000;

  for (let i = 0; i < maxAttempts; i++) {
    let timeoutHandle;

    try {
      // Check the page state before trying to evaluate.
      if (page.isClosed()) {
        throw new Error("Page is closed before evaluation");
      }

      const timeoutGuard = new Promise((_, reject) => {
        timeoutHandle = setTimeout(
          () => reject(new Error("Evaluate timeout")),
          attemptTimeoutMs
        );
      });

      return await Promise.race([page.evaluate(fn, ...args), timeoutGuard]);
    } catch (error) {
      const errorMsg = ((error && error.message) || "").toLowerCase();
      const isLastAttempt = i === maxAttempts - 1;

      // Closed page/target/session errors are unrecoverable: retrying is pointless.
      // ("target closed" and "session closed" are both covered by "closed".)
      if (page.isClosed() || errorMsg.includes("closed")) {
        console.error("Page context lost or closed, throwing fatal error.");
        throw error;
      }

      // Retry only transient execution errors.
      const isTransient =
        errorMsg.includes("execution context was destroyed") ||
        errorMsg.includes("cannot find context") ||
        errorMsg.includes("timeout");

      if (!isLastAttempt && isTransient) {
        console.warn(`Evaluate attempt ${i + 1} failed, retrying...`, error.message);
        await new Promise((r) => setTimeout(r, 500 * (i + 1)));
        continue;
      }

      throw error;
    } finally {
      // Always cancel the guard timer so a settled attempt does not leave a
      // pending 30 s timer keeping the process alive.
      clearTimeout(timeoutHandle);
    }
  }
}
|
||||
|
||||
/**
 * Scrolls the page to the bottom repeatedly until its height stops growing.
 *
 * Inside the page, every 20 ms the window is scrolled to
 * `document.body.scrollHeight`; scrolling stops once the height has been
 * unchanged for 5 consecutive ticks, when the in-page code throws, or after
 * a 10 s safety limit. Errors are logged and swallowed, so scrolling is
 * best-effort and never rejects.
 *
 * @param {object} page - Playwright page to scroll.
 * @returns {Promise<void>}
 */
async function turboScroll(page) {
  try {
    if (page.isClosed()) {
      console.warn("Page closed, skipping scroll");
      return;
    }

    // The callback is executed in the page, so it must be self-contained.
    await safeEvaluate(page, () => {
      return new Promise((resolve) => {
        let lastHeight = 0;
        let sameCount = 0;
        let safetyTimer = null;

        // Stops both timers before resolving so nothing keeps running afterwards.
        const finish = () => {
          clearInterval(scrollInterval);
          clearTimeout(safetyTimer);
          resolve();
        };

        const scrollInterval = setInterval(() => {
          try {
            const currentHeight = document.body.scrollHeight;
            window.scrollTo(0, currentHeight);

            if (currentHeight === lastHeight) {
              sameCount++;
              if (sameCount >= 5) {
                finish();
              }
            } else {
              sameCount = 0;
              lastHeight = currentHeight;
            }
          } catch (err) {
            finish();
          }
        }, 20);

        // Hard upper bound for pages that keep growing.
        safetyTimer = setTimeout(finish, 10000);
      });
    });
  } catch (error) {
    console.error("Error in turboScroll:", error.message);
  }
}
|
||||
|
||||
/**
 * Opens `url` in a fresh page, runs `handler` on it and returns its result.
 *
 * Each attempt: makes sure the shared browser is connected, opens a page,
 * records every request, aborts unwanted requests (fonts, stylesheets,
 * media, manifests, "other" resources, BLOCK_LIST matches and — when
 * `loadImages` is false — images), navigates, optionally waits for a
 * selector, scrolls and waits, then calls `handler(page, safeEvaluate)`.
 * On failure the page is closed, the browser is torn down if the error
 * mentions a closed page/target/session, and the attempt is retried after
 * `retryDelay * attempt` ms.
 *
 * @param {string} url - Address to load.
 * @param {Function} handler - Called as `handler(page, safeEvaluate)`; its
 *   resolved value becomes `result`.
 * @param {object} [options]
 * @param {string} [options.waitUntil="domcontentloaded"] - Passed to `page.goto`.
 * @param {?string} [options.waitSelector=null] - Selector to wait for (at most 5 s; a miss is only logged).
 * @param {number} [options.timeout=15000] - Navigation timeout in ms.
 * @param {boolean} [options.scrollToBottom=false] - Run turboScroll before the handler.
 * @param {number} [options.renderWaitTime=0] - Extra wait in ms before the handler.
 * @param {boolean} [options.loadImages=true] - When false, image requests are aborted.
 * @param {number} [options.retries=3] - Maximum number of attempts.
 * @param {number} [options.retryDelay=1000] - Base delay between attempts in ms.
 * @returns {Promise<{result: *, requests: Array<{url: string, method: string, type: string}>}>}
 * @throws The last attempt's error when every attempt fails.
 */
async function scrape(url, handler, options = {}) {
  const {
    waitUntil = "domcontentloaded",
    waitSelector = null,
    timeout = 15000,
    scrollToBottom = false,
    renderWaitTime = 0,
    loadImages = true,
    retries = 3,
    retryDelay = 1000
  } = options;

  const blockedTypes = ["font", "stylesheet", "media", "manifest", "other"];
  // Also matches image URLs that carry a query string or fragment.
  const imageUrlPattern = /\.(jpg|jpeg|png|gif|webp|svg)(?:[?#]|$)/;

  let lastError = null;

  for (let attempt = 1; attempt <= retries; attempt++) {
    let page = null;
    // Set right before a deliberate close so the "close" listener stays quiet.
    let closingOnPurpose = false;

    try {
      if (!browser || !browser.isConnected()) {
        await initHeadless();
      }

      page = await context.newPage();
      const requests = [];

      page.on("close", () => {
        if (!closingOnPurpose) {
          console.warn("Page closed unexpectedly");
        }
      });

      page.on("request", (req) => {
        requests.push({
          url: req.url(),
          method: req.method(),
          type: req.resourceType()
        });
      });

      await page.route("**/*", (route) => {
        const req = route.request();
        const resUrl = req.url().toLowerCase();
        const type = req.resourceType();

        const isBlockedType = blockedTypes.includes(type);
        const isBlockedHost = BLOCK_LIST.some((k) => resUrl.includes(k));
        const isBlockedImage =
          !loadImages && (type === "image" || imageUrlPattern.test(resUrl));

        if (isBlockedType || isBlockedHost || isBlockedImage) {
          return route.abort("blockedbyclient");
        }

        return route.continue();
      });

      await page.goto(url, { waitUntil, timeout });

      if (!page.isClosed()) {
        await page.waitForTimeout(500);
      }

      if (waitSelector) {
        try {
          await page.waitForSelector(waitSelector, {
            timeout: Math.min(timeout, 5000)
          });
        } catch (e) {
          console.warn(`Selector '${waitSelector}' not found, continuing...`);
        }
      }

      if (scrollToBottom) {
        await turboScroll(page);
      }

      if (renderWaitTime > 0) {
        await page.waitForTimeout(renderWaitTime);
      }

      if (page.isClosed()) {
        throw new Error("Page closed before handler execution");
      }

      const result = await handler(page, safeEvaluate);

      closingOnPurpose = true;
      await page.close();

      return { result, requests };
    } catch (error) {
      lastError = error;
      const message = String((error && error.message) || error);
      console.error(`[Attempt ${attempt}/${retries}] Error scraping ${url}:`, message);

      if (page && !page.isClosed()) {
        try {
          closingOnPurpose = true;
          await page.close();
        } catch (closeError) {
          console.error("Error closing page:", closeError.message);
        }
      }

      // "Target closed" / "Session closed" are covered by the "closed" check.
      if (message.includes("closed")) {
        console.log("Browser closure detected, reinitializing...");
        await closeScraper();
      }

      if (attempt < retries) {
        const delay = retryDelay * attempt;
        console.log(`Retrying in ${delay}ms...`);
        await new Promise((r) => setTimeout(r, delay));
      }
    }
  }

  console.error(`All attempts failed for ${url}`);
  throw lastError || new Error("Scraping failed after all retries");
}
|
||||
|
||||
/**
 * Closes the shared browser context and browser, then clears both references.
 *
 * Each close is attempted independently; a failure is logged and does not
 * prevent the other close. Both references are reset to null even when a
 * close fails, so a later initHeadless() starts from a clean state. This
 * function never rejects.
 *
 * @returns {Promise<void>}
 */
async function closeScraper() {
  try {
    if (context) {
      await context.close();
    }
  } catch (error) {
    console.error("Error closing context:", error.message);
  } finally {
    // Drop the reference regardless of the outcome: a context whose close
    // failed must not be closed again or reused.
    context = null;
  }

  try {
    if (browser) {
      await browser.close();
    }
  } catch (error) {
    console.error("Error closing browser:", error.message);
  } finally {
    browser = null;
  }
}
|
||||
|
||||
module.exports = {
|
||||
initHeadless,
|
||||
scrape,
|
||||
closeScraper,
|
||||
safeEvaluate
|
||||
closeScraper
|
||||
};
|
||||
Reference in New Issue
Block a user