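Update headless scraper for compatibility: create and close a browser context per scrape, stop blocking media requests, and mask navigator.webdriver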

This commit is contained in:
2026-01-13 16:36:38 +01:00
parent 9ac6704bba
commit f6488d6e52
3 changed files with 25 additions and 19 deletions

View File

@@ -3,7 +3,6 @@ const { chromium } = require("playwright");
const {spawn} = require("node:child_process"); const {spawn} = require("node:child_process");
let browser; let browser;
let context;
const BLOCK_LIST = [ const BLOCK_LIST = [
"google-analytics", "doubleclick", "facebook", "twitter", "google-analytics", "doubleclick", "facebook", "twitter",
"adsystem", "analytics", "tracker", "pixel", "quantserve", "newrelic" "adsystem", "analytics", "tracker", "pixel", "quantserve", "newrelic"
@@ -48,10 +47,6 @@ async function initHeadless() {
"--no-zygote", "--no-zygote",
] ]
}); });
context = await browser.newContext({
userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
});
} }
async function turboScroll(page) { async function turboScroll(page) {
@@ -87,6 +82,12 @@ async function scrape(url, handler, options = {}) {
loadImages = true loadImages = true
} = options; } = options;
if (!browser) await initHeadless(); if (!browser) await initHeadless();
const context = await browser.newContext({
userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
});
const page = await context.newPage(); const page = await context.newPage();
let collectedRequests = []; let collectedRequests = [];
await page.route("**/*", (route) => { await page.route("**/*", (route) => {
@@ -100,7 +101,7 @@ async function scrape(url, handler, options = {}) {
resourceType: type resourceType: type
}); });
if (type === "font" || type === "media" || type === "manifest") if (type === "font" || type === "manifest")
return route.abort(); return route.abort();
if (BLOCK_LIST.some(k => rUrl.includes(k))) if (BLOCK_LIST.some(k => rUrl.includes(k)))
@@ -111,6 +112,10 @@ async function scrape(url, handler, options = {}) {
)) return route.abort(); )) return route.abort();
route.continue(); route.continue();
}); });
await page.addInitScript(() => {
Object.defineProperty(navigator, "webdriver", { get: () => false });
});
await page.goto(url, { waitUntil, timeout }); await page.goto(url, { waitUntil, timeout });
if (waitSelector) { if (waitSelector) {
try { try {
@@ -125,14 +130,12 @@ async function scrape(url, handler, options = {}) {
} }
const result = await handler(page); const result = await handler(page);
await page.close(); await page.close();
await context.close();
return { result, requests: collectedRequests }; return { result, requests: collectedRequests };
} }
async function closeScraper() { async function closeScraper() {
if (context) await context.close();
if (browser) await browser.close(); if (browser) await browser.close();
context = null;
browser = null; browser = null;
} }
module.exports = { module.exports = {

View File

@@ -1,6 +1,5 @@
const { chromium } = require("playwright-chromium"); const { chromium } = require("playwright-chromium");
let browser; let browser;
let context;
const BLOCK_LIST = [ const BLOCK_LIST = [
"google-analytics", "doubleclick", "facebook", "twitter", "google-analytics", "doubleclick", "facebook", "twitter",
"adsystem", "analytics", "tracker", "pixel", "quantserve", "newrelic" "adsystem", "analytics", "tracker", "pixel", "quantserve", "newrelic"
@@ -23,10 +22,6 @@ async function initHeadless() {
"--no-zygote", "--no-zygote",
] ]
}); });
context = await browser.newContext({
userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
});
} }
async function turboScroll(page) { async function turboScroll(page) {
@@ -62,6 +57,12 @@ async function scrape(url, handler, options = {}) {
loadImages = true loadImages = true
} = options; } = options;
if (!browser) await initHeadless(); if (!browser) await initHeadless();
const context = await browser.newContext({
userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
});
const page = await context.newPage(); const page = await context.newPage();
let collectedRequests = []; let collectedRequests = [];
await page.route("**/*", (route) => { await page.route("**/*", (route) => {
@@ -75,7 +76,7 @@ async function scrape(url, handler, options = {}) {
resourceType: type resourceType: type
}); });
if (type === "font" || type === "media" || type === "manifest") if (type === "font" || type === "manifest")
return route.abort(); return route.abort();
if (BLOCK_LIST.some(k => rUrl.includes(k))) if (BLOCK_LIST.some(k => rUrl.includes(k)))
@@ -86,6 +87,10 @@ async function scrape(url, handler, options = {}) {
)) return route.abort(); )) return route.abort();
route.continue(); route.continue();
}); });
await page.addInitScript(() => {
Object.defineProperty(navigator, "webdriver", { get: () => false });
});
await page.goto(url, { waitUntil, timeout }); await page.goto(url, { waitUntil, timeout });
if (waitSelector) { if (waitSelector) {
try { try {
@@ -100,14 +105,12 @@ async function scrape(url, handler, options = {}) {
} }
const result = await handler(page); const result = await handler(page);
await page.close(); await page.close();
await context.close();
return { result, requests: collectedRequests }; return { result, requests: collectedRequests };
} }
async function closeScraper() { async function closeScraper() {
if (context) await context.close();
if (browser) await browser.close(); if (browser) await browser.close();
context = null;
browser = null; browser = null;
} }
module.exports = { module.exports = {