// ─── Paywall bypass ────────────────────────────────────────────────
// Implements the Bypass Paywalls Clean (BPC) strategy catalog. For each
// domain we know about, picks the cheapest trick that gets past the
// paywall: bot UA, referer, archive.org Wayback, or a Playwright run
// that aborts the paywall vendor's JS.
//
// References:
//   - BPC extension: https://gitflic.ru/project/magnolia1234/bypass-paywalls-chrome-clean
//   - bpc-fetch (Python): https://github.com/Sophomoresty/bpc-fetch
//
// The network-touching helpers (tryBotUAFetch, tryArchiveOrgFetch,
// tryArchivePhFetch) perform I/O. Detection and strategy resolution do
// no I/O and are cheap — safe to call on every page load. Strategy
// resolution memoises its result per hostname in an in-memory cache.

import { fetch as wreqFetch } from "wreq-js";
import { fetchWithPlaywright, buildHeaders } from "./fetch.ts";
import { PAYWALL_SITES, PAYWALL_GROUPS } from "./paywall-sites.ts";

// ─── Types ─────────────────────────────────────────────────────────

/** Identifier of a single bypass technique that can be attempted. */
export type BypassStrategyType =
  | "ua:googlebot" // Spoof Googlebot UA — most common, ~85 sites
  | "ua:bingbot" // Spoof bingbot UA
  | "ua:facebookbot" // Spoof facebookexternalhit UA
  | "ua:custom" // Custom UA string per-site
  | "referer:google" // Google search referer header
  | "block_js" // Playwright with paywall JS blocked — ~425 sites
  | "archive" // Fetch from archive.org / archive.is — ~274 sites
  | "cookies" // Strip tracking cookies — ~138 sites
  | "archive_first" // Try archive before primary (cached versions)
  | "auto"; // Pick the cheapest strategy at runtime

/** Per-site (or per-group) recipe describing how to get past a paywall. */
export interface PaywallStrategy {
  /** Ordered list of strategies to attempt. */
  steps: BypassStrategyType[];
  /** Patterns Playwright should abort (e.g. ["piano.io", "*.tinypass.com"]). */
  blockScripts?: string[];
  /**
   * Whether the post-load DOM override (hide paywall divs, restore
   * overflow) should be applied for this site.
   */
  domOverride?: boolean;
  /** Custom UA string (only for "ua:custom"). */
  useragentCustom?: string;
  /** Whether to allow cookies (false = send Cookie: header empty). */
  allowCookies?: boolean;
  /** Cookies to drop by name (tracking cookies). */
  dropCookies?: string[];
  /** Path → custom strategy override (e.g. subdomain rules). */
  overrides?: Record<string, Partial<PaywallStrategy>>;
}

/** Outcome of scanning a page body for paywall signals. */
export interface PaywallDetection {
  /** Did the content look paywalled? */
  paywalled: boolean;
  /** Confidence 0..1. */
  confidence: number;
  /** Which marker strings matched. */
  matchedMarkers: string[];
  /** Best guess at the paywall vendor. */
  vendor?:
    | "piano"
    | "tinypass"
    | "poool"
    | "zephr"
    | "pelcro"
    | "sophi"
    | "generic";
  /** Whether the content has *some* article text but was truncated. */
  truncated?: boolean;
}

// ─── Bot UAs (from bpc-fetch strategy.py) ──────────────────────────

export const UA_GOOGLEBOT =
  "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
export const UA_BINGBOT =
  "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)";
export const UA_FACEBOOKBOT =
  "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)";
// Google's crawler documentation writes the Chrome version as the
// placeholder "W.X.Y.Z"; a real request carries an actual version, so a
// concrete one is used here (the same major version as the desktop UA
// used by the archive fetchers in this file).
export const UA_INSPECTIONTOOL =
  "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Mobile Safari/537.36 (compatible; Google-InspectionTool/1.0)";

// ─── Paywall marker detection ──────────────────────────────────────
// Subset of bpc-fetch's _is_paywalled() — phrases that appear in the
// page body when a paywall gate is up. All lowercased.
/**
 * Weighted phrase catalogue consumed by {@link detectPaywall}.
 *
 * - `text`   — lower-case substring searched for in the lower-cased page.
 * - `weight` — contribution to the confidence score (summed, capped at 1).
 * - `vendor` — set on markers that identify a specific paywall vendor.
 */
export const PAYWALL_MARKERS: Array<{
  text: string;
  weight: number;
  vendor?: PaywallDetection["vendor"];
}> = [
  // Generic paywall prompts (high confidence)
  { text: "subscribe to continue reading", weight: 0.9 },
  { text: "log in or create an account to continue", weight: 0.9 },
  { text: "sign in to continue", weight: 0.7 },
  { text: "create a free account to continue", weight: 0.9 },
  { text: "this article is for subscribers", weight: 0.95 },
  { text: "to read the full story", weight: 0.6 },
  { text: "register for free to continue reading", weight: 0.9 },
  { text: "already a subscriber? sign in", weight: 0.8 },
  { text: "want to read more?", weight: 0.5 },
  { text: "unlock this article", weight: 0.9 },
  { text: "premium content", weight: 0.4 }, // low weight — appears in legit pages too
  { text: "members only", weight: 0.6 },
  { text: "subscribe to read", weight: 0.7 },
  { text: "to continue reading, please subscribe", weight: 0.9 },
  { text: "you have reached your limit of free articles", weight: 0.85 },
  { text: "enjoying our latest content?", weight: 0.6 },
  { text: "access the most recent journalism", weight: 0.6 },
  { text: "explore the latest features & opinion", weight: 0.5 },
  // B2B / analysis-news sites (e.g. macropolis.gr, eegora.pt, cefriel.it)
  { text: "you need a subscription to access", weight: 0.95 },
  { text: "to access our analysis", weight: 0.85 },
  { text: "please choose one of the packages", weight: 0.7 },
  { text: "subscription required to access", weight: 0.95 },
  { text: "subscription required to read", weight: 0.95 },
  { text: "subscribe to access this", weight: 0.9 },
  { text: "subscribe to unlock this", weight: 0.9 },
  { text: "for subscribers only", weight: 0.85 },
  { text: "this content is for subscribers", weight: 0.9 },
  { text: "this story is for subscribers", weight: 0.9 },
  { text: "subscribers can read", weight: 0.7 },
  { text: "to read the rest of this", weight: 0.7 },
  { text: "please subscribe to read the full", weight: 0.9 },
  { text: "this is a subscriber-only", weight: 0.9 },
  { text: "paid subscribers only", weight: 0.9 },
  { text: "become a member to read", weight: 0.85 },
  { text: "to continue reading, sign in or subscribe", weight: 0.95 },
  { text: "read the full article requires a subscription", weight: 0.95 },
  { text: "to access this content, please subscribe", weight: 0.9 },
  // Vendor-specific markers (very high confidence)
  { text: "piano.io", weight: 0.95, vendor: "piano" },
  { text: "tinypass.com", weight: 0.95, vendor: "tinypass" },
  { text: "poool.fr", weight: 0.95, vendor: "poool" },
  { text: "zephr.com", weight: 0.95, vendor: "zephr" },
  { text: "pelcro.com", weight: 0.95, vendor: "pelcro" },
  { text: "sophi.io", weight: 0.9, vendor: "sophi" },
  { text: "cxense.com", weight: 0.85, vendor: "sophi" },
];

// ─── Paywall detection ─────────────────────────────────────────────
// Every marker is searched for in the whole lower-cased page. Scanning
// only a head window plus a tail window leaves a seam between the two:
// a curtain phrase that starts in one window and ends in the other is
// in neither (raw HTML from a Googlebot fetch of macropolis.gr has the
// curtain at position ~16,800, right around such a seam).

/** Markers at or above this weight are trusted even in very short content. */
const HIGH_WEIGHT_MARKER = 0.85;
/** Below this length, low-weight markers alone are not considered meaningful. */
const MIN_MEANINGFUL_LENGTH = 200;
/** Confidence at or above which the page is reported as paywalled. */
const PAYWALLED_CONFIDENCE = 0.45;

/**
 * Score a page body (raw HTML or extracted text) for paywall signals.
 *
 * @param text Page content; an empty string yields a negative result.
 * @returns Detection result. `matchedMarkers` lists matched marker texts
 *   in catalogue order; `vendor` is the first vendor marker matched.
 */
export function detectPaywall(text: string): PaywallDetection {
  if (!text) {
    return { paywalled: false, confidence: 0, matchedMarkers: [] };
  }

  const lower = text.toLowerCase();
  const truncated = detectTruncation(text);

  const matchedMarkers: string[] = [];
  let totalWeight = 0;
  let vendor: PaywallDetection["vendor"] | undefined;
  let hasHighWeightMarker = false;

  for (const marker of PAYWALL_MARKERS) {
    if (!lower.includes(marker.text)) continue;
    matchedMarkers.push(marker.text);
    totalWeight += marker.weight;
    if (marker.weight >= HIGH_WEIGHT_MARKER) hasHighWeightMarker = true;
    if (marker.vendor && !vendor) vendor = marker.vendor;
  }

  if (totalWeight === 0 && !truncated) {
    return { paywalled: false, confidence: 0, matchedMarkers: [] };
  }

  // A vendor-specific marker (e.g. "piano.io") is a hard signal even in
  // very short content — the script tag alone is enough. High-weight
  // text markers are equally specific. General low-weight markers
  // ("premium content", "members only") need enough surrounding content
  // to be meaningful.
  const hasContent =
    text.length >= MIN_MEANINGFUL_LENGTH || !!vendor || hasHighWeightMarker;
  if (!hasContent) {
    return { paywalled: false, confidence: 0, matchedMarkers };
  }

  // Summed marker weight (capped) is the base confidence. Truncation on
  // its own is weak evidence: it only adds a small boost when no marker
  // matched, which stays below the paywalled threshold.
  let confidence = Math.min(totalWeight, 1.0);
  if (truncated && totalWeight < 0.3) confidence += 0.25;
  if (vendor) confidence = Math.max(confidence, 0.8);

  return {
    paywalled: confidence >= PAYWALLED_CONFIDENCE,
    confidence,
    matchedMarkers,
    vendor,
    truncated,
  };
}

/** Words that, near the end of a raw HTML document, suggest a curtain. */
const HTML_CURTAIN_HINTS = ["paywall", "subscribe", "sign in", "register"];

/**
 * Closing tags that mark the end of a raw HTML document.
 * NOTE(review): assumed to be the document-closing tags — confirm against
 * the upstream bpc-fetch heuristic.
 */
const HTML_DOCUMENT_CLOSERS = ["</body>", "</html>"];

/** Curtain phrases expected right after a cut-off markdown preview. */
const MARKDOWN_CURTAIN_PHRASES = [
  "you need a subscription",
  "subscribe to access",
  "subscribe to read",
  "subscribe to continue",
  "sign in to read",
  "register to read",
  "for subscribers only",
  "please choose one of the packages",
  "subscription required",
  "members only",
  "to read the rest of this",
  "unlock this article",
  "premium content",
];

/** A paragraph (at most ~400 chars) that ends in "..." followed by a newline. */
const ELLIPSIS_PARAGRAPH = /(?:^|\n\n)\s*\S[\s\S]{0,400}\.\.\.\s*\n+/;

/**
 * Heuristically decide whether the content is an article preview that
 * was cut short by a paywall curtain.
 */
function detectTruncation(text: string): boolean {
  const tail = text.slice(-2000).toLowerCase();
  const tailHas = (needle: string): boolean => tail.includes(needle);

  // Strong signal (raw HTML): the document ends with a paywall curtain.
  if (HTML_DOCUMENT_CLOSERS.some(tailHas) && HTML_CURTAIN_HINTS.some(tailHas)) {
    return true;
  }

  // Strong signal (post-defuddle markdown): the article preview ends
  // mid-sentence with "..." and a curtain phrase (sign-in form,
  // subscription cards, "you need a subscription" prompt, etc.) appears
  // in the tail.
  if (
    ELLIPSIS_PARAGRAPH.test(text.slice(-4000)) &&
    MARKDOWN_CURTAIN_PHRASES.some(tailHas)
  ) {
    return true;
  }

  return false;
}

// ─── Strategy lookup ───────────────────────────────────────────────

/** Memoised findStrategy() results, keyed by normalised hostname. */
const GROUP_CACHE = new Map<string, PaywallStrategy>();

/** Lower-cased hostname of `url` without a leading "www.", or null if unparsable. */
function normalizedHostname(url: string): string | null {
  try {
    return new URL(url).hostname.toLowerCase().replace(/^www\./, "");
  } catch {
    return null;
  }
}

/** True when `hostname` is `domain` itself or one of its subdomains. */
function isSameOrSubdomain(hostname: string, domain: string): boolean {
  return hostname === domain || hostname.endsWith(`.${domain}`);
}

/**
 * Own-property lookup. Plain indexing would also resolve inherited keys,
 * so a hostname such as "constructor" would yield a truthy non-strategy.
 */
function ownEntry<T>(table: Record<string, T>, key: string): T | undefined {
  return Object.prototype.hasOwnProperty.call(table, key)
    ? table[key]
    : undefined;
}

/**
 * Resolve a URL to its paywall strategy.
 *
 * Lookup order: exact hostname in PAYWALL_SITES, then the first
 * PAYWALL_GROUPS suffix the hostname equals or is a subdomain of, then
 * GENERIC_STRATEGY. The result is cached per hostname.
 *
 * @returns The resolved strategy, or null only when `url` cannot be parsed.
 */
export function findStrategy(url: string): PaywallStrategy | null {
  const hostname = normalizedHostname(url);
  if (hostname === null) return null;

  const cached = GROUP_CACHE.get(hostname);
  if (cached) return cached;

  let resolved: PaywallStrategy | undefined =
    ownEntry(PAYWALL_SITES as Record<string, PaywallStrategy>, hostname) ||
    undefined;

  if (!resolved) {
    // Group member: many newspaper groups share a single strategy
    // across all their regional domains.
    for (const [suffix, strategy] of Object.entries(
      PAYWALL_GROUPS as Record<string, PaywallStrategy>,
    )) {
      if (isSameOrSubdomain(hostname, suffix)) {
        resolved = strategy;
        break;
      }
    }
  }

  // No strategy known — use generic.
  const strategy = resolved ?? GENERIC_STRATEGY;
  GROUP_CACHE.set(hostname, strategy);
  return strategy;
}

/**
 * Check if a URL has a SPECIFIC paywall strategy (curated or group
 * member). Returns false for sites that would only get the
 * GENERIC_STRATEGY fallback. Used to gate the 403/401 bypass trigger so
 * we don't try to bypass non-paywall 403s (e.g. blocked by CDN,
 * geo-restriction, etc.). Subdomains (m.example.com, mobile.example.com,
 * etc.) match their base domain in PAYWALL_SITES.
 */
export function isKnownPaywallSite(url: string): boolean {
  const hostname = normalizedHostname(url);
  if (hostname === null) return false;

  const sites = PAYWALL_SITES as Record<string, PaywallStrategy>;
  if (ownEntry(sites, hostname)) return true;

  const coversHost = (domain: string): boolean =>
    isSameOrSubdomain(hostname, domain);
  return (
    Object.keys(sites).some(coversHost) ||
    Object.keys(PAYWALL_GROUPS as Record<string, PaywallStrategy>).some(
      coversHost,
    )
  );
}
/**
 * Known paywall vendor script hosts. Aborted in Playwright when a
 * site uses `block_js` strategy. Declared before GENERIC_STRATEGY so
 * the constant can be referenced at module init time.
 */
export const KNOWN_PAYWALL_VENDORS: string[] = [
  "piano.io",
  "*.piano.io",
  "tinypass.com",
  "*.tinypass.com",
  "poool.fr",
  "*.poool.fr",
  "zephr.com",
  "*.zephr.com",
  "pelcro.com",
  "*.pelcro.com",
  "sophi.io",
  "*.sophi.io",
  "cxense.com",
  "*.cxense.com",
  "temptation",
  "px.ads.linkedin.com",
  "shop.nfl.com", // paywall-ish
  "paywall.quantcast.com",
  "*.ampproject.org/v0/amp-access-*.js",
  "*.ampproject.org/v0/amp-subscriptions-*.js",
  "*.cloudflare.com/cdn-cgi/challenge-platform/", // CF paywall overlay
];

/** Default fallback for unknown domains. */
export const GENERIC_STRATEGY: PaywallStrategy = {
  steps: ["archive", "ua:googlebot", "block_js"],
  blockScripts: KNOWN_PAYWALL_VENDORS,
  domOverride: true,
};

// ─── DOM override script (run after page load) ────────────────────

/**
 * Injected via page.evaluate() after the page renders. Hides paywall
 * divs, restores body scrolling, and unlocks truncated article
 * containers.
 */
export const DOM_OVERRIDE_SCRIPT = `
(function() {
  // Hide paywall overlay/gate elements
  var hideSelectors = [
    '[class*="paywall"]', '[id*="paywall"]', '[class*="Paywall"]',
    '[class*="piano"]', '[id*="piano"]', '[class*="Piano"]',
    '[class*="gate-"]', '[class*="-gate"]',
    '[class*="regwall"]', '[class*="reg-wall"]', '[class*="registration-wall"]',
    '[class*="subscription-wall"]', '[class*="subwall"]', '[class*="hardwall"]',
    '[class*="truncated"]', '[class*="locked"]',
    '[data-paywall]', '[data-testid*="paywall"]',
    'div[class*="overlay"][style*="fixed"]',
    '.tp-modal', '.tp-backdrop', '.piano-modal', '.poool-widget',
    '.zephr-paywall', '.qc-cmp2-main',
  ];
  hideSelectors.forEach(function(sel) {
    document.querySelectorAll(sel).forEach(function(el) {
      el.style.setProperty('display', 'none', 'important');
      el.style.setProperty('visibility', 'hidden', 'important');
      el.style.setProperty('opacity', '0', 'important');
      el.style.setProperty('pointer-events', 'none', 'important');
    });
  });
  // Restore scrolling
  document.documentElement.style.overflow = 'auto';
  document.body.style.overflow = 'auto';
  // Unlock article containers (some sites cap height/overflow on the
  // <article> element to hide the rest of the content)
  var unlockSelectors = ['article', '[data-article]', '.article-body', '[itemprop="articleBody"]', 'main', '[role="main"]', '.story-body', '.post-content', '.entry-content'];
  unlockSelectors.forEach(function(sel) {
    document.querySelectorAll(sel).forEach(function(el) {
      el.style.setProperty('overflow', 'visible', 'important');
      el.style.setProperty('max-height', 'none', 'important');
      el.style.setProperty('height', 'auto', 'important');
      el.style.setProperty('-webkit-mask-image', 'none', 'important');
      el.style.setProperty('mask-image', 'none', 'important');
    });
  });
  // Remove any blur/gradient masks
  var blurSelectors = ['[class*="fade"]', '[class*="gradient"]', '[class*="mask"]'];
  blurSelectors.forEach(function(sel) {
    document.querySelectorAll(sel).forEach(function(el) {
      el.style.setProperty('-webkit-mask-image', 'none', 'important');
      el.style.setProperty('mask-image', 'none', 'important');
      el.style.setProperty('background', 'transparent', 'important');
    });
  });
})();
`;

// ─── Bot UA fetch ──────────────────────────────────────────────────

/** Result of one bypass attempt. */
export interface BypassFetchResult {
  ok: boolean;
  status: number;
  text: string;
  finalUrl: string;
  strategy: BypassStrategyType;
  paywall?: PaywallDetection;
  error?: string;
}

/**
 * Fetch a URL with a search-engine bot UA. Many news sites render
 * full content to Googlebot/Bingbot/Facebook crawler to ensure
 * articles get indexed.
 *
 * @param url Page to fetch.
 * @param strategy One of the "ua:*" strategies. For "ua:custom" the UA
 *   is taken from `opts.useragentCustom`.
 * @param opts Transport options. `wreqSession`, when given, must expose
 *   a `fetch(url, init)` method and is used instead of the module-level
 *   wreq fetch.
 * @returns null when no UA is available for `strategy` or the response
 *   is not OK; a result with `ok: false` and `error` set when the
 *   request throws; otherwise the fetched page, with `ok` reflecting
 *   whether the body still looks paywalled.
 */
export async function tryBotUAFetch(
  url: string,
  strategy: BypassStrategyType,
  opts: {
    browser?: string;
    os?: string;
    proxy?: string;
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- session type is owned by wreq-js
    wreqSession?: any;
    /** UA to send when `strategy` is "ua:custom" (ignored otherwise). */
    useragentCustom?: string;
  } = {},
): Promise<BypassFetchResult | null> {
  const ua =
    strategy === "ua:custom"
      ? opts.useragentCustom || null
      : botUAFor(strategy);
  if (!ua) return null;

  try {
    // Googlebot and Bingbot don't send Sec-Ch-Ua
    const headers: Record<string, string> = {
      "User-Agent": ua,
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.9",
    };
    const fetchFn = opts.wreqSession
      ? // eslint-disable-next-line @typescript-eslint/no-explicit-any
        (u: string, init: any) => opts.wreqSession.fetch(u, init)
      : wreqFetch;
    const res = await fetchFn(url, {
      redirect: "follow",
      headers,
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      browser: (opts.browser ?? "chrome_145") as any,
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      os: (opts.os ?? "windows") as any,
      ...(opts.proxy ? { proxy: opts.proxy } : {}),
    });
    if (!res?.ok) return null;

    const text = await res.text();
    const paywall = detectPaywall(text);
    return {
      // Slightly more lenient than `paywalled` itself: borderline
      // confidence (below 0.5) still counts as a usable fetch.
      ok: !paywall.paywalled || paywall.confidence < 0.5,
      status: res.status,
      text,
      finalUrl: res.url ?? url,
      strategy,
      paywall,
    };
  } catch (err) {
    return {
      ok: false,
      status: 0,
      text: "",
      finalUrl: url,
      strategy,
      error: err instanceof Error ? err.message : String(err),
    };
  }
}

/**
 * Map a "ua:*" strategy to its built-in bot User-Agent string.
 *
 * @returns null for "ua:custom" (the caller must supply the UA) and for
 *   every non-UA strategy.
 */
export function botUAFor(strategy: BypassStrategyType): string | null {
  switch (strategy) {
    case "ua:googlebot":
      return UA_GOOGLEBOT;
    case "ua:bingbot":
      return UA_BINGBOT;
    case "ua:facebookbot":
      return UA_FACEBOOKBOT;
    case "ua:custom":
      return null; // caller must supply
    default:
      return null;
  }
}

// ─── Archive fetch plumbing ────────────────────────────────────────

/** Desktop browser headers sent to both archive services. */
const ARCHIVE_REQUEST_HEADERS: Record<string, string> = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,*/*;q=0.8",
};

/**
 * GET `archiveUrl` and read its body, aborting after `timeoutMs`.
 *
 * The abort timer stays armed until the body has been read, so a
 * response that stalls mid-body is cut off too, and it is cleared on
 * every exit path. Returns null on a non-OK status, abort, or any
 * network error.
 */
async function fetchArchivePage(
  archiveUrl: string,
  timeoutMs: number,
): Promise<{ text: string; finalUrl: string } | null> {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), timeoutMs);
  try {
    const res = await fetch(archiveUrl, {
      redirect: "follow",
      signal: ctrl.signal,
      headers: ARCHIVE_REQUEST_HEADERS,
    });
    if (!res.ok) return null;
    const text = await res.text();
    // `url` is an empty string (not undefined) when unknown, hence `||`.
    return { text, finalUrl: res.url || archiveUrl };
  } catch {
    // Best effort: an unreachable or slow archive simply means "no copy".
    return null;
  } finally {
    clearTimeout(timer);
  }
}

// ─── Archive.org Wayback fetch ─────────────────────────────────────

const WAYBACK_TIMEOUT_MS = 15000;

/**
 * Fetch a URL from the Wayback Machine via `/web/2/<url>`, relying on
 * Wayback to redirect the short timestamp to an existing capture.
 * NOTE(review): this is Wayback's normal replay; the raw, un-rewritten
 * original would need the `id_` modifier — confirm which is intended.
 *
 * @param url Original page URL.
 * @param _opts `timeoutMs` overrides the default 15 s budget (headers
 *   and body). `proxy` is accepted for signature parity but is not used
 *   by this fetch.
 * @returns null if the URL isn't archived or the request fails;
 *   otherwise the archived page with `ok: true`. Paywall detection on
 *   the archived copy is reported in `paywall`, not folded into `ok`.
 */
export async function tryArchiveOrgFetch(
  url: string,
  _opts: { proxy?: string; timeoutMs?: number } = {},
): Promise<BypassFetchResult | null> {
  const archiveUrl = `https://web.archive.org/web/2/${url}`;
  const page = await fetchArchivePage(
    archiveUrl,
    _opts.timeoutMs ?? WAYBACK_TIMEOUT_MS,
  );
  if (!page) return null;

  // Wayback sometimes returns its own 404 page
  if (page.text.includes("Wayback Machine has not archived")) return null;

  return {
    ok: true,
    status: 200,
    text: page.text,
    finalUrl: page.finalUrl,
    strategy: "archive",
    paywall: detectPaywall(page.text),
  };
}

// ─── Archive.ph (archive.is) fetch ─────────────────────────────────

const ARCHIVE_PH_TIMEOUT_MS = 15000;

/**
 * Fetch a URL from archive.ph (archive.is). Useful as a fallback when
 * Wayback doesn't have a copy.
 *
 * @param url Original page URL.
 * @param opts `timeoutMs` overrides the default 15 s budget.
 * @returns null if the URL isn't archived or anything fails; otherwise
 *   the archived page with archive.ph's own chrome stripped.
 */
export async function tryArchivePhFetch(
  url: string,
  opts: { timeoutMs?: number } = {},
): Promise<BypassFetchResult | null> {
  // archive.ph answers /newest/ with a redirect to the timestamped URL,
  // which the fetch follows automatically.
  const archiveUrl = `https://archive.ph/newest/${url}`;
  const page = await fetchArchivePage(
    archiveUrl,
    opts.timeoutMs ?? ARCHIVE_PH_TIMEOUT_MS,
  );
  if (!page) return null;

  try {
    // archive.ph wraps the content — strip its chrome if present
    const cleaned = stripArchivePhChrome(page.text);
    return {
      ok: true,
      status: 200,
      text: cleaned,
      finalUrl: page.finalUrl,
      strategy: "archive",
      paywall: detectPaywall(cleaned),
    };
  } catch {
    return null;
  }
}