// ─── SPA data-island recovery ──────────────────────────────────────
// Extracts JSON hydration data, application state, and structured data
// from HTML that frameworks inject into the page for client-side hydration.

import { parseHTML } from "linkedom";

/** A single recovered payload together with where it came from. */
export interface DataIsland {
  /** Source identifier: the script ID or global variable name. */
  source: string;
  /** Parsed data payload. */
  data: unknown;
  /** Estimated size of the raw payload in bytes. */
  size: number;
}

/** Aggregate result of scanning a document for data islands. */
export interface DataIslandResult {
  /** Whether any data islands were found. */
  found: boolean;
  /** Discovered islands, sorted by size descending. */
  islands: DataIsland[];
  /** A markdown representation of the most useful island(s). */
  markdown?: string;
}

// Known global variables used by frameworks for server → client state transfer.
// Each entry is a tuple: [name, windowRegex, varRegex] where both regexes are
// pre-compiled constants to avoid dynamic RegExp construction.
//
// Capture group 1 holds the object-literal text. The group is an alternation:
//   1. `\{.*?\}(?=\s*(?:;|<\/script|$))` — the shortest `{…}` whose closing
//      brace is followed by `;`, a closing script tag, or the end of the input.
//      This lets nested objects such as `{"a":{"b":1}};` be captured whole
//      instead of being cut at the first `}`.
//   2. `\{.*?\}` — fallback to the shortest `{…}` (stops at the first `}`),
//      used when no such terminator exists after the assignment.
// The `s` flag lets `.` cross newlines so multi-line payloads match.
//
// Limitation: this is still a heuristic, not a brace-balancing parser. A `}`
// followed by `;` inside a string value ends the capture early, and an
// unterminated flat object followed by unrelated code containing `};` can be
// over-captured. Callers must treat a failed JSON parse of group 1 as "no data".
type GlobalPattern = [name: string, windowRe: RegExp, varRe: RegExp];

const GLOBAL_PATTERNS: GlobalPattern[] = [
  [
    "__DATA__",
    /window\.__DATA__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__DATA__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__INITIAL_STATE__",
    /window\.__INITIAL_STATE__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__INITIAL_STATE__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__APOLLO_STATE__",
    /window\.__APOLLO_STATE__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__APOLLO_STATE__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__PRELOADED_STATE__",
    /window\.__PRELOADED_STATE__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__PRELOADED_STATE__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__NEXT_DATA__",
    /window\.__NEXT_DATA__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__NEXT_DATA__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__NUXT__",
    /window\.__NUXT__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__NUXT__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__GATSBY__",
    /window\.__GATSBY__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__GATSBY__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__SHOPIFY_SDA__",
    /window\.__SHOPIFY_SDA__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__SHOPIFY_SDA__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__remixContext",
    /window\.__remixContext\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__remixContext\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__reactServerManifest",
    /window\.__reactServerManifest\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__reactServerManifest\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__remixRouteModules",
    /window\.__remixRouteModules\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__remixRouteModules\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "__vite_plugin_react_preamble_installed__",
    /window\.__vite_plugin_react_preamble_installed__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__vite_plugin_react_preamble_installed__\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    // Prefix match: `\w+` accepts any suffix after `__sveltekit_`.
    "__sveltekit_",
    /window\.__sveltekit_\w+\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__sveltekit_\w+\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "_env_",
    /window\._env_\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+_env_\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    // NOTE(review): unlike the entries above, this name and the next carry a
    // `window.` prefix. Left untouched because the string may be surfaced to
    // callers; confirm before normalising.
    "window._data",
    /window\._data\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+_data\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
  [
    "window.__store",
    /window\.__store\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
    /(?:var|const|let)\s+__store\s*=\s*(\{.*?\}(?=\s*(?:;|<\/script|$))|\{.*?\});?/s,
  ],
];

// Known script IDs that contain application/json data
const KNOWN_SCRIPT_IDS = [
  "__NEXT_DATA__",
  "__NUXT_DATA__",
  "__GATSBY_DATA__",
  "__REMIX_DATA__",
  "__APOLLO_STATE__",
  "__INITIAL_STATE__",
  "__PRELOADED_STATE__",
  "__DATA__",
  "__SHOPIFY_SDA__",
  "bootstrap-data",
  "initial-state",
  "app-data",
  "server-data",
  "hydration-data",
  "page-props",
];

/**
 * Try to parse a string as JSON, returning null on failure.
 *
 * The failure sentinel is indistinguishable from a successfully parsed JSON
 * `null` literal, so a `null` result means "nothing usable" in both cases.
 *
 * @param text - Candidate JSON text.
 * @returns The parsed value, or `null` when `JSON.parse` throws.
 */
function tryParseJSON(text: string): unknown {
  try {
    return JSON.parse(text);
  } catch {
    return null;
  }
}

/**
 * Extract JSON from inline