/** @file Diff normalize module. */ import type { ScrapeResult } from "../scrape/pipeline.ts"; export interface SnapshotLink { url: string; text?: string; rel?: string; } export interface NormalizedSnapshotContent { url: string; finalUrl?: string; title?: string; rawText: string; text: string; headings: string[]; links: SnapshotLink[]; metadata: Record; paragraphs: string[]; } export function normalizeScrapeForSnapshot(result: ScrapeResult): NormalizedSnapshotContent { const rawText = normalizeSnapshotText(result.data.markdown ?? result.data.text ?? ""); const text = normalizeVolatileSnapshotText(rawText); const metadata = snapshotMetadata(result); const title = result.data.title ?? metadata.title; return { url: result.url ?? "", finalUrl: result.finalUrl, title, rawText, text, headings: headingsFromText(rawText), links: snapshotLinks(result.data.links), metadata, paragraphs: paragraphsFromText(rawText), }; } export function normalizeSnapshotText(text: string): string { return text .replaceAll(/\r\n?/gu, "\n") .split("\n") .map((line) => line.trim()) .filter(Boolean) .join("\n"); } export function normalizeVolatileSnapshotText(text: string): string { return normalizeSnapshotText(text) .split("\n") .map((line) => line .replaceAll( /\b(last updated|updated|posted)\s+(?:about\s+)?\d+\s+(seconds?|minutes?|hours?|days?)\s+ago\b/giu, "$1 ", ) .replaceAll(/\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z\b/gu, "") .replaceAll( /\b(?:csrf|xsrf|session|sid|nonce|token)=['"]?[A-Za-z0-9._:-]{12,}['"]?/giu, "token=", ) .replaceAll( /\b(?:ad-slot|ad_unit|google_ads_iframe)[:=_-][A-Za-z0-9._-]{6,}\b/giu, "ad-slot=", ) .replaceAll(/https?:\/\/[^\s)]+/giu, stripTrackingParams), ) .join("\n"); } function stripTrackingParams(urlText: string): string { try { const url = new URL(urlText); for (const key of Array.from(url.searchParams.keys())) { if (/^(utm_|fbclid$|gclid$|msclkid$|yclid$|mc_cid$|mc_eid$)/iu.test(key)) url.searchParams.delete(key); if (/^(session|sid|csrf|xsrf|nonce|token)$/iu.test(key)) url.searchParams.set(key, ""); } return url.toString(); } catch { return urlText; } } function headingsFromText(text: string): string[] { return ( text .split("\n") .map((line) => line.match(/^#{1,6}\s+(.+)$/u)?.[1]?.trim()) .filter(Boolean) as string[] ).slice(0, 100); } function paragraphsFromText(text: string): string[] { return text .split("\n") .filter((line) => !/^#{1,6}\s+/u.test(line)) .filter((line) => line.length >= 24) .slice(0, 100); } function snapshotLinks(links: unknown[] | undefined): SnapshotLink[] { const seen = new Set(); const result: SnapshotLink[] = []; for (const link of links ?? []) { if (typeof link !== "object" || link === null) continue; const record = link as Record; if (typeof record.url !== "string" || seen.has(record.url)) continue; seen.add(record.url); result.push({ url: record.url, text: typeof record.text === "string" ? record.text : undefined, rel: typeof record.rel === "string" ? record.rel : undefined, }); } return result.slice(0, 200); } function snapshotMetadata(result: ScrapeResult): Record { const entries: Record = {}; if (result.data.title) entries.title = result.data.title; if (result.data.description) entries.description = result.data.description; flattenMetadata(entries, "metadata", result.data.metadata); return Object.fromEntries( Object.entries(entries).toSorted(([left], [right]) => left.localeCompare(right)), ); } function flattenMetadata(output: Record, prefix: string, value: unknown): void { if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") { output[prefix] = String(value); return; } if (typeof value !== "object" || value === null || Array.isArray(value)) return; for (const [key, nested] of Object.entries(value as Record)) { flattenMetadata(output, `${prefix}.${key}`, nested); } }