/**
 * Core fetch logic: HTTP fetch → mdream HTML-to-markdown conversion.
 */
import { htmlToMarkdown } from "mdream";
import { withMinimalPreset } from "mdream/preset/minimal";
import { scanUrl, scanHeaders } from "./secrets";
import {
  analyzeInjection,
  wrapInjection,
  type InjectionAction,
  type InjectionAnalysis,
} from "./injection";

export interface FetchOptions {
  /** Custom headers */
  headers?: Record<string, string>;
  /**
   * Request timeout in ms (default: 30000). The timeout covers both waiting
   * for the response and reading its body.
   */
  timeout?: number;
  /** Use minimal preset for LLM-optimized output (default: true) */
  minimal?: boolean;
  /** Return raw HTML without markdown conversion */
  raw?: boolean;
  /** Enable prompt injection detection (default: true) */
  detectInjection?: boolean;
  /** Injection detection threshold 0.0–1.0 (default: 0.3) */
  injectionThreshold?: number;
  /** Action on injection: warn, redact, tag, none (default: warn) */
  injectionAction?: InjectionAction;
  /** Enable secret scanning on outgoing URLs (default: true) */
  scanSecrets?: boolean;
  /** AbortSignal for cancellation */
  signal?: AbortSignal;
}

export interface FetchResult {
  /** Markdown (or raw HTML) content */
  content: string;
  /** HTTP status code */
  status: number;
  /** Content type from response */
  contentType: string;
  /** Original URL */
  url: string;
  /** Whether content was converted to markdown */
  converted: boolean;
  /** Byte size of original HTML */
  originalSize: number;
  /** Byte size of converted output */
  outputSize: number;
  /** Injection analysis (if detection enabled) */
  injection?: InjectionAnalysis;
  /** Error message if something went wrong */
  error?: string;
}

/** Response and fully-read body produced by one guarded HTTP request. */
interface FetchedPage {
  response: Response;
  body: string;
}

/** Content types that should be converted to markdown */
function isHtmlLike(contentType: string): boolean {
  const ct = contentType.toLowerCase();
  return ct.includes("text/html") || ct.includes("application/xhtml");
}

/** Extract origin (scheme + host) from URL for mdream link resolution */
function getOrigin(url: string): string {
  try {
    return new URL(url).origin;
  } catch {
    // Unparseable URL: fall back to no origin rather than failing the fetch.
    return "";
  }
}

/** UTF-8 encoded byte length of a string. */
function byteLength(text: string): number {
  return new TextEncoder().encode(text).length;
}

/** Build the empty result returned when no usable response was obtained. */
function failureResult(url: string, error: string): FetchResult {
  return {
    content: "",
    status: 0,
    contentType: "",
    url,
    converted: false,
    originalSize: 0,
    outputSize: 0,
    error,
  };
}

/** Turn whatever `fetch` (or the body read) threw into a result message. */
function describeFetchError(err: unknown): string {
  if (typeof err === "object" && err !== null) {
    const { name, message } = err as { name?: unknown; message?: unknown };
    if (name === "AbortError") {
      return "Request timed out or was cancelled";
    }
    if (typeof message === "string") {
      return `Fetch error: ${message}`;
    }
  }
  // Abort reasons may be arbitrary values (strings, numbers, ...).
  return `Fetch error: ${String(err)}`;
}

/**
 * Perform the HTTP request and read the whole body under a single timeout.
 *
 * The timer is cleared in `finally`, so it never outlives the request —
 * neither on success nor when `fetch` or the body read throws.
 */
async function fetchPage(
  url: string,
  headers: Record<string, string>,
  timeout: number,
  signal: AbortSignal | undefined,
): Promise<FetchedPage> {
  const controller = new AbortController();
  const combinedSignal = signal
    ? AbortSignal.any([signal, controller.signal])
    : controller.signal;
  const timer = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await fetch(url, {
      headers: {
        "User-Agent": "pi-scurl/0.1",
        Accept: "text/html,application/xhtml+xml,text/plain,application/json,*/*",
        // Caller-supplied headers override the defaults above.
        ...headers,
      },
      signal: combinedSignal,
      redirect: "follow",
    });
    // Read the body while the timeout is still armed so a stalled stream
    // cannot hang the caller indefinitely.
    const body = await response.text();
    return { response, body };
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Fetch a URL, convert HTML to markdown via mdream, scan for secrets & injections.
 *
 * Failures are reported through `FetchResult.error` rather than by throwing:
 * a blocked secret, a network error, a timeout/cancellation, or a markdown
 * conversion failure each produce a result with `error` set.
 *
 * @param url - URL to fetch.
 * @param options - See {@link FetchOptions}; every field has a default.
 * @returns The fetched content (markdown when the response is HTML-like and
 *   `raw` is false, otherwise the body as received) plus response metadata.
 */
export async function secureFetch(
  url: string,
  options: FetchOptions = {},
): Promise<FetchResult> {
  const {
    headers = {},
    timeout = 30_000,
    minimal = true,
    raw = false,
    detectInjection = true,
    injectionThreshold = 0.3,
    injectionAction = "warn",
    scanSecrets = true,
    signal,
  } = options;

  // --- Secret scanning on outgoing request ---
  if (scanSecrets) {
    const urlScan = scanUrl(url);
    if (urlScan.found) {
      return failureResult(
        url,
        `BLOCKED: ${urlScan.pattern?.name ?? "secret"} detected in ${urlScan.location}. Remove the secret before fetching.`,
      );
    }

    const headerScan = scanHeaders(headers);
    if (headerScan.found) {
      return failureResult(
        url,
        `BLOCKED: ${headerScan.pattern?.name ?? "secret"} detected in ${headerScan.location}. Remove the secret before fetching.`,
      );
    }
  }

  // --- HTTP fetch (request + body read, both under the timeout) ---
  let page: FetchedPage;
  try {
    page = await fetchPage(url, headers, timeout, signal);
  } catch (err: unknown) {
    return failureResult(url, describeFetchError(err));
  }

  const { response, body } = page;
  const contentType = response.headers.get("content-type") ?? "";
  const originalSize = byteLength(body);

  // --- Non-HTML (or raw requested): return as-is ---
  if (!isHtmlLike(contentType) || raw) {
    let content = body;
    let injection: InjectionAnalysis | undefined;

    // Raw mode returns the body untouched, so injection handling is skipped.
    if (detectInjection && !raw) {
      injection = analyzeInjection(body, injectionThreshold);
      if (injection.flagged) {
        content = wrapInjection(body, injection, injectionAction);
      }
    }

    return {
      content,
      status: response.status,
      contentType,
      url,
      converted: false,
      originalSize,
      outputSize: byteLength(content),
      injection,
    };
  }

  // --- HTML → Markdown via mdream ---
  let markdown: string;
  try {
    const origin = getOrigin(url);
    markdown = minimal
      ? htmlToMarkdown(body, withMinimalPreset({ origin }))
      : htmlToMarkdown(body, { origin });
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    // Conversion failed: hand back the original HTML alongside the error.
    return {
      content: body,
      status: response.status,
      contentType,
      url,
      converted: false,
      originalSize,
      outputSize: originalSize,
      error: `Markdown conversion failed: ${reason}`,
    };
  }

  // --- Injection detection on converted markdown ---
  let injection: InjectionAnalysis | undefined;
  let content = markdown;
  if (detectInjection) {
    injection = analyzeInjection(markdown, injectionThreshold);
    if (injection.flagged) {
      content = wrapInjection(markdown, injection, injectionAction);
    }
  }

  return {
    content,
    status: response.status,
    contentType,
    url,
    converted: true,
    originalSize,
    outputSize: byteLength(content),
    injection,
  };
}