/** @file Content-addressed streaming download storage with TTL cleanup. */ import { createHash } from "node:crypto"; import { createReadStream, createWriteStream } from "node:fs"; import { mkdir, readdir, stat, unlink } from "node:fs/promises"; import path from "node:path"; import { Readable } from "node:stream"; import { pipeline } from "node:stream/promises"; import { resolvePiStoragePaths } from "../storage/paths.ts"; import { BodySizeLimitError } from "./download.ts"; const HEX_PREFIX_LENGTH = 2; // 7 days const MAX_FILENAME_BYTES = 200; const DEFAULT_TTL_MS = 7 * 24 * 60 * 60 * 1000; export interface SaveToFileOptions { dir?: string; filename?: string; maxBytes?: number; } export interface DownloadResult { filePath: string; bytes: number; contentType?: string; sha256?: string; } /** Returns the base directory for content-addressed downloads. Defaults to ~/.pi/scraper/downloads/. */ export function getDownloadsBaseDir(override?: string): string { if (override) return path.resolve(override); return resolvePiStoragePaths().downloads; } /** * Derive a safe filename from response properties. Priority: explicit name > Content-Disposition > * URL basename > fallback. */ export function deriveFilename( url: string, contentType?: string, disposition?: string, override?: string, ): string { if (override) return sanitizeFilename(override); if (disposition) { const match = disposition.match(/(?:^|;)\s*filename\*?=(?:UTF-8'')?["']?([^"';]+)["']?/iu); if (match?.[1]) { // filename*= uses URL-encoding (RFC 5987); filename= is plain const isRfc5987 = /filename\*=/iu.test(disposition); const raw = match[1].trim(); const decoded = isRfc5987 ? decodeURIComponent(raw) : raw; return sanitizeFilename(decoded); } } try { const urlPath = new URL(url).pathname; const basename = urlPath.match(/\/([^/]+)$/u)?.[1]; if (basename && basename.length > 0 && basename !== "/") { const decoded = decodeURIComponent(basename); const cleaned = sanitizeFilename(decoded); // Append extension from content-type if basename lacks one if (!cleaned.includes(".")) { return cleaned + extFromContentType(contentType); } return cleaned; } } catch { // invalid URL — fall through } const ext = extFromContentType(contentType); return `download${ext}`; } function extFromContentType(contentType?: string): string { if (!contentType) return ".bin"; const map: Record = { "application/pdf": ".pdf", "application/zip": ".zip", "application/gzip": ".gz", "application/x-tar": ".tar", "application/x-gtar": ".tar.gz", "image/jpeg": ".jpg", "image/png": ".png", "image/gif": ".gif", "image/webp": ".webp", "image/svg+xml": ".svg", "video/mp4": ".mp4", "audio/mpeg": ".mp3", "application/json": ".json", "text/csv": ".csv", "text/markdown": ".md", "text/plain": ".txt", "text/html": ".html", }; const base = contentType.toLowerCase().split(";", 1)[0]?.trim() ?? ""; return map[base] ?? ".bin"; } /** * Sanitize a filename to prevent path traversal and other attacks. Strips control characters, * parent directory references, and caps length. */ export function sanitizeFilename(name: string): string { const basename = name.split(/[/\\]/u).pop() ?? "download"; const cleaned = basename // oxlint-disable-next-line eslint/no-control-regex -- intentional: strip control chars for path safety .replaceAll(/[\u0000-\u001F\u007F]/gu, "") .replace(/^\.+/u, "") .replaceAll(/~+/gu, "") .trim(); if (cleaned.length === 0) return "download"; if (Buffer.byteLength(cleaned, "utf8") > MAX_FILENAME_BYTES) { return cleaned.slice(0, MAX_FILENAME_BYTES); } return cleaned; } /** * Stream a response body to a content-addressed file in the downloads directory. Path: * // Deduplicates: reuses existing file with same sha256 prefix. */ export async function saveBodyToDownloads( body: AsyncIterable, contentType: string | undefined, url: string, fetchHeaders: Record | undefined, options: SaveToFileOptions = {}, ): Promise { const baseDir = getDownloadsBaseDir(options.dir); // 50MB default const maxBytes = options.maxBytes ?? 50 * 1024 * 1024; const disposition = fetchHeaders?.["content-disposition"]; const filename = deriveFilename(url, contentType, disposition, options.filename); let downloadedBytes = 0; const hasher = createHash("sha256"); const chunks: Buffer[] = []; for await (const chunk of body) { downloadedBytes += chunk.byteLength; if (downloadedBytes > maxBytes) { throw new BodySizeLimitError(maxBytes, downloadedBytes); } const buf = Buffer.from(chunk); hasher.update(buf); chunks.push(buf); } const hexDigest = hasher.digest("hex"); const prefix = hexDigest.slice(0, HEX_PREFIX_LENGTH); const prefixDir = path.join(baseDir, prefix); await mkdir(prefixDir, { recursive: true, mode: 0o700 }); const filePath = path.join(prefixDir, filename); // If the file already exists (same prefix + name), assume dedup try { await stat(filePath); return { filePath, bytes: downloadedBytes, contentType, sha256: hexDigest }; } catch { // file doesn't exist — write it } const combined = Buffer.concat(chunks); await mkdir(prefixDir, { recursive: true, mode: 0o700 }); await pipeline(Readable.from([combined]), createWriteStream(filePath, { mode: 0o600 })); return { filePath, bytes: downloadedBytes, contentType, sha256: hexDigest }; } /** Remove downloaded files older than maxAgeMs (default 7 days). Returns count of removed files. */ export async function cleanupOldDownloads(maxAgeMs?: number, baseDir?: string): Promise { const age = maxAgeMs ?? DEFAULT_TTL_MS; baseDir ??= getDownloadsBaseDir(); const cutoff = Date.now() - age; let removed = 0; let prefixDirs: string[]; try { prefixDirs = await readdir(baseDir, { withFileTypes: true }).then((entries) => entries.filter((e) => e.isDirectory()).map((e) => path.join(baseDir, e.name)), ); } catch { // directory doesn't exist yet return 0; } for (const dir of prefixDirs) { let files: string[]; try { files = await readdir(dir); } catch { continue; } for (const file of files) { const filePath = path.join(dir, file); try { const stats = await stat(filePath); if (stats.isFile() && stats.mtimeMs < cutoff) { await unlink(filePath); removed++; } } catch { // race with concurrent access — skip } } // Remove empty prefix directories try { const remaining = await readdir(dir); if (remaining.length === 0) { await unlink(dir).catch(() => null); } } catch { // ignore } } return removed; } export async function computeSha256(filePath: string): Promise { const hash = createHash("sha256"); const stream = createReadStream(filePath); for await (const chunk of stream) { hash.update(chunk); } return hash.digest("hex"); }