/**
 * analyze_image - Pi Extension
 *
 * Allows non-vision models to analyze images by spawning a vision-capable
 * subagent. The subagent reads images using the `read` tool and returns
 * a plain text description.
 *
 * Install: Copy or symlink to ~/.pi/agent/extensions/analyze-image/index.ts
 *
 * Config: ~/.pi/agent/extensions/analyze-image/config.json
 *   {
 *     "defaultModel": "gemma4:31b-cloud",
 *     "systemPrompt": "...",
 *     "maxImagesPerCall": 10
 *   }
 */

import { spawn } from "node:child_process";
import * as fs from "node:fs";
import * as os from "node:os";
import * as path from "node:path";
import type { AgentToolResult } from "@mariozechner/pi-agent-core";
import type { Message } from "@mariozechner/pi-ai";
import { type ExtensionAPI } from "@mariozechner/pi-coding-agent";
import { Text } from "@mariozechner/pi-tui";
import { Type } from "@sinclair/typebox";

// ============================================================================
// Configuration
// ============================================================================

/** User-tunable settings, read from config.json (see file header). */
interface AnalyzeImageConfig {
  /** Model passed to the subagent via `--model` when the tool call has no override. */
  defaultModel: string;
  /** Text written to a temp file and passed via `--append-system-prompt`. */
  systemPrompt: string;
  /** Upper bound on the number of image paths accepted by one tool call. */
  maxImagesPerCall: number;
}

const DEFAULT_CONFIG: AnalyzeImageConfig = {
  defaultModel: "gemma4:31b-cloud",
  systemPrompt: [
    "You are an image analysis assistant. You MUST:",
    "1. Read EVERY image file listed below using the `read` tool before answering",
    "2. Answer the user's question about the images accurately and thoroughly",
    "3. Return ONLY your analysis as plain text",
    "",
    "Do NOT describe images without reading them first.",
    "Do NOT suggest further actions or offer to do additional work.",
    "Focus entirely on answering the user's question about the images.",
  ].join("\n"),
  maxImagesPerCall: 10,
};

/** Lower-cased file extensions (including the dot) accepted by `validateImagePath`. */
const SUPPORTED_EXTENSIONS = new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"]);

/**
 * Load the extension config from `~/.pi/agent/extensions/analyze-image/config.json`.
 *
 * Each field is type-checked individually; a missing (or `null`) field silently
 * falls back to its default, while a present-but-invalid field falls back with a
 * logged warning. Any read/parse failure is logged and yields the defaults.
 *
 * @returns A fresh config object (never the shared `DEFAULT_CONFIG` instance).
 */
function loadConfig(): AnalyzeImageConfig {
  const configPath = path.join(
    os.homedir(),
    ".pi",
    "agent",
    "extensions",
    "analyze-image",
    "config.json",
  );

  try {
    if (fs.existsSync(configPath)) {
      const raw: unknown = JSON.parse(fs.readFileSync(configPath, "utf-8"));
      if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
        throw new Error("config root must be a JSON object");
      }
      const fields = raw as Record<string, unknown>;
      const loaded: AnalyzeImageConfig = { ...DEFAULT_CONFIG };
      const warnInvalid = (key: string, expected: string) => {
        console.error(
          `[analyze-image] Ignoring invalid '${key}' in '${configPath}': expected ${expected}`,
        );
      };

      const { defaultModel, systemPrompt, maxImagesPerCall } = fields;

      if (defaultModel != null) {
        // An empty model name would be passed straight to `--model`, so reject it.
        if (typeof defaultModel === "string" && defaultModel.trim() !== "") {
          loaded.defaultModel = defaultModel;
        } else {
          warnInvalid("defaultModel", "a non-empty string");
        }
      }

      if (systemPrompt != null) {
        if (typeof systemPrompt === "string") {
          loaded.systemPrompt = systemPrompt;
        } else {
          warnInvalid("systemPrompt", "a string");
        }
      }

      if (maxImagesPerCall != null) {
        // Zero or negative would make every call fail the "Too many images" check.
        if (
          typeof maxImagesPerCall === "number" &&
          Number.isInteger(maxImagesPerCall) &&
          maxImagesPerCall > 0
        ) {
          loaded.maxImagesPerCall = maxImagesPerCall;
        } else {
          warnInvalid("maxImagesPerCall", "a positive integer");
        }
      }

      return loaded;
    }
  } catch (error) {
    console.error(`[analyze-image] Failed to load config from '${configPath}':`, error);
  }

  return { ...DEFAULT_CONFIG };
}

// ============================================================================
// Image validation
// ============================================================================

/**
 * Resolve an image path against `cwd` and check that it is usable.
 *
 * @param imagePath - Path as supplied in the tool call (absolute or relative).
 * @param cwd - Directory that relative paths are resolved against.
 * @returns The absolute path of the image.
 * @throws Error if the path does not exist, is not a regular file, or its
 *   extension is not in `SUPPORTED_EXTENSIONS`.
 */
function validateImagePath(imagePath: string, cwd: string): string {
  const absolutePath = path.resolve(cwd, imagePath);

  let stats: fs.Stats;
  try {
    stats = fs.statSync(absolutePath);
  } catch {
    throw new Error(`Image file not found: ${absolutePath}`);
  }
  // A directory named e.g. "shots.png" would otherwise pass the extension check.
  if (!stats.isFile()) {
    throw new Error(`Image path is not a regular file: ${absolutePath}`);
  }

  const ext = path.extname(absolutePath).toLowerCase();
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
    throw new Error(
      `Unsupported image format '${ext}'. Supported: ${[...SUPPORTED_EXTENSIONS].join(", ")}`,
    );
  }

  return absolutePath;
}

// ============================================================================
// Subagent execution
// ============================================================================

/**
 * Extract the subagent's answer from the collected messages.
 *
 * Scans from the newest message backwards and returns the first text part of
 * the most recent assistant message that has one.
 *
 * @returns The text, or `""` when no assistant message contains a text part.
 */
function getFinalOutput(messages: Message[]): string {
  for (let i = messages.length - 1; i >= 0; i--) {
    const msg = messages[i];
    if (msg.role === "assistant") {
      const content = msg.content;
      if (Array.isArray(content)) {
        for (const part of content) {
          if (typeof part === "object" && "type" in part && part.type === "text") {
            return (part as { type: "text"; text: string }).text;
          }
        }
      }
    }
  }
  return "";
}

/**
 * Work out how to launch another `pi` process with the given CLI arguments.
 *
 * Resolution order:
 *  1. If `process.argv[1]` names an existing file, re-run it with the current
 *     executable (`process.execPath`).
 *  2. Otherwise, if the current executable is not `node`/`bun`, invoke that
 *     executable directly with `args`.
 *  3. Otherwise fall back to a `pi` command looked up by the OS.
 */
function getPiInvocation(args: string[]): { command: string; args: string[] } {
  const currentScript = process.argv[1];
  if (currentScript && fs.existsSync(currentScript)) {
    return { command: process.execPath, args: [currentScript, ...args] };
  }

  const execName = path.basename(process.execPath).toLowerCase();
  if (!/^(node|bun)(\.exe)?$/.test(execName)) {
    return { command: process.execPath, args };
  }

  return { command: "pi", args };
}

/**
 * Run a `pi` subprocess in JSON mode with only the `read` tool enabled and
 * collect its final assistant text.
 *
 * The system prompt is written to a private temp directory which is removed
 * (best effort) before returning. Stdout is parsed as newline-delimited JSON;
 * `message_end` events are collected and non-JSON lines are skipped.
 *
 * @param model - Value passed to `--model`.
 * @param systemPrompt - Content passed via `--append-system-prompt`.
 * @param task - Task text; sent as the positional prompt prefixed with `Task: `.
 * @param cwd - Working directory of the subprocess.
 * @param signal - Optional abort signal; on abort the child receives SIGTERM and,
 *   if still running 5 seconds later, SIGKILL.
 * @returns Exit code, final assistant text (may be empty) and captured stderr.
 *   If the process cannot be spawned, `exitCode` is 1 and the error message is
 *   appended to `stderr`.
 * @throws Error("Image analysis was aborted") when `signal` fired.
 */
async function runVisionSubagent(
  model: string,
  systemPrompt: string,
  task: string,
  cwd: string,
  signal: AbortSignal | undefined,
): Promise<{ exitCode: number; output: string; stderr: string }> {
  const tmpDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), "pi-analyze-image-"));
  const promptPath = path.join(tmpDir, "system-prompt.md");
  const result = { exitCode: 0, output: "", stderr: "" };
  const messages: Message[] = [];
  let wasAborted = false;

  try {
    await fs.promises.writeFile(promptPath, systemPrompt, { encoding: "utf-8", mode: 0o600 });

    const args: string[] = [
      "--mode",
      "json",
      "-p",
      "--no-session",
      "--model",
      model,
      "--tools",
      "read",
      "--append-system-prompt",
      promptPath,
      `Task: ${task}`,
    ];

    result.exitCode = await new Promise<number>((resolve) => {
      const invocation = getPiInvocation(args);
      const proc = spawn(invocation.command, invocation.args, {
        cwd,
        shell: false,
        stdio: ["ignore", "pipe", "pipe"],
      });

      // Decode at the stream level so a multi-byte UTF-8 character split across
      // two chunks is not corrupted (per-chunk Buffer.toString() would do that).
      proc.stdout.setEncoding("utf8");
      proc.stderr.setEncoding("utf8");

      let stdoutBuffer = "";
      let forceKillTimer: ReturnType<typeof setTimeout> | undefined;

      const processLine = (line: string) => {
        if (!line.trim()) return;
        try {
          const event = JSON.parse(line);
          if (event.type === "message_end" && event.message) {
            messages.push(event.message as Message);
          }
        } catch {
          /* skip non-JSON lines */
        }
      };

      // `proc.killed` only says a signal was *sent*; use exitCode/signalCode to
      // find out whether the child has actually terminated.
      const hasExited = () => proc.exitCode !== null || proc.signalCode !== null;

      const killProc = () => {
        wasAborted = true;
        proc.kill("SIGTERM");
        forceKillTimer = setTimeout(() => {
          if (!hasExited()) proc.kill("SIGKILL");
        }, 5000);
      };

      const detachAbortListener = () => {
        signal?.removeEventListener("abort", killProc);
      };

      proc.stdout.on("data", (chunk: string) => {
        stdoutBuffer += chunk;
        const lines = stdoutBuffer.split("\n");
        // The last element is an incomplete line (or "" after a trailing newline).
        stdoutBuffer = lines.pop() || "";
        for (const line of lines) processLine(line);
      });

      proc.stderr.on("data", (chunk: string) => {
        result.stderr += chunk;
      });

      proc.on("close", (code) => {
        if (forceKillTimer !== undefined) clearTimeout(forceKillTimer);
        detachAbortListener();
        if (stdoutBuffer.trim()) processLine(stdoutBuffer);
        // A null code means the child was terminated by a signal; report failure.
        resolve(code ?? 1);
      });

      proc.on("error", (error) => {
        detachAbortListener();
        // Keep the reason so the caller can show more than "failed with no output".
        const separator = result.stderr && !result.stderr.endsWith("\n") ? "\n" : "";
        result.stderr += `${separator}Subagent process error (${invocation.command}): ${error.message}`;
        resolve(1);
      });

      if (signal) {
        if (signal.aborted) killProc();
        else signal.addEventListener("abort", killProc, { once: true });
      }
    });

    if (wasAborted) throw new Error("Image analysis was aborted");

    result.output = getFinalOutput(messages);
    return result;
  } finally {
    // Best-effort cleanup of the temp prompt directory; failures are not actionable.
    try {
      await fs.promises.rm(tmpDir, { recursive: true, force: true });
    } catch {
      /* ignore */
    }
  }
}

// ============================================================================
// Tool details
// ============================================================================

/** Metadata attached to every tool result (used by `renderResult`). */
interface AnalyzeImageDetails {
  model: string;
  imageCount: number;
  imagePaths: string[];
  question: string;
  exitCode: number;
  durationMs: number;
}

// ============================================================================
// Schema
// ============================================================================

const AnalyzeImageParams = Type.Object({
  images: Type.Array(Type.String({ description: "Local file path to an image" }), {
    description: "One or more local image file paths to analyze",
  }),
  question: Type.String({ description: "Question about the image(s)" }),
  model: Type.Optional(
    Type.String({ description: "Override the configured default vision model" }),
  ),
});

// ============================================================================
// Extension
// ============================================================================

/**
 * Extension entry point: registers the `analyze_image` tool.
 *
 * The config is loaded once at registration and reloaded on every
 * `session_start` event.
 */
export default function (pi: ExtensionAPI) {
  let config = loadConfig();

  // Reload config on each new session
  pi.on("session_start", () => {
    config = loadConfig();
  });

  pi.registerTool({
    name: "analyze_image",
    label: "Analyze Image",
    description: [
      "Analyze one or more images using a vision-capable model.",
      "Provide local file paths and a question about the images.",
      "A vision subagent reads the images and returns a plain text description.",
      "Use this tool when you cannot view images directly.",
      "Supports PNG, JPG, JPEG, GIF, WebP, and BMP formats.",
    ].join(" "),
    promptSnippet: "Analyze images with a vision model",
    promptGuidelines: [
      "Use analyze_image when you need to understand image content but cannot view images directly.",
      "Provide specific, focused questions for best results.",
    ],
    parameters: AnalyzeImageParams,

    /**
     * Validate the request, run the vision subagent and wrap its answer.
     *
     * Validation failures, a non-zero subagent exit code, empty output and
     * thrown errors (including abort) are all returned as `isError: true`
     * results rather than thrown.
     */
    async execute(
      _toolCallId: string,
      params: any,
      signal: AbortSignal | undefined,
      _onUpdate: any,
      ctx: any,
    ): Promise<AgentToolResult<AnalyzeImageDetails>> {
      const { images, question, model: modelOverride } = params;
      const model = modelOverride || config.defaultModel;

      const baseDetails: AnalyzeImageDetails = {
        model,
        imageCount: images.length,
        imagePaths: images,
        question,
        exitCode: 0,
        durationMs: 0,
      };

      // Validate image count
      if (images.length === 0) {
        return {
          content: [{ type: "text", text: "No images provided. Pass at least one image path." }],
          isError: true,
          details: { ...baseDetails, exitCode: 1 },
        };
      }

      if (images.length > config.maxImagesPerCall) {
        return {
          content: [
            {
              type: "text",
              text: `Too many images (${images.length}). Maximum is ${config.maxImagesPerCall}.`,
            },
          ],
          isError: true,
          details: { ...baseDetails, exitCode: 1 },
        };
      }

      // Validate image paths
      const absolutePaths: string[] = [];
      for (const img of images) {
        try {
          absolutePaths.push(validateImagePath(img, ctx.cwd));
        } catch (error) {
          return {
            content: [
              { type: "text", text: error instanceof Error ? error.message : String(error) },
            ],
            isError: true,
            details: { ...baseDetails, exitCode: 1 },
          };
        }
      }

      // Build task prompt for the subagent
      const imageList = absolutePaths.map((p) => `  - ${p}`).join("\n");
      const task = [
        `Read and analyze the following image(s):`,
        imageList,
        "",
        `Question: ${question}`,
        "",
        "Remember: Read each image file first using the read tool, then answer the question.",
      ].join("\n");

      const startTime = Date.now();

      try {
        const subResult = await runVisionSubagent(model, config.systemPrompt, task, ctx.cwd, signal);
        const durationMs = Date.now() - startTime;

        if (subResult.exitCode !== 0 || !subResult.output) {
          const errorMsg = subResult.stderr || subResult.output || "Subagent failed with no output";
          return {
            content: [{ type: "text", text: `Image analysis failed: ${errorMsg}` }],
            isError: true,
            details: {
              ...baseDetails,
              imagePaths: absolutePaths,
              exitCode: subResult.exitCode,
              durationMs,
            },
          };
        }

        return {
          content: [{ type: "text", text: subResult.output }],
          details: { ...baseDetails, imagePaths: absolutePaths, exitCode: 0, durationMs },
        };
      } catch (error) {
        const durationMs = Date.now() - startTime;
        return {
          content: [
            {
              type: "text",
              text: `Image analysis error: ${error instanceof Error ? error.message : String(error)}`,
            },
          ],
          isError: true,
          details: { ...baseDetails, imagePaths: absolutePaths, exitCode: 1, durationMs },
        };
      }
    },

    /** Render the tool call: image count, optional model, question preview, file names. */
    renderCall(args, theme) {
      const imageCount = args.images?.length ?? 0;
      // Truncate long questions to 50 characters for the one-line preview.
      const preview = args.question
        ? args.question.length > 50
          ? `${args.question.slice(0, 50)}...`
          : args.question
        : "(no question)";

      let text = theme.fg("toolTitle", theme.bold("analyze_image "));
      text += theme.fg("accent", `${imageCount} image${imageCount !== 1 ? "s" : ""}`);
      if (args.model) text += theme.fg("muted", ` [${args.model}]`);
      text += "\n  " + theme.fg("dim", preview);

      // Show image file names when few enough
      if (args.images && args.images.length <= 3) {
        for (const img of args.images) {
          const name = path.basename(img);
          text += "\n  " + theme.fg("dim", `📄 ${name}`);
        }
      }

      return new Text(text, 0, 0);
    },

    /** Render the result: status icon, model, duration, and the (possibly truncated) text. */
    renderResult(result, options, theme) {
      const details = result.details as AnalyzeImageDetails | undefined;

      const icon = result.isError ? theme.fg("error", "✗") : theme.fg("success", "✓");
      let text = icon + " " + theme.fg("toolTitle", theme.bold("analyze_image"));

      if (details?.model) text += theme.fg("muted", ` (${details.model})`);
      if (details?.durationMs) {
        const seconds = (details.durationMs / 1000).toFixed(1);
        text += theme.fg("dim", ` ${seconds}s`);
      }

      const output = result.content[0];
      if (output?.type === "text") {
        if (options.expanded) {
          text += "\n" + theme.fg("toolOutput", output.text);
        } else {
          // Collapsed view: first 5 lines plus a hint about the remainder.
          const lines = output.text.split("\n");
          const previewLines = lines.slice(0, 5);
          text += "\n" + theme.fg("toolOutput", previewLines.join("\n"));
          if (lines.length > 5) {
            text += theme.fg("muted", `\n... (${lines.length - 5} more lines, Ctrl+O to expand)`);
          }
        }
      }

      return new Text(text, 0, 0);
    },
  });
}