import { tool } from '@strands-agents/sdk'
import { z } from 'zod'

/** CSS class that marks elements to be left out of captured screenshots. */
const SCREENSHOT_IGNORE_CLASS = 'careless-ignore-screenshot'

/**
 * Turn any thrown value into a printable message.
 * `catch` variables are `unknown`; a non-Error throw has no `.message`.
 */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

/**
 * analyze_image — acknowledges an image-analysis request.
 *
 * The callback does not touch any image data itself: it echoes the query back
 * in a JSON string together with a static list of capabilities.
 */
export const analyzeImageTool = tool({
  name: 'analyze_image',
  description:
    'Analyze an image (paste, drop, or camera). Image content is provided as multimodal content block by the user.',
  inputSchema: z.object({
    query: z.string().describe('What to look for or analyze'),
  }),
  callback: (input) =>
    JSON.stringify({
      status: 'ready',
      message: `Vision analysis ready: "${input.query}". Provide image as multimodal block.`,
      capabilities: ['object_detection', 'text_extraction', 'scene_description', 'code_reading'],
    }),
})

/**
 * capture_screenshot — renders the page body (or a CSS-selected element) to a
 * PNG with html2canvas.
 *
 * On success it does two things:
 *  1. dispatches a `careless:screenshot-captured` CustomEvent on `window`
 *     whose `detail` carries the data URL, the raw base64, and metadata;
 *  2. returns a JSON string with the dimensions, approximate size, and an
 *     `imageBlock` holding the base64 PNG bytes.
 *
 * On failure it returns a JSON string with `status: 'error'` instead of
 * throwing.
 *
 * NOTE(review): the tool result is a JSON *string*. Whether the SDK surfaces
 * `imageBlock` to the model as a real image (rather than as text) is not
 * visible from this file; presumably a listener for the event above is what
 * displays/forwards the pixels — confirm against the App code.
 */
export const captureScreenshotTool = tool({
  name: 'capture_screenshot',
  description:
    'Take a screenshot of the current page viewport (or a CSS-selected element). The image is shown inline AND appended to the conversation so you (the AI) can analyze it visually on the next turn. Use this when the user asks what you see on screen, or to debug UI.',
  inputSchema: z.object({
    selector: z.string().optional().describe('Optional CSS selector; defaults to whole body'),
    reason: z.string().optional().describe('Short description of what you want to look at'),
  }),
  callback: async (input) => {
    // Everything below needs the DOM; fail with a clear message up front
    // instead of a ReferenceError surfacing through the generic catch.
    if (typeof window === 'undefined' || typeof document === 'undefined') {
      return JSON.stringify({
        status: 'error',
        error: 'capture_screenshot requires a browser environment (window/document unavailable).',
      })
    }

    // querySelector throws on a syntactically invalid selector; report that
    // as a selector problem rather than as an html2canvas failure.
    let target: HTMLElement | null
    try {
      target = input.selector ? document.querySelector<HTMLElement>(input.selector) : document.body
    } catch (err: unknown) {
      return JSON.stringify({
        status: 'error',
        error: `Invalid selector "${input.selector}": ${describeError(err)}`,
      })
    }
    if (!target) {
      return JSON.stringify({ status: 'error', error: `Element not found: ${input.selector}` })
    }

    try {
      const html2canvas = (await import('html2canvas')).default
      const canvas = await html2canvas(target, {
        useCORS: true,
        allowTaint: true,
        scale: Math.min(2, window.devicePixelRatio || 1),
        logging: false,
        // Skip careless UI bits so screenshot isn't recursive
        ignoreElements: (el) => el.classList?.contains(SCREENSHOT_IGNORE_CLASS),
      })

      const dataUrl = canvas.toDataURL('image/png')
      // A data URL is "<header>,<payload>"; an empty capture has no payload.
      const base64 = dataUrl.split(',')[1] ?? ''
      if (!base64) {
        return JSON.stringify({
          status: 'error',
          error: `Captured image is empty for "${input.selector || 'body'}"`,
          hint: 'The target may have zero width or height, or be hidden.',
        })
      }

      // Each base64 character encodes 6 bits, i.e. 0.75 bytes.
      const sizeKB = Math.round((base64.length * 0.75) / 1024)
      // Read the clock once so the event id and timestamp agree.
      const capturedAt = Date.now()

      window.dispatchEvent(
        new CustomEvent('careless:screenshot-captured', {
          detail: {
            id: 'shot-' + capturedAt,
            dataUrl,
            base64,
            sizeKB,
            width: canvas.width,
            height: canvas.height,
            reason: input.reason || 'screenshot',
            selector: input.selector || 'body',
            timestamp: capturedAt,
          },
        }),
      )

      // Return the image bytes in the tool result alongside the metadata.
      return JSON.stringify({
        status: 'success',
        width: canvas.width,
        height: canvas.height,
        sizeKB,
        imageBlock: { image: { format: 'png', source: { bytes: base64 } } },
        note: 'Screenshot captured and attached. It will be visible in the next model turn.',
      })
    } catch (err: unknown) {
      return JSON.stringify({
        status: 'error',
        error: describeError(err),
        hint: 'html2canvas failed. Check that no cross-origin iframes block capture.',
      })
    }
  },
})

/**
 * capture_camera — asks the UI to open the device camera.
 *
 * Dispatches a `careless:camera-open` CustomEvent on `window` (when a window
 * exists) carrying the reason, then returns a JSON acknowledgement. The
 * callback itself does not access the camera.
 */
export const captureCameraRequestTool = tool({
  name: 'capture_camera',
  description:
    'Open the device camera so the user can capture a photo for you. The photo becomes a multimodal block on the next user turn.',
  inputSchema: z.object({ reason: z.string() }),
  callback: (input) => {
    if (typeof window !== 'undefined') {
      window.dispatchEvent(new CustomEvent('careless:camera-open', { detail: { reason: input.reason } }))
    }
    return JSON.stringify({ status: 'capture_requested', reason: input.reason })
  },
})

/** All vision-related tools, in registration order. */
export const VISION_TOOLS = [analyzeImageTool, captureScreenshotTool, captureCameraRequestTool]