import * as path from 'path'
import * as fs from 'fs'
import { GoogleGenAI } from '@google/genai'
import { convertGifToVideo } from './gif-converter'
import { tldrawExampleBase64 } from './tldraw_example'

/**
 * MIME types for the non-GIF media this module sends inline, keyed by
 * lower-cased file extension (without the dot). A Map is used rather than an
 * object literal so that extensions such as "constructor" cannot resolve to
 * inherited Object.prototype members.
 */
const MIME_TYPES_BY_EXTENSION = new Map<string, string>([
  ['mp4', 'video/mp4'],
  ['png', 'image/png'],
  // "image/jpg" is not a registered MIME type; both spellings map to image/jpeg.
  ['jpg', 'image/jpeg'],
  ['jpeg', 'image/jpeg'],
])

/** Caption sent right after the reference screenshot of the tldraw UI. */
const TLDRAW_REFERENCE_CAPTION =
  'This is the tldraw user interface. You can use this to check whether the media is tldraw; and if it is, then you can describe it in terms of the tldraw application as provided.'

/**
 * Builds the instruction prompt that accompanies a piece of media.
 *
 * @param mimeType - MIME type of the media being described. Only the subtype
 * (the part after the "/") is interpolated into the prompt.
 * @returns The prompt as a single line of text; sentences are separated by
 * exactly one space.
 */
function getPrompt(mimeType: string): string {
  const mediaKind = mimeType.split('/')[1]

  // The wording below is sent to the model as-is; it is split per sentence
  // purely for readability and re-joined with single spaces.
  return [
    `Please analyze this ${mediaKind} and provide a detailed description of its contents.`,
    `For images, summarize the image in a human-readable way, with a focus on keywords that might be important for understanding the content.`,
    `For videos, provide a detailed description of what is happening.`,
    `Include information about the main subject, any notable actions or movements, and any relevant context that might be important for understanding the content.`,
    `Be sure to pay attention to what is changed or modified in the GIF, such as the placement or display of shapes, their place in the scene graph (parenting), and any other changes that might be important for understanding the image without seeing it, or getting a better understanding of the scene alongside the media.`,
    `Return your response in a single paragraph with no markdown formatting.`,
    `Here's more information that can help you make your descriptions.`,
    `The images or videos you receive are most likely related to tldraw, an infinite canvas SDK that has a popular whiteboard app (also called tldraw) at tldraw.com.`,
    `If the GIF might be tldraw, it almost certainly is tldraw.`,
    `You may see the tldraw logo in the image or video.`,
    `You can refer to the toolbar, which is the horizontal set of buttons at the bottom of the screen, with buttons for the folllowing tools: 'select', 'hand' 'draw', 'eraser', 'arrow', 'text', 'note', 'image', 'rectangle', as well as an overflow menu containing more tools such as 'line', 'highlighter', 'laser', and 'frame', as well as many other geometric tools.`,
    `The media may also contain the "style panel", which includes a grid of colors, an opacity slider, and radio buttons for the fill, stroke type, size, font alignment, label alignment, and other style properties of the selected shapes.`,
    `When using the 'select' tool, the user may 'select' shapes by clicking on them and may 'translate' or 'resize' their selection using the corner or edge 'handles'.`,
    `Similarly, a user may 'bend' arrows by dragging its middle handle, or may 'bind' arrows to shapes by dragging the arrowhead handles onto the shapes.`,
    `The main menu is at the top left, with a page menu, and a quick action panel with commands for moving shapes front to back, rotating them, and aligning or distributing them.`,
  ].join(' ')
}

/**
 * Sends one piece of base64-encoded media plus the description prompt to the
 * model and returns the text of the response.
 *
 * @param ai - Client used to issue the request.
 * @param model - Model name passed through to `generateContent`.
 * @param media - MIME type and base64 payload of the media to describe.
 * @param includeTldrawReference - When true, the reference screenshot and its
 * caption are placed ahead of the media in the request.
 * @returns The response text.
 * @throws Error if the response carries no text.
 */
async function requestDescription(
  ai: GoogleGenAI,
  model: string,
  media: { mimeType: string; data: string },
  includeTldrawReference: boolean
): Promise<string> {
  const mediaParts = [
    { inlineData: { mimeType: media.mimeType, data: media.data } },
    { text: getPrompt(media.mimeType) },
  ]

  const parts = includeTldrawReference
    ? [
        { inlineData: { mimeType: 'image/jpeg', data: tldrawExampleBase64 } },
        { text: TLDRAW_REFERENCE_CAPTION },
        ...mediaParts,
      ]
    : mediaParts

  const result = await ai.models.generateContent({
    model,
    contents: [{ parts }],
  })

  const text = result.text
  if (!text) {
    throw new Error('No response text received from Gemini API')
  }
  return text
}

/**
 * Asks a Gemini model to describe the image or video stored at `filePath`.
 *
 * Supported extensions are gif, mp4, png, jpg and jpeg (case-insensitive).
 * GIFs are handed to `generateDescriptionOfGif`; everything else is read from
 * disk and sent inline together with a reference screenshot of the tldraw UI.
 *
 * @param filePath - Path of the media file to describe.
 * @param model - Model name. Because this defaulted parameter precedes the
 * required `apiKey`, callers must pass `undefined` explicitly to get the
 * default.
 * @param apiKey - API key used to construct the client.
 * @returns The model's description of the media.
 * @throws Error if `apiKey` is empty, if the extension is not supported, or if
 * the model returns no text. File-system errors from reading the file
 * propagate unchanged.
 */
export async function analyzeWithGemini(
  filePath: string,
  model: string = 'gemini-2.0-flash',
  apiKey: string
): Promise<string> {
  if (!apiKey) {
    throw new Error('No API key provided')
  }

  const ai = new GoogleGenAI({ apiKey })

  // path.extname only looks at the final path segment, so a dot in a parent
  // directory name is never mistaken for an extension.
  const fileExt = path.extname(filePath).slice(1).toLowerCase()

  if (fileExt === 'gif') {
    return generateDescriptionOfGif(ai, model, filePath)
  }

  const mimeType = MIME_TYPES_BY_EXTENSION.get(fileExt)
  if (!mimeType) {
    throw new Error(`Unsupported file type: ${fileExt || '(none)'}`)
  }

  // Read only after the type is known to be supported, so GIFs and rejected
  // files are not loaded and encoded for nothing.
  const base64Data = fs.readFileSync(filePath).toString('base64')

  return requestDescription(ai, model, { mimeType, data: base64Data }, true)
}

/**
 * Describes a GIF by converting it with `convertGifToVideo` and sending the
 * resulting file to the model as `video/mp4`.
 *
 * The file at the path returned by `convertGifToVideo` is deleted afterwards
 * whether or not the request succeeds; a failed deletion is logged as a
 * warning and does not fail the call.
 *
 * NOTE(review): unlike the image/video path, this request does not include
 * the tldraw reference screenshot. That matches the previous behavior —
 * confirm whether it is intentional.
 *
 * @param ai - Client used to issue the request.
 * @param model - Model name passed through to `generateContent`.
 * @param filePath - Path of the GIF to describe.
 * @returns The model's description of the converted video.
 * @throws Error if the model returns no text; errors from the conversion or
 * from reading the converted file propagate unchanged.
 */
async function generateDescriptionOfGif(
  ai: GoogleGenAI,
  model: string,
  filePath: string
): Promise<string> {
  // Convert GIF to MP4 first
  const mp4Path = await convertGifToVideo(filePath)

  try {
    const mp4Base64 = fs.readFileSync(mp4Path).toString('base64')
    // Awaited here (not just returned) so the cleanup below runs only after
    // the request has settled.
    return await requestDescription(
      ai,
      model,
      { mimeType: 'video/mp4', data: mp4Base64 },
      false
    )
  } finally {
    // Clean up the temporary MP4 file
    try {
      fs.unlinkSync(mp4Path)
    } catch (error) {
      console.warn('Failed to clean up temporary MP4 file:', error)
    }
  }
}