/**
 * transformers-js.ts — On-device ML via 🤗 Transformers.js v4.2.0
 *
 * Runs ANY HF model from the Hub locally in the browser via ONNX Runtime Web.
 * Backends: WASM (universal) or WebGPU (faster, limited model support).
 * Models download once and are cached by the browser (Cache API, toggled via
 * `env.useBrowserCache` — see `transformers_configure`).
 *
 * Package: @huggingface/transformers (v4.2.0)
 *
 * Coverage verified against node_modules/@huggingface/transformers/src/pipelines/index.js
 * — all 25 task types + 5 aliases supported, defaults match upstream exactly.
 */

import { tool } from '@strands-agents/sdk'
import { z } from 'zod'

/* ──────────────────────────────────────────────────────────────────────────
 * Task registry — sourced DIRECTLY from transformers.js v4.2.0 source.
 * Keeping this client-side lets the agent introspect available tasks +
 * default models without reaching into node_modules.
 * ──────────────────────────────────────────────────────────────────────── */

/** Static metadata for one pipeline task. */
type TaskInfo = {
  /** Modality bucket used by the `transformers_list_tasks` filter. */
  type: 'text' | 'audio' | 'image' | 'multimodal'
  /** Model id used when the caller does not pass `model`. */
  defaultModel: string
  /** Dtype used for the default model unless the fp32 fallback applies. */
  defaultDtype?: string
  /** Human/agent-readable summary of the task. */
  description: string
}

/** Every supported pipeline task, keyed by canonical task name. */
export const TASK_REGISTRY: Record<string, TaskInfo> = {
  // ── Text ──
  'text-classification': {
    type: 'text',
    defaultModel: 'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
    description: 'Sentiment/topic classification. Returns label + score. Alias: sentiment-analysis.',
  },
  'token-classification': {
    type: 'text',
    defaultModel: 'Xenova/bert-base-multilingual-cased-ner-hrl',
    description: 'NER/POS tagging, per-token labels. Alias: ner.',
  },
  'question-answering': {
    type: 'text',
    defaultModel: 'Xenova/distilbert-base-cased-distilled-squad',
    description: 'Extractive QA. Input: { question, context }.',
  },
  'fill-mask': {
    type: 'text',
    defaultModel: 'onnx-community/ettin-encoder-32m-ONNX',
    defaultDtype: 'fp32',
    description: 'Masked-LM fill. Input must contain [MASK] (or model-specific token).',
  },
  summarization: {
    type: 'text',
    defaultModel: 'Xenova/distilbart-cnn-6-6',
    description: 'Abstractive summary of long text.',
  },
  translation: {
    type: 'text',
    defaultModel: 'Xenova/t5-small',
    description: 'Translate between languages. Task can be suffixed "_xx_to_yy".',
  },
  'text2text-generation': {
    type: 'text',
    defaultModel: 'Xenova/flan-t5-small',
    description: 'T5-style seq2seq (instruction-following, translation, QA).',
  },
  'text-generation': {
    type: 'text',
    defaultModel: 'onnx-community/Qwen3-0.6B-ONNX',
    defaultDtype: 'q4',
    description: 'Causal LM generation. Accepts plain text OR chat Message[]. Streams via TextStreamer.',
  },
  'zero-shot-classification': {
    type: 'text',
    defaultModel: 'Xenova/distilbert-base-uncased-mnli',
    description: 'Classify text against ARBITRARY candidate_labels at inference. No training.',
  },
  'feature-extraction': {
    type: 'text',
    defaultModel: 'onnx-community/all-MiniLM-L6-v2-ONNX',
    defaultDtype: 'fp32',
    description: 'Text embeddings for semantic search/clustering/RAG. Alias: embeddings.',
  },

  // ── Audio ──
  'audio-classification': {
    type: 'audio',
    defaultModel: 'Xenova/wav2vec2-base-superb-ks',
    description: 'Classify audio clip (keyword spotting, genre, speaker ID).',
  },
  'zero-shot-audio-classification': {
    type: 'multimodal',
    defaultModel: 'Xenova/clap-htsat-unfused',
    description: 'CLAP — classify audio against arbitrary text labels.',
  },
  'automatic-speech-recognition': {
    type: 'multimodal',
    defaultModel: 'Xenova/whisper-tiny.en',
    description: 'Speech-to-text. Whisper family. Alias: asr.',
  },
  'text-to-audio': {
    type: 'text',
    defaultModel: 'onnx-community/Supertonic-TTS-ONNX',
    defaultDtype: 'fp32',
    description: 'Text-to-speech. Alias: text-to-speech. Returns { audio: Float32Array, sampling_rate }.',
  },

  // ── Image ──
  'image-to-text': {
    type: 'multimodal',
    defaultModel: 'Xenova/vit-gpt2-image-captioning',
    description: 'Image captioning / VLM. Also works with Florence-2, Moondream for rich VLM tasks.',
  },
  'image-classification': {
    type: 'multimodal',
    defaultModel: 'Xenova/vit-base-patch16-224',
    description: 'Classify image into fixed label set.',
  },
  'image-segmentation': {
    type: 'multimodal',
    defaultModel: 'Xenova/detr-resnet-50-panoptic',
    description: 'Panoptic/semantic/instance segmentation. Returns masks per label.',
  },
  'background-removal': {
    type: 'image',
    defaultModel: 'Xenova/modnet',
    description: 'Alpha-matte foreground extraction (like remove.bg).',
  },
  'zero-shot-image-classification': {
    type: 'multimodal',
    defaultModel: 'Xenova/clip-vit-base-patch32',
    description: 'CLIP — classify image against arbitrary text labels.',
  },
  'object-detection': {
    type: 'multimodal',
    defaultModel: 'Xenova/detr-resnet-50',
    description: 'Detect objects with bounding boxes + labels.',
  },
  'zero-shot-object-detection': {
    type: 'multimodal',
    defaultModel: 'Xenova/owlvit-base-patch32',
    description: 'Detect arbitrary text-queried objects (OWL-ViT).',
  },
  'document-question-answering': {
    type: 'multimodal',
    defaultModel: 'Xenova/donut-base-finetuned-docvqa',
    description: 'Answer questions about a document image (Donut).',
  },
  'image-to-image': {
    type: 'image',
    defaultModel: 'Xenova/swin2SR-classical-sr-x2-64',
    description: 'Image super-resolution / enhancement.',
  },
  'depth-estimation': {
    type: 'image',
    defaultModel: 'onnx-community/depth-anything-v2-small',
    description: 'Monocular depth estimation. Returns per-pixel depth map.',
  },
  'image-feature-extraction': {
    type: 'image',
    defaultModel: 'onnx-community/dinov3-vits16-pretrain-lvd1689m-ONNX',
    defaultDtype: 'fp32',
    description: 'Image embeddings for visual search / retrieval.',
  },
}

/** Short task names accepted by the tools, mapped to their canonical registry key. */
export const TASK_ALIASES: Record<string, string> = {
  'sentiment-analysis': 'text-classification',
  ner: 'token-classification',
  asr: 'automatic-speech-recognition',
  'text-to-speech': 'text-to-audio',
  embeddings: 'feature-extraction',
}

/**
 * Map a user-supplied task name to its canonical registry key.
 *
 * Aliases are expanded; anything else is truncated at the first underscore so
 * that suffixed tasks such as "translation_en_to_fr" resolve to "translation".
 *
 * @param task Task name or alias as passed to a tool.
 * @returns The canonical task name (not guaranteed to exist in TASK_REGISTRY).
 */
function resolveTask(task: string): string {
  // Own-property check so names like "constructor" never resolve to Object.prototype members.
  if (Object.prototype.hasOwnProperty.call(TASK_ALIASES, task)) {
    const aliased = TASK_ALIASES[task]
    if (aliased !== undefined) return aliased
  }
  return task.split('_', 1)[0] ?? task
}

/* ──────────────────────────────────────────────────────────────────────────
 * Shared helpers
 * ──────────────────────────────────────────────────────────────────────── */

/** Dtype names accepted by the tools; single source for schemas and `transformers_list_tasks`. */
const DTYPE_NAMES = ['fp32', 'fp16', 'q8', 'int8', 'uint8', 'q4', 'q4f16', 'bnb4'] as const

/** Optional dtype input: either one dtype name or a per-session `{ sessionKey: dtype }` map. */
function dtypeInputSchema() {
  return z.union([z.enum(DTYPE_NAMES), z.record(z.string(), z.string())]).optional()
}

/**
 * Extract a printable message from anything that was thrown.
 * Falls back to `String(err)` when there is no non-empty string `message`
 * (e.g. a thrown number or string), so error results never lose their text.
 */
function errorMessage(err: unknown): string {
  const message = (err as { message?: unknown } | null | undefined)?.message
  return typeof message === 'string' && message ? message : String(err)
}

/* ──────────────────────────────────────────────────────────────────────────
 * Lazy loader + pipeline cache with proper disposal
 * ──────────────────────────────────────────────────────────────────────── */

/** Lazily imported `@huggingface/transformers` module; `null` until first use. */
let _transformers: any = null

/** Import the Transformers.js module on first call and reuse it afterwards. */
async function getTransformers() {
  if (!_transformers) _transformers = await import('@huggingface/transformers')
  return _transformers
}

/** A single dtype name, or a map from ONNX session key to dtype name. */
type DtypeSpec = string | Record<string, string>

/** A loaded pipeline plus the parameters it was loaded with. */
type CachedPipeline = {
  pipe: any
  task: string
  model: string
  device: string
  /** Dtype actually in effect (differs from the requested one after an fp32 retry). */
  dtype?: DtypeSpec
  /** `Date.now()` timestamp taken when loading finished. */
  loadedAt: number
}

/** Loaded pipelines keyed by `cacheKey(...)`. */
const PIPELINE_CACHE = new Map<string, CachedPipeline>()

/** Build the cache key for a (task, model, device, dtype) combination. */
function cacheKey(task: string, model: string, device: string, dtype?: DtypeSpec): string {
  const d = typeof dtype === 'object' ? JSON.stringify(dtype) : (dtype ?? 'default')
  return `${task}::${model}::${device}::${d}`
}

/**
 * Canonical string form of a dtype spec, used for structural comparison.
 * Object specs are serialized with sorted keys so key order does not matter.
 */
function canonicalDtype(dtype?: DtypeSpec): string {
  if (dtype === undefined) return 'default'
  if (typeof dtype === 'string') return dtype
  const pairs = Object.keys(dtype)
    .sort()
    .map((k) => [k, dtype[k]])
  return JSON.stringify(pairs)
}

/**
 * Firefox + ONNX Runtime Web has a known incompatibility with q8/q4 quantized
 * weights that use the MatMulNBits-with-merged-scale pattern. The error is:
 *
 *   "Missing required scale: model.<…>.weight_merged_0_scale for node:
 *    model.<…>.weight_transposed_DequantizeLinear"
 *
 * Chrome/Safari's bundled ORT tolerates this; Firefox's doesn't. It affects:
 *   - Whisper (decoder.embed_tokens)  - automatic-speech-recognition
 *   - BART/T5/NLLB (shared)           - summarization, translation
 *   - vit-gpt2 (shared)               - image-to-text
 *   - CLIP (shared)                   - zero-shot-image-classification
 *   - BERT-NER (shared)               - token-classification
 *   - Depth-Anything etc.             - depth-estimation
 *
 * Whenever possible, fall back to fp32 for all sessions. This doubles the
 * memory footprint but guarantees correctness across browsers.
 *
 * The list below is the set of session keys that get forced to fp32. The
 * original note cites upstream session_config.js for: model, encoder_model,
 * decoder_model_merged, embed_tokens, vision_encoder, audio_encoder,
 * encodec_decode, prompt_encoder_mask_decoder, prepare_inputs_embeds.
 * `decoder_model` and `text_encoder` are included here in addition to those.
 */
const ALL_SESSION_KEYS = [
  'model',
  'encoder_model',
  'decoder_model_merged',
  'decoder_model',
  'embed_tokens',
  'vision_encoder',
  'audio_encoder',
  'encodec_decode',
  'prompt_encoder_mask_decoder',
  'prepare_inputs_embeds',
  'text_encoder',
]

/** Produce a dtype object forcing fp32 on every session key — safest fallback. */
function allSessionsFp32(): Record<string, string> {
  const out: Record<string, string> = {}
  for (const k of ALL_SESSION_KEYS) out[k] = 'fp32'
  return out
}

/** Model-id fragments treated as decoder-only LLMs (safe to quantize). */
const DECODER_ONLY_LLM_PATTERN = /qwen|llama|phi|gemma|smollm|tinyllama|stablelm/i

/** Tasks whose default architectures use the shared-weight pattern described above. */
const FP32_FALLBACK_TASKS: ReadonlySet<string> = new Set([
  'automatic-speech-recognition', // Whisper
  'summarization', // BART, T5, distilbart
  'translation', // T5, NLLB, Marian
  'text2text-generation', // FLAN-T5, T5
  'image-to-text', // vit-gpt2, Florence-2, BLIP
  'document-question-answering', // Donut
  'zero-shot-image-classification', // CLIP
  'token-classification', // BERT-NER (shared embeddings)
  'image-classification', // ViT (shared)
  'image-segmentation', // DETR, SegFormer
  'object-detection', // DETR, OWL-ViT
  'zero-shot-object-detection',
  'depth-estimation',
  'image-feature-extraction',
])

/**
 * Decide whether a (task, model) pair should be loaded with fp32 on every session.
 *
 * This is a static heuristic on the task name and model id only — it does NOT
 * inspect the running browser, so the fp32 fallback is applied everywhere.
 * Returns false for the text-generation task and for model ids that look like
 * decoder-only LLMs; otherwise returns true when the task is in
 * FP32_FALLBACK_TASKS.
 *
 * @param resolvedTask Canonical task name (after alias resolution).
 * @param model Model id.
 */
function needsFp32Fallback(resolvedTask: string, model: string): boolean {
  // Decoder-only LLMs: NO shared weight merge — safe to quantize
  const isDecoderOnlyLLM = resolvedTask === 'text-generation' || DECODER_ONLY_LLM_PATTERN.test(model)
  if (isDecoderOnlyLLM) return false
  return FP32_FALLBACK_TASKS.has(resolvedTask)
}

/**
 * Pick a safe dtype for this (task, model, device) combination.
 *
 * Precedence: explicit user dtype → all-sessions fp32 when the fallback
 * heuristic applies → the registry's `defaultDtype` for the task (may be
 * undefined, which lets the library choose).
 *
 * @param resolvedTask Canonical task name.
 * @param model Model id.
 * @param _device Currently unused; kept so call sites stay unchanged.
 * @param userDtype Dtype passed by the caller, if any.
 */
function pickSafeDtype(
  resolvedTask: string,
  model: string,
  _device: string,
  userDtype?: DtypeSpec,
): DtypeSpec | undefined {
  if (userDtype !== undefined) return userDtype
  if (needsFp32Fallback(resolvedTask, model)) return allSessionsFp32()
  return TASK_REGISTRY[resolvedTask]?.defaultDtype
}

/**
 * Load (or fetch from cache) the pipeline for a task.
 *
 * On a load failure that looks like the ONNX quantization-scale error, and
 * only when the caller did not pass a dtype and fp32-everywhere was not
 * already tried, the load is retried once with fp32 on all sessions.
 *
 * @param opts.task Task name or alias.
 * @param opts.model Model id; defaults to the registry default for the task.
 * @param opts.device Backend; defaults to "wasm".
 * @param opts.dtype Explicit dtype override.
 * @param opts.progressCallback Forwarded to the library as `progress_callback`.
 * @returns The cached entry describing the loaded pipeline.
 * @throws Error when the task is unknown and no model was given, or when loading fails.
 */
async function getPipeline(opts: {
  task: string
  model?: string
  device?: string
  dtype?: DtypeSpec
  progressCallback?: (p: any) => void
}): Promise<CachedPipeline> {
  const resolvedTask = resolveTask(opts.task)
  const info = TASK_REGISTRY[resolvedTask]
  const model = opts.model ?? info?.defaultModel
  if (!model) throw new Error(`Unknown task: ${opts.task}. See transformers_list_tasks.`)

  const device = opts.device ?? 'wasm'
  const dtype = pickSafeDtype(resolvedTask, model, device, opts.dtype)
  const key = cacheKey(resolvedTask, model, device, dtype)

  const hit = PIPELINE_CACHE.get(key)
  if (hit) return hit

  const { pipeline } = await getTransformers()
  const pipelineOpts: any = { device }
  if (dtype) pipelineOpts.dtype = dtype
  if (opts.progressCallback) pipelineOpts.progress_callback = opts.progressCallback

  let pipe: any
  let effectiveDtype: DtypeSpec | undefined = dtype
  try {
    pipe = await pipeline(resolvedTask, model, pipelineOpts)
  } catch (err: unknown) {
    const msg = errorMessage(err)
    // Auto-retry once with all-sessions fp32 if it's the Firefox ORT quant bug.
    // Don't retry if the user explicitly passed a dtype (respect their choice)
    // or if we already tried fp32 everywhere (would be a different error).
    const isQuantBug =
      msg.includes('weight_merged_0_scale') ||
      msg.includes('Missing required scale') ||
      msg.includes('TransposeDQWeightsForMatMulNBits')
    const triedAllFp32 = typeof dtype === 'object' && Object.values(dtype).every((v) => v === 'fp32')

    if (isQuantBug && opts.dtype === undefined && !triedAllFp32) {
      try {
        const retryDtype = allSessionsFp32()
        const retryOpts = { ...pipelineOpts, dtype: retryDtype }
        pipe = await pipeline(resolvedTask, model, retryOpts)
        effectiveDtype = retryDtype
      } catch (retryErr: unknown) {
        const rmsg = errorMessage(retryErr)
        throw new Error(
          `${rmsg}\n\n→ First attempt and fp32-fallback retry both failed. ` +
            `Original error: ${msg}\n` +
            `Try a different model id — this ONNX model may not have fp32 weights published.`,
        )
      }
    } else if (isQuantBug) {
      throw new Error(
        `${msg}\n\n→ HINT: ONNX quantization mismatch (Firefox+ORT). ` +
          `Either remove the \`dtype\` override, or use a different model id with fp32 weights available.`,
      )
    } else if (msg.includes('WebGPU') || msg.includes('webgpu')) {
      throw new Error(`${msg}\n\n→ HINT: WebGPU failed. Retry with device: "wasm".`)
    } else {
      throw err
    }
  }

  // Cached under the key of the REQUESTED dtype so an identical request hits
  // the cache even when the fp32 retry changed the effective dtype.
  const entry: CachedPipeline = {
    pipe,
    task: resolvedTask,
    model,
    device,
    dtype: effectiveDtype,
    loadedAt: Date.now(),
  }
  PIPELINE_CACHE.set(key, entry)
  return entry
}

/* ──────────────────────────────────────────────────────────────────────────
 * Tools
 * ──────────────────────────────────────────────────────────────────────── */

/** List every task with default model + description — lets agent discover capabilities. */
export const transformersListTasksTool = tool({
  name: 'transformers_list_tasks',
  description:
    'List ALL supported Transformers.js tasks (25 pipeline types + 5 aliases) with their default models, ' +
    'type (text/audio/image/multimodal), and what each does. Use this to pick the right task before transformers_run.',
  inputSchema: z.object({
    filter: z.enum(['text', 'audio', 'image', 'multimodal', 'all']).optional(),
  }),
  callback: (input) => {
    const filter = input.filter ?? 'all'
    const tasks: Record<string, TaskInfo> = {}
    for (const [task, info] of Object.entries(TASK_REGISTRY)) {
      if (filter !== 'all' && info.type !== filter) continue
      tasks[task] = info
    }
    return JSON.stringify({
      status: 'success',
      count: Object.keys(tasks).length,
      tasks,
      aliases: TASK_ALIASES,
      dtypes: [...DTYPE_NAMES],
      devices: ['wasm', 'webgpu'],
    })
  },
})

/**
 * Pipelines that take candidate_labels as a 2nd positional argument:
 *   zero-shot-classification:        pipe(texts, candidate_labels, options)
 *   zero-shot-object-detection:      pipe(image, candidate_labels, options)
 *   zero-shot-image-classification:  pipe(image, candidate_labels, options)
 *   zero-shot-audio-classification:  pipe(audio, candidate_labels, options)
 */
const POSITIONAL_LABEL_TASKS: ReadonlySet<string> = new Set([
  'zero-shot-classification',
  'zero-shot-object-detection',
  'zero-shot-image-classification',
  'zero-shot-audio-classification',
])

/**
 * Universal pipeline runner — the one tool that covers all 25 tasks.
 */
export const transformersRunTool = tool({
  name: 'transformers_run',
  description:
    'Run ANY 🤗 Transformers.js pipeline on-device. Covers all 25 task types: ' +
    'text-classification, token-classification, question-answering, fill-mask, summarization, translation, ' +
    'text2text-generation, text-generation, zero-shot-classification, audio-classification, ' +
    'zero-shot-audio-classification, automatic-speech-recognition, text-to-audio, image-to-text, ' +
    'image-classification, image-segmentation, background-removal, zero-shot-image-classification, ' +
    'object-detection, zero-shot-object-detection, document-question-answering, image-to-image, ' +
    'depth-estimation, feature-extraction, image-feature-extraction. ' +
    'Call transformers_list_tasks first to see defaults. Models cache after first download.',
  inputSchema: z.object({
    task: z.string().describe('Pipeline task name (see transformers_list_tasks). Aliases accepted.'),
    input: z.any().describe(
      'Input matching the task type: string for text, URL/dataURL/blobURL for image/audio, ' +
        '{question,context} for QA, {image,question} for document-QA, Message[] for text-generation chat.',
    ),
    model: z.string().optional().describe('HF model id. Omit to use task default.'),
    options: z
      .record(z.string(), z.any())
      .optional()
      .describe('Pipeline call options (max_new_tokens, top_k, candidate_labels, etc.).'),
    device: z.enum(['wasm', 'webgpu']).optional(),
    dtype: dtypeInputSchema().describe(
      'Quantization. q4 for LLMs. For encoder-decoder models (Whisper, T5), pass an object: {encoder_model: "q8", decoder_model_merged: "fp32"}.',
    ),
  }),
  callback: async (input) => {
    try {
      const entry = await getPipeline({
        task: input.task,
        model: input.model,
        device: input.device,
        dtype: input.dtype,
      })

      let result: any
      if (POSITIONAL_LABEL_TASKS.has(entry.task)) {
        // NOTE: passing {candidate_labels: [...]} in options is a common mistake;
        // we silently rewrite it to positional for these tasks.
        const { candidate_labels: labels, ...callOptions } = (input.options ?? {}) as Record<string, any>
        if (!labels) {
          return JSON.stringify({
            status: 'error',
            task: entry.task,
            error: `${entry.task} requires options.candidate_labels: [ ... ] (array of class strings).`,
          })
        }
        result = await entry.pipe(input.input, labels, callOptions)
      } else {
        result = await entry.pipe(input.input, input.options ?? {})
      }

      // Audio output (TTS) is not JSON-serializable — summarize instead.
      if (result && result.audio instanceof Float32Array) {
        return JSON.stringify({
          status: 'success',
          task: entry.task,
          model: entry.model,
          audio_samples: result.audio.length,
          sampling_rate: result.sampling_rate,
          duration_sec: result.audio.length / result.sampling_rate,
          hint: 'Use transformers_tts to synthesize AND play audio.',
        })
      }

      // Feature extraction returns a Tensor — convert to array for JSON
      if (result && typeof result === 'object' && 'dims' in result && 'data' in result) {
        return JSON.stringify({
          status: 'success',
          task: entry.task,
          model: entry.model,
          dims: result.dims,
          data: result.tolist ? result.tolist() : Array.from(result.data),
        })
      }

      return JSON.stringify({ status: 'success', task: entry.task, model: entry.model, result })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', task: input.task, error: errorMessage(err) })
    }
  },
})

/**
 * Dedicated chat tool — takes Message[], uses chat template, streams tokens.
 */
export const transformersChatTool = tool({
  name: 'transformers_chat',
  description:
    'Chat with an on-device LLM via Transformers.js. Default: onnx-community/Qwen3-0.6B-ONNX (q4 quantized). ' +
    'Accepts a Message[] ([{role:"system"|"user"|"assistant", content:"..."}]). ' +
    'Applies the model\'s chat template automatically. Fully local, no API call.',
  inputSchema: z.object({
    messages: z.array(
      z.object({
        role: z.enum(['system', 'user', 'assistant']),
        content: z.string(),
      }),
    ),
    model: z.string().optional().describe('Default: onnx-community/Qwen3-0.6B-ONNX'),
    max_new_tokens: z.number().optional().describe('Default 256'),
    temperature: z.number().optional(),
    top_p: z.number().optional(),
    top_k: z.number().optional(),
    do_sample: z.boolean().optional(),
    dtype: dtypeInputSchema().describe('Default: q4. For non-LLMs pass fp16 or fp32.'),
    device: z.enum(['wasm', 'webgpu']).optional(),
  }),
  callback: async (input) => {
    try {
      const entry = await getPipeline({
        task: 'text-generation',
        model: input.model,
        device: input.device,
        dtype: input.dtype ?? 'q4',
      })

      // Only forward sampling options the caller actually set.
      const opts: any = { max_new_tokens: input.max_new_tokens ?? 256 }
      if (input.temperature !== undefined) opts.temperature = input.temperature
      if (input.top_p !== undefined) opts.top_p = input.top_p
      if (input.top_k !== undefined) opts.top_k = input.top_k
      if (input.do_sample !== undefined) opts.do_sample = input.do_sample

      const out = await entry.pipe(input.messages, opts)

      // out is TextGenerationChatOutput[]: [{ generated_text: Message[] }]
      const first = Array.isArray(out) ? out[0] : out
      const generated = first?.generated_text
      let reply: string | null = null
      if (Array.isArray(generated)) {
        // The last message of the returned conversation is the model's reply.
        const last = generated[generated.length - 1]
        reply = typeof last === 'string' ? last : last?.content
      } else if (typeof generated === 'string') {
        reply = generated
      }

      return JSON.stringify({
        status: 'success',
        model: entry.model,
        reply,
        full_output: first,
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/**
 * Play mono PCM samples through the Web Audio API.
 *
 * A fresh AudioContext is created per call and closed when playback ends, so
 * repeated calls do not accumulate open contexts.
 *
 * @throws Error when no AudioContext implementation is available.
 */
function playSamples(samples: Float32Array, sampleRate: number): void {
  const g = globalThis as any
  const AudioContextCtor = g.AudioContext ?? g.webkitAudioContext
  if (!AudioContextCtor) {
    throw new Error('Web Audio API (AudioContext) is not available in this environment.')
  }
  const ctx = new AudioContextCtor()
  const buffer = ctx.createBuffer(1, samples.length, sampleRate)
  buffer.getChannelData(0).set(samples)
  const source = ctx.createBufferSource()
  source.buffer = buffer
  source.connect(ctx.destination)
  source.onended = () => {
    // Best-effort cleanup; a failure to close must not surface as an unhandled rejection.
    Promise.resolve()
      .then(() => ctx.close())
      .catch(() => undefined)
  }
  source.start()
}

/** Text-to-speech that ALSO plays via Web Audio. */
export const transformersTtsTool = tool({
  name: 'transformers_tts',
  description:
    'On-device text-to-speech. Default: onnx-community/Supertonic-TTS-ONNX. ' +
    'Synthesizes AND plays audio via Web Audio API. Also works with SpeechT5 (set speaker_embeddings URL).',
  inputSchema: z.object({
    text: z.string(),
    model: z.string().optional().describe('Default: Supertonic. For SpeechT5: Xenova/speecht5_tts.'),
    speaker_embeddings: z.string().optional().describe('Required for SpeechT5 (URL to .bin).'),
    autoplay: z.boolean().optional().describe('Default true. false returns audio metadata only.'),
    device: z.enum(['wasm', 'webgpu']).optional(),
    dtype: dtypeInputSchema(),
  }),
  callback: async (input) => {
    try {
      const entry = await getPipeline({
        task: 'text-to-audio',
        model: input.model,
        device: input.device,
        dtype: input.dtype,
      })

      const opts: any = {}
      if (input.speaker_embeddings) {
        opts.speaker_embeddings = input.speaker_embeddings
      } else if ((input.model || '').toLowerCase().includes('speecht5')) {
        // SpeechT5 needs speaker embeddings; supply the docs sample when none was given.
        opts.speaker_embeddings =
          'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin'
      }

      const out = await entry.pipe(input.text, opts)
      const shouldPlay = input.autoplay !== false
      if (shouldPlay) playSamples(out.audio, out.sampling_rate)

      return JSON.stringify({
        status: shouldPlay ? 'playing' : 'synthesized',
        model: entry.model,
        samples: out.audio.length,
        sampling_rate: out.sampling_rate,
        duration_sec: out.audio.length / out.sampling_rate,
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/** Whisper STT — audio URL / blob URL / data URL. */
export const transformersSttTool = tool({
  name: 'transformers_stt',
  description:
    'On-device speech-to-text via Whisper (default: Xenova/whisper-tiny.en). ' +
    'Pass an audio URL (http/blob/data). Supports chunked long-form transcription with return_timestamps.',
  inputSchema: z.object({
    audio: z.string().describe('Audio URL (http/blob/data).'),
    model: z
      .string()
      .optional()
      .describe('Default: Xenova/whisper-tiny.en. Others: whisper-base, whisper-small.en, whisper-large-v3-turbo.'),
    language: z.string().optional().describe('ISO-639-1 code, e.g. "en", "tr", "fr". Multilingual Whisper only.'),
    task: z.enum(['transcribe', 'translate']).optional(),
    return_timestamps: z.union([z.boolean(), z.literal('word')]).optional(),
    chunk_length_s: z.number().optional().describe('For long-form audio (e.g. 30).'),
    stride_length_s: z.number().optional(),
    device: z.enum(['wasm', 'webgpu']).optional(),
    dtype: dtypeInputSchema().describe(
      'Auto-selects a Firefox-compatible dtype if omitted. Override with e.g. {encoder_model: "fp32", decoder_model_merged: "fp32"} if you hit ONNX errors.',
    ),
  }),
  callback: async (input) => {
    try {
      const entry = await getPipeline({
        task: 'automatic-speech-recognition',
        model: input.model,
        device: input.device,
        dtype: input.dtype,
      })

      // Only forward options the caller actually set.
      const opts: any = {}
      if (input.language) opts.language = input.language
      if (input.task) opts.task = input.task
      if (input.return_timestamps !== undefined) opts.return_timestamps = input.return_timestamps
      if (input.chunk_length_s !== undefined) opts.chunk_length_s = input.chunk_length_s
      if (input.stride_length_s !== undefined) opts.stride_length_s = input.stride_length_s

      const result = await entry.pipe(input.audio, opts)
      // The pipeline's own fields are spread last, after status/model.
      return JSON.stringify({ status: 'success', model: entry.model, ...result })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/** Text / image embeddings. */
export const transformersEmbedTool = tool({
  name: 'transformers_embed',
  description:
    'Generate embeddings on-device. For text: onnx-community/all-MiniLM-L6-v2-ONNX (default). ' +
    'For images: pass task="image-feature-extraction" with default DINOv3. Returns vector(s) + dims.',
  inputSchema: z.object({
    input: z.union([z.string(), z.array(z.string())]).describe('Text string(s) or image URL(s)'),
    task: z
      .enum(['feature-extraction', 'image-feature-extraction'])
      .optional()
      .describe('Default: feature-extraction (text)'),
    model: z.string().optional(),
    pooling: z.enum(['none', 'mean', 'cls']).optional(),
    normalize: z.boolean().optional(),
    device: z.enum(['wasm', 'webgpu']).optional(),
    dtype: z.union([z.enum(['fp32', 'fp16', 'q8']), z.record(z.string(), z.string())]).optional(),
  }),
  callback: async (input) => {
    try {
      const entry = await getPipeline({
        task: input.task ?? 'feature-extraction',
        model: input.model,
        device: input.device,
        dtype: input.dtype,
      })

      const opts: any = {
        pooling: input.pooling ?? 'mean',
        normalize: input.normalize ?? true,
      }
      const out = await entry.pipe(input.input, opts)
      const embeddings = out.tolist ? out.tolist() : Array.from(out.data)

      return JSON.stringify({
        status: 'success',
        model: entry.model,
        dims: out.dims,
        count: Array.isArray(input.input) ? input.input.length : 1,
        embeddings,
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/** Copy the env fields that `transformers_configure` reads and writes. */
function snapshotEnv(env: any) {
  return {
    allowRemoteModels: env.allowRemoteModels,
    allowLocalModels: env.allowLocalModels,
    remoteHost: env.remoteHost,
    remotePathTemplate: env.remotePathTemplate,
    useBrowserCache: env.useBrowserCache,
  }
}

/** Configure global env (cache / remote / backends). */
export const transformersConfigureTool = tool({
  name: 'transformers_configure',
  description:
    'Configure global Transformers.js env: remote host, cache, allowed model sources, backend settings. ' +
    'Affects ALL subsequent pipeline loads.',
  inputSchema: z.object({
    allowRemoteModels: z.boolean().optional(),
    allowLocalModels: z.boolean().optional(),
    remoteHost: z.string().optional().describe('Default: https://huggingface.co/'),
    remotePathTemplate: z.string().optional(),
    useBrowserCache: z.boolean().optional(),
  }),
  callback: async (input) => {
    try {
      const { env } = await getTransformers()
      const before = snapshotEnv(env)

      // Only touch settings the caller actually provided.
      if (input.allowRemoteModels !== undefined) env.allowRemoteModels = input.allowRemoteModels
      if (input.allowLocalModels !== undefined) env.allowLocalModels = input.allowLocalModels
      if (input.remoteHost !== undefined) env.remoteHost = input.remoteHost
      if (input.remotePathTemplate !== undefined) env.remotePathTemplate = input.remotePathTemplate
      if (input.useBrowserCache !== undefined) env.useBrowserCache = input.useBrowserCache

      return JSON.stringify({ status: 'success', before, after: snapshotEnv(env) })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/** List loaded pipelines with metadata. */
export const transformersStatusTool = tool({
  name: 'transformers_status',
  description: 'List all currently loaded Transformers.js pipelines with task, model, device, dtype, age.',
  inputSchema: z.object({}),
  callback: async () => {
    const now = Date.now()
    const loaded = Array.from(PIPELINE_CACHE.entries()).map(([key, e]) => ({
      key,
      task: e.task,
      model: e.model,
      device: e.device,
      dtype: e.dtype ?? 'default',
      loaded_ago_sec: Math.round((now - e.loadedAt) / 1000),
    }))

    let version: string | undefined
    if (_transformers) {
      try {
        version = _transformers.env?.version
      } catch {
        // Version is informational only; leave it undefined if it cannot be read.
      }
    }

    return JSON.stringify({
      status: 'success',
      transformers_loaded: _transformers !== null,
      transformers_version: version,
      pipeline_count: loaded.length,
      pipelines: loaded,
    })
  },
})

/** Proper unload — calls pipe.dispose() to free WASM memory (not just Map.delete). */
export const transformersUnloadTool = tool({
  name: 'transformers_unload',
  description:
    'Unload a cached pipeline AND dispose its underlying model to free WASM/WebGPU memory. ' +
    'Pass specific (task,model,device,dtype) or just task to unload all matching.',
  inputSchema: z.object({
    task: z.string().optional().describe('Omit to unload ALL pipelines'),
    model: z.string().optional(),
    device: z.enum(['wasm', 'webgpu']).optional(),
    dtype: z.union([z.string(), z.record(z.string(), z.string())]).optional(),
    all: z.boolean().optional().describe('Unload every pipeline'),
  }),
  callback: async (input) => {
    try {
      const disposed: string[] = []
      const errors: string[] = []

      const wantedTask = input.task ? resolveTask(input.task) : undefined
      // Compare dtypes structurally: per-session dtype objects are never reference-equal.
      const wantedDtype = input.dtype ? canonicalDtype(input.dtype) : undefined

      // `all: true` matches everything; otherwise every provided filter must match.
      const matches = (e: CachedPipeline): boolean => {
        if (input.all === true) return true
        if (wantedTask !== undefined && e.task !== wantedTask) return false
        if (input.model && e.model !== input.model) return false
        if (input.device && e.device !== input.device) return false
        if (wantedDtype !== undefined && canonicalDtype(e.dtype) !== wantedDtype) return false
        return true
      }

      // Iterate over a snapshot because entries are deleted during the loop.
      for (const [key, entry] of Array.from(PIPELINE_CACHE.entries())) {
        if (!matches(entry)) continue
        try {
          if (entry.pipe?.dispose) await entry.pipe.dispose()
          PIPELINE_CACHE.delete(key)
          disposed.push(key)
        } catch (e: unknown) {
          // A pipeline whose dispose() failed stays cached and is reported in `errors`.
          errors.push(`${key}: ${errorMessage(e)}`)
        }
      }

      return JSON.stringify({
        status: 'success',
        disposed_count: disposed.length,
        disposed,
        errors: errors.length ? errors : undefined,
        remaining: PIPELINE_CACHE.size,
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/** Quick device capability probe — useful before choosing webgpu. */
export const transformersDeviceInfoTool = tool({
  name: 'transformers_device_info',
  description:
    'Probe runtime capabilities: WebGPU support, WASM threads, SharedArrayBuffer, cross-origin-isolation. ' +
    'Use this before choosing device: "webgpu".',
  inputSchema: z.object({}),
  callback: async () => {
    try {
      const nav = navigator as any
      let webgpu = false
      let adapterInfo: any = null

      if (nav.gpu) {
        try {
          const adapter = await nav.gpu.requestAdapter()
          if (adapter) {
            webgpu = true
            // Prefer the `info` attribute; fall back to the older requestAdapterInfo() method.
            const info = adapter.info ?? (await adapter.requestAdapterInfo?.().catch(() => null)) ?? null
            // Copy fields explicitly: they are prototype getters, which JSON.stringify would skip.
            adapterInfo = info
              ? {
                  vendor: info.vendor,
                  architecture: info.architecture,
                  device: info.device,
                  description: info.description,
                }
              : null
          }
        } catch {
          // Probing is best-effort; whatever was established before the failure is reported.
        }
      }

      return JSON.stringify({
        status: 'success',
        webgpu_supported: webgpu,
        adapter: adapterInfo,
        shared_array_buffer: typeof SharedArrayBuffer !== 'undefined',
        // globalThis instead of window so the probe also works where `window` is undefined.
        cross_origin_isolated: (globalThis as any).crossOriginIsolated ?? false,
        hardware_concurrency: navigator.hardwareConcurrency,
        user_agent: navigator.userAgent,
        recommendation: webgpu
          ? 'WebGPU available — use device: "webgpu" for ~2-10x speedup on supported models (most LLMs, embeddings, vision models).'
          : 'WebGPU not available — fallback to device: "wasm". Works everywhere but slower.',
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/** Every tool exported by this module, in registration order. */
export const TRANSFORMERS_TOOLS = [
  transformersListTasksTool,
  transformersDeviceInfoTool,
  transformersRunTool,
  transformersChatTool,
  transformersTtsTool,
  transformersSttTool,
  transformersEmbedTool,
  transformersConfigureTool,
  transformersStatusTool,
  transformersUnloadTool,
]