import type { AnyMap, HybridObject } from 'react-native-nitro-modules'

/**
 * Statistics from the last text generation.
 *
 * NOTE(review): time units are not stated in this file — confirm against
 * the native implementation before displaying them.
 */
export interface GenerationStats {
  tokenCount: number
  tokensPerSecond: number
  timeToFirstToken: number
  totalTime: number
  toolExecutionTime: number
}

/** Stream event tagged `generation_start`. */
export interface GenerationStartEvent {
  type: 'generation_start'
  timestamp: number
}

/** Stream event tagged `token`, carrying a piece of generated text. */
export interface TokenEvent {
  type: 'token'
  token: string
}

/** Stream event tagged `thinking_start`. */
export interface ThinkingStartEvent {
  type: 'thinking_start'
  timestamp: number
}

/** Stream event tagged `thinking_chunk`, carrying a piece of thinking text. */
export interface ThinkingChunkEvent {
  type: 'thinking_chunk'
  chunk: string
}

/** Stream event tagged `thinking_end`, carrying the thinking content. */
export interface ThinkingEndEvent {
  type: 'thinking_end'
  content: string
  timestamp: number
}

/**
 * Stream event tagged `tool_call_start`.
 * `arguments` is a string; presumably JSON-encoded — confirm with the emitter.
 */
export interface ToolCallStartEvent {
  type: 'tool_call_start'
  id: string
  name: string
  arguments: string
}

/** Stream event tagged `tool_call_executing`, identified by tool call `id`. */
export interface ToolCallExecutingEvent {
  type: 'tool_call_executing'
  id: string
}

/** Stream event tagged `tool_call_completed`, carrying the tool's result string. */
export interface ToolCallCompletedEvent {
  type: 'tool_call_completed'
  id: string
  result: string
}

/** Stream event tagged `tool_call_failed`, carrying an error string. */
export interface ToolCallFailedEvent {
  type: 'tool_call_failed'
  id: string
  error: string
}

/** Stream event tagged `generation_end`, carrying the content and statistics. */
export interface GenerationEndEvent {
  type: 'generation_end'
  content: string
  stats: GenerationStats
}

/**
 * Discriminated union of all stream events; narrow on the `type` field.
 */
export type StreamEvent =
  | GenerationStartEvent
  | TokenEvent
  | ThinkingStartEvent
  | ThinkingChunkEvent
  | ThinkingEndEvent
  | ToolCallStartEvent
  | ToolCallExecutingEvent
  | ToolCallCompletedEvent
  | ToolCallFailedEvent
  | GenerationEndEvent

/** A single chat message: a role name and its text content. */
export interface LLMMessage {
  role: string
  content: string
}

/**
 * Controls low-level token generation behavior.
 */
export interface LLMGenerationConfig {
  /** Maximum number of tokens to generate */
  maxTokens?: number
  /** Sliding-window KV cache size. When set, old cache entries are rotated out */
  maxKVSize?: number
  /** KV cache quantization bits. Use 4 or 8 to reduce cache memory usage */
  kvBits?: number
  /** KV cache quantization group size */
  kvGroupSize?: number
  /** Token index at which KV cache quantization begins */
  quantizedKVStart?: number
  /** Sampling temperature. Set to 0 for greedy decoding */
  temperature?: number
  /** Top-p / nucleus sampling threshold */
  topP?: number
  /** Penalty applied to recently repeated tokens */
  repetitionPenalty?: number
  /** Number of recent tokens considered for repetition penalty */
  repetitionContextSize?: number
  /** Prompt prefill chunk size used for long-context batching */
  prefillStepSize?: number
}

/**
 * Controls history trimming when managed chat history is enabled.
 */
export interface LLMContextConfig {
  /**
   * Maximum prompt token count to preserve when rebuilding managed history.
   * Additional context passed during `load()` is treated as pinned and preserved.
   */
  maxContextTokens?: number
  /** Number of most-recent history messages to preserve during trimming */
  keepLastMessages?: number
}

/**
 * Parameter definition for a tool.
 */
export interface ToolParameter {
  name: string
  type: string
  description: string
  required: boolean
}

/**
 * Tool definition that can be called by the model.
 */
export interface ToolDefinition {
  name: string
  description: string
  parameters: ToolParameter[]
  /**
   * Asynchronous handler invoked with the tool's arguments.
   *
   * NOTE(review): the resolved value type is assumed to be `AnyMap`,
   * mirroring the argument type — confirm against the native tool bridge.
   */
  handler: (args: AnyMap) => Promise<AnyMap>
}

/** Options for loading a model. */
export interface LLMLoadOptions {
  /** Callback invoked with loading progress (0-1) */
  onProgress?: (progress: number) => void
  /** Additional context to provide to the model */
  additionalContext?: LLMMessage[]
  /** Whether to automatically manage message history */
  manageHistory?: boolean
  /** Tools available for the model to call */
  tools?: ToolDefinition[]
  /** Default generation parameters applied to future requests */
  generationConfig?: LLMGenerationConfig
  /** Number of generated chunks to batch before crossing the JS bridge */
  tokenBatchSize?: number
  /** Context trimming behavior for managed chat history */
  contextConfig?: LLMContextConfig
}

/**
 * Low-level LLM interface for text generation using MLX.
 * @internal Use the `LLM` export from `react-native-nitro-mlx` instead.
 */
export interface LLM extends HybridObject<{ ios: 'swift' }> {
  /**
   * Load a model into memory. Downloads from HuggingFace if not already cached.
   * @param modelId - HuggingFace model ID (e.g., 'mlx-community/Qwen3-0.6B-4bit')
   * @param options - Optional load settings: progress callback, additional
   *   context, history management, tools, and default generation/context config
   * @returns A promise that resolves with no value
   */
  load(modelId: string, options?: LLMLoadOptions): Promise<void>

  /**
   * Generate a complete response for a prompt.
   * @param prompt - The input text to generate a response for
   * @returns The generated text
   */
  generate(prompt: string): Promise<string>

  /**
   * Stream a response token by token with optional tool calling support.
   * Tools are automatically executed when the model calls them.
   * @param prompt - The input text to generate a response for
   * @param onToken - Callback invoked for each generated token
   * @param onToolCall - Optional callback invoked when a tool is called (for UI feedback)
   * @returns The complete generated text
   */
  stream(
    prompt: string,
    onToken: (token: string) => void,
    onToolCall?: (toolName: string, args: string) => void,
  ): Promise<string>

  /**
   * Stream a response, reporting progress through string-encoded events.
   * @param prompt - The input text to generate a response for
   * @param onEvent - Callback invoked with each event as a string; presumably
   *   a JSON-serialized `StreamEvent` — confirm against the native emitter
   * @returns NOTE(review): assumed to resolve with the complete generated
   *   text, like `stream` — confirm against the native implementation
   */
  streamWithEvents(
    prompt: string,
    onEvent: (eventJson: string) => void,
  ): Promise<string>

  /**
   * Stop the current generation.
   */
  stop(): void

  /**
   * Unload the current model and release memory.
   */
  unload(): void

  /**
   * Get statistics from the last generation.
   * @returns Statistics including token count, speed, and timing
   */
  getLastGenerationStats(): GenerationStats

  /**
   * Get the message history if management is enabled.
   * @returns Array of messages in the history
   */
  getHistory(): LLMMessage[]

  /**
   * Clear the message history.
   */
  clearHistory(): void

  /** Whether a model is currently loaded */
  readonly isLoaded: boolean
  /** Whether text is currently being generated */
  readonly isGenerating: boolean
  /** The ID of the currently loaded model */
  readonly modelId: string
  /** Enable debug logging */
  debug: boolean
  /** System prompt used when loading the model */
  systemPrompt: string
}

/**
 * Supported parameter types for tool definitions.
 * Used for type safety in createTool().
 */
export type ToolParameterType = 'string' | 'number' | 'boolean' | 'array' | 'object'