import * as types from '@google/genai';
import {
  type AudioTranscriptionConfig,
  type ContextWindowCompressionConfig,
  type HttpOptions,
  Modality,
  type RealtimeInputConfig,
} from '@google/genai';
import type { APIConnectOptions } from '@livekit/agents';
import { llm } from '@livekit/agents';
import { AudioFrame, type VideoFrame } from '@livekit/rtc-node';
import { type LLMTools } from '../tools.js';
import type { LiveAPIModels, Voice } from './api_proto.js';

/**
 * Default image encoding options for Google Realtime API
 */
export declare const DEFAULT_IMAGE_ENCODE_OPTIONS: {
  format: "JPEG";
  quality: number;
  resizeOptions: {
    width: number;
    height: number;
    strategy: "scale_aspect_fit";
  };
};

/**
 * Input transcription result
 */
export interface InputTranscription {
  /** Identifier of the item the transcript belongs to. */
  itemId: string;
  /** Transcribed text. */
  transcript: string;
}

/**
 * Internal realtime options for Google Realtime API.
 *
 * Mirrors the public constructor options of {@link RealtimeModel}; here `model`, `voice`,
 * `responseModalities`, `vertexai`, `candidateCount` and `connOptions` are required.
 */
interface RealtimeOptions {
  model: LiveAPIModels | string;
  apiKey?: string;
  voice: Voice | string;
  language?: string;
  responseModalities: Modality[];
  vertexai: boolean;
  project?: string;
  location?: string;
  candidateCount: number;
  temperature?: number;
  maxOutputTokens?: number;
  topP?: number;
  topK?: number;
  presencePenalty?: number;
  frequencyPenalty?: number;
  instructions?: string;
  inputAudioTranscription?: AudioTranscriptionConfig;
  outputAudioTranscription?: AudioTranscriptionConfig;
  imageEncodeOptions?: typeof DEFAULT_IMAGE_ENCODE_OPTIONS;
  connOptions: APIConnectOptions;
  httpOptions?: HttpOptions;
  mediaResolution?: types.MediaResolution;
  enableAffectiveDialog?: boolean;
  proactivity?: boolean;
  realtimeInputConfig?: RealtimeInputConfig;
  contextWindowCompression?: ContextWindowCompressionConfig;
  apiVersion?: string;
  geminiTools?: LLMTools;
  thinkingConfig?: types.ThinkingConfig;
  toolBehavior?: types.Behavior;
  toolResponseScheduling?: types.FunctionResponseScheduling;
}

/**
 * Google Realtime Model for real-time voice conversations with Gemini models
 */
export declare class RealtimeModel extends llm.RealtimeModel {
  #private;

  /** @internal */
  _options: RealtimeOptions;

  /** Name of the model in use. */
  get model(): string;

  /** Label identifying this model implementation. */
  label(): string;

  constructor(options?: {
    /**
     * Initial system instructions for the model
     */
    instructions?: string;
    /**
     * The name of the model to use
     */
    model?: LiveAPIModels | string;
    /**
     * Google Gemini API key. If not provided, will attempt to read from GOOGLE_API_KEY environment variable
     */
    apiKey?: string;
    /**
     * Voice setting for audio outputs
     */
    voice?: Voice | string;
    /**
     * The language (BCP-47 Code) to use for the API
     * See https://ai.google.dev/gemini-api/docs/live#supported-languages
     */
    language?: string;
    /**
     * Modalities to use, such as [Modality.TEXT, Modality.AUDIO]
     */
    modalities?: Modality[];
    /**
     * Whether to use VertexAI for the API
     */
    vertexai?: boolean;
    /**
     * The project ID to use for the API (for VertexAI)
     */
    project?: string;
    /**
     * The location to use for the API (for VertexAI)
     */
    location?: string;
    /**
     * The number of candidate responses to generate
     */
    candidateCount?: number;
    /**
     * Sampling temperature for response generation
     */
    temperature?: number;
    /**
     * Maximum number of tokens in the response
     */
    maxOutputTokens?: number;
    /**
     * The top-p value for response generation
     */
    topP?: number;
    /**
     * The top-k value for response generation
     */
    topK?: number;
    /**
     * The presence penalty for response generation
     */
    presencePenalty?: number;
    /**
     * The frequency penalty for response generation
     */
    frequencyPenalty?: number;
    /**
     * The configuration for input audio transcription
     */
    inputAudioTranscription?: AudioTranscriptionConfig | null;
    /**
     * The configuration for output audio transcription
     */
    outputAudioTranscription?: AudioTranscriptionConfig | null;
    /**
     * The configuration for image encoding
     */
    imageEncodeOptions?: typeof DEFAULT_IMAGE_ENCODE_OPTIONS;
    /**
     * Whether to enable affective dialog
     */
    enableAffectiveDialog?: boolean;
    /**
     * Whether to enable proactive audio
     */
    proactivity?: boolean;
    /**
     * The configuration for realtime input
     */
    realtimeInputConfig?: RealtimeInputConfig;
    /**
     * The configuration for context window compression
     */
    contextWindowCompression?: ContextWindowCompressionConfig;
    /**
     * API version to use
     */
    apiVersion?: string;
    /**
     * The configuration for the API connection
     */
    connOptions?: APIConnectOptions;
    /**
     * HTTP options for API requests
     */
    httpOptions?: HttpOptions;
    /**
     * The media resolution for the session.
     */
    mediaResolution?: types.MediaResolution;
    /**
     * Gemini-specific tools to use for the session
     */
    geminiTools?: LLMTools;
    /**
     * Thinking configuration for native audio models.
     * If not set, the model's default thinking behavior is used.
     * Gemini 3.1 live models use `thinkingLevel`.
     * Gemini 2.5 live models use `thinkingBudget`.
     */
    thinkingConfig?: types.ThinkingConfig;
    /**
     * The behavior for tool calls. Default behavior is `BLOCKING` in Gemini Realtime API.
     * Note: Not supported in Vertex AI.
     */
    toolBehavior?: types.Behavior;
    /**
     * The scheduling for tool responses. Default scheduling is `WHEN_IDLE`.
     * Note: Vertex AI currently does not support the scheduling parameter; the user is
     * responsible for avoiding this parameter when using Vertex AI.
     */
    toolResponseScheduling?: types.FunctionResponseScheduling;
  });

  /**
   * Create a new realtime session
   */
  session(): RealtimeSession;

  /**
   * Update model options
   */
  updateOptions(options: {
    voice?: Voice | string;
    temperature?: number;
    toolBehavior?: types.Behavior;
    toolResponseScheduling?: types.FunctionResponseScheduling;
  }): void;

  /**
   * Close the model and cleanup resources
   */
  close(): Promise<void>;
}

/**
 * Google Realtime Session for real-time voice conversations
 *
 * This session provides real-time streaming capabilities with Google's Gemini models,
 * supporting both text and audio modalities with function calling capabilities.
 */
export declare class RealtimeSession extends llm.RealtimeSession {
  #private;

  // Private members are listed by name only; a declaration file omits their types.
  private _tools;
  private _chatCtx;
  private options;
  private geminiDeclarations;
  private messageChannel;
  private inputResampler?;
  private inputResamplerInputRate?;
  private instructions?;
  private currentGeneration?;
  private bstream;
  private activeSession?;
  private sessionShouldClose;
  private responseCreatedFutures;
  private pendingGenerationFut?;
  private sessionResumptionHandle?;
  private inUserActivity;
  private sessionLock;
  private numRetries;
  private hasReceivedAudioInput;
  private pendingInterruptText;
  private earlyCompletionPending;
  private pendingToolCallIds;
  private toolCallStatuses;
  private toolResponseCallIds;
  private generationPendingTurnComplete?;

  /**
   * @param realtimeModel - The model this session is created for.
   */
  constructor(realtimeModel: RealtimeModel);

  private closeActiveSession;
  private markRestartNeeded;
  private isNonBlockingToolBehavior;
  private shouldBlockRealtimeInputForPendingTools;
  private getToolResultsForRealtime;

  /**
   * Update session options. Accepts the same fields as `RealtimeModel.updateOptions`
   * plus `toolChoice`.
   */
  updateOptions(options: {
    voice?: Voice | string;
    temperature?: number;
    toolChoice?: llm.ToolChoice;
    toolBehavior?: types.Behavior;
    toolResponseScheduling?: types.FunctionResponseScheduling;
  }): void;

  /** Replace the system instructions used by the session. */
  updateInstructions(instructions: string): Promise<void>;

  /** Replace the chat context used by the session. */
  updateChatCtx(chatCtx: llm.ChatContext): Promise<void>;

  /** Replace the tool context used by the session. */
  updateTools(tools: llm.ToolContext): Promise<void>;

  /** Chat context of the session. */
  get chatCtx(): llm.ChatContext;

  /** Tool context of the session. */
  get tools(): llm.ToolContext;

  /** Whether manual activity detection is in effect. */
  get manualActivityDetection(): boolean;

  /** Push an audio frame into the session. */
  pushAudio(frame: AudioFrame): void;

  /** Push a video frame into the session. */
  pushVideo(_: VideoFrame): void;

  private sendClientEvent;

  /**
   * Request a reply from the model.
   *
   * @param instructions - Optional instructions for this reply.
   * @param options - Optional settings; `signal` is an `AbortSignal` for the request.
   */
  // NOTE(review): the type argument was missing in the declaration as received;
  // `llm.GenerationCreatedEvent` is reconstructed from the `llm.RealtimeSession`
  // base-class contract. Confirm against the implementation.
  generateReply(
    instructions?: string,
    options?: {
      signal?: AbortSignal;
    },
  ): Promise<llm.GenerationCreatedEvent>;

  /** Signal the start of user activity. */
  startUserActivity(): void;

  private generationHasOutput;

  /** Interrupt the session's current activity. */
  interrupt(): Promise<void>;

  /**
   * Truncate a message.
   * NOTE(review): the parameter is named `_options`, which suggests it may be unused
   * by this implementation; confirm against the implementation.
   */
  truncate(_options: {
    messageId: string;
    audioEndMs: number;
    audioTranscript?: string;
  }): Promise<void>;

  /** Close the session. */
  close(): Promise<void>;

  private sendTask;
  private onReceiveMessage;
  private truncateString;
  private loggableClientEvent;
  private loggableServerMessage;
  private markCurrentGenerationDone;
  private emitError;
  private buildConnectConfig;
  private startNewGeneration;
  private handleInputSpeechStarted;
  private handleInputSpeechStopped;
  private handleServerContent;
  private handleToolCall;
  private handleToolCallCancellation;
  private clearPendingToolCallIdsForResponses;
  private handleUsageMetadata;
  private tokenDetailsMap;
  private handleGoAway;

  /** Commit the audio pushed so far. */
  commitAudio(): Promise<void>;

  /** Clear buffered audio. */
  clearAudio(): Promise<void>;

  private resampleAudio;
  private isNewGeneration;
}

export {};
//# sourceMappingURL=realtime_api.d.ts.map