/**
 * Voice functionality types for sudocode
 * Covers Speech-to-Text (STT) and Text-to-Speech (TTS) capabilities
 */

// =============================================================================
// Provider Types
// =============================================================================

/**
 * Identifier of a Speech-to-Text (STT) backend.
 *
 * Exactly two values are accepted: `"whisper-local"` and `"openai"`.
 */
export type STTProvider =
  | "whisper-local"
  | "openai";

/**
 * Identifier of a Text-to-Speech (TTS) backend.
 *
 * Exactly three values are accepted: `"browser"`, `"kokoro"` and `"openai"`.
 */
export type TTSProvider =
  | "browser"
  | "kokoro"
  | "openai";

// =============================================================================
// Project Settings (config.json)
// =============================================================================

/**
 * Voice settings stored in .sudocode/config.json
 *
 * These settings configure how voice features work for a project.
 * Every field is optional, including the three nested groups
 * (`stt`, `tts`, `narration`), so an empty object is a valid value.
 *
 * @example
 * ```json
 * {
 *   "voice": {
 *     "enabled": true,
 *     "stt": {
 *       "provider": "whisper-local",
 *       "whisperUrl": "http://localhost:2022/v1",
 *       "whisperModel": "base"
 *     },
 *     "tts": {
 *       "provider": "browser"
 *     }
 *   }
 * }
 * ```
 */
export interface VoiceSettingsConfig {
  /** Whether voice features are enabled (default: false) */
  enabled?: boolean;
  /** Speech-to-text settings */
  stt?: {
    /** Preferred STT provider */
    provider?: STTProvider;
    /** URL for local Whisper server (default: http://localhost:2022/v1) */
    whisperUrl?: string;
    /** Whisper model to use (default: base) */
    whisperModel?: string;
  };
  /** Text-to-speech settings */
  tts?: {
    /** Preferred TTS provider */
    provider?: TTSProvider;
    /** URL for local Kokoro server (default: http://localhost:8880/v1) */
    kokoroUrl?: string;
    /** Default voice for TTS (default: nova) */
    defaultVoice?: string;
    /** Kokoro execution mode: 'browser' for WASM, 'server' for streaming via sidecar */
    kokoroMode?: "browser" | "server";
  };
  /** Narration settings - controls voice narration playback */
  narration?: {
    /** Whether voice narration is enabled (default: false) */
    enabled?: boolean;
    /**
     * Preferred voice name for TTS (default: system default).
     * This is a separate field from `tts.defaultVoice`; how the two are
     * reconciled is not defined by this type.
     */
    voice?: string;
    /** Speech rate from 0.5 to 2.0 (default: 1.0) */
    speed?: number;
    /** Volume from 0 to 1 (default: 1.0) */
    volume?: number;
    /** Whether to narrate tool use events like Read, Write, Bash (default: true) */
    narrateToolUse?: boolean;
    /** Whether to narrate tool results/completion (default: false) */
    narrateToolResults?: boolean;
    /** Whether to narrate assistant messages (default: true) */
    narrateAssistantMessages?: boolean;
    /**
     * Whether to only narrate explicit speak tool calls (default: false).
     *
     * When true, ignores narrateToolUse, narrateToolResults, and
     * narrateAssistantMessages and only narrates when the agent explicitly
     * calls the speak tool.
     */
    narrateSpeakOnly?: boolean;
  };
}

// =============================================================================
// STT Types
// =============================================================================

/**
 * Result from a speech-to-text transcription.
 *
 * Only `text` is required; `confidence` and `duration_ms` may be absent.
 */
export interface TranscriptionResult {
  /** The transcribed text */
  text: string;
  /** Confidence score from 0 to 1 (optional, provider-dependent) */
  confidence?: number;
  /** Duration of the audio in milliseconds */
  duration_ms?: number;
}

/**
 * Options for speech-to-text transcription.
 *
 * Both fields are optional.
 */
export interface STTOptions {
  /** Language code (e.g., "en", "es", "fr") - defaults to "en" */
  language?: string;
  /** Preferred STT provider (one of the `STTProvider` values) */
  provider?: STTProvider;
}

// =============================================================================
// TTS Types
// =============================================================================

/**
 * Options for text-to-speech synthesis.
 *
 * All fields are optional. The numeric ranges below are documented
 * expectations; the type itself only constrains them to `number`.
 */
export interface TTSOptions {
  /** Voice identifier (provider-specific) */
  voice?: string;
  /** Preferred TTS provider (one of the `TTSProvider` values) */
  provider?: TTSProvider;
  /** Speech rate multiplier (0.5 to 2.0) */
  rate?: number;
  /** Volume level (0 to 1) */
  volume?: number;
}

/**
 * Request for text-to-speech synthesis.
 *
 * `text` is the only required field; `voice` and `provider` are optional.
 */
export interface SynthesizeRequest {
  /** Text to synthesize */
  text: string;
  /** Voice identifier */
  voice?: string;
  /** TTS provider to use (one of the `TTSProvider` values) */
  provider?: TTSProvider;
}

/**
 * Response from text-to-speech synthesis
 * For browser provider: returns text for Web Speech API
 * For kokoro/openai: audio is returned as a stream (audio/mpeg)
 *
 * Both fields are optional at the type level; the type does not enforce
 * which field is populated for which provider.
 */
export interface SynthesizeResponse {
  /** Text to speak (for browser provider) */
  text?: string;
  /** Audio content type (for kokoro/openai) */
  contentType?: string;
}

// =============================================================================
// Voice Input States
// =============================================================================

/**
 * Current state of the voice input UI.
 *
 * One of `"idle"`, `"recording"`, `"transcribing"` or `"error"`.
 */
export type VoiceInputState =
  | "idle"
  | "recording"
  | "transcribing"
  | "error";

/**
 * Machine-readable code identifying why voice input failed.
 *
 * One of `"permission_denied"`, `"not_supported"`,
 * `"transcription_failed"` or `"network_error"`.
 */
export type VoiceInputErrorCode =
  | "permission_denied" | "not_supported"
  | "transcription_failed" | "network_error";

/**
 * Error object for voice input failures.
 *
 * Pairs a `VoiceInputErrorCode` with a message; both fields are required.
 */
export interface VoiceInputError {
  /** Error code for programmatic handling */
  code: VoiceInputErrorCode;
  /** Human-readable error message */
  message: string;
}

// =============================================================================
// API Request/Response Types
// =============================================================================

/**
 * Request for POST /api/voice/transcribe
 * Note: Actual request is multipart/form-data with audio blob, so this
 * interface describes the logical fields rather than a JSON body.
 */
export interface TranscribeRequest {
  /** Audio blob (audio/webm, audio/mp3, audio/wav) */
  audio: Blob;
  /** Language code (optional, defaults to "en") */
  language?: string;
}

/**
 * Response from POST /api/voice/transcribe
 *
 * Declares the same three fields, with the same types and optionality,
 * as `TranscriptionResult`.
 */
export interface TranscribeResponse {
  /** Transcribed text */
  text: string;
  /** Confidence score from 0 to 1 */
  confidence?: number;
  /** Duration of the audio in milliseconds */
  duration_ms?: number;
}

// =============================================================================
// Voice Configuration
// =============================================================================

/**
 * STT configuration from GET /api/voice/config
 *
 * All three fields are required.
 */
export interface STTConfig {
  /** Available STT providers */
  providers: STTProvider[];
  /** Default provider */
  default: STTProvider;
  /** Whether local Whisper is available */
  whisperAvailable: boolean;
}

/**
 * TTS configuration from GET /api/voice/config
 *
 * All four fields are required.
 */
export interface TTSConfig {
  /** Available TTS providers */
  providers: TTSProvider[];
  /** Default provider */
  default: TTSProvider;
  /** Whether Kokoro is available */
  kokoroAvailable: boolean;
  /**
   * Available voices per provider.
   * Typed as a `Record` over `TTSProvider`, so a key is required for every
   * provider value, not only for those listed in `providers`.
   */
  voices: Record<TTSProvider, string[]>;
}

/**
 * Full voice configuration from GET /api/voice/config
 *
 * Combines runtime capabilities (provider availability) with
 * user settings from config.json. All four fields are required, although
 * `settings` may itself be an empty object because every field of
 * `VoiceSettingsConfig` is optional.
 */
export interface VoiceConfig {
  /** Whether voice features are enabled for this project */
  enabled: boolean;
  /** Speech-to-text configuration (runtime capabilities) */
  stt: STTConfig;
  /** Text-to-speech configuration (runtime capabilities) */
  tts: TTSConfig;
  /** User settings from config.json */
  settings: VoiceSettingsConfig;
}

// =============================================================================
// Voice Narration Events (WebSocket)
// =============================================================================

/**
 * Kind of content carried by a narration.
 *
 * One of `"status"`, `"progress"`, `"result"` or `"error"`.
 */
export type NarrationCategory =
  | "status"
  | "progress"
  | "result"
  | "error";

/**
 * Priority assigned to a narration.
 *
 * One of `"low"`, `"normal"` or `"high"`.
 */
export type NarrationPriority =
  | "low"
  | "normal"
  | "high";

/**
 * WebSocket event for voice narration.
 *
 * The `type` field is the string literal `"voice_narration"`, which lets
 * this event be told apart from other message shapes by that field.
 * All fields are required.
 */
export interface VoiceNarrationEvent {
  /** Event type identifier (always the literal "voice_narration") */
  type: "voice_narration";
  /** Associated execution ID */
  executionId: string;
  /** Text to be narrated */
  text: string;
  /** Category of the narration */
  category: NarrationCategory;
  /** Priority level for queue ordering */
  priority: NarrationPriority;
}

// =============================================================================
// User Preferences
// =============================================================================

/**
 * User voice preferences stored in localStorage.
 *
 * All fields are required. The numeric ranges below are documented
 * expectations; the type itself only constrains them to `number`.
 */
export interface VoicePreferences {
  /** Whether voice narration is enabled */
  narrationEnabled: boolean;
  /**
   * Preferred TTS provider.
   * `TTSProviderType` is declared further down in this file as an alias of
   * `TTSProvider`, so the accepted values are the same.
   */
  ttsProvider: TTSProviderType;
  /** Preferred voice for TTS */
  ttsVoice: string;
  /** Narration playback speed (0.5 to 2.0) */
  narrationSpeed: number;
  /** Narration volume (0 to 1) */
  narrationVolume: number;
}

// =============================================================================
// TTS Provider Interface (Service-side)
// =============================================================================

/**
 * TTS provider type identifier
 * Used to distinguish between different provider implementations
 *
 * This is a plain alias of `TTSProvider`; the two names are interchangeable
 * and accept exactly the same values.
 */
export type TTSProviderType = TTSProvider;

/**
 * Options passed to TTS providers for synthesis.
 *
 * All fields are optional. The numeric ranges and defaults below are
 * documented expectations; the type itself only constrains them to `number`.
 */
export interface TTSProviderOptions {
  /** Voice identifier (provider-specific) */
  voice?: string;
  /** Speech speed multiplier (0.5 to 2.0, default: 1.0) */
  speed?: number;
  /** Speech pitch multiplier (0.5 to 2.0, default: 1.0) */
  pitch?: number;
}

/**
 * Result from TTS synthesis
 *
 * Different providers return results in different forms:
 * - Server-side TTS (Kokoro, OpenAI): Returns audio buffer
 * - Browser TTS: Returns text for client-side Web Speech API synthesis
 *
 * Every field is optional at the type level, so the type does not enforce
 * which combination of fields a given provider sets.
 */
export interface TTSProviderResult {
  /**
   * Audio buffer for server-side TTS providers.
   * Present when audio is synthesized server-side.
   *
   * NOTE(review): `Buffer` is neither declared nor imported in the visible
   * part of this file; presumably the Node.js global `Buffer` - confirm the
   * type is available wherever this module is compiled.
   */
  audio?: Buffer;

  /**
   * MIME type of the audio (e.g., "audio/mpeg", "audio/wav")
   * Present when audio is returned
   */
  mimeType?: string;

  /**
   * Text to synthesize client-side.
   * Present when using browser TTS (client does actual synthesis).
   */
  text?: string;

  /**
   * SSML markup for enhanced synthesis.
   * Optional, used for providers that support SSML.
   */
  ssml?: string;
}

/**
 * Information about a TTS voice.
 *
 * All fields are required.
 */
export interface TTSVoice {
  /** Unique voice identifier (provider-specific) */
  id: string;
  /** Human-readable voice name */
  name: string;
  /** Language code (e.g., "en-US", "en-GB", "es-ES") */
  language: string;
  /** Provider that offers this voice */
  provider: TTSProviderType;
}

// =============================================================================
// Streaming TTS WebSocket Messages
// =============================================================================

/**
 * Client request to start TTS streaming
 *
 * Sent by the client to request text-to-speech synthesis.
 * Server will respond with TTSAudioChunk messages followed by TTSStreamEnd.
 *
 * `type`, `request_id` and `text` are required; `voice` and `speed` are
 * optional.
 */
export interface TTSStreamRequest {
  /** Message type identifier (always the literal "tts_request") */
  type: "tts_request";
  /** Unique request ID for correlating responses */
  request_id: string;
  /** Text to synthesize */
  text: string;
  /** Voice identifier (optional, uses default if not specified) */
  voice?: string;
  /** Speech speed multiplier (0.5 to 2.0, default: 1.0) */
  speed?: number;
}

/**
 * Server response containing an audio chunk
 *
 * Streamed from server to client during TTS synthesis.
 * Audio is base64-encoded PCM (mono, 24kHz, float32).
 * All fields are required.
 */
export interface TTSAudioChunk {
  /** Message type identifier (always the literal "tts_audio") */
  type: "tts_audio";
  /** Request ID this chunk belongs to */
  request_id: string;
  /** Base64-encoded PCM audio data (mono, 24kHz, float32) */
  chunk: string;
  /** Zero-based index of this chunk in the stream */
  index: number;
  /** Whether this is the final audio chunk */
  is_final: boolean;
}

/**
 * Server notification that TTS streaming has completed
 *
 * Sent after all audio chunks have been transmitted.
 * All fields are required.
 */
export interface TTSStreamEnd {
  /** Message type identifier (always the literal "tts_end") */
  type: "tts_end";
  /** Request ID this end message belongs to */
  request_id: string;
  /** Total number of audio chunks sent */
  total_chunks: number;
  /** Total duration of synthesis in milliseconds */
  duration_ms: number;
}

/**
 * Server notification of a TTS error
 *
 * Sent when TTS synthesis fails or encounters an error.
 * All fields are required; `recoverable` and `fallback` are independent
 * booleans.
 */
export interface TTSStreamError {
  /** Message type identifier (always the literal "tts_error") */
  type: "tts_error";
  /** Request ID this error belongs to */
  request_id: string;
  /** Human-readable error message */
  error: string;
  /** Whether the client can retry the request */
  recoverable: boolean;
  /** Whether the client should fall back to browser TTS */
  fallback: boolean;
}

/**
 * Union type for all TTS client messages.
 *
 * Currently the only member is `TTSStreamRequest`, so this is a plain alias
 * of that interface.
 */
export type TTSClientMessage = TTSStreamRequest;

/**
 * Any message the server can send over the TTS stream.
 *
 * Each member carries a distinct string literal in its `type` field
 * ("tts_audio", "tts_end", "tts_error"), so checking `type` narrows the
 * union to a single member.
 */
export type TTSServerMessage =
  | TTSAudioChunk
  | TTSStreamEnd
  | TTSStreamError;