/**
 * Text chunking for long document processing.
 *
 * Strategy: Split at sentence boundaries, respecting max chunk size.
 * Falls back to word boundaries if sentences are too long.
 */

/** Options controlling how {@link chunkText} splits a document. */
export interface ChunkOptions {
  /**
   * Maximum characters per chunk, measured in UTF-16 code units
   * (`string.length`). Must be at least 1.
   */
  maxChars: number;
  /**
   * Number of characters from the end of the previous chunk that the next
   * chunk repeats for context (0 = none). Values below 0 or non-finite values
   * are treated as 0; values of `maxChars` or more are capped at
   * `maxChars - 1` so every chunk still contributes new text.
   */
  overlapChars: number;
}

export const DEFAULT_CHUNK_OPTIONS: ChunkOptions = {
  // ~1200 words at the ~5 chars/word average that estimateDuration assumes.
  // NOTE(review): this comment previously read "~1500 words, generates
  // ~2-3 min audio", which does not match estimateDuration's constants
  // (6000 chars is roughly 8 min of speech) - confirm the intended figure.
  maxChars: 6000,
  overlapChars: 0, // No overlap by default
};

// One or more sentence terminators followed by whitespace. The `g` flag is
// required by String.prototype.matchAll, which works on a clone of the regex,
// so sharing this instance between calls is safe.
const SENTENCE_END = /[.!?]+\s+/g;

// A single whitespace character. No `g` flag, so `.test()` keeps no state.
const WHITESPACE = /\s/;

/** True when the character at `index` exists and is whitespace. */
function isWhitespaceAt(text: string, index: number): boolean {
  return WHITESPACE.test(text.charAt(index));
}

/** True when cutting `text` at `index` would separate a UTF-16 surrogate pair. */
function splitsSurrogatePair(text: string, index: number): boolean {
  if (index <= 0 || index >= text.length) {
    return false;
  }
  const before = text.charCodeAt(index - 1);
  const after = text.charCodeAt(index);
  return (
    before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
  );
}

/** Index just past the last sentence boundary in `segment`, or -1 if none. */
function lastSentenceEnd(segment: string): number {
  let end = -1;
  for (const match of segment.matchAll(SENTENCE_END)) {
    if (match.index !== undefined) {
      end = match.index + match[0].length;
    }
  }
  return end;
}

/** Index of the last whitespace character in `segment`, or -1 if none. */
function lastWhitespaceIndex(segment: string): number {
  for (let i = segment.length - 1; i >= 0; i--) {
    if (isWhitespaceAt(segment, i)) {
      return i;
    }
  }
  return -1;
}

/**
 * Where the chunk that introduces new text at `cursor` should begin so that
 * it repeats up to `overlap` characters of already-emitted text.
 */
function findOverlapStart(
  source: string,
  cursor: number,
  overlap: number
): number {
  const earliest = Math.max(0, cursor - overlap);
  if (earliest === 0) {
    return 0;
  }

  // Prefer the first word start inside the overlap window so the repeated
  // context never opens with a fragment of a word.
  let sawWhitespace = false;
  for (let i = earliest; i < cursor; i++) {
    if (isWhitespaceAt(source, i)) {
      sawWhitespace = true;
    } else if (isWhitespaceAt(source, i - 1)) {
      return i;
    }
  }

  // The window holds only the tail of a word plus whitespace: nothing useful
  // to repeat, so start at the new text.
  if (sawWhitespace) {
    return cursor;
  }

  // Unbroken run with no word boundary (very long token, or a script that
  // does not use spaces): overlap by raw characters, keeping pairs intact.
  return splitsSurrogatePair(source, earliest) ? earliest + 1 : earliest;
}

/**
 * Split text into chunks of at most `maxChars` characters.
 *
 * Each chunk ends at the last sentence boundary that fits; if there is none,
 * at the last whitespace character; if there is none either, exactly at the
 * size limit (moved back one code unit, when possible, to avoid cutting a
 * surrogate pair). Leading and trailing whitespace is removed from the input
 * and from every chunk.
 *
 * @param text - The text to split.
 * @param options - Size and overlap settings; defaults to
 *   {@link DEFAULT_CHUNK_OPTIONS}.
 * @returns The chunks in document order. Text that fits in one chunk yields a
 *   single-element array, so empty or whitespace-only input yields `[""]`.
 * @throws RangeError if `options.maxChars` is not a number >= 1. Such values
 *   previously made the function loop forever.
 */
export function chunkText(
  text: string,
  options: ChunkOptions = DEFAULT_CHUNK_OPTIONS
): string[] {
  const { maxChars, overlapChars } = options;

  // Written as a negation so NaN and undefined are rejected too.
  if (!(maxChars >= 1)) {
    throw new RangeError(
      `maxChars must be a number >= 1 (received ${maxChars})`
    );
  }

  // Trim once up front so surrounding whitespace never counts toward the
  // size limit or forces a needless split.
  const source = text.trim();
  if (source.length <= maxChars) {
    return [source];
  }

  // source.length > maxChars, so maxChars is finite from here on.
  const limit = Math.floor(maxChars);
  const overlap = Number.isFinite(overlapChars)
    ? Math.min(Math.max(0, Math.floor(overlapChars)), limit - 1)
    : 0;

  const chunks: string[] = [];
  // Index of the first character that no chunk has covered yet. It always
  // points at a non-whitespace character.
  let cursor = 0;

  while (cursor < source.length) {
    const start =
      overlap > 0 && cursor > 0
        ? findOverlapStart(source, cursor, overlap)
        : cursor;

    // Everything that is left fits in one chunk.
    if (source.length - start <= limit) {
      chunks.push(source.slice(start));
      break;
    }

    const segment = source.slice(start, start + limit);
    // Characters at the front of the segment that repeat the previous chunk.
    // The split must land beyond them or the loop would make no progress.
    const carried = cursor - start;

    let split = lastSentenceEnd(segment);
    if (split <= carried) {
      split = lastWhitespaceIndex(segment);
    }
    if (split <= carried) {
      // No usable boundary: hard split at the size limit.
      split = segment.length;
      if (split - 1 > carried && splitsSurrogatePair(source, start + split)) {
        split -= 1;
      }
    }

    // Never empty: the segment starts on a non-whitespace character.
    chunks.push(segment.slice(0, split).trim());

    cursor = start + split;
    while (cursor < source.length && isWhitespaceAt(source, cursor)) {
      cursor++;
    }
  }

  return chunks;
}

/**
 * Rough spoken-audio length for a piece of text.
 *
 * Assumes an average of ~5 characters per word and a speaking rate of
 * ~150 words per minute.
 *
 * @param text - The text that will be spoken.
 * @returns Estimated duration in seconds.
 */
export function estimateDuration(text: string): number {
  const charsPerWord = 5;
  const wordsPerMinute = 150;
  const secondsPerMinute = 60;

  const wordCount = text.length / charsPerWord;
  return (wordCount / wordsPerMinute) * secondsPerMinute;
}

/**
 * Decide whether text is long enough that it should be chunked before
 * generation.
 *
 * Generation time is estimated as 0.4x the estimated audio duration (an
 * assumed real-time factor). Chunking is recommended once that estimate
 * exceeds 80% of the timeout.
 *
 * @param text - The text that would be generated in a single request.
 * @param timeoutSeconds - Generation timeout in seconds (default 300).
 * @returns `true` when chunking is recommended.
 */
export function shouldAutoChunk(
  text: string,
  timeoutSeconds: number = 300
): boolean {
  const generationSeconds = estimateDuration(text) * 0.4;
  const budgetSeconds = timeoutSeconds * 0.8;
  return generationSeconds > budgetSeconds;
}