/**
 * Text tokenization and phoneme processing system
 * Handles language detection, preprocessing, and format conversion
 */
import { G2PRegistry } from "./g2p";
/**
 * Configuration options for tokenizer behavior.
 *
 * Every field is optional. The `Tokenizer` class stores all fields except
 * `language` and `registry` as a `Required<...>` object, so omitted values
 * are presumably replaced by defaults in its constructor. The default values
 * themselves are not visible in this declaration file.
 */
export interface TokenizerOptions {
    /** Remove stress markers from output */
    stripStress?: boolean;
    /**
     * Output format: `"ipa"`, `"arpabet"`, or `"zhuyin"`.
     *
     * Note: with the `"zhuyin"` format, non-Chinese text is converted to IPA.
     */
    format?: "ipa" | "arpabet" | "zhuyin";
    /** Token separator in output string */
    separator?: string;
    /** Convert non-Latin text to an ASCII approximation */
    anyAscii?: boolean;
    /** Chinese tone format: 'unicode' (˧˩˧) or 'arrow' (↓↗↘→). Only applies when format is 'ipa' */
    toneFormat?: "unicode" | "arrow";
    /**
     * Preferred language tag (BCP 47, e.g. "en", "en-GB", "zh").
     *
     * When set, the tokenizer skips Unicode-based auto-detection for words
     * that don't clearly belong to a different script and routes them to
     * a G2P processor matching this tag. Words written in a script that
     * unambiguously identifies another language (e.g. Han for Chinese)
     * still go through their script-detected processor.
     */
    language?: string;
    /**
     * G2P registry to use for phoneme prediction. Defaults to the global
     * registry populated by the package's `useG2P()` calls. Pass a custom
     * registry (or use `createPhonemizer()`) for isolated multi-instance
     * setups.
     */
    registry?: G2PRegistry;
}
/**
 * Individual phoneme token with metadata.
 *
 * This is the element type of the array returned by
 * `Tokenizer.tokenizeToTokens()`.
 */
export interface PhonemeToken {
    /**
     * Phoneme string for this token (IPA or ARPABET).
     *
     * NOTE(review): `TokenizerOptions.format` also allows "zhuyin", so this
     * presumably holds Zhuyin output in that mode — confirm against the
     * implementation.
     */
    phoneme: string;
    /** Original word/text */
    word: string;
    /**
     * Position in original text.
     *
     * NOTE(review): the unit (character offset vs. token index) is not
     * specified in this declaration — confirm against the implementation.
     */
    position: number;
}
/**
 * Language segment for multilingual processing.
 *
 * Not exported; used as the element type of `PreprocessResult.segments`.
 */
interface LanguageSegment {
    /** Text content of the segment */
    text: string;
    /**
     * Language identifier assigned to the segment.
     * Presumably the same kind of tag as `TokenizerOptions.language` — TODO confirm.
     */
    language: string;
    /**
     * Start index of the segment.
     * Presumably an offset into the preprocessed text — TODO confirm.
     */
    startIndex: number;
}
/**
 * Preprocessing result with language information.
 *
 * Not exported; this is the return type of `Tokenizer._preprocess()`.
 */
interface PreprocessResult {
    /** Text produced by preprocessing */
    text: string;
    /**
     * String-to-string lookup of language information.
     * Presumably maps a word to its detected language — TODO confirm
     * the key/value meaning against the implementation.
     */
    languageMap: Record<string, string>;
    /** Language segments identified in the text */
    segments: LanguageSegment[];
}
/**
 * Main tokenizer class for phoneme processing.
 *
 * Public entry points are `tokenize()`, `tokenizeToString()` and
 * `tokenizeToTokens()`. The `protected` members (`options`, `registry`,
 * `_preprocess`, `_postProcess`) are available to subclasses.
 *
 * This is an ambient (`declare`) class: only signatures appear here.
 * Private members are listed by name without types, as is usual for
 * TypeScript declaration output.
 */
export declare class Tokenizer {
    /**
     * Resolved tokenizer options. Every `TokenizerOptions` field except
     * `language` and `registry` is required here; `language` may be
     * `undefined`.
     */
    protected readonly options: Required<Omit<TokenizerOptions, "language" | "registry">> & {
        language: string | undefined;
    };
    /** G2P registry used by this tokenizer instance */
    protected readonly registry: G2PRegistry;
    /**
     * @param options - Optional tokenizer configuration; may be omitted entirely.
     */
    constructor(options?: TokenizerOptions);
    /**
     * Preprocess text with language detection and segmentation
     *
     * @param text - Input text
     * @returns The preprocessed text together with its language map and segments
     */
    protected _preprocess(text: string): PreprocessResult;
    /**
     * Detect languages for words and create character-level segments
     */
    private _detectLanguagesAndSegment;
    /**
     * Apply anyAscii conversion while preserving Chinese text
     */
    private _applyAnyAscii;
    /**
     * Fast character-level language detection
     */
    private _detectCharLanguage;
    /**
     * Post-process phonemes for format conversion and cleanup
     *
     * @param phonemes - Phoneme string to post-process
     * @returns The post-processed phoneme string
     */
    protected _postProcess(phonemes: string): string;
    /**
     * NOTE(review): undocumented in the original declaration. Judging by the
     * name, presumably the phoneme prediction step that consults the G2P
     * registry — confirm against the implementation.
     */
    private _predict;
    /**
     * Core token processing method that handles both simple and detailed tokenization
     */
    private _processTokens;
    /**
     * Core tokenization method - converts text to phoneme array
     *
     * @param text - Input text
     * @returns Array of phoneme strings
     */
    tokenize(text: string): string[];
    /**
     * Smart tokenization using efficient regex patterns
     */
    private _smartTokenize;
    /**
     * Convert text to phoneme string with specified separator
     *
     * @param text - Input text
     * @returns A single phoneme string; the separator presumably comes from
     *          `TokenizerOptions.separator` — TODO confirm
     */
    tokenizeToString(text: string): string;
    /**
     * Convert text to detailed phoneme tokens with metadata
     *
     * @param text - Input text
     * @returns Array of `PhonemeToken` objects (phoneme, source word, position)
     */
    tokenizeToTokens(text: string): PhonemeToken[];
}
export {};