/**
 * Text tokenization and phoneme processing system.
 * Handles language detection, preprocessing, and format conversion.
 */
import { G2PRegistry } from "./g2p";

/**
 * Configuration options for tokenizer behavior.
 */
export interface TokenizerOptions {
  /** Remove stress markers from output. */
  stripStress?: boolean;
  /**
   * Output format (IPA, ARPABET, or Zhuyin).
   *
   * Note: non-Chinese text in zhuyin format will be converted to IPA.
   */
  format?: "ipa" | "arpabet" | "zhuyin";
  /** Token separator in the output string. */
  separator?: string;
  /** Convert non-Latin text to an ASCII approximation. */
  anyAscii?: boolean;
  /**
   * Chinese tone format: 'unicode' (˧˩˧) or 'arrow' (↓↗↘→).
   * Only applies when `format` is 'ipa'.
   */
  toneFormat?: "unicode" | "arrow";
  /**
   * Preferred language tag (BCP 47, e.g. "en", "en-GB", "zh").
   *
   * When set, the tokenizer skips Unicode-based auto-detection for words
   * that don't clearly belong to a different script and routes them to
   * a G2P processor matching this tag. Words written in a script that
   * unambiguously identifies another language (e.g. Han for Chinese)
   * still go through their script-detected processor.
   */
  language?: string;
  /**
   * G2P registry to use for phoneme prediction. Defaults to the global
   * registry populated by the package's `useG2P()` calls. Pass a custom
   * registry (or use `createPhonemizer()`) for isolated multi-instance
   * setups.
   */
  registry?: G2PRegistry;
}

/**
 * Individual phoneme token with metadata.
 */
export interface PhonemeToken {
  /** IPA or ARPABET phoneme string. */
  phoneme: string;
  /** Original word/text. */
  word: string;
  /** Position in original text. */
  position: number;
}

/**
 * Language segment for multilingual processing.
 */
interface LanguageSegment {
  /** Text covered by this segment. */
  text: string;
  /** Language assigned to this segment. */
  language: string;
  /** Start index of the segment. */
  startIndex: number;
}

/**
 * Preprocessing result with language information.
 */
interface PreprocessResult {
  /** Preprocessed text. */
  text: string;
  /**
   * Language lookup table produced during preprocessing.
   *
   * NOTE(review): the type arguments were missing from this declaration
   * (bare `Record`, which does not type-check). `Record<string, string>` is
   * reconstructed to match `LanguageSegment.language` — confirm the key and
   * value types against the implementation.
   */
  languageMap: Record<string, string>;
  /** Language segments found in the text. */
  segments: LanguageSegment[];
}

/**
 * Main tokenizer class for phoneme processing.
 */
export declare class Tokenizer {
  /**
   * Resolved tokenizer options. `language` may be `undefined`.
   *
   * NOTE(review): the generic arguments of `Required<...>` were missing from
   * this declaration (it read `Required> & {...}`, a syntax error). The
   * omitted keys are reconstructed as "language" (re-added by the
   * intersection) and "registry" (held in its own field) — confirm against
   * the implementation.
   */
  protected readonly options: Required<
    Omit<TokenizerOptions, "language" | "registry">
  > & {
    language: string | undefined;
  };

  /** G2P registry used by this tokenizer. */
  protected readonly registry: G2PRegistry;

  /**
   * Create a tokenizer.
   *
   * @param options - Optional tokenizer configuration; see {@link TokenizerOptions}.
   */
  constructor(options?: TokenizerOptions);

  /**
   * Preprocess text with language detection and segmentation.
   *
   * @param text - Input text.
   * @returns The preprocessed text together with its language map and segments.
   */
  protected _preprocess(text: string): PreprocessResult;

  /**
   * Detect languages for words and create character-level segments.
   */
  private _detectLanguagesAndSegment;

  /**
   * Apply anyAscii conversion while preserving Chinese text.
   */
  private _applyAnyAscii;

  /**
   * Fast character-level language detection.
   */
  private _detectCharLanguage;

  /**
   * Post-process phonemes for format conversion and cleanup.
   *
   * @param phonemes - Phoneme string to post-process.
   * @returns The post-processed phoneme string.
   */
  protected _postProcess(phonemes: string): string;

  /** Private member; its signature is not exposed in this declaration. */
  private _predict;

  /**
   * Core token processing method that handles both simple and detailed tokenization.
   */
  private _processTokens;

  /**
   * Core tokenization method - converts text to phoneme array.
   *
   * @param text - Input text.
   * @returns Array of phoneme strings.
   */
  tokenize(text: string): string[];

  /**
   * Smart tokenization using efficient regex patterns.
   */
  private _smartTokenize;

  /**
   * Convert text to phoneme string with specified separator.
   *
   * @param text - Input text.
   * @returns Phoneme string.
   */
  tokenizeToString(text: string): string;

  /**
   * Convert text to detailed phoneme tokens with metadata.
   *
   * @param text - Input text.
   * @returns Array of {@link PhonemeToken} entries.
   */
  tokenizeToTokens(text: string): PhonemeToken[];
}

export {};