/**
 * Korean & Japanese Encoding Tables
 *
 * Provides character mapping support for:
 * - EUC-KR (Korean) - KS X 1001/1003 encoding
 * - Shift-JIS (Japanese) - JIS X 0201/0208 encoding
 * - ISO-2022-JP (Japanese) - JIS encoding with escape sequences
 *
 * Note: For full coverage, these use TextEncoder/TextDecoder as the primary
 * implementation since they provide native support in modern environments.
 * The mapping tables here serve as reference and fallback.
 */

// ============================================================================
// EUC-KR (Korean) Constants
// ============================================================================

/**
 * KS X 1001 Hangul code range in EUC-KR:
 *   First byte:  0xB0-0xC8 (176-200)
 *   Second byte: 0xA1-0xFE (161-254)
 *
 * KS X 1001 Hanja code range in EUC-KR:
 *   First byte:  0xCA-0xFD
 *   Second byte: 0xA1-0xFE
 *
 * (KS X 1003 is the single-byte Roman set, not the Hanja set.)
 */

// Korean Hangul Syllables block: U+AC00 - U+D7AF (assigned: U+AC00 - U+D7A3).
// A precomposed syllable is
//   HANGUL_SYLLABLE_OFFSET + (initial * HANGUL_MEDIAL_COUNT + medial) * HANGUL_FINAL_COUNT + final
// which gives 19 * 21 * 28 = 11172 syllables.
export const HANGUL_SYLLABLE_OFFSET = 0xac00;
export const HANGUL_INITIAL_COUNT = 19; // leading consonants (choseong)
export const HANGUL_MEDIAL_COUNT = 21; // vowels (jungseong)
export const HANGUL_FINAL_COUNT = 28; // 27 trailing consonants (jongseong) + "no final"

// ============================================================================
// Shift-JIS (Japanese) Constants
// ============================================================================

/**
 * JIS X 0201 (Half-width Katakana):
 *   Single byte: 0xA1-0xDF
 *
 * JIS X 0208 (Kanji and other double-byte characters):
 *   First byte:  0x81-0x9F (low) or 0xE0-0xEF (high)
 *   Second byte: 0x40-0x7E (low) or 0x80-0xFC (high); 0x7F is never used
 *
 * JIS X 0212 (Supplementary Kanji) has no Shift-JIS representation; it is
 * only reachable through EUC-JP or ISO-2022-JP-1.
 */

// JIS X 0201 Katakana range
export const SHIFT_JIS_KATAKANA_START = 0xa1;
export const SHIFT_JIS_KATAKANA_END = 0xdf;

// JIS X 0208 lead-byte regions
export const SHIFT_JIS_KANJI_LOW_START = 0x81;
export const SHIFT_JIS_KANJI_LOW_END = 0x9f;
export const SHIFT_JIS_KANJI_HIGH_START = 0xe0;
export const SHIFT_JIS_KANJI_HIGH_END = 0xef;
// Shift-JIS trail-byte bounds for double-byte (JIS X 0208) characters.
// 0x7F lies inside this range but is never a valid trail byte.
export const SHIFT_JIS_KANJI_SECOND_MIN = 0x40;
export const SHIFT_JIS_KANJI_SECOND_MAX = 0xfc;

// ============================================================================
// Character Classification Helpers
// ============================================================================

/**
 * Check if a Unicode code point is a precomposed Korean Hangul syllable.
 *
 * The Hangul Syllables block spans U+AC00-U+D7AF, but only U+AC00-U+D7A3
 * (19 x 21 x 28 = 11172 code points) are assigned; the unassigned tail is
 * rejected.
 */
export function isKoreanHangul(code: number): boolean {
  return code >= 0xac00 && code <= 0xd7a3;
}

/**
 * Check if a Unicode code point is a Korean Hanja, i.e. lies in CJK Unified
 * Ideographs (U+4E00-U+9FFF), CJK Compatibility Ideographs (U+F900-U+FAFF)
 * or CJK Extension A (U+3400-U+4DBF).
 */
export function isKoreanHanja(code: number): boolean {
  return (
    (code >= 0x4e00 && code <= 0x9fff) ||
    (code >= 0xf900 && code <= 0xfaff) ||
    (code >= 0x3400 && code <= 0x4dbf)
  );
}

/**
 * Check if a Unicode code point is a Hangul Jamo: conjoining Jamo
 * (U+1100-U+11FF), Compatibility Jamo (U+3130-U+318F) or the half-width
 * Hangul range (U+FFA0-U+FFDF).
 */
export function isKoreanJamo(code: number): boolean {
  return (
    (code >= 0x1100 && code <= 0x11ff) ||
    (code >= 0x3130 && code <= 0x318f) ||
    (code >= 0xffa0 && code <= 0xffdf)
  );
}

/**
 * Check if a code point is in the Hiragana block (U+3040-U+309F).
 */
export function isJapaneseHiragana(code: number): boolean {
  return code >= 0x3040 && code <= 0x309f;
}

/**
 * Check if a code point is in the full-width Katakana block (U+30A0-U+30FF).
 */
export function isJapaneseKatakana(code: number): boolean {
  return code >= 0x30a0 && code <= 0x30ff;
}

/**
 * Check if a code point lies in a Unicode range that can hold Japanese Kanji.
 *
 * Covered ranges:
 *   - CJK Unified Ideographs:       U+4E00-U+9FFF (JIS X 0208 uses a subset)
 *   - CJK Extension A:              U+3400-U+4DBF
 *   - CJK Compatibility Ideographs: U+F900-U+FAFF
 * Extension B (U+20000-U+2A6DF) is not supported.
 */
export function isJapaneseKanji(code: number): boolean {
  return (
    (code >= 0x4e00 && code <= 0x9fff) ||
    (code >= 0x3400 && code <= 0x4dbf) ||
    (code >= 0xf900 && code <= 0xfaff)
  );
}

// ============================================================================
// EUC-KR Encoding Functions
// ============================================================================

/**
 * Lazily built reverse table: Hangul code point -> EUC-KR [lead, trail].
 * `undefined` = not built yet, `null` = the runtime cannot decode EUC-KR.
 */
let eucKrHangulTable: ReadonlyMap<number, readonly [number, number]> | null | undefined;

/**
 * Build the Hangul reverse table by decoding every byte pair of the
 * KS X 1001 Hangul area (lead 0xB0-0xC8, trail 0xA1-0xFE) with the platform
 * `euc-kr` decoder. Returns null when no such decoder is available.
 */
function buildEucKrHangulTable(): ReadonlyMap<number, readonly [number, number]> | null {
  if (typeof TextDecoder === 'undefined') {
    return null;
  }

  let decoder: TextDecoder;
  try {
    decoder = new TextDecoder('euc-kr');
  } catch {
    // The constructor throws when the runtime does not ship this encoding.
    return null;
  }

  const table = new Map<number, readonly [number, number]>();
  const pair = new Uint8Array(2);
  for (let lead = 0xb0; lead <= 0xc8; lead++) {
    for (let trail = 0xa1; trail <= 0xfe; trail++) {
      pair[0] = lead;
      pair[1] = trail;
      const text = decoder.decode(pair);
      // Anything that is not exactly one Hangul syllable (e.g. U+FFFD) is skipped.
      if (text.length !== 1) {
        continue;
      }
      const unit = text.charCodeAt(0);
      if (isKoreanHangul(unit)) {
        table.set(unit, [lead, trail]);
      }
    }
  }
  return table.size > 0 ? table : null;
}

/**
 * Encode a Hangul syllable to EUC-KR bytes.
 *
 * Although Unicode syllables decompose algorithmically
 * (0xAC00 + (initial x 21 + medial) x 28 + final), KS X 1001 only contains
 * 2350 of the 11172 syllables and stores them in a table order that cannot be
 * derived arithmetically. The mapping is therefore obtained from the
 * platform's `euc-kr` decoder and cached on first use.
 *
 * @param code Unicode code point.
 * @returns `[lead, trail]` inside 0xB0A1-0xC8FE, or `null` when `code` is not
 *   a Hangul syllable, is not part of KS X 1001, or the runtime has no
 *   `euc-kr` decoder.
 */
export function encodeHangulSyllable(code: number): [number, number] | null {
  if (!isKoreanHangul(code)) {
    return null;
  }
  if (eucKrHangulTable === undefined) {
    eucKrHangulTable = buildEucKrHangulTable();
  }
  const bytes = eucKrHangulTable?.get(code);
  // Hand out a copy so callers cannot mutate the cached entry.
  return bytes === undefined ? null : [bytes[0], bytes[1]];
}

// ============================================================================
// Shift-JIS Encoding Functions
// ============================================================================

/**
 * Kana-related marks that live in JIS X 0208 row 1 (Shift-JIS lead 0x81)
 * rather than in the Hiragana/Katakana rows.
 */
const SHIFT_JIS_KANA_MARKS: ReadonlyMap<number, readonly [number, number]> = new Map<
  number,
  readonly [number, number]
>([
  [0x309b, [0x81, 0x4a]], // ゛ voiced sound mark
  [0x309c, [0x81, 0x4b]], // ゜ semi-voiced sound mark
  [0x309d, [0x81, 0x54]], // ゝ hiragana iteration mark
  [0x309e, [0x81, 0x55]], // ゞ hiragana voiced iteration mark
  [0x30fb, [0x81, 0x45]], // ・ katakana middle dot
  [0x30fc, [0x81, 0x5b]], // ー prolonged sound mark
  [0x30fd, [0x81, 0x52]], // ヽ katakana iteration mark
  [0x30fe, [0x81, 0x53]], // ヾ katakana voiced iteration mark
]);

/** Look up a kana mark and return a fresh tuple, or null if it is not one. */
function lookupShiftJisKanaMark(code: number): [number, number] | null {
  const bytes = SHIFT_JIS_KANA_MARKS.get(code);
  return bytes === undefined ? null : [bytes[0], bytes[1]];
}

/**
 * Convert Unicode Hiragana to Shift-JIS.
 *
 * U+3041 (ぁ) - U+3093 (ん) map linearly to 0x829F-0x82F1. The marks
 * U+309B-U+309E map to row 1. Every other code point, including the
 * unassigned U+3040 and characters absent from JIS X 0208 such as U+3094,
 * yields null.
 */
export function unicodeToShiftJisHiragana(code: number): [number, number] | null {
  if (code >= 0x3041 && code <= 0x3093) {
    return [0x82, 0x9f + (code - 0x3041)];
  }
  if (code >= 0x3040 && code <= 0x309f) {
    return lookupShiftJisKanaMark(code);
  }
  return null;
}

/**
 * Convert Unicode full-width Katakana to Shift-JIS.
 *
 * U+30A1 (ァ) - U+30F6 (ヶ) map to 0x8340-0x8396; the trail byte 0x7F is
 * skipped, so U+30DF (ミ) is 0x837E and U+30E0 (ム) is 0x8380. The marks
 * U+30FB-U+30FE map to row 1. Code points absent from JIS X 0208
 * (U+30A0, U+30F7-U+30FA, U+30FF) and anything outside the block yield null.
 */
export function unicodeToShiftJisKatakana(code: number): [number, number] | null {
  if (code >= 0x30a1 && code <= 0x30f6) {
    const index = code - 0x30a1;
    // Trail bytes run 0x40-0x7E, then continue at 0x80.
    const trail = 0x40 + index + (index >= 0x3f ? 1 : 0);
    return [0x83, trail];
  }
  if (code >= 0x30a0 && code <= 0x30ff) {
    return lookupShiftJisKanaMark(code);
  }
  return null;
}

/**
 * Convert half-width Katakana to its single Shift-JIS (JIS X 0201) byte.
 * U+FF61-U+FF9F -> 0xA1-0xDF; anything else yields null.
 */
export function unicodeToHalfWidthKatakana(code: number): number | null {
  const isHalfWidth = code >= 0xff61 && code <= 0xff9f;
  return isHalfWidth ? 0xa1 + (code - 0xff61) : null;
}

/**
 * Heuristic check for "could be a JIS X 0208 Kanji".
 *
 * Only tests membership in CJK Unified Ideographs (U+4E00-U+9FFF) or
 * Extension A (U+3400-U+4DBF). It does not consult the JIS X 0208 table, so a
 * `true` result does not guarantee the character is actually encodable.
 */
export function isInJisX0208Range(code: number): boolean {
  const inUnifiedIdeographs = code >= 0x4e00 && code <= 0x9fff;
  const inExtensionA = code >= 0x3400 && code <= 0x4dbf;
  return inUnifiedIdeographs || inExtensionA;
}

// ============================================================================
// ISO-2022-JP Constants
// ============================================================================

/**
 * ISO-2022-JP (RFC 1468) escape sequences:
 * - ESC ( B   - ASCII
 * - ESC ( J   - JIS X 0201 Roman
 * - ESC $ @   - JIS C 6226-1978 (old JIS)
 * - ESC $ B   - JIS X 0208-1983
 *
 * ISO-2022-JP-1 / -2 extensions:
 * - ESC $ ( D - JIS X 0212 supplementary
 * - ESC $ ( C - KS C 5601 (Korean)
 */

/**
 * ISO-2022-JP escape sequences
 */
export const ISO2022JP_ESC_ASCII = new Uint8Array([0x1b, 0x28, 0x42]); // ESC ( B
export const ISO2022JP_ESC_JIS0201 = new Uint8Array([0x1b, 0x28, 0x4a]); // ESC ( J
export const ISO2022JP_ESC_JIS0208 = new Uint8Array([0x1b, 0x24, 0x42]); // ESC $ B
// NOTE(review): ESC $ @ designates JIS C 6226-1978 in RFC 1468, not the 1983
// revision the name suggests. Value kept as-is for compatibility.
export const ISO2022JP_ESC_JIS0208_83 = new Uint8Array([0x1b, 0x24, 0x40]); // ESC $ @
// NOTE(review): ESC $ D is not a designation defined by RFC 1468; confirm
// what consumers expect before emitting it. Value kept as-is for compatibility.
export const ISO2022JP_ESC_JIS0208_90 = new Uint8Array([0x1b, 0x24, 0x44]); // ESC $ D

/**
 * ISO-2022-JP text encoder state
 */
export type ISO2022JPState = 'ASCII' | 'JIS0208' | 'JIS0201';

/**
 * Check if a code point requires the JIS X 0208 escape sequence, i.e. it is
 * Hiragana, full-width Katakana or Kanji.
 */
export function requiresJisX0208Escape(code: number): boolean {
  return (
    (code >= 0x3040 && code <= 0x309f) ||
    (code >= 0x30a0 && code <= 0x30ff) ||
    isJapaneseKanji(code)
  );
}

/**
 * Check if a code point is JIS X 0201 Katakana (half-width, U+FF61-U+FF9F).
 *
 * Full-width Katakana (U+30A0-U+30FF) belongs to JIS X 0208 and is reported
 * by `requiresJisX0208Escape`, so it is deliberately not matched here.
 */
export function isJisX0201Katakana(code: number): boolean {
  return code >= 0xff61 && code <= 0xff9f;
}