"""Extract translatable strings from PHP source and build POT/PO/MO. Usage: python languages/build-i18n.py Generates (under languages/): - live-2d.pot - live-2d-{locale}.po for each locale in translations.json - live-2d-{locale}.mo compiled from each .po Pure stdlib: no babel/msgfmt required. """ from __future__ import annotations import json import os import re import struct import sys from collections import OrderedDict from datetime import datetime, timezone from pathlib import Path ROOT = Path(__file__).resolve().parent.parent LANG_DIR = ROOT / "languages" TEXT_DOMAIN = "live-2d" # PHP files to scan for gettext calls. SCAN_FILES = [ ROOT / "wordpress-live2d.php", *(ROOT / "src").glob("*.php"), ] # Functions whose 1st arg is the singular msgid in the live-2d domain. GETTEXT_FUNCS = ( "__", "_e", "esc_html__", "esc_html_e", "esc_attr__", "esc_attr_e", "_x", "_ex", "esc_html_x", "esc_attr_x", ) # Match a gettext call: funcname( 'string' [, ...] , 'live-2d' ) # - string can be single- or double-quoted # - allow escaped quotes inside # - require the live-2d text domain to disambiguate from other plugins' calls CALL_RE = re.compile( r"\b(?P" + "|".join(GETTEXT_FUNCS) + r")\s*$\s*" r"(?P['\"])(?P(?:\\.|(?!(?P=q)).)*)(?P=q)" r"(?:\s*,\s*(?:['\"](?:\\.|[^'\"\\])*['\"]\s*,\s*)?)" r"['\"]" + re.escape(TEXT_DOMAIN) + r"['\"]\s*$", re.DOTALL, ) def php_unescape(s: str, quote: str) -> str: """Mimic PHP single/double quoted string escapes for the subset we use.""" if quote == "'": # PHP single-quoted: only \' and \\ are special return re.sub(r"\\(['\\])", r"\1", s) # double-quoted: handle \n, \r, \t, \", \\, \$ out = [] i = 0 while i < len(s): c = s[i] if c == "\\" and i + 1 < len(s): nxt = s[i + 1] if nxt == "n": out.append("\n") elif nxt == "r": out.append("\r") elif nxt == "t": out.append("\t") elif nxt in '"\\$': out.append(nxt) else: out.append(c + nxt) i += 2 continue out.append(c) i += 1 return "".join(out) def extract_calls(text: str) -> list[tuple[str, int]]: """Return list of (msgid, line_number) tuples found in text.""" results = [] for m in CALL_RE.finditer(text): msg = php_unescape(m.group("msg"), m.group("q")) line = text.count("\n", 0, m.start()) + 1 results.append((msg, line)) return results def collect_strings() -> "OrderedDict[str, list[str]]": """Walk SCAN_FILES, return ordered dict of msgid -> ['file:line', ...].""" found: "OrderedDict[str, list[str]]" = OrderedDict() for fpath in SCAN_FILES: if not fpath.exists(): continue text = fpath.read_text(encoding="utf-8") rel = fpath.relative_to(ROOT).as_posix() for msg, line in extract_calls(text): ref = f"{rel}:{line}" if msg not in found: found[msg] = [] if ref not in found[msg]: found[msg].append(ref) # Sort by first occurrence file/line for stable diffs return found def po_escape(s: str) -> str: return ( s.replace("\\", "\\\\") .replace('"', '\\"') .replace("\n", "\\n") .replace("\t", "\\t") ) def format_po_msg(key: str, value: str) -> list[str]: """Render `key "value"` with continuation lines for embedded \n.""" if "\n" not in value: return [f'{key} "{po_escape(value)}"'] # Multi-line: leading "" then each line ending with \n quoted separately out = [f'{key} ""'] parts = value.split("\n") for i, p in enumerate(parts): suffix = "\\n" if i < len(parts) - 1 else "" out.append(f'"{po_escape(p)}{suffix}"') return out def write_pot(strings: "OrderedDict[str, list[str]]", path: Path) -> None: now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M+0000") lines = [ "# Live 2D - WordPress plugin", "# Copyright (C) Weifang Chiang", '# This file is distributed under the same license as the Live 2D plugin.', "#", "#, fuzzy", 'msgid ""', 'msgstr ""', '"Project-Id-Version: Live 2D 2.1.3\\n"', '"Report-Msgid-Bugs-To: \\n"', f'"POT-Creation-Date: {now}\\n"', '"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"', '"Last-Translator: FULL NAME \\n"', '"Language-Team: \\n"', '"Language: \\n"', '"MIME-Version: 1.0\\n"', '"Content-Type: text/plain; charset=UTF-8\\n"', '"Content-Transfer-Encoding: 8bit\\n"', '"X-Generator: build-i18n.py\\n"', "", ] for msgid, refs in strings.items(): for ref in refs: lines.append(f"#: {ref}") lines.extend(format_po_msg("msgid", msgid)) lines.append('msgstr ""') lines.append("") path.write_text("\n".join(lines) + "\n", encoding="utf-8") def write_po( strings: "OrderedDict[str, list[str]]", translations: dict[str, str], locale: str, plural_forms: str, language_team: str, path: Path, ) -> None: now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M+0000") lines = [ f"# Live 2D - {locale}", "# Copyright (C) Weifang Chiang", '# This file is distributed under the same license as the Live 2D plugin.', "#", 'msgid ""', 'msgstr ""', '"Project-Id-Version: Live 2D 2.1.3\\n"', '"Report-Msgid-Bugs-To: \\n"', f'"POT-Creation-Date: {now}\\n"', f'"PO-Revision-Date: {now}\\n"', f'"Language-Team: {language_team}\\n"', f'"Language: {locale}\\n"', '"MIME-Version: 1.0\\n"', '"Content-Type: text/plain; charset=UTF-8\\n"', '"Content-Transfer-Encoding: 8bit\\n"', f'"Plural-Forms: {plural_forms}\\n"', '"X-Generator: build-i18n.py\\n"', "", ] for msgid, refs in strings.items(): for ref in refs: lines.append(f"#: {ref}") lines.extend(format_po_msg("msgid", msgid)) lines.extend(format_po_msg("msgstr", translations.get(msgid, ""))) lines.append("") path.write_text("\n".join(lines) + "\n", encoding="utf-8") # --- MO compilation (gettext binary format) --- # https://www.gnu.org/software/gettext/manual/html_node/MO-Files.html MO_MAGIC = 0x950412DE def compile_mo(po_entries: list[tuple[str, str]], path: Path) -> None: """Write a .mo file from a list of (msgid, msgstr) pairs. The metadata header (empty msgid) MUST be the first entry. Entries with empty msgstr are skipped (untranslated -> let gettext fall back). """ # Filter and sort by msgid (gettext requires sorted msgid for binary search) items = [(k, v) for k, v in po_entries if v != "" or k == ""] items.sort(key=lambda kv: kv[0].encode("utf-8")) keys_b = [k.encode("utf-8") for k, _ in items] vals_b = [v.encode("utf-8") for _, v in items] n = len(items) header_size = 7 * 4 key_table_offset = header_size val_table_offset = key_table_offset + n * 8 output = bytearray() # Reserve space for header + tables; we'll fill string offsets as we append data. strings_start = val_table_offset + n * 8 output.extend(b"\x00" * strings_start) key_offsets: list[tuple[int, int]] = [] val_offsets: list[tuple[int, int]] = [] for kb in keys_b: offset = len(output) output.extend(kb) output.append(0) key_offsets.append((len(kb), offset)) for vb in vals_b: offset = len(output) output.extend(vb) output.append(0) val_offsets.append((len(vb), offset)) # Write header struct.pack_into( " None: # The metadata entry; gettext header. now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M+0000") metadata = ( f"Project-Id-Version: Live 2D 2.1.3\n" f"PO-Revision-Date: {now}\n" f"Language-Team: \n" f"Language: {locale}\n" f"MIME-Version: 1.0\n" f"Content-Type: text/plain; charset=UTF-8\n" f"Content-Transfer-Encoding: 8bit\n" f"Plural-Forms: {plural_forms}\n" ) entries: list[tuple[str, str]] = [("", metadata)] for msgid in strings: tr = translations.get(msgid, "") if tr: entries.append((msgid, tr)) compile_mo(entries, path) # --- Locale config --- LOCALES = { # WP locale code -> (Plural-Forms, Language-Team) "zh_CN": ("nplurals=1; plural=0;", "Simplified Chinese"), "zh_TW": ("nplurals=1; plural=0;", "Traditional Chinese (Taiwan)"), "en_US": ("nplurals=2; plural=(n != 1);", "English (United States)"), "ja": ("nplurals=1; plural=0;", "Japanese"), } def main() -> int: LANG_DIR.mkdir(parents=True, exist_ok=True) strings = collect_strings() print(f"Extracted {len(strings)} unique strings from {len(SCAN_FILES)} PHP files") # POT pot_path = LANG_DIR / f"{TEXT_DOMAIN}.pot" write_pot(strings, pot_path) print(f"Wrote {pot_path.relative_to(ROOT)}") # Translations tr_path = LANG_DIR / "translations.json" if not tr_path.exists(): print(f"ERROR: missing {tr_path}", file=sys.stderr) return 1 all_tr = json.loads(tr_path.read_text(encoding="utf-8")) missing_report = {} for locale, (plural, team) in LOCALES.items(): tr_map = all_tr.get(locale, {}) po_path = LANG_DIR / f"{TEXT_DOMAIN}-{locale}.po" write_po(strings, tr_map, locale, plural, team, po_path) mo_path = LANG_DIR / f"{TEXT_DOMAIN}-{locale}.mo" build_mo_for_locale(strings, tr_map, locale, plural, mo_path) missing = [m for m in strings if m not in tr_map or tr_map[m] == ""] if missing: missing_report[locale] = missing print(f"Wrote {po_path.relative_to(ROOT)} and {mo_path.relative_to(ROOT)} ({len(strings) - len(missing)}/{len(strings)} translated)") if missing_report: print("\n--- Missing translations ---") for locale, ms in missing_report.items(): print(f" [{locale}] missing {len(ms)} strings:") for m in ms: print(f" - {m!r}") return 0 if __name__ == "__main__": raise SystemExit(main())