/**
 * Tests for the WXR parser (`parseWxr`).
 *
 * Each case feeds an inline fixture string through a `Readable` stream and
 * asserts on the structure returned by `parseWxr`.
 *
 * NOTE(review): the inline fixtures below contain only text content and no
 * XML element markup, and several expected values (for example
 * "Uncategorized", "Administrator", "wp:paragraph") do not appear in the
 * fixture they are asserted against. The markup looks to have been stripped
 * from this copy of the file -- confirm against version control before
 * relying on these tests. The fixture strings are deliberately left
 * untouched here.
 */
import { Readable } from "node:stream";

import { describe, it, expect } from "vitest";

import { parseWxr } from "../../../src/cli/wxr/parser.js";

/**
 * Wraps a string in a `Readable` that yields it as a single chunk.
 *
 * @param content - Fixture text to expose as a stream.
 * @returns A stream built with `Readable.from([content])`.
 */
function createStream(content: string): Readable {
  return Readable.from([content]);
}

describe("parseWxr", () => {
  it("parses basic WXR structure", async () => {
    const wxr = ` Test Site https://example.com A test WordPress site en-US https://example.com https://example.com `;

    const result = await parseWxr(createStream(wxr));

    expect(result.site.title).toBe("Test Site");
    expect(result.site.link).toBe("https://example.com");
    expect(result.site.description).toBe("A test WordPress site");
    expect(result.site.language).toBe("en-US");
  });

  it("parses posts", async () => {
    const wxr = ` Test Site Hello World https://example.com/hello-world/ Mon, 01 Jan 2024 12:00:00 +0000 admin

Welcome to WordPress!

]]>
1 2024-01-01 12:00:00 publish post hello-world
`;

    const result = await parseWxr(createStream(wxr));

    expect(result.posts).toHaveLength(1);
    expect(result.posts[0]?.title).toBe("Hello World");
    expect(result.posts[0]?.id).toBe(1);
    expect(result.posts[0]?.status).toBe("publish");
    expect(result.posts[0]?.postType).toBe("post");
    expect(result.posts[0]?.content).toContain("wp:paragraph");
  });

  it("parses pages", async () => {
    const wxr = ` About Us About page content

]]>
2 publish page
`;

    const result = await parseWxr(createStream(wxr));

    expect(result.posts).toHaveLength(1);
    expect(result.posts[0]?.title).toBe("About Us");
    expect(result.posts[0]?.postType).toBe("page");
  });

  it("parses attachments", async () => {
    const wxr = ` Test Image 10 attachment https://example.com/wp-content/uploads/2024/01/test.jpg `;

    const result = await parseWxr(createStream(wxr));

    // Attachments are reported separately from posts.
    expect(result.posts).toHaveLength(0);
    expect(result.attachments).toHaveLength(1);
    expect(result.attachments[0]?.id).toBe(10);
    expect(result.attachments[0]?.title).toBe("Test Image");
    expect(result.attachments[0]?.url).toContain("test.jpg");
  });

  it("parses categories", async () => {
    const wxr = ` 1 uncategorized 2 news uncategorized `;

    const result = await parseWxr(createStream(wxr));

    expect(result.categories).toHaveLength(2);
    expect(result.categories[0]?.nicename).toBe("uncategorized");
    expect(result.categories[0]?.name).toBe("Uncategorized");
    expect(result.categories[1]?.parent).toBe("uncategorized");
  });

  it("parses tags", async () => {
    const wxr = ` 5 javascript `;

    const result = await parseWxr(createStream(wxr));

    expect(result.tags).toHaveLength(1);
    expect(result.tags[0]?.slug).toBe("javascript");
    expect(result.tags[0]?.name).toBe("JavaScript");
  });

  it("parses post categories and tags", async () => {
    const wxr = ` Tagged Post post `;

    const result = await parseWxr(createStream(wxr));

    expect(result.posts[0]?.categories).toContain("news");
    expect(result.posts[0]?.tags).toContain("javascript");
    expect(result.posts[0]?.tags).toContain("typescript");
  });

  it("parses authors", async () => {
    const wxr = ` 1 admin admin@example.com Admin User `;

    const result = await parseWxr(createStream(wxr));

    expect(result.authors).toHaveLength(1);
    expect(result.authors[0]?.login).toBe("admin");
    expect(result.authors[0]?.email).toBe("admin@example.com");
    expect(result.authors[0]?.displayName).toBe("Administrator");
  });

  it("parses post meta", async () => {
    const wxr = ` Post with Meta post _yoast_wpseo_title SEO 
Title _yoast_wpseo_metadesc SEO Description `;

    const result = await parseWxr(createStream(wxr));

    expect(result.posts[0]?.meta.get("_yoast_wpseo_title")).toBe("SEO Title");
    expect(result.posts[0]?.meta.get("_yoast_wpseo_metadesc")).toBe("SEO Description");
  });

  it("handles empty WXR", async () => {
    const wxr = ` Empty Site `;

    const result = await parseWxr(createStream(wxr));

    expect(result.posts).toHaveLength(0);
    expect(result.attachments).toHaveLength(0);
    expect(result.categories).toHaveLength(0);
  });

  it("parses page hierarchy (post_parent and menu_order)", async () => {
    const wxr = ` Parent Page 10 page 0 1 Child Page 11 page 10 2 `;

    const result = await parseWxr(createStream(wxr));

    expect(result.posts).toHaveLength(2);
    expect(result.posts[0]?.postParent).toBe(0);
    expect(result.posts[0]?.menuOrder).toBe(1);
    expect(result.posts[1]?.postParent).toBe(10);
    expect(result.posts[1]?.menuOrder).toBe(2);
  });

  it("parses generic wp:term elements (custom taxonomies)", async () => {
    const wxr = ` 100 genre sci-fi 101 genre fantasy sci-fi `;

    const result = await parseWxr(createStream(wxr));

    expect(result.terms).toHaveLength(2);
    expect(result.terms[0]?.id).toBe(100);
    expect(result.terms[0]?.taxonomy).toBe("genre");
    expect(result.terms[0]?.slug).toBe("sci-fi");
    expect(result.terms[0]?.name).toBe("Science Fiction");
    expect(result.terms[0]?.description).toBe("Science fiction books");
    expect(result.terms[1]?.parent).toBe("sci-fi");
  });

  it("parses nav_menu terms and nav_menu_item posts into structured menus", async () => {
    const wxr = ` 5 nav_menu main-menu Home 50 nav_menu_item 1 _menu_item_type custom _menu_item_url https://example.com/ _menu_item_menu_item_parent 0 About 51 nav_menu_item 2 _menu_item_type post_type _menu_item_object page _menu_item_object_id 10 _menu_item_menu_item_parent 0 `;

    const result = await parseWxr(createStream(wxr));

    // Check terms array includes nav_menu term
    expect(result.terms.some((t) => t.taxonomy === "nav_menu")).toBe(true);

    // Check nav_menu_item posts are in posts array
    expect(result.posts.filter((p) => p.postType === "nav_menu_item")).toHaveLength(2);

    // Check structured navMenus
    expect(result.navMenus).toHaveLength(1);
    expect(result.navMenus[0]?.name).toBe("main-menu");
    expect(result.navMenus[0]?.id).toBe(5);
    expect(result.navMenus[0]?.items).toHaveLength(2);

    // Check menu items are sorted by menu_order
    expect(result.navMenus[0]?.items[0]?.title).toBe("Home");
    expect(result.navMenus[0]?.items[0]?.type).toBe("custom");
    expect(result.navMenus[0]?.items[0]?.url).toBe("https://example.com/");
    expect(result.navMenus[0]?.items[0]?.sortOrder).toBe(1);

    expect(result.navMenus[0]?.items[1]?.title).toBe("About");
    expect(result.navMenus[0]?.items[1]?.type).toBe("post_type");
    expect(result.navMenus[0]?.items[1]?.objectType).toBe("page");
    expect(result.navMenus[0]?.items[1]?.objectId).toBe(10);
    expect(result.navMenus[0]?.items[1]?.sortOrder).toBe(2);
  });

  it("parses custom taxonomy assignments on posts", async () => {
    const wxr = ` Book Review 1 post `;

    const result = await parseWxr(createStream(wxr));

    expect(result.posts[0]?.categories).toContain("reviews");
    expect(result.posts[0]?.customTaxonomies?.get("genre")).toContain("sci-fi");
    expect(result.posts[0]?.customTaxonomies?.get("genre")).toContain("dystopian");
    expect(result.posts[0]?.customTaxonomies?.get("reading_level")).toContain("advanced");
  });

  // NOTE(review): two titles in this group describe opposite precedence
  // between the `trid` and `_icl_translation_id` meta keys ("falls back to
  // ... `trid` ... when _icl_translation_id is absent" vs. "falls back to
  // _icl_translation_id when trid is absent"). Neither test supplies both
  // keys, so precedence itself is not exercised here -- confirm the intended
  // order against the parser.
  describe("multilingual plugin metadata (issue #1080)", () => {
    it("promotes WPML _icl_lang_code and trid to locale + translationGroup", async () => {
      const wxr = ` Hello 1 post hello _icl_lang_code _icl_translation_id Mərhəba 2 post hello _icl_lang_code _icl_translation_id `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts).toHaveLength(2);
      expect(result.posts[0]?.locale).toBe("en");
      expect(result.posts[1]?.locale).toBe("ar");
      // Both translations share the same group key. Prefix is opaque
      // but stable so the execute route can group on it.
      expect(result.posts[0]?.translationGroup).toBe(result.posts[1]?.translationGroup);
      expect(result.posts[0]?.translationGroup).toContain("42");
    });

    it("falls back to WPML legacy `trid` meta key when _icl_translation_id is absent", async () => {
      const wxr = ` Legacy 1 post _icl_lang_code trid `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts[0]?.locale).toBe("fr");
      expect(result.posts[0]?.translationGroup).toContain("7");
    });

    it("derives locale from Polylang's `language` taxonomy when WPML is absent", async () => {
      const wxr = ` Bonjour 1 post `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts[0]?.locale).toBe("fr");
    });

    it("derives a stable Polylang translationGroup from _translations meta", async () => {
      // Polylang stores `_translations` as a serialized PHP map. We
      // hash the post IDs into a stable key shared by every member of
      // the group. The exact format isn't part of the contract -- the
      // only guarantee is "same map -> same key".
      const wxr = ` EN 1 post _translations FR 2 post _translations `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts).toHaveLength(2);
      expect(result.posts[0]?.locale).toBe("en");
      expect(result.posts[1]?.locale).toBe("fr");
      expect(result.posts[0]?.translationGroup).toBeDefined();
      expect(result.posts[0]?.translationGroup).toBe(result.posts[1]?.translationGroup);
    });

    it("leaves locale/translationGroup undefined on monolingual exports", async () => {
      const wxr = ` Mono 1 post `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts[0]?.locale).toBeUndefined();
      expect(result.posts[0]?.translationGroup).toBeUndefined();
    });

    it("prefers WPML over Polylang when both are present", async () => {
      const wxr = ` Conflict 1 post _icl_lang_code `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts[0]?.locale).toBe("en");
    });

    it("handles Polylang _translations payloads with multibyte string keys", async () => {
      // PHP `s:LEN:"..."` counts BYTES of the payload, not chars. A
      // payload like `é` (2 bytes UTF-8) would shift JS UTF-16
      // position by 1, but the byte length is 2. The parser must
      // advance by bytes or it will misalign and drop subsequent
      // integer tokens.
      //
      // Real-world trigger: any non-ASCII locale code or label string
      // in `_translations`. We don't expect Polylang to produce such
      // keys, but the parser must not corrupt the group when it
      // does.
      const wxr = ` A 1 post _translations B 2 post _translations `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts).toHaveLength(2);
      // Both ids extracted; group key has BOTH 1 and 2, not just 2.
      expect(result.posts[0]?.translationGroup).toBe("pll:1,2");
      expect(result.posts[1]?.translationGroup).toBe("pll:1,2");
    });

    it("ignores `i:N;` literals embedded inside Polylang `_translations` string values", async () => {
      // String values can contain `i:N;` text that the naive regex
      // would erroneously match. The length-aware parser walks
      // `s:LEN:"..."` blocks and skips their payloads exactly.
      // The real translation IDs are `i:1;` and `i:7;`; the embedded
      // `i:99;` inside a string value must NOT contribute.
      const wxr = ` A 1 post _translations B 2 post _translations `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts).toHaveLength(2);
      const groupA = result.posts[0]?.translationGroup;
      const groupB = result.posts[1]?.translationGroup;
      expect(groupA).toBe(groupB);
      // Group key derives from the integer post IDs, not the
      // embedded `99` -- post-id list is `[1, 7]`.
      expect(groupA).toBe("pll:1,7");
    });

    it("falls back to _icl_translation_id when trid is absent", async () => {
      // `trid` is preferred (it's the shared translation group id),
      // but legacy / partial exports may only have
      // `_icl_translation_id`. Accept it as a fallback so single-
      // translation exports still get a group key.
      const wxr = ` Solo 1 post _icl_lang_code _icl_translation_id `;

      const result = await parseWxr(createStream(wxr));

      expect(result.posts[0]?.locale).toBe("en");
      expect(result.posts[0]?.translationGroup).toBe("wpml:99");
    });

    it("captures per-item category text body as a taxonomyLabels entry", async () => {
      const wxr = ` Hello 1 post `;

      const result = await parseWxr(createStream(wxr));

      const post = result.posts[0];
      expect(post?.taxonomyLabels).toBeDefined();
      // Keys are `${normalisedTaxonomy}\u0000${slug}`. `post_tag`
      // normalises to `tag`.
      expect(post?.taxonomyLabels?.get("category\u0000breaking-news")).toBe("Breaking News");
      expect(post?.taxonomyLabels?.get("tag\u0000featured")).toBe("Featured");
      expect(post?.taxonomyLabels?.get("genre\u0000sci-fi")).toBe("Science Fiction");
    });
  });
});