import { readFile } from "node:fs/promises";
import { XMLParser, XMLValidator } from "fast-xml-parser";
import JSZip from "jszip";

/**
 * Shared parser instance.
 *
 * Attributes are kept and exposed under an `@_` prefix; the `<?xml ...?>`
 * declaration is dropped from the result.
 */
const xmlParser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  ignoreDeclaration: true,
  // Keep element text as strings. With the default (true) values such as
  // "007", "1.10" or "true" are coerced to numbers/booleans and cannot be
  // turned back into the text that was actually in the document.
  parseTagValue: false,
});

/**
 * Parse XML string, validate it, strip the root element, and return the children.
 *
 * @param xml - Raw XML document text.
 * @returns The content of the single root element, or the whole parse result
 *   when the parser produced more or fewer than one top-level key.
 * @throws Error when `XMLValidator` rejects the document; the message carries
 *   the validator's message, line and column.
 */
function parseXml(xml: string): Record<string, unknown> {
  const validation = XMLValidator.validate(xml);
  if (validation !== true) {
    const e = validation.err;
    throw new Error(`${e.msg}\nLine: ${e.line}\nColumn: ${e.col}\nChar: `);
  }

  const raw = xmlParser.parse(xml) as Record<string, unknown>;

  // Strip root element (equivalent to xml2js explicitRoot: false)
  const keys = Object.keys(raw);
  const [onlyKey] = keys;
  if (keys.length === 1 && onlyKey !== undefined) {
    return raw[onlyKey] as Record<string, unknown>;
  }
  return raw;
}

/**
 * Extract text content from a parsed XML value.
 *
 * @param val - A string, number, boolean, an object carrying a `#text` key,
 *   or anything else.
 * @returns The trimmed text for strings and `#text` objects, the `String()`
 *   form for numbers and booleans, and `""` for every other input.
 */
function textOf(val: unknown): string {
  if (val == null) {
    return "";
  }
  if (typeof val === "string") {
    return val.trim();
  }
  if (typeof val === "number" || typeof val === "boolean") {
    return String(val);
  }
  if (typeof val === "object" && "#text" in (val as object)) {
    return String((val as Record<string, unknown>)["#text"] ?? "").trim();
  }
  return "";
}

/**
 * Extract all `@_` prefixed attributes from a parsed element into a clean object.
 *
 * @param obj - Parsed element.
 * @returns A new object keyed by attribute name without the `@_` prefix; every
 *   value is converted with `String()`.
 */
function attrsOf(obj: Record<string, unknown>): Record<string, string> {
  const result: Record<string, string> = {};
  for (const key of Object.keys(obj)) {
    if (key.startsWith("@_")) {
      result[key.slice(2)] = String(obj[key]);
    }
  }
  return result;
}

/**
 * Ensure value is an array.
 *
 * @returns `[]` for `null`/`undefined`, the input itself when it already is an
 *   array, otherwise a one-element array wrapping the input.
 */
function asArray<T>(val: T | T[] | undefined): T[] {
  if (val == null) {
    return [];
  }
  return Array.isArray(val) ? val : [val];
}

/** One manifest `item`, built from the element's attributes. */
export interface ManifestItem {
  id: string;
  href: string;
  "media-type": string;
  [key: string]: unknown;
}

/** One table-of-contents entry produced by `EPub.walkNavMap`. */
export interface TocElement {
  level: number;
  order: number;
  title: string;
  id: string;
  href: string;
  "media-type"?: string;
  [key: string]: unknown;
}

/** Metadata collected from the rootfile's metadata section. */
export interface Metadata {
  creator: string;
  creatorFileAs: string;
  title: string;
  language: string;
  subject: string;
  subjects?: string[];
  date: string;
  description: string;
  publisher?: string;
  source?: string;
  UUID?: string;
  [key: string]: unknown;
}

/**
 * Copy one parsed identifier element into `out`.
 *
 * When the element has an `opf:scheme` attribute its text is stored under that
 * scheme name. Otherwise, when its `id` attribute contains "uuid" (any case),
 * the text is stored as `out.UUID`, upper-cased and with the `urn:uuid:`
 * prefix removed. Non-object values are ignored.
 *
 * @param val - Parsed identifier value.
 * @param out - Metadata object to mutate.
 */
function extractIdentifiers(val: unknown, out: Metadata): void {
  if (typeof val !== "object" || val == null) {
    return;
  }
  const obj = val as Record<string, unknown>;
  const scheme = obj["@_opf:scheme"] as string | undefined;
  const id = obj["@_id"] as string | undefined;
  const contents = textOf(obj);

  if (scheme) {
    (out as Record<string, unknown>)[scheme] = contents;
  } else if (id && /uuid/i.test(id)) {
    // The URN prefix is matched case-insensitively so "URN:UUID:..." is
    // stripped as well.
    out.UUID = contents.replace(/urn:uuid:/i, "").toUpperCase().trim();
  }
}

/**
 * Reader for an EPUB archive.
 *
 * `parse()` opens the zip, checks the `mimetype` entry, locates the rootfile
 * through `META-INF/container.xml`, fills `metadata`, `manifest`, `spine`,
 * `flow` and `guide` from it, and builds `toc` from the document the spine's
 * `toc` attribute points at.
 */
export class EPub {
  input: string | Buffer | ArrayBuffer;
  imageroot: string;
  linkroot: string;
  metadata: Metadata = {} as Metadata;
  manifest: Record<string, ManifestItem> = {};
  guide: Record<string, string>[] = [];
  spine: { toc: ManifestItem | false; contents: ManifestItem[] } = {
    toc: false,
    contents: [],
  };
  flow: ManifestItem[] = [];
  toc: TocElement[] = [];
  version: string = "2.0";
  zip!: JSZip;
  containerFile: string | false = false;
  mimeFile: string | false = false;
  rootFile: string | false = false;

  /**
   * @param input - File path (read from disk by `parse()`) or the archive
   *   bytes themselves.
   * @param imageroot - Defaults to `/images/`; trimmed and forced to end in `/`.
   * @param linkroot - Defaults to `/links/`; trimmed and forced to end in `/`.
   */
  constructor(input: string | Buffer | ArrayBuffer, imageroot?: string, linkroot?: string) {
    this.input = input;
    this.imageroot = (imageroot || "/images/").trim();
    this.linkroot = (linkroot || "/links/").trim();

    if (!this.imageroot.endsWith("/")) {
      this.imageroot += "/";
    }
    if (!this.linkroot.endsWith("/")) {
      this.linkroot += "/";
    }
  }

  /**
   * Reset all parsed state and (re)read the archive.
   *
   * @throws Error for an unreadable or empty archive, a missing or unsupported
   *   mimetype, a missing container or rootfile, or invalid XML.
   */
  async parse(): Promise<void> {
    this.containerFile = false;
    this.mimeFile = false;
    this.rootFile = false;
    this.metadata = {} as Metadata;
    this.manifest = {};
    this.guide = [];
    this.spine = { toc: false, contents: [] };
    this.flow = [];
    this.toc = [];

    await this._open();
    await this._checkMimeType();
    await this._getRootFiles();
    const rootfileData = await this._handleRootFile();
    this._parseRootFile(rootfileData);
    if (this.spine.toc) {
      await this._parseTOC();
    }
  }

  /**
   * Read one archive entry.
   *
   * @param name - Entry name exactly as stored in the zip.
   * @throws Error `Entry not found: <name>` when the entry does not exist.
   */
  private async _readFile(name: string): Promise<Buffer> {
    const file = this.zip.file(name);
    if (!file) {
      throw new Error(`Entry not found: ${name}`);
    }
    return file.async("nodebuffer");
  }

  /**
   * Load the zip archive into `this.zip`.
   *
   * @throws Error "Invalid/missing file" when reading or unzipping fails, and
   *   "No files in archive" when the zip has no entries.
   */
  private async _open(): Promise<void> {
    try {
      const buf = typeof this.input === "string" ? await readFile(this.input) : this.input;
      this.zip = await JSZip.loadAsync(buf);
    } catch {
      throw new Error("Invalid/missing file");
    }

    if (!Object.keys(this.zip.files).length) {
      throw new Error("No files in archive");
    }
  }

  /**
   * Find the `mimetype` entry (name compared case-insensitively) and require
   * its trimmed, lower-cased content to be `application/epub+zip`.
   *
   * @throws Error when the entry is missing or has a different content.
   */
  private async _checkMimeType(): Promise<void> {
    for (const name of Object.keys(this.zip.files)) {
      if (name.toLowerCase() === "mimetype") {
        this.mimeFile = name;
        break;
      }
    }
    if (!this.mimeFile) {
      throw new Error("No mimetype file in archive");
    }

    const data = await this._readFile(this.mimeFile);
    const txt = data.toString("utf-8").toLowerCase().trim();
    if (txt !== "application/epub+zip") {
      throw new Error("Unsupported mime type");
    }
  }

  /**
   * Read `META-INF/container.xml` (name compared case-insensitively) and store
   * in `this.rootFile` the `full-path` of the first rootfile whose media type
   * is `application/oebps-package+xml`.
   *
   * @throws Error when the container, its rootfiles, or a matching rootfile is
   *   missing.
   */
  private async _getRootFiles(): Promise<void> {
    for (const name of Object.keys(this.zip.files)) {
      if (name.toLowerCase() === "meta-inf/container.xml") {
        this.containerFile = name;
        break;
      }
    }
    if (!this.containerFile) {
      throw new Error("No container file in archive");
    }

    const data = await this._readFile(this.containerFile);
    const xml = data.toString("utf-8").trim();
    const result = parseXml(xml);

    const rootfiles = result.rootfiles as Record<string, unknown> | undefined;
    if (!rootfiles || !rootfiles.rootfile) {
      throw new Error("No rootfiles found");
    }

    for (const rf of asArray(rootfiles.rootfile) as Record<string, unknown>[]) {
      if (
        String(rf["@_media-type"]).toLowerCase() === "application/oebps-package+xml" &&
        rf["@_full-path"]
      ) {
        this.rootFile = String(rf["@_full-path"]);
        break;
      }
    }
    if (!this.rootFile) {
      throw new Error("Rootfile not found from archive");
    }
  }

  /** Read and parse the rootfile located by `_getRootFiles`. */
  private async _handleRootFile(): Promise<Record<string, unknown>> {
    const data = await this._readFile(this.rootFile as string);
    const xml = data.toString("utf-8");
    return parseXml(xml);
  }

  /**
   * Dispatch the rootfile's sections to their parsers.
   *
   * Sections are matched on their local name (namespace prefix removed,
   * lower-cased) and are always handled in the order metadata, manifest,
   * spine, guide, whatever their order in the document: `_parseSpine` looks
   * items up in `this.manifest`, so the manifest has to be filled first.
   *
   * @param rootfile - Parsed content of the rootfile's root element.
   */
  private _parseRootFile(rootfile: Record<string, unknown>): void {
    this.version = String(rootfile["@_version"] || "2.0");

    const sections: [string, (section: Record<string, unknown>) => void][] = [
      ["metadata", (section) => this._parseMetadata(section)],
      ["manifest", (section) => this._parseManifest(section)],
      ["spine", (section) => this._parseSpine(section)],
      ["guide", (section) => this._parseGuide(section)],
    ];
    const elementKeys = Object.keys(rootfile).filter((key) => !key.startsWith("@_"));

    for (const [wanted, handle] of sections) {
      for (const fullKey of elementKeys) {
        const localName = (fullKey.split(":").pop() || "").toLowerCase().trim();
        if (localName !== wanted) {
          continue;
        }
        // A section that occurs more than once is parsed as an array; handle
        // every occurrence.
        for (const section of asArray(rootfile[fullKey])) {
          handle(section as Record<string, unknown>);
        }
      }
    }
  }

  /**
   * Fill `this.metadata` from the metadata section.
   *
   * Keys are matched on their lower-cased local name. For repeated elements
   * the first one wins, except `subject` (all kept in `subjects`) and
   * `identifier` (each passed to `extractIdentifiers`). `meta` elements are
   * stored under their `name` (value: `content` attribute) and, when they have
   * text, under their `property`.
   */
  private _parseMetadata(metadata: Record<string, unknown>): void {
    for (const fullKey of Object.keys(metadata)) {
      if (fullKey.startsWith("@_")) {
        continue;
      }
      const metadataValue = metadata[fullKey];
      const first: unknown = Array.isArray(metadataValue) ? metadataValue[0] : metadataValue;
      const key = (fullKey.split(":").pop() || "").toLowerCase().trim();

      switch (key) {
        case "publisher":
        case "title":
        case "description":
        case "date": {
          (this.metadata as Record<string, unknown>)[key] = textOf(first);
          break;
        }
        case "language": {
          this.metadata.language = textOf(first).toLowerCase();
          break;
        }
        case "subject": {
          const subjects = asArray(metadataValue);
          if (subjects.length === 0) {
            this.metadata.subject = "";
          } else {
            this.metadata.subjects = subjects.map((v) => textOf(v));
            this.metadata.subject = this.metadata.subjects[0] ?? "";
          }
          break;
        }
        case "creator": {
          this.metadata.creator = textOf(first);
          const fileAs =
            typeof first === "object" &&
            first != null &&
            (first as Record<string, unknown>)["@_opf:file-as"];
          // Fall back to the display name when no file-as attribute exists.
          this.metadata.creatorFileAs = String(fileAs || this.metadata.creator).trim();
          break;
        }
        case "identifier": {
          for (const v of asArray(metadataValue)) {
            extractIdentifiers(v, this.metadata);
          }
          break;
        }
        case "source": {
          const sources = asArray(metadataValue);
          this.metadata.source = sources.length > 0 ? textOf(sources[0]) : "";
          break;
        }
      }
    }

    for (const meta of asArray(metadata.meta) as Record<string, unknown>[]) {
      const name = meta["@_name"] as string | undefined;
      const content = meta["@_content"] as string | undefined;
      const property = meta["@_property"] as string | undefined;

      if (name) {
        (this.metadata as Record<string, unknown>)[name] = content;
      }
      if (meta["#text"] && property) {
        (this.metadata as Record<string, unknown>)[property] = meta["#text"];
      }
    }
  }

  /**
   * Fill `this.manifest`, keyed by item id.
   *
   * An `href` that does not already start with the rootfile's directory gets
   * that directory prepended. Items without an id are dropped.
   */
  private _parseManifest(manifest: Record<string, unknown>): void {
    const path = (this.rootFile as string).split("/");
    path.pop();
    const pathStr = path.join("/");

    for (const item of asArray(manifest.item) as Record<string, unknown>[]) {
      const element = attrsOf(item) as unknown as ManifestItem;

      if (element.href && element.href.substring(0, pathStr.length) !== pathStr) {
        element.href = path.concat([element.href]).join("/");
      }
      if (element.id) {
        this.manifest[element.id] = element;
      }
    }
  }

  /**
   * Append every guide `reference` to `this.guide`, resolving `href` against
   * the rootfile's directory the same way `_parseManifest` does.
   */
  private _parseGuide(guide: Record<string, unknown>): void {
    const path = (this.rootFile as string).split("/");
    path.pop();
    const pathStr = path.join("/");

    for (const ref of asArray(guide.reference) as Record<string, unknown>[]) {
      const element = attrsOf(ref);

      if (element.href && element.href.substring(0, pathStr.length) !== pathStr) {
        element.href = path.concat([element.href]).join("/");
      }
      this.guide.push(element);
    }
  }

  /**
   * Resolve the spine against `this.manifest`.
   *
   * `spine.toc` becomes the manifest item named by the `toc` attribute (or
   * `false` when it is unknown); `spine.contents` receives the manifest item
   * of every `itemref` whose `idref` is known. `flow` is set to the same array
   * object as `spine.contents`.
   */
  private _parseSpine(spine: Record<string, unknown>): void {
    const toc = spine["@_toc"] as string | undefined;
    if (toc) {
      this.spine.toc = this.manifest[toc] || false;
    }

    for (const itemref of asArray(spine.itemref) as Record<string, unknown>[]) {
      const idref = itemref["@_idref"] as string | undefined;
      if (!idref) {
        continue;
      }
      const element = this.manifest[idref];
      if (element) {
        this.spine.contents.push(element);
      }
    }
    this.flow = this.spine.contents;
  }

  /**
   * Read the document referenced by `spine.toc` and build `this.toc` from its
   * `navMap`. Does nothing when `spine.toc` is not set.
   *
   * @throws Error "Parsing container XML failed in TOC: ..." when the document
   *   is not valid XML.
   */
  private async _parseTOC(): Promise<void> {
    const tocItem = this.spine.toc;
    if (!tocItem) {
      return;
    }
    const tocHref = tocItem.href;
    const path = tocHref.split("/");
    path.pop();

    // Reverse lookup: manifest href -> manifest id.
    const idList: Record<string, string> = {};
    for (const [id, item] of Object.entries(this.manifest)) {
      idList[item.href] = id;
    }

    const data = await this._readFile(tocHref);
    const xml = data.toString("utf-8");

    let result: Record<string, unknown>;
    try {
      result = parseXml(xml);
    } catch (err) {
      throw new Error(
        "Parsing container XML failed in TOC: " +
          (err instanceof Error ? err.message : String(err)),
      );
    }

    const navMap = result.navMap as Record<string, unknown> | undefined;
    if (navMap?.navPoint) {
      this.toc = this.walkNavMap(navMap.navPoint as Record<string, unknown>[], path, idList);
    }
  }

  /**
   * Flatten a navPoint tree into a list of TOC entries, depth-first.
   *
   * A navPoint contributes an entry only when it has a `navLabel` and a
   * non-empty `content/@src`. When the resolved href equals a manifest href,
   * the manifest item itself is reused and mutated (it gains `title`, `order`
   * and `level`); otherwise a fresh entry with the navPoint's own id is made.
   *
   * @param branch - One navPoint or an array of them.
   * @param path - Directory segments of the TOC document, prepended to each src.
   * @param idList - Map from manifest href to manifest id.
   * @param level - Current depth; branches deeper than 7 are ignored.
   * @returns The entries of this branch followed by those of its descendants.
   */
  walkNavMap(
    branch: Record<string, unknown> | Record<string, unknown>[],
    path: string[],
    idList: Record<string, string>,
    level: number = 0,
  ): TocElement[] {
    if (level > 7) {
      return [];
    }

    const output: TocElement[] = [];

    for (const item of asArray(branch)) {
      const navLabel = item.navLabel as Record<string, unknown> | undefined;

      if (navLabel) {
        // textOf also covers a <text> element that carries attributes (parsed
        // as an object with "#text") and non-string scalar labels.
        const title = textOf(navLabel.text);

        let order = Number(item["@_playOrder"] || 0);
        if (isNaN(order)) {
          order = 0;
        }

        let href = "";
        const content = item.content as Record<string, unknown> | undefined;
        if (typeof content?.["@_src"] === "string") {
          href = (content["@_src"] as string).trim();
        }

        if (href) {
          href = path.concat([href]).join("/");
          let element: TocElement = { level, order, title, id: "", href };

          const manifestId = idList[href];
          if (manifestId) {
            element = this.manifest[manifestId] as ManifestItem & TocElement;
            element.title = title;
            element.order = order;
            element.level = level;
          } else {
            element.id = String(item["@_id"] || "").trim();
          }
          output.push(element);
        }
      }

      if (item.navPoint) {
        output.push(
          ...this.walkNavMap(
            item.navPoint as Record<string, unknown>[],
            path,
            idList,
            level + 1,
          ),
        );
      }
    }

    return output;
  }

  async getChapter(id: string): Promise<string> {
    const str = await this.getChapterRaw(id);
    const path = (this.rootFile as string).split("/");
    path.pop();
    const keys = Object.keys(this.manifest);

    // remove linebreaks (no multi line matches in JS regex!)
    let s = str.replace(/\r?\n/g, "\u0000");

    // keep only contents
    s.replace(/<body[^>]*?>(.*)<\/body[^>]*?>/i, (_o, d) => {
      s = d.trim();
      return "";
    });

    // remove