import { Container } from 'r2-shared-js/dist/es8-es2017/src/parser/epub/container';
import { OPF } from 'r2-shared-js/dist/es8-es2017/src/parser/epub/opf';
import { NCX } from 'r2-shared-js/dist/es8-es2017/src/parser/epub/ncx';
import { DCMetadata } from 'r2-shared-js/dist/es8-es2017/src/parser/epub/opf-dc-metadata';
import { Metafield } from 'r2-shared-js/dist/es8-es2017/src/parser/epub/opf-metafield';
import { XML } from 'r2-utils-js/dist/es8-es2017/src/_utils/xml-js-mapper';
import { DOMParser } from 'xmldom';
import { EpubVersion } from './types';
import { Rootfile } from 'r2-shared-js/dist/es8-es2017/src/parser/epub/container-rootfile';
import { WebpubManifest } from './WebpubManifestTypes/WebpubManifest';
import { epubToManifest } from './convert';
import Decryptor from '@nypl-simplified-packages/axisnow-access-control-web';
import Fetcher from './Fetcher';

/**
 * This class represents a complete EPUB. It is meant to be used as a base
 * class that can be subclassed to support the various ways an EPUB can be
 * sourced:
 *  - Local (filesystem) exploded EPUB
 *  - Local packaged EPUB
 *  - Remote (external server) exploded EPUB
 *  - Remote packaged EPUB
 *
 * This class includes utilities used to parse the string file values
 * into in-memory representations and to extract values from the various
 * data structures. They will be used by all subclasses.
 */
export default class Epub {
  static NCX_MEDIA_TYPE = 'application/x-dtbncx+xml';

  constructor(
    public readonly fetcher: Fetcher,
    private readonly containerXmlPath: string,
    // used to resolve items relative to the opf file
    public readonly opfPath: string,
    public readonly container: Container,
    public readonly opf: OPF,
    // EPUB 2 uses NCX, EPUB 3 uses NavDoc
    public readonly ncx: NCX | undefined,
    public readonly navDoc: Document | undefined,
    // pass a decryptor to have all files except container.xml and opf run through it
    public readonly decryptor?: Decryptor
  ) {}

  /**
   * Fetches and parses container.xml, the OPF package document and, when the
   * OPF manifest references them, the NCX and NavDoc files, then constructs
   * the Epub.
   *
   * @param containerXmlPath path handed to the fetcher to read container.xml
   * @param fetcher source of the EPUB's files
   * @param decryptor optional; when given, the NCX and NavDoc are decrypted
   *   with it. container.xml and the OPF are always read as plain strings.
   */
  public static async build(
    containerXmlPath: string,
    fetcher: Fetcher,
    decryptor?: Decryptor
  ): Promise<Epub> {
    const container = Epub.parseContainer(
      await fetcher.getFileStr(containerXmlPath)
    );
    const relativeOpfPath = Epub.getOpfPath(container);
    const opfPath = fetcher.getOpfPath(relativeOpfPath);
    const opf = await Epub.parseOpf(await fetcher.getFileStr(opfPath));

    // Both navigation files are optional and located relative to the OPF.
    const ncxStr = await Epub.readOptionalFile(
      fetcher,
      opfPath,
      Epub.getNcxHref(opf),
      decryptor
    );
    const ncx = Epub.parseNcx(ncxStr);

    const navDocStr = await Epub.readOptionalFile(
      fetcher,
      opfPath,
      Epub.getNavDocHref(opf),
      decryptor
    );
    const navDoc = Epub.parseNavDoc(navDocStr);

    return new Epub(
      fetcher,
      containerXmlPath,
      opfPath,
      container,
      opf,
      ncx,
      navDoc,
      decryptor
    );
  }

  /**
   * Resolves `relativePath` against the OPF path, fetches the file and
   * returns its (optionally decrypted) text. Returns undefined without
   * fetching when there is no path to read.
   */
  private static async readOptionalFile(
    fetcher: Fetcher,
    opfPath: string,
    relativePath: string | undefined,
    decryptor: Decryptor | undefined
  ): Promise<string | undefined> {
    if (!relativePath) return undefined;
    const fullPath = fetcher.resolvePath(opfPath, relativePath);
    if (!fullPath) return undefined;
    const buffer = await fetcher.getArrayBuffer(fullPath);
    return Epub.decryptStr(buffer, decryptor);
  }

  ///////////////////
  // ACCESSOR METHODS AND UTILS
  // We need static and instance methods of these because the static version is
  // used in the subclass's `build` method
  ///////////////////

  /**
   * Returns '3' when the declared version string starts with '3', otherwise
   * '2'. The rootfile's version takes precedence over the OPF's. A document
   * that declares no version at all is treated as '2' instead of throwing.
   */
  static getVersion(rootfile: Rootfile | undefined, opf: OPF): EpubVersion {
    const versionNumber = rootfile?.Version ?? opf.Version;
    return versionNumber?.startsWith('3') ? '3' : '2';
  }

  get version(): EpubVersion {
    return Epub.getVersion(this.rootfile, this.opf);
  }

  /**
   * Returns the first rootfile listed in the container, if any.
   */
  static getRootfile(container: Container): Rootfile | undefined {
    return container.Rootfile?.[0];
  }

  get rootfile(): Rootfile | undefined {
    return Epub.getRootfile(this.container);
  }

  /**
   * Returns the content directory: 'OEBPS/' for version '2', 'OPS/' otherwise.
   */
  static getContentPath(rootfile: Rootfile | undefined, opf: OPF): string {
    return Epub.getVersion(rootfile, opf) === '2' ? 'OEBPS/' : 'OPS/';
  }

  get contentPath(): string {
    return Epub.getContentPath(this.rootfile, this.opf);
  }

  /**
   * Converts this EPUB into a webpub manifest via `epubToManifest`.
   */
  get webpubManifest(): Promise<WebpubManifest> {
    return epubToManifest(this);
  }

  ///////////////////
  // METHODS FOR DESERIALIZING VALUES INTO IN-MEMORY CLASSES
  ///////////////////

  /**
   * Parses an XML string into a JS class
   *
   * @param str the XML source
   * @param objectType the class passed to `XML.deserialize` as the target type
   */
  static parseXmlString<T>(str: string, objectType: any): T {
    // The second argument of parseFromString is a MIME type, not an encoding.
    const xmlDoc = new DOMParser().parseFromString(str, 'application/xml');
    return XML.deserialize<T>(xmlDoc, objectType);
  }

  /**
   * Parses an XML string into an OPF class, resolving edge cases on the way.
   */
  static async parseOpf(str: string): Promise<OPF> {
    const fixed = Epub.fixOpfString(str);
    return Epub.parseXmlString<OPF>(fixed, OPF);
  }

  /**
   * This code was found in the r2-shared-js repo. I'm not sure if
   * it's necessary, but it seems to fix an edge case of how the package
   * is defined in the XML: when the opening <package> tag carries no
   * `xmlns` attribute, a default one is injected. Any other input is
   * returned unchanged.
   */
  static fixOpfString(opfStr: string): string {
    const iStart = opfStr.indexOf('<package');
    if (iStart < 0) return opfStr;

    const iEnd = opfStr.indexOf('>', iStart);
    if (iEnd <= iStart) return opfStr;

    // Text of the opening tag, from '<package' up to (not including) '>'.
    const clip = opfStr.slice(iStart, iEnd);
    if (clip.indexOf('xmlns') >= 0) return opfStr;

    // NOTE(review): namespace URI restored from the r2-shared-js
    // implementation this method was copied from; confirm against upstream.
    return opfStr.replace(
      /<package/,
      '<package xmlns="http://openebooks.org/namespaces/oeb-package/1.0/" '
    );
  }

  /**
   * Parses an XML string into a Container class.
   */
  static parseContainer(str: string): Container {
    return Epub.parseXmlString<Container>(str, Container);
  }

  /**
   * Extract the OPF path from a Container
   *
   * @throws Error when the container lists no rootfile or the first rootfile
   *   has no path.
   */
  static getOpfPath(container: Container): string {
    // get the content.opf file from the container.xml file
    const rootfilePath = container.Rootfile?.[0]?.PathDecoded;
    if (!rootfilePath) {
      throw new Error('container.xml file is missing rootfile path.');
    }
    return rootfilePath;
  }

  /**
   * As best I can tell, the TOC.ncx file is always referenced with
   * an <item> in the <manifest> with id === 'ncx'. The item must also
   * have the NCX media type. Returns undefined when there is no such item.
   */
  static getNcxHref(opf: OPF): string | undefined {
    return opf.Manifest?.find(
      (item) => item.ID === 'ncx' && item.MediaType === Epub.NCX_MEDIA_TYPE
    )?.HrefDecoded;
  }

  /**
   * Parses an NCX XML string into an NCX class, or returns undefined when
   * there is nothing to parse.
   */
  static parseNcx(ncxStr: string | undefined): NCX | undefined {
    return ncxStr ? Epub.parseXmlString<NCX>(ncxStr, NCX) : undefined;
  }

  /**
   * Returns the href of the first manifest item whose properties include
   * 'nav', or undefined when there is none.
   */
  static getNavDocHref(opf: OPF): string | undefined {
    const navDocItem = opf.Manifest?.find((item) =>
      Epub.parseSpaceSeparatedString(item.Properties).includes('nav')
    );
    return navDocItem?.HrefDecoded;
  }

  /**
   * Parses a NavDoc string into a DOM Document, or returns undefined when
   * there is nothing to parse.
   */
  static parseNavDoc(navDocStr: string | undefined): Document | undefined {
    return navDocStr ? new DOMParser().parseFromString(navDocStr) : undefined;
  }

  /**
   * Parses a space separated string of properties into an array. Tokens may
   * be separated by any run of whitespace (spaces, tabs, newlines); empty
   * tokens are dropped. Null, undefined and blank input yield an empty array.
   */
  static parseSpaceSeparatedString(str: string | undefined | null): string[] {
    return (
      str
        ?.trim()
        .split(/\s+/)
        .filter((token) => token.length > 0) ?? []
    );
  }

  /**
   * Takes a maybe file and a maybe decryptor and returns
   * a string: undefined when there is no buffer, the UTF-8 decoded buffer
   * when there is no decryptor, and the decryptor's output otherwise.
   */
  static async decryptStr(
    buffer: ArrayBuffer | undefined,
    decryptor: Decryptor | undefined
  ): Promise<string | undefined> {
    if (!buffer) return undefined;
    if (!decryptor) return new TextDecoder('utf-8').decode(buffer);
    return await decryptor.decryptStr(new Uint8Array(buffer));
  }

  /**
   * Returns the buffer untouched when there is no decryptor, otherwise the
   * result of running it through the decryptor.
   */
  // NOTE(review): return type assumed to be Promise<ArrayBuffer>; confirm
  // against the Decryptor.decrypt signature.
  static async decryptAb(
    buffer: ArrayBuffer,
    decryptor: Decryptor | undefined
  ): Promise<ArrayBuffer> {
    if (!decryptor) return buffer;
    return await decryptor.decrypt(new Uint8Array(buffer));
  }

  ///////////////////
  // METHODS FOR GETTING VALUES FROM THE OPF METADATA
  ///////////////////

  /**
   * Extracts a named metadata property from either epub.opf.Metadata.DCMetadata
   * or epub.opf.Metadata, if the prior doesn't exist. Returns undefined
   * in case of failure. This is to support EPUB 2.0, which allows
   * metadata to be nested within dc:metadata:
   * http://idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.2
   */
  extractMetadataMember<T extends keyof DCMetadata>(key: T) {
    const metadata = this.opf.Metadata;
    // Guard both lookups so a missing Metadata yields undefined, as documented.
    return metadata?.DCMetadata?.[key] ?? metadata?.[key];
  }

  /**
   * Extracts meta fields that are found in the XMetadata or Meta arrays
   * within the Metadata object. This is necessary because EPUB allows metadata
   * to be nested under either tag. XMetadata matches come first in the result.
   */
  extractMetaField(filter: (meta: Metafield) => boolean) {
    /**
     * These properties are not marked as optional, but that is a mistake, they
     * are indeed optional and need to use optional chaining and nullish coalescing
     */
    const xMetaFields =
      this.opf.Metadata?.XMetadata?.Meta?.filter(filter) ?? [];
    const metaFields = this.opf.Metadata?.Meta?.filter(filter) ?? [];

    return [...xMetaFields, ...metaFields];
  }
}