/**
 * Configuration options for the OfficeParser.
 */
export interface OfficeParserConfig {
  /**
   * Flag to show all the logs to console in case of an error irrespective of your own handling.
   * Default is false.
   */
  outputErrorToConsole?: boolean;

  /**
   * The delimiter used for every new line in places that allow multiline text like word.
   * Default is \n.
   */
  newlineDelimiter?: string;

  /**
   * Flag to ignore notes from parsing in files like powerpoint.
   * Default is false. It includes notes in the parsed text by default.
   */
  ignoreNotes?: boolean;

  /**
   * Flag, if set to true, will collectively put all the parsed text from notes at last in files like powerpoint.
   * Default is false. It puts each notes right after its main slide content.
   * If ignoreNotes is set to true, this flag is also ignored.
   * @note This flag currently does not affect RTF files; RTF footnotes/endnotes are always collected and appended at the end of the content.
   */
  putNotesAtLast?: boolean;

  /**
   * Flag to extract attachments like images, charts, etc.
   * Default is false.
   */
  extractAttachments?: boolean;

  /**
   * Flag to include raw content (XML for XML-based formats, RTF for RTF) in the AST.
   * Default is false.
   */
  includeRawContent?: boolean;

  /**
   * Flag to enable OCR for images.
   * Default is false.
   */
  ocr?: boolean;

  /**
   * Language for OCR.
   * Default is 'eng'.
   *
   * You can provide multiple languages separated by a `+` sign (e.g., 'eng+fra' for English and French).
   * The OCR engine will then attempt to recognize text in any of the specified languages.
   *
   * See the list of supported languages and their codes here:
   * https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
   */
  ocrLanguage?: string;

  /**
   * The URL/path to the PDF.js worker script.
   *
   * **Mandatory** when using PDF parsing in browser environments to avoid worker configuration errors.
   * If not provided, it defaults to `https://unpkg.com/pdfjs-dist@5.5.207/build/pdf.worker.min.mjs`.
   * You can override this with your own local path or a different CDN link.
   */
  pdfWorkerSrc?: string;
}

/**
 * Supported file types for parsing.
 */
export type SupportedFileType =
  | 'docx'
  | 'pptx'
  | 'xlsx'
  | 'odt'
  | 'odp'
  | 'ods'
  | 'pdf'
  | 'rtf';

/**
 * Types of content nodes in the AST.
 */
export type OfficeContentNodeType =
  | 'paragraph'
  | 'heading'
  | 'table'
  | 'list'
  | 'text'
  | 'image'
  | 'chart'
  | 'drawing'
  | 'slide'
  | 'note'
  | 'sheet'
  | 'row'
  | 'cell'
  | 'page';

/**
 * Supported MIME types for attachments.
 */
export type OfficeMimeType =
  | 'image/jpeg'
  | 'image/png'
  | 'image/gif'
  | 'image/bmp'
  | 'image/tiff'
  | 'image/svg+xml'
  | 'application/pdf'
  | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  | 'application/vnd.oasis.opendocument.chart'
  | 'application/vnd.oasis.opendocument.spreadsheet'
  | 'application/vnd.oasis.opendocument.text'
  | 'application/vnd.oasis.opendocument.presentation';

/**
 * Text formatting options available for text content.
 * Represents common formatting attributes found in office documents (DOCX, RTF, PPTX, etc.).
 * All properties are optional and only present when the formatting is explicitly applied.
 */
export interface TextFormatting {
  /**
   * Whether the text is bold.
   * Corresponds to `<w:b/>` in OOXML (WordprocessingML), `\b` in RTF.
   * @example true for **bold text**, false or undefined for normal weight
   */
  bold?: boolean;

  /**
   * Whether the text is italic.
   * Corresponds to `<w:i/>` in OOXML (WordprocessingML), `\i` in RTF.
   * @example true for *italic text*, false or undefined for normal style
   */
  italic?: boolean;

  /**
   * Whether the text is underlined.
   * Corresponds to `<w:u/>` in OOXML (WordprocessingML), `\ul` in RTF.
   * @example true for underlined text, false or undefined for no underline
   */
  underline?: boolean;

  /**
   * Whether the text has a strikethrough.
   * Corresponds to `<w:strike/>` in OOXML (WordprocessingML), `\strike` in RTF.
   * @example true for ~~struck through~~ text
   */
  strikethrough?: boolean;

  /**
   * Text color in hex format (#RRGGBB).
   * Extracted from color tables in RTF or XML color attributes in OOXML.
   * @example "#ff0000" for red, "#00ff00" for green, "#0000ff" for blue
   */
  color?: string;

  /**
   * Background/highlight color in hex format (#RRGGBB).
   * Represents the background color or text highlighting.
   * @example "#ffff00" for yellow highlight, "#d3d3d3" for light gray
   */
  backgroundColor?: string;

  /**
   * Font size with units.
   * Most parsers append 'pt' (points), but ODF may use other units like 'in' (inches) or 'cm'.
   * @example "12pt" for 12pt, "14pt" for 14pt, "0.5in" for 0.5 inches
   */
  size?: string;

  /**
   * Font family/typeface name.
   * Extracted from font tables in RTF or font definitions in OOXML.
   * @example "Arial", "Times New Roman", "Calibri", "Ubuntu Mono"
   */
  font?: string;

  /**
   * Whether the text is subscript (e.g., H₂O).
   * Corresponds to `\sub` in RTF, `<w:vertAlign w:val="subscript"/>` in OOXML (WordprocessingML).
   * Mutually exclusive with superscript.
   * @example true for subscript text like H₂O
   */
  subscript?: boolean;

  /**
   * Whether the text is superscript (e.g., E=mc²).
   * Corresponds to `\super` in RTF, `<w:vertAlign w:val="superscript"/>` in OOXML (WordprocessingML).
   * Mutually exclusive with subscript.
   * @example true for superscript text like x²
   */
  superscript?: boolean;

  /**
   * The alignment of the text.
   * Common in spreadsheet cells or paragraph styles.
   * @example "center", "right"
   */
  alignment?: 'left' | 'center' | 'right' | 'justify';
}

/**
 * Metadata for a slide in PowerPoint.
 */
export interface SlideMetadata {
  /** The slide number (1-based). */
  slideNumber: number;

  /**
   * The unique ID of the note associated with this slide (if any).
   * @example "slide-note-1"
   */
  noteId?: string;

  /** The style of the slide. */
  style?: string;
}

/**
 * Metadata for a sheet in Excel.
 */
export interface SheetMetadata {
  /** The name of the sheet. */
  sheetName: string;

  /** The style of the sheet. */
  style?: string;
}

/**
 * Metadata for a heading.
 */
export interface HeadingMetadata {
  /** The heading level (e.g., 1 for H1). */
  level: number;

  /** The alignment of the heading. */
  alignment?: 'left' | 'center' | 'right' | 'justify';

  /** The style of the heading. */
  style?: string;
}

/**
 * Metadata for a paragraph.
 */
export interface ParagraphMetadata {
  /** The alignment of the paragraph. */
  alignment?: 'left' | 'center' | 'right' | 'justify';

  /** The style of the paragraph. */
  style?: string;
}

/**
 * Metadata for a list item.
 */
export interface ListMetadata {
  /**
   * The type of list: 'ordered' (numbered) or 'unordered' (bulleted).
   * @example 'ordered' for numbered lists, 'unordered' for bulleted lists
   */
  listType: 'ordered' | 'unordered';

  /**
   * The nesting level (indent level) of the list item, starting from 0.
   * @example 0 for top-level items, 1 for first nested level
   */
  indentation: number;

  /**
   * Text alignment of the list item.
   * @example 'left', 'center', 'right', 'justify'
   */
  alignment: 'left' | 'center' | 'right' | 'justify';

  /**
   * The list ID from the Word document's numbering definition.
   * Used to identify which list definition this item belongs to.
   * @example '1', '2' for different list definitions
   */
  listId: string;

  /**
   * The zero-based index of this item within its list.
   * Continues incrementing even across paragraph interruptions for the same listId.
   * @example 0, 1, 2, 3 for sequential list items
   */
  itemIndex: number;

  /**
   * The style name of the list item.
   * @example "ListParagraph"
   */
  style?: string;
}

/**
 * Metadata for a table cell (primarily used in Excel/spreadsheet parsing).
 * Contains positional information about where the cell appears in the table.
 */
export interface CellMetadata {
  /**
   * The row index of the cell (0-based).
   * @example 0 for the first row, 1 for the second row, etc.
   */
  row: number;

  /**
   * The column index of the cell (0-based).
   * @example 0 for column A, 1 for column B, etc.
   */
  col: number;

  /**
   * The number of rows this cell spans (merges).
   * @example 2 if the cell is merged with the one below it.
   */
  rowSpan?: number;

  /**
   * The number of columns this cell spans (merges).
   * @example 2 if the cell is merged with the one to its right.
   */
  colSpan?: number;

  /** The style of the cell. */
  style?: string;
}

/**
 * Metadata for a chart node in the document.
 * Links the chart node to its corresponding attachment in the attachments array.
 */
export interface ChartMetadata {
  /**
   * The name of the attachment that contains the actual chart data.
   * Use this to look up the full chart data from the attachments array.
   * @example "chart1.xml"
   */
  attachmentName: string;
}

/**
 * Metadata for an image node in the document.
 * Links the image node to its corresponding attachment in the attachments array.
 */
export interface ImageMetadata {
  /**
   * The name of the attachment that contains the actual image data.
   * Use this to look up the full image data from the attachments array.
   * @example "image1.png"
   */
  attachmentName: string;

  /**
   * Alt text (alternative text) describing the image.
   * Extracted from image properties in the document.
   * @example "Company logo"
   */
  altText?: string;
}

/**
 * Metadata for PDF page nodes.
 * Indicates which page of the PDF this content came from.
 */
export interface PageMetadata {
  /**
   * The page number (1-based) from the PDF document.
   * @example 1 for the first page, 2 for the second page, etc.
   */
  pageNumber: number;
}

/**
 * Metadata for text nodes that contain hyperlinks.
 * Used to track hyperlinks in text runs.
 */
export interface TextMetadata {
  /** Style name of the text */
  style?: string;

  /**
   * The hyperlink URL (for external links) or anchor reference (for internal links).
   * @example "https://example.com" or "#_Toc123456"
   */
  link?: string;

  /**
   * Type of hyperlink.
   * - 'internal': Link to a bookmark/anchor within the same document
   * - 'external': Link to an external URL
   */
  linkType?: 'internal' | 'external';
}

/**
 * Metadata for note nodes (footnotes/endnotes).
 * Used in ODT and DOCX files to track notes.
 */
export interface NoteMetadata {
  /**
   * Type of note: 'footnote' or 'endnote'.
   */
  noteType?: 'footnote' | 'endnote';

  /**
   * The unique ID of the note from the source document.
   * @example "1", "2"
   */
  noteId?: string;
}

/**
 * Union type for content metadata.
 */
export type ContentMetadata =
  | SlideMetadata
  | SheetMetadata
  | HeadingMetadata
  | ListMetadata
  | CellMetadata
  | ImageMetadata
  | ChartMetadata
  | PageMetadata
  | ParagraphMetadata
  | TextMetadata
  | NoteMetadata
  | undefined;

/**
 * Represents a node in the document content tree.
 * This is the core building block of the parsed document structure.
 * Content nodes can be nested to represent hierarchical document structures
 * (e.g., paragraphs containing text runs, tables containing rows, rows containing cells).
 *
 * @example
 * // A simple paragraph with formatted text
 * {
 *   type: 'paragraph',
 *   text: 'Hello world',
 *   children: [
 *     { type: 'text', text: 'Hello ', formatting: { bold: true } },
 *     { type: 'text', text: 'world', formatting: { italic: true } }
 *   ]
 * }
 *
 * @example
 * // A heading with metadata
 * {
 *   type: 'heading',
 *   text: 'Chapter 1',
 *   metadata: { level: 1 },
 *   children: [...]
 * }
 */
export interface OfficeContentNode {
  /**
   * The type of the node.
   * Determines how the node should be interpreted and rendered.
   * Common types: 'paragraph', 'heading', 'table', 'list', 'text', 'image', etc.
   */
  type: OfficeContentNodeType;

  /**
   * The complete text content of the node and all its children combined.
   * For container nodes (paragraph, heading), this is the concatenation of all child text.
   * For leaf nodes (text), this is the actual text content.
   * @example "Hello world" for a paragraph containing "Hello " and "world"
   */
  text?: string;

  /**
   * Child nodes that make up this node's content.
   * Used for hierarchical structures:
   * - Paragraphs contain text runs with different formatting
   * - Tables contain rows
   * - Rows contain cells
   * - Cells contain paragraphs
   * @example [{ type: 'text', text: 'Hello', formatting: { bold: true } }]
   */
  children?: OfficeContentNode[];

  /**
   * Text formatting applied to this node.
   * Only applicable to text-containing nodes.
   * For container nodes like paragraphs, formatting typically appears on child text nodes.
   * @example { bold: true, size: "12pt", font: "Arial" }
   */
  formatting?: TextFormatting;

  /**
   * Type-specific metadata providing additional context about the node.
   * The metadata structure depends on the node type:
   * - Headings: { level: 1 }
   * - Lists: { listType: 'ordered', indentation: 0 }
   * - Cells: { row: 0, col: 0 }
   * - Slides: { slideNumber: 1 }
   * @example { level: 1 } for a heading
   */
  metadata?: ContentMetadata;

  /**
   * The raw source content for this node.
   * - For XML-based formats (DOCX, XLSX, PPTX): contains the raw XML
   * - For RTF: contains the raw RTF markup
   * - For PDF: typically not available
   * Only populated when `config.includeRawContent` is true.
   * Useful for debugging or when you need access to format-specific features.
   * @example "<w:p><w:r><w:t>Hello</w:t></w:r></w:p>" for DOCX
   */
  rawContent?: string;
}

/**
 * Structured information extracted from a chart.
 */
export interface ChartData {
  /** Chart title (if any) */
  title?: string;

  /** X-axis title (for continuous or categorical axes) */
  xAxisTitle?: string;

  /** Y-axis title (for value or continuous axes) */
  yAxisTitle?: string;

  /**
   * Collections of data points.
   * For bar/line charts, each dataset is one 'line' or group of bars.
   * For pie charts, there is typically only one dataset.
   */
  dataSets: {
    /** Name of this data group (e.g., 'Sales 2023') */
    name?: string;

    /** Actual numeric or string values for this group */
    values: string[];

    /** Specific labels for each point in this dataset (if defined per point) */
    pointLabels: string[];
  }[];

  /**
   * Labels for the chart facets (e.g., 'Jan', 'Feb', 'Mar' on X-axis).
   * These typically correspond to the data points in each dataSet.
   */
  labels: string[];

  /** Every text node discovered in the chart XML (for keyword search/raw extraction) */
  rawTexts: string[];
}

/**
 * Represents an attachment extracted from the document (image, chart, etc.).
 * Attachments are binary resources embedded in the document.
 * Only populated when `config.extractAttachments` is true.
 *
 * @example
 * ```typescript
 * {
 *   type: 'image',
 *   mimeType: 'image/png',
 *   data: 'iVBORw0KGgoAAAANSUhEUgAA...', // Base64
 *   name: 'chart1.png',
 *   extension: 'png',
 *   ocrText: 'Sales Chart Q4 2024' // If OCR was enabled
 * }
 * ```
 */
export interface OfficeAttachment {
  /**
   * The category of the attachment.
   * Helps identify what kind of content this represents.
   * @example 'image' for photos and diagrams, 'chart' for embedded charts
   */
  type: 'image' | 'chart';

  /**
   * The MIME type of the attachment data.
   * Indicates the file format and how the data should be interpreted.
   * @example 'image/png', 'image/jpeg', 'image/svg+xml'
   */
  mimeType: OfficeMimeType;

  /**
   * The attachment content encoded as Base64.
   * This is the actual binary data of the image/chart/etc. encoded for text transmission.
   * Can be used directly in HTML img tags with data URIs or decoded to binary.
   * @example "iVBORw0KGgoAAAANSUhEUgAA..." (truncated)
   */
  data: string;

  /**
   * A unique name for this attachment file.
   * May be derived from the source file or auto-generated.
   * Used to link `ImageMetadata` nodes to their corresponding attachments.
   * @example "image1.png", "chart2.emf", "picture3.jpg"
   */
  name: string;

  /**
   * The file extension (without the dot).
   * Derived from the MIME type or original filename.
   * @example "png", "jpg", "svg"
   */
  extension: string;

  /**
   * Text extracted from the image using Optical Character Recognition (OCR).
   * Only present when:
   * - `config.ocr` is true
   * - `config.extractAttachments` is true
   * - The attachment is an image containing text
   * Uses Tesseract.js with the language specified in `config.ocrLanguage`.
   * @example "Annual Revenue: $1.2M"
   */
  ocrText?: string;

  /**
   * Alt text or description associated with the image in the document.
   * Extracted from the document markup (e.g., wp:docPr descr attribute in DOCX).
   * @example "A chart showing sales growth"
   */
  altText?: string;

  /**
   * Structured data extracted from a chart attachment.
   * Only present if the attachment is a chart and data extraction was successful.
   * Contains series names, values, labels, and titles.
   * @example { title: "Sales Chart", dataSets: [...], labels: [...], rawTexts: [...] }
   */
  chartData?: ChartData;
}

/**
 * Metadata for the parsed file.
 */
export interface OfficeMetadata {
  /** The title of the document. */
  title?: string;

  /** The author of the document. */
  author?: string;

  /** User who last modified the document. */
  lastModifiedBy?: string;

  /** Creation date. */
  created?: Date;

  /** Last modification date. */
  modified?: Date;

  /** Description/Comments. */
  description?: string;

  /** Subject/Topic. */
  subject?: string;

  /** Number of pages (if available). */
  pages?: number;

  /** Document-wide default formatting settings (font, size, color). */
  formatting?: Partial<TextFormatting>;

  /** Style map for styles in the document, keyed by style name. */
  styleMap?: Record<string, Partial<TextFormatting>>;
}

/**
 * The Abstract Syntax Tree (AST) returned by the parser.
 * This is the root data structure representing the entire parsed document.
 *
 * The AST provides a format-agnostic representation of the document that can be easily
 * processed, transformed, or converted to other formats. It preserves the document's
 * structure, content, formatting, and metadata while abstracting away format-specific details.
 *
 * @example
 * ```typescript
 * const ast = await OfficeParser.parseOffice('document.docx', {
 *   extractAttachments: true,
 *   includeRawContent: false
 * });
 *
 * console.log(ast.type); // 'docx'
 * console.log(ast.metadata.author); // 'John Doe'
 * console.log(ast.content.length); // Number of top-level content nodes
 * console.log(ast.toText()); // Plain text representation
 * ```
 */
export interface OfficeParserAST {
  /**
   * The type of the parsed file.
   * Indicates which parser was used and what format the input was in.
   * @example 'docx', 'xlsx', 'pptx', 'rtf', 'pdf', 'odt', 'odp', 'ods'
   */
  type: SupportedFileType;

  /**
   * Document metadata extracted from the file properties.
   * Includes information like author, title, creation date, etc.
   * Availability depends on the file format and whether metadata was present in the source.
   * @example { author: 'John Smith', title: 'Annual Report', created: new Date('2024-01-01') }
   */
  metadata: OfficeMetadata;

  /**
   * The hierarchical content structure of the document.
   * This is an array of top-level content nodes. Each node can have children, creating a tree.
   * For different file types:
   * - DOCX: Array of paragraphs, headings, tables, etc.
   * - XLSX: Array of sheets, each containing rows
   * - PPTX: Array of slides, each containing content nodes
   * - PDF: Array of pages, each containing paragraphs
   * @example [{ type: 'paragraph', text: 'Hello' }, { type: 'heading', text: 'Chapter 1' }]
   */
  content: OfficeContentNode[];

  /**
   * Attachments extracted from the document (images, charts, embedded files).
   * Only populated when `config.extractAttachments` is true.
   * Each attachment includes:
   * - Base64-encoded data
   * - MIME type
   * - Optional OCR text (if `config.ocr` is true)
   * @example [{ type: 'image', mimeType: 'image/png', data: 'base64...', name: 'image1.png' }]
   */
  attachments: OfficeAttachment[];

  /**
   * Converts the entire AST to plain text.
   * This method flattens the document structure and returns just the text content,
   * stripping out all formatting, metadata, and structure.
   *
   * The text is concatenated using the delimiter specified in `config.newlineDelimiter` (default: '\n').
   *
   * @returns A plain text representation of the document
   * @example
   * ```typescript
   * const text = ast.toText();
   * console.log(text); // "Hello world\nChapter 1\n..."
   * ```
   */
  toText(): string;
}