/**
 * @license
 * Copyright 2025 Steven Roussey
 * SPDX-License-Identifier: Apache-2.0
 */
import { DocumentRootNode } from "@workglow/knowledge-base";
import type { IRunConfig, TaskConfig } from "@workglow/task-graph";
import { CreateWorkflow, IExecuteContext, Task } from "@workglow/task-graph";
import { DataPortSchema } from "@workglow/util/schema";
import type { Capability } from "../capability/Capabilities";
import type { ModelConfig } from "../model/ModelSchema";

/**
 * Input accepted by {@link HierarchicalChunkerTask}.
 *
 * The loosely-typed `documentTree` record is swapped out (via `Omit` + an
 * intersection) for the concrete {@link DocumentRootNode} type; every other
 * field keeps its declared shape.
 */
export type HierarchicalChunkerTaskInput = Omit<
  {
    model?: string | ModelConfig | undefined;
    maxTokens?: number | undefined;
    overlap?: number | undefined;
    reservedTokens?: number | undefined;
    strategy?: "flat" | "hierarchical" | "sentence" | undefined;
    doc_id: string;
    documentTree: {
      [x: string]: unknown;
    };
  },
  "documentTree"
> & {
  documentTree: DocumentRootNode;
};

/**
 * Result produced by {@link HierarchicalChunkerTask}: the chunk texts, the
 * originating `doc_id`, the per-chunk records, and a `count`.
 */
export type HierarchicalChunkerTaskOutput = {
  text: string[];
  doc_id: string;
  chunks: {
    [x: string]: unknown;
    leafNodeId?: string | undefined;
    summary?: string | undefined;
    entities?:
      | {
          type: string;
          text: string;
          score: number;
        }[]
      | undefined;
    parentSummaries?: string[] | undefined;
    sectionTitles?: string[] | undefined;
    doc_title?: string | undefined;
    text: string;
    doc_id: string;
    chunkId: string;
    nodePath: string[];
    depth: number;
  }[];
  count: number;
};

/** Configuration for {@link HierarchicalChunkerTask}; currently just the base {@link TaskConfig}. */
export type HierarchicalChunkerTaskConfig = TaskConfig;

/**
 * Task for hierarchical chunking that respects token budgets and document structure.
 * Pass a `model` in the input to use {@link CountTokensTask} for accurate token
 * counting; when omitted, the task falls back to the character-based estimate
 * provided by {@link estimateTokens}.
 *
 * NOTE(review): `Task` here (and `CreateWorkflow` in the module augmentation at
 * the bottom of this file) appear without type arguments. They presumably lost
 * them the same way `Promise` and `Partial` did — confirm against the `.ts`
 * source / regenerate this declaration file rather than hand-editing further.
 */
export declare class HierarchicalChunkerTask extends Task {
  static type: string;
  /** Pure-compute chunking task — no provider capability required. */
  static readonly requires: readonly Capability[];
  static category: string;
  static title: string;
  static description: string;
  static cacheable: boolean;
  static inputSchema(): DataPortSchema;
  static outputSchema(): DataPortSchema;
  /**
   * Runs the chunker over `input.documentTree`.
   *
   * @param input - Chunking parameters plus the document tree to split.
   * @param context - Execution context supplied by the task runner.
   * @returns A promise resolving to the chunk texts and chunk records.
   */
  execute(
    input: HierarchicalChunkerTaskInput,
    context: IExecuteContext
  ): Promise<HierarchicalChunkerTaskOutput>;
  /**
   * Hierarchical chunking that respects document structure.
   *
   * `headingPath` accumulates the titles of ancestor *section* nodes only
   * (the document root's title is intentionally excluded), so leaves can be
   * prefixed with a `"Section > Subsection"` breadcrumb. This is a well-known
   * RAG win: the embedding picks up the topical keywords from the heading
   * hierarchy, and the LLM in the answer phase doesn't have to guess what
   * the chunk is about — both effects matter most on small models, which
   * otherwise pad with plausible-sounding but unsupported claims.
   *
   * The document root title is deliberately omitted: it's already carried
   * on the document's `metadata.title` (and surfaces to the chat layer as
   * the reference label), and including it inside chunk text encourages
   * small models to synthesize fake URLs by pattern-matching the brand
   * name onto the section slug — e.g. a doc titled "Workglow" plus a
   * "Secret Management" section produced fabricated links like
   * `https://workglow.com/help/secret-management`.
   */
  private chunkHierarchically;
  /**
   * Chunk a single text string, using countFn for token counting.
   * countFn always returns a number -- it falls back to estimation internally
   * when no real tokenizer is available.
   *
   * `headingPath` is prepended to each emitted chunk as a `"A > B > C\n\n"`
   * breadcrumb and its tokens are charged against the chunk budget so the
   * combined chunk (prefix + body) still fits `maxTokensPerChunk -
   * reservedTokens`. If charging the prefix would leave no room for any
   * body, the prefix is dropped for this leaf so we always make progress.
   */
  private chunkText;
  /**
   * Flat chunking (ignores hierarchy). The document title is intentionally
   * NOT prepended here — see the rationale on `chunkHierarchically`. The
   * doc title rides along on `metadata.title` instead, which is what the
   * chat layer surfaces as the reference label.
   */
  private chunkFlat;
  private collectAllText;
}

/**
 * Convenience function form of {@link HierarchicalChunkerTask}.
 *
 * @param input - Chunking parameters plus the document tree to split.
 * @param config - Optional task configuration.
 * @param runConfig - Optional partial run configuration.
 * @returns A promise resolving to the chunker output.
 */
export declare const hierarchicalChunker: (
  input: HierarchicalChunkerTaskInput,
  config?: HierarchicalChunkerTaskConfig,
  runConfig?: Partial<IRunConfig>
) => Promise<HierarchicalChunkerTaskOutput>;

declare module "@workglow/task-graph" {
  interface Workflow {
    hierarchicalChunker: CreateWorkflow;
  }
}
//# sourceMappingURL=HierarchicalChunkerTask.d.ts.map