/**
 * Code-Aware Chunking Engine
 *
 * Implements heuristic-based code-aware chunking that splits code at semantic
 * boundaries (functions, classes, methods) rather than fixed character counts.
 *
 * Benefits:
 * - Chunks align with code structure (no mid-function splits)
 * - Search results are more coherent and complete
 * - Reduced overlap requirement (from 20% to ~5%)
 *
 * Supported languages (20+):
 * - TypeScript/JavaScript: function declarations, class declarations, exports
 * - Python: def statements, class statements, decorators
 * - Java: class, interface, enum, method declarations
 * - Go: func, type struct, type interface declarations
 * - Rust: fn, struct, enum, impl, trait, macro declarations
 * - C#: class, struct, interface, enum, method declarations
 * - C/C++: function definitions, struct, class, namespace
 * - Kotlin: fun, class, interface, object declarations
 * - Swift: func, class, struct, enum, protocol, extension
 * - Ruby: def, class, module declarations
 * - PHP: function, class, interface, trait declarations
 * - Scala: def, class, trait, object declarations
 * - Shell/Bash: function declarations
 * - CSS/SCSS/LESS: rule blocks, @media, @keyframes
 * - HTML: structural elements
 * - Vue/Svelte: template, script, style blocks
 * - SQL: CREATE, ALTER, SELECT statements
 * - YAML/JSON/XML: structural markers
 * - GraphQL: type, query, mutation, subscription
 * - Terraform/HCL: resource, variable, output blocks
 * - Dockerfile: FROM, RUN, COPY instructions
 *
 * Falls back to character-based chunking for unsupported languages or on errors.
 *
 * @module codeAwareChunking
 */
import type { ChunkWithLines } from './chunking.js';

/**
 * Language identifiers recognized by code-aware chunking
 * (29 identifiers, plus `'unknown'` for anything unrecognized).
 *
 * Tier 1 - High Priority:
 * - typescript, javascript, python (existing)
 * - java, go, rust, csharp, c, cpp, kotlin, swift
 *
 * Tier 2 - Medium Priority:
 * - ruby, php, scala, shell
 *
 * Tier 3 - Markup/Config:
 * - css, scss, less, html, vue, svelte, sql, yaml, json, xml, graphql
 *
 * Tier 4 - Infrastructure:
 * - terraform, hcl, dockerfile
 */
export type SupportedLanguage =
  | 'typescript'
  | 'javascript'
  | 'python'
  | 'java'
  | 'go'
  | 'rust'
  | 'csharp'
  | 'c'
  | 'cpp'
  | 'kotlin'
  | 'swift'
  | 'ruby'
  | 'php'
  | 'scala'
  | 'shell'
  | 'css'
  | 'scss'
  | 'less'
  | 'html'
  | 'vue'
  | 'svelte'
  | 'sql'
  | 'yaml'
  | 'json'
  | 'xml'
  | 'graphql'
  | 'terraform'
  | 'hcl'
  | 'dockerfile'
  | 'unknown';

/**
 * Configuration for code-aware chunking
 */
export interface CodeAwareChunkOptions {
  /** Target chunk size in characters (~4000 for ~1000 tokens) */
  chunkSize: number;
  /** Minimum overlap size in characters (~200 for ~50 tokens - reduced from 800) */
  chunkOverlap: number;
  /** Maximum chunk size before forcing a split */
  maxChunkSize: number;
}

/**
 * Default options for code-aware chunking
 *
 * Reduced overlap compared to character-based chunking since we're
 * splitting at semantic boundaries.
 */
export declare const DEFAULT_CODE_AWARE_OPTIONS: CodeAwareChunkOptions;

/**
 * Detect the programming language from a file path
 *
 * @param filePath - Path to the file (can be relative or absolute)
 * @returns The detected language or 'unknown'
 */
export declare function detectLanguage(filePath: string): SupportedLanguage;

/**
 * Split code at semantic boundaries with line number tracking
 *
 * Algorithm:
 * 1. Detect language from file path
 * 2. Find all semantic boundaries (functions, classes, etc.)
 * 3. Group boundaries into chunks that fit within the size limit
 * 4. If a single semantic unit exceeds maxChunkSize, split it at line boundaries
 * 5. Add minimal overlap at chunk boundaries
 *
 * @param text - Source code text
 * @param filePath - Path to the file (for language detection)
 * @param options - Chunking options; any subset of {@link CodeAwareChunkOptions}
 *   may be supplied
 * @returns Array of chunks with line number information
 */
export declare function splitCodeWithLineNumbers(
  text: string,
  filePath: string,
  options?: Partial<CodeAwareChunkOptions>,
): ChunkWithLines[];

/**
 * Check if a file can be processed with code-aware chunking
 *
 * @param filePath - Path to the file
 * @returns true if the file's language is supported
 */
export declare function supportsCodeAwareChunking(filePath: string): boolean;

/**
 * Get the language name for a file path
 *
 * @param filePath - Path to the file
 * @returns Human-readable language name
 */
export declare function getLanguageName(filePath: string): string;

/**
 * Get the human-readable display name for a language
 *
 * @param language - Language identifier
 * @returns Human-readable name
 */
export declare function getLanguageDisplayName(language: SupportedLanguage): string;

/**
 * Get a list of all supported languages
 *
 * @returns Array of supported language identifiers (excluding 'unknown')
 */
export declare function getSupportedLanguages(): SupportedLanguage[];
//# sourceMappingURL=codeAwareChunking.d.ts.map