/**
 * Language Provider interface — the complete capability contract for a supported language.
 *
 * Each language implements this interface in a single file under `languages/`.
 * The pipeline accesses all per-language behavior through this interface.
 *
 * Design pattern: Strategy pattern with compile-time exhaustiveness.
 * The providers table in `languages/index.ts` uses `satisfies Record<SupportedLanguages, …>`
 * so adding a language to the enum without creating a provider is a compiler error.
 */
import type {
  SupportedLanguages,
  MroStrategy,
  CaptureMatch,
  BindingRef,
  TypeRef,
  Scope,
  ScopeId,
  ScopeKind,
  ScopeTree,
  ParsedImport,
  ParsedTypeBinding,
  SymbolDefinition,
  Callsite,
  WorkspaceIndex,
} from '../../_shared/index.js';
import type { LanguageTypeConfig } from './type-extractors/types.js';
import type { CallRouter } from './call-routing.js';
import type { CallExtractor } from './call-types.js';
import type { ClassExtractor } from './class-types.js';
import type { ExportChecker } from './export-detection.js';
import type { FieldExtractor } from './field-extractor.js';
import type { MethodExtractor } from './method-types.js';
import type { VariableExtractor } from './variable-types.js';
import type { ImportResolverFn } from './import-resolvers/types.js';
import type { SyntaxNode } from './utils/ast-helpers.js';
import type { NodeLabel } from '../../_shared/index.js';

/**
 * Tree-sitter query captures: capture name → AST node (or undefined if not captured).
 *
 * `undefined` is part of the value type on purpose: a capture name that a
 * given match did not produce must be narrowed before the node is used.
 */
export type CaptureMap = Record<string, SyntaxNode | undefined>;

/** Configuration for AST-based framework detection patterns. */
export interface AstFrameworkPatternConfig {
  /** Name of the framework this configuration detects. */
  framework: string;
  /**
   * Numeric multiplier associated with a detected entry point.
   * NOTE(review): presumably scales an entry-point score — confirm against the consumer.
   */
  entryPointMultiplier: number;
  /** NOTE(review): presumably a human-readable explanation for the match — confirm. */
  reason: string;
  /**
   * Pattern strings for this framework.
   * NOTE(review): what these are matched against is not visible from this file — confirm.
   */
  patterns: string[];
}

/**
 * Everything a language needs to provide.
 * Required fields must be explicitly set; optional fields have defaults
 * applied by defineLanguage().
*/ interface LanguageProviderConfig { readonly id: SupportedLanguages; /** File extensions that map to this language (e.g., ['.ts', '.tsx']) */ readonly extensions: readonly string[]; /** Entry-point function name patterns specific to this language. * Merged with universal patterns at runtime for process detection scoring. * Default: [] (only universal patterns apply). */ readonly entryPointPatterns?: readonly RegExp[]; /** AST-based framework detection patterns for this language. * Used by detectFrameworkFromAST to identify framework entry points. * Default: [] (no AST framework detection for this language). */ readonly astFrameworkPatterns?: readonly AstFrameworkPatternConfig[]; /** Parse strategy: 'tree-sitter' (default) uses AST parsing via tree-sitter. * 'standalone' means the language has its own regex-based processor and * should be skipped by the tree-sitter pipeline (e.g., COBOL, Markdown). */ readonly parseStrategy?: 'tree-sitter' | 'standalone'; /** Tree-sitter query strings for definitions, imports, calls, heritage. * Required for tree-sitter languages; empty string for standalone processors. */ readonly treeSitterQueries: string; /** * Optional source-text transform that runs **before** tree-sitter parses the file. * * Used to elide language constructs that confuse the grammar without affecting * source-position fidelity — e.g., Unreal Engine reflection macros (`UCLASS`, * `UFUNCTION`, `MODULENAME_API`) in C++ headers that prevent the parser from * recognising class/function names correctly. * * **Length / position preservation:** the returned string MUST have the same * JavaScript `.length` as the input AND preserve every newline (`\n`/`\r`) * position byte-for-byte. Implementations replace elided characters with * ASCII spaces while leaving newlines untouched. 
With this contract: * * - tree-sitter's reported `startPosition.row`/`startPosition.column` * match the original file exactly (line/column come from newline counts) * - `startIndex`/`endIndex` byte offsets match the original file exactly * **when the elided range is pure ASCII** (UTF-16 `.length` equals UTF-8 * byte length only for ASCII). * * Implementations targeting languages where elided ranges may contain * non-ASCII content must therefore preserve byte length, not just `.length`, * if downstream code uses `startIndex` to slice the original UTF-8 bytes. * The current C++ UE-macro preprocessor relies on the practical fact that * UE reflection macros and module-export tokens are ASCII-only. * * Must be a pure function — same input always yields the same output. Called * once per file, on every code path that re-parses (parsing-processor, import * processor, heritage processor, call processor, parse worker). * * Default: undefined (no preprocessing — `file.content` is parsed verbatim). */ readonly preprocessSource?: (sourceText: string, filePath: string) => string; /** Type extraction: declarations, initializers, for-loop bindings */ readonly typeConfig: LanguageTypeConfig; /** Export detection: is this AST node a public/exported symbol? */ readonly exportChecker: ExportChecker; /** Import resolution: resolves raw import path to file system path */ readonly importResolver: ImportResolverFn; /** Call routing for languages that express imports/heritage as calls (e.g., Ruby). * Default: no routing (all calls are normal call expressions). */ readonly callRouter?: CallRouter; /** Language-specific transformation of raw import path text before resolution. * Called after sanitization. E.g., Kotlin appends wildcard suffixes. * Default: undefined (no preprocessing). */ readonly importPathPreprocessor?: (cleaned: string, importNode: SyntaxNode) => string; /** Resolve a container node during enclosing-owner tree walks. 
* Called when a CLASS_CONTAINER_TYPES node is found while walking up. * - Return a different SyntaxNode to remap the container (e.g., Ruby * singleton_class → enclosing class/module). * - Return null to skip this container and keep walking up. * - Omit (undefined) to use the container node as-is (default). * Default: undefined (no remapping). */ readonly resolveEnclosingOwner?: (node: SyntaxNode) => SyntaxNode | null; /** Resolve the enclosing function name + label from an AST ancestor node * that is NOT a standard FUNCTION_NODE_TYPE. For languages where the * function body is a sibling of the signature (e.g. Dart: function_body ↔ * function_signature are siblings under program/class_body), the default * parent walk cannot find the enclosing function. This hook lets the * language provider inspect each ancestor and return the resolved result. * Return null to continue the default walk. * Default: undefined (standard parent walk only). */ readonly enclosingFunctionFinder?: (ancestorNode: SyntaxNode) => { funcName: string; label: NodeLabel; } | null; /** * Extract a per-language template-constraint payload for a templated * function / method definition. Used by `parsing-processor` to * disambiguate same-name same-arity overloads whose distinguishing * signal is their template constraints rather than their parameter * types — the canonical C++ SFINAE case (issue #1579): * * template, int> = 0> * void process(T); // overload A * * template, int> = 0> * void process(T); // overload B * * Both overloads' `parameterTypes` collapse to `['T']`, so without a * constraint fingerprint in the graph node ID they merge into one * Function node and the resolver only ever sees one candidate to * narrow. The hook's return value is stamped onto the node's ID via * `templateConstraintsIdTag()` AND stored on the node's * `templateConstraints` property so `resolveDefGraphId` can look up * the right overload by re-hashing the def's constraints at resolve * time. 
* * Returns the opaque payload (any JSON-serializable shape — the * producing adapter owns it; shared code MUST NOT inspect) or * `undefined` when no constraints exist / the node isn't a templated * function. Languages without SFINAE / concept semantics leave this * undefined and the disambiguation is a pass-through. */ readonly extractTemplateConstraints?: (definitionNode: SyntaxNode) => unknown; /** Override the default node label for definition.function captures. * Return null to skip (C/C++ duplicate), a different label to reclassify * (e.g., 'Method' for Kotlin), or defaultLabel to keep as-is. * Default: undefined (standard label assignment). */ readonly labelOverride?: (functionNode: SyntaxNode, defaultLabel: NodeLabel) => NodeLabel | null; /** MRO strategy for multiple inheritance resolution. * Default: 'first-wins'. */ readonly mroStrategy?: MroStrategy; /** Call extractor for extracting call site information (calledName, callForm, * receiverName, argCount, mixed chains) from @call / @call.name captures. * Produced by createCallExtractor() with a per-language CallExtractionConfig. * Default: undefined — if unset, no calls are extracted for this language. * All tree-sitter providers MUST supply this. */ readonly callExtractor?: CallExtractor; /** Field extractor for extracting field/property definitions from class/struct * declarations. Produces FieldInfo[] with name, type, visibility, static, * readonly metadata. Default: undefined (no field extraction). */ readonly fieldExtractor?: FieldExtractor; /** Method extractor for extracting method/function definitions from class/struct/interface * declarations. Produces MethodInfo[] with name, parameters, visibility, isAbstract, * isFinal, annotations metadata. Default: undefined (no method extraction). */ readonly methodExtractor?: MethodExtractor; /** Variable extractor for extracting metadata from module/file-scoped variable, * constant, and static declarations. 
Produces VariableInfo with type, visibility, * isConst, isStatic, isMutable metadata. Default: undefined (no variable extraction). */ readonly variableExtractor?: VariableExtractor; /** Class/type extractor for deriving canonical qualified names for class-like symbols. * Uses the same provider-driven strategy pattern as method/field extraction so * namespace/package/module rules stay language-specific. */ readonly classExtractor?: ClassExtractor; /** Extract a semantic description for a definition node (e.g., PHP Eloquent * property arrays, relation method descriptions). * Default: undefined (no description extraction). */ readonly descriptionExtractor?: (nodeLabel: NodeLabel, nodeName: string, captureMap: CaptureMap) => string | undefined; /** Detect if a file contains framework route definitions (e.g., Laravel routes.php). * When true, the worker extracts routes via the language's route extraction logic. * Default: undefined (no route files). */ readonly isRouteFile?: (filePath: string) => boolean; /** Built-in/stdlib names that should be filtered from the call graph for this language. * Default: undefined (no language-specific filtering). */ readonly builtInNames?: ReadonlySet; /** * Emit scope captures from raw source, **pre-grouped per tree-sitter * query match**. Tree-sitter-based providers run a scope query * (embedded as a string constant in each language's `query.ts`) and * emit one `CaptureMatch` per query match; standalone providers * (COBOL) emit matches from a regex tagger. The return shape is * parser-agnostic: the central `ScopeExtractor` consumes * `CaptureMatch[]` without knowing which parser produced them. * * **Pre-grouping is the provider's job.** The extractor expects each * `CaptureMatch` to correspond to one logical match — e.g., an import * statement match carries `@import.statement` + `@import.source` + * `@import.name` keyed under their capture names. 
Providers MUST * preserve the tree-sitter match boundaries so the extractor's topic * routing (scope / declaration / import / type-binding / reference) * lands on coherent records. * * Required for any provider participating in scope-based resolution * (the sole resolution path). * * **Sync return.** Tree-sitter query execution and COBOL's regex * tagger are both synchronous; no current or foreseeable provider * needs async work inside this hook. The sync signature lets * `parse-worker.ts` (#920) invoke it inline in its already-sync * per-file loop without cascading `async` through the batch pipeline. * * Default: undefined (no scope-based captures emitted for this language). */ readonly emitScopeCaptures?: (sourceText: string, filePath: string, /** * Optional pre-parsed tree-sitter Tree the caller has already * produced (e.g. from the parse phase's AST cache). When supplied, * the provider SHOULD skip its own `parser.parse(sourceText)` and * run its capture query against the supplied tree directly. Typed * as `unknown` here to avoid leaking the tree-sitter dependency * into the provider contract — the provider casts at use site. * Cache miss (parameter omitted or undefined) is always safe and * MUST trigger a fresh parse. */ cachedTree?: unknown, /** * Optional metadata about how `sourceText` was produced. * * Most providers ignore this and treat `sourceText` as full file content. * Vue uses it to distinguish: * - `full-file`: full `.vue` SFC source * - `pre-extracted-script`: worker-preprocessed bare `