/** * Source File Filter — Deduplicates source files by detecting build artifacts. * * Problem: When scanning a codebase, we encounter both source files and their * compiled/transpiled outputs (TypeScript → JavaScript, Vue → JavaScript, etc.). * Scanning both wastes time and produces duplicate findings. * * Solution: For each file, check if a "higher precedence" source sibling exists. * If yes, skip the file as a build artifact. If no, keep it as hand-written source. * * Supported ecosystems: * - TypeScript: .ts shadows .js, .tsx shadows .jsx * - Vue/Svelte: .vue/.svelte shadows .js * - CoffeeScript: .coffee shadows .js * * Files without higher-precedence siblings are kept only when they do not look * generated/codegen-produced (hand-written JS, Python, Go, Rust, etc.). */ import * as fs from "node:fs"; import * as path from "node:path"; import { getProjectIgnoreMatcher, isExcludedDirName } from "./file-utils.js"; import { isDeclarationFile, isGeneratedArtifactDirectoryName, isGeneratedOrArtifact, } from "./generated-artifacts.js"; /** * Mapping of file extension to the extensions it shadows (build artifacts). * Order matters: first entry has highest precedence. */ export const SOURCE_PRECEDENCE: Record = { ".ts": [".js", ".mjs", ".cjs"], ".tsx": [".jsx", ".js", ".mjs", ".cjs"], ".vue": [".js", ".mjs"], ".svelte": [".js", ".mjs"], ".coffee": [".js"], }; /** * All extensions that could be source or artifacts, in precedence order. */ export const ALL_SCANNABLE_EXTENSIONS = [ ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs", ".vue", ".svelte", ".coffee", ".py", ".go", ".rs", ".rb", ".rake", ".gemspec", ".ru", ]; export interface SourceCollectionOptions { /** Additional directory names to exclude (merged with defaults) */ excludeDirs?: string[]; /** File extensions to consider (defaults to ALL_SCANNABLE_EXTENSIONS) */ extensions?: string[]; /** Whether to follow symlinks (default: false) */ followSymlinks?: boolean; /** Keep generated/codegen files instead of filtering them (default: false) */ includeGenerated?: boolean; /** Keep declaration stubs such as .d.ts (default: false) */ includeDeclarationFiles?: boolean; /** Inspect a small header prefix for generated-code banners (default: true) */ inspectGeneratedHeaders?: boolean; } function shouldSkipGeneratedOrArtifact( filePath: string, options?: Pick< SourceCollectionOptions, "includeGenerated" | "includeDeclarationFiles" | "inspectGeneratedHeaders" >, ): boolean { const includeDeclarations = options?.includeDeclarationFiles === true; if (options?.includeGenerated === true) { return !includeDeclarations && isDeclarationFile(filePath); } return isGeneratedOrArtifact(filePath, { readContentHeader: options?.inspectGeneratedHeaders !== false, includeDeclarations: !includeDeclarations, }); } /** * Extract the basename (filename without extension) from a path. */ function getBasename(filePath: string): string { const ext = path.extname(filePath); return path.basename(filePath, ext); } /** * Get the directory of a file path. */ function getDir(filePath: string): string { return path.dirname(filePath); } /** * Check if a file has a higher-precedence source sibling. * Returns the shadowing source file path if found, null otherwise. */ export function findSourceSibling(filePath: string): string | null { const ext = path.extname(filePath).toLowerCase(); const dir = getDir(filePath); const base = getBasename(filePath); // Find which precedence group this extension belongs to for (const [sourceExt, shadowedExts] of Object.entries(SOURCE_PRECEDENCE)) { if (shadowedExts.includes(ext)) { // This file could be shadowed by a source file with sourceExt const siblingPath = path.join(dir, base + sourceExt); if (fs.existsSync(siblingPath)) { return siblingPath; } } } return null; } /** * Check if a file is a build artifact (has a source sibling). */ export function isBuildArtifact(filePath: string): boolean { return findSourceSibling(filePath) !== null; } /** * Filter a list of files, removing build artifacts that have source siblings * plus likely generated/codegen artifacts. * Returns de-duplicated list keeping only highest-precedence source files. */ export function filterSourceFiles( filePaths: string[], options?: Pick< SourceCollectionOptions, "includeGenerated" | "includeDeclarationFiles" | "inspectGeneratedHeaders" >, ): string[] { // Track which files we're keeping and why we're skipping others const keep: string[] = []; const skipReasons = new Map(); // skipped file -> kept source for (const filePath of filePaths) { const sourceSibling = findSourceSibling(filePath); if (sourceSibling) { // This is a build artifact, skip it skipReasons.set(filePath, sourceSibling); } else if (shouldSkipGeneratedOrArtifact(filePath, options)) { // Generated/codegen outputs are not hand-written source. skipReasons.set(filePath, "generated-or-artifact"); } else { // No higher-precedence source, keep it keep.push(filePath); } } return keep; } /** * Recursively collect all source files in a directory, excluding build artifacts * and likely generated/codegen artifacts. * * @param dir - Directory to scan * @param options - Optional configuration * @returns Array of absolute file paths that are source files (not artifacts) */ export function collectSourceFiles( dir: string, options?: SourceCollectionOptions, ): string[] { const rootDir = path.resolve(dir); const ignoreMatcher = getProjectIgnoreMatcher(rootDir); const extraExcludePatterns = options?.excludeDirs ?? []; const extensions = new Set(options?.extensions || ALL_SCANNABLE_EXTENSIONS); const files: string[] = []; function scan(currentDir: string) { let entries: fs.Dirent[] = []; try { entries = fs.readdirSync(currentDir, { withFileTypes: true }); } catch { return; // Permission denied or directory doesn't exist } for (const entry of entries) { const fullPath = path.join(currentDir, entry.name); if (entry.isDirectory()) { if (isExcludedDirName(entry.name, extraExcludePatterns)) continue; if (ignoreMatcher.isIgnored(fullPath, true)) continue; if ( options?.includeGenerated !== true && isGeneratedArtifactDirectoryName(entry.name) ) { continue; } if (!options?.followSymlinks && entry.isSymbolicLink()) continue; scan(fullPath); } else if (entry.isFile()) { if (ignoreMatcher.isIgnored(fullPath, false)) continue; const ext = path.extname(entry.name).toLowerCase(); if (!extensions.has(ext)) continue; // Skip if this is a build artifact or generated/codegen output. if (isBuildArtifact(fullPath)) continue; if (shouldSkipGeneratedOrArtifact(fullPath, options)) continue; files.push(fullPath); } } } scan(rootDir); return files; } /** * Get statistics about source file filtering for debugging/monitoring. */ export function getFilterStats( allFiles: string[], filteredFiles: string[], ): { total: number; kept: number; skipped: number; byType: Record; } { const skipped = allFiles.length - filteredFiles.length; const byType: Record = {}; // Count what we skipped for (const file of allFiles) { if (!filteredFiles.includes(file)) { const ext = path.extname(file).toLowerCase(); byType[ext] = (byType[ext] || 0) + 1; } } return { total: allFiles.length, kept: filteredFiles.length, skipped, byType, }; }