// file-discovery.mjs — Walk directory tree, filter, binary detection
// Zero dependencies (Node.js builtins only).

import { readdir, stat, readFile } from 'node:fs/promises';
import { join, relative, extname } from 'node:path';

// Extensions we scan (text-based). Entries are compared against the
// lowercased result of path.extname(), which yields only the LAST ".xyz"
// component of a file name — multi-dot names such as ".env.local" can never
// match here and are handled by DOTENV_VARIANT below.
const TEXT_EXTENSIONS = new Set([
  '.js', '.mjs', '.cjs', '.ts', '.mts', '.cts', '.jsx', '.tsx',
  '.py', '.pyw',
  '.json', '.jsonc', '.json5',
  '.yaml', '.yml', '.toml',
  '.md', '.mdx',
  '.sh', '.bash', '.zsh',
  '.env',
  '.cfg', '.ini', '.conf',
  '.xml', '.html', '.htm', '.svg',
  '.css', '.scss', '.sass', '.less',
  '.glsl', '.frag', '.vert', '.shader', '.wgsl', // GPU shader source
  '.sql',
  '.rs', '.go', '.java', '.kt', '.cs', '.c', '.cpp', '.h', '.hpp',
  '.rb', '.php', '.lua', '.swift', '.m',
  '.txt', '.csv', '.log',
  '.lock', // yarn.lock, Cargo.lock, etc.
  '.dockerfile',
  '', // No extension: Dockerfile, Makefile, and bare dotfiles such as ".env"
]);

// Dotenv variants (".env.local", ".env.example", ".env.production", ...).
// extname() reports ".local" / ".example" for these, so they need a
// basename-level match instead of an extension lookup.
const DOTENV_VARIANT = /^\.env\..+$/i;

// Directories to always skip
const SKIP_DIRS = new Set([
  'node_modules', '.git', '.hg', '.svn',
  '__pycache__', '.pytest_cache', '.mypy_cache',
  'dist', 'build', '.next', '.nuxt',
  '.venv', 'venv', 'env',
  'coverage', '.nyc_output',
  '.angular', '.cache',
]);

// Dot-directories that are walked even though dot-directories are skipped by default
const ALLOWED_DOT_DIRS = new Set(['.claude-plugin', '.github', '.claude']);

// Max file size to read (512KB)
const MAX_FILE_SIZE = 512 * 1024;

/**
 * Decide whether a directory should be excluded from the walk.
 * @param {string} name - Directory base name
 * @returns {boolean}
 */
function isSkippedDir(name) {
  if (ALLOWED_DOT_DIRS.has(name)) return false;
  return SKIP_DIRS.has(name) || name.startsWith('.');
}

/**
 * Decide whether a file name looks like scannable text.
 * @param {string} name - File base name
 * @param {string} ext - Lowercased path.extname(name)
 * @returns {boolean}
 */
function isScannableName(name, ext) {
  return TEXT_EXTENSIONS.has(ext) || DOTENV_VARIANT.test(name);
}

/**
 * Discover all scannable files under a target path.
 *
 * Directories listed in SKIP_DIRS and dot-directories (other than the small
 * allow-list) are not entered. Entries that are neither regular files nor
 * directories are ignored. `skipped` counts unreadable directories plus files
 * rejected by name, by size (empty or over the limit), or by a failed stat;
 * directories excluded by name are not counted.
 *
 * @param {string} targetPath - Absolute path to scan
 * @param {object} [opts]
 * @param {number} [opts.maxFiles=5000] - Stop after this many files
 * @param {number} [opts.maxFileSize=524288] - Skip files larger than this
 * @returns {Promise<{ files: FileInfo[], skipped: number, truncated: boolean }>}
 *   `truncated` is true once `maxFiles` entries have been collected and the
 *   walk was stopped.
 *
 * @typedef {{ absPath: string, relPath: string, ext: string, size: number }} FileInfo
 */
export async function discoverFiles(targetPath, opts = {}) {
  // `||` is deliberate: any falsy value (undefined, 0, NaN) selects the default.
  const maxFiles = opts.maxFiles || 5000;
  const maxFileSize = opts.maxFileSize || MAX_FILE_SIZE;

  const files = [];
  let skipped = 0;
  let truncated = false;

  async function walk(dir) {
    if (truncated) return;

    let entries;
    try {
      entries = await readdir(dir, { withFileTypes: true });
    } catch {
      // Best effort: an unreadable directory is counted and the walk goes on.
      skipped++;
      return;
    }

    for (const entry of entries) {
      // A nested walk() may have hit the limit while we were awaiting it.
      if (truncated) return;

      const fullPath = join(dir, entry.name);

      if (entry.isDirectory()) {
        if (!isSkippedDir(entry.name)) {
          await walk(fullPath);
        }
        continue;
      }

      if (!entry.isFile()) continue;

      const ext = extname(entry.name).toLowerCase();
      if (!isScannableName(entry.name, ext)) {
        skipped++;
        continue;
      }

      let size;
      try {
        ({ size } = await stat(fullPath));
      } catch {
        // File vanished or is not stat-able; count it and move on.
        skipped++;
        continue;
      }

      if (size === 0 || size > maxFileSize) {
        skipped++;
        continue;
      }

      files.push({
        absPath: fullPath,
        relPath: relative(targetPath, fullPath),
        ext,
        size,
      });

      if (files.length >= maxFiles) {
        truncated = true;
        return;
      }
    }
  }

  await walk(targetPath);
  return { files, skipped, truncated };
}

/**
 * Read file content as UTF-8 string, with binary detection.
 * Returns null if the file appears to be binary (a NUL byte occurs within the
 * first 8KB) or if it cannot be read.
 * @param {string} absPath
 * @returns {Promise<string|null>}
 */
export async function readTextFile(absPath) {
  try {
    const buf = await readFile(absPath);
    // Quick binary check: look for null bytes in first 8KB
    if (buf.subarray(0, 8192).includes(0)) return null;
    return buf.toString('utf-8');
  } catch {
    // Read failures are reported as "no text" rather than thrown.
    return null;
  }
}