// file-discovery.mjs — Walk directory tree, filter, binary detection
// Zero dependencies (Node.js builtins only).

import { readdir, stat, readFile } from 'node:fs/promises';
|
|
import { join, relative, extname } from 'node:path';
|
|
|
|
// File-name extensions that are treated as scannable text.
// discoverFiles() looks entries up by the lower-cased result of path.extname().
// NOTE(review): extname() reports only the last dot segment ('.local' for
// '.env.local'), so the multi-dot entries below cannot be hit by an
// extname-based lookup on its own — confirm how dotenv variants should match.
const TEXT_EXTENSIONS = new Set([
  // JavaScript / TypeScript
  '.js', '.mjs', '.cjs', '.ts', '.mts', '.cts', '.jsx', '.tsx',
  // Python
  '.py', '.pyw',
  // Structured data and configuration
  '.json', '.jsonc', '.json5',
  '.yaml', '.yml',
  '.toml',
  // Documentation
  '.md', '.mdx',
  // Shell scripts
  '.sh', '.bash', '.zsh',
  // Environment files
  '.env', '.env.local', '.env.example',
  // Classic config formats
  '.cfg', '.ini', '.conf',
  // Markup
  '.xml', '.html', '.htm', '.svg',
  // Stylesheets
  '.css', '.scss', '.less',
  // Database
  '.sql',
  // Other programming languages
  '.rs', '.go', '.java', '.kt', '.cs', '.c', '.cpp', '.h', '.hpp',
  '.rb', '.php', '.lua', '.swift', '.m',
  // Plain text and tabular data
  '.txt', '.csv', '.log',
  // Lock files (yarn.lock and similar)
  '.lock',
  // Dockerfile variants, plus '' for names without an extension
  // (Dockerfile, Makefile, etc.)
  '.dockerfile', '',
]);

// Directory names that are never descended into during discovery.
const SKIP_DIRS = new Set([
  // Dependencies and version-control metadata
  'node_modules', '.git', '.hg', '.svn',
  // Python caches
  '__pycache__', '.pytest_cache', '.mypy_cache',
  // Build output
  'dist', 'build', '.next', '.nuxt',
  // Virtual environments
  '.venv', 'venv', 'env',
  // Coverage reports
  'coverage', '.nyc_output',
  // Tooling caches
  '.angular', '.cache',
]);

// Default size ceiling for a file to be read, in bytes: 2^19 = 524288 (512 KiB).
const MAX_FILE_SIZE = 2 ** 19;

// Dot-directories that are still walked even though dot-directories (and
// anything in SKIP_DIRS) are otherwise skipped.
const ALLOWED_DOT_DIRS = new Set(['.claude-plugin', '.github', '.claude']);

/**
 * Check whether a file name ends in one of the TEXT_EXTENSIONS entries.
 *
 * path.extname() only reports the last dot segment ('.local' for
 * '.env.local'), so multi-dot entries such as '.env.local' and '.env.example'
 * can never match an extname-based lookup. Every suffix that starts at a dot
 * is therefore tested as well.
 *
 * @param {string} lowerName - Lower-cased file name (no directory part)
 * @param {string} ext - Lower-cased result of path.extname() for the same name
 * @returns {boolean}
 */
function hasTextExtension(lowerName, ext) {
  if (TEXT_EXTENSIONS.has(ext)) return true;

  let dot = lowerName.indexOf('.');
  while (dot !== -1) {
    if (TEXT_EXTENSIONS.has(lowerName.slice(dot))) return true;
    dot = lowerName.indexOf('.', dot + 1);
  }
  return false;
}

/**
 * Discover all scannable files under a target path.
 *
 * Walks the tree depth-first. Directories listed in SKIP_DIRS and all
 * dot-directories are skipped, except '.claude-plugin', '.github' and
 * '.claude'. A file is kept when its name matches TEXT_EXTENSIONS (including
 * multi-dot entries such as '.env.local') or has no extension, and its size is
 * between 1 byte and maxFileSize inclusive.
 *
 * `skipped` counts files rejected by extension or size, files that could not
 * be stat'ed, and directories that could not be read. Skipped directories and
 * entries that are neither directories nor regular files are not counted.
 *
 * `truncated` is set as soon as the number of collected files reaches
 * maxFiles; the walk stops at that point.
 *
 * @param {string} targetPath - Absolute path to scan
 * @param {object} [opts]
 * @param {number} [opts.maxFiles=5000] - Stop after this many files (falsy values fall back to the default)
 * @param {number} [opts.maxFileSize=524288] - Skip files larger than this (falsy values fall back to the default)
 * @returns {Promise<{ files: FileInfo[], skipped: number, truncated: boolean }>}
 *
 * @typedef {{ absPath: string, relPath: string, ext: string, size: number }} FileInfo
 */
export async function discoverFiles(targetPath, opts = {}) {
  const maxFiles = opts.maxFiles || 5000;
  const maxFileSize = opts.maxFileSize || MAX_FILE_SIZE;
  const files = [];
  let skipped = 0;
  let truncated = false;

  async function walk(dir) {
    if (truncated) return;

    let entries;
    try {
      entries = await readdir(dir, { withFileTypes: true });
    } catch {
      // Best effort: an unreadable directory is counted and the walk goes on.
      skipped++;
      return;
    }

    for (const entry of entries) {
      if (truncated) return;
      const fullPath = join(dir, entry.name);

      if (entry.isDirectory()) {
        const isExcluded = SKIP_DIRS.has(entry.name) || entry.name.startsWith('.');
        if (isExcluded && !ALLOWED_DOT_DIRS.has(entry.name)) continue;
        await walk(fullPath);
        continue;
      }

      // Anything that is neither a directory nor a regular file is ignored.
      if (!entry.isFile()) continue;

      const ext = extname(entry.name).toLowerCase();
      const isKnownText = hasTextExtension(entry.name.toLowerCase(), ext);
      // Extensionless files (Dockerfile, Makefile, etc.). Dot-files such as
      // '.env' also have an empty extname and are accepted via the '' entry
      // in TEXT_EXTENSIONS.
      const isExtensionless = ext === '' && !entry.name.startsWith('.');

      if (!isKnownText && !isExtensionless) {
        skipped++;
        continue;
      }

      let fileSize;
      try {
        const st = await stat(fullPath);
        // Oversized and empty files are both skipped.
        if (st.size > maxFileSize || st.size === 0) {
          skipped++;
          continue;
        }
        fileSize = st.size;
      } catch {
        // The file vanished or is not accessible: count it and move on.
        skipped++;
        continue;
      }

      files.push({
        absPath: fullPath,
        relPath: relative(targetPath, fullPath),
        ext,
        size: fileSize,
      });

      if (files.length >= maxFiles) {
        truncated = true;
        return;
      }
    }
  }

  await walk(targetPath);
  return { files, skipped, truncated };
}

/**
 * Load a file and return its contents decoded as UTF-8.
 *
 * The file is considered binary when a NUL byte occurs within its first
 * 8192 bytes; in that case null is returned. null is also returned when the
 * file cannot be read.
 *
 * @param {string} absPath
 * @returns {Promise<string|null>}
 */
export async function readTextFile(absPath) {
  try {
    const bytes = await readFile(absPath);
    // Only the leading 8 KiB is sniffed; subarray() clamps to the buffer length.
    const head = bytes.subarray(0, 8192);
    return head.includes(0) ? null : bytes.toString('utf-8');
  } catch {
    return null;
  }
}