ktg-plugin-marketplace/plugins/llm-security/scanners/lib/file-discovery.mjs

145 lines
4.1 KiB
JavaScript

// file-discovery.mjs — Walk directory tree, filter, binary detection
// Zero dependencies (Node.js builtins only).
import { readdir, stat, readFile } from 'node:fs/promises';
import { join, relative, extname } from 'node:path';
// Allow-list of text-based file extensions considered scannable.
// Entries are lowercase and include the leading dot, i.e. the shape produced
// by path.extname(name).toLowerCase().
const TEXT_EXTENSIONS = new Set([
  // JavaScript / TypeScript
  '.js', '.mjs', '.cjs', '.ts', '.mts', '.cts', '.jsx', '.tsx',
  // Python
  '.py', '.pyw',
  // Structured data and configuration
  '.json', '.jsonc', '.json5',
  '.yaml', '.yml',
  '.toml',
  // Documentation
  '.md', '.mdx',
  // Shell scripts
  '.sh', '.bash', '.zsh',
  // Environment files. Note: path.extname('.env.local') is '.local' and
  // path.extname('.env') is '', so the multi-dot entries can only match
  // when a caller looks up the whole file name rather than the extension.
  '.env', '.env.local', '.env.example',
  '.cfg', '.ini', '.conf',
  // Markup and stylesheets
  '.xml', '.html', '.htm', '.svg',
  '.css', '.scss', '.less',
  '.sql',
  // Other programming languages
  '.rs', '.go', '.java', '.kt', '.cs', '.c', '.cpp', '.h', '.hpp',
  '.rb', '.php', '.lua', '.swift', '.m',
  // Plain text
  '.txt', '.csv', '.log',
  // Lock files such as yarn.lock or Cargo.lock
  '.lock',
  // Dockerfile variants; '' is what extname() yields for names without an
  // extension (Dockerfile, Makefile, ...)
  '.dockerfile', '',
]);
// Directory names that are never descended into, matched against the bare
// directory name (not the full path).
const SKIP_DIRS = new Set([
  // Dependencies and version-control metadata
  'node_modules', '.git', '.hg', '.svn',
  // Python caches
  '__pycache__', '.pytest_cache', '.mypy_cache',
  // Build output
  'dist', 'build', '.next', '.nuxt',
  // Python virtual environments
  '.venv', 'venv', 'env',
  // Coverage reports
  'coverage', '.nyc_output',
  // Tooling caches
  '.angular', '.cache',
]);
// Default upper bound, in bytes, on the size of a file to read: 512 KiB.
const MAX_FILE_SIZE = 524_288;
/**
 * Discover all scannable files under a target path.
 *
 * Recursively walks `targetPath`, skipping directories listed in SKIP_DIRS and
 * every dot directory except `.claude-plugin`, `.github` and `.claude`.
 * A regular file is kept when its lowercased extension OR its lowercased
 * whole name is in TEXT_EXTENSIONS (the latter is what lets multi-dot names
 * such as `.env.local` match, since extname() only returns `.local` for
 * them), or when it has no extension and is not a dotfile. Empty files and
 * files larger than `maxFileSize` are skipped.
 *
 * `skipped` counts unreadable directories, files rejected by the name filter,
 * files rejected by the size filter, and files that could not be stat'ed.
 * Skipped directories and entries that are neither directories nor regular
 * files are not counted.
 *
 * @param {string} targetPath - Absolute path to scan
 * @param {object} [opts]
 * @param {number} [opts.maxFiles=5000] - Stop after this many files
 *   (falsy values, including 0, fall back to the default)
 * @param {number} [opts.maxFileSize=524288] - Skip files larger than this
 *   (falsy values, including 0, fall back to the default)
 * @returns {Promise<{ files: FileInfo[], skipped: number, truncated: boolean }>}
 *   `truncated` is true once `maxFiles` files have been collected.
 *
 * @typedef {{ absPath: string, relPath: string, ext: string, size: number }} FileInfo
 */
export async function discoverFiles(targetPath, opts = {}) {
  const maxFiles = opts.maxFiles || 5000;
  const maxFileSize = opts.maxFileSize || MAX_FILE_SIZE;
  // Dot directories that are walked despite the blanket dot-directory exclusion.
  const allowedDotDirs = new Set(['.claude-plugin', '.github', '.claude']);
  const files = [];
  let skipped = 0;
  let truncated = false;

  async function walk(dir) {
    if (truncated) return;

    let entries;
    try {
      entries = await readdir(dir, { withFileTypes: true });
    } catch {
      // Best effort: a directory that cannot be listed is counted and ignored.
      skipped++;
      return;
    }

    for (const entry of entries) {
      if (truncated) return;
      const { name } = entry;
      const fullPath = join(dir, name);

      if (entry.isDirectory()) {
        const excluded = SKIP_DIRS.has(name) || name.startsWith('.');
        if (excluded && !allowedDotDirs.has(name)) continue;
        await walk(fullPath);
        continue;
      }

      // Anything that is neither a directory nor a regular file is ignored.
      if (!entry.isFile()) continue;

      const ext = extname(name).toLowerCase();
      // extname('.env.local') is '.local', so an extension-only lookup can
      // never hit the multi-dot entries; check the whole file name as well.
      const isKnownText =
        TEXT_EXTENSIONS.has(ext) || TEXT_EXTENSIONS.has(name.toLowerCase());
      // Extensionless, non-dot files (Dockerfile, Makefile, etc.)
      const isExtensionless = ext === '' && !name.startsWith('.');
      if (!isKnownText && !isExtensionless) {
        skipped++;
        continue;
      }

      let size;
      try {
        ({ size } = await stat(fullPath));
      } catch {
        // File vanished or is unreadable between readdir and stat.
        skipped++;
        continue;
      }
      if (size === 0 || size > maxFileSize) {
        skipped++;
        continue;
      }

      files.push({
        absPath: fullPath,
        relPath: relative(targetPath, fullPath),
        // Always the extname() result, even when the whole name matched.
        ext,
        size,
      });
      if (files.length >= maxFiles) {
        truncated = true;
        return;
      }
    }
  }

  await walk(targetPath);
  return { files, skipped, truncated };
}
/**
 * Load a file and decode it as UTF-8, rejecting content that looks binary.
 * A file counts as binary when a NUL byte appears within its first 8 KiB.
 * Any failure while reading or decoding is reported as null as well.
 * @param {string} absPath
 * @returns {Promise<string|null>} Decoded text, or null for binary or unreadable files
 */
export async function readTextFile(absPath) {
  try {
    const raw = await readFile(absPath);
    // Only the leading 8192 bytes are inspected; subarray clamps to the buffer length.
    const looksBinary = raw.subarray(0, 8192).includes(0);
    return looksBinary ? null : raw.toString('utf-8');
  } catch {
    return null;
  }
}