ktg-plugin-marketplace/plugins/llm-security/scanners/git-forensics.mjs

743 lines
26 KiB
JavaScript

// git-forensics.mjs — Deterministic git history forensics scanner
// Detects supply chain rug pull signals: force pushes, description drift,
// hook modifications, new outbound URLs, author changes, binary additions,
// and suspicious commit patterns.
//
// Zero external dependencies — Node.js builtins only.
// OWASP coverage: LLM03 (Supply Chain)
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { levenshtein } from './lib/string-utils.mjs';
import { execFileSync, execSync } from 'node:child_process';
import { existsSync } from 'node:fs';
import { join } from 'node:path';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
const MAX_COMMITS = 500;
const GIT_TIMEOUT_MS = 15000;
const MAX_DRIFT_FILES = 20;
/** Domains strongly associated with exfiltration or ephemeral endpoints */
const SUSPICIOUS_DOMAINS = [
'webhook.site',
'requestbin',
'ngrok',
'ngrok.io',
'pipedream.net',
'pastebin.com',
'hastebin.com',
'beeceptor.com',
'hookbin.com',
'httpbin.org',
'canarytokens.com',
];
/** Binary file extensions unusual in a plugin/package repo */
const BINARY_EXTENSIONS = new Set([
'.exe', '.dll', '.so', '.dylib', '.bin', '.dat',
'.wasm', '.node',
]);
/** Network-access patterns in source code (hooks/scripts concern) */
const NETWORK_PATTERNS = /\b(fetch|http|https|curl|wget|dns\.lookup|net\.connect|XMLHttpRequest|axios|got)\b/i;
// ---------------------------------------------------------------------------
// Helper: run a git command with standard options
// ---------------------------------------------------------------------------
/**
 * Run a git subcommand in the target directory and return its stdout.
 *
 * The command text is appended to `git ` and executed through the system
 * shell, so `cmd` must never contain untrusted text (file names, commit
 * messages, ...) unless it has been made shell-safe by the caller.
 *
 * @param {string} cmd - Git arguments as a single string, WITHOUT the leading `git`
 * @param {string} cwd - Working directory the command runs in
 * @returns {string} - stdout decoded as UTF-8, with surrounding whitespace trimmed
 * @throws - On non-zero exit, timeout (GIT_TIMEOUT_MS), or output exceeding maxBuffer
 */
function git(cmd, cwd) {
  return execSync(`git ${cmd}`, {
    cwd,
    timeout: GIT_TIMEOUT_MS,
    encoding: 'utf-8',
    // The execSync default (1 MiB) is easily exceeded by `git log -p` or a
    // tree-wide `git grep`; overflowing it throws, and the callers' catch
    // blocks would then silently drop the whole detection category.
    maxBuffer: 64 * 1024 * 1024,
    stdio: ['pipe', 'pipe', 'pipe'],
  }).trim();
}
// ---------------------------------------------------------------------------
// Git repo detection
// ---------------------------------------------------------------------------
/**
 * Report whether targetPath belongs to a git repository.
 *
 * A `.git` entry directly under targetPath is accepted immediately; otherwise
 * `git rev-parse --git-dir` is asked, which also succeeds when targetPath is a
 * subdirectory of a repository. Any failure of that command means "no".
 *
 * @param {string} targetPath - Directory to probe
 * @returns {boolean} true when targetPath is inside a git repository
 */
function isGitRepo(targetPath) {
  const dotGitPath = join(targetPath, '.git');
  let detected = existsSync(dotGitPath);

  if (!detected) {
    try {
      git('rev-parse --git-dir', targetPath);
      detected = true;
    } catch {
      detected = false;
    }
  }

  return detected;
}
// ---------------------------------------------------------------------------
// Category 1: Force Push Detection
// ---------------------------------------------------------------------------
/**
 * Detect force-push / history-rewrite signals recorded in the reflog.
 *
 * Two independent probes are run; a failure of either is ignored:
 *  1. reflog entries whose recorded action is `reset:` (history moved locally)
 *  2. reflog-walk entries whose message contains `forced-update`
 *
 * @param {string} targetPath - Directory inside the repository to inspect
 * @returns {object[]} findings (empty when no signal is present)
 */
function detectForcePushes(targetPath) {
  const findings = [];

  // Each reflog line is "<hash> <ref>@{<n or date>} <subject>". Anchor on the
  // subject so that a branch name or commit message that merely contains the
  // word "reset" (e.g. "commit: add password reset form") is not counted.
  const resetEntry = /^\S+ \S+@\{[^}]*\} reset:/;

  // Check reflog for reset entries (local force push evidence)
  try {
    const reflog = git("reflog --format='%H %gD %gs' -n 500", targetPath);
    const resetLines = reflog.split('\n').filter(line => resetEntry.test(line));
    if (resetLines.length > 0) {
      const examples = resetLines.slice(0, 3).map(l => l.slice(0, 80)).join(' | ');
      findings.push(finding({
        scanner: 'GIT',
        severity: SEVERITY.HIGH,
        title: 'Force push signal: reflog contains reset entries',
        description:
          `Reflog contains ${resetLines.length} reset entry/entries. ` +
          'git reset --hard in a shared repo indicates history was rewritten, ' +
          'which is the mechanism used in rug pull attacks to swap legitimate code ' +
          'with malicious content after trust is established.',
        evidence: `${resetLines.length} reset entries. Examples: ${examples}`,
        owasp: 'LLM03',
        recommendation:
          'Review what was changed in the rewritten history. Compare the pre-reset ' +
          'commit (visible in reflog) with the current HEAD to identify removed content.',
      }));
    }
  } catch {
    // reflog unavailable — not fatal
  }

  // Check walk-reflogs for forced-update
  try {
    const walkLog = git('log --walk-reflogs --format="%H %gD %gs" -n 200', targetPath);
    const forcedLines = walkLog.split('\n').filter(l => l.includes('forced-update'));
    if (forcedLines.length > 0) {
      const shortHash = forcedLines[0].split(' ')[0].slice(0, 8);
      findings.push(finding({
        scanner: 'GIT',
        severity: SEVERITY.HIGH,
        title: 'Force push signal: forced-update entries in walk-reflogs',
        description:
          `Found ${forcedLines.length} forced-update entry/entries in reflog walk. ` +
          'Forced updates overwrite remote history non-fast-forward, a classic rug pull vector.',
        evidence: `${forcedLines.length} forced-update entries; first at commit ${shortHash}`,
        owasp: 'LLM03',
        recommendation:
          'Audit the commits immediately before and after each forced-update. ' +
          'Pin the plugin to a specific commit hash rather than a branch reference.',
      }));
    }
  } catch {
    // walk-reflogs may fail in shallow clones
  }

  return findings;
}
// ---------------------------------------------------------------------------
// Category 2: Description Drift
// ---------------------------------------------------------------------------
/**
 * Extract the `description` field from the YAML frontmatter of a document.
 *
 * Supported forms:
 *  - single-line value, optionally wrapped in quotes: `description: "text"`
 *  - block scalars, with or without chomping/indentation indicators
 *    (`|`, `>`, `|-`, `>+`, `|2`, ...); lines are joined with "\n" for both styles
 *  - a plain value continued on indented lines below an empty `description:`
 *
 * Line endings are normalised first, so CRLF documents behave like LF ones.
 *
 * @param {string} content - Full file content, frontmatter expected at the very start
 * @returns {string | null} The description text, or null when there is no
 *   frontmatter, no `description` key, or the key has no value
 */
function extractDescription(content) {
  const normalized = content.replace(/\r\n?/g, '\n');
  const fmMatch = normalized.match(/^---\n([\s\S]*?)\n---/);
  if (!fmMatch) return null;

  const lines = fmMatch[1].split('\n');
  const keyIndex = lines.findIndex(line => line.startsWith('description:'));
  if (keyIndex === -1) return null;

  // Only the remainder of the key's own line counts as the inline value;
  // it must never spill over into the next key's line.
  const inlineValue = lines[keyIndex].slice('description:'.length).trim();
  const isBlockHeader = /^[|>](?:[+-][1-9]?|[1-9][+-]?)?$/.test(inlineValue);

  // Single-line: description: some text
  if (inlineValue !== '' && !isBlockHeader) {
    return inlineValue.replace(/^['"]|['"]$/g, '');
  }

  // Collect the indented lines that belong to this key. Blank lines are kept
  // (they may separate paragraphs); the first non-indented line ends the value.
  const bodyLines = [];
  for (const line of lines.slice(keyIndex + 1)) {
    if (line.trim() === '') {
      bodyLines.push('');
      continue;
    }
    if (!/^[ \t]/.test(line)) break;
    bodyLines.push(line);
  }
  const contentLines = bodyLines.filter(line => line !== '');

  // Empty `description:` followed by indented text: a plain multi-line scalar.
  if (!isBlockHeader) {
    return contentLines.length > 0
      ? contentLines.map(line => line.trim()).join(' ')
      : null;
  }

  // Block scalar: strip the indentation shared by all non-blank lines.
  const indent = contentLines.length > 0
    ? Math.min(...contentLines.map(line => line.match(/^[ \t]*/)[0].length))
    : 0;
  return bodyLines
    .map(line => line.slice(indent))
    .join('\n')
    .trim();
}
/**
 * Detect significant description changes in commands/*.md and agents/*.md.
 *
 * For each tracked file (at most MAX_DRIFT_FILES) the frontmatter description
 * at the commit that first added the file is compared with the one at HEAD.
 * A finding is produced when the Levenshtein distance exceeds 20% of the
 * original description's length.
 *
 * File names come from the repository under inspection and are therefore
 * untrusted: every git call here passes them as discrete arguments (no shell),
 * so names containing quotes, `$(...)` or backticks cannot inject commands.
 *
 * @param {string} targetPath - Directory inside the repository to inspect
 * @returns {object[]} findings
 */
function detectDescriptionDrift(targetPath) {
  const results = [];

  // Shell-free git runner for argument lists that contain file names.
  const runGit = (args) => execFileSync('git', args, {
    cwd: targetPath,
    timeout: GIT_TIMEOUT_MS,
    encoding: 'utf-8',
    maxBuffer: 64 * 1024 * 1024,
    stdio: ['pipe', 'pipe', 'pipe'],
  }).trim();

  // List tracked files matching commands/*.md or agents/*.md.
  // -z gives NUL-separated, unquoted names (plain output C-quotes unusual paths).
  let trackedFiles;
  try {
    trackedFiles = runGit(['ls-files', '-z', '--', 'commands/*.md', 'agents/*.md'])
      .split('\0')
      .filter(Boolean)
      .slice(0, MAX_DRIFT_FILES);
  } catch {
    return results;
  }

  for (const relFile of trackedFiles) {
    try {
      // Find the commit that first added this file
      const addHash = runGit(['log', '--diff-filter=A', '--format=%H', '--', relFile])
        .split('\n')
        .filter(Boolean)
        .pop(); // oldest = last in log output (reverse chrono)
      if (!addHash) continue;
      const shortAddHash = addHash.slice(0, 8);

      // ls-files paths are relative to targetPath, while "<rev>:<path>" is
      // resolved from the repository root unless the path starts with "./".
      let initialContent;
      let currentContent;
      try {
        initialContent = runGit(['show', `${addHash}:./${relFile}`]);
        currentContent = runGit(['show', `HEAD:./${relFile}`]);
      } catch {
        continue;
      }

      const initialDesc = extractDescription(initialContent);
      const currentDesc = extractDescription(currentContent);
      if (!initialDesc || !currentDesc) continue;
      if (initialDesc === currentDesc) continue;

      const dist = levenshtein(initialDesc, currentDesc);
      const threshold = Math.ceil(initialDesc.length * 0.20);
      if (dist > threshold) {
        results.push(finding({
          scanner: 'GIT',
          severity: SEVERITY.MEDIUM,
          title: `Description drift detected: ${relFile}`,
          description:
            `The description in "${relFile}" has changed significantly since its initial commit (${shortAddHash}). ` +
            `Edit distance: ${dist} characters (threshold: ${threshold}, 20% of original length ${initialDesc.length}). ` +
            'Substantial description changes can indicate purpose drift or an attempt to ' +
            'misrepresent what an agent/command does after users have trusted it.',
          file: relFile,
          evidence:
            `Initial (${shortAddHash}): "${initialDesc.slice(0, 80)}${initialDesc.length > 80 ? '…' : ''}" | ` +
            `Current: "${currentDesc.slice(0, 80)}${currentDesc.length > 80 ? '…' : ''}" | ` +
            `Levenshtein distance: ${dist}`,
          owasp: 'LLM03',
          recommendation:
            'Review the description change history: ' +
            `git log -p -- "${relFile}". ` +
            'Verify the new description accurately represents current behavior.',
        }));
      }
    } catch {
      // Per-file errors are non-fatal
    }
  }

  return results;
}
// ---------------------------------------------------------------------------
// Category 3: Hook Modification After Initial Commit
// ---------------------------------------------------------------------------
/**
 * Detect suspicious modification patterns on files under hooks/scripts/.
 *
 * For every tracked hook script that has more than one commit:
 *  - HIGH   when the most recent commit touching the file adds a line matching
 *           NETWORK_PATTERNS
 *  - MEDIUM when no such line was added but the file has more than 3 commits
 *
 * File names come from the repository under inspection and are therefore
 * untrusted: every git call here passes them as discrete arguments (no shell).
 *
 * @param {string} targetPath - Directory inside the repository to inspect
 * @returns {object[]} findings
 */
function detectHookModifications(targetPath) {
  const results = [];

  // Shell-free git runner for argument lists that contain file names.
  const runGit = (args) => execFileSync('git', args, {
    cwd: targetPath,
    timeout: GIT_TIMEOUT_MS,
    encoding: 'utf-8',
    maxBuffer: 64 * 1024 * 1024,
    stdio: ['pipe', 'pipe', 'pipe'],
  }).trim();

  let hookFiles;
  try {
    // -z gives NUL-separated, unquoted names.
    hookFiles = runGit(['ls-files', '-z', '--', 'hooks/scripts/*'])
      .split('\0')
      .filter(Boolean);
  } catch {
    return results;
  }

  for (const relFile of hookFiles) {
    try {
      // Count total commits touching this file
      const logLines = runGit(['log', '--oneline', '--', relFile])
        .split('\n')
        .filter(Boolean);
      const modCount = logLines.length;
      if (modCount <= 1) continue; // Only the initial commit — clean

      // Newest commit that touched THIS file (not necessarily HEAD).
      const shortHash = logLines[0].split(' ')[0];

      // Diff that commit against its first parent, restricted to the hook.
      // Diffing HEAD~1..HEAD instead would be empty whenever the repository's
      // latest commit did not touch the hook, hiding the change being reported.
      let latestDiff = '';
      try {
        latestDiff = runGit(['diff', `${shortHash}~1`, shortHash, '--', relFile]);
      } catch {
        // Parent may be unavailable (e.g. shallow clone boundary)
      }

      const addedContent = latestDiff
        .split('\n')
        .filter(l => l.startsWith('+') && !l.startsWith('+++'))
        .join('\n');
      const addsNetwork = NETWORK_PATTERNS.test(addedContent);

      if (addsNetwork) {
        results.push(finding({
          scanner: 'GIT',
          severity: SEVERITY.HIGH,
          title: `Hook modified with new network capability: ${relFile}`,
          description:
            `Hook script "${relFile}" was modified ${modCount} time(s) and the latest change ` +
            `adds outbound network calls (fetch/http/curl/wget/etc.). ` +
            'Hook scripts run automatically with full filesystem access — adding network calls ' +
            'post-initial-commit is a strong rug pull indicator (exfiltration vector).',
          file: relFile,
          evidence: `${modCount} modifications; latest commit: ${shortHash}; network pattern detected in diff`,
          owasp: 'LLM03',
          recommendation:
            `Audit: git log -p -- "${relFile}". ` +
            'Pin hook files to trusted commits. Review what data the network calls access.',
        }));
      } else if (modCount > 3) {
        results.push(finding({
          scanner: 'GIT',
          severity: SEVERITY.MEDIUM,
          title: `Hook script modified frequently: ${relFile}`,
          description:
            `Hook script "${relFile}" has been modified ${modCount} times. ` +
            'Frequent modifications to hook scripts are unusual and warrant review — ' +
            'hooks run automatically and are a high-value target for supply chain attacks.',
          file: relFile,
          evidence: `${modCount} commits modify this file; latest: ${shortHash}`,
          owasp: 'LLM03',
          recommendation:
            `Review all hook changes: git log -p -- "${relFile}". ` +
            'Ensure each modification has a clear, legitimate purpose.',
        }));
      }
    } catch {
      // Per-file errors are non-fatal
    }
  }

  return results;
}
// ---------------------------------------------------------------------------
// Category 4: New Outbound URLs Post-Initial Commit
// ---------------------------------------------------------------------------
/**
 * Extract the unique, lower-cased hostnames of all http(s) URLs in a text block.
 *
 * - A userinfo prefix (`user:pass@`) is skipped, so
 *   `https://github.com@webhook.site/x` yields `webhook.site` rather than the
 *   decoy `github.com`.
 * - Trailing dots are removed, so a URL that ends a sentence
 *   (`... https://example.com.`) is not reported as a different host.
 * - Ports, paths, queries and fragments are never part of the result.
 *
 * @param {string} text - Arbitrary text (source code, diff output, ...)
 * @returns {Set<string>} Distinct hostnames found in `text`
 */
function extractHostnames(text) {
  const hosts = new Set();
  // Optional userinfo = everything up to the last "@" that appears before the
  // first path/query/fragment delimiter; the capture group is the host itself.
  const urlRe = /https?:\/\/(?:[^\s\/\\?#]*@)?([a-zA-Z0-9.-]+)/g;
  let m;
  while ((m = urlRe.exec(text)) !== null) {
    const host = m[1].toLowerCase().replace(/\.+$/, '');
    if (host !== '') hosts.add(host);
  }
  return hosts;
}
/**
 * Detect outbound hostnames added in recent commits that were not present in
 * the repository's initial commit.
 *
 * Baseline: hostnames found by grepping the tree of the first root commit
 * (falling back to that commit's own `git show` output when grep fails or
 * finds nothing). Candidates: hostnames on added lines in the patches of the
 * last 50 commits. Each candidate missing from the baseline yields a finding —
 * HIGH when it matches SUSPICIOUS_DOMAINS, MEDIUM otherwise.
 *
 * @param {string} targetPath - Directory inside the repository to inspect
 * @returns {object[]} findings
 */
function detectNewOutboundUrls(targetPath) {
  const results = [];

  // Get initial commit hash
  let initialHash;
  try {
    initialHash = git('rev-list --max-parents=0 HEAD', targetPath).split('\n')[0].trim();
  } catch {
    return results;
  }

  // Get all hostnames present in the initial commit (full tree).
  // -E: "https?" is portable extended-regex syntax ("\?" in a basic regex is a GNU extension).
  let initialUrls;
  try {
    initialUrls = extractHostnames(git(`grep -r -E "https?://" ${initialHash}`, targetPath));
  } catch {
    // git grep exits non-zero when nothing matches — fall back to the commit's own diff
    try {
      initialUrls = extractHostnames(git(`show ${initialHash}`, targetPath));
    } catch {
      // Cannot determine initial URLs — skip
      return results;
    }
  }

  // Get patches of the last 50 commits (empty format: patch text only)
  let recentDiff = '';
  try {
    recentDiff = git('log -50 --format= -p', targetPath);
  } catch {
    return results;
  }

  // Parse added lines from the diff
  const addedContent = recentDiff
    .split('\n')
    .filter(l => l.startsWith('+') && !l.startsWith('+++'))
    .join('\n');
  const newHostnames = [...extractHostnames(addedContent)].filter(h => !initialUrls.has(h));

  // Entries containing a dot are registrable domains: match the host itself or
  // any subdomain. Entries without a dot (e.g. "ngrok", "requestbin") are
  // service names: match when any label of the host equals the entry, since an
  // exact/suffix comparison could never match a real hostname.
  const isSuspiciousHost = (host) => {
    const labels = host.split('.');
    return SUSPICIOUS_DOMAINS.some(d => (
      d.includes('.')
        ? host === d || host.endsWith(`.${d}`)
        : labels.includes(d)
    ));
  };

  for (const host of newHostnames) {
    const isSuspicious = isSuspiciousHost(host);
    const sev = isSuspicious ? SEVERITY.HIGH : SEVERITY.MEDIUM;
    results.push(finding({
      scanner: 'GIT',
      severity: sev,
      title: isSuspicious
        ? `Suspicious exfiltration endpoint added post-initial-commit: ${host}`
        : `New outbound domain added in recent commits: ${host}`,
      description: isSuspicious
        ? `Domain "${host}" was added in recent commits and matches known exfiltration/ephemeral ` +
          'endpoint patterns (webhook.site, requestbin, ngrok, pipedream, pastebin, etc.). ' +
          'This is a high-confidence rug pull indicator — these services receive arbitrary HTTP requests.'
        : `Domain "${host}" appears in recent commits but was not present at initial commit. ` +
          'New outbound connections introduced after trust establishment warrant review.',
      evidence: `New domain: ${host}; not present in initial commit (${initialHash.slice(0, 8)})`,
      owasp: 'LLM03',
      recommendation: isSuspicious
        ? `Remove all references to "${host}" immediately and audit what data was sent. ` +
          'This domain pattern is used exclusively for receiving exfiltrated data.'
        : `Verify the purpose of "${host}". If legitimate, document it in README. ` +
          'If unexpected, this may indicate a compromised dependency or injected code.',
    }));
  }

  return results;
}
// ---------------------------------------------------------------------------
// Category 5: Author/Email Changes
// ---------------------------------------------------------------------------
/**
 * Detect suspicious author diversity in repository history.
 *
 * Two checks, both based on the author e-mail of every commit reachable from HEAD:
 *  - MEDIUM when more than 3 distinct e-mails appear in fewer than 50 commits
 *  - INFO   when any commit after the oldest one uses a different e-mail than
 *           the oldest commit
 *
 * @param {string} targetPath - Directory inside the repository to inspect
 * @returns {object[]} findings
 */
function detectAuthorChanges(targetPath) {
  const results = [];

  // Newest-first list of author e-mails, one per commit.
  let emailList;
  try {
    emailList = git('log --format="%ae"', targetPath)
      .split('\n')
      .map(e => e.trim())
      .filter(Boolean);
  } catch {
    return results;
  }

  const totalCommits = emailList.length;
  if (totalCommits === 0) return results;

  const uniqueEmails = new Set(emailList);
  const uniqueCount = uniqueEmails.size;

  // Flag: many distinct emails in a small repo
  if (uniqueCount > 3 && totalCommits < 50) {
    results.push(finding({
      scanner: 'GIT',
      severity: SEVERITY.MEDIUM,
      title: 'High author diversity in small repository',
      description:
        `Repository has ${uniqueCount} distinct commit author email(s) across only ${totalCommits} ` +
        'commit(s). High author diversity in a small plugin/package repo can indicate ' +
        'that multiple unrelated parties have committed (e.g., compromised maintainer account, ' +
        'supply chain injection via PR merge with altered identity).',
      evidence: `${uniqueCount} unique emails in ${totalCommits} commits: ${[...uniqueEmails].join(', ')}`,
      owasp: 'LLM03',
      recommendation:
        'Verify each commit author is a known, trusted contributor. ' +
        'Check for commits from unfamiliar email domains or auto-generated addresses.',
    }));
  }

  // Flag: mid-history author change. The list is newest-first, so the oldest
  // commit's author is its last element — no second `git log --reverse` needed.
  const firstAuthor = emailList[totalCommits - 1];
  const newAuthorSet = new Set(
    emailList.slice(0, -1).filter(e => e !== firstAuthor),
  );
  if (newAuthorSet.size > 0) {
    results.push(finding({
      scanner: 'GIT',
      severity: SEVERITY.INFO,
      title: 'Author change mid-history',
      description:
        `Repository was initially committed by "${firstAuthor}" but later commits use ` +
        `${newAuthorSet.size} different author email(s). This is normal for collaborative ` +
        'projects but worth noting for single-author plugins.',
      evidence: `Original author: ${firstAuthor}; subsequent authors: ${[...newAuthorSet].slice(0, 5).join(', ')}`,
      owasp: 'LLM03',
      recommendation:
        'Verify all contributing authors are known and trusted. ' +
        'For single-maintainer plugins, unexpected author changes warrant investigation.',
    }));
  }

  return results;
}
// ---------------------------------------------------------------------------
// Category 6: Binary File Additions
// ---------------------------------------------------------------------------
/**
 * Detect binary files (by extension, see BINARY_EXTENSIONS) added in the last
 * 50 commits. One LOW finding is produced per distinct path.
 *
 * File names come from the repository under inspection and are therefore
 * untrusted: the per-file lookup passes them to git as discrete arguments
 * (no shell), so names containing quotes, `$(...)` or backticks cannot inject
 * commands.
 *
 * @param {string} targetPath - Directory inside the repository to inspect
 * @returns {object[]} findings
 */
function detectBinaryAdditions(targetPath) {
  const results = [];

  let addedFiles;
  try {
    const raw = git('log --diff-filter=A --name-only --format="" -50', targetPath);
    addedFiles = raw.split('\n').filter(Boolean);
  } catch {
    return results;
  }

  // De-duplicate first: a path that was added, deleted and re-added is listed
  // once per addition but should be reported once.
  const binaryFiles = [...new Set(addedFiles)].filter(f => {
    const lower = f.toLowerCase();
    const dot = lower.lastIndexOf('.');
    return dot !== -1 && BINARY_EXTENSIONS.has(lower.slice(dot));
  });

  for (const binFile of binaryFiles) {
    // Find which commit added it (most recent addition first)
    let addCommit = 'unknown';
    try {
      // `git log --name-only` prints paths relative to the repository root, so
      // anchor the pathspec at the top and disable glob interpretation.
      addCommit = execFileSync(
        'git',
        ['log', '--diff-filter=A', '--format=%H %ae %ai', '--', `:(top,literal)${binFile}`],
        {
          cwd: targetPath,
          timeout: GIT_TIMEOUT_MS,
          encoding: 'utf-8',
          maxBuffer: 64 * 1024 * 1024,
          stdio: ['pipe', 'pipe', 'pipe'],
        },
      ).trim().split('\n')[0] || 'unknown';
    } catch {
      // non-fatal
    }

    const commitFields = addCommit.split(' ');
    const shortHash = commitFields[0].slice(0, 8);
    const author = commitFields[1] || 'unknown';

    results.push(finding({
      scanner: 'GIT',
      severity: SEVERITY.LOW,
      title: `Binary file added in recent commits: ${binFile}`,
      description:
        `Binary file "${binFile}" was added in the last 50 commits. ` +
        'Binary files in plugin/package repositories are unusual and cannot be easily audited. ' +
        'They may contain compiled malware, encoded payloads, or native modules with backdoors.',
      file: binFile,
      evidence: `Added in commit ${shortHash} by ${author}`,
      owasp: 'LLM03',
      recommendation:
        `Verify the necessity of "${binFile}". If it must exist, document its provenance ` +
        'and provide a reproducible build process. Scan with antivirus and inspect with ' +
        'strings/objdump/hexdump for suspicious embedded content.',
    }));
  }

  return results;
}
// ---------------------------------------------------------------------------
// Category 7: Suspicious Commit Patterns
// ---------------------------------------------------------------------------
/**
 * Detect commits whose message looks like routine maintenance but which touch
 * hook-related files and add network-access code.
 *
 * A commit among the 50 most recent is reported (MEDIUM) when all of these hold:
 *  1. its lower-cased subject starts with update/fix/cleanup/refactor/minor/bump/chore
 *  2. at least one changed path contains "hook"
 *  3. at least one added line anywhere in the commit matches NETWORK_PATTERNS
 *
 * @param {string} targetPath - Directory inside the repository to inspect
 * @returns {object[]} findings
 */
function detectSuspiciousCommitPatterns(targetPath) {
  const results = [];

  // Only the most recent 50 commits are inspected; ask git for exactly that
  // many instead of fetching MAX_COMMITS hashes and discarding the rest.
  const inspectLimit = Math.min(50, MAX_COMMITS);

  let commitHashes;
  try {
    commitHashes = git(`log --format="%H" -${inspectLimit}`, targetPath)
      .split('\n')
      .filter(Boolean);
  } catch {
    return results;
  }

  for (const hash of commitHashes) {
    try {
      // One call for author, date and subject. The subject comes last so that
      // an empty subject cannot shift the other fields after trimming.
      const [author = '', date = '', rawSubject = ''] =
        git(`log -1 --format="%ae%n%ai%n%s" ${hash}`, targetPath).split('\n');
      const subject = rawSubject.toLowerCase();

      const isCosmeticMsg = /^(update|fix|cleanup|refactor|minor|bump|chore)/.test(subject);
      if (!isCosmeticMsg) continue;

      // Check if this "cosmetic" commit actually touches hooks
      const changedFiles = git(`diff-tree --no-commit-id -r --name-only ${hash}`, targetPath)
        .split('\n')
        .filter(Boolean);
      const hookFiles = changedFiles.filter(f => f.includes('hook'));
      if (hookFiles.length === 0) continue;

      // Check if the diff adds network patterns
      let commitDiff;
      try {
        commitDiff = git(`show ${hash} --format=""`, targetPath);
      } catch {
        continue;
      }
      const addedInCommit = commitDiff
        .split('\n')
        .filter(l => l.startsWith('+') && !l.startsWith('+++'))
        .join('\n');
      if (!NETWORK_PATTERNS.test(addedInCommit)) continue;

      const shortHash = hash.slice(0, 8);
      results.push(finding({
        scanner: 'GIT',
        severity: SEVERITY.MEDIUM,
        title: `Suspicious commit: cosmetic message hides hook+network changes (${shortHash})`,
        description:
          `Commit ${shortHash} has a cosmetic message ("${subject}") but modifies hook files ` +
          'and introduces new network-access code. This pattern — disguising functional changes ' +
          'as maintenance — is used to slip malicious hook modifications past reviewers.',
        evidence: `Commit: ${shortHash} | Author: ${author} | Date: ${date} | ` +
          `Message: "${subject}" | Hooks modified: ${hookFiles.join(', ')}`,
        owasp: 'LLM03',
        recommendation:
          `Audit this commit in full: git show ${shortHash}. ` +
          'Verify the network calls introduced are intentional and documented. ' +
          'Enforce commit message policies that require meaningful descriptions for hook changes.',
      }));
    } catch {
      // Per-commit errors are non-fatal
    }
  }

  return results;
}
// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------
/**
 * Scan git history of targetPath for supply chain rug pull signals.
 *
 * Runs every detection category in sequence. A category that throws is
 * recorded and does not stop the others. Resulting status:
 *  - 'skipped' when targetPath is not inside a git repository
 *  - 'error'   when at least one category threw and there are no findings at all
 *  - 'ok'      otherwise; category errors, if any, are attached as `partial_errors`
 *
 * @param {string} targetPath - Absolute root path being scanned
 * @param {object} discovery - File discovery result (not used directly; git commands enumerate)
 * @returns {Promise<object>} - scannerResult envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();

  // Prerequisite: must be a git repo
  if (!isGitRepo(targetPath)) {
    return scannerResult(
      'git-forensics',
      'skipped',
      [],
      0,
      Date.now() - startMs,
      'Not a git repository — git forensics skipped',
    );
  }

  const findings = [];
  const errors = [];

  // Run all detection categories, collecting errors without aborting
  const categories = [
    ['force-push', () => detectForcePushes(targetPath)],
    ['description-drift', () => detectDescriptionDrift(targetPath)],
    ['hook-modifications', () => detectHookModifications(targetPath)],
    ['new-outbound-urls', () => detectNewOutboundUrls(targetPath)],
    ['author-changes', () => detectAuthorChanges(targetPath)],
    ['binary-additions', () => detectBinaryAdditions(targetPath)],
    ['suspicious-patterns', () => detectSuspiciousCommitPatterns(targetPath)],
  ];

  for (const [name, fn] of categories) {
    try {
      findings.push(...fn());
    } catch (err) {
      // A thrown value is not guaranteed to be an Error instance.
      const reason = err instanceof Error ? err.message : String(err);
      errors.push(`${name}: ${reason}`);
    }
  }

  const durationMs = Date.now() - startMs;

  if (errors.length > 0 && findings.length === 0) {
    // Nothing usable was produced — report as error, stating accurately how
    // many categories failed rather than always claiming "all".
    const summary = errors.length === categories.length
      ? 'All detection categories failed'
      : `${errors.length} of ${categories.length} detection categories failed and none produced findings`;
    return scannerResult(
      'git-forensics',
      'error',
      findings,
      0,
      durationMs,
      `${summary}: ${errors.join('; ')}`,
    );
  }

  // Partial errors are logged but status is 'ok' if we have results
  const result = scannerResult('git-forensics', 'ok', findings, 0, durationMs);
  if (errors.length > 0) {
    result.partial_errors = errors;
  }
  return result;
}