322 lines
9.4 KiB
JavaScript
322 lines
9.4 KiB
JavaScript
// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
|
|
// Zero dependencies.
|
|
|
|
/**
 * Shannon entropy of a string, in bits per character.
 *
 * Characters are Unicode code points. The same code-point count is used for
 * the frequency table and for the denominator, so the symbol probabilities
 * always sum to 1, including for strings that contain surrogate pairs.
 *
 * @param {string} s
 * @returns {number} Entropy in bits per code point; 0 for the empty string.
 */
export function shannonEntropy(s) {
  if (s.length === 0) return 0;

  const freq = new Map();
  let total = 0;
  // for...of walks code points, so `total` must be counted here rather than
  // taken from s.length (which counts UTF-16 code units).
  for (const ch of s) {
    freq.set(ch, (freq.get(ch) ?? 0) + 1);
    total += 1;
  }

  let entropy = 0;
  for (const count of freq.values()) {
    const p = count / total;
    entropy -= p * Math.log2(p);
  }
  return entropy;
}
|
|
|
|
/**
 * Levenshtein edit distance: the minimum number of insertions, deletions and
 * substitutions needed to turn `a` into `b`. Elements are compared by index
 * (`a[i]` vs `b[j]`), i.e. per UTF-16 code unit for strings.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
export function levenshtein(a, b) {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const rows = a.length;
  const cols = b.length;

  // Only two rows of the DP table are kept alive and swapped after each pass.
  let above = Array.from({ length: cols + 1 }, (_, j) => j);
  let row = new Array(cols + 1);

  for (let i = 1; i <= rows; i++) {
    const unitA = a[i - 1];
    row[0] = i;
    for (let j = 1; j <= cols; j++) {
      const deletion = above[j] + 1;
      const insertion = row[j - 1] + 1;
      const substitution = above[j - 1] + (unitA === b[j - 1] ? 0 : 1);
      row[j] = Math.min(deletion, insertion, substitution);
    }
    const recycled = above;
    above = row;
    row = recycled;
  }
  return above[cols];
}
|
|
|
|
/**
 * Heuristic check for base64-looking text: the whole string must be at least
 * 20 characters from the standard base64 alphabet (A-Z, a-z, 0-9, +, /),
 * optionally followed by up to three `=` characters.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isBase64Like(s) {
  const base64Shape = /^[A-Za-z0-9+/]{20,}={0,3}$/;
  const tooShort = s.length < 20;
  return tooShort ? false : base64Shape.test(s);
}
|
|
|
|
/**
 * Heuristic check for a hex-encoded blob: an optional `0x` prefix followed by
 * at least 32 hexadecimal digits and nothing else.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isHexBlob(s) {
  const hexShape = /^(0x)?[0-9a-fA-F]{32,}$/;
  const tooShort = s.length < 32;
  return tooShort ? false : hexShape.test(s);
}
|
|
|
|
/**
 * Redact a string for safe display: keep the first `showStart` and the last
 * `showEnd` characters and replace the middle with `...`.
 *
 * Strings whose length is at most `showStart + showEnd + 3` are returned
 * unchanged, because eliding the middle would not make them any shorter.
 * Negative counts are treated as 0.
 *
 * @param {string} s
 * @param {number} [showStart=8] Number of leading characters to keep.
 * @param {number} [showEnd=4] Number of trailing characters to keep.
 * @returns {string}
 */
export function redact(s, showStart = 8, showEnd = 4) {
  const head = Math.max(0, showStart);
  const tail = Math.max(0, showEnd);
  if (s.length <= head + tail + 3) return s;

  // s.slice(-0) is s.slice(0), i.e. the WHOLE string — so a tail of 0 must be
  // special-cased or the full secret would be appended after the ellipsis.
  const suffix = tail > 0 ? s.slice(-tail) : '';
  return `${s.slice(0, head)}...${suffix}`;
}
|
|
|
|
/**
 * Pull the contents of every string literal out of a single line of code.
 * Recognizes double-quoted, single-quoted and backtick-delimited literals;
 * backslash escapes inside a literal are skipped over when looking for the
 * closing quote and are returned as written (not unescaped).
 *
 * @param {string} line
 * @returns {string[]} Literal bodies without their surrounding quotes, in order of appearance.
 */
export function extractStringLiterals(line) {
  const literalPattern = /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;
  // Exactly one of the three capture groups is defined for each match.
  return Array.from(
    line.matchAll(literalPattern),
    ([, doubleQuoted, singleQuoted, backticked]) => doubleQuoted ?? singleQuoted ?? backticked,
  );
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Encoding/obfuscation decoders
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Decode JavaScript-style Unicode escapes written out as text:
 * the braced form `\u{X…}` (1-6 hex digits) and the fixed form `\uXXXX`.
 *
 * The braced form is handled first; a braced value above U+10FFFF is left
 * as-is. The fixed form is then decoded on the result of the first pass.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeEscapes(s) {
  const bracedForm = /\\u\{([0-9a-fA-F]{1,6})\}/g;
  const fixedForm = /\\u([0-9a-fA-F]{4})/g;

  const afterBraced = s.replace(bracedForm, (escape, digits) => {
    const codePoint = parseInt(digits, 16);
    if (codePoint > 0x10FFFF) return escape;
    return String.fromCodePoint(codePoint);
  });

  return afterBraced.replace(fixedForm, (escape, digits) => {
    const codePoint = parseInt(digits, 16);
    return String.fromCodePoint(codePoint);
  });
}
|
|
|
|
/**
 * Decode textual hex escapes of the form `\xXX` (exactly two hex digits)
 * into the character with that code. Anything else is left untouched.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHexEscapes(s) {
  const toChar = (escape, digits) => {
    const code = parseInt(digits, 16);
    return String.fromCharCode(code);
  };
  return s.replace(/\\x([0-9a-fA-F]{2})/g, toChar);
}
|
|
|
|
/**
 * Decode URL percent-encoding (`%XX`).
 *
 * The whole string is first passed to `decodeURIComponent`. If that throws
 * because of a malformed sequence somewhere in the input, each contiguous
 * run of `%XX` escapes is decoded on its own instead, so well-formed UTF-8
 * sequences elsewhere in the string are still decoded correctly. A run that
 * is itself not valid UTF-8 falls back to one character per `%XX` byte.
 * Text that is not a `%XX` escape (e.g. a lone `%`) is left untouched.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUrlEncoding(s) {
  // Fast path: no percent signs means nothing to decode
  if (!s.includes('%')) return s;

  try {
    return decodeURIComponent(s);
  } catch {
    // Malformed somewhere — salvage what can be decoded, run by run.
    return s.replace(/(?:%[0-9a-fA-F]{2})+/g, (run) => {
      try {
        return decodeURIComponent(run);
      } catch {
        // Not valid UTF-8: map each byte to the character with that code.
        return run.replace(/%([0-9a-fA-F]{2})/g, (_, hex) =>
          String.fromCharCode(parseInt(hex, 16))
        );
      }
    });
  }
}
|
|
|
|
/**
 * Try to interpret a string as base64 and decode it to UTF-8 text.
 *
 * Yields null when the input fails `isBase64Like`, when the decoded text is
 * empty, or when fewer than 80% of the decoded characters are printable
 * ASCII (including newline, carriage return and tab).
 *
 * @param {string} s
 * @returns {string|null}
 */
export function tryDecodeBase64(s) {
  if (!isBase64Like(s)) return null;

  try {
    const text = Buffer.from(s, 'base64').toString('utf-8');
    if (text.length === 0) return null;

    // Strip everything that is not printable, then compare lengths.
    const printableOnly = text.replace(/[^\x20-\x7E\n\r\t]/g, '');
    const printableRatio = printableOnly.length / text.length;
    return printableRatio < 0.8 ? null : text;
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Decode HTML entities: a fixed table of named entities (e.g. `&lt;` `&gt;`
 * `&amp;` `&quot;` `&apos;`), decimal references (e.g. `&#105;`) and
 * hexadecimal references (e.g. `&#x69;`).
 *
 * Numeric references are decoded first (hex, then decimal), then named
 * entities. Numeric values above U+10FFFF and names missing from the table
 * are left exactly as written.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHtmlEntities(s) {
  // Fast path: every entity starts with an ampersand.
  if (!s.includes('&')) return s;

  // Keys are the full entity text including `&` and `;`, matching what the
  // named-entity regex below captures. The lowercase `&tab;`, `&newline;` and
  // `&at;` spellings are kept alongside the standard `&Tab;`, `&NewLine;` and
  // `&commat;` names.
  const NAMED = {
    '&lt;': '<', '&gt;': '>', '&amp;': '&', '&quot;': '"', '&apos;': "'",
    '&nbsp;': ' ', '&Tab;': '\t', '&tab;': '\t', '&NewLine;': '\n', '&newline;': '\n',
    '&lpar;': '(', '&rpar;': ')', '&lsqb;': '[', '&rsqb;': ']',
    '&lcub;': '{', '&rcub;': '}', '&sol;': '/', '&bsol;': '\\',
    '&colon;': ':', '&semi;': ';', '&comma;': ',', '&period;': '.',
    '&excl;': '!', '&quest;': '?', '&num;': '#', '&percnt;': '%',
    '&equals;': '=', '&plus;': '+', '&minus;': '-', '&ast;': '*',
    '&vert;': '|', '&tilde;': '~', '&grave;': '`', '&Hat;': '^',
    '&lowbar;': '_', '&commat;': '@', '&at;': '@', '&dollar;': '$',
  };

  return s
    .replace(/&#x([0-9a-fA-F]{1,6});/g, (_, hex) => {
      const cp = parseInt(hex, 16);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
    })
    .replace(/&#(\d{1,7});/g, (_, dec) => {
      const cp = parseInt(dec, 10);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
    })
    .replace(/&[a-zA-Z]{2,8};/g, (entity) => NAMED[entity] ?? entity);
}
|
|
|
|
/**
 * Collapse letter-spaced text: "i g n o r e" → "ignore".
 *
 * A run is collapsed only when it consists of at least 4 single ASCII
 * letters, each separated from the next by one or more spaces and/or tabs,
 * and the run starts and ends on a word boundary. Shorter runs are left
 * alone to avoid false positives on normal text.
 *
 * @param {string} s
 * @returns {string}
 */
export function collapseLetterSpacing(s) {
  // letter, then 3+ repetitions of (separator run + letter) = 4+ letters total
  return s.replace(/\b[a-zA-Z](?:[ \t]+[a-zA-Z]){3,}\b/g, (run) =>
    run.replace(/[ \t]+/g, '')
  );
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Unicode Tags steganography (U+E0000 block) — DeepMind traps, category 1
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Reveal text hidden with Unicode Tag characters.
 *
 * Every code point in U+E0001–U+E007F is replaced, in place, by the
 * character whose code is the code point minus 0xE0000 (for example
 * U+E0069 U+E0067 U+E006E becomes "ign"). All other characters are copied
 * through unchanged.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeTags(s) {
  const TAG_BASE = 0xE0000;

  const reveal = (symbol) => {
    const offset = symbol.codePointAt(0) - TAG_BASE;
    const isTag = offset >= 0x01 && offset <= 0x7F;
    return isTag ? String.fromCharCode(offset) : symbol;
  };

  // Array.from iterates by code point, so each tag character is one symbol.
  return Array.from(s, (symbol) => reveal(symbol)).join('');
}
|
|
|
|
/**
 * Report whether a string contains at least one Unicode Tag character,
 * i.e. a code point in the range U+E0001–U+E007F.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function containsUnicodeTags(s) {
  // Spread iterates by code point, so astral characters stay in one piece.
  return [...s].some((symbol) => {
    const codePoint = symbol.codePointAt(0);
    return codePoint >= 0xE0001 && codePoint <= 0xE007F;
  });
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// BIDI override stripping
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Remove bidirectional control characters from a string:
 * U+202A–U+202E (LRE, RLE, PDF, LRO, RLO) and
 * U+2066–U+2069 (LRI, RLI, FSI, PDI).
 *
 * These characters change the visual order of text without changing how it
 * parses, which can be used to hide injected content.
 *
 * @param {string} s
 * @returns {string}
 */
export function stripBidiOverrides(s) {
  const bidiControls = /[\u202A-\u202E\u2066-\u2069]/g;
  return s.replace(bidiControls, '');
}
|
|
|
|
/**
 * Normalize a string by decoding all known obfuscation layers.
 *
 * Runs up to 3 iterations to catch multi-layered encoding (e.g., base64 of
 * URL-encoded text), stopping early once an iteration changes nothing.
 * Order per iteration: Unicode Tags -> BIDI strip -> HTML entities ->
 * unicode escapes -> hex escapes -> URL encoding -> base64.
 *
 * Unicode Tags decoding and BIDI stripping run inside the loop, and once
 * more after it, so that tag or BIDI characters that only appear after
 * another layer has been decoded (for example from a `\u{E0069}` escape or
 * a numeric HTML reference) are handled too.
 *
 * After decoding: collapse letter-spaced text.
 *
 * @param {string} s
 * @returns {string}
 */
export function normalizeForScan(s) {
  const MAX_ITERATIONS = 3;
  let result = s;

  for (let i = 0; i < MAX_ITERATIONS; i++) {
    const prev = result;

    result = decodeUnicodeTags(result);
    result = stripBidiOverrides(result);
    result = decodeHtmlEntities(result);
    result = decodeUnicodeEscapes(result);
    result = decodeHexEscapes(result);
    result = decodeUrlEncoding(result);

    const b64decoded = tryDecodeBase64(result);
    if (b64decoded) result = b64decoded;

    // Stable — no further decoding possible
    if (result === prev) break;
  }

  // The last iteration may itself have produced tag/BIDI characters.
  result = decodeUnicodeTags(result);
  result = stripBidiOverrides(result);

  // Post-decode: collapse letter-spaced evasion
  return collapseLetterSpacing(result);
}
|