feat(llm-security): add zero-dep plugin.xml + MANIFEST.MF parsers

This commit is contained in:
Kjell Tore Guttormsen 2026-04-18 10:07:14 +02:00
commit b86239448d
2 changed files with 546 additions and 4 deletions

View file

@ -1,5 +1,9 @@
// ide-extension-parser.mjs — Parse IDE extension manifests into a normalized shape.
// Zero dependencies (Node.js builtins only).
//
// Two extension types are supported:
// - type: 'vscode' → parseVSCodeExtension (package.json + contributes)
// - type: 'jetbrains' → parseIntelliJPlugin (plugin.xml + MANIFEST.MF inside JARs)
import { readFile, access } from 'node:fs/promises';
import { join } from 'node:path';
@ -10,6 +14,9 @@ async function pathExists(p) {
/**
* @typedef {object} ParsedManifest
* @property {'vscode'|'jetbrains'} type
*
* // Shared / VS Code fields (kept at top level for backward compat with runIdeChecks)
* @property {string} id
* @property {string} publisher
* @property {string} name
@ -28,6 +35,20 @@ async function pathExists(p) {
* @property {object|string|null} repository
* @property {object} dependencies
* @property {boolean} hasSignature
*
* // JetBrains-only fields (present only when type === 'jetbrains')
* @property {string} [pluginId]
* @property {string|null} [sinceBuild]
* @property {string|null} [untilBuild]
* @property {Array<{id:string, optional:boolean, configFile:string|null}>} [depends]
* @property {Array<{namespace:string, name:string, attrs:object}>} [extensionDeclarations]
* @property {string[]} [applicationComponents]
* @property {Array<{topic:string, class:string}>} [listeners]
* @property {boolean} [hasPremainClass]
* @property {string|null} [premainClass]
* @property {Array<{path:string, size:number, sha256:string}>} [nativeBinaries]
* @property {Array<{name:string, version:string|null, shaded:boolean, coords:string|null}>} [bundledJars]
* @property {Array<{id:string, path:string}>} [themeProviders]
*/
/**
@ -70,6 +91,7 @@ export async function parseVSCodeExtension(extRoot) {
const hasSignature = await pathExists(join(extRoot, '.signature.p7s'));
const manifest = {
type: 'vscode',
id: `${publisher}.${name}`.toLowerCase(),
publisher: publisher.toLowerCase(),
name: name.toLowerCase(),
@ -94,16 +116,293 @@ export async function parseVSCodeExtension(extRoot) {
}
/**
 * Parse a .vsix file. Stub — the caller extracts the archive first
 * (via lib/zip-extract.mjs) and passes the extracted directory instead.
 *
 * @param {string} vsixPath - Path to the .vsix archive; only echoed in the error message.
 * @returns {Promise<never>} Always rejects.
 * @throws {Error} Always — direct VSIX parsing is not implemented here.
 */
export async function parseVsixFile(vsixPath) {
  // Exactly one throw: a second, stale-worded throw after this one would be
  // unreachable and would only confuse readers about which message is current.
  throw new Error(`VSIX parsing not implemented in library-direct form. Extract manually (unzip ${vsixPath}) and pass the extracted directory.`);
}
// ---------------------------------------------------------------------------
// JetBrains helpers — zero-dep plugin.xml + MANIFEST.MF parsers
// ---------------------------------------------------------------------------
// The five entities predefined by XML; any other named reference is left as-is.
const NAMED_ENTITIES = {
  amp: '&', lt: '<', gt: '>', quot: '"', apos: "'",
};

/**
 * Decode XML entity references in text content (non-CDATA).
 *
 * Handles the five predefined named entities plus decimal (`&#66;`) and
 * hexadecimal (`&#x44;`) character references. Anything that cannot be
 * decoded safely is returned unchanged, so this function never throws on
 * hostile input:
 *  - unknown named entities (`&nbsp;`),
 *  - decimal references containing non-decimal digits (`&#6A;`),
 *  - references beyond U+10FFFF, for which `String.fromCodePoint` would
 *    raise a RangeError.
 *
 * @param {string} s - Raw text that may contain entity references.
 * @returns {string} The text with every decodable reference replaced.
 */
function decodeEntities(s) {
  const MAX_CODE_POINT = 0x10FFFF;
  return s.replace(/&(#x?[0-9a-fA-F]+|[a-zA-Z]+);/g, (full, inner) => {
    if (inner[0] !== '#') {
      // Named reference: own-property check so keys such as "constructor"
      // never resolve through the prototype chain.
      return Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, inner)
        ? NAMED_ENTITIES[inner]
        : full;
    }
    const isHex = inner[1] === 'x';
    const digits = inner.slice(isHex ? 2 : 1);
    // The regex admits hex letters after a bare '#'; parseInt(…, 10) would
    // silently stop at the first letter, so reject such references outright.
    if (!isHex && !/^[0-9]+$/.test(digits)) return full;
    const cp = parseInt(digits, isHex ? 16 : 10);
    return Number.isInteger(cp) && cp <= MAX_CODE_POINT
      ? String.fromCodePoint(cp)
      : full;
  });
}
/**
 * Locate the first `<tag …>…</tag>` element in `xml` (case-insensitive) and
 * return its text content, normalized by `normalizeTextContent` (CDATA
 * honoured, otherwise entity-decoded). Returns null when no such element exists.
 *
 * `tag` is interpolated into a regular expression unescaped, so it must be a
 * plain element name.
 *
 * @param {string} xml
 * @param {string} tag
 * @returns {string|null}
 */
function firstElementText(xml, tag) {
  const elementPattern = new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)</${tag}>`, 'i');
  const hit = xml.match(elementPattern);
  return hit ? normalizeTextContent(hit[1]) : null;
}
/**
 * Normalize the raw inner text of an element.
 *
 * The text is trimmed first. If what remains is wrapped in a single CDATA
 * section, the wrapper is removed and the payload returned verbatim;
 * otherwise the trimmed text is entity-decoded.
 *
 * @param {string} raw
 * @returns {string}
 */
function normalizeTextContent(raw) {
  const CDATA_OPEN = '<![CDATA[';
  const CDATA_CLOSE = ']]>';
  const text = raw.trim();
  const isCdataWrapped = text.startsWith(CDATA_OPEN) && text.endsWith(CDATA_CLOSE);
  if (!isCdataWrapped) {
    return decodeEntities(text);
  }
  return text.slice(CDATA_OPEN.length, -CDATA_CLOSE.length);
}
/**
 * Turn an attribute blob such as ` id="X" path='Y'` into a plain object.
 *
 * Both double- and single-quoted values are accepted; each value is
 * entity-decoded. When a name repeats, the last occurrence wins. A falsy
 * blob yields an empty object.
 *
 * @param {string} attrBlob - e.g. ` id="X" path='Y'`
 * @returns {Record<string,string>}
 */
function parseAttrs(attrBlob) {
  const result = {};
  if (!attrBlob) return result;
  const pairPattern = /([\w-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
  for (
    let pair = pairPattern.exec(attrBlob);
    pair !== null;
    pair = pairPattern.exec(attrBlob)
  ) {
    const [, key, doubleQuoted, singleQuoted] = pair;
    const rawValue = doubleQuoted !== undefined ? doubleQuoted : singleQuoted;
    result[key] = decodeEntities(rawValue);
  }
  return result;
}
/**
 * Parse a JetBrains `plugin.xml` document.
 *
 * Regex-based and intentionally lenient — this is not a validating XML
 * parser. The function never throws:
 *  - non-string input, input with neither `<idea-plugin>` nor `<id>`, and
 *    input whose `<id>`/`<name>` open and close tags do not balance return
 *    `{ manifest: null, warnings: [...] }`;
 *  - a failure while extracting an individual section is recorded in
 *    `warnings` and that section falls back to its empty value, keeping
 *    whatever was collected before the failure.
 *
 * Pre-processing strips a leading BOM, normalizes CR/CRLF to LF and removes
 * XML comments before any matching takes place.
 *
 * Listeners are collected from two shapes: the flat
 * `<applicationListener topic="…" class="…"/>` element, and `<listener …/>`
 * children of an `<applicationListeners>` container (the form documented by
 * the IntelliJ Platform SDK).
 *
 * @param {string} xmlString - Raw contents of plugin.xml.
 * @returns {{ manifest: object|null, warnings: string[] }} `manifest` holds
 *   pluginId, name, version, vendor, vendorUrl, sinceBuild, untilBuild,
 *   depends, extensionDeclarations, applicationComponents, listeners and
 *   themeProviders.
 */
export function parsePluginXml(xmlString) {
  const warnings = [];
  if (typeof xmlString !== 'string') {
    return { manifest: null, warnings: ['input is not a string'] };
  }

  // --- Pre-processing pipeline: BOM, line endings, comments ---
  const xml = xmlString
    .replace(/^\uFEFF/, '')
    .replace(/\r\n?/g, '\n')
    .replace(/<!--[\s\S]*?-->/g, '');

  // Bail on obviously malformed (no <idea-plugin> root and no <id>)
  if (!/<idea-plugin\b/i.test(xml) && !/<id\b/i.test(xml)) {
    warnings.push('no <idea-plugin> or <id> element found — not a plugin.xml');
    return { manifest: null, warnings };
  }

  // Bail on unbalanced-tag smell: count open vs close for <id> and <name>.
  // Both are required non-self-closing elements in plugin.xml, so any mismatch
  // signals truncation or malformed input.
  const balanceChecks = [
    { open: /<id\b[^>]*>/g, close: /<\/id>/g, tag: 'id' },
    { open: /<name\b[^>]*>/g, close: /<\/name>/g, tag: 'name' },
  ];
  for (const { open, close, tag } of balanceChecks) {
    const openCount = (xml.match(open) || []).length;
    const closeCount = (xml.match(close) || []).length;
    if (openCount > 0 && openCount !== closeCount) {
      warnings.push(`unbalanced <${tag}> tags — truncated input`);
      return { manifest: null, warnings };
    }
  }

  // Every extraction below runs through `safe` so that a helper failure
  // degrades to a warning instead of escaping to the caller.
  const safe = (fn, label) => {
    try { return fn(); } catch (err) {
      warnings.push(`${label}: ${err.message}`);
      return null;
    }
  };

  const pluginId = safe(() => firstElementText(xml, 'id'), 'pluginId') || '';
  const name = safe(() => firstElementText(xml, 'name'), 'name') || '';
  const version = safe(() => firstElementText(xml, 'version'), 'version') || '';

  // <vendor url="..." email="...">Name</vendor>
  const vendorInfo = safe(() => {
    const m = xml.match(/<vendor\b([^>]*)>([\s\S]*?)<\/vendor>/i);
    if (!m) return null;
    const attrs = parseAttrs(m[1]);
    return { url: attrs.url || null, text: normalizeTextContent(m[2]) };
  }, 'vendor');
  const vendor = vendorInfo ? vendorInfo.text : '';
  const vendorUrl = vendorInfo ? vendorInfo.url : null;

  // <idea-version since-build="A" until-build="B"/>
  const buildAttrs = safe(() => {
    const m = xml.match(/<idea-version\b([^>]*)\/?\s*>/i);
    return m ? parseAttrs(m[1]) : null;
  }, 'idea-version');
  const sinceBuild = (buildAttrs && buildAttrs['since-build']) || null;
  const untilBuild = (buildAttrs && buildAttrs['until-build']) || null;

  // <depends optional="true" config-file="X">ID</depends>
  const depends = [];
  safe(() => {
    for (const m of xml.matchAll(/<depends\b([^>]*)>([\s\S]*?)<\/depends>/gi)) {
      const attrs = parseAttrs(m[1]);
      depends.push({
        id: normalizeTextContent(m[2]),
        optional: attrs.optional === 'true',
        configFile: attrs['config-file'] || null,
      });
    }
  }, 'depends');

  // <extensions defaultExtensionNs="com.intellij">...</extensions>
  const extensionDeclarations = [];
  const themeProviders = [];
  safe(() => {
    // Children: <(name) ... /> or <(name) ...>...</(name)>
    // [^>]*? is non-greedy and allows '/', so self-closing elements with
    // slashes in attribute values (path="/x/y") still match.
    const childRe = /<([\w.-]+)\b([^>]*?)(?:\/\s*>|>([\s\S]*?)<\/\1>)/g;
    for (const block of xml.matchAll(/<extensions\b([^>]*)>([\s\S]*?)<\/extensions>/gi)) {
      const ns = parseAttrs(block[1]).defaultExtensionNs || 'com.intellij';
      for (const child of block[2].matchAll(childRe)) {
        const childName = child[1];
        const childAttrs = parseAttrs(child[2]);
        extensionDeclarations.push({ namespace: ns, name: childName, attrs: childAttrs });
        if (childName === 'themeProvider') {
          themeProviders.push({
            id: childAttrs.id || '',
            path: childAttrs.path || '',
          });
        }
      }
    }
  }, 'extensions');

  // <application-components><component><implementation-class>X</implementation-class></component></application-components>
  const applicationComponents = [];
  safe(() => {
    const blockRe = /<application-components\b[^>]*>([\s\S]*?)<\/application-components>/gi;
    const implRe = /<implementation-class>\s*([\s\S]*?)\s*<\/implementation-class>/g;
    for (const block of xml.matchAll(blockRe)) {
      for (const impl of block[1].matchAll(implRe)) {
        applicationComponents.push(decodeEntities(impl[1]).trim());
      }
    }
  }, 'applicationComponents');

  // Listeners: flat <applicationListener topic="X" class="Y"/> plus
  // <applicationListeners><listener topic="X" class="Y"/></applicationListeners>.
  const listeners = [];
  safe(() => {
    const pushListener = (attrBlob) => {
      const attrs = parseAttrs(attrBlob);
      listeners.push({
        topic: attrs.topic || '',
        class: attrs.class || '',
      });
    };
    // \b keeps this from matching the <applicationListeners> container tag;
    // [^>]*? (rather than [^/>]*) tolerates '/' inside attribute values.
    for (const m of xml.matchAll(/<applicationListener\b([^>]*?)\/?\s*>/gi)) {
      pushListener(m[1]);
    }
    const containerRe = /<applicationListeners\b[^>]*>([\s\S]*?)<\/applicationListeners>/gi;
    for (const block of xml.matchAll(containerRe)) {
      for (const m of block[1].matchAll(/<listener\b([^>]*?)\/?\s*>/gi)) {
        pushListener(m[1]);
      }
    }
  }, 'listeners');

  return {
    manifest: {
      pluginId,
      name,
      version,
      vendor,
      vendorUrl,
      sinceBuild,
      untilBuild,
      depends,
      extensionDeclarations,
      applicationComponents,
      listeners,
      themeProviders,
    },
    warnings,
  };
}
/**
 * Parse a `META-INF/MANIFEST.MF` file. Simple `Key: Value` line protocol.
 *
 * - A leading BOM is dropped and CR/CRLF line endings are normalized to LF.
 * - Continuation lines (lines starting with a space or tab) are appended,
 *   minus that first character, to the previous line's value.
 * - Attribute names are matched case-insensitively, as the JAR manifest
 *   format defines them, so `premain-class:` is recognized just like
 *   `Premain-Class:`.
 * - When a name occurs more than once, the last occurrence wins.
 *
 * Never throws: non-string or empty input yields the all-null result.
 *
 * @param {string} mfString - Raw manifest text.
 * @returns {{mainClass: string|null, premainClass: string|null, implTitle: string|null, implVersion: string|null, premainAttrs: object}}
 *   `premainAttrs` collects every Premain-*, Agent-*, Launcher-Agent-*, Can-*
 *   and Boot-Class-Path attribute, keyed by the name exactly as written in
 *   the manifest.
 */
export function parseManifestMf(mfString) {
  const out = {
    mainClass: null,
    premainClass: null,
    implTitle: null,
    implVersion: null,
    premainAttrs: {},
  };
  if (typeof mfString !== 'string' || mfString.length === 0) return out;

  // Lower-cased attribute name → result field. A Map (not an object literal)
  // so names like "constructor" can never resolve through a prototype.
  const scalarFields = new Map([
    ['main-class', 'mainClass'],
    ['premain-class', 'premainClass'],
    ['implementation-title', 'implTitle'],
    ['implementation-version', 'implVersion'],
  ]);
  // Forensic collection of agent-related attributes.
  const agentAttrRe = /^(Premain-|Agent-|Launcher-Agent-|Boot-Class-Path|Can-)/i;

  // Pre-processing
  const normalized = mfString.replace(/^\uFEFF/, '').replace(/\r\n?/g, '\n');

  // Concatenate continuation lines (lines beginning with a single space or tab).
  const logical = [];
  for (const line of normalized.split('\n')) {
    const isContinuation = line.length > 0 && (line[0] === ' ' || line[0] === '\t');
    if (isContinuation && logical.length > 0) {
      logical[logical.length - 1] += line.slice(1);
    } else {
      logical.push(line);
    }
  }

  for (const line of logical) {
    const idx = line.indexOf(': ');
    if (idx === -1) continue;
    const key = line.slice(0, idx).trim();
    const value = line.slice(idx + 2);

    const field = scalarFields.get(key.toLowerCase());
    if (field !== undefined) out[field] = value;

    if (agentAttrRe.test(key)) {
      out.premainAttrs[key] = value;
    }
  }
  return out;
}
/**
* Parse an IntelliJ plugin directory. Implemented in Step 6 (v6.6.0).
* Stub preserved until Step 6 lands.
* @param {string} pluginRoot
* @returns {Promise<null>}
*/

View file

@ -0,0 +1,243 @@
// jetbrains-parser.test.mjs — Zero-dep plugin.xml + MANIFEST.MF parsers.
//
// All inputs are inline strings — no filesystem fixtures required.
import { describe, it } from 'node:test';
import assert from 'node:assert/strict';
import {
parsePluginXml,
parseManifestMf,
} from '../../scanners/lib/ide-extension-parser.mjs';
// Happy-path suite: every test below parses the same well-formed plugin.xml
// fixture and checks one group of manifest fields.
describe('parsePluginXml — happy path', () => {
// Fixture covers the scalar fields, <idea-version>, a plain and an optional
// <depends>, and an <extensions> block with three self-closing children.
const xml = `<?xml version="1.0"?>
<idea-plugin>
<id>org.example.myplugin</id>
<name>My Plugin</name>
<version>1.2.3</version>
<vendor url="https://example.com">Example Inc</vendor>
<idea-version since-build="232.0" until-build="242.*"/>
<depends>com.intellij.modules.platform</depends>
<depends optional="true" config-file="python.xml">com.intellij.modules.python</depends>
<extensions defaultExtensionNs="com.intellij">
<applicationService serviceImplementation="org.example.Foo"/>
<postStartupActivity implementation="org.example.Startup"/>
<themeProvider id="my-theme" path="/themes/my.theme.json"/>
</extensions>
</idea-plugin>`;
// Scalar fields, including the vendor's url attribute.
it('extracts pluginId, name, version, vendor', () => {
const { manifest, warnings } = parsePluginXml(xml);
// Surface the parser's warnings in the failure message if parsing bailed out.
assert.ok(manifest, `expected manifest, got null; warnings: ${warnings.join('; ')}`);
assert.equal(manifest.pluginId, 'org.example.myplugin');
assert.equal(manifest.name, 'My Plugin');
assert.equal(manifest.version, '1.2.3');
assert.equal(manifest.vendor, 'Example Inc');
assert.equal(manifest.vendorUrl, 'https://example.com');
});
// since-build / until-build are read from the self-closing <idea-version/>.
it('extracts idea-version build range', () => {
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.sinceBuild, '232.0');
assert.equal(manifest.untilBuild, '242.*');
});
// Missing attributes default to optional:false / configFile:null.
it('extracts depends[] with optional + config-file', () => {
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.depends.length, 2);
assert.deepEqual(manifest.depends[0], {
id: 'com.intellij.modules.platform', optional: false, configFile: null,
});
assert.deepEqual(manifest.depends[1], {
id: 'com.intellij.modules.python', optional: true, configFile: 'python.xml',
});
});
// Names are sorted before comparison so the test does not depend on match order.
it('captures extension children with namespace', () => {
const { manifest } = parsePluginXml(xml);
const names = manifest.extensionDeclarations.map(e => e.name).sort();
assert.deepEqual(names, ['applicationService', 'postStartupActivity', 'themeProvider']);
assert.ok(manifest.extensionDeclarations.every(e => e.namespace === 'com.intellij'));
});
// The path attribute contains slashes, which the child regex must tolerate.
it('collects themeProviders[] with id + path', () => {
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.themeProviders.length, 1);
assert.equal(manifest.themeProviders[0].id, 'my-theme');
assert.equal(manifest.themeProviders[0].path, '/themes/my.theme.json');
});
});
// Text-content decoding: CDATA payloads are returned untouched, everything
// else has XML entity references decoded.
describe('parsePluginXml — CDATA + entity handling', () => {
// Markup and a bare '&' inside CDATA must survive unchanged.
it('preserves CDATA content verbatim', () => {
const xml = `<idea-plugin>
<id>x.y</id>
<name><![CDATA[<b>hello & world</b>]]></name>
</idea-plugin>`;
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.name, '<b>hello & world</b>');
});
// &amp; in plain element text becomes '&'.
it('decodes named entity refs in non-CDATA text', () => {
const xml = `<idea-plugin>
<id>com.intellij.java&amp;extras</id>
<name>n</name>
</idea-plugin>`;
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.pluginId, 'com.intellij.java&extras');
});
// &#66; is decimal for 'B', &#x44; is hexadecimal for 'D'.
it('decodes numeric entity refs (decimal + hex)', () => {
const xml = `<idea-plugin><id>A&#66;C&#x44;</id><name>n</name></idea-plugin>`;
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.pluginId, 'ABCD');
});
});
// Robustness suite: input normalization (BOM, CRLF, comments), the
// null-manifest bail-out paths, and the less common plugin.xml sections.
describe('parsePluginXml — robustness', () => {
// A leading U+FEFF must not change the parsed result.
it('parses BOM-prefixed input identically', () => {
const xmlA = `<idea-plugin><id>a</id><name>n</name></idea-plugin>`;
const xmlB = '\uFEFF' + xmlA;
assert.deepEqual(parsePluginXml(xmlA).manifest, parsePluginXml(xmlB).manifest);
});
// Line-ending style must not change the parsed result.
it('parses CRLF identically to LF', () => {
const xmlLF = `<idea-plugin>\n<id>a</id>\n<name>n</name>\n</idea-plugin>`;
const xmlCRLF = xmlLF.replace(/\n/g, '\r\n');
assert.deepEqual(parsePluginXml(xmlLF).manifest, parsePluginXml(xmlCRLF).manifest);
});
// An <id> inside a comment appears first in the document but must be ignored.
it('strips XML comments before regex match', () => {
const xml = `<idea-plugin>
<!-- <id>fake.id</id> -->
<id>real.id</id>
<name>n</name>
</idea-plugin>`;
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.pluginId, 'real.id');
});
// Bail-out path 1: wrong input type.
it('non-string input returns null + warning (never throws)', () => {
const { manifest, warnings } = parsePluginXml(null);
assert.equal(manifest, null);
assert.ok(warnings.length > 0);
});
// Bail-out path 2: the closing </name> is cut off, so open/close counts differ.
it('truncated input returns null + warning (never throws)', () => {
const xml = `<idea-plugin><id>a</id><name>n</name`; // truncated
const { manifest, warnings } = parsePluginXml(xml);
assert.equal(manifest, null);
assert.ok(warnings.length > 0);
});
// defaultExtensionNs is passed through as-is rather than forced to com.intellij.
it('unknown namespace on <extensions> is preserved', () => {
const xml = `<idea-plugin>
<id>a</id><name>n</name>
<extensions defaultExtensionNs="org.custom">
<myService key="x"/>
</extensions>
</idea-plugin>`;
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.extensionDeclarations.length, 1);
assert.equal(manifest.extensionDeclarations[0].namespace, 'org.custom');
assert.equal(manifest.extensionDeclarations[0].name, 'myService');
});
// <implementation-class> values are collected from <application-components>.
it('captures legacy application-components', () => {
const xml = `<idea-plugin>
<id>a</id><name>n</name>
<application-components>
<component>
<implementation-class>com.bad.Comp</implementation-class>
</component>
</application-components>
</idea-plugin>`;
const { manifest } = parsePluginXml(xml);
assert.deepEqual(manifest.applicationComponents, ['com.bad.Comp']);
});
// A self-closing <applicationListener/> yields one {topic, class} entry.
it('captures applicationListener topic + class', () => {
const xml = `<idea-plugin>
<id>a</id><name>n</name>
<applicationListener topic="com.intellij.ide.AppLifecycleListener" class="org.x.Listener"/>
</idea-plugin>`;
const { manifest } = parsePluginXml(xml);
assert.equal(manifest.listeners.length, 1);
assert.equal(manifest.listeners[0].topic, 'com.intellij.ide.AppLifecycleListener');
assert.equal(manifest.listeners[0].class, 'org.x.Listener');
});
});
// MANIFEST.MF suite: scalar attributes, the agent-attribute collection,
// continuation lines, and inputs that must degrade to the all-null result.
describe('parseManifestMf', () => {
// The four scalar attributes map to mainClass/premainClass/implTitle/implVersion.
it('extracts Main-Class, Premain-Class, Implementation-Title/Version', () => {
const mf = [
'Manifest-Version: 1.0',
'Main-Class: org.example.Main',
'Premain-Class: org.bad.Agent',
'Implementation-Title: my-plugin',
'Implementation-Version: 1.0.0',
'',
].join('\n');
const out = parseManifestMf(mf);
assert.equal(out.mainClass, 'org.example.Main');
assert.equal(out.premainClass, 'org.bad.Agent');
assert.equal(out.implTitle, 'my-plugin');
assert.equal(out.implVersion, '1.0.0');
});
// premainAttrs is keyed by the attribute name as written in the manifest.
it('collects Premain-/Agent-/Can- attrs into premainAttrs', () => {
const mf = [
'Premain-Class: org.bad.Agent',
'Can-Redefine-Classes: true',
'Can-Retransform-Classes: true',
'Agent-Class: org.bad.Agent',
'Boot-Class-Path: boot.jar',
'',
].join('\n');
const out = parseManifestMf(mf);
assert.equal(out.premainAttrs['Can-Redefine-Classes'], 'true');
assert.equal(out.premainAttrs['Can-Retransform-Classes'], 'true');
assert.equal(out.premainAttrs['Agent-Class'], 'org.bad.Agent');
assert.equal(out.premainAttrs['Boot-Class-Path'], 'boot.jar');
});
// The continuation's leading space is dropped; the rest is appended verbatim.
it('handles 72-char continuation lines (space-prefixed)', () => {
const mf = [
'Premain-Class: org.example.VeryLongPackage',
' Name.ContinuedAgent',
'',
].join('\n');
const out = parseManifestMf(mf);
assert.equal(out.premainClass, 'org.example.VeryLongPackageName.ContinuedAgent');
});
// A leading tab is treated the same way as a leading space.
it('handles tab continuation (rare but legal)', () => {
const mf = 'Main-Class: org.a\n\tTail\n';
const out = parseManifestMf(mf);
assert.equal(out.mainClass, 'org.aTail');
});
// Degenerate inputs: the parser returns its default result instead of throwing.
it('empty input returns all-null without throwing', () => {
const out = parseManifestMf('');
assert.equal(out.mainClass, null);
assert.equal(out.premainClass, null);
assert.deepEqual(out.premainAttrs, {});
});
it('non-string input returns all-null without throwing', () => {
const out = parseManifestMf(null);
assert.equal(out.mainClass, null);
});
// Lines without a ': ' separator are skipped.
it('garbage input returns all-null without throwing', () => {
const out = parseManifestMf('lkajsdf qwertyui 12345\n!!!\n');
assert.equal(out.mainClass, null);
assert.equal(out.premainClass, null);
});
// Line-ending style must not change the parsed result.
it('CRLF input parses identically to LF', () => {
const lf = 'Main-Class: org.a\nPremain-Class: org.b\n';
const crlf = lf.replace(/\n/g, '\r\n');
assert.deepEqual(parseManifestMf(lf), parseManifestMf(crlf));
});
});