feat(ms-ai-architect): sitemap-based KB change detection system
Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps weekly to detect when source documentation changes. Replaces the broken mtime-based staleness check (all files had identical mtime after release).

Components:
- build-registry.mjs: extracts 1342 URLs from 387 reference files
- poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry
- report-changes.mjs: prioritized change report (critical/high/medium/low)
- discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered
- run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run

Integration:
- session-start hook reads change-report.json instead of broken mtime check
- hook triggers background poll if >7 days since last check
- generate-skills --update reads change report for targeted MCP updates

Current stats: 69% match rate (924/1342 URLs tracked via sitemaps). ~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
035255fc5d
commit
f968f37be3
13 changed files with 976 additions and 59 deletions
144
plugins/ms-ai-architect/scripts/kb-update/report-changes.mjs
Normal file
144
plugins/ms-ai-architect/scripts/kb-update/report-changes.mjs
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
#!/usr/bin/env node
|
||||
// report-changes.mjs — Compare sitemap lastmod to reference file "Last updated:" headers.
|
||||
// Generates change-report.json and prints human-readable summary.
|
||||
// Usage: node report-changes.mjs [--json]
|
||||
|
||||
import { readFileSync, existsSync } from 'node:fs';
|
||||
import { join, dirname } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { loadRegistry, saveReport } from './lib/registry-io.mjs';
|
||||
|
||||
// ES modules have no built-in __dirname, so derive it from this module's URL.
const scriptPath = fileURLToPath(import.meta.url);
const __dirname = dirname(scriptPath);

// The plugin root sits two directories above this script's directory.
const PLUGIN_ROOT = join(__dirname, '../..');

// Registry and report files live in ./data next to this script.
const DATA_DIR = join(__dirname, 'data');

// --json: emit only the JSON report on stdout instead of the text summary.
const jsonOnly = process.argv.some((arg) => arg === '--json');
|
||||
|
||||
/**
 * Classify a reference file's update priority from keywords in its path.
 *
 * The path is lower-cased and tested against three keyword groups in order;
 * the first group that matches wins, and anything unmatched is 'low'.
 *
 * Two short keywords are guarded so they do not fire inside unrelated
 * English words:
 *  - `pris` (Norwegian "price") is ignored inside "enterprise", "surprise"
 *    and "comprise", while compounds such as "prismodell" or "tokenpriser"
 *    still match.
 *  - `rag` must not be followed by another letter, so "storage", "coverage"
 *    or "fragment" no longer count, while "rag/", "rag-patterns" and
 *    "graphrag.md" still do.
 *
 * @param {string} filePath - Reference file path (any letter case).
 * @returns {'critical'|'high'|'medium'|'low'} Priority bucket.
 */
function getFilePriority(filePath) {
  const lower = filePath.toLowerCase();

  if (/cost|pricing|(?<!enter|sur|com)pris/.test(lower)) {
    return 'critical';
  }
  if (/responsible-ai|governance|ai-security-(?:engineering|scoring)/.test(lower)) {
    return 'high';
  }
  if (/platforms|copilot|azure-ai-services|agent-orchestration|rag(?![a-z])|mlops|prompt-engineering|monitoring|performance/.test(lower)) {
    return 'medium';
  }
  return 'low';
}
|
||||
|
||||
// Header patterns that carry a reference file's own revision date.
// Each captures a run of digits and hyphens following a bold label
// (English "Last updated", Norwegian "Sist oppdatert" / "Sist verifisert" / "Dato").
const LAST_UPDATED_PATTERNS = [
  /\*\*Last updated:\*\*\s*([\d-]+)/i,
  /\*\*Sist (?:oppdatert|verifisert):\*\*\s*([\d-]+)/i,
  /\*\*Dato:\*\*\s*([\d-]+)/i,
];

// Memo of already-parsed files, keyed by the path passed to parseLastUpdated.
// The same reference file is looked up once per source URL that cites it, so
// caching avoids re-reading it from disk on every lookup.
const lastUpdatedCache = new Map();

/**
 * Normalize a captured header date to zero-padded `YYYY-MM-DD`.
 *
 * Accepts `YYYY-M[M]` (day defaults to 01) and `YYYY-M[M]-D[D]`, optionally
 * followed by stray hyphens. Zero-padding matters because callers compare the
 * result lexicographically against ISO dates.
 *
 * @param {string} raw - Text captured by one of LAST_UPDATED_PATTERNS.
 * @returns {string|null} Normalized date, or null when the shape is not recognized.
 */
function normalizeHeaderDate(raw) {
  const parts = /^(\d{4})-(\d{1,2})(?:-(\d{1,2}))?-*$/.exec(raw.trim());
  if (!parts) return null;
  const [, year, month, day = '1'] = parts;
  return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
}

/**
 * Read the "last updated" date from the header of a reference file.
 *
 * @param {string} filePath - Path of the reference file, relative to PLUGIN_ROOT.
 * @returns {string|null} Date as `YYYY-MM-DD`, or null when the file does not
 *   exist or no usable date is found (callers treat null as "always stale").
 */
function parseLastUpdated(filePath) {
  if (lastUpdatedCache.has(filePath)) {
    return lastUpdatedCache.get(filePath);
  }

  let parsed = null;
  const fullPath = join(PLUGIN_ROOT, filePath);

  if (existsSync(fullPath)) {
    // The whole file is read, but only its first 500 characters are searched:
    // the header is expected at the top of the file.
    const header = readFileSync(fullPath, 'utf8').slice(0, 500);

    for (const pattern of LAST_UPDATED_PATTERNS) {
      const match = header.match(pattern);
      if (!match) continue;
      parsed = normalizeHeaderDate(match[1]);
      // A malformed capture falls through to the remaining patterns.
      if (parsed !== null) break;
    }
  }

  lastUpdatedCache.set(filePath, parsed);
  return parsed;
}
|
||||
|
||||
// Sort rank per priority label: a lower rank is listed earlier in the report.
const PRIORITY_ORDER = { critical: 0, high: 1, medium: 2, low: 3 };

// --- Main ---
const registry = loadRegistry(DATA_DIR);

// Without a recorded poll there is nothing to compare against, so bail out.
if (!registry.last_poll) {
  console.error('Registry has not been polled yet. Run poll-sitemaps.mjs first.');
  process.exit(1);
}

// For every reference file, gather the tracked source URLs whose sitemap
// lastmod is later than the date found in the file's own header.
const fileChanges = new Map(); // filePath → { changedUrls, newestChange, fileDate }

for (const [url, entry] of Object.entries(registry.urls)) {
  const { sitemap_lastmod: lastmod, status } = entry;
  if (status !== 'tracked' || !lastmod) continue;

  for (const refFile of entry.reference_files) {
    const fileDate = parseLastUpdated(refFile);
    // A file without a header date compares as older than any lastmod.
    const baseline = fileDate || '0000-01-01';

    if (lastmod > baseline) {
      let record = fileChanges.get(refFile);
      if (record === undefined) {
        record = { changedUrls: [], newestChange: lastmod, fileDate };
        fileChanges.set(refFile, record);
      }
      record.changedUrls.push({ url, sitemap_lastmod: lastmod });
      if (lastmod > record.newestChange) {
        record.newestChange = lastmod;
      }
    }
  }
}

// Turn each grouped record into a report entry. The skill and category are
// taken from the 2nd and 4th segments of the '/'-separated path.
const files = Array.from(fileChanges, ([path, changes]) => {
  const [, skill, , category] = path.split('/');
  return {
    path,
    priority: getFilePriority(path),
    file_last_updated: changes.fileDate || 'unknown',
    newest_source_change: changes.newestChange,
    changed_url_count: changes.changedUrls.length,
    changed_urls: changes.changedUrls.map(({ url }) => url),
    skill: skill || 'unknown',
    category: category || 'unknown',
  };
});

// Order by priority rank; within a rank, the most recent source change first.
files.sort(
  (a, b) =>
    PRIORITY_ORDER[a.priority] - PRIORITY_ORDER[b.priority] ||
    b.newest_source_change.localeCompare(a.newest_source_change),
);

// Tally report entries per priority.
const byPriority = { critical: 0, high: 0, medium: 0, low: 0 };
for (const { priority } of files) {
  byPriority[priority] += 1;
}

const [today] = new Date().toISOString().split('T');
const registryEntries = Object.values(registry.urls);
const countWithStatus = (wanted) => registryEntries.filter((u) => u.status === wanted).length;

const report = {
  generated_at: today,
  last_poll: registry.last_poll,
  total_tracked: countWithStatus('tracked'),
  total_not_in_sitemap: countWithStatus('not_in_sitemap'),
  total_files_needing_update: files.length,
  by_priority: byPriority,
  files,
};

saveReport('change-report.json', report, DATA_DIR);

if (jsonOnly) {
  // Machine-readable mode: the JSON document is the only thing printed here.
  process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);
} else {
  const totalUrls = registryEntries.length;

  console.log(`\n=== KB Change Report (${report.generated_at}) ===`);
  console.log(`Sources last polled: ${registry.last_poll}`);
  console.log(`URLs tracked: ${report.total_tracked}/${totalUrls} (${report.total_not_in_sitemap} not in sitemap)`);
  console.log(`Files needing update: ${files.length} (Critical: ${byPriority.critical}, High: ${byPriority.high}, Medium: ${byPriority.medium}, Low: ${byPriority.low})`);

  if (files.length > 0) {
    console.log('\nTop 20 by priority:');
    files.slice(0, 20).forEach((f) => {
      console.log(` [${f.priority.toUpperCase()}] ${f.path}`);
      console.log(` ${f.changed_url_count} source(s) changed. Latest: ${f.newest_source_change}. File: ${f.file_last_updated}`);
    });
    if (files.length > 20) {
      console.log(` ... and ${files.length - 20} more`);
    }
  }

  console.log('\nRun: /architect:generate-skills --update to process updates');
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue