feat(ms-ai-architect): sitemap-based KB change detection system
Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps
weekly to detect when source documentation changes. Replaces the broken
mtime-based staleness check (all files had identical mtime after release).

Components:
- build-registry.mjs: extracts 1342 URLs from 387 reference files
- poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry
- report-changes.mjs: prioritized change report (critical/high/medium/low)
- discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered
- run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run

Integration:
- session-start hook reads change-report.json instead of broken mtime check
- hook triggers background poll if >7 days since last check
- generate-skills --update reads change report for targeted MCP updates

Current stats: 69% match rate (924/1342 URLs tracked via sitemaps).
~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
035255fc5d
commit
f968f37be3
13 changed files with 976 additions and 59 deletions
87
plugins/ms-ai-architect/scripts/kb-update/build-registry.mjs
Normal file
87
plugins/ms-ai-architect/scripts/kb-update/build-registry.mjs
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
#!/usr/bin/env node
|
||||
// build-registry.mjs — Build URL registry from existing reference files.
|
||||
// Extracts all learn.microsoft.com URLs and maps them to their source reference files.
|
||||
// Usage: node build-registry.mjs [--merge]
|
||||
// --merge: preserve existing sitemap_lastmod data, only add new URLs
|
||||
|
||||
import { readdirSync, readFileSync, existsSync } from 'node:fs';
|
||||
import { join, relative, dirname } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { normalizeUrl, extractUrls } from './lib/url-normalize.mjs';
|
||||
import { loadRegistry, saveRegistry } from './lib/registry-io.mjs';
|
||||
|
||||
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Two levels above the directory containing this script.
const PLUGIN_ROOT = join(__dirname, '..', '..');
// Each sub-directory of skills/ is scanned for a references/ folder.
const SKILLS_DIR = join(PLUGIN_ROOT, 'skills');
// --merge: carry over data from the existing registry instead of starting fresh.
const merge = process.argv.includes('--merge');
|
||||
|
||||
/**
 * Recursively collect Markdown files below a directory.
 *
 * Files named exactly `SKILL.md` are skipped. A directory that does not exist
 * yields an empty list instead of throwing.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths (each built by joining `dir` with the entry names)
 *   of every matching `.md` file, in directory-listing order.
 */
function walkMd(dir) {
  const results = [];
  if (!existsSync(dir)) return results;

  for (const entry of readdirSync(dir, { withFileTypes: true })) {
    const full = join(dir, entry.name);
    if (entry.isDirectory()) {
      // Append one by one: spreading a large array into push() passes every
      // element as a call argument and can exceed the engine's argument limit.
      for (const nested of walkMd(full)) {
        results.push(nested);
      }
    } else if (entry.name.endsWith('.md') && entry.name !== 'SKILL.md') {
      results.push(full);
    }
  }

  return results;
}
|
||||
|
||||
// --- Main ---
// Scans <SKILLS_DIR>/<skill>/references for Markdown files, collects the URLs
// that extractUrls() returns for each file, and saves a registry that maps
// every URL to the reference files citing it.

// Fail with a readable message instead of a raw ENOENT stack from readdirSync.
if (!existsSync(SKILLS_DIR)) {
  console.error(`Skills directory not found: ${SKILLS_DIR}`);
  process.exit(1);
}

// The previous registry is only consulted with --merge; otherwise `existing`
// stays null and every `existing?.…` lookup below falls back to its default.
const existing = merge ? loadRegistry() : null;
// URL (as returned by extractUrls — presumably already normalized; verify in
// lib/url-normalize.mjs) → Set of file paths relative to PLUGIN_ROOT.
const urlToFiles = new Map();
let totalFiles = 0;

const skillDirs = readdirSync(SKILLS_DIR, { withFileTypes: true })
  .filter((d) => d.isDirectory())
  .map((d) => d.name);

for (const skill of skillDirs) {
  const refsDir = join(SKILLS_DIR, skill, 'references');
  // walkMd returns [] for skills that have no references/ directory.
  const files = walkMd(refsDir);

  for (const file of files) {
    totalFiles++;
    const content = readFileSync(file, 'utf8');
    const urls = extractUrls(content);
    const relPath = relative(PLUGIN_ROOT, file);

    for (const url of urls) {
      if (!urlToFiles.has(url)) urlToFiles.set(url, new Set());
      urlToFiles.get(url).add(relPath);
    }
  }
}

// Build registry
// Date portion (YYYY-MM-DD) of the current UTC timestamp.
const today = new Date().toISOString().split('T')[0];
const registry = {
  version: 1,
  // NOTE(review): reset to today on every run, including --merge — confirm intended.
  created_at: today,
  last_poll: existing?.last_poll || null,
  sitemap_state: existing?.sitemap_state || {},
  urls: {},
};

for (const [url, files] of urlToFiles) {
  // Entry from the previous registry, if any (always undefined without --merge).
  const prev = existing?.urls?.[url];
  registry.urls[url] = {
    sitemap_lastmod: prev?.sitemap_lastmod || null,
    // Sorted so the saved registry does not depend on directory-listing order.
    reference_files: [...files].sort(),
    status: prev?.status || 'unpolled',
  };
}

saveRegistry(registry);

// Stats
const multiRef = [...urlToFiles.values()].filter((s) => s.size > 1).length;
console.log(`Registry built: ${urlToFiles.size} unique URLs from ${totalFiles} files`);
console.log(`  URLs referenced by multiple files: ${multiRef}`);
if (merge && existing?.urls) {
  // URLs found in this scan that the previous registry did not contain.
  const newUrls = [...urlToFiles.keys()].filter((u) => !existing.urls[u]).length;
  console.log(`  New URLs added (merge): ${newUrls}`);
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue