feat(ms-ai-architect): sitemap-based KB change detection system
Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps weekly to detect when source documentation changes. Replaces the broken mtime-based staleness check (all files had identical mtime after release).

Components:
- build-registry.mjs: extracts 1342 URLs from 387 reference files
- poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry
- report-changes.mjs: prioritized change report (critical/high/medium/low)
- discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered
- run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run

Integration:
- session-start hook reads change-report.json instead of broken mtime check
- hook triggers background poll if >7 days since last check
- generate-skills --update reads change report for targeted MCP updates

Current stats: 69% match rate (924/1342 URLs tracked via sitemaps). ~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
035255fc5d
commit
f968f37be3
13 changed files with 976 additions and 59 deletions
154
plugins/ms-ai-architect/scripts/kb-update/poll-sitemaps.mjs
Normal file
154
plugins/ms-ai-architect/scripts/kb-update/poll-sitemaps.mjs
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
#!/usr/bin/env node
|
||||
// poll-sitemaps.mjs — Poll Microsoft Learn sitemaps for lastmod changes.
|
||||
// Updates url-registry.json with current sitemap_lastmod values.
|
||||
// Usage: node poll-sitemaps.mjs [--force] [--verbose]
|
||||
|
||||
import { dirname, join } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { normalizeUrl } from './lib/url-normalize.mjs';
|
||||
import { loadRegistry, saveRegistry } from './lib/registry-io.mjs';
|
||||
import { fetchSitemapIndex, streamSitemap } from './lib/sitemap-stream.mjs';
|
||||
|
||||
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Directory (next to this script) passed to loadRegistry/saveRegistry.
const DATA_DIR = join(__dirname, 'data');
// CLI switches: --force disables the unchanged-sitemap skip, --verbose logs skipped children.
const [force, verbose] = ['--force', '--verbose'].map((flag) => process.argv.includes(flag));
|
||||
|
||||
// Name prefixes of the child sitemaps that are polled. A child sitemap is
// polled when its name (e.g. "azure_en-us_7") starts with one of these.
// Derived from analyzing which sitemaps contain our 1342 tracked URLs.
// Frozen so the module-level list cannot be mutated by accident.
const TARGET_PREFIXES = Object.freeze([
  'azure_en-us_',
  'microsoft-copilot-studio_en-us_',
  'security_en-us_',
  'fabric_en-us_',
  'power-platform_en-us_',
  'ai_en-us_',
  'copilot_en-us_',
  'compliance_en-us_',
  'agent-framework_en-us_',
  'semantic-kernel_en-us_',
  'entra_en-us_',
  'purview_en-us_',
  'microsoftteams_en-us_',
  'sharepoint_en-us_',
  'microsoft-365_en-us_',
  'training_en-us_',
  'cloud-computing_en-us_',
  'privacy_en-us_',
  // dotnet_en-us_ excluded: 75 sitemaps, only 12 matches. Not worth weekly polling.
  // To poll it, add 'dotnet_en-us_' to this list. The --force flag does NOT
  // re-enable it: --force only bypasses the unchanged-lastmod skip in main().
]);
|
||||
|
||||
/**
 * Extract the child sitemap name from its URL.
 *
 * Example: https://learn.microsoft.com/_sitemaps/azure_en-us_7.xml → azure_en-us_7
 *
 * @param {string} loc - URL of a child sitemap taken from the sitemap index.
 * @returns {string|null} The file name without the `.xml` extension, or null
 *   when `loc` is not a string or is not a `/_sitemaps/<name>.xml` URL.
 */
function extractChildName(loc) {
  // An index entry without a <loc> must not crash the whole poll.
  if (typeof loc !== 'string') return null;
  // Text extracted from XML may carry surrounding whitespace/newlines, which
  // would otherwise defeat the `$` anchor.
  const match = loc.trim().match(/\/_sitemaps\/([^/]+)\.xml$/);
  return match ? match[1] : null;
}
|
||||
|
||||
/**
 * Tell whether a child sitemap is one of the sitemaps this script polls.
 *
 * @param {string} childName - Child sitemap name, e.g. "azure_en-us_7".
 * @returns {boolean} True when the name starts with any entry of TARGET_PREFIXES.
 */
function isTargetChild(childName) {
  for (const prefix of TARGET_PREFIXES) {
    if (childName.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
// --- Main ---
|
||||
/**
 * Poll the targeted child sitemaps and refresh the URL registry.
 *
 * Steps:
 *   1. Load the registry and fetch the sitemap index.
 *   2. Keep only the child sitemaps whose name matches TARGET_PREFIXES.
 *   3. Stream each child; for every entry whose normalized URL is in the
 *      registry, record its lastmod and mark it 'tracked'. A child whose index
 *      lastmod is unchanged since the previous poll is skipped, unless --force
 *      was given or the registry still contains 'unpolled' URLs.
 *   4. Mark URLs that are still 'unpolled' as 'not_in_sitemap' — only when
 *      every target child was actually read.
 *   5. Save the registry and print a summary.
 *
 * A failure while reading one child is logged and does not abort the run;
 * that child's state is not recorded, so it is retried on the next poll.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const registry = loadRegistry(DATA_DIR);
  // Defensive: tolerate a registry that carries no poll state yet.
  if (!registry.sitemap_state) {
    registry.sitemap_state = {};
  }
  // Lookup table keyed by the registry's URL keys; values are the live
  // registry entries, so mutating them updates `registry.urls` in place.
  const urlIndex = new Map(Object.entries(registry.urls));

  console.log(`Registry loaded: ${urlIndex.size} URLs`);

  // A URL still marked 'unpolled' has never been looked up in a sitemap.
  // Skipping "unchanged" children would leave it unchecked and Step 4 would
  // then wrongly classify it as absent, so poll every child in that case.
  const hasUnpolled = [...urlIndex.values()].some((e) => e.status === 'unpolled');
  const pollAll = force || hasUnpolled;
  if (verbose && hasUnpolled && !force) {
    console.log('Unpolled URLs present: polling all target children');
  }

  // Step 1: Fetch sitemap index
  console.log('Fetching sitemap index...');
  const indexEntries = await fetchSitemapIndex();
  console.log(`Sitemap index: ${indexEntries.length} child sitemaps found`);

  // Step 2: Filter to target children
  const targetChildren = indexEntries
    .map((e) => ({ ...e, name: extractChildName(e.loc) }))
    .filter((e) => e.name && isTargetChild(e.name));

  console.log(`Target children: ${targetChildren.length}`);

  let totalMatched = 0;
  let totalUpdated = 0;
  let childrenPolled = 0;
  let childrenSkipped = 0;
  let childrenFailed = 0;

  // Step 3: Poll each child
  for (const child of targetChildren) {
    const prevState = registry.sitemap_state[child.name];

    // Skip if unchanged since last check (unless a full poll is required)
    if (!pollAll && prevState && prevState.lastmod === child.lastmod) {
      if (verbose) console.log(`  Skipping ${child.name} (unchanged since ${child.lastmod})`);
      childrenSkipped++;
      continue;
    }

    console.log(`  Polling ${child.name} (lastmod: ${child.lastmod})...`);
    childrenPolled++;

    let matchedInChild = 0;
    let updatedInChild = 0;

    try {
      for await (const entry of streamSitemap(child.loc)) {
        const normalized = normalizeUrl(entry.loc);
        if (!normalized) continue;

        const registryEntry = urlIndex.get(normalized);
        if (!registryEntry) continue;

        matchedInChild++;

        // Update lastmod if changed
        if (entry.lastmod && registryEntry.sitemap_lastmod !== entry.lastmod) {
          registryEntry.sitemap_lastmod = entry.lastmod;
          updatedInChild++;
        }
        registryEntry.status = 'tracked';
      }
    } catch (err) {
      console.error(`  ERROR polling ${child.name}: ${err.message}`);
      // Do not record child state: the child must be retried next time.
      childrenFailed++;
      continue;
    }

    console.log(`    Matched: ${matchedInChild}, Updated: ${updatedInChild}`);
    totalMatched += matchedInChild;
    totalUpdated += updatedInChild;

    // Record child state
    registry.sitemap_state[child.name] = {
      lastmod: child.lastmod,
      checked_at: new Date().toISOString(),
    };
  }

  // Step 4: Mark remaining unpolled URLs.
  // Only conclusive when every target child was read: if the index yielded no
  // targets or a child failed, an unmatched URL may simply not have been seen.
  const pollComplete = targetChildren.length > 0 && childrenFailed === 0;
  if (pollComplete) {
    for (const entry of Object.values(registry.urls)) {
      if (entry.status === 'unpolled') {
        entry.status = 'not_in_sitemap';
      }
    }
  } else {
    const stillUnpolled = Object.values(registry.urls)
      .filter((u) => u.status === 'unpolled').length;
    if (stillUnpolled > 0) {
      console.error(`  WARNING: incomplete poll, ${stillUnpolled} URLs left unpolled`);
    }
  }

  // Step 5: Save
  registry.last_poll = new Date().toISOString();
  saveRegistry(registry, DATA_DIR);

  // Summary — counts are registry-wide totals, not just this run's changes.
  const statuses = Object.values(registry.urls).map((u) => u.status);
  const tracked = statuses.filter((s) => s === 'tracked').length;
  const notInSitemap = statuses.filter((s) => s === 'not_in_sitemap').length;
  // Avoid printing "NaN%" for an empty registry.
  const matchRate = urlIndex.size > 0
    ? ((tracked / urlIndex.size) * 100).toFixed(1)
    : '0.0';

  console.log('\n=== Poll Summary ===');
  console.log(`Children polled: ${childrenPolled}, skipped: ${childrenSkipped}`);
  if (childrenFailed > 0) {
    console.log(`Children failed: ${childrenFailed}`);
  }
  console.log(`URLs matched: ${totalMatched}, lastmod updated: ${totalUpdated}`);
  console.log(`Registry: ${tracked} tracked, ${notInSitemap} not in sitemap`);
  console.log(`Match rate: ${matchRate}%`);
}
|
||||
|
||||
// Script entry point: run the poll; on any rejection print the message and
// exit with a non-zero status.
try {
  await main();
} catch (fatal) {
  console.error('Fatal error:', fatal.message);
  process.exit(1);
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue