ktg-plugin-marketplace/plugins/ms-ai-architect/scripts/kb-update/poll-sitemaps.mjs
Kjell Tore Guttormsen f968f37be3 feat(ms-ai-architect): sitemap-based KB change detection system
Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps
weekly to detect when source documentation changes. Replaces the broken
mtime-based staleness check (all files had identical mtime after release).

Components:
- build-registry.mjs: extracts 1342 URLs from 387 reference files
- poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry
- report-changes.mjs: prioritized change report (critical/high/medium/low)
- discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered
- run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run

Integration:
- session-start hook reads change-report.json instead of broken mtime check
- hook triggers background poll if >7 days since last check
- generate-skills --update reads change report for targeted MCP updates

Current stats: 69% match rate (924/1342 URLs tracked via sitemaps).
~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-09 21:19:51 +02:00

154 lines
4.9 KiB
JavaScript

#!/usr/bin/env node
// poll-sitemaps.mjs — Poll Microsoft Learn sitemaps for lastmod changes.
// Updates url-registry.json with current sitemap_lastmod values.
// Usage: node poll-sitemaps.mjs [--force] [--verbose]
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { normalizeUrl } from './lib/url-normalize.mjs';
import { loadRegistry, saveRegistry } from './lib/registry-io.mjs';
import { fetchSitemapIndex, streamSitemap } from './lib/sitemap-stream.mjs';
// ES modules have no built-in __dirname; rebuild it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Directory handed to loadRegistry/saveRegistry.
const DATA_DIR = join(__dirname, 'data');
// CLI switches: --force bypasses the unchanged-sitemap skip, --verbose logs each skipped child.
const [force, verbose] = ['--force', '--verbose'].map((flag) => process.argv.includes(flag));
// Name prefixes of the child sitemaps that get polled. A child sitemap is a
// target when its file name (without ".xml") starts with one of these.
// Derived from analyzing which sitemaps contain our 1342 tracked URLs.
// Frozen so the shared list cannot be modified by accident at runtime.
const TARGET_PREFIXES = Object.freeze([
  'azure_en-us_',
  'microsoft-copilot-studio_en-us_',
  'security_en-us_',
  'fabric_en-us_',
  'power-platform_en-us_',
  'ai_en-us_',
  'copilot_en-us_',
  'compliance_en-us_',
  'agent-framework_en-us_',
  'semantic-kernel_en-us_',
  'entra_en-us_',
  'purview_en-us_',
  'microsoftteams_en-us_',
  'sharepoint_en-us_',
  'microsoft-365_en-us_',
  'training_en-us_',
  'cloud-computing_en-us_',
  'privacy_en-us_',
  // dotnet_en-us_ excluded: 75 sitemaps, only 12 matches. Not worth weekly polling.
  // To poll them, add 'dotnet_en-us_' to this list. Note that --force does not
  // change which prefixes are targeted; it only disables the unchanged-child skip.
]);
/**
 * Derive a child sitemap's name from its URL.
 *
 * Example: https://learn.microsoft.com/_sitemaps/azure_en-us_7.xml → azure_en-us_7
 *
 * @param {string} loc - Sitemap URL taken from a sitemap index entry.
 * @returns {string|null} The file name without ".xml", or null when `loc` is
 *   not a string or does not end in "/_sitemaps/<name>.xml".
 */
function extractChildName(loc) {
  // A malformed index entry may carry no <loc>; treat it like any other
  // unrecognized URL instead of throwing and aborting the whole poll.
  if (typeof loc !== 'string') return null;
  const match = /\/_sitemaps\/([^/]+)\.xml$/.exec(loc);
  return match ? match[1] : null;
}
/**
 * Tell whether a child sitemap should be polled.
 *
 * @param {string} childName - Child sitemap name, e.g. "azure_en-us_7".
 * @returns {boolean} True when the name starts with any entry of TARGET_PREFIXES.
 */
function isTargetChild(childName) {
  for (const prefix of TARGET_PREFIXES) {
    if (childName.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
// --- Main ---
/**
 * Stream a single child sitemap and fold its lastmod values into the registry.
 *
 * Every streamed URL that normalizes to a registry key counts as matched and
 * is marked `tracked`; its `sitemap_lastmod` is overwritten when the sitemap
 * reports a different, non-empty value. Entries are mutated in place, so an
 * error part-way through the stream leaves the earlier updates applied.
 *
 * @param {string} loc - URL of the child sitemap, passed to streamSitemap.
 * @param {Map<string, object>} urlIndex - Registry entries keyed by URL.
 * @returns {Promise<{matched: number, updated: number}>} Per-child counters.
 */
async function pollChildSitemap(loc, urlIndex) {
  let matched = 0;
  let updated = 0;
  for await (const entry of streamSitemap(loc)) {
    const normalized = normalizeUrl(entry.loc);
    if (!normalized) continue;
    const registryEntry = urlIndex.get(normalized);
    if (!registryEntry) continue;
    matched++;
    // Update lastmod if changed
    if (entry.lastmod && registryEntry.sitemap_lastmod !== entry.lastmod) {
      registryEntry.sitemap_lastmod = entry.lastmod;
      updated++;
    }
    registryEntry.status = 'tracked';
  }
  return { matched, updated };
}

/**
 * Poll the target child sitemaps and persist refreshed lastmod data.
 *
 * Steps: load the registry, fetch the sitemap index, keep the children that
 * isTargetChild accepts, stream every child that needs checking, settle the
 * status of URLs no sitemap mentioned, save the registry, print a summary.
 *
 * A child is skipped only when all of the following hold: --force is absent,
 * the index reports a lastmod equal to the one stored by the previous poll,
 * and no registry URL is still `unpolled`. A child that fails to stream keeps
 * its previous sitemap_state, so the next run retries it.
 *
 * @returns {Promise<void>} Resolves after the registry is saved and the
 *   summary is printed; rejects if loading, index fetching or saving fails.
 */
async function main() {
  const registry = loadRegistry(DATA_DIR);
  // Tolerate a registry that carries no per-child state yet.
  if (!registry.sitemap_state) registry.sitemap_state = {};
  const urlIndex = new Map(Object.entries(registry.urls));
  console.log(`Registry loaded: ${urlIndex.size} URLs`);

  // Step 1: Fetch sitemap index
  console.log('Fetching sitemap index...');
  const indexEntries = await fetchSitemapIndex();
  console.log(`Sitemap index: ${indexEntries.length} child sitemaps found`);

  // Step 2: Filter to target children
  const targetChildren = indexEntries
    .map((e) => ({ ...e, name: extractChildName(e.loc) }))
    .filter((e) => e.name && isTargetChild(e.name));
  console.log(`Target children: ${targetChildren.length}`);

  // URLs that are still `unpolled` have never been compared against any
  // sitemap, so "this child is unchanged since the last poll" says nothing
  // about them. Disable the skip until every URL has been through one pass.
  const hasUnpolled = [...urlIndex.values()].some((entry) => entry.status === 'unpolled');
  if (hasUnpolled && !force && verbose) {
    console.log('Registry has unpolled URLs; polling every target child');
  }

  let totalMatched = 0;
  let totalUpdated = 0;
  let childrenPolled = 0;
  let childrenSkipped = 0;
  let childrenFailed = 0;

  // Step 3: Poll each child
  for (const child of targetChildren) {
    const prevState = registry.sitemap_state[child.name];
    // A child whose index entry has no lastmod can never be proven unchanged;
    // without this check `undefined === undefined` would skip it forever.
    const unchanged = Boolean(child.lastmod && prevState && prevState.lastmod === child.lastmod);
    // Skip if unchanged since last check (unless --force)
    if (!force && !hasUnpolled && unchanged) {
      if (verbose) console.log(` Skipping ${child.name} (unchanged since ${child.lastmod})`);
      childrenSkipped++;
      continue;
    }
    console.log(` Polling ${child.name} (lastmod: ${child.lastmod})...`);
    childrenPolled++;
    let counts;
    try {
      counts = await pollChildSitemap(child.loc, urlIndex);
    } catch (err) {
      // sitemap_state is deliberately left untouched so the next run retries.
      console.error(` ERROR polling ${child.name}: ${err.message}`);
      childrenFailed++;
      continue;
    }
    console.log(` Matched: ${counts.matched}, Updated: ${counts.updated}`);
    totalMatched += counts.matched;
    totalUpdated += counts.updated;
    // Record child state
    registry.sitemap_state[child.name] = {
      lastmod: child.lastmod,
      checked_at: new Date().toISOString(),
    };
  }

  // Step 4: Mark remaining unpolled URLs. "Not in sitemap" is only a valid
  // conclusion when every target child was actually streamed to the end in
  // this run; otherwise the URL may live in a child that was not read.
  const allEntries = Object.values(registry.urls);
  const fullPass = targetChildren.length > 0 && childrenSkipped === 0 && childrenFailed === 0;
  if (fullPass) {
    for (const entry of allEntries) {
      if (entry.status === 'unpolled') entry.status = 'not_in_sitemap';
    }
  }

  // Step 5: Save
  registry.last_poll = new Date().toISOString();
  saveRegistry(registry, DATA_DIR);

  // Summary: all registry figures are totals over the whole registry, so they
  // stay meaningful on runs where nothing was newly marked.
  const countStatus = (status) => allEntries.filter((u) => u.status === status).length;
  const tracked = countStatus('tracked');
  const notInSitemap = countStatus('not_in_sitemap');
  const stillUnpolled = countStatus('unpolled');
  // Avoid printing "NaN%" for an empty registry.
  const matchRate = urlIndex.size > 0 ? (tracked / urlIndex.size) * 100 : 0;
  console.log('\n=== Poll Summary ===');
  console.log(`Children polled: ${childrenPolled}, skipped: ${childrenSkipped}`);
  if (childrenFailed > 0) console.log(`Children failed: ${childrenFailed}`);
  console.log(`URLs matched: ${totalMatched}, lastmod updated: ${totalUpdated}`);
  console.log(`Registry: ${tracked} tracked, ${notInSitemap} not in sitemap`);
  if (stillUnpolled > 0) console.log(`Still unpolled: ${stillUnpolled} (incomplete pass; will retry next run)`);
  console.log(`Match rate: ${matchRate.toFixed(1)}%`);
}
// Script entry point: run the poll; any rejection is reported and turned
// into exit code 1.
(async () => {
  try {
    await main();
  } catch (err) {
    console.error('Fatal error:', err.message);
    process.exit(1);
  }
})();