Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps weekly to detect when source documentation changes. Replaces the broken mtime-based staleness check (all files had identical mtime after release).

Components:
- build-registry.mjs: extracts 1342 URLs from 387 reference files
- poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry
- report-changes.mjs: prioritized change report (critical/high/medium/low)
- discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered
- run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run

Integration:
- session-start hook reads change-report.json instead of broken mtime check
- hook triggers background poll if >7 days since last check
- generate-skills --update reads change report for targeted MCP updates

Current stats: 69% match rate (924/1342 URLs tracked via sitemaps). ~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
154 lines
4.9 KiB
JavaScript
154 lines
4.9 KiB
JavaScript
#!/usr/bin/env node
|
|
// poll-sitemaps.mjs — Poll Microsoft Learn sitemaps for lastmod changes.
|
|
// Updates url-registry.json with current sitemap_lastmod values.
|
|
// Usage: node poll-sitemaps.mjs [--force] [--verbose]
|
|
|
|
import { dirname, join } from 'node:path';
|
|
import { fileURLToPath } from 'node:url';
|
|
import { normalizeUrl } from './lib/url-normalize.mjs';
|
|
import { loadRegistry, saveRegistry } from './lib/registry-io.mjs';
|
|
import { fetchSitemapIndex, streamSitemap } from './lib/sitemap-stream.mjs';
|
|
|
|
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));

// The `data` directory that sits next to this script.
const DATA_DIR = join(__dirname, 'data');

// Command-line switches: each is true when the flag appears anywhere in argv.
const [force, verbose] = ['--force', '--verbose'].map((flag) =>
  process.argv.includes(flag),
);
|
|
|
|
// Child sitemaps worth polling, identified by the prefix of their file name.
// The list covers all URL path prefixes in the registry and was derived from
// analyzing which sitemaps contain our 1342 tracked URLs.
//
// dotnet is deliberately left out: 75 sitemaps, only 12 matches — not worth
// weekly polling. To poll it, add 'dotnet' to the slugs below (the --force
// flag does not change which children are targeted).
const TARGET_PREFIXES = [
  'azure',
  'microsoft-copilot-studio',
  'security',
  'fabric',
  'power-platform',
  'ai',
  'copilot',
  'compliance',
  'agent-framework',
  'semantic-kernel',
  'entra',
  'purview',
  'microsoftteams',
  'sharepoint',
  'microsoft-365',
  'training',
  'cloud-computing',
  'privacy',
].map((product) => `${product}_en-us_`);
|
|
|
|
/**
 * Derive a child sitemap's short name from its URL.
 *
 * Example:
 *   https://learn.microsoft.com/_sitemaps/azure_en-us_7.xml → azure_en-us_7
 *
 * @param {string} loc - Child sitemap URL taken from a sitemap index entry.
 * @returns {string|null} The file name without its `.xml` extension, or null
 *   when `loc` is not a string or does not end in `/_sitemaps/<name>.xml`.
 */
function extractChildName(loc) {
  // An index entry without a usable <loc> must yield null rather than throw,
  // so a single malformed entry cannot abort the caller's whole run.
  if (typeof loc !== 'string') return null;

  const match = loc.match(/\/_sitemaps\/([^/]+)\.xml$/);
  return match ? match[1] : null;
}
|
|
|
|
/**
 * Tell whether a child sitemap is one of the sitemaps this script polls.
 *
 * @param {string} childName - Short sitemap name, e.g. `azure_en-us_7`.
 * @returns {boolean} True when `childName` begins with any entry of
 *   TARGET_PREFIXES, false otherwise.
 */
function isTargetChild(childName) {
  for (const prefix of TARGET_PREFIXES) {
    if (childName.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
|
|
|
|
// --- Main ---
|
|
/**
 * Poll the targeted child sitemaps and refresh the URL registry.
 *
 * Flow: load the registry, fetch the sitemap index, keep the children whose
 * name starts with a TARGET_PREFIXES entry, stream every child that needs a
 * scan, classify URLs that were never found, save the registry and print a
 * summary.
 *
 * A child is skipped when its index `lastmod` equals the value recorded in
 * `registry.sitemap_state`, unless `--force` was given or the registry still
 * contains URLs with status 'unpolled'. Such URLs can only be classified by
 * looking at every target child, so the shortcut is disabled while any exist.
 *
 * A child that fails to download is logged and its state is not recorded, so
 * the next run retries it. Errors from loading the registry, fetching the
 * index or saving the registry are not caught here and reject the promise.
 *
 * @returns {Promise<void>} Resolves once the registry is saved and the summary
 *   has been printed.
 */
async function main() {
  const registry = loadRegistry(DATA_DIR);

  // NOTE(review): presumably loadRegistry always provides sitemap_state; this
  // guard only keeps the lookups below from throwing if it does not.
  if (!registry.sitemap_state) registry.sitemap_state = {};

  const urlIndex = new Map(Object.entries(registry.urls));
  console.log(`Registry loaded: ${urlIndex.size} URLs`);

  const hasUnpolled = Object.values(registry.urls).some(
    (entry) => entry.status === 'unpolled',
  );
  const scanAll = force || hasUnpolled;
  if (verbose && hasUnpolled && !force) {
    console.log(' Registry has unpolled URLs; scanning every target child');
  }

  // Step 1: Fetch sitemap index
  console.log('Fetching sitemap index...');
  const indexEntries = await fetchSitemapIndex();
  console.log(`Sitemap index: ${indexEntries.length} child sitemaps found`);

  // Step 2: Filter to target children
  const targetChildren = indexEntries
    .map((e) => ({ ...e, name: extractChildName(e.loc) }))
    .filter((e) => e.name && isTargetChild(e.name));

  console.log(`Target children: ${targetChildren.length}`);

  let totalMatched = 0;
  let totalUpdated = 0;
  let childrenPolled = 0;
  let childrenSkipped = 0;
  let childrenFailed = 0;

  // Step 3: Poll each child
  for (const child of targetChildren) {
    const prevState = registry.sitemap_state[child.name];

    // Skip if unchanged since last check (unless a full scan is required)
    if (!scanAll && prevState && prevState.lastmod === child.lastmod) {
      if (verbose) console.log(` Skipping ${child.name} (unchanged since ${child.lastmod})`);
      childrenSkipped++;
      continue;
    }

    console.log(` Polling ${child.name} (lastmod: ${child.lastmod})...`);
    childrenPolled++;

    let result;
    try {
      result = await pollChildSitemap(child.loc, urlIndex);
    } catch (err) {
      // Leave sitemap_state untouched so this child is retried next run.
      console.error(` ERROR polling ${child.name}: ${err.message}`);
      childrenFailed++;
      continue;
    }

    console.log(` Matched: ${result.matched}, Updated: ${result.updated}`);
    totalMatched += result.matched;
    totalUpdated += result.updated;

    // Record child state
    registry.sitemap_state[child.name] = {
      lastmod: child.lastmod,
      checked_at: new Date().toISOString(),
    };
  }

  // Step 4: Mark remaining unpolled URLs.
  // Only safe when every target child was read: a URL that lives in a child
  // that failed to download is not known to be missing from the sitemaps.
  let notInSitemap = 0;
  if (childrenFailed === 0) {
    for (const entry of Object.values(registry.urls)) {
      if (entry.status === 'unpolled') {
        entry.status = 'not_in_sitemap';
        notInSitemap++;
      }
    }
  } else if (hasUnpolled) {
    console.error(
      ` ${childrenFailed} child sitemap(s) failed; unpolled URLs left unclassified until the next run`,
    );
  }

  // Step 5: Save
  registry.last_poll = new Date().toISOString();
  saveRegistry(registry, DATA_DIR);

  // Summary
  const tracked = Object.values(registry.urls).filter((u) => u.status === 'tracked').length;
  // An empty registry would otherwise print "NaN%".
  const matchRate = urlIndex.size === 0 ? 0 : (tracked / urlIndex.size) * 100;

  console.log('\n=== Poll Summary ===');
  console.log(`Children polled: ${childrenPolled}, skipped: ${childrenSkipped}`);
  if (childrenFailed > 0) console.log(`Children failed: ${childrenFailed}`);
  console.log(`URLs matched: ${totalMatched}, lastmod updated: ${totalUpdated}`);
  console.log(`Registry: ${tracked} tracked, ${notInSitemap} not in sitemap`);
  console.log(`Match rate: ${matchRate.toFixed(1)}%`);
}

/**
 * Stream one child sitemap and apply its entries to the registry.
 *
 * Every streamed entry whose normalized URL is a key of `urlIndex` has its
 * registry entry marked 'tracked'; `sitemap_lastmod` is overwritten when the
 * sitemap supplies a different, non-empty `lastmod`. Registry entries are
 * mutated in place, so changes made before a mid-stream error are kept.
 *
 * @param {string} loc - URL of the child sitemap to stream.
 * @param {Map<string, object>} urlIndex - Registry entries keyed by URL.
 * @returns {Promise<{matched: number, updated: number}>} How many streamed
 *   entries matched the registry and how many of those changed `lastmod`.
 */
async function pollChildSitemap(loc, urlIndex) {
  let matched = 0;
  let updated = 0;

  for await (const entry of streamSitemap(loc)) {
    const normalized = normalizeUrl(entry.loc);
    if (!normalized) continue;

    const registryEntry = urlIndex.get(normalized);
    if (!registryEntry) continue;

    matched++;

    // Update lastmod if changed
    if (entry.lastmod && registryEntry.sitemap_lastmod !== entry.lastmod) {
      registryEntry.sitemap_lastmod = entry.lastmod;
      updated++;
    }
    registryEntry.status = 'tracked';
  }

  return { matched, updated };
}
|
|
|
|
// Script entry point: run the poll; if it rejects, print the error message
// and terminate with exit status 1.
main().then(undefined, (error) => {
  console.error('Fatal error:', error.message);
  process.exit(1);
});
|