#!/usr/bin/env node
// poll-sitemaps.mjs — Poll Microsoft Learn sitemaps for lastmod changes.
// Updates url-registry.json with current sitemap_lastmod values.
// Usage: node poll-sitemaps.mjs [--force] [--verbose]
//   --force    poll every target child sitemap, even if its lastmod is unchanged
//   --verbose  also log the child sitemaps that were skipped

import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

import { normalizeUrl } from './lib/url-normalize.mjs';
import { loadRegistry, saveRegistry } from './lib/registry-io.mjs';
import { fetchSitemapIndex, streamSitemap } from './lib/sitemap-stream.mjs';

const __dirname = dirname(fileURLToPath(import.meta.url));
const DATA_DIR = join(__dirname, 'data');

const force = process.argv.includes('--force');
const verbose = process.argv.includes('--verbose');

// Target child sitemaps — covers all URL path prefixes in the registry.
// Derived from analyzing which sitemaps contain our 1342 tracked URLs.
const TARGET_PREFIXES = [
  'azure_en-us_',
  'microsoft-copilot-studio_en-us_',
  'security_en-us_',
  'fabric_en-us_',
  'power-platform_en-us_',
  'ai_en-us_',
  'copilot_en-us_',
  'compliance_en-us_',
  'agent-framework_en-us_',
  'semantic-kernel_en-us_',
  'entra_en-us_',
  'purview_en-us_',
  'microsoftteams_en-us_',
  'sharepoint_en-us_',
  'microsoft-365_en-us_',
  'training_en-us_',
  'cloud-computing_en-us_',
  'privacy_en-us_',
  // dotnet_en-us_ excluded: 75 sitemaps, only 12 matches. Not worth weekly polling.
  // To include it, add 'dotnet_en-us_' to this list. (--force only bypasses the
  // unchanged-lastmod skip below; it does not widen the prefix list.)
];

/**
 * Extract the child sitemap name from its URL.
 *
 * Example: https://learn.microsoft.com/_sitemaps/azure_en-us_7.xml → azure_en-us_7
 *
 * @param {string} loc - URL of a child sitemap.
 * @returns {string|null} File name without the `.xml` suffix, or null when
 *   `loc` does not end in `/_sitemaps/<name>.xml`.
 */
function extractChildName(loc) {
  const match = loc.match(/\/_sitemaps\/([^/]+)\.xml$/);
  return match ? match[1] : null;
}

/**
 * Tell whether a child sitemap is one this script polls.
 *
 * @param {string} childName - Name as returned by extractChildName.
 * @returns {boolean} True when the name starts with any entry of TARGET_PREFIXES.
 */
function isTargetChild(childName) {
  return TARGET_PREFIXES.some((prefix) => childName.startsWith(prefix));
}

/**
 * Stream one child sitemap and apply its entries to the registry.
 *
 * Every sitemap URL that normalizes to a registry key has its status set to
 * 'tracked', and its sitemap_lastmod is overwritten when the sitemap reports a
 * different (non-empty) lastmod. Registry entries are mutated in place.
 *
 * @param {{loc: string}} child - Child sitemap descriptor; only `loc` is read.
 * @param {Map<string, object>} urlIndex - Registry entries keyed by URL.
 * @returns {Promise<{matched: number, updated: number}>} How many sitemap URLs
 *   were found in the registry, and how many of those had their lastmod changed.
 * @throws Whatever streamSitemap throws; entries already processed stay updated.
 */
async function pollChild(child, urlIndex) {
  let matched = 0;
  let updated = 0;

  for await (const entry of streamSitemap(child.loc)) {
    const normalized = normalizeUrl(entry.loc);
    if (!normalized) continue;

    const registryEntry = urlIndex.get(normalized);
    if (!registryEntry) continue;

    matched++;

    // Update lastmod if changed
    if (entry.lastmod && registryEntry.sitemap_lastmod !== entry.lastmod) {
      registryEntry.sitemap_lastmod = entry.lastmod;
      updated++;
    }
    registryEntry.status = 'tracked';
  }

  return { matched, updated };
}

// --- Main ---

/**
 * Run one poll cycle: load the registry, poll the changed target child
 * sitemaps, reclassify URLs, save the registry and print a summary.
 *
 * A failure while polling a single child is logged and does not abort the
 * run; that child's state is not recorded, so it is polled again next time.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const registry = loadRegistry(DATA_DIR);

  // Tolerate a registry that has no sitemap_state section yet; without this
  // the first lookup below would throw.
  if (!registry.sitemap_state) registry.sitemap_state = {};

  const urlIndex = new Map(Object.entries(registry.urls));
  console.log(`Registry loaded: ${urlIndex.size} URLs`);

  // URLs that have never been checked can live in any child sitemap, including
  // ones whose lastmod is unchanged. Skipping those children would leave the
  // URLs unverified, so the skip is disabled while such URLs exist.
  let hasUnpolled = false;
  for (const entry of urlIndex.values()) {
    if (entry.status === 'unpolled') {
      hasUnpolled = true;
      break;
    }
  }
  const pollAll = force || hasUnpolled;
  if (hasUnpolled && !force) {
    console.log('Registry contains unpolled URLs; polling all target children');
  }

  // Step 1: Fetch sitemap index
  console.log('Fetching sitemap index...');
  const indexEntries = await fetchSitemapIndex();
  console.log(`Sitemap index: ${indexEntries.length} child sitemaps found`);

  // Step 2: Filter to target children
  const targetChildren = indexEntries
    .map((e) => ({ ...e, name: extractChildName(e.loc) }))
    .filter((e) => e.name && isTargetChild(e.name));
  console.log(`Target children: ${targetChildren.length}`);

  let totalMatched = 0;
  let totalUpdated = 0;
  let childrenPolled = 0;
  let childrenSkipped = 0;
  let childrenFailed = 0;

  // Step 3: Poll each child
  for (const child of targetChildren) {
    const prevState = registry.sitemap_state[child.name];

    // Skip if unchanged since last check. A child without a lastmod is never
    // considered unchanged: two missing values comparing equal proves nothing.
    const unchanged =
      Boolean(child.lastmod) && Boolean(prevState) && prevState.lastmod === child.lastmod;
    if (!pollAll && unchanged) {
      if (verbose) console.log(` Skipping ${child.name} (unchanged since ${child.lastmod})`);
      childrenSkipped++;
      continue;
    }

    console.log(` Polling ${child.name} (lastmod: ${child.lastmod})...`);
    childrenPolled++;

    let result;
    try {
      result = await pollChild(child, urlIndex);
    } catch (err) {
      // Best effort: keep going with the other children. No state is recorded
      // for this child, so the next run retries it.
      console.error(` ERROR polling ${child.name}: ${err.message}`);
      childrenFailed++;
      continue;
    }

    console.log(` Matched: ${result.matched}, Updated: ${result.updated}`);
    totalMatched += result.matched;
    totalUpdated += result.updated;

    // Record child state
    registry.sitemap_state[child.name] = {
      lastmod: child.lastmod,
      checked_at: new Date().toISOString(),
    };
  }

  // Step 4: Mark remaining unpolled URLs.
  // Only conclude "not in sitemap" when every target child was actually read
  // in this run; otherwise the URL may sit in a sitemap we did not see.
  const coverageComplete = childrenSkipped === 0 && childrenFailed === 0;
  let stillUnpolled = 0;
  for (const entry of Object.values(registry.urls)) {
    if (entry.status !== 'unpolled') continue;
    if (coverageComplete) {
      entry.status = 'not_in_sitemap';
    } else {
      stillUnpolled++;
    }
  }

  // Step 5: Save
  registry.last_poll = new Date().toISOString();
  saveRegistry(registry, DATA_DIR);

  // Summary — both counts are registry-wide totals so they can be compared.
  let tracked = 0;
  let notInSitemap = 0;
  for (const entry of Object.values(registry.urls)) {
    if (entry.status === 'tracked') tracked++;
    else if (entry.status === 'not_in_sitemap') notInSitemap++;
  }
  // Avoid printing "NaN%" for an empty registry.
  const matchRate = urlIndex.size === 0 ? 0 : (tracked / urlIndex.size) * 100;

  console.log('\n=== Poll Summary ===');
  console.log(`Children polled: ${childrenPolled}, skipped: ${childrenSkipped}`);
  if (childrenFailed > 0) {
    console.log(`Children failed: ${childrenFailed}`);
  }
  console.log(`URLs matched: ${totalMatched}, lastmod updated: ${totalUpdated}`);
  console.log(`Registry: ${tracked} tracked, ${notInSitemap} not in sitemap`);
  if (stillUnpolled > 0) {
    console.log(`Still unpolled: ${stillUnpolled} (not every child sitemap could be read)`);
  }
  console.log(`Match rate: ${matchRate.toFixed(1)}%`);
}

main().catch((err) => {
  console.error('Fatal error:', err.message);
  process.exit(1);
});