#!/usr/bin/env node
// discover-new-urls.mjs — Find relevant Microsoft Learn pages not yet in the registry.
// Scans sitemaps for URLs matching relevance patterns, suggests skill/category mapping.
// Usage: node discover-new-urls.mjs [--limit N]

import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { normalizeUrl } from './lib/url-normalize.mjs';
import { loadRegistry, saveReport } from './lib/registry-io.mjs';
import { streamSitemap, fetchSitemapIndex } from './lib/sitemap-stream.mjs';

const __dirname = dirname(fileURLToPath(import.meta.url));
const DATA_DIR = join(__dirname, 'data');

// Relevance patterns — URL paths that indicate content this plugin should cover.
// Order matters: classifyUrl() returns the mapping of the FIRST matching rule.
const INCLUDE = [
  { pattern: /\/azure\/ai-foundry\//, skill: 'ms-ai-engineering', category: 'azure-ai-services' },
  { pattern: /\/azure\/ai-services\//, skill: 'ms-ai-engineering', category: 'azure-ai-services' },
  { pattern: /\/azure\/machine-learning\//, skill: 'ms-ai-engineering', category: 'mlops-genaiops' },
  { pattern: /\/azure\/search\//, skill: 'ms-ai-engineering', category: 'rag-architecture' },
  { pattern: /\/azure\/api-management\//, skill: 'ms-ai-engineering', category: 'api-management' },
  { pattern: /\/azure\/azure-monitor\//, skill: 'ms-ai-governance', category: 'monitoring-observability' },
  { pattern: /\/azure\/well-architected\//, skill: 'ms-ai-advisor', category: 'architecture' },
  { pattern: /\/microsoft-copilot-studio\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' },
  { pattern: /\/copilot\/microsoft-365\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' },
  { pattern: /\/microsoft-365-copilot\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' },
  { pattern: /\/security\/(?:ai|benchmark|engineering)\//, skill: 'ms-ai-security', category: 'ai-security-engineering' },
  { pattern: /\/azure\/defender-for-cloud\//, skill: 'ms-ai-security', category: 'ai-security-engineering' },
  { pattern: /\/purview\//, skill: 'ms-ai-governance', category: 'responsible-ai' },
  { pattern: /\/semantic-kernel\//, skill: 'ms-ai-engineering', category: 'agent-orchestration' },
  { pattern: /\/agent-framework\//, skill: 'ms-ai-engineering', category: 'agent-orchestration' },
  { pattern: /\/fabric\/(?:data-engineering|data-science|real-time-intelligence)\//, skill: 'ms-ai-engineering', category: 'data-engineering' },
  { pattern: /\/azure\/cosmos-db\//, skill: 'ms-ai-engineering', category: 'data-engineering' },
  { pattern: /\/azure\/databricks\//, skill: 'ms-ai-engineering', category: 'data-engineering' },
  { pattern: /\/entra\/(?:identity|agent)\//, skill: 'ms-ai-security', category: 'ai-security-engineering' },
];

// Exclude patterns — skip even if they match INCLUDE
const EXCLUDE = [
  /\/training\//,
  /\/credentials\//,
  /\/legal\//,
  /\/previous-versions\//,
  /\/archive\//,
  /\/samples\//,
  /\/release-notes?\//,
  /\/whats-new/,
  /\/changelog/,
  /\/migrate\//,
];

/**
 * Map a URL to its suggested skill/category.
 *
 * EXCLUDE is checked first and always wins; otherwise the first INCLUDE rule
 * whose pattern matches decides the result.
 *
 * @param {string} url - URL to classify.
 * @returns {{ skill: string, category: string } | null} The suggested mapping,
 *   or null when the URL is excluded or matches no INCLUDE rule.
 */
function classifyUrl(url) {
  if (EXCLUDE.some((p) => p.test(url))) return null;

  const rule = INCLUDE.find((r) => r.pattern.test(url));
  return rule ? { skill: rule.skill, category: rule.category } : null;
}

// Target sitemaps for discovery — same as poller, minus dotnet
const TARGET_PREFIXES = [
  'azure_en-us_',
  'microsoft-copilot-studio_en-us_',
  'security_en-us_',
  'fabric_en-us_',
  'power-platform_en-us_',
  'ai_en-us_',
  'copilot_en-us_',
  'compliance_en-us_',
  'agent-framework_en-us_',
  'semantic-kernel_en-us_',
  'entra_en-us_',
  'purview_en-us_',
];

/**
 * Extract the child sitemap name from a sitemap location.
 *
 * @param {string} loc - Sitemap URL expected to end in `/_sitemaps/<name>.xml`.
 * @returns {string | null} The `<name>` part, or null when `loc` has another shape.
 */
function extractChildName(loc) {
  const match = loc.match(/\/_sitemaps\/([^/]+)\.xml$/);
  return match ? match[1] : null;
}

/**
 * Read the optional `--limit N` argument.
 *
 * A missing or non-numeric value used to become NaN, which silently disabled
 * the limit (every `length >= NaN` check is false), and 0 or a negative value
 * still let one candidate through. Those inputs are now rejected up front.
 *
 * @param {string[]} argv - Process argument vector.
 * @returns {number} The positive integer limit, or Infinity when `--limit` is absent.
 * @throws {Error} When `--limit` is present without a positive integer value.
 */
function parseLimit(argv) {
  const flagIndex = argv.indexOf('--limit');
  if (flagIndex === -1) return Infinity;

  const raw = argv[flagIndex + 1];
  const value = Number.parseInt(raw, 10);
  if (!Number.isInteger(value) || value < 1) {
    throw new Error(`--limit requires a positive integer, got: ${raw ?? '(nothing)'}`);
  }
  return value;
}

// --- Main ---

/**
 * Scan the target sitemaps for URLs that are relevant but not yet in the
 * registry, then save and print a discovery report.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Validate arguments before any file or network access.
  const limit = parseLimit(process.argv);

  const registry = loadRegistry(DATA_DIR);
  const knownUrls = new Set(Object.keys(registry.urls));
  console.log(`Registry: ${knownUrls.size} known URLs`);

  console.log('Fetching sitemap index...');
  const indexEntries = await fetchSitemapIndex();

  // Resolve each child's name once and keep only the targeted sitemaps.
  const targetChildren = [];
  for (const entry of indexEntries) {
    const name = extractChildName(entry.loc);
    if (name && TARGET_PREFIXES.some((p) => name.startsWith(p))) {
      targetChildren.push({ loc: entry.loc, name });
    }
  }

  console.log(`Scanning ${targetChildren.length} sitemaps for new relevant URLs...`);

  const candidates = [];
  const bySkill = {};

  for (const { loc, name: childName } of targetChildren) {
    let foundInChild = 0;

    try {
      for await (const entry of streamSitemap(loc)) {
        const normalized = normalizeUrl(entry.loc);
        if (!normalized || knownUrls.has(normalized)) continue;

        const classification = classifyUrl(normalized);
        if (!classification) continue;

        candidates.push({
          url: normalized,
          lastmod: entry.lastmod,
          sitemap: childName,
          suggested_skill: classification.skill,
          suggested_category: classification.category,
        });
        knownUrls.add(normalized); // Prevent dupes across sitemaps
        bySkill[classification.skill] = (bySkill[classification.skill] || 0) + 1;
        foundInChild++;

        if (candidates.length >= limit) break;
      }
    } catch (err) {
      // Best effort: a failing sitemap is reported and skipped; candidates
      // collected from it before the failure are kept.
      console.error(` ERROR scanning ${childName}: ${err.message}`);
    }

    if (foundInChild > 0) {
      console.log(` ${childName}: ${foundInChild} new candidates`);
    }
    if (candidates.length >= limit) break;
  }

  // Sort by lastmod descending (newest first); entries without lastmod sort last.
  candidates.sort((a, b) => (b.lastmod || '').localeCompare(a.lastmod || ''));

  const report = {
    generated_at: new Date().toISOString().split('T')[0],
    new_candidates: candidates.length,
    by_suggested_skill: bySkill,
    candidates,
  };
  saveReport('discovery-report.json', report, DATA_DIR);

  console.log(`\n=== Discovery Report ===`);
  console.log(`New relevant URLs found: ${candidates.length}`);
  console.log('By skill:', JSON.stringify(bySkill, null, 2));

  if (candidates.length > 0) {
    console.log('\nNewest 10:');
    for (const c of candidates.slice(0, 10)) {
      console.log(` [${c.suggested_skill}/${c.suggested_category}] ${c.url}`);
      console.log(` lastmod: ${c.lastmod}`);
    }
  }
}

main().catch((err) => {
  console.error('Fatal error:', err.message);
  process.exit(1);
});