Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps weekly to detect when source documentation changes. Replaces the broken mtime-based staleness check (all files had identical mtime after release).

Components:
- build-registry.mjs: extracts 1342 URLs from 387 reference files
- poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry
- report-changes.mjs: prioritized change report (critical/high/medium/low)
- discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered
- run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run

Integration:
- session-start hook reads change-report.json instead of broken mtime check
- hook triggers background poll if >7 days since last check
- generate-skills --update reads change report for targeted MCP updates

Current stats: 69% match rate (924/1342 URLs tracked via sitemaps). ~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
168 lines
6.3 KiB
JavaScript
168 lines
6.3 KiB
JavaScript
#!/usr/bin/env node
|
|
// discover-new-urls.mjs — Find relevant Microsoft Learn pages not yet in the registry.
|
|
// Scans sitemaps for URLs matching relevance patterns, suggests skill/category mapping.
|
|
// Usage: node discover-new-urls.mjs [--limit N]
|
|
|
|
import { dirname, join } from 'node:path';
|
|
import { fileURLToPath } from 'node:url';
|
|
import { normalizeUrl } from './lib/url-normalize.mjs';
|
|
import { loadRegistry, saveReport } from './lib/registry-io.mjs';
|
|
import { streamSitemap, fetchSitemapIndex } from './lib/sitemap-stream.mjs';
|
|
|
|
// Directory containing this script; the data directory lives next to it.
const __dirname = dirname(fileURLToPath(import.meta.url));
const DATA_DIR = join(__dirname, 'data');

/**
 * Validate the raw value that follows the `--limit` flag.
 *
 * The value is parsed with `Number.parseInt(raw, 10)`. A missing or
 * non-numeric value would parse to `NaN`, and `candidates.length >= NaN` is
 * never true, so the limit would silently be ignored; values below 1 would
 * still let one candidate through. Both cases are rejected here instead.
 *
 * @param {string|undefined} raw - The argument that follows `--limit`, if any.
 * @returns {number} The parsed positive integer.
 * @throws {RangeError} When `raw` is missing or does not parse to an integer >= 1.
 */
function parseLimitValue(raw) {
  const value = Number.parseInt(raw, 10);
  if (Number.isNaN(value) || value < 1) {
    const shown = raw === undefined ? '(missing)' : JSON.stringify(raw);
    throw new RangeError(`--limit expects a positive integer, got ${shown}`);
  }
  return value;
}

// `--limit N` caps the number of candidates collected; without the flag there is no cap.
const limitArg = process.argv.indexOf('--limit');
const limit = limitArg !== -1 ? parseLimitValue(process.argv[limitArg + 1]) : Infinity;
|
|
|
|
// Relevance patterns — URL paths that indicate content this plugin should cover.
// Each rule pairs a path pattern with the skill/category suggested for matching
// URLs. Rules are tried in array order and the first match wins, so keep more
// specific patterns ahead of broader ones.
const INCLUDE = [
  { pattern: /\/azure\/ai-foundry\//, skill: 'ms-ai-engineering', category: 'azure-ai-services' },
  { pattern: /\/azure\/ai-services\//, skill: 'ms-ai-engineering', category: 'azure-ai-services' },
  { pattern: /\/azure\/machine-learning\//, skill: 'ms-ai-engineering', category: 'mlops-genaiops' },
  { pattern: /\/azure\/search\//, skill: 'ms-ai-engineering', category: 'rag-architecture' },
  { pattern: /\/azure\/api-management\//, skill: 'ms-ai-engineering', category: 'api-management' },
  { pattern: /\/azure\/azure-monitor\//, skill: 'ms-ai-governance', category: 'monitoring-observability' },
  { pattern: /\/azure\/well-architected\//, skill: 'ms-ai-advisor', category: 'architecture' },
  { pattern: /\/microsoft-copilot-studio\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' },
  { pattern: /\/copilot\/microsoft-365\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' },
  { pattern: /\/microsoft-365-copilot\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' },
  { pattern: /\/security\/(?:ai|benchmark|engineering)\//, skill: 'ms-ai-security', category: 'ai-security-engineering' },
  { pattern: /\/azure\/defender-for-cloud\//, skill: 'ms-ai-security', category: 'ai-security-engineering' },
  { pattern: /\/purview\//, skill: 'ms-ai-governance', category: 'responsible-ai' },
  { pattern: /\/semantic-kernel\//, skill: 'ms-ai-engineering', category: 'agent-orchestration' },
  { pattern: /\/agent-framework\//, skill: 'ms-ai-engineering', category: 'agent-orchestration' },
  { pattern: /\/fabric\/(?:data-engineering|data-science|real-time-intelligence)\//, skill: 'ms-ai-engineering', category: 'data-engineering' },
  { pattern: /\/azure\/cosmos-db\//, skill: 'ms-ai-engineering', category: 'data-engineering' },
  { pattern: /\/azure\/databricks\//, skill: 'ms-ai-engineering', category: 'data-engineering' },
  { pattern: /\/entra\/(?:identity|agent)\//, skill: 'ms-ai-security', category: 'ai-security-engineering' },
];
|
|
|
|
// Exclude patterns — skip even if they match INCLUDE.
// Checked before the INCLUDE rules, so any match here rejects the URL outright.
const EXCLUDE = [
  /\/training\//,
  /\/credentials\//,
  /\/legal\//,
  /\/previous-versions\//,
  /\/archive\//,
  /\/samples\//,
  /\/release-notes?\//,
  /\/whats-new/,
  /\/changelog/,
  /\/migrate\//,
];
|
|
|
|
/**
 * Decide whether a URL is relevant and, if so, which skill/category it maps to.
 *
 * EXCLUDE is consulted first: a URL matching any exclude pattern is rejected
 * even when an INCLUDE rule would match. Otherwise the first INCLUDE rule
 * whose pattern matches supplies the result.
 *
 * @param {string} url - URL to classify.
 * @returns {{skill: string, category: string}|null} Suggested mapping, or
 *   `null` when the URL is excluded or matches no INCLUDE rule.
 */
function classifyUrl(url) {
  if (EXCLUDE.some((excluded) => excluded.test(url))) return null;

  const rule = INCLUDE.find(({ pattern }) => pattern.test(url));
  return rule ? { skill: rule.skill, category: rule.category } : null;
}
|
|
|
|
// Target sitemaps for discovery — same as poller, minus dotnet.
// A child sitemap is scanned only when its name starts with one of these prefixes.
const TARGET_PREFIXES = [
  'azure_en-us_',
  'microsoft-copilot-studio_en-us_',
  'security_en-us_',
  'fabric_en-us_',
  'power-platform_en-us_',
  'ai_en-us_',
  'copilot_en-us_',
  'compliance_en-us_',
  'agent-framework_en-us_',
  'semantic-kernel_en-us_',
  'entra_en-us_',
  'purview_en-us_',
];
|
|
|
|
/**
 * Extract the child sitemap name from a sitemap URL.
 *
 * @param {string} loc - Sitemap URL, expected to end in `/_sitemaps/<name>.xml`.
 * @returns {string|null} `<name>` (without the `.xml` extension), or `null`
 *   when `loc` is not a string or does not end in that form.
 */
function extractChildName(loc) {
  // An index entry without a usable `loc` must not abort the whole scan.
  if (typeof loc !== 'string') return null;

  const match = /\/_sitemaps\/([^/]+)\.xml$/.exec(loc);
  return match ? match[1] : null;
}
|
|
|
|
// --- Main ---

/**
 * Scan the targeted child sitemaps for relevant URLs that are not yet in the
 * registry, then save and print a discovery report.
 *
 * Flow:
 *  1. Load the registry and collect its URL keys as the "known" set.
 *  2. Fetch the sitemap index and keep children whose name starts with one of
 *     TARGET_PREFIXES.
 *  3. Stream each child sitemap, normalize every entry URL, skip known URLs,
 *     and keep those that classifyUrl maps to a skill/category.
 *  4. Stop as soon as `limit` candidates have been collected.
 *  5. Sort candidates newest-first by `lastmod`, pass the report to saveReport
 *     as `discovery-report.json`, and log a summary.
 *
 * An error while streaming one child sitemap is logged and the scan continues
 * with the next child; candidates found before the error are kept.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const registry = loadRegistry(DATA_DIR);
  // Doubles as the de-duplication set: accepted candidates are added to it below.
  const knownUrls = new Set(Object.keys(registry.urls));
  console.log(`Registry: ${knownUrls.size} known URLs`);

  console.log('Fetching sitemap index...');
  const indexEntries = await fetchSitemapIndex();

  const targetChildren = indexEntries.filter((e) => {
    const name = extractChildName(e.loc);
    return name && TARGET_PREFIXES.some((p) => name.startsWith(p));
  });

  console.log(`Scanning ${targetChildren.length} sitemaps for new relevant URLs...`);

  const candidates = [];
  const bySkill = {}; // suggested skill -> number of candidates

  for (const child of targetChildren) {
    const childName = extractChildName(child.loc);
    let foundInChild = 0;

    try {
      for await (const entry of streamSitemap(child.loc)) {
        const normalized = normalizeUrl(entry.loc);
        if (!normalized || knownUrls.has(normalized)) continue;

        const classification = classifyUrl(normalized);
        if (!classification) continue;

        candidates.push({
          url: normalized,
          lastmod: entry.lastmod,
          sitemap: childName,
          suggested_skill: classification.skill,
          suggested_category: classification.category,
        });
        knownUrls.add(normalized); // Prevent dupes across sitemaps

        bySkill[classification.skill] = (bySkill[classification.skill] || 0) + 1;
        foundInChild++;

        if (candidates.length >= limit) break;
      }
    } catch (err) {
      // Best effort: one failing sitemap should not discard the others' results.
      console.error(` ERROR scanning ${childName}: ${err.message}`);
    }

    if (foundInChild > 0) {
      console.log(` ${childName}: ${foundInChild} new candidates`);
    }
    // The inner break only leaves the streaming loop; stop scanning further children too.
    if (candidates.length >= limit) break;
  }

  // Sort by lastmod descending (newest first); entries without lastmod compare as ''.
  candidates.sort((a, b) => (b.lastmod || '').localeCompare(a.lastmod || ''));

  const report = {
    generated_at: new Date().toISOString().split('T')[0], // date part only (YYYY-MM-DD)
    new_candidates: candidates.length,
    by_suggested_skill: bySkill,
    candidates,
  };

  saveReport('discovery-report.json', report, DATA_DIR);

  console.log(`\n=== Discovery Report ===`);
  console.log(`New relevant URLs found: ${candidates.length}`);
  console.log('By skill:', JSON.stringify(bySkill, null, 2));
  if (candidates.length > 0) {
    console.log('\nNewest 10:');
    for (const c of candidates.slice(0, 10)) {
      console.log(` [${c.suggested_skill}/${c.suggested_category}] ${c.url}`);
      console.log(` lastmod: ${c.lastmod}`);
    }
  }
}
|
|
|
|
// Entry point: run the scan; on any unhandled failure print it and exit non-zero.
main().catch((err) => {
  // A rejection value is not guaranteed to be an Error; fall back to the raw
  // value so the log never reads "Fatal error: undefined".
  console.error('Fatal error:', err?.message ?? err);
  process.exit(1);
});
|