ktg-plugin-marketplace/plugins/ms-ai-architect/scripts/kb-update/discover-new-urls.mjs
Kjell Tore Guttormsen f968f37be3 feat(ms-ai-architect): sitemap-based KB change detection system
Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps
weekly to detect when source documentation changes. Replaces the broken
mtime-based staleness check (all files had identical mtime after release).

Components:
- build-registry.mjs: extracts 1342 URLs from 387 reference files
- poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry
- report-changes.mjs: prioritized change report (critical/high/medium/low)
- discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered
- run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run

Integration:
- session-start hook reads change-report.json instead of broken mtime check
- hook triggers background poll if >7 days since last check
- generate-skills --update reads change report for targeted MCP updates

Current stats: 69% match rate (924/1342 URLs tracked via sitemaps).
~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-09 21:19:51 +02:00

168 lines
6.3 KiB
JavaScript

#!/usr/bin/env node
// discover-new-urls.mjs — Find relevant Microsoft Learn pages not yet in the registry.
// Scans sitemaps for URLs matching relevance patterns, suggests skill/category mapping.
// Usage: node discover-new-urls.mjs [--limit N]
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { normalizeUrl } from './lib/url-normalize.mjs';
import { loadRegistry, saveReport } from './lib/registry-io.mjs';
import { streamSitemap, fetchSitemapIndex } from './lib/sitemap-stream.mjs';
// ES modules have no built-in __dirname, so derive it from this module's URL.
// DATA_DIR is the `data` folder that sits next to this script.
const modulePath = fileURLToPath(import.meta.url);
const __dirname = dirname(modulePath);
const DATA_DIR = join(__dirname, 'data');
/**
 * Parse the optional `--limit N` flag from an argv-style array.
 *
 * A missing flag means "no limit". A missing, non-numeric, or non-positive
 * value is rejected with a warning and also treated as "no limit": a NaN limit
 * would silently disable the cap (every `>=` comparison against NaN is false),
 * and a limit of 0 or below would still let one candidate through because the
 * cap is only checked after a candidate has been recorded.
 *
 * @param {string[]} argv - Command-line arguments, e.g. `process.argv`.
 * @returns {number} A positive integer cap, or `Infinity` when uncapped.
 */
function parseLimit(argv) {
  const flagIndex = argv.indexOf('--limit');
  if (flagIndex === -1) return Infinity;

  const rawValue = argv[flagIndex + 1];
  const parsed = Number.parseInt(rawValue, 10);
  if (Number.isNaN(parsed) || parsed < 1) {
    console.error(
      `Ignoring invalid --limit value "${rawValue}": expected a positive integer. Continuing without a limit.`,
    );
    return Infinity;
  }
  return parsed;
}

// Position of the `--limit` flag in process.argv (-1 when absent).
const limitArg = process.argv.indexOf('--limit');
// Maximum number of candidates to collect; Infinity when no valid limit was given.
const limit = parseLimit(process.argv);
// Relevance rules: a URL whose path matches `pattern` is content this plugin
// should cover, and is suggested for the given skill/category.
// Written as compact [pattern, skill, category] rows and expanded into rule
// objects below. Row order is preserved because classifyUrl returns the first
// rule that matches.
const INCLUDE = [
  [/\/azure\/ai-foundry\//, 'ms-ai-engineering', 'azure-ai-services'],
  [/\/azure\/ai-services\//, 'ms-ai-engineering', 'azure-ai-services'],
  [/\/azure\/machine-learning\//, 'ms-ai-engineering', 'mlops-genaiops'],
  [/\/azure\/search\//, 'ms-ai-engineering', 'rag-architecture'],
  [/\/azure\/api-management\//, 'ms-ai-engineering', 'api-management'],
  [/\/azure\/azure-monitor\//, 'ms-ai-governance', 'monitoring-observability'],
  [/\/azure\/well-architected\//, 'ms-ai-advisor', 'architecture'],
  [/\/microsoft-copilot-studio\//, 'ms-ai-advisor', 'copilot-extensibility'],
  [/\/copilot\/microsoft-365\//, 'ms-ai-advisor', 'copilot-extensibility'],
  [/\/microsoft-365-copilot\//, 'ms-ai-advisor', 'copilot-extensibility'],
  [/\/security\/(?:ai|benchmark|engineering)\//, 'ms-ai-security', 'ai-security-engineering'],
  [/\/azure\/defender-for-cloud\//, 'ms-ai-security', 'ai-security-engineering'],
  [/\/purview\//, 'ms-ai-governance', 'responsible-ai'],
  [/\/semantic-kernel\//, 'ms-ai-engineering', 'agent-orchestration'],
  [/\/agent-framework\//, 'ms-ai-engineering', 'agent-orchestration'],
  [/\/fabric\/(?:data-engineering|data-science|real-time-intelligence)\//, 'ms-ai-engineering', 'data-engineering'],
  [/\/azure\/cosmos-db\//, 'ms-ai-engineering', 'data-engineering'],
  [/\/azure\/databricks\//, 'ms-ai-engineering', 'data-engineering'],
  [/\/entra\/(?:identity|agent)\//, 'ms-ai-security', 'ai-security-engineering'],
].map(([pattern, skill, category]) => ({ pattern, skill, category }));
// Exclusion patterns: a URL matching any of these is dropped by classifyUrl
// before the INCLUDE rules are consulted, so exclusion always wins.
const EXCLUDE = [
  // Non-reference sections
  /\/training\//, /\/credentials\//, /\/legal\//,
  // Superseded or archived content
  /\/previous-versions\//, /\/archive\//,
  // Samples
  /\/samples\//,
  // Change logs and release announcements
  /\/release-notes?\//, /\/whats-new/, /\/changelog/,
  // Migration paths
  /\/migrate\//,
];
/**
 * Suggest a skill/category for a URL, or reject it as irrelevant.
 *
 * Exclusions take precedence: a URL matching any EXCLUDE pattern is rejected
 * even if an INCLUDE rule would match. Otherwise the first matching INCLUDE
 * rule (in table order) decides the result.
 *
 * @param {string} url - URL to classify.
 * @returns {{skill: string, category: string} | null} A new object holding the
 *   matched rule's skill and category, or null when the URL is excluded or
 *   matches no rule.
 */
function classifyUrl(url) {
  const isExcluded = EXCLUDE.some((blocked) => blocked.test(url));
  if (isExcluded) return null;

  const matchedRule = INCLUDE.find((rule) => rule.pattern.test(url));
  if (matchedRule === undefined) return null;

  const { skill, category } = matchedRule;
  return { skill, category };
}
// Child sitemaps worth scanning during discovery, identified by the prefix of
// the sitemap file name. Per the original note this mirrors the poller's list
// with dotnet left out (the poller is not visible here; verify when editing).
const TARGET_PREFIXES = [
  // Azure, security and data platform
  'azure_en-us_',
  'microsoft-copilot-studio_en-us_',
  'security_en-us_',
  'fabric_en-us_',
  'power-platform_en-us_',
  // AI and Copilot
  'ai_en-us_',
  'copilot_en-us_',
  'compliance_en-us_',
  // Agent frameworks
  'agent-framework_en-us_',
  'semantic-kernel_en-us_',
  // Identity and governance
  'entra_en-us_',
  'purview_en-us_',
];
/**
 * Pull the child sitemap's file name (without the `.xml` extension) out of a
 * sitemap URL of the form `.../_sitemaps/<name>.xml`.
 *
 * @param {string} loc - Sitemap URL taken from the sitemap index.
 * @returns {string | null} The `<name>` segment, or null when `loc` does not
 *   end in `/_sitemaps/<name>.xml`.
 */
function extractChildName(loc) {
  const childSitemapPattern = /\/_sitemaps\/([^/]+)\.xml$/;
  const hit = loc.match(childSitemapPattern);
  if (hit === null) {
    return null;
  }
  const [, childName] = hit;
  return childName;
}
// --- Main ---
/**
 * Discover relevant URLs that are not yet in the registry.
 *
 * Steps:
 *  1. Load the registry and treat its URL keys as already known.
 *  2. Fetch the sitemap index and keep child sitemaps whose name starts with
 *     one of TARGET_PREFIXES.
 *  3. Stream each child sitemap; every normalized URL that is unknown and
 *     accepted by classifyUrl becomes a candidate. Scanning stops once
 *     `limit` candidates have been collected.
 *  4. Sort candidates newest-first by `lastmod`, save them as
 *     `discovery-report.json`, and print a summary.
 *
 * A failure while streaming one sitemap is logged and the scan continues with
 * the next one; candidates gathered before the failure are kept.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const registry = loadRegistry(DATA_DIR);
  const seenUrls = new Set(Object.keys(registry.urls));
  console.log(`Registry: ${seenUrls.size} known URLs`);

  console.log('Fetching sitemap index...');
  const sitemapIndex = await fetchSitemapIndex();
  const isTargeted = (indexEntry) => {
    const name = extractChildName(indexEntry.loc);
    if (!name) return false;
    return TARGET_PREFIXES.some((prefix) => name.startsWith(prefix));
  };
  const sitemaps = sitemapIndex.filter(isTargeted);
  console.log(`Scanning ${sitemaps.length} sitemaps for new relevant URLs...`);

  const candidates = [];
  const bySkill = {};

  for (const sitemap of sitemaps) {
    const sitemapName = extractChildName(sitemap.loc);
    const countBefore = candidates.length;

    try {
      for await (const entry of streamSitemap(sitemap.loc)) {
        const url = normalizeUrl(entry.loc);
        if (!url || seenUrls.has(url)) continue;

        const suggestion = classifyUrl(url);
        if (!suggestion) continue;

        const { skill, category } = suggestion;
        candidates.push({
          url,
          lastmod: entry.lastmod,
          sitemap: sitemapName,
          suggested_skill: skill,
          suggested_category: category,
        });
        // Remember the URL so it is not reported again from another sitemap.
        seenUrls.add(url);
        bySkill[skill] = (bySkill[skill] || 0) + 1;

        if (candidates.length >= limit) break;
      }
    } catch (err) {
      console.error(` ERROR scanning ${sitemapName}: ${err.message}`);
    }

    const addedHere = candidates.length - countBefore;
    if (addedHere > 0) {
      console.log(` ${sitemapName}: ${addedHere} new candidates`);
    }
    if (candidates.length >= limit) break;
  }

  // Newest first; entries without a lastmod compare as the empty string.
  const byNewest = (left, right) => {
    const rightStamp = right.lastmod || '';
    const leftStamp = left.lastmod || '';
    return rightStamp.localeCompare(leftStamp);
  };
  candidates.sort(byNewest);

  const today = new Date().toISOString().split('T')[0];
  const report = {
    generated_at: today,
    new_candidates: candidates.length,
    by_suggested_skill: bySkill,
    candidates,
  };
  saveReport('discovery-report.json', report, DATA_DIR);

  console.log(`\n=== Discovery Report ===`);
  console.log(`New relevant URLs found: ${candidates.length}`);
  console.log('By skill:', JSON.stringify(bySkill, null, 2));

  if (candidates.length > 0) {
    console.log('\nNewest 10:');
    candidates.slice(0, 10).forEach((candidate) => {
      console.log(` [${candidate.suggested_skill}/${candidate.suggested_category}] ${candidate.url}`);
      console.log(` lastmod: ${candidate.lastmod}`);
    });
  }
}
// Script entry point: run main() and, if it rejects, print the error message
// and terminate with exit code 1.
function exitOnFatal(err) {
  console.error('Fatal error:', err.message);
  process.exit(1);
}

main().catch(exitOnFatal);