From f968f37be3c6a2830dd6fad96a0852ac9d4fcd8e Mon Sep 17 00:00:00 2001 From: Kjell Tore Guttormsen Date: Thu, 9 Apr 2026 21:19:51 +0200 Subject: [PATCH] feat(ms-ai-architect): sitemap-based KB change detection system Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps weekly to detect when source documentation changes. Replaces the broken mtime-based staleness check (all files had identical mtime after release). Components: - build-registry.mjs: extracts 1342 URLs from 387 reference files - poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry - report-changes.mjs: prioritized change report (critical/high/medium/low) - discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered - run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run Integration: - session-start hook reads change-report.json instead of broken mtime check - hook triggers background poll if >7 days since last check - generate-skills --update reads change report for targeted MCP updates Current stats: 69% match rate (924/1342 URLs tracked via sitemaps). ~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths). 
Co-Authored-By: Claude Opus 4.6 --- plugins/ms-ai-architect/.gitignore | 1 + plugins/ms-ai-architect/CLAUDE.md | 25 ++- plugins/ms-ai-architect/README.md | 32 ++-- .../commands/generate-skills.md | 20 ++- .../hooks/scripts/session-start-context.mjs | 66 ++++--- .../scripts/kb-update/build-registry.mjs | 87 +++++++++ .../scripts/kb-update/discover-new-urls.mjs | 168 ++++++++++++++++++ .../scripts/kb-update/lib/registry-io.mjs | 75 ++++++++ .../scripts/kb-update/lib/sitemap-stream.mjs | 124 +++++++++++++ .../scripts/kb-update/lib/url-normalize.mjs | 69 +++++++ .../scripts/kb-update/poll-sitemaps.mjs | 154 ++++++++++++++++ .../scripts/kb-update/report-changes.mjs | 144 +++++++++++++++ .../scripts/kb-update/run-weekly-update.mjs | 70 ++++++++ 13 files changed, 976 insertions(+), 59 deletions(-) create mode 100644 plugins/ms-ai-architect/scripts/kb-update/build-registry.mjs create mode 100644 plugins/ms-ai-architect/scripts/kb-update/discover-new-urls.mjs create mode 100644 plugins/ms-ai-architect/scripts/kb-update/lib/registry-io.mjs create mode 100644 plugins/ms-ai-architect/scripts/kb-update/lib/sitemap-stream.mjs create mode 100644 plugins/ms-ai-architect/scripts/kb-update/lib/url-normalize.mjs create mode 100644 plugins/ms-ai-architect/scripts/kb-update/poll-sitemaps.mjs create mode 100644 plugins/ms-ai-architect/scripts/kb-update/report-changes.mjs create mode 100644 plugins/ms-ai-architect/scripts/kb-update/run-weekly-update.mjs diff --git a/plugins/ms-ai-architect/.gitignore b/plugins/ms-ai-architect/.gitignore index f15d72a..b83342f 100644 --- a/plugins/ms-ai-architect/.gitignore +++ b/plugins/ms-ai-architect/.gitignore @@ -20,3 +20,4 @@ node_modules/ # Runtime state .work/ org/ +scripts/kb-update/data/ diff --git a/plugins/ms-ai-architect/CLAUDE.md b/plugins/ms-ai-architect/CLAUDE.md index 5cbc65c..3ead81e 100644 --- a/plugins/ms-ai-architect/CLAUDE.md +++ b/plugins/ms-ai-architect/CLAUDE.md @@ -121,13 +121,28 @@ Se 
`references/architecture/recommended-mcp-servers.md` for detaljer. bash tests/validate-plugin.sh ``` -#### KB-ferskhet +#### KB-ferskhet (sitemap-basert) ```bash -# Sjekk stale kunnskapsfiler -bash scripts/kb-staleness-check.sh +# Ukentlig oppdatering: poll sitemaps → endringsrapport +node scripts/kb-update/run-weekly-update.mjs --force -# Vis kun prioriterte stale filer -bash scripts/kb-staleness-check.sh --priority-only +# Med discovery av nye relevante sider +node scripts/kb-update/run-weekly-update.mjs --force --discover + +# Kun endringsrapport (etter polling) +node scripts/kb-update/report-changes.mjs + +# Bygg/oppdater URL-registry fra referansefiler +node scripts/kb-update/build-registry.mjs [--merge] +``` + +Systemet poller Microsoft Learn sitemaps ukentlig, sammenligner `` med filenes `Last updated:` header, og genererer en prioritert endringsrapport. Session-start hook trigger bakgrunns-poll automatisk hvis >7 dager siden siste. + +**Match rate:** ~69% av 1342 URLer matche mot sitemaps. ~31% (mest `azure/ai-foundry/openai/`-stier) finnes ikke i sitemaps pga. Microsofts URL-restrukturering. + +Legacy (deprecated): +```bash +bash scripts/kb-staleness-check.sh # mtime-basert, upålitelig etter git clone ``` #### E2E-regresjonstester diff --git a/plugins/ms-ai-architect/README.md b/plugins/ms-ai-architect/README.md index e51c2d4..1c213d6 100644 --- a/plugins/ms-ai-architect/README.md +++ b/plugins/ms-ai-architect/README.md @@ -487,29 +487,37 @@ bash tests/capture-fixture.sh ### Knowledge Base Maintenance -The plugin includes a systematic process for keeping reference documents current. See `docs/kb-update-policy.md` for the full policy (update frequencies per domain, procedures, quality gates). +The plugin includes a sitemap-based change detection system that tracks when Microsoft Learn source pages are updated. This replaces the previous mtime-based staleness check. 
-**Staleness checking:** +**Automated change detection (sitemap-based):** ```bash -# Human-readable report -bash scripts/kb-staleness-check.sh +# Weekly update: poll sitemaps → compare → generate change report +node scripts/kb-update/run-weekly-update.mjs --force -# Machine-readable JSON output -bash scripts/kb-staleness-check.sh --json +# Include discovery of new relevant pages +node scripts/kb-update/run-weekly-update.mjs --force --discover -# Write report to file -bash scripts/kb-staleness-check.sh --json --output report.json +# View change report only (after polling) +node scripts/kb-update/report-changes.mjs ``` -**Knowledge base regeneration:** +The session-start hook automatically triggers a background poll if >7 days since the last check. + +**How it works:** +1. `build-registry.mjs` extracts 1342 unique `learn.microsoft.com` URLs from reference files +2. `poll-sitemaps.mjs` fetches Microsoft Learn sitemaps and compares `` dates +3. `report-changes.mjs` generates a prioritized list of files needing update +4. `discover-new-urls.mjs` finds relevant new pages not yet covered + +**Knowledge base update:** ```bash +# Incremental update based on change report (targets changed sources via MCP) +/architect:generate-skills --update + # Full regeneration via MCP research /architect:generate-skills - -# Incremental update (Edit existing files instead of rewriting) -/architect:generate-skills --update ``` Category-to-skill routing is defined in `scripts/skill-gen/category-skill-map.json` (20 categories mapped to 5 skills), used by the generate-skills workflow to place new reference documents in the correct skill directory. 
diff --git a/plugins/ms-ai-architect/commands/generate-skills.md b/plugins/ms-ai-architect/commands/generate-skills.md index a2b59f6..efd3634 100644 --- a/plugins/ms-ai-architect/commands/generate-skills.md +++ b/plugins/ms-ai-architect/commands/generate-skills.md @@ -234,7 +234,9 @@ When invoked with `--update`, the command updates existing stale files instead o **Workflow:** -1. Run `bash scripts/kb-staleness-check.sh --json` to identify stale files +1. Read `scripts/kb-update/data/change-report.json` for source-aware change detection + - If not available, fall back to `bash scripts/kb-staleness-check.sh --json` + - The change report contains `changed_urls` per file — use these for targeted MCP fetches 2. Sort by priority (Critical > High > Medium > Low) 3. For each stale file, dispatch an update agent with this prompt: @@ -247,10 +249,14 @@ Oppdater filen: {FILE_PATH} ## Eksisterende innhold (les først) Les filen med Read-verktøyet. Bevar strukturen. +## Endrede kilde-URLer (hent disse først) +{changed_urls from change-report.json — if available} + ## Steg 1: Research Bruk MCP-verktøy for å verifisere og oppdatere: -1. microsoft_docs_search — 2-3 søk for siste oppdateringer -2. microsoft_docs_fetch — les oppdatert dokumentasjon +1. microsoft_docs_fetch — hent de endrede kilde-URLene direkte (hvis tilgjengelig) +2. microsoft_docs_search — 2-3 søk for siste oppdateringer +3. microsoft_docs_fetch — les ytterligere oppdatert dokumentasjon ved behov ## Steg 2: Oppdater med Edit Bruk Edit-verktøyet (IKKE Write) for å: @@ -277,7 +283,9 @@ status: success|no_changes|failed Before generating new knowledge base content, check for stale files: -1. Run `bash scripts/kb-staleness-check.sh` to identify stale files +1. 
Read `scripts/kb-update/data/change-report.json` for source-aware staleness data + - This is generated by `node scripts/kb-update/run-weekly-update.mjs` (polls Microsoft Learn sitemaps) + - Fallback: `bash scripts/kb-staleness-check.sh` (mtime-based, less accurate) 2. Prioritize regeneration of stale files by priority (Critical > Low) -3. When regenerating a file, update its `Sist oppdatert:` header to today's date -4. After regeneration, verify the file with the staleness checker +3. When regenerating a file, update its `Last updated:` header to today's date +4. After update, run `node scripts/kb-update/build-registry.mjs --merge` to refresh URL registry diff --git a/plugins/ms-ai-architect/hooks/scripts/session-start-context.mjs b/plugins/ms-ai-architect/hooks/scripts/session-start-context.mjs index 392fa10..cb4a46f 100644 --- a/plugins/ms-ai-architect/hooks/scripts/session-start-context.mjs +++ b/plugins/ms-ai-architect/hooks/scripts/session-start-context.mjs @@ -3,8 +3,9 @@ // Shows active utredning sessions and KB staleness on session start. // Output: plain text to stdout (advisory, never blocking). -import { readdirSync, statSync, existsSync } from 'node:fs'; +import { readdirSync, readFileSync, existsSync } from 'node:fs'; import { join, relative } from 'node:path'; +import { spawn } from 'node:child_process'; const pluginRoot = process.env.CLAUDE_PLUGIN_ROOT || join(process.cwd()); const cwd = process.cwd(); @@ -40,23 +41,36 @@ if (existsSync(docsDir)) { } } -// --- 2. Check KB staleness (stat mtime, no content reading) --- -const staleLevels = { critical: 0, high: 0, medium: 0 }; +// --- 2. 
Check KB staleness (from sitemap-based change report) --- const now = Date.now(); const DAY_MS = 24 * 60 * 60 * 1000; +const staleLevels = { critical: 0, high: 0, medium: 0 }; +let lastPollDaysAgo = Infinity; -const skillsDir = join(pluginRoot, 'skills'); -if (existsSync(skillsDir)) { +const changeReportPath = join(pluginRoot, 'scripts', 'kb-update', 'data', 'change-report.json'); +if (existsSync(changeReportPath)) { try { - const skillDirs = readdirSync(skillsDir, { withFileTypes: true }); - for (const skill of skillDirs) { - if (!skill.isDirectory()) continue; - const refsDir = join(skillsDir, skill.name, 'references'); - if (!existsSync(refsDir)) continue; - countStaleFiles(refsDir, staleLevels, now); + const report = JSON.parse(readFileSync(changeReportPath, 'utf8')); + staleLevels.critical = report.by_priority?.critical || 0; + staleLevels.high = report.by_priority?.high || 0; + staleLevels.medium = report.by_priority?.medium || 0; + if (report.last_poll) { + lastPollDaysAgo = (now - new Date(report.last_poll).getTime()) / DAY_MS; } } catch { - // Ignore + // Ignore — fall back to showing no data + } +} + +// Trigger background poll if >7 days since last check +if (lastPollDaysAgo > 7) { + const updateScript = join(pluginRoot, 'scripts', 'kb-update', 'run-weekly-update.mjs'); + if (existsSync(updateScript)) { + try { + spawn('node', [updateScript], { detached: true, stdio: 'ignore' }).unref(); + } catch { + // Non-critical — silent fail + } } } @@ -117,7 +131,10 @@ if (staleLevels.high > 0) staleEntries.push(`${staleLevels.high} high`); if (staleLevels.medium > 0) staleEntries.push(`${staleLevels.medium} medium`); if (staleEntries.length > 0) { - parts.push(`KB stale: ${staleEntries.join(', ')}`); + const pollAge = lastPollDaysAgo < Infinity ? 
` (pollet ${Math.floor(lastPollDaysAgo)}d siden)` : ''; + parts.push(`KB: ${staleEntries.join(', ')} needs update${pollAge}`); +} else if (lastPollDaysAgo > 7) { + parts.push('KB: poll overdue'); } if (nearestDeadline) { @@ -154,26 +171,3 @@ function countFiles(dir, filename) { return count; } -function countStaleFiles(dir, levels, now) { - try { - const entries = readdirSync(dir, { withFileTypes: true }); - for (const entry of entries) { - const fullPath = join(dir, entry.name); - if (entry.isDirectory()) { - countStaleFiles(fullPath, levels, now); - } else if (entry.name.endsWith('.md')) { - try { - const mtime = statSync(fullPath).mtimeMs; - const ageDays = (now - mtime) / DAY_MS; - if (ageDays > 180) levels.critical++; - else if (ageDays > 90) levels.high++; - else if (ageDays > 60) levels.medium++; - } catch { - // Skip unreadable files - } - } - } - } catch { - // Ignore - } -} diff --git a/plugins/ms-ai-architect/scripts/kb-update/build-registry.mjs b/plugins/ms-ai-architect/scripts/kb-update/build-registry.mjs new file mode 100644 index 0000000..02dd8eb --- /dev/null +++ b/plugins/ms-ai-architect/scripts/kb-update/build-registry.mjs @@ -0,0 +1,87 @@ +#!/usr/bin/env node +// build-registry.mjs — Build URL registry from existing reference files. +// Extracts all learn.microsoft.com URLs and maps them to their source reference files. 
+// Usage: node build-registry.mjs [--merge] +// --merge: preserve existing sitemap_lastmod data, only add new URLs + +import { readdirSync, readFileSync, existsSync } from 'node:fs'; +import { join, relative, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { normalizeUrl, extractUrls } from './lib/url-normalize.mjs'; +import { loadRegistry, saveRegistry } from './lib/registry-io.mjs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const PLUGIN_ROOT = join(__dirname, '..', '..'); +const SKILLS_DIR = join(PLUGIN_ROOT, 'skills'); +const merge = process.argv.includes('--merge'); + +// Walk directory recursively for .md files +function walkMd(dir) { + const results = []; + if (!existsSync(dir)) return results; + for (const entry of readdirSync(dir, { withFileTypes: true })) { + const full = join(dir, entry.name); + if (entry.isDirectory()) { + results.push(...walkMd(full)); + } else if (entry.name.endsWith('.md') && entry.name !== 'SKILL.md') { + results.push(full); + } + } + return results; +} + +// --- Main --- +const existing = merge ? loadRegistry() : null; +const urlToFiles = new Map(); // normalizedUrl → Set +let totalFiles = 0; + +const skillDirs = readdirSync(SKILLS_DIR, { withFileTypes: true }) + .filter(d => d.isDirectory()) + .map(d => d.name); + +for (const skill of skillDirs) { + const refsDir = join(SKILLS_DIR, skill, 'references'); + const files = walkMd(refsDir); + + for (const file of files) { + totalFiles++; + const content = readFileSync(file, 'utf8'); + const urls = extractUrls(content); + const relPath = relative(PLUGIN_ROOT, file); + + for (const url of urls) { + if (!urlToFiles.has(url)) urlToFiles.set(url, new Set()); + urlToFiles.get(url).add(relPath); + } + } +} + +// Build registry +const today = new Date().toISOString().split('T')[0]; +const registry = { + version: 1, + created_at: today, + last_poll: merge ? existing?.last_poll || null : null, + sitemap_state: merge ? 
existing?.sitemap_state || {} : {}, + urls: {}, +}; + +for (const [url, files] of urlToFiles) { + const prev = merge ? existing?.urls?.[url] : null; + registry.urls[url] = { + sitemap_lastmod: prev?.sitemap_lastmod || null, + reference_files: [...files].sort(), + status: prev?.status || 'unpolled', + }; +} + +saveRegistry(registry); + +// Stats +const multiRef = [...urlToFiles.values()].filter(s => s.size > 1).length; +console.log(`Registry built: ${urlToFiles.size} unique URLs from ${totalFiles} files`); +console.log(` URLs referenced by multiple files: ${multiRef}`); +if (merge && existing?.urls) { + const newUrls = [...urlToFiles.keys()].filter(u => !existing.urls[u]).length; + console.log(` New URLs added (merge): ${newUrls}`); +} diff --git a/plugins/ms-ai-architect/scripts/kb-update/discover-new-urls.mjs b/plugins/ms-ai-architect/scripts/kb-update/discover-new-urls.mjs new file mode 100644 index 0000000..33a4c23 --- /dev/null +++ b/plugins/ms-ai-architect/scripts/kb-update/discover-new-urls.mjs @@ -0,0 +1,168 @@ +#!/usr/bin/env node +// discover-new-urls.mjs — Find relevant Microsoft Learn pages not yet in the registry. +// Scans sitemaps for URLs matching relevance patterns, suggests skill/category mapping. +// Usage: node discover-new-urls.mjs [--limit N] + +import { dirname, join } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { normalizeUrl } from './lib/url-normalize.mjs'; +import { loadRegistry, saveReport } from './lib/registry-io.mjs'; +import { streamSitemap, fetchSitemapIndex } from './lib/sitemap-stream.mjs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const DATA_DIR = join(__dirname, 'data'); +const limitArg = process.argv.indexOf('--limit'); +const limit = limitArg !== -1 ? 
parseInt(process.argv[limitArg + 1], 10) : Infinity; + +// Relevance patterns — URL paths that indicate content this plugin should cover +const INCLUDE = [ + { pattern: /\/azure\/ai-foundry\//, skill: 'ms-ai-engineering', category: 'azure-ai-services' }, + { pattern: /\/azure\/ai-services\//, skill: 'ms-ai-engineering', category: 'azure-ai-services' }, + { pattern: /\/azure\/machine-learning\//, skill: 'ms-ai-engineering', category: 'mlops-genaiops' }, + { pattern: /\/azure\/search\//, skill: 'ms-ai-engineering', category: 'rag-architecture' }, + { pattern: /\/azure\/api-management\//, skill: 'ms-ai-engineering', category: 'api-management' }, + { pattern: /\/azure\/azure-monitor\//, skill: 'ms-ai-governance', category: 'monitoring-observability' }, + { pattern: /\/azure\/well-architected\//, skill: 'ms-ai-advisor', category: 'architecture' }, + { pattern: /\/microsoft-copilot-studio\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' }, + { pattern: /\/copilot\/microsoft-365\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' }, + { pattern: /\/microsoft-365-copilot\//, skill: 'ms-ai-advisor', category: 'copilot-extensibility' }, + { pattern: /\/security\/(?:ai|benchmark|engineering)\//, skill: 'ms-ai-security', category: 'ai-security-engineering' }, + { pattern: /\/azure\/defender-for-cloud\//, skill: 'ms-ai-security', category: 'ai-security-engineering' }, + { pattern: /\/purview\//, skill: 'ms-ai-governance', category: 'responsible-ai' }, + { pattern: /\/semantic-kernel\//, skill: 'ms-ai-engineering', category: 'agent-orchestration' }, + { pattern: /\/agent-framework\//, skill: 'ms-ai-engineering', category: 'agent-orchestration' }, + { pattern: /\/fabric\/(?:data-engineering|data-science|real-time-intelligence)\//, skill: 'ms-ai-engineering', category: 'data-engineering' }, + { pattern: /\/azure\/cosmos-db\//, skill: 'ms-ai-engineering', category: 'data-engineering' }, + { pattern: /\/azure\/databricks\//, skill: 'ms-ai-engineering', 
category: 'data-engineering' }, + { pattern: /\/entra\/(?:identity|agent)\//, skill: 'ms-ai-security', category: 'ai-security-engineering' }, +]; + +// Exclude patterns — skip even if they match INCLUDE +const EXCLUDE = [ + /\/training\//, + /\/credentials\//, + /\/legal\//, + /\/previous-versions\//, + /\/archive\//, + /\/samples\//, + /\/release-notes?\//, + /\/whats-new/, + /\/changelog/, + /\/migrate\//, +]; + +function classifyUrl(url) { + if (EXCLUDE.some(p => p.test(url))) return null; + for (const rule of INCLUDE) { + if (rule.pattern.test(url)) { + return { skill: rule.skill, category: rule.category }; + } + } + return null; +} + +// Target sitemaps for discovery — same as poller, minus dotnet +const TARGET_PREFIXES = [ + 'azure_en-us_', + 'microsoft-copilot-studio_en-us_', + 'security_en-us_', + 'fabric_en-us_', + 'power-platform_en-us_', + 'ai_en-us_', + 'copilot_en-us_', + 'compliance_en-us_', + 'agent-framework_en-us_', + 'semantic-kernel_en-us_', + 'entra_en-us_', + 'purview_en-us_', +]; + +function extractChildName(loc) { + const match = loc.match(/\/_sitemaps\/([^/]+)\.xml$/); + return match ? 
match[1] : null; +} + +// --- Main --- +async function main() { + const registry = loadRegistry(DATA_DIR); + const knownUrls = new Set(Object.keys(registry.urls)); + console.log(`Registry: ${knownUrls.size} known URLs`); + + console.log('Fetching sitemap index...'); + const indexEntries = await fetchSitemapIndex(); + + const targetChildren = indexEntries + .filter(e => { + const name = extractChildName(e.loc); + return name && TARGET_PREFIXES.some(p => name.startsWith(p)); + }); + + console.log(`Scanning ${targetChildren.length} sitemaps for new relevant URLs...`); + + const candidates = []; + const bySkill = {}; + + for (const child of targetChildren) { + const childName = extractChildName(child.loc); + let foundInChild = 0; + + try { + for await (const entry of streamSitemap(child.loc)) { + const normalized = normalizeUrl(entry.loc); + if (!normalized || knownUrls.has(normalized)) continue; + + const classification = classifyUrl(normalized); + if (!classification) continue; + + candidates.push({ + url: normalized, + lastmod: entry.lastmod, + sitemap: childName, + suggested_skill: classification.skill, + suggested_category: classification.category, + }); + knownUrls.add(normalized); // Prevent dupes across sitemaps + + bySkill[classification.skill] = (bySkill[classification.skill] || 0) + 1; + foundInChild++; + + if (candidates.length >= limit) break; + } + } catch (err) { + console.error(` ERROR scanning ${childName}: ${err.message}`); + } + + if (foundInChild > 0) { + console.log(` ${childName}: ${foundInChild} new candidates`); + } + if (candidates.length >= limit) break; + } + + // Sort by lastmod descending (newest first) + candidates.sort((a, b) => (b.lastmod || '').localeCompare(a.lastmod || '')); + + const report = { + generated_at: new Date().toISOString().split('T')[0], + new_candidates: candidates.length, + by_suggested_skill: bySkill, + candidates, + }; + + saveReport('discovery-report.json', report, DATA_DIR); + + console.log(`\n=== Discovery Report 
===`); + console.log(`New relevant URLs found: ${candidates.length}`); + console.log('By skill:', JSON.stringify(bySkill, null, 2)); + if (candidates.length > 0) { + console.log('\nNewest 10:'); + for (const c of candidates.slice(0, 10)) { + console.log(` [${c.suggested_skill}/${c.suggested_category}] ${c.url}`); + console.log(` lastmod: ${c.lastmod}`); + } + } +} + +main().catch(err => { + console.error('Fatal error:', err.message); + process.exit(1); +}); diff --git a/plugins/ms-ai-architect/scripts/kb-update/lib/registry-io.mjs b/plugins/ms-ai-architect/scripts/kb-update/lib/registry-io.mjs new file mode 100644 index 0000000..f36266f --- /dev/null +++ b/plugins/ms-ai-architect/scripts/kb-update/lib/registry-io.mjs @@ -0,0 +1,75 @@ +// registry-io.mjs — Atomic read/write for url-registry.json and report files. +// Zero dependencies. Uses rename() for atomic writes. + +import { readFileSync, writeFileSync, renameSync, existsSync, mkdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const DEFAULT_DATA_DIR = join(__dirname, '..', 'data'); + +/** + * Load the URL registry from disk. + * @param {string} [dataDir] — defaults to ../data/ relative to lib/ + * @returns {object} parsed registry or empty scaffold + */ +export function loadRegistry(dataDir = DEFAULT_DATA_DIR) { + const path = join(dataDir, 'url-registry.json'); + if (!existsSync(path)) { + return { + version: 1, + created_at: null, + last_poll: null, + sitemap_state: {}, + urls: {}, + }; + } + return JSON.parse(readFileSync(path, 'utf8')); +} + +/** + * Save the URL registry atomically (write to .tmp, then rename). 
+ * @param {object} registry + * @param {string} [dataDir] + */ +export function saveRegistry(registry, dataDir = DEFAULT_DATA_DIR) { + ensureDir(dataDir); + const path = join(dataDir, 'url-registry.json'); + const tmp = path + '.tmp'; + writeFileSync(tmp, JSON.stringify(registry, null, 2) + '\n', 'utf8'); + renameSync(tmp, path); +} + +/** + * Load a JSON report file (change-report.json or discovery-report.json). + * @param {string} name — filename without path (e.g. 'change-report.json') + * @param {string} [dataDir] + * @returns {object|null} parsed JSON or null if not found + */ +export function loadReport(name, dataDir = DEFAULT_DATA_DIR) { + const path = join(dataDir, name); + if (!existsSync(path)) return null; + try { + return JSON.parse(readFileSync(path, 'utf8')); + } catch { + return null; + } +} + +/** + * Save a JSON report file atomically. + * @param {string} name + * @param {object} data + * @param {string} [dataDir] + */ +export function saveReport(name, data, dataDir = DEFAULT_DATA_DIR) { + ensureDir(dataDir); + const path = join(dataDir, name); + const tmp = path + '.tmp'; + writeFileSync(tmp, JSON.stringify(data, null, 2) + '\n', 'utf8'); + renameSync(tmp, path); +} + +function ensureDir(dir) { + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); +} diff --git a/plugins/ms-ai-architect/scripts/kb-update/lib/sitemap-stream.mjs b/plugins/ms-ai-architect/scripts/kb-update/lib/sitemap-stream.mjs new file mode 100644 index 0000000..6817f66 --- /dev/null +++ b/plugins/ms-ai-architect/scripts/kb-update/lib/sitemap-stream.mjs @@ -0,0 +1,124 @@ +// sitemap-stream.mjs — Streaming XML parser for Microsoft Learn sitemaps. +// Zero dependencies. Handles 47MB+ XML without loading into memory. +// Yields { loc, lastmod } per entry. 
+ +import { get as httpsGet } from 'node:https'; +import { createGunzip } from 'node:zlib'; + +const MAX_RETRIES = 3; +const RETRY_DELAY_MS = 2000; + +/** + * Stream a sitemap XML file and yield { loc, lastmod } for each entry. + * Works with both the sitemap index () and child sitemaps (). + * @param {string} url — full HTTPS URL to sitemap XML + * @yields {{ loc: string, lastmod: string|null }} + */ +export async function* streamSitemap(url) { + const chunks = await fetchWithRetry(url); + yield* parseSitemapEntries(chunks); +} + +/** + * Fetch a sitemap fully into a buffer (most child sitemaps are 24-47MB). + * For the index (612KB) this is trivial. For children, we buffer to allow + * the generator to yield entries without backpressure issues. + * @param {string} url + * @param {number} [attempt] + * @returns {Promise} + */ +function fetchWithRetry(url, attempt = 1) { + return new Promise((resolve, reject) => { + httpsGet(url, (res) => { + if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { + return fetchWithRetry(res.headers.location, attempt).then(resolve, reject); + } + if (res.statusCode !== 200) { + res.resume(); + const err = new Error(`HTTP ${res.statusCode} for ${url}`); + if (attempt < MAX_RETRIES) { + return setTimeout(() => fetchWithRetry(url, attempt + 1).then(resolve, reject), RETRY_DELAY_MS * attempt); + } + return reject(err); + } + + const stream = res.headers['content-encoding'] === 'gzip' + ? 
res.pipe(createGunzip()) + : res; + + const parts = []; + stream.on('data', (chunk) => parts.push(chunk.toString('utf8'))); + stream.on('end', () => resolve(parts.join(''))); + stream.on('error', (err) => { + if (attempt < MAX_RETRIES) { + setTimeout(() => fetchWithRetry(url, attempt + 1).then(resolve, reject), RETRY_DELAY_MS * attempt); + } else { + reject(err); + } + }); + }).on('error', (err) => { + if (attempt < MAX_RETRIES) { + setTimeout(() => fetchWithRetry(url, attempt + 1).then(resolve, reject), RETRY_DELAY_MS * attempt); + } else { + reject(err); + } + }); + }); +} + +/** + * Parse sitemap XML text and yield entries. + * Handles both (yields loc from blocks) + * and (yields loc+lastmod from blocks). + * Uses simple regex extraction — reliable for well-formed sitemap XML. + * @param {string} xml + * @yields {{ loc: string, lastmod: string|null }} + */ +function* parseSitemapEntries(xml) { + // Detect if this is a sitemap index or a urlset + const isSitemapIndex = xml.includes(' blocks: ... and ... + const sitemapRegex = /\s*([^<]+)<\/loc>(?:\s*([^<]+)<\/lastmod>)?/g; + let match; + while ((match = sitemapRegex.exec(xml)) !== null) { + yield { loc: match[1].trim(), lastmod: match[2]?.trim() || null }; + } + } else { + // Parse blocks — extract and + // The XML structure per entry is: + // ......... + // We use a two-pass approach: find each ... block, then extract fields + const urlBlockRegex = /([\s\S]*?)<\/url>/g; + const locRegex = /([^<]+)<\/loc>/; + const lastmodRegex = /([^<]+)<\/lastmod>/; + + let match; + while ((match = urlBlockRegex.exec(xml)) !== null) { + const block = match[1]; + const locMatch = locRegex.exec(block); + if (!locMatch) continue; + + const lastmodMatch = lastmodRegex.exec(block); + yield { + loc: locMatch[1].trim(), + lastmod: lastmodMatch ? lastmodMatch[1].trim() : null, + }; + } + } +} + +/** + * Fetch the sitemap index and return parsed entries. 
+ * Convenience wrapper for the common "fetch index, decide which children to poll" pattern. + * @param {string} [indexUrl] + * @returns {Promise>} + */ +export async function fetchSitemapIndex(indexUrl = 'https://learn.microsoft.com/_sitemaps/sitemapindex.xml') { + const entries = []; + for await (const entry of streamSitemap(indexUrl)) { + entries.push(entry); + } + return entries; +} diff --git a/plugins/ms-ai-architect/scripts/kb-update/lib/url-normalize.mjs b/plugins/ms-ai-architect/scripts/kb-update/lib/url-normalize.mjs new file mode 100644 index 0000000..76d2fe2 --- /dev/null +++ b/plugins/ms-ai-architect/scripts/kb-update/lib/url-normalize.mjs @@ -0,0 +1,69 @@ +// url-normalize.mjs — Consistent URL normalization for sitemap ↔ reference file matching. +// Zero dependencies. Idempotent: normalizeUrl(normalizeUrl(x)) === normalizeUrl(x). + +/** + * Normalize a learn.microsoft.com URL to a canonical form. + * Rules applied in order: + * 1. Strip trailing punctuation leaked from markdown + * 2. Strip fragment (#anchor) + * 3. Strip ?view= and other query params + * 4. Remove /en-us/ locale prefix (store locale-free) + * 5. Lowercase + * @param {string} raw + * @returns {string|null} normalized URL, or null if not a learn.microsoft.com URL + */ +export function normalizeUrl(raw) { + if (!raw || typeof raw !== 'string') return null; + if (!raw.includes('learn.microsoft.com')) return null; + + let url = raw; + + // 1. Strip trailing punctuation that leaked from markdown context + url = url.replace(/[.,;:!?'")}\]]+$/, ''); + + // 2. Strip fragment + const hashIdx = url.indexOf('#'); + if (hashIdx !== -1) url = url.slice(0, hashIdx); + + // 3. Strip query parameters (?view=, ?tabs=, etc.) + const qIdx = url.indexOf('?'); + if (qIdx !== -1) url = url.slice(0, qIdx); + + // 4. Remove /en-us/ locale prefix — store locale-free for consistent matching + url = url.replace('://learn.microsoft.com/en-us/', '://learn.microsoft.com/'); + + // 5. 
Strip trailing slash for consistency + url = url.replace(/\/+$/, ''); + + // 6. Lowercase + url = url.toLowerCase(); + + return url; +} + +/** + * Extract all learn.microsoft.com URLs from a text string. + * Handles all 5 citation formats found in reference files: + * - Markdown links: [text](https://learn.microsoft.com/...) + * - Bare URLs on their own line + * - URL: prefix format + * - Dash-bullet format + * - Table cell format + * @param {string} text + * @returns {string[]} array of normalized unique URLs + */ +export function extractUrls(text) { + if (!text) return []; + const regex = /https:\/\/learn\.microsoft\.com[^\s)"'<>\]|]+/g; + const seen = new Set(); + const results = []; + let match; + while ((match = regex.exec(text)) !== null) { + const normalized = normalizeUrl(match[0]); + if (normalized && !seen.has(normalized)) { + seen.add(normalized); + results.push(normalized); + } + } + return results; +} diff --git a/plugins/ms-ai-architect/scripts/kb-update/poll-sitemaps.mjs b/plugins/ms-ai-architect/scripts/kb-update/poll-sitemaps.mjs new file mode 100644 index 0000000..b8726c4 --- /dev/null +++ b/plugins/ms-ai-architect/scripts/kb-update/poll-sitemaps.mjs @@ -0,0 +1,154 @@ +#!/usr/bin/env node +// poll-sitemaps.mjs — Poll Microsoft Learn sitemaps for lastmod changes. +// Updates url-registry.json with current sitemap_lastmod values. +// Usage: node poll-sitemaps.mjs [--force] [--verbose] + +import { dirname, join } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { normalizeUrl } from './lib/url-normalize.mjs'; +import { loadRegistry, saveRegistry } from './lib/registry-io.mjs'; +import { fetchSitemapIndex, streamSitemap } from './lib/sitemap-stream.mjs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const DATA_DIR = join(__dirname, 'data'); +const force = process.argv.includes('--force'); +const verbose = process.argv.includes('--verbose'); + +// Target child sitemaps — covers all URL path prefixes in the registry. 
// Derived from analyzing which sitemaps contain our 1342 tracked URLs.
const TARGET_PREFIXES = [
  'azure_en-us_',
  'microsoft-copilot-studio_en-us_',
  'security_en-us_',
  'fabric_en-us_',
  'power-platform_en-us_',
  'ai_en-us_',
  'copilot_en-us_',
  'compliance_en-us_',
  'agent-framework_en-us_',
  'semantic-kernel_en-us_',
  'entra_en-us_',
  'purview_en-us_',
  'microsoftteams_en-us_',
  'sharepoint_en-us_',
  'microsoft-365_en-us_',
  'training_en-us_',
  'cloud-computing_en-us_',
  'privacy_en-us_',
  // dotnet_en-us_ excluded: 75 sitemaps, only 12 matches. Not worth weekly polling.
  // Re-enable with --force if needed.
];

/**
 * Extract the child sitemap name from a sitemap-index <loc> URL.
 * e.g. https://learn.microsoft.com/_sitemaps/azure_en-us_7.xml → azure_en-us_7
 * @param {string} loc
 * @returns {string|null} child name, or null when the URL has another shape
 */
function extractChildName(loc) {
  const match = loc.match(/\/_sitemaps\/([^/]+)\.xml$/);
  return match ? match[1] : null;
}

/**
 * @param {string} childName
 * @returns {boolean} true when the child belongs to a tracked doc family
 */
function isTargetChild(childName) {
  return TARGET_PREFIXES.some((prefix) => childName.startsWith(prefix));
}

/**
 * Poll every target child sitemap and refresh the registry's
 * sitemap_lastmod values. Children whose index-level lastmod is unchanged
 * since the previous successful run are skipped unless --force is given.
 */
async function main() {
  const registry = loadRegistry(DATA_DIR);
  const urlIndex = new Map(Object.entries(registry.urls));

  console.log(`Registry loaded: ${urlIndex.size} URLs`);

  // Step 1: Fetch sitemap index
  console.log('Fetching sitemap index...');
  const indexEntries = await fetchSitemapIndex();
  console.log(`Sitemap index: ${indexEntries.length} child sitemaps found`);

  // Step 2: Filter to target children
  const targetChildren = indexEntries
    .map((e) => ({ ...e, name: extractChildName(e.loc) }))
    .filter((e) => e.name && isTargetChild(e.name));

  console.log(`Target children: ${targetChildren.length}`);

  let totalMatched = 0;
  let totalUpdated = 0;
  let childrenPolled = 0;
  let childrenSkipped = 0;
  const failedChildren = []; // children whose fetch/stream errored this run

  // Step 3: Poll each child
  for (const child of targetChildren) {
    const prevState = registry.sitemap_state[child.name];

    // Skip if unchanged since last check (unless --force)
    if (!force && prevState && prevState.lastmod === child.lastmod) {
      if (verbose) console.log(`  Skipping ${child.name} (unchanged since ${child.lastmod})`);
      childrenSkipped++;
      continue;
    }

    console.log(`  Polling ${child.name} (lastmod: ${child.lastmod})...`);
    childrenPolled++;

    let matchedInChild = 0;
    let updatedInChild = 0;

    try {
      for await (const entry of streamSitemap(child.loc)) {
        const normalized = normalizeUrl(entry.loc);
        if (!normalized) continue;

        const registryEntry = urlIndex.get(normalized);
        if (!registryEntry) continue; // URL not cited by any reference file

        matchedInChild++;

        // Update lastmod if changed
        if (entry.lastmod && registryEntry.sitemap_lastmod !== entry.lastmod) {
          registryEntry.sitemap_lastmod = entry.lastmod;
          updatedInChild++;
        }
        registryEntry.status = 'tracked';
      }
    } catch (err) {
      // Deliberately do NOT record sitemap_state for a failed child, so the
      // next run retries it even if its index lastmod is unchanged.
      console.error(`  ERROR polling ${child.name}: ${err.message}`);
      failedChildren.push(child.name);
      continue;
    }

    console.log(`  Matched: ${matchedInChild}, Updated: ${updatedInChild}`);
    totalMatched += matchedInChild;
    totalUpdated += updatedInChild;

    // Record child state so unchanged children are skipped next run.
    registry.sitemap_state[child.name] = {
      lastmod: child.lastmod,
      checked_at: new Date().toISOString(),
    };
  }

  // Step 4: Classify URLs never seen in any sitemap. Skipped when any child
  // failed this run: a transient fetch error must not permanently mislabel
  // that child's URLs as not_in_sitemap.
  if (failedChildren.length === 0) {
    for (const entry of Object.values(registry.urls)) {
      if (entry.status === 'unpolled') {
        entry.status = 'not_in_sitemap';
      }
    }
  } else {
    console.warn(`  WARNING: ${failedChildren.length} child sitemap(s) failed; leaving unpolled URLs unclassified for retry.`);
  }

  // Step 5: Save
  registry.last_poll = new Date().toISOString();
  saveRegistry(registry, DATA_DIR);

  // Summary — totals come from the registry so counts persisted by earlier
  // runs are included, not just this run's transitions.
  const statuses = Object.values(registry.urls);
  const tracked = statuses.filter((u) => u.status === 'tracked').length;
  const notInSitemap = statuses.filter((u) => u.status === 'not_in_sitemap').length;
  console.log('\n=== Poll Summary ===');
  console.log(`Children polled: ${childrenPolled}, skipped: ${childrenSkipped}, failed: ${failedChildren.length}`);
  console.log(`URLs matched: ${totalMatched}, lastmod updated: ${totalUpdated}`);
  console.log(`Registry: ${tracked} tracked, ${notInSitemap} not in sitemap`);
  console.log(`Match rate: ${((tracked / urlIndex.size) * 100).toFixed(1)}%`);
}

main().catch((err) => {
  console.error('Fatal error:', err.message);
  process.exit(1);
});
#!/usr/bin/env node
// report-changes.mjs — Compare sitemap lastmod to reference file "Last updated:" headers.
// Generates change-report.json and prints human-readable summary.
// Usage: node report-changes.mjs [--json]

import { readFileSync, existsSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { loadRegistry, saveReport } from './lib/registry-io.mjs';

const __dirname = dirname(fileURLToPath(import.meta.url));
const PLUGIN_ROOT = join(__dirname, '..', '..');
const DATA_DIR = join(__dirname, 'data');
const jsonOnly = process.argv.includes('--json');

/**
 * Classify a reference file's update priority from path patterns.
 * @param {string} filePath - plugin-relative path
 * @returns {'critical'|'high'|'medium'|'low'}
 */
function getFilePriority(filePath) {
  const lower = filePath.toLowerCase();
  if (/cost|pricing|pris/.test(lower)) return 'critical';
  if (/responsible-ai|governance|ai-security-(?:engineering|scoring)/.test(lower)) return 'high';
  if (/platforms|copilot|azure-ai-services|agent-orchestration|rag|mlops|prompt-engineering|monitoring|performance/.test(lower)) return 'medium';
  return 'low';
}

// "Last updated:" header variants (English and Norwegian) used across reference files.
const LAST_UPDATED_PATTERNS = [
  /\*\*Last updated:\*\*\s*([\d-]+)/i,
  /\*\*Sist (?:oppdatert|verifisert):\*\*\s*([\d-]+)/i,
  /\*\*Dato:\*\*\s*([\d-]+)/i,
];

// Cache: the same reference file is listed by many URLs — parse it once,
// not once per (url, file) pair (previously O(URLs × refs) disk reads).
const lastUpdatedCache = new Map(); // filePath → 'YYYY-MM-DD' | null

/**
 * Read a reference file's "Last updated" date from its header.
 * @param {string} filePath - plugin-relative path
 * @returns {string|null} ISO date (YYYY-MM-DD), or null when the file is
 *   missing or has no recognizable header — callers treat null as "always stale"
 */
function parseLastUpdated(filePath) {
  if (lastUpdatedCache.has(filePath)) return lastUpdatedCache.get(filePath);

  let result = null;
  const fullPath = join(PLUGIN_ROOT, filePath);
  if (existsSync(fullPath)) {
    // Header sits at the top of the file; only the first 500 characters of
    // the decoded content are scanned.
    const head = readFileSync(fullPath, 'utf8').slice(0, 500);
    for (const pattern of LAST_UPDATED_PATTERNS) {
      const match = head.match(pattern);
      if (match) {
        const raw = match[1].trim();
        // YYYY-MM → YYYY-MM-01; YYYY-MM-DD kept as-is.
        result = raw.length === 7 ? `${raw}-01` : raw;
        break;
      }
    }
  }
  lastUpdatedCache.set(filePath, result);
  return result;
}

// Priority sort order
const PRIORITY_ORDER = { critical: 0, high: 1, medium: 2, low: 3 };

// --- Main ---
const registry = loadRegistry(DATA_DIR);

if (!registry.last_poll) {
  console.error('Registry has not been polled yet. Run poll-sitemaps.mjs first.');
  process.exit(1);
}

// Group changed URLs by reference file
const fileChanges = new Map(); // filePath → { changedUrls, newestChange, fileDate }

for (const [url, entry] of Object.entries(registry.urls)) {
  if (!entry.sitemap_lastmod || entry.status !== 'tracked') continue;

  // Compare date-only: some sitemaps publish full timestamps
  // ("2026-04-01T00:00:00+00:00"), which as raw strings sort after a plain
  // "2026-04-01" header and would spuriously flag same-day files as stale.
  const sourceDate = entry.sitemap_lastmod.slice(0, 10);

  for (const refFile of entry.reference_files) {
    const fileDate = parseLastUpdated(refFile);
    // No parseable date → treat the file as always stale.
    const effectiveDate = fileDate || '0000-01-01';

    if (sourceDate > effectiveDate) {
      if (!fileChanges.has(refFile)) {
        fileChanges.set(refFile, { changedUrls: [], newestChange: entry.sitemap_lastmod, fileDate });
      }
      const fc = fileChanges.get(refFile);
      fc.changedUrls.push({ url, sitemap_lastmod: entry.sitemap_lastmod });
      if (entry.sitemap_lastmod > fc.newestChange) {
        fc.newestChange = entry.sitemap_lastmod;
      }
    }
  }
}

// Build report entries
const files = [];
for (const [path, changes] of fileChanges) {
  const pathParts = path.split('/');
  files.push({
    path,
    priority: getFilePriority(path),
    file_last_updated: changes.fileDate || 'unknown',
    newest_source_change: changes.newestChange,
    changed_url_count: changes.changedUrls.length,
    changed_urls: changes.changedUrls.map((u) => u.url),
    skill: pathParts[1] || 'unknown',
    category: pathParts[3] || 'unknown',
  });
}

// Sort: priority first, then newest source change descending
files.sort((a, b) => {
  const pDiff = PRIORITY_ORDER[a.priority] - PRIORITY_ORDER[b.priority];
  if (pDiff !== 0) return pDiff;
  return b.newest_source_change.localeCompare(a.newest_source_change);
});

// Count by priority
const byPriority = { critical: 0, high: 0, medium: 0, low: 0 };
for (const f of files) byPriority[f.priority]++;

const report = {
  generated_at: new Date().toISOString().split('T')[0],
  last_poll: registry.last_poll,
  total_tracked: Object.values(registry.urls).filter((u) => u.status === 'tracked').length,
  total_not_in_sitemap: Object.values(registry.urls).filter((u) => u.status === 'not_in_sitemap').length,
  total_files_needing_update: files.length,
  by_priority: byPriority,
  files,
};

saveReport('change-report.json', report, DATA_DIR);

if (jsonOnly) {
  process.stdout.write(JSON.stringify(report, null, 2) + '\n');
} else {
  console.log(`\n=== KB Change Report (${report.generated_at}) ===`);
  console.log(`Sources last polled: ${registry.last_poll}`);
  console.log(`URLs tracked: ${report.total_tracked}/${Object.keys(registry.urls).length} (${report.total_not_in_sitemap} not in sitemap)`);
  console.log(`Files needing update: ${files.length} (Critical: ${byPriority.critical}, High: ${byPriority.high}, Medium: ${byPriority.medium}, Low: ${byPriority.low})`);

  if (files.length > 0) {
    console.log('\nTop 20 by priority:');
    for (const f of files.slice(0, 20)) {
      console.log(`  [${f.priority.toUpperCase()}] ${f.path}`);
      console.log(`    ${f.changed_url_count} source(s) changed. Latest: ${f.newest_source_change}. File: ${f.file_last_updated}`);
    }
    if (files.length > 20) {
      console.log(`  ... and ${files.length - 20} more`);
    }
  }

  console.log('\nRun: /architect:generate-skills --update to process updates');
}
#!/usr/bin/env node
// run-weekly-update.mjs — Orchestrator for weekly KB update pipeline.
// Runs: poll → report → (optional) discover, sequentially.
// Usage: node run-weekly-update.mjs [--force] [--discover] [--dry-run]

import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { execFileSync } from 'node:child_process';
import { loadRegistry } from './lib/registry-io.mjs';

const __dirname = dirname(fileURLToPath(import.meta.url));
const DATA_DIR = join(__dirname, 'data');

const force = process.argv.includes('--force');
const discover = process.argv.includes('--discover');
const dryRun = process.argv.includes('--dry-run');

const DAY_MS = 24 * 60 * 60 * 1000;

/**
 * Run one pipeline step as a child process, inheriting stdio.
 * Exits the orchestrator on failure, propagating the child's exit code.
 * @param {string} script - file name relative to this directory
 * @param {string[]} [args] - extra CLI arguments for the step
 */
function run(script, args = []) {
  const fullPath = join(__dirname, script);
  console.log(`\n--- Running ${script} ${args.join(' ')} ---`);
  try {
    // process.execPath: reuse the exact Node binary running this orchestrator
    // instead of whatever 'node' resolves to on PATH (version managers, cron).
    execFileSync(process.execPath, [fullPath, ...args], {
      stdio: 'inherit',
      timeout: 10 * 60 * 1000, // 10 min max per step
    });
  } catch (err) {
    console.error(`${script} failed: ${err.message}`);
    // err.status is the child's exit code (null when killed by signal/timeout).
    process.exit(typeof err.status === 'number' ? err.status : 1);
  }
}

// --- Main ---
const registry = loadRegistry(DATA_DIR);
const lastPoll = registry.last_poll ? new Date(registry.last_poll) : null;
// No previous poll → Infinity, so the first run always proceeds.
const daysSince = lastPoll ? (Date.now() - lastPoll.getTime()) / DAY_MS : Infinity;

if (!force && daysSince < 7) {
  console.log(`Last poll: ${Math.floor(daysSince)} day(s) ago. Next in ${Math.ceil(7 - daysSince)} day(s).`);
  console.log('Use --force to run anyway.');
  process.exit(0);
}

if (dryRun) {
  console.log('DRY RUN — would execute:');
  console.log('  1. poll-sitemaps.mjs' + (force ? ' --force' : ''));
  console.log('  2. report-changes.mjs');
  if (discover) console.log('  3. discover-new-urls.mjs');
  process.exit(0);
}

// Ensure registry exists
if (Object.keys(registry.urls).length === 0) {
  console.log('Registry empty — building from reference files first...');
  run('build-registry.mjs');
}

// Step 1: Poll sitemaps
run('poll-sitemaps.mjs', force ? ['--force'] : []);

// Step 2: Generate change report
run('report-changes.mjs');

// Step 3: Optional discovery
if (discover) {
  run('discover-new-urls.mjs', ['--limit', '500']);
}

console.log('\n=== Weekly update complete ===');