feat(ms-ai-architect): sitemap-based KB change detection system

Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps weekly to detect when source documentation changes. Replaces the broken mtime-based staleness check (all files had identical mtime after release). Components: - build-registry.mjs: extracts 1342 URLs from 387 reference files - poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry - report-changes.mjs: prioritized change report (critical/high/medium/low) - discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered - run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run Integration: - session-start hook reads change-report.json instead of broken mtime check - hook triggers background poll if >7 days since last check - generate-skills --update reads change report for targeted MCP updates Current stats: 69% match rate (924/1342 URLs tracked via sitemaps). ~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-09 21:19:51 +02:00 · 2026-04-09 21:19:51 +02:00 · f968f37be3
commit f968f37be3
parent 035255fc5d
13 changed files with 976 additions and 59 deletions
--- a/plugins/ms-ai-architect/scripts/kb-update/lib/registry-io.mjs
+++ b/plugins/ms-ai-architect/scripts/kb-update/lib/registry-io.mjs
@ -0,0 +1,75 @@
+// registry-io.mjs — Atomic read/write for url-registry.json and report files.
+// Zero dependencies. Uses rename() for atomic writes.
+
+import { readFileSync, writeFileSync, renameSync, existsSync, mkdirSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const DEFAULT_DATA_DIR = join(__dirname, '..', 'data');
+
+/**
+ * Load the URL registry from disk.
+ * @param {string} [dataDir] — defaults to ../data/ relative to lib/
+ * @returns {object} parsed registry or empty scaffold
+ */
+export function loadRegistry(dataDir = DEFAULT_DATA_DIR) {
+  const path = join(dataDir, 'url-registry.json');
+  if (!existsSync(path)) {
+    return {
+      version: 1,
+      created_at: null,
+      last_poll: null,
+      sitemap_state: {},
+      urls: {},
+    };
+  }
+  return JSON.parse(readFileSync(path, 'utf8'));
+}
+
+/**
+ * Save the URL registry atomically (write to .tmp, then rename).
+ * @param {object} registry
+ * @param {string} [dataDir]
+ */
+export function saveRegistry(registry, dataDir = DEFAULT_DATA_DIR) {
+  ensureDir(dataDir);
+  const path = join(dataDir, 'url-registry.json');
+  const tmp = path + '.tmp';
+  writeFileSync(tmp, JSON.stringify(registry, null, 2) + '\n', 'utf8');
+  renameSync(tmp, path);
+}
+
+/**
+ * Load a JSON report file (change-report.json or discovery-report.json).
+ * @param {string} name — filename without path (e.g. 'change-report.json')
+ * @param {string} [dataDir]
+ * @returns {object|null} parsed JSON or null if not found
+ */
+export function loadReport(name, dataDir = DEFAULT_DATA_DIR) {
+  const path = join(dataDir, name);
+  if (!existsSync(path)) return null;
+  try {
+    return JSON.parse(readFileSync(path, 'utf8'));
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Save a JSON report file atomically.
+ * @param {string} name
+ * @param {object} data
+ * @param {string} [dataDir]
+ */
+export function saveReport(name, data, dataDir = DEFAULT_DATA_DIR) {
+  ensureDir(dataDir);
+  const path = join(dataDir, name);
+  const tmp = path + '.tmp';
+  writeFileSync(tmp, JSON.stringify(data, null, 2) + '\n', 'utf8');
+  renameSync(tmp, path);
+}
+
+function ensureDir(dir) {
+  if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
+}
--- a/plugins/ms-ai-architect/scripts/kb-update/lib/sitemap-stream.mjs
+++ b/plugins/ms-ai-architect/scripts/kb-update/lib/sitemap-stream.mjs
@ -0,0 +1,124 @@
+// sitemap-stream.mjs — Fetcher and parser for Microsoft Learn sitemaps.
+// Zero dependencies. Each sitemap (up to ~47MB) is buffered in memory, then parsed lazily.
+// Yields { loc, lastmod } per <url> entry (or per <sitemap> entry for a sitemap index).
+
+import { get as httpsGet } from 'node:https';
+import { createGunzip } from 'node:zlib';
+
+// Maximum number of attempts fetchWithRetry makes for one URL before rejecting.
+const MAX_RETRIES = 3;
+// Base retry delay in milliseconds; fetchWithRetry waits RETRY_DELAY_MS * attempt before retrying.
+const RETRY_DELAY_MS = 2000;
+
+/**
+ * Stream a sitemap XML file and yield { loc, lastmod } for each <url> entry.
+ * Works with both the sitemap index (<sitemapindex>) and child sitemaps (<urlset>).
+ * @param {string} url — full HTTPS URL to sitemap XML
+ * @yields {{ loc: string, lastmod: string|null }}
+ */
+export async function* streamSitemap(url) {
+  const chunks = await fetchWithRetry(url);
+  yield* parseSitemapEntries(chunks);
+}
+
+/**
+ * Fetch a sitemap fully into a buffer (most child sitemaps are 24-47MB).
+ * For the index (612KB) this is trivial. For children, we buffer to allow
+ * the generator to yield entries without backpressure issues.
+ * @param {string} url
+ * @param {number} [attempt]
+ * @returns {Promise<string>}
+ */
+function fetchWithRetry(url, attempt = 1) {
+  return new Promise((resolve, reject) => {
+    httpsGet(url, (res) => {
+      if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
+        return fetchWithRetry(res.headers.location, attempt).then(resolve, reject);
+      }
+      if (res.statusCode !== 200) {
+        res.resume();
+        const err = new Error(`HTTP ${res.statusCode} for ${url}`);
+        if (attempt < MAX_RETRIES) {
+          return setTimeout(() => fetchWithRetry(url, attempt + 1).then(resolve, reject), RETRY_DELAY_MS * attempt);
+        }
+        return reject(err);
+      }
+
+      const stream = res.headers['content-encoding'] === 'gzip'
+        ? res.pipe(createGunzip())
+        : res;
+
+      const parts = [];
+      stream.on('data', (chunk) => parts.push(chunk.toString('utf8')));
+      stream.on('end', () => resolve(parts.join('')));
+      stream.on('error', (err) => {
+        if (attempt < MAX_RETRIES) {
+          setTimeout(() => fetchWithRetry(url, attempt + 1).then(resolve, reject), RETRY_DELAY_MS * attempt);
+        } else {
+          reject(err);
+        }
+      });
+    }).on('error', (err) => {
+      if (attempt < MAX_RETRIES) {
+        setTimeout(() => fetchWithRetry(url, attempt + 1).then(resolve, reject), RETRY_DELAY_MS * attempt);
+      } else {
+        reject(err);
+      }
+    });
+  });
+}
+
+/**
+ * Parse sitemap XML text and yield entries.
+ * Handles both <sitemapindex> (yields loc from <sitemap> blocks)
+ * and <urlset> (yields loc+lastmod from <url> blocks).
+ * Uses simple regex extraction — reliable for well-formed sitemap XML.
+ * @param {string} xml
+ * @yields {{ loc: string, lastmod: string|null }}
+ */
+function* parseSitemapEntries(xml) {
+  // Detect if this is a sitemap index or a urlset
+  const isSitemapIndex = xml.includes('<sitemapindex');
+
+  if (isSitemapIndex) {
+    // Parse <sitemap> blocks: <loc>...</loc> and <lastmod>...</lastmod>
+    const sitemapRegex = /<sitemap>\s*<loc>([^<]+)<\/loc>(?:\s*<lastmod>([^<]+)<\/lastmod>)?/g;
+    let match;
+    while ((match = sitemapRegex.exec(xml)) !== null) {
+      yield { loc: match[1].trim(), lastmod: match[2]?.trim() || null };
+    }
+  } else {
+    // Parse <url> blocks — extract <loc> and <lastmod>
+    // The XML structure per entry is:
+    //   <url><loc>...</loc><lastmod>...</lastmod><xhtml:link .../>...</url>
+    // We use a two-pass approach: find each <url>...</url> block, then extract fields
+    const urlBlockRegex = /<url>([\s\S]*?)<\/url>/g;
+    const locRegex = /<loc>([^<]+)<\/loc>/;
+    const lastmodRegex = /<lastmod>([^<]+)<\/lastmod>/;
+
+    let match;
+    while ((match = urlBlockRegex.exec(xml)) !== null) {
+      const block = match[1];
+      const locMatch = locRegex.exec(block);
+      if (!locMatch) continue;
+
+      const lastmodMatch = lastmodRegex.exec(block);
+      yield {
+        loc: locMatch[1].trim(),
+        lastmod: lastmodMatch ? lastmodMatch[1].trim() : null,
+      };
+    }
+  }
+}
+
+/**
+ * Fetch the sitemap index and return parsed entries.
+ * Convenience wrapper for the common "fetch index, decide which children to poll" pattern.
+ * @param {string} [indexUrl]
+ * @returns {Promise<Array<{ loc: string, lastmod: string|null }>>}
+ */
+export async function fetchSitemapIndex(indexUrl = 'https://learn.microsoft.com/_sitemaps/sitemapindex.xml') {
+  const entries = [];
+  for await (const entry of streamSitemap(indexUrl)) {
+    entries.push(entry);
+  }
+  return entries;
+}
--- a/plugins/ms-ai-architect/scripts/kb-update/lib/url-normalize.mjs
+++ b/plugins/ms-ai-architect/scripts/kb-update/lib/url-normalize.mjs
@ -0,0 +1,69 @@
+// url-normalize.mjs — Consistent URL normalization for sitemap ↔ reference file matching.
+// Zero dependencies. Idempotent: normalizeUrl(normalizeUrl(x)) === normalizeUrl(x).
+
+/**
+ * Normalize a learn.microsoft.com URL to a canonical form.
+ * Rules applied in order:
+ *   1. Strip trailing punctuation leaked from markdown
+ *   2. Strip fragment (#anchor)
+ *   3. Strip ?view= and other query params
+ *   4. Remove /en-us/ locale prefix (store locale-free)
+ *   5. Lowercase
+ * @param {string} raw
+ * @returns {string|null} normalized URL, or null if not a learn.microsoft.com URL
+ */
+export function normalizeUrl(raw) {
+  if (!raw || typeof raw !== 'string') return null;
+  if (!raw.includes('learn.microsoft.com')) return null;
+
+  let url = raw;
+
+  // 1. Strip trailing punctuation that leaked from markdown context
+  url = url.replace(/[.,;:!?'")}\]]+$/, '');
+
+  // 2. Strip fragment
+  const hashIdx = url.indexOf('#');
+  if (hashIdx !== -1) url = url.slice(0, hashIdx);
+
+  // 3. Strip query parameters (?view=, ?tabs=, etc.)
+  const qIdx = url.indexOf('?');
+  if (qIdx !== -1) url = url.slice(0, qIdx);
+
+  // 4. Remove /en-us/ locale prefix — store locale-free for consistent matching
+  url = url.replace('://learn.microsoft.com/en-us/', '://learn.microsoft.com/');
+
+  // 5. Strip trailing slash for consistency
+  url = url.replace(/\/+$/, '');
+
+  // 6. Lowercase
+  url = url.toLowerCase();
+
+  return url;
+}
+
+/**
+ * Extract all learn.microsoft.com URLs from a text string.
+ * Handles all 5 citation formats found in reference files:
+ *   - Markdown links: [text](https://learn.microsoft.com/...)
+ *   - Bare URLs on their own line
+ *   - URL: prefix format
+ *   - Dash-bullet format
+ *   - Table cell format
+ * @param {string} text
+ * @returns {string[]} array of normalized unique URLs
+ */
+export function extractUrls(text) {
+  if (!text) return [];
+  const regex = /https:\/\/learn\.microsoft\.com[^\s)"'<>\]|]+/g;
+  const seen = new Set();
+  const results = [];
+  let match;
+  while ((match = regex.exec(text)) !== null) {
+    const normalized = normalizeUrl(match[0]);
+    if (normalized && !seen.has(normalized)) {
+      seen.add(normalized);
+      results.push(normalized);
+    }
+  }
+  return results;
+}