ktg-plugin-marketplace/plugins/ms-ai-architect/scripts/kb-update/lib/url-normalize.mjs
Kjell Tore Guttormsen f968f37be3 feat(ms-ai-architect): sitemap-based KB change detection system
Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps
weekly to detect when source documentation changes. Replaces the broken
mtime-based staleness check (all files had identical mtime after release).

Components:
- build-registry.mjs: extracts 1342 URLs from 387 reference files
- poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry
- report-changes.mjs: prioritized change report (critical/high/medium/low)
- discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered
- run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run

Integration:
- session-start hook reads change-report.json instead of broken mtime check
- hook triggers background poll if >7 days since last check
- generate-skills --update reads change report for targeted MCP updates

Current stats: 69% match rate (924/1342 URLs tracked via sitemaps).
~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-09 21:19:51 +02:00

69 lines
2.2 KiB
JavaScript

// url-normalize.mjs — Consistent URL normalization for sitemap ↔ reference file matching.
// Zero dependencies. Idempotent: normalizeUrl(normalizeUrl(x)) === normalizeUrl(x).
/**
 * Normalize a learn.microsoft.com URL to a canonical form.
 *
 * Rules applied in order:
 *   1. Trim surrounding whitespace and lowercase the whole URL
 *   2. Strip the fragment (#anchor) and the query string (?view=, ?tabs=, ...)
 *   3. Strip trailing markdown punctuation, slashes and whitespace
 *   4. Remove the /en-us locale prefix (store locale-free)
 *
 * The order is chosen so that the result is a fixed point: lowercasing comes
 * first so the locale prefix is recognised however it is capitalised
 * (/en-US/, /EN-us/), and the trailing clean-up happens after the
 * fragment/query is cut so nothing that was hidden behind "#" or "?" is
 * left dangling at the end.
 *
 * @param {string} raw
 * @returns {string|null} normalized URL, or null when `raw` is not a
 *   non-empty string or does not contain "learn.microsoft.com" outside of
 *   its query string / fragment
 */
export function normalizeUrl(raw) {
  if (!raw || typeof raw !== 'string') return null;

  // 1. Host names are case-insensitive and matching is done on lowercase
  //    keys, so fold case up front; every later rule can then be
  //    case-sensitive.
  let url = raw.trim().toLowerCase();

  // 2. Cut at the first "#" or "?" — equivalent to dropping the fragment and
  //    then the query string.
  const cutIdx = url.search(/[#?]/);
  if (cutIdx !== -1) url = url.slice(0, cutIdx);

  // The host must be part of the URL proper, not just mentioned in a query
  // parameter or anchor that was removed above.
  if (!url.includes('learn.microsoft.com')) return null;

  // 3. Strip punctuation that leaked from markdown context together with
  //    trailing slashes and whitespace, as one run, so inputs such as
  //    "…/foo./" and "…/foo/." end up identical.
  url = url.replace(/[\s.,;:!?'")}\]\/]+$/, '');

  // 4. Remove the /en-us locale segment directly after the host. The
  //    lookahead keeps look-alike segments ("/en-usa/…") intact and also
  //    covers the bare locale root ("…/en-us").
  url = url.replace(/(:\/\/learn\.microsoft\.com)(?:\/en-us)+(?=\/|$)/, '$1');

  return url;
}
/**
 * Extract all learn.microsoft.com URLs from a text string.
 *
 * Handles the citation formats found in reference files:
 *   - Markdown links: [text](https://learn.microsoft.com/...)
 *   - Bare URLs on their own line
 *   - URL: prefix format
 *   - Dash-bullet format
 *   - Table cell format
 *   - Inline code spans: `https://learn.microsoft.com/...`
 *
 * Only https:// URLs are matched. Every match is passed through
 * normalizeUrl(); matches it rejects are dropped and duplicates (after
 * normalization) are reported once, in order of first appearance.
 *
 * @param {string} text - content to scan; non-string values are coerced
 *   with String()
 * @returns {string[]} array of normalized unique URLs
 */
export function extractUrls(text) {
  if (!text) return [];

  // A match ends at whitespace or at a delimiter of the surrounding markup:
  // ")" (markdown link), quotes, "<" / ">" (HTML / autolink), "]", "|"
  // (table cell) and "`" (inline code span). Without the backtick the
  // closing delimiter of a code span would be captured as part of the URL.
  const urlPattern = /https:\/\/learn\.microsoft\.com[^\s)"'<>\]|`]+/g;

  // A Set keeps insertion order, so it provides both de-duplication and the
  // first-seen ordering of the result.
  const unique = new Set();
  for (const [candidate] of String(text).matchAll(urlPattern)) {
    const normalized = normalizeUrl(candidate);
    if (normalized) unique.add(normalized);
  }
  return [...unique];
}