// url-normalize.mjs — Consistent URL normalization for sitemap ↔ reference file matching.
// Zero dependencies. Idempotent: normalizeUrl(normalizeUrl(x)) === normalizeUrl(x).

const LEARN_HOST = 'learn.microsoft.com';

// Punctuation that leaks in from surrounding markdown/prose, plus trailing
// slashes. Stripped as one class so mixed tails ("foo./", "foo/.") collapse
// in a single pass.
const TRAILING_JUNK = /[.,;:!?'")}\]\/]+$/;

// One or more leading "/en-us" path segments directly after the host. The
// lookahead requires a segment boundary so "/en-usa/..." is left untouched,
// while a bare ".../en-us" (no trailing slash) is still recognised.
const LOCALE_PREFIX = /(:\/\/learn\.microsoft\.com)(?:\/en-us)+(?=\/|$)/;

/**
 * Normalize a learn.microsoft.com URL to a canonical form.
 *
 * Rules applied in order:
 *   1. Strip fragment (#anchor)
 *   2. Strip ?view= and any other query params
 *   3. Reject anything that no longer mentions learn.microsoft.com
 *   4. Lowercase
 *   5. Strip trailing punctuation leaked from markdown, and trailing slashes
 *   6. Remove the /en-us locale prefix (store locale-free)
 *
 * Lowercasing and trailing cleanup run before the locale step so that inputs
 * such as "/EN-US/Foo" or "/en-us/" reach their final form in one pass, which
 * is what makes the function idempotent.
 *
 * @param {string} raw
 * @returns {string|null} normalized URL, or null if not a learn.microsoft.com URL
 */
export function normalizeUrl(raw) {
  if (typeof raw !== 'string' || raw === '') return null;

  let url = raw;

  // 1. Strip fragment
  const hashIdx = url.indexOf('#');
  if (hashIdx !== -1) url = url.slice(0, hashIdx);

  // 2. Strip query parameters (?view=, ?tabs=, etc.)
  const queryIdx = url.indexOf('?');
  if (queryIdx !== -1) url = url.slice(0, queryIdx);

  // 3. The host must appear outside the query/fragment; otherwise a foreign
  //    URL like "https://example.com/?ref=learn.microsoft.com" would pass.
  //    The check is case-sensitive.
  if (!url.includes(LEARN_HOST)) return null;

  // 4. Lowercase
  url = url.toLowerCase();

  // 5. Strip trailing punctuation and slashes
  url = url.replace(TRAILING_JUNK, '');

  // 6. Remove the locale prefix — store locale-free for consistent matching
  url = url.replace(LOCALE_PREFIX, '$1');

  return url;
}

/**
 * Extract all learn.microsoft.com URLs from a text string.
 * Handles all 5 citation formats found in reference files:
 *   - Markdown links: [text](https://learn.microsoft.com/...)
 *   - Bare URLs on their own line
 *   - URL: prefix format
 *   - Dash-bullet format
 *   - Table cell format
 *
 * A match ends at whitespace or any of ) " ' < > ] | so the closing
 * delimiter of each format is not swallowed into the URL.
 *
 * @param {string} text
 * @returns {string[]} array of normalized unique URLs, in first-seen order
 */
export function extractUrls(text) {
  if (!text) return [];

  const urlPattern = /https:\/\/learn\.microsoft\.com[^\s)"'<>\]|]+/g;

  // A Set keeps insertion order, so it gives both de-duplication and
  // first-seen ordering.
  const unique = new Set();
  for (const [candidate] of String(text).matchAll(urlPattern)) {
    const normalized = normalizeUrl(candidate);
    if (normalized) unique.add(normalized);
  }
  return [...unique];
}