ktg-plugin-marketplace/plugins/ms-ai-architect/scripts/kb-update/lib/url-normalize.mjs

// url-normalize.mjs — Consistent URL normalization for sitemap ↔ reference file matching.
// Zero dependencies. Idempotent: normalizeUrl(normalizeUrl(x)) === normalizeUrl(x).

/**
 * Normalize a learn.microsoft.com URL to a canonical, locale-free form.
 * Rules applied in order:
 *   1. Strip fragment (#anchor)
 *   2. Strip ?view= and all other query params
 *   3. Lowercase
 *   4. Strip trailing slashes and trailing punctuation leaked from markdown
 *   5. Remove the /en-us locale prefix (store locale-free)
 *
 * The order matters for idempotence: lowercasing happens before the locale
 * is removed (so /en-US/ is recognised), and the trailing clean-up happens
 * after the fragment/query are cut (so punctuation exposed by the cut is
 * removed in the same pass).
 * normalizeUrl(normalizeUrl(x)) === normalizeUrl(x) for every input.
 *
 * @param {string} raw
 * @returns {string|null} normalized URL, or null if `raw` is not a string or
 *   its fragment/query-free part does not contain the learn.microsoft.com host
 */
export function normalizeUrl(raw) {
  if (typeof raw !== 'string') return null;

  // 1 + 2. Drop everything from the first '#' or '?' onward. Cutting at
  // whichever comes first is the same as "strip fragment, then strip query".
  const cutAt = raw.search(/[#?]/);
  let url = cutAt === -1 ? raw : raw.slice(0, cutAt);

  // 3. Lowercase before any matching so host and locale checks are
  // case-insensitive.
  url = url.toLowerCase();

  // Checked after the cut so a host name that only appears inside a query
  // string or fragment does not qualify an unrelated URL.
  if (!url.includes('learn.microsoft.com')) return null;

  // 4. Trailing slashes and markdown punctuation are removed as one run, so
  // mixed tails such as "./" or "/)." are fully cleaned in a single pass.
  url = url.replace(/[.,;:!?'")}\]/]+$/, '');

  // 5. Remove the locale segment directly after the host. The lookahead keeps
  // look-alike segments such as "/en-usa" intact while still matching a bare
  // "/en-us" root; the "+" collapses a repeated "/en-us/en-us".
  url = url.replace(/(:\/\/learn\.microsoft\.com)(?:\/en-us(?=\/|$))+/g, '$1');

  return url;
}

/**
 * Collect every learn.microsoft.com URL that appears in a block of text.
 * The pattern is format-agnostic, so it picks up URLs from each of the
 * citation styles used in reference files:
 *   - Markdown links: [text](https://learn.microsoft.com/...)
 *   - Bare URLs on their own line
 *   - URL: prefix format
 *   - Dash-bullet format
 *   - Table cell format
 * Each hit is passed through normalizeUrl; duplicates (after normalization)
 * are dropped and first-seen order is kept.
 * @param {string} text
 * @returns {string[]} array of normalized unique URLs
 */
export function extractUrls(text) {
  if (!text) return [];

  // A match runs until whitespace or a character that closes the surrounding
  // markup: ) " ' < > ] |
  const urlPattern = /https:\/\/learn\.microsoft\.com[^\s)"'<>\]|]+/g;

  // A Set iterates in insertion order, so it handles both de-duplication and
  // ordering of the result.
  const unique = new Set();

  for (let hit = urlPattern.exec(text); hit !== null; hit = urlPattern.exec(text)) {
    const canonical = normalizeUrl(hit[0]);
    if (canonical) unique.add(canonical);
  }

  return [...unique];
}