ktg-plugin-marketplace/plugins/ms-ai-architect/scripts/kb-update/lib/sitemap-stream.mjs

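// sitemap-stream.mjs — Generator-based XML parser for Microsoft Learn sitemaps.
// Zero dependencies. Each response body is buffered fully in memory (child
// sitemaps can be 47MB+) and entries are then yielded lazily from that buffer.
// Yields { loc, lastmod } per <url> entry (or per <sitemap> entry for an index).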

import { get as httpsGet } from 'node:https';
import { createGunzip } from 'node:zlib';

// Total number of attempts fetchWithRetry makes for one URL before rejecting
// (attempts are numbered from 1 and retried while attempt < MAX_RETRIES).
const MAX_RETRIES = 3;
// Base back-off in milliseconds; the wait before the next attempt is this
// value multiplied by the number of the attempt that just failed.
const RETRY_DELAY_MS = 2000;

/**
 * Download one sitemap document and yield its entries one at a time.
 *
 * The body is fetched completely (via fetchWithRetry) before the first entry
 * is produced; only the parsing step is lazy. Accepts either a sitemap index
 * (<sitemapindex>) or a child sitemap (<urlset>).
 *
 * @param {string} url — full HTTPS URL to sitemap XML
 * @yields {{ loc: string, lastmod: string|null }}
 */
export async function* streamSitemap(url) {
  const xml = await fetchWithRetry(url);
  for (const entry of parseSitemapEntries(xml)) {
    yield entry;
  }
}

/**
 * Fetch a sitemap and resolve with its complete body decoded as UTF-8 text.
 *
 * The whole response is buffered (child sitemaps are typically 24-47MB, the
 * index is ~612KB) so the caller can parse it synchronously afterwards.
 *
 * Behaviour:
 * - 3xx responses with a Location header are followed. The Location value is
 *   resolved against the current URL, so relative redirects work. At most
 *   `redirectsLeft` hops are followed.
 * - Any other non-200 status, a request error, or a body/decompression error
 *   is retried after RETRY_DELAY_MS * attempt, until MAX_RETRIES attempts have
 *   been made; then the promise rejects with the last error.
 * - gzip-encoded bodies are decompressed transparently.
 *
 * @param {string} url
 * @param {number} [attempt] — 1-based attempt counter (internal, used by retries)
 * @param {number} [redirectsLeft] — remaining redirect hops (internal)
 * @returns {Promise<string>}
 */
function fetchWithRetry(url, attempt = 1, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    // Several error sources (request, response, gunzip) can fire for the same
    // failure; this flag guarantees exactly one retry or one settlement.
    let settled = false;

    const retryOrReject = (err) => {
      if (settled) return;
      settled = true;
      if (attempt < MAX_RETRIES) {
        setTimeout(() => {
          fetchWithRetry(url, attempt + 1, redirectsLeft).then(resolve, reject);
        }, RETRY_DELAY_MS * attempt);
      } else {
        reject(err);
      }
    };

    httpsGet(url, (res) => {
      const { statusCode, headers } = res;

      if (statusCode >= 300 && statusCode < 400 && headers.location) {
        // Drain the redirect body so the socket is released.
        res.resume();
        settled = true;
        if (redirectsLeft <= 0) {
          reject(new Error(`Too many redirects for ${url}`));
          return;
        }
        let target;
        try {
          // Location may be relative; resolve it against the request URL.
          target = new URL(headers.location, url).href;
        } catch (err) {
          reject(err);
          return;
        }
        fetchWithRetry(target, attempt, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (statusCode !== 200) {
        res.resume();
        retryOrReject(new Error(`HTTP ${statusCode} for ${url}`));
        return;
      }

      const isGzip = headers['content-encoding'] === 'gzip';
      const body = isGzip ? res.pipe(createGunzip()) : res;
      if (isGzip) {
        // pipe() does not forward source errors to the destination, so a
        // connection failure on `res` must be observed separately.
        res.on('error', retryOrReject);
      }

      // Collect raw Buffers and decode once at the end: decoding each chunk
      // separately corrupts multi-byte UTF-8 sequences that straddle a chunk
      // boundary.
      const parts = [];
      body.on('data', (chunk) => parts.push(chunk));
      body.on('end', () => {
        if (settled) return;
        settled = true;
        resolve(Buffer.concat(parts).toString('utf8'));
      });
      body.on('error', (err) => {
        // Stop the download; the data is unusable once decoding has failed.
        res.destroy();
        retryOrReject(err);
      });
    }).on('error', retryOrReject);
  });
}

/**
 * Parse sitemap XML text and lazily yield one entry per <url> or <sitemap> block.
 *
 * If the document contains `<sitemapindex`, <sitemap> blocks are parsed;
 * otherwise <url> blocks are parsed. Both cases use the same two-pass
 * approach: find each block, then pull <loc> and <lastmod> out of it, so the
 * order of the child elements inside a block does not matter.
 *
 * The five predefined XML entities (&amp; &lt; &gt; &quot; &apos;) in <loc>
 * are decoded, since the sitemap protocol requires URLs to be entity-escaped.
 * Blocks without a non-empty <loc> are skipped. A missing or blank <lastmod>
 * is reported as null.
 *
 * Regex based: intended for well-formed sitemap XML. CDATA sections and
 * attributes on <url>/<sitemap>/<loc> are not supported.
 *
 * @param {string} xml
 * @yields {{ loc: string, lastmod: string|null }}
 */
function* parseSitemapEntries(xml) {
  const XML_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  // Single pass, so "&amp;lt;" correctly becomes "&lt;" and is not decoded twice.
  const decodeEntities = (text) =>
    text.replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => XML_ENTITIES[name]);

  const tag = xml.includes('<sitemapindex') ? 'sitemap' : 'url';
  // `<url>` / `<sitemap>` are matched exactly, so `<urlset ...>` and
  // `<sitemapindex ...>` never start a block.
  const blockRegex = new RegExp(`<${tag}>([\\s\\S]*?)</${tag}>`, 'g');
  const locRegex = /<loc>([^<]+)<\/loc>/;
  const lastmodRegex = /<lastmod>([^<]+)<\/lastmod>/;

  let match;
  while ((match = blockRegex.exec(xml)) !== null) {
    const block = match[1];

    const locMatch = locRegex.exec(block);
    if (!locMatch) continue;
    const loc = decodeEntities(locMatch[1]).trim();
    if (loc === '') continue;

    const lastmodMatch = lastmodRegex.exec(block);
    const lastmod = lastmodMatch ? lastmodMatch[1].trim() : '';

    yield { loc, lastmod: lastmod === '' ? null : lastmod };
  }
}

/**
 * Download the sitemap index and collect every entry into an array.
 * Handy for callers that want the full list up front, e.g. to decide which
 * child sitemaps are worth polling.
 * @param {string} [indexUrl] — defaults to the Microsoft Learn sitemap index
 * @returns {Promise<Array<{ loc: string, lastmod: string|null }>>}
 */
export async function fetchSitemapIndex(indexUrl = 'https://learn.microsoft.com/_sitemaps/sitemapindex.xml') {
  const iterator = streamSitemap(indexUrl);
  const collected = [];
  let step = await iterator.next();
  while (!step.done) {
    collected.push(step.value);
    step = await iterator.next();
  }
  return collected;
}