// sitemap-stream.mjs — Sitemap XML fetcher/parser for Microsoft Learn sitemaps.
// Zero dependencies. Each sitemap is downloaded fully into memory (child
// sitemaps can be 47MB+) and then parsed lazily with regexes.
// Yields { loc, lastmod } per entry.

import { get as httpsGet } from 'node:https';
import { createGunzip } from 'node:zlib';

const MAX_RETRIES = 3;
const RETRY_DELAY_MS = 2000;
// Upper bound on chained 3xx responses so a redirect loop cannot recurse forever.
const MAX_REDIRECTS = 5;

/**
 * Fetch a sitemap XML file and yield { loc, lastmod } for each entry.
 * Works with both the sitemap index (<sitemapindex>) and child sitemaps (<urlset>).
 * The download completes before the first entry is yielded; only the parsing is lazy.
 * @param {string} url — full HTTPS URL to sitemap XML
 * @yields {{ loc: string, lastmod: string|null }}
 */
export async function* streamSitemap(url) {
  const xml = await fetchWithRetry(url);
  yield* parseSitemapEntries(xml);
}

/**
 * Fetch a sitemap fully into a string (most child sitemaps are 24-47MB).
 * For the index (612KB) this is trivial. For children, we buffer to allow
 * the generator to yield entries without backpressure issues.
 *
 * Redirects (3xx with a Location header) are followed without consuming a
 * retry attempt. Any other non-200 status, a body-stream error, or a request
 * error is retried up to MAX_RETRIES attempts with a linearly growing delay
 * (RETRY_DELAY_MS * attempt).
 *
 * @param {string} url
 * @param {number} [attempt] — 1-based attempt counter
 * @param {number} [redirects] — number of redirects already followed
 * @returns {Promise<string>} decoded (UTF-8) response body
 */
function fetchWithRetry(url, attempt = 1, redirects = 0) {
  return new Promise((resolve, reject) => {
    // A single failure can surface on both the request and the response
    // stream; this guard ensures we retry/settle only once per attempt.
    let settled = false;

    const retryOrFail = (err) => {
      if (settled) return;
      settled = true;
      if (attempt < MAX_RETRIES) {
        setTimeout(
          () => fetchWithRetry(url, attempt + 1, redirects).then(resolve, reject),
          RETRY_DELAY_MS * attempt,
        );
      } else {
        reject(err);
      }
    };

    const request = httpsGet(url, (res) => {
      const { statusCode, headers } = res;

      if (statusCode >= 300 && statusCode < 400 && headers.location) {
        // Drain the redirect response so its socket is released.
        res.resume();
        settled = true;
        if (redirects >= MAX_REDIRECTS) {
          reject(new Error(`Too many redirects for ${url}`));
          return;
        }
        let target;
        try {
          // Location may be relative; resolve it against the current URL.
          target = new URL(headers.location, url).href;
        } catch (err) {
          reject(new Error(`Invalid redirect location for ${url}`, { cause: err }));
          return;
        }
        fetchWithRetry(target, attempt, redirects + 1).then(resolve, reject);
        return;
      }

      if (statusCode !== 200) {
        res.resume();
        retryOrFail(new Error(`HTTP ${statusCode} for ${url}`));
        return;
      }

      const isGzip = headers['content-encoding'] === 'gzip';
      const body = isGzip ? res.pipe(createGunzip()) : res;

      // Collect raw Buffers and decode once at the end: decoding each chunk
      // separately corrupts multi-byte UTF-8 characters split across chunks.
      const parts = [];
      body.on('data', (chunk) => parts.push(chunk));
      body.on('end', () => {
        if (settled) return;
        settled = true;
        resolve(Buffer.concat(parts).toString('utf8'));
      });
      body.on('error', retryOrFail);
      if (isGzip) {
        // pipe() does not forward source errors to the destination, so watch
        // the raw response too or a mid-transfer failure would never settle.
        res.on('error', (err) => {
          body.destroy();
          retryOrFail(err);
        });
      }
    });

    request.on('error', retryOrFail);
  });
}

/**
 * Parse sitemap XML text and yield entries.
 * Handles both <sitemapindex> (yields loc from <sitemap> blocks)
 * and <urlset> (yields loc+lastmod from <url> blocks).
 * Uses simple regex extraction — reliable for well-formed sitemap XML.
 * Values are trimmed but otherwise returned as they appear in the XML
 * (XML entities such as &amp; are not decoded).
 * @param {string} xml
 * @yields {{ loc: string, lastmod: string|null }}
 */
function* parseSitemapEntries(xml) {
  // Detect if this is a sitemap index or a urlset
  const isSitemapIndex = xml.includes('<sitemapindex');

  if (isSitemapIndex) {
    // Parse <sitemap> blocks: <sitemap><loc>...</loc><lastmod>...</lastmod></sitemap>
    const sitemapRegex = /<sitemap>\s*<loc>([^<]+)<\/loc>(?:\s*<lastmod>([^<]+)<\/lastmod>)?/g;
    for (const match of xml.matchAll(sitemapRegex)) {
      yield { loc: match[1].trim(), lastmod: match[2]?.trim() || null };
    }
    return;
  }

  // Parse <url> blocks — extract <loc> and <lastmod>.
  // The XML structure per entry is:
  //   <url><loc>...</loc><lastmod>...</lastmod>...</url>
  // We use a two-pass approach: find each <url>...</url> block, then extract fields.
  const urlBlockRegex = /<url>([\s\S]*?)<\/url>/g;
  const locRegex = /<loc>([^<]+)<\/loc>/;
  const lastmodRegex = /<lastmod>([^<]+)<\/lastmod>/;

  for (const [, block] of xml.matchAll(urlBlockRegex)) {
    const locMatch = locRegex.exec(block);
    // A <url> block without a <loc> carries nothing usable; skip it.
    if (!locMatch) continue;
    const lastmodMatch = lastmodRegex.exec(block);
    yield {
      loc: locMatch[1].trim(),
      lastmod: lastmodMatch ? lastmodMatch[1].trim() : null,
    };
  }
}

/**
 * Fetch the sitemap index and return parsed entries.
 * Convenience wrapper for the common "fetch index, decide which children to poll" pattern.
 * @param {string} [indexUrl]
 * @returns {Promise<Array<{ loc: string, lastmod: string|null }>>}
 */
export async function fetchSitemapIndex(indexUrl = 'https://learn.microsoft.com/_sitemaps/sitemapindex.xml') {
  const entries = [];
  for await (const entry of streamSitemap(indexUrl)) {
    entries.push(entry);
  }
  return entries;
}