Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps weekly to detect when source documentation changes. Replaces the broken mtime-based staleness check (all files had identical mtime after release). Components: - build-registry.mjs: extracts 1342 URLs from 387 reference files - poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry - report-changes.mjs: prioritized change report (critical/high/medium/low) - discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered - run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run Integration: - session-start hook reads change-report.json instead of broken mtime check - hook triggers background poll if >7 days since last check - generate-skills --update reads change report for targeted MCP updates Current stats: 69% match rate (924/1342 URLs tracked via sitemaps). ~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
124 lines · 4.4 KiB · JavaScript
// sitemap-stream.mjs — Streaming XML parser for Microsoft Learn sitemaps.
|
|
// Zero dependencies. Handles 47MB+ XML without loading into memory.
|
|
// Yields { loc, lastmod } per <url> entry.
|
|
|
|
import { get as httpsGet } from 'node:https';
|
|
import { createGunzip } from 'node:zlib';
|
|
|
|
// Maximum fetch attempts per URL before giving up.
const MAX_RETRIES = 3;

// Base backoff between retries; multiplied by the attempt number (linear backoff).
const RETRY_DELAY_MS = 2000;
|
|
|
|
/**
 * Stream a sitemap XML file and yield { loc, lastmod } for each <url> entry.
 * Works with both the sitemap index (<sitemapindex>) and child sitemaps (<urlset>).
 * @param {string} url — full HTTPS URL to sitemap XML
 * @yields {{ loc: string, lastmod: string|null }}
 */
export async function* streamSitemap(url) {
  // fetchWithRetry resolves to the full decoded XML text of the sitemap.
  const xml = await fetchWithRetry(url);
  for (const entry of parseSitemapEntries(xml)) {
    yield entry;
  }
}
|
|
|
|
/**
 * Fetch a sitemap fully into a string (most child sitemaps are 24-47MB).
 * For the index (612KB) this is trivial. For children, we buffer to allow
 * the generator to yield entries without backpressure issues.
 *
 * Retries non-200 responses and socket errors up to MAX_RETRIES times with
 * linear backoff (RETRY_DELAY_MS * attempt). Follows HTTP 3xx redirects, but
 * only up to `redirectsLeft` hops so a redirect loop cannot recurse forever.
 *
 * @param {string} url - full HTTPS URL to fetch
 * @param {number} [attempt] - current attempt number (1-based)
 * @param {number} [redirectsLeft] - remaining redirect hops allowed
 * @returns {Promise<string>} decoded (gunzipped if needed) XML text
 */
function fetchWithRetry(url, attempt = 1, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    // Single retry path shared by HTTP-status failures, body-stream errors,
    // and request/socket errors (the original duplicated this three times).
    const retryOrReject = (err) => {
      if (attempt < MAX_RETRIES) {
        setTimeout(
          () => fetchWithRetry(url, attempt + 1, redirectsLeft).then(resolve, reject),
          RETRY_DELAY_MS * attempt,
        );
      } else {
        reject(err);
      }
    };

    httpsGet(url, (res) => {
      if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        res.resume(); // drain the redirect body so the socket is released
        if (redirectsLeft <= 0) {
          return reject(new Error(`Too many redirects for ${url}`));
        }
        return fetchWithRetry(res.headers.location, attempt, redirectsLeft - 1)
          .then(resolve, reject);
      }

      if (res.statusCode !== 200) {
        res.resume(); // drain so the connection can be reused
        return retryOrReject(new Error(`HTTP ${res.statusCode} for ${url}`));
      }

      // Transparently gunzip when the server compressed the response.
      const stream = res.headers['content-encoding'] === 'gzip'
        ? res.pipe(createGunzip())
        : res;

      // Accumulate raw Buffers and decode ONCE at the end: decoding each chunk
      // separately (the original behavior) corrupts multi-byte UTF-8 sequences
      // that happen to straddle a chunk boundary.
      const parts = [];
      stream.on('data', (chunk) => parts.push(chunk));
      stream.on('end', () => resolve(Buffer.concat(parts).toString('utf8')));
      stream.on('error', retryOrReject);
    }).on('error', retryOrReject);
  });
}
|
|
|
|
/**
 * Parse sitemap XML text and yield entries.
 * Handles both <sitemapindex> (yields loc from <sitemap> blocks)
 * and <urlset> (yields loc+lastmod from <url> blocks).
 * Uses simple regex extraction — reliable for well-formed sitemap XML.
 * @param {string} xml
 * @yields {{ loc: string, lastmod: string|null }}
 */
function* parseSitemapEntries(xml) {
  if (xml.includes('<sitemapindex')) {
    // Index document: each <sitemap> block carries the child sitemap URL
    // and, optionally, its last-modified date.
    const indexPattern = /<sitemap>\s*<loc>([^<]+)<\/loc>(?:\s*<lastmod>([^<]+)<\/lastmod>)?/g;
    for (const [, loc, lastmod] of xml.matchAll(indexPattern)) {
      yield { loc: loc.trim(), lastmod: lastmod?.trim() || null };
    }
    return;
  }

  // Child sitemap (<urlset>). Entry structure:
  //   <url><loc>...</loc><lastmod>...</lastmod><xhtml:link .../>...</url>
  // Two-pass: isolate each <url>...</url> block, then pull the <loc>
  // (required) and <lastmod> (optional) fields out of it.
  for (const [, block] of xml.matchAll(/<url>([\s\S]*?)<\/url>/g)) {
    const locMatch = /<loc>([^<]+)<\/loc>/.exec(block);
    if (!locMatch) continue;

    const lastmodMatch = /<lastmod>([^<]+)<\/lastmod>/.exec(block);
    yield {
      loc: locMatch[1].trim(),
      lastmod: lastmodMatch ? lastmodMatch[1].trim() : null,
    };
  }
}
|
|
|
|
/**
 * Fetch the sitemap index and return its parsed entries.
 * Convenience wrapper for the common "fetch index, decide which children to poll" pattern.
 * @param {string} [indexUrl]
 * @returns {Promise<Array<{ loc: string, lastmod: string|null }>>}
 */
export async function fetchSitemapIndex(indexUrl = 'https://learn.microsoft.com/_sitemaps/sitemapindex.xml') {
  // Drain the async generator into a plain array for callers that want
  // everything up front rather than streaming.
  const collected = [];
  for await (const item of streamSitemap(indexUrl)) {
    collected.push(item);
  }
  return collected;
}
|