Adds a zero-dependency Node.js pipeline that polls Microsoft Learn sitemaps weekly to detect when source documentation changes. Replaces the broken mtime-based staleness check (all files had identical mtime after release). Components: - build-registry.mjs: extracts 1342 URLs from 387 reference files - poll-sitemaps.mjs: streams ~18 child sitemaps, matches against registry - report-changes.mjs: prioritized change report (critical/high/medium/low) - discover-new-urls.mjs: finds relevant new MS Learn pages not yet covered - run-weekly-update.mjs: orchestrator with --force/--discover/--dry-run Integration: - session-start hook reads change-report.json instead of broken mtime check - hook triggers background poll if >7 days since last check - generate-skills --update reads change report for targeted MCP updates Current stats: 69% match rate (924/1342 URLs tracked via sitemaps). ~31% unmatched due to Microsoft URL restructuring (ai-foundry/openai paths). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
69 lines
2.2 KiB
JavaScript
69 lines
2.2 KiB
JavaScript
// url-normalize.mjs — Consistent URL normalization for sitemap ↔ reference file matching.
|
|
// Zero dependencies. Idempotent: normalizeUrl(normalizeUrl(x)) === normalizeUrl(x).
|
|
|
|
/**
 * Normalize a learn.microsoft.com URL to a canonical form.
 *
 * Steps, applied in order:
 *   1. Strip trailing punctuation leaked from markdown
 *   2. Strip fragment (#anchor)
 *   3. Strip ?view= and other query params
 *   4. Lowercase
 *   5. Remove /en-us/ locale prefix (store locale-free)
 *   6. Strip trailing slashes
 *
 * Lowercasing happens before the locale step on purpose: the locale match is a
 * plain case-sensitive string replace, so a mixed-case prefix such as /en-US/
 * would otherwise survive, be lowercased afterwards, and only be removed on a
 * second call.
 *
 * @param {string} raw
 * @returns {string|null} normalized URL, or null if `raw` is empty, not a
 *   string, or does not contain "learn.microsoft.com"
 */
export function normalizeUrl(raw) {
  if (!raw || typeof raw !== 'string') return null;
  if (!raw.includes('learn.microsoft.com')) return null;

  let url = raw;

  // 1. Strip trailing punctuation that leaked from markdown context
  url = url.replace(/[.,;:!?'")}\]]+$/, '');

  // 2. Strip fragment
  const hashIdx = url.indexOf('#');
  if (hashIdx !== -1) url = url.slice(0, hashIdx);

  // 3. Strip query parameters (?view=, ?tabs=, etc.)
  const qIdx = url.indexOf('?');
  if (qIdx !== -1) url = url.slice(0, qIdx);

  // 4. Lowercase — must precede the locale step so /en-US/, /EN-us/, ... match
  url = url.toLowerCase();

  // 5. Remove /en-us/ locale prefix — store locale-free for consistent matching
  url = url.replace('://learn.microsoft.com/en-us/', '://learn.microsoft.com/');

  // 6. Strip trailing slash(es) for consistency
  url = url.replace(/\/+$/, '');

  return url;
}
|
|
|
|
/**
 * Collect every learn.microsoft.com URL that appears in a piece of text.
 *
 * Covers the 5 citation formats used in reference files:
 *   - Markdown links: [text](https://learn.microsoft.com/...)
 *   - Bare URLs on their own line
 *   - URL: prefix format
 *   - Dash-bullet format
 *   - Table cell format
 *
 * Each match is passed through normalizeUrl; duplicates (after normalization)
 * are dropped and first-seen order is kept.
 *
 * @param {string} text
 * @returns {string[]} array of normalized unique URLs
 */
export function extractUrls(text) {
  if (!text) return [];

  // A URL ends at whitespace or at a delimiter from the surrounding markup.
  const urlPattern = /https:\/\/learn\.microsoft\.com[^\s)"'<>\]|]+/g;

  // A Set iterates in insertion order, so it handles both dedup and ordering.
  const unique = new Set();

  // String() mirrors the string coercion RegExp#exec applies to its argument.
  for (const [candidate] of String(text).matchAll(urlPattern)) {
    const canonical = normalizeUrl(candidate);
    if (canonical) unique.add(canonical);
  }

  return [...unique];
}
|