import TurndownService from 'turndown';

/**
 * HTML to Markdown converter using Turndown.
 *
 * Exposes two helpers:
 * - `htmlToMarkdown` converts an HTML string to Markdown.
 * - `extractMainContent` picks the most likely "main content" element out of
 *   a full HTML page using a fixed list of simplified CSS selectors.
 *
 * All HTML inspection outside Turndown is regex/string based, not a real
 * parser, so it is best-effort on malformed or unusual markup.
 */

const turndownService = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
  bulletListMarker: '-',
  emDelimiter: '*',
});

/** Tags whose whole subtree is dropped because it is page chrome, not content. */
const NON_CONTENT_TAGS = new Set(['nav', 'header', 'footer', 'aside']);

/** Minimal structural view of a DOM node that may expose `getAttribute`. */
type AttributeReader = { getAttribute?: (name: string) => string | null };

/**
 * Length of the longest run of consecutive backticks in `text` (0 if none).
 * Used to pick a code delimiter that cannot be terminated by the content.
 */
function longestBacktickRun(text: string): number {
  const runs = text.match(/`+/g) ?? [];
  return runs.reduce((longest, run) => Math.max(longest, run.length), 0);
}

/**
 * Read the `class` attribute of a node via duck typing.
 * Returns '' when the node is null or has no `getAttribute` method.
 */
function readClassAttribute(node: unknown): string {
  const getAttribute = (node as AttributeReader | null | undefined)?.getAttribute;
  if (typeof getAttribute !== 'function') {
    return '';
  }
  return getAttribute.call(node, 'class') ?? '';
}

// Preserve code blocks (<pre><code>) together with their language
turndownService.addRule('fencedCodeBlock', {
  filter: (node) =>
    node.nodeName === 'PRE' &&
    node.firstChild !== null &&
    node.firstChild.nodeName === 'CODE',
  replacement: (_content, node) => {
    const codeNode = node.firstChild;
    // Drop one trailing newline so no blank line appears before the closing fence.
    const code = (codeNode?.textContent ?? '').replace(/\n$/, '');

    // Language comes from a `language-xxx` class on the <code> element, if any.
    const language = /language-(\w+)/.exec(readClassAttribute(codeNode))?.[1] ?? '';

    // The fence must be longer than any backtick run inside the code, otherwise
    // the code itself would close the block early.
    const fence = '`'.repeat(Math.max(3, longestBacktickRun(code) + 1));

    return `\n${fence}${language}\n${code}\n${fence}\n`;
  },
});

// Handle inline code (<code> that is not the direct child of a <pre>)
turndownService.addRule('inlineCode', {
  filter: (node) => node.nodeName === 'CODE' && node.parentNode?.nodeName !== 'PRE',
  replacement: (content) => {
    // Delimiter is one backtick longer than the longest run in the content.
    const delimiter = '`'.repeat(longestBacktickRun(content) + 1);
    // A leading/trailing backtick would merge with the delimiter; pad with a space.
    const padding = /^`|`$/.test(content) ? ' ' : '';
    return `${delimiter}${padding}${content}${padding}${delimiter}`;
  },
});

// Remove navigation elements
turndownService.addRule('removeNav', {
  filter: (node) => NON_CONTENT_TAGS.has(node.nodeName.toLowerCase()),
  replacement: () => '',
});

// Remove scripts and styles
turndownService.addRule('removeScripts', {
  filter: ['script', 'style', 'noscript'],
  replacement: () => '',
});

/**
 * Convert HTML to Markdown.
 *
 * The HTML is first stripped of comments, cookie/banner/popup/modal/overlay
 * `<div>`s and `aria-hidden="true"` elements, then converted with Turndown,
 * then whitespace-normalized.
 *
 * @param html - HTML source to convert.
 * @returns Markdown text ending in exactly one newline.
 */
export function htmlToMarkdown(html: string): string {
  // Pre-process: remove common non-content elements.
  // NOTE: these patterns are non-greedy and not nesting-aware: the <div> rule
  // stops at the first `</div>` and the aria-hidden rule stops at the first
  // closing tag of any element.
  const cleaned = html
    // Remove HTML comments
    .replace(/<!--[\s\S]*?-->/g, '')
    // Remove common cookie/banner divs
    .replace(
      /<div[^>]*class="[^"]*(?:cookie|banner|popup|modal|overlay)[^"]*"[^>]*>[\s\S]*?<\/div>/gi,
      '',
    )
    // Remove aria-hidden elements
    .replace(/<[^>]+aria-hidden="true"[^>]*>[\s\S]*?<\/[^>]+>/gi, '');

  const markdown = turndownService.turndown(cleaned);

  // Post-process: clean up markdown
  return (
    markdown
      // Remove trailing whitespace first, so whitespace-only lines become empty
      // and are included when newline runs are collapsed below
      .replace(/[ \t]+$/gm, '')
      // Remove excessive newlines
      .replace(/\n{4,}/g, '\n\n\n')
      // Ensure file ends with single newline
      .trim() + '\n'
  );
}

/**
 * Extract main content from HTML using common selectors.
 *
 * Selectors are tried in order; the first one that resolves to a complete
 * element (opening tag through its matching closing tag) wins.
 *
 * @param html - Full HTML document or fragment.
 * @returns The outer HTML of the first matching element; otherwise the inner
 *   HTML of `<body>`; otherwise `html` unchanged.
 */
export function extractMainContent(html: string): string {
  // Common documentation content selectors, most specific first
  const contentSelectors = [
    'main article',
    'main .content',
    '[role="main"]',
    '.documentation',
    '.docs-content',
    '.markdown-body',
    '.prose',
    'article.content',
    'article',
    'main',
    '#content',
    '.content',
  ];

  for (const selector of contentSelectors) {
    const element = findElement(html, selector);
    if (element !== null) {
      return element;
    }
  }

  // Fall back to body content
  const bodyMatch = /<body[^>]*>([\s\S]*)<\/body>/i.exec(html);
  return bodyMatch?.[1] ?? html;
}

/**
 * Resolve a (possibly space-separated) selector against `html`.
 *
 * Each whitespace-separated part narrows the search to the first element
 * matching that part inside the previous match. Only the first candidate of
 * each part is tried; there is no backtracking.
 *
 * @returns Outer HTML of the matched element, or null if any part fails.
 */
function findElement(html: string, selector: string): string | null {
  let scope = html;
  for (const part of selector.trim().split(/\s+/)) {
    const pattern = selectorToPattern(part);
    if (pattern === null) {
      return null;
    }
    const element = extractFirstElement(scope, pattern);
    if (element === null) {
      return null;
    }
    scope = element;
  }
  return scope;
}

/**
 * Return the first element in `html` whose opening tag matches `openingTag`,
 * spanning from that tag to its balanced closing tag.
 *
 * Nested elements with the same tag name are depth-counted, so the result is
 * not cut off at an inner closing tag.
 *
 * @param openingTag - Non-global pattern whose capture group 1 is the tag name.
 * @returns Outer HTML of the element, or null if there is no opening tag or
 *   no balanced closing tag.
 */
function extractFirstElement(html: string, openingTag: RegExp): string | null {
  const open = openingTag.exec(html);
  const tagName = open?.[1];
  if (open === null || tagName === undefined) {
    return null;
  }

  // Matches both opening and closing tags of the same name; group 1 is '/'
  // for a closing tag.
  const sameTag = new RegExp(`<(/?)${escapeRegExp(tagName)}(?=[\\s/>])[^>]*>`, 'gi');
  sameTag.lastIndex = open.index + open[0].length;

  let depth = 1;
  for (let tag = sameTag.exec(html); tag !== null; tag = sameTag.exec(html)) {
    depth += tag[1] === '/' ? -1 : 1;
    if (depth === 0) {
      return html.slice(open.index, tag.index + tag[0].length);
    }
  }
  return null;
}

/** Escape characters that have special meaning inside a regular expression. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Build a pattern for an opening tag.
 *
 * @param tag - Required tag name, or null for any tag.
 * @param attribute - Optional regex source for an attribute that must be
 *   present (must be preceded by whitespace in the tag).
 * @returns Case-insensitive pattern; capture group 1 is the tag name.
 */
function openingTagPattern(tag: string | null, attribute = ''): RegExp {
  const tagPart = tag === null ? '[a-z][a-z0-9-]*' : escapeRegExp(tag);
  const attributePart = attribute === '' ? '' : `[^>]*?\\s${attribute}`;
  return new RegExp(`<(${tagPart})(?=[\\s/>])${attributePart}[^>]*>`, 'i');
}

/**
 * Regex source for a double-quoted `class` attribute containing the given
 * class names in order, separated by whitespace. Names are matched as
 * substrings of the attribute value, not as whole tokens.
 */
function classAttributePattern(classNames: string[]): string {
  const names = classNames.map(escapeRegExp).join('\\s+');
  return `class="[^"]*${names}[^"]*"`;
}

/**
 * Convert a single compound CSS selector to an opening-tag pattern (simplified).
 *
 * Supported forms: `.class`, `.a.b`, `#id`, `[attr="value"]`,
 * `tag[attr="value"]`, `tag`, `tag.class`. Only double-quoted attribute
 * values in the HTML are recognized.
 *
 * @returns Pattern whose capture group 1 is the matched tag name, or null
 *   when the selector form is not supported.
 */
function selectorToPattern(selector: string): RegExp | null {
  // Handle class selectors
  if (selector.startsWith('.')) {
    const classNames = selector.slice(1).split('.');
    if (classNames.some((name) => name === '')) {
      return null;
    }
    return openingTagPattern(null, classAttributePattern(classNames));
  }

  // Handle ID selectors
  if (selector.startsWith('#')) {
    const id = selector.slice(1);
    return id === '' ? null : openingTagPattern(null, `id="${escapeRegExp(id)}"`);
  }

  // Handle attribute selectors, with an optional tag prefix
  const attrMatch = /^([a-z][\w-]*)?\[([^=\]]+)="([^"]+)"\]$/i.exec(selector);
  if (attrMatch) {
    const [, tag, name, value] = attrMatch;
    if (name === undefined || value === undefined) {
      return null;
    }
    return openingTagPattern(tag ?? null, `${escapeRegExp(name)}="${escapeRegExp(value)}"`);
  }

  // Handle tag selectors, with optional `.class` suffixes
  const tagMatch = /^([a-z][\w-]*)((?:\.[\w-]+)*)$/i.exec(selector);
  if (tagMatch) {
    const [, tag, classSuffix] = tagMatch;
    if (tag === undefined) {
      return null;
    }
    if (!classSuffix) {
      return openingTagPattern(tag);
    }
    return openingTagPattern(tag, classAttributePattern(classSuffix.slice(1).split('.')));
  }

  return null;
}