/**
 * @bfra.me/doc-sync/utils/safe-patterns - Safe regex patterns and utilities for MDX/HTML parsing
 * All patterns are designed to prevent ReDoS attacks
 */

import remarkParse from 'remark-parse'
import {unified} from 'unified'

/**
 * Create safe heading pattern for specific level
 * Uses explicit character class instead of greedy `.+` to prevent ReDoS
 *
 * The returned pattern carries the `g` and `m` flags, matches the hashes followed by
 * exactly one space, and captures the heading text in group 1. A fresh RegExp is
 * created on every call, so `lastIndex` state is never shared between callers.
 *
 * @param level - Heading level, an integer from 1 to 6
 * @returns Safe regex pattern for the heading level
 * @throws Error when `level` is not an integer between 1 and 6 (including `NaN`)
 *
 * @example
 * ```ts
 * const h2Pattern = createHeadingPattern(2)
 * const matches = content.match(h2Pattern)
 * ```
 */
export function createHeadingPattern(level: number): RegExp {
  // Number.isInteger also rejects NaN, which would otherwise slip past both range
  // comparisons and produce a pattern with no hashes at all.
  if (!Number.isInteger(level) || level < 1 || level > 6) {
    throw new Error('Heading level must be between 1 and 6')
  }
  const hashes = '#'.repeat(level)
  // Use [^\r\n]+ instead of .+ to prevent ReDoS
  return new RegExp(`^${hashes} ([^\r\n]+)$`, 'gm')
}

/**
 * Check if content contains a specific JSX component
 * Uses a safe pattern that avoids catastrophic backtracking
 *
 * Only opening and self-closing tags are considered. The component name must be
 * followed by whitespace, `>` or `/>`, so searching for `Card` does not match
 * `<CardGrid>`.
 *
 * @param content - The MDX/HTML content to search
 * @param componentName - Name of the component to find (e.g., 'Card', 'Badge')
 * @returns True if the component is found
 *
 * @example
 * ```ts
 * const hasCard = hasComponent(content, 'Card')
 * const hasCardGrid = hasComponent(content, 'CardGrid')
 * ```
 */
export function hasComponent(content: string, componentName: string): boolean {
  // Escape special regex characters in component name
  const escaped = componentName.replaceAll(/[.*+?^${}()|[\]\\]/g, String.raw`\$&`)
  // Match opening tag or self-closing tag; attributes, when present, must be
  // separated from the name by whitespace
  const pattern = new RegExp(String.raw`<${escaped}(?:\s[^>]*)?(?:>|\/>)`)
  return pattern.test(content)
}

/**
 * Extract code blocks and inline code from markdown content using unified/remark (safe, no regex)
 * This approach uses AST parsing instead of regex to avoid ReDoS vulnerabilities
 *
 * Fenced blocks are re-serialized as a triple-backtick fence with the language tag (if any),
 * and inline code is wrapped in single backticks. Entries appear in the order the nodes are
 * visited (depth-first through `children`).
 *
 * @param content - The markdown content to parse
 * @returns Array of code block strings (fenced blocks with backticks, inline code with backticks);
 *   an empty array if parsing throws
 *
 * @example
 * ```ts
 * const blocks = extractCodeBlocks(content)
 * for (const block of blocks) {
 *   console.log(block)
 * }
 * ```
 */
export function extractCodeBlocks(content: string): readonly string[] {
  const processor = unified().use(remarkParse)

  let tree
  try {
    tree = processor.parse(content)
  } catch {
    // If parsing fails, return empty array instead of throwing
    return []
  }

  const blocks: string[] = []

  // Minimal structural view of the syntax-tree nodes this function reads
  interface MarkdownNode {
    type?: string
    lang?: string | null
    value?: string
    children?: MarkdownNode[]
  }

  function visit(node: MarkdownNode): void {
    if (node.type === 'code') {
      const lang = node.lang ?? ''
      const value = node.value ?? ''
      blocks.push(`\`\`\`${lang}\n${value}\n\`\`\``)
    }

    if (node.type === 'inlineCode') {
      const value = node.value ?? ''
      blocks.push(`\`${value}\``)
    }

    if (Array.isArray(node.children)) {
      for (const child of node.children) {
        visit(child)
      }
    }
  }

  visit(tree as MarkdownNode)
  return blocks
}

/**
 * Parse JSX tags from content using a safe, non-backtracking approach.
 * Uses a state machine instead of regex to prevent ReDoS.
 *
 * Only tags whose name starts with an uppercase ASCII letter are reported. For opening
 * and self-closing tags, `tag` is the exact source text from `<` through the terminating
 * `>` (attributes included). For closing tags, `tag` is `</Name>`; a closing tag is only
 * reported when `>` immediately follows the name. Opening tags that are never terminated
 * are omitted.
 *
 * @param content - The MDX/HTML content to parse
 * @returns Array of matched JSX tags with their positions
 */
export function parseJSXTags(
  content: string,
): readonly {tag: string; index: number; isClosing: boolean; isSelfClosing: boolean}[] {
  const results: {tag: string; index: number; isClosing: boolean; isSelfClosing: boolean}[] = []
  let i = 0

  while (i < content.length) {
    if (content[i] !== '<') {
      i++
      continue
    }

    const startIndex = i
    i++

    if (i >= content.length) break

    const isClosing = content[i] === '/'
    if (isClosing) {
      i++
      if (i >= content.length) break
    }

    // JSX components must start with uppercase (React convention).
    // `i` is deliberately not advanced: the current character may itself be a `<`.
    if (!/^[A-Z]/.test(content[i] ?? '')) {
      continue
    }

    let tagName = ''
    while (i < content.length && /^[a-z0-9]/i.test(content[i] ?? '')) {
      tagName += content[i]
      i++
    }

    if (tagName.length === 0) {
      continue
    }

    if (isClosing) {
      if (i < content.length && content[i] === '>') {
        results.push({
          // Report the closing tag text so consumers can read the component name,
          // mirroring the non-empty `tag` produced for opening tags
          tag: `</${tagName}>`,
          index: startIndex,
          isClosing: true,
          isSelfClosing: false,
        })
        i++
      }
      continue
    }

    // Skip attributes while tracking nested structures
    let depth = 0
    let foundEnd = false
    let isSelfClosing = false

    while (i < content.length && !foundEnd) {
      const char = content[i]

      // Skip quoted string contents to avoid misinterpreting > inside strings
      if ((char === '"' || char === "'") && depth === 0) {
        const quote = char
        i++
        while (i < content.length && content[i] !== quote) {
          if (content[i] === '\\' && i + 1 < content.length) {
            // Step over the escaped character so an escaped quote does not end the string
            i += 2
          } else {
            i++
          }
        }
        if (i < content.length) i++
        continue
      }

      // Track brace depth for JSX expressions like {value}
      if (char === '{') {
        depth++
        i++
        continue
      }
      if (char === '}') {
        // Clamp at zero so a stray `}` cannot push the depth negative
        depth = Math.max(0, depth - 1)
        i++
        continue
      }

      // Only match tag end when not inside a JSX expression
      if (depth === 0) {
        if (char === '/' && i + 1 < content.length && content[i + 1] === '>') {
          isSelfClosing = true
          foundEnd = true
          i += 2
          continue
        }
        if (char === '>') {
          foundEnd = true
          i++
          continue
        }
      }

      i++
    }

    if (foundEnd) {
      const fullTag = content.slice(startIndex, i)
      results.push({tag: fullTag, index: startIndex, isClosing: false, isSelfClosing})
    }
  }

  return results
}

/**
 * Find empty markdown links in content using safe parsing.
 * Uses indexOf-based scanning instead of regex to prevent ReDoS.
 *
 * A link counts as empty when the `]` that balances its `[` is immediately followed by
 * `(`, and only whitespace (or nothing) appears before the closing `)`. Brackets nested
 * inside a balanced pair are not examined separately. Brackets are paired in one pass up
 * front, so inputs with many unbalanced `[` characters do not trigger repeated rescans.
 *
 * @param content - The markdown content to check
 * @returns Array of positions where empty links were found (index of each opening `[`)
 */
export function findEmptyMarkdownLinks(content: string): readonly number[] {
  // Pair every '[' with the ']' that balances it. Unmatched '[' get no entry and
  // stray ']' are ignored.
  const matchingClose = new Map<number, number>()
  const openStack: number[] = []
  for (let i = 0; i < content.length; i++) {
    const char = content[i]
    if (char === '[') {
      openStack.push(i)
    } else if (char === ']') {
      const open = openStack.pop()
      if (open !== undefined) {
        matchingClose.set(open, i)
      }
    }
  }

  const positions: number[] = []
  let pos = 0

  while (pos < content.length) {
    const openBracket = content.indexOf('[', pos)
    if (openBracket === -1) break

    const closeBracket = matchingClose.get(openBracket)
    if (closeBracket === undefined) {
      // Unbalanced bracket: resume right after it so later balanced links are still found
      pos = openBracket + 1
      continue
    }

    if (content[closeBracket + 1] === '(') {
      // Skip whitespace inside the parentheses; the link is empty only if the next
      // character is the closing ')'
      let parenPos = closeBracket + 2
      while (parenPos < content.length && /^\s/.test(content[parenPos] ?? '')) {
        parenPos++
      }

      if (content[parenPos] === ')') {
        positions.push(openBracket)
      }
    }

    pos = closeBracket + 1
  }

  return positions
}