// Regular Expression Tokenizer // https://github.com/fent/ret.js export enum types { ROOT = 0, GROUP = 1, POSITION = 2, SET = 3, RANGE = 4, REPETITION = 5, REFERENCE = 6, CHAR = 7, } export interface Token { type: types // eslint-disable-next-line @typescript-eslint/no-explicit-any [key: string]: any } const wordBoundary = (): Token => ({ type: types.POSITION, value: 'b' }) const nonWordBoundary = (): Token => ({ type: types.POSITION, value: 'B' }) const begin = (): Token => ({ type: types.POSITION, value: '^' }) const end = (): Token => ({ type: types.POSITION, value: '$' }) const ints = (): Token[] => [{ type: types.RANGE, from: 48, to: 57 }] const words = (): Token[] => [ { type: types.CHAR, value: 95 }, { type: types.RANGE, from: 97, to: 122 }, { type: types.RANGE, from: 65, to: 90 }, ...ints(), ] const whitespace = (): Token[] => [ { type: types.CHAR, value: 9 }, { type: types.CHAR, value: 10 }, { type: types.CHAR, value: 11 }, { type: types.CHAR, value: 12 }, { type: types.CHAR, value: 13 }, { type: types.CHAR, value: 32 }, { type: types.CHAR, value: 160 }, { type: types.CHAR, value: 5760 }, { type: types.CHAR, value: 6158 }, { type: types.CHAR, value: 8192 }, { type: types.CHAR, value: 8193 }, { type: types.CHAR, value: 8194 }, { type: types.CHAR, value: 8195 }, { type: types.CHAR, value: 8196 }, { type: types.CHAR, value: 8197 }, { type: types.CHAR, value: 8198 }, { type: types.CHAR, value: 8199 }, { type: types.CHAR, value: 8200 }, { type: types.CHAR, value: 8201 }, { type: types.CHAR, value: 8202 }, { type: types.CHAR, value: 8232 }, { type: types.CHAR, value: 8233 }, { type: types.CHAR, value: 8239 }, { type: types.CHAR, value: 8287 }, { type: types.CHAR, value: 12288 }, { type: types.CHAR, value: 65279 }, ] const notanychar = (): Token[] => [ { type: types.CHAR, value: 10 }, { type: types.CHAR, value: 13 }, { type: types.CHAR, value: 8232 }, { type: types.CHAR, value: 8233 }, ] const words_ = (): Token => ({ type: types.SET, set: words(), not: false }) const notWords = (): Token => ({ type: types.SET, set: words(), not: true }) const ints_ = (): Token => ({ type: types.SET, set: ints(), not: false }) const notInts = (): Token => ({ type: types.SET, set: ints(), not: true }) const whitespace_ = (): Token => ({ type: types.SET, set: whitespace(), not: false }) const notWhitespace = (): Token => ({ type: types.SET, set: whitespace(), not: true }) const anyChar = (): Token => ({ type: types.SET, set: notanychar(), not: true }) const specialChars: Record = { '0': 0, 't': 9, 'n': 10, 'v': 11, 'f': 12, 'r': 13, } const strToChars = (str: string): string => { const chars = /(\[\\b\])|(\\)?\\(?:u([A-F0-9]{4})|x([A-F0-9]{2})|(0?[0-7]{2})|c([@A-Z[\\\]^?])|([0tnvfr]))/g return str.replace(chars, (match, b, s, uh, xh, o, c, sp) => { if (s) return match const code = b ? 8 : uh ? parseInt(uh, 16) : xh ? parseInt(xh, 16) : o ? parseInt(o, 8) : c ? '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^ ?'.indexOf(c) : specialChars[sp] let ch = String.fromCharCode(code) if (/[[{}^$.|?*+()]/.test(ch)) { ch = `\\${ch}` } return ch }) } const tokenizeClass = (str: string, regexpStr: string): [Token[], number] => { const tokens: Token[] = [] const classTokens = /\\(?:(w)|(d)|(s)|(W)|(D)|(S))|((?:(?:\\)(.)|([^\]\\]))-(?:\\)?([^\]]))|(\])|(?:\\)?(.)/g let match let char while ((match = classTokens.exec(str)) !== null) { if (match[1]) { tokens.push(words_()) } else if (match[2]) { tokens.push(ints_()) } else if (match[3]) { tokens.push(whitespace_()) } else if (match[4]) { tokens.push(notWords()) } else if (match[5]) { tokens.push(notInts()) } else if (match[6]) { tokens.push(notWhitespace()) } else if (match[7]) { tokens.push({ type: types.RANGE, from: (match[8] || match[9]).charCodeAt(0), to: match[10].charCodeAt(0), }) } else if ((char = match[12])) { tokens.push({ type: types.CHAR, value: char.charCodeAt(0) }) } else { return [tokens, classTokens.lastIndex] } } throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unterminated character class`) } const error = (regexpStr: string, msg: string): never => { throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: ${msg}`) } export const ret = (regexpStr: string): Token => { let i = 0 const root: Token = { type: types.ROOT, stack: [] } let p: Token = root let stack: Token[] = root.stack const groupStack: Token[] = [] const repeatErr = (i: number) => error(regexpStr, `Nothing to repeat at column ${i - 1}`) const chars = strToChars(regexpStr) while (i < chars.length) { let c = chars[i++] switch (c) { case '\\': c = chars[i++] switch (c) { case 'b': stack.push(wordBoundary()) break case 'B': stack.push(nonWordBoundary()) break case 'w': stack.push(words_()) break case 'W': stack.push(notWords()) break case 'd': stack.push(ints_()) break case 'D': stack.push(notInts()) break case 's': stack.push(whitespace_()) break case 'S': stack.push(notWhitespace()) break default: if (/\d/.test(c)) { stack.push({ type: types.REFERENCE, value: parseInt(c, 10) }) } else { stack.push({ type: types.CHAR, value: c.charCodeAt(0) }) } } break case '^': stack.push(begin()) break case '$': stack.push(end()) break case '[': { let not = false if (chars[i] === '^') { not = true i++ } const [set, lastIndex] = tokenizeClass(chars.slice(i), regexpStr) i += lastIndex stack.push({ type: types.SET, set, not }) break } case '.': stack.push(anyChar()) break case '(': { const group: Token = { type: types.GROUP, stack: [], remember: true } if (chars[i] === '?') { c = chars[i + 1] i += 2 if (c === '=') { group.followedBy = true } else if (c === '!') { group.notFollowedBy = true } else if (c !== ':') { error(regexpStr, `Invalid group, character '${c}' after '?' at column ${i - 1}`) } group.remember = false } stack.push(group) groupStack.push(p) p = group stack = group.stack break } case ')': if (groupStack.length === 0) error(regexpStr, `Unmatched ) at column ${i - 1}`) p = groupStack.pop() as Token stack = p.options ? p.options[p.options.length - 1] : p.stack break case '|': if (!p.options) { p.options = [p.stack] delete p.stack } { const newStack: Token[] = [] p.options.push(newStack) stack = newStack } break case '{': { const repetition = /^(\d+)(,(\d+)?)?\}/.exec(chars.slice(i)) if (repetition) { if (stack.length === 0) repeatErr(i) const min = parseInt(repetition[1], 10) const max = repetition[2] ? (repetition[3] ? parseInt(repetition[3], 10) : Infinity) : min i += repetition[0].length stack.push({ type: types.REPETITION, min, max, value: stack.pop() }) } else { stack.push({ type: types.CHAR, value: 123 }) } break } case '?': if (stack.length === 0) repeatErr(i) stack.push({ type: types.REPETITION, min: 0, max: 1, value: stack.pop() }) break case '+': if (stack.length === 0) repeatErr(i) stack.push({ type: types.REPETITION, min: 1, max: Infinity, value: stack.pop() }) break case '*': if (stack.length === 0) repeatErr(i) stack.push({ type: types.REPETITION, min: 0, max: Infinity, value: stack.pop() }) break default: stack.push({ type: types.CHAR, value: c.charCodeAt(0) }) } } if (groupStack.length !== 0) error(regexpStr, 'Unterminated group') return root }