import { BS } from './bs'
import { Token, Attribute } from './token'
import { NsResolver, NsResolverMap } from './ns-resolver'

/**
 * Produces tokens while scanning the input stream.
 * @public
 *
 * @remarks
 *
 * Will emit (using `nextToken()`) any of StartTag, EndTag, Text, Comment, CDATA or undefined when the stream chunk is exhausted (the last full tag was scanned).
 * On the next write, scanning resumes taking into account any incomplete tag from the previous chunk.
 *
 * NOTE(review): the XML markup inside the example strings below appears to be missing in this copy of the file; confirm against the original documentation.
 *
 * @example
 * Writing the xml in 2 chunks:
 *
 * ```javascript
 * const tokenizer = new Tokenizer()
 * tokenizer.write('someinn')
 * while ((token = tokenizer.nextToken())) {
 *   // do something
 * }
 * tokenizer.write('ertext')
 * while ((token = tokenizer.nextToken())) {
 *   // do some more
 * }
 * ```
 *
 */
export class Tokenizer {
  // bs offset: index of the next byte of `bs` to scan
  private at = 0
  // the chunk currently being scanned (replaced by every `write()`)
  private bs = BS.EMPTY

  // state properties
  /** carry over partial names or content to next chunk */
  private tail = BS.EMPTY
  /** tag name of pending start or end tag */
  private name = BS.EMPTY // tag name
  /** name of pending attribute */
  private attName = BS.EMPTY
  /** list of pending attributes */
  private atts: Attribute[] = []
  /** list of pending name spaces */
  private ns: { name?: BS; uri: BS }[] = []
  /** state of tokenizer of previous chunk (numeric state of the `nextToken()` switch; 0 is the initial state) */
  private state = 0
  /** total length of the chunks written before the current one; added to `at` in error messages */
  private seenLength = 0
  /** namespace resolver (only set when a resolver map was given to the constructor) */
  private nsResolver?: NsResolver
  /** resolvers saved by `startTag()` and restored by `endTag()` */
  private nsResolverStack: NsResolver[] = []
  // level: distance from root (root = 0)
  /** names of the currently open (non self-closing) tags; its length is the current level */
  private parentNames: BS[] = []
  /** level recorded by `skipChildNodes()`; -1 means "not skipping" */
  private skippingLevel = -1
  /** when true, the end-tag that terminates the skipped section is not emitted */
  private skippingEndTag = false

  /**
   * @param nsResolverMap - optional map handed to `NsResolver.create()`; when omitted no
   * namespace resolution is done and raw names are put in the tokens.
   */
  constructor(nsResolverMap?: NsResolverMap) {
    if (nsResolverMap) {
      this.nsResolver = NsResolver.create(nsResolverMap)
    }
  }

  /**
   * Write some bytes to be scanned.
   *
   * The given chunk replaces the current one and scanning restarts at offset 0 of the new chunk;
   * the length of the previous chunk is added to `seenLength`.
   * NOTE(review): bytes of the previous chunk that were not yet consumed by `nextToken()` are
   * dropped here, so callers presumably must drain `nextToken()` before writing again.
   *
   * @param bs - byte sequence as an instance of `BS`, `Buffer`, `Uint8Array` or `string`
   */
  write(bs: BS | string | Buffer | Uint8Array) {
    this.seenLength += this.bs.length
    this.at = 0
    // anything that is not already a BS is converted with BS.create()
    this.bs = bs instanceof BS ? bs : BS.create(bs)
  }

  /** `true` when the scan offset has reached the end of the current chunk */
  get exhausted() {
    return this.bs.length === this.at
  }

  /**
   * Gets the next token
   *
   * return `undefined` when fully scanned or last tag was incomplete, waiting for more.
   *
   * Implemented as a state machine over `this.state`; the state survives between calls and
   * between chunks, which is how an incomplete construct is resumed after the next `write()`.
   */
  nextToken() {
    const { bs } = this
    while (this.at < bs.byteLength) {
      switch (this.state) {
        // initial state: a '<' opens markup, anything else is text (state 30)
        case 0: {
          if (this.bs[this.at] === 0x3c /*<*/) {
            this.state = 1
            this.at++
          } else {
            this.state = 30
          }
          continue
        }
        // just after '<': dispatch on the next byte
        case 1 /*<*/:
          switch (this.bs[this.at]) {
            case 0x2f /*/*/:
              this.state = 20
              this.at++
              continue
            case 0x21 /*!*/: {
              this.state = 50
              this.at++
              continue
            }
            case 0x3f /*?*/: {
              this.state = 70
              this.at++
              continue
            }
            default: {
              // start tag: the current byte belongs to the name, so it is not consumed here
              this.state = 10
              continue
            }
          }
        // start-tag name; getName() switches to state 11 once the name is complete
        case 10 /*<...*/: {
          this.name = this.getName(11)
          this.ns = []
          this.atts = []
          continue
        }
        // NOTE(review): from here on this copy of the file looks truncated in several places
        // (apparently the text between a '<' and the following '>' was dropped, including
        // inside comments). State 11 is only partially present, the states between 12 and 19
        // are mostly missing, the heads of states 19 and 20 are missing, and the states
        // between 50 and 71 are mostly missing. The fragments are kept exactly as found;
        // restore the original text before changing this method.
        case 11 /**/: {
          this.at++
          this.state = 0
          const startTag = this.startTag()
          if (startTag) {
            return startTag
          } else {
            continue
          }
        }
        case 0x2f /*/*/: {
          this.at++
          this.state = 19
          continue
        }
        }
        this.state = 12
        continue
        }
        case 12 /* -1) {
          this.state = 11
          this.capture(this.at, index)
          if (this.isAtXmlns(this.attName)) {
            this.ns.push({
              name: this.attName.length === 5 ? undefined : this.attName.slice(6),
              uri: this.flushCapture(),
            })
          } else {
            this.atts.push({ name: this.attName, value: this.flushCapture() })
          }
          this.at++
          continue
        } else {
          this.capture(this.at, this.bs.length)
          continue
        }
        }
        case 19 /**/) {
          this.state = 0
          const startTag = this.startTag(true)
          if (startTag) {
            return startTag
          } else {
            continue
          }
        } else {
          throw new Error(`Expect closing of self-closing tag (at: ${this.seenLength + this.at})`)
        }
        }
        case 20 /**/) {
          this.state = 0
          const endTag = this.endTag()
          if (endTag) {
            return endTag
          } else {
            continue
          }
        } else {
          throw new Error(`Expect closing of end-tag (at: ${this.seenLength + this.at})`)
        }
        }
        // text: everything up to the next '<'; when no '<' is in this chunk the bytes are
        // kept in `tail` and the text is completed from the next chunk
        case 30 /*...*/: {
          const index = this.bs.indexOf(0x3c /*<*/, this.at)
          if (index > -1) {
            this.state = 0
            if (this.isSkipping) {
              // skipped text is not captured and no token is produced
              this.at = index
              continue
            } else {
              this.capture(this.at, index)
              return new Token.Text(this.flushCapture())
            }
          } else {
            this.capture(this.at, this.bs.length)
            continue
          }
        }
        case 50 /* -1) {
          this.state++
          this.at = end + 1
        } else {
          this.at = this.bs.length
        }
        continue
        }
        case 71 /**/ ? 0 : 70
        continue
        }
      }
    }
    return
  }

  /**
   * When called, the tokenizer will skip decoding the child nodes of the last tag.
   * But scanning will resume on its end-tag. This end-tag will not be emitted when called with `true` (`tokenizer.skipChildNodes(true)`)
   * This is a performance optimization as there's no need to decode text, attribute values...
   *
   * The current depth (`parentNames.length`) is recorded as the skipping level; tokens are
   * suppressed until the depth drops below that level again.
   *
   * NOTE(review): the XML markup inside the example strings below appears to be missing in this copy of the file; confirm against the original documentation.
   *
   * @param skippingEndTag - when `true`, the end-tag closing the skipped element is not emitted either
   *
   * @example
   * Skipping nodes
   *
   * ```javascript
   * tokenizer.write('someinnertext')
   * const aStart = tokenizer.nextToken()
   * const bStart = tokenizer.nextToken()
   * bStart.getAttribute('att') === 'value'
   * tokenizer.skipChildNodes()
   * const bEnd = tokenizer.nextToken()
   * bEnd.toString() === ''
   * ```
   */
  skipChildNodes(skippingEndTag: boolean = false) {
    this.skippingLevel = this.parentNames.length
    this.skippingEndTag = skippingEndTag
  }

  /**
   * Capturing a name
   *
   * Scans from `at` up to the first delimiter byte and returns the name, prefixed with whatever
   * was carried over in `tail` from the previous chunk.
   *
   * @param nextState - state to switch to once the name is complete (a delimiter was found)
   * @returns the name read so far; when the chunk ended before a delimiter, the same bytes are
   * also kept in `tail` and the state is left unchanged so the name is resumed on the next chunk
   * @throws Error when no name byte is found and nothing was carried over
   */
  private getName(nextState: number) {
    const { at: start, bs } = this
    let b = bs[start]
    let at = start
    // ' ' '>' '/' '\n' '\r' '\t' '='
    while (
      b !== undefined &&
      b !== 0x20 &&
      b !== 0x3e &&
      b !== 0x2f &&
      b !== 0x0a &&
      b !== 0x0d &&
      b !== 0x09 &&
      b !== 0x3d
    ) {
      b = bs[++at]
    }
    if (start === at && this.tail.length === 0) throw new Error(`Expecting name (at: ${this.seenLength + this.at})`)
    const name = this.tail.append(this.bs.subarray(start, at))
    this.at = at
    if (b !== undefined) {
      // a delimiter was found: the name is complete
      this.state = nextState
      this.tail = BS.EMPTY
    } else {
      // ran off the end of the chunk: keep the partial name for the next chunk
      this.tail = name
    }
    return name
  }

  /**
   * Capturing a sequence
   *
   * Appends the bytes `[start, end)` of the current chunk to `tail` (or clears `tail` while
   * skipping) and moves the scan offset to `end`.
   */
  private capture(start: number, end: number) {
    this.tail = this.isSkipping ? BS.EMPTY : this.tail.append(this.bs.subarray(start, end))
    this.at = end
  }

  /**
   * Capturing content ending with a given byte
   *
   * Captures up to and including the first occurrence of `b` and advances the state by one;
   * when `b` is not in the rest of the chunk, captures the rest and stays in the same state.
   */
  private startingContentCapture(b: number) {
    // in search for the first char of a '-->' or ']]>'
    let off = this.at
    const end = this.bs.indexOf(b, off)
    if (end > -1) {
      this.capture(this.at, end + 1)
      this.state++
    } else {
      this.capture(this.at, this.bs.length)
    }
  }

  /**
   * detecting the possible end of content
   *
   * Captures the current byte; moves one state forward when it equals `b`, one state back otherwise.
   */
  private mayBeEndingContentCapture(b: number) {
    this.state += this.bs[this.at] === b ? 1 : -1
    this.capture(this.at, this.at + 1)
  }

  /**
   * detecting the actual end of content
   *
   * @returns `true` only when the closing '>' was found and the tokenizer is not skipping;
   * in that case the state is reset to 0 and the '>' is consumed without being captured.
   * A further `b` is captured and the state kept; any other byte is captured and the state
   * goes back by two.
   */
  private hasContentCaputreEnded(b: number) {
    switch (this.bs[this.at]) {
      case 0x3e /*>*/: {
        this.state = 0
        this.at++
        return !this.isSkipping
      }
      case b: {
        this.capture(this.at, this.at + 1)
        return false
      }
      default: {
        this.capture(this.at, this.at + 1)
        this.state -= 2
        return false
      }
    }
  }

  /** returns the capture (value) and resets it */
  private flushCapture() {
    const capture = this.tail
    this.tail = BS.EMPTY
    return capture
  }

  /**
   * returns the content of comment or CDATA
   *
   * The last two captured bytes are dropped (presumably the two terminator bytes that were
   * captured before the closing '>' - verify against the comment/CDATA states) and the capture is reset.
   */
  private flushCaptureContent() {
    const content = this.tail.subarray(0, this.tail.length - 2)
    this.tail = BS.EMPTY
    return content
  }

  /** if skipping this token: a skipping level is set and the current depth has not dropped below it */
  private get isSkipping() {
    return this.skippingLevel !== -1 && this.skippingLevel <= this.parentNames.length
  }

  /**
   * creates a StartTag
   *
   * Pushes the tag name on `parentNames` unless the tag is self-closing, and returns
   * `undefined` while skipping. With a namespace resolver, the current resolver is saved on
   * `nsResolverStack` and replaced by the one for this tag before names are resolved.
   *
   * NOTE(review): for a self-closing tag the resolver is pushed/replaced here as well, but no
   * end tag follows to pop it in `endTag()` - looks like the stack can get out of balance; confirm.
   */
  private startTag(selfclosing?: true) {
    if (!selfclosing) this.parentNames.push(this.name)
    if (this.isSkipping) return
    if (this.nsResolver) {
      this.nsResolverStack.push(this.nsResolver)
      this.nsResolver = this.nsResolver.forChildTag(this.ns)
      return new Token.StartTag(
        this.nsResolver.resolveTagName(this.name),
        this.nsResolver.resolveAtts(this.atts),
        this.nsResolver.resolveNs(this.ns),
        selfclosing
      )
    } else {
      return new Token.StartTag(
        this.name,
        this.atts,
        this.ns.map(ns => ({ name: ns.name, uri: ns.uri, uriString: ns.uri.toString() })),
        selfclosing
      )
    }
  }

  /**
   * creates an EndTag
   *
   * Pops `parentNames` until the name of this end tag is found (all open names are popped when
   * there is no match). Returns `undefined` while still skipping, or when this is the end tag
   * that terminates a `skipChildNodes(true)` section. Otherwise the resolver saved by the
   * matching start tag is restored after resolving the name.
   *
   * NOTE(review): when the end tag is suppressed by `skippingEndTag`, the method returns before
   * the resolver is popped although the start tag pushed one - confirm this is intended.
   */
  private endTag() {
    while (this.parentNames.length > 0) {
      // should it generate EndTag for unproperly closed tags?
      const prevName = this.parentNames.pop()
      if (prevName && this.name.equals(prevName)) break
    }
    if (this.isSkipping) return
    // depth dropped below the skipping level (or no skipping was active): stop skipping
    this.skippingLevel = -1
    if (this.skippingEndTag) {
      this.skippingEndTag = false
      return
    }
    const endTag = this.nsResolver
      ? new Token.EndTag(this.nsResolver.resolveTagName(this.name))
      : new Token.EndTag(this.name)
    if (this.nsResolver) this.nsResolver = this.nsResolverStack.pop()
    return endTag
  }

  /**
   * skipping spaces, ie after names
   *
   * Advances `at` over space, line feed, carriage return and tab.
   *
   * @returns `true` when the end of the chunk was reached while skipping
   */
  private skipSpaceLike() {
    const bs = this.bs
    let b = bs[this.at]
    while (b !== undefined && (b === 0x20 || b === 0xa || b === 0xd || b === 0x9)) b = bs[++this.at]
    return b === undefined
  }

  /**
   * testing 'XMLNS' or 'xmlns' sequence
   *
   * Only the first five bytes are compared; the length guard applies to the lower-case test only.
   * NOTE(review): any attribute name that merely starts with these five bytes matches (there is
   * no check for a following ':' or for an exact length of 5) - confirm this is acceptable.
   */
  private isAtXmlns(bs: BS) {
    return (bs.length >= 5 && this.isAt_xmlns(bs)) || this.isAt_XMLNS(bs)
  }

  /** testing 'xmlns' sequence (first five bytes) */
  private isAt_xmlns(bs: BS) {
    return bs[0] === 0x78 && bs[1] === 0x6d && bs[2] === 0x6c && bs[3] === 0x6e && bs[4] === 0x73
  }

  /** testing 'XMLNS' sequence (first five bytes) */
  private isAt_XMLNS(bs: BS) {
    return bs[0] === 0x58 && bs[1] === 0x4d && bs[2] === 0x4c && bs[3] === 0x4e && bs[4] === 0x53
  }
}