import { BS } from './bs'
import { Token, Attribute } from './token'
import { NsResolver, NsResolverMap } from './ns-resolver'
/**
* Produces tokens while scanning the input stream.
* @public
*
* @remarks
*
* Will emit (using `nextToken()`) any of StartTag, EndTag, Text, Comment, CDATA or undefined when the stream chunk is exhausted (the last full tag was scanned).
* On the next write, scanning resumes, taking into account any incomplete tag from the previous chunk.
*
* @example
* Writing the xml in 2 chunks:
*
* ```javascript
* const tokenizer = new Tokenizer()
* let token
* tokenizer.write('<a>someinn')
* while ((token = tokenizer.nextToken())) {
* // do something
* }
* tokenizer.write('ertext</a>')
* while ((token = tokenizer.nextToken())) {
* // do some more
* }
* ```
*
*/
export class Tokenizer {
/** read offset into the current chunk (`bs`) */
private at = 0
/** chunk currently being scanned; replaced on every `write()` */
private bs = BS.EMPTY
// state properties
/** carry over partial names or content to next chunk */
private tail = BS.EMPTY
/** tag name of pending start or end tag */
private name = BS.EMPTY // tag name
/** name of pending attribute */
private attName = BS.EMPTY
/** list of pending attributes */
private atts: Attribute[] = []
/** list of pending namespace declarations (`name` is left undefined for a bare `xmlns` attribute) */
private ns: { name?: BS; uri: BS }[] = []
/** scanner state; kept between calls and chunks so scanning can resume (0 = start of a token) */
private state = 0
/** summed length of the chunks that preceded the current one; added to `at` to report offsets in errors */
private seenLength = 0
/** namespace resolver currently in effect; only set when a resolver map was given to the constructor */
private nsResolver?: NsResolver
/** resolvers that were in effect before the open tags were entered; popped by `endTag()` */
private nsResolverStack: NsResolver[] = []
/** names of the currently open (not self-closing) tags, outermost first */
private parentNames: BS[] = []
/** `parentNames` depth from which tokens are skipped, or -1 when not skipping */
private skippingLevel = -1
/** when true, the end tag that terminates the skipped section is not emitted either */
private skippingEndTag = false
/**
 * Creates a tokenizer.
 *
 * @param nsResolverMap - optional map used to create the initial namespace resolver;
 * when omitted, no namespace resolver is set and tags are emitted with their raw names.
 */
constructor(nsResolverMap?: NsResolverMap) {
  if (!nsResolverMap) return
  this.nsResolver = NsResolver.create(nsResolverMap)
}
/**
 * Supplies the next chunk to scan, replacing the current one.
 *
 * @param bs - the chunk, either a `BS` or anything `BS.create` accepts (`string`, `Buffer`, `Uint8Array`)
 */
write(bs: BS | string | Buffer | Uint8Array) {
  // the chunk being replaced now counts as "seen", which keeps error offsets absolute
  this.seenLength += this.bs.length
  this.at = 0
  if (bs instanceof BS) {
    this.bs = bs
  } else {
    this.bs = BS.create(bs)
  }
}
/** `true` once the read offset has reached the end of the current chunk. */
get exhausted() {
  const remaining = this.bs.length - this.at
  return remaining === 0
}
/**
* Scans the current chunk from the current offset and returns the next token.
*
* The scan is a state machine driven by `this.state`. The state is kept between calls and
* between chunks, so a construct cut off by the end of a chunk is resumed after the next `write()`.
*
* NOTE(review): parts of this method's body (the branches around states 12-19 and 50-70) look
* truncated in this copy of the file - restore them from the original before changing the logic.
*
* @returns the next token, or `undefined` when the chunk is fully scanned or the last construct
* is incomplete and more input is needed.
*/
nextToken() {
const { bs } = this
while (this.at < bs.byteLength) {
switch (this.state) {
// state 0: start of a token - a '<' opens markup, anything else is text (state 30)
case 0: {
if (this.bs[this.at] === 0x3c /*<*/) {
this.state = 1
this.at++
} else {
this.state = 30
}
continue
}
// state 1: just after '<' - dispatch on the next byte; by default it starts a tag name
case 1 /*<*/:
switch (this.bs[this.at]) {
case 0x2f /*/*/:
this.state = 20
this.at++
continue
case 0x21 /*!*/: {
this.state = 50
this.at++
continue
}
case 0x3f /*?*/: {
this.state = 70
this.at++
continue
}
default: {
this.state = 10
continue
}
}
// state 10: reading the name of a start tag; pending namespaces and attributes are reset
case 10 /*<...*/: {
this.name = this.getName(11)
this.ns = []
this.atts = []
continue
}
case 11 /**/: {
this.at++
this.state = 0
const startTag = this.startTag()
if (startTag) {
return startTag
} else {
continue
}
}
case 0x2f /*/*/: {
this.at++
this.state = 19
continue
}
}
this.state = 12
continue
}
case 12 /* -1) {
this.state = 11
this.capture(this.at, index)
if (this.isAtXmlns(this.attName)) {
this.ns.push({
name: this.attName.length === 5 ? undefined : this.attName.slice(6),
uri: this.flushCapture(),
})
} else {
this.atts.push({ name: this.attName, value: this.flushCapture() })
}
this.at++
continue
} else {
this.capture(this.at, this.bs.length)
continue
}
}
case 19 /**/) {
this.state = 0
const startTag = this.startTag(true)
if (startTag) {
return startTag
} else {
continue
}
} else {
throw new Error(`Expect closing of self-closing tag (at: ${this.seenLength + this.at})`)
}
}
case 20 /**/: {
this.name = this.getName(21)
continue
}
case 21 /**/) {
this.state = 0
const endTag = this.endTag()
if (endTag) {
return endTag
} else {
continue
}
} else {
throw new Error(`Expect closing of end-tag (at: ${this.seenLength + this.at})`)
}
}
// state 30: text content, which runs up to the next '<' or, failing that, the end of the chunk
case 30 /*...*/: {
const index = this.bs.indexOf(0x3c /*<*/, this.at)
if (index > -1) {
this.state = 0
if (this.isSkipping) {
this.at = index
continue
} else {
this.capture(this.at, index)
return new Token.Text(this.flushCapture())
}
} else {
this.capture(this.at, this.bs.length)
continue
}
}
case 50 /* -1) {
this.state++
this.at = end + 1
} else {
this.at = this.bs.length
}
continue
}
case 71 /**/: {
this.state = bs[this.at++] === 0x3e /*>*/ ? 0 : 70
continue
}
}
}
return
}
/**
 * Makes the tokenizer stop emitting tokens for everything nested at or below the current depth
 * of open tags, i.e. the child nodes of the start tag that was just returned.
 * Emission resumes with the end tag that closes that depth; pass `true` to have that end tag
 * swallowed as well. While skipping, captured content is discarded rather than accumulated.
 *
 * @param skippingEndTag - when `true`, the end tag that terminates the skipped section is not emitted
 *
 * @example
 * Skipping nodes
 *
 * ```javascript
 * tokenizer.write('<a><b att="value"><c>inner</c>text</b></a>')
 * const aStart = tokenizer.nextToken()
 * const bStart = tokenizer.nextToken()
 * tokenizer.skipChildNodes()
 * const bEnd = tokenizer.nextToken() // end tag of `b`: `<c>`, `</c>` and the texts were not emitted
 * ```
 */
skipChildNodes(skippingEndTag: boolean = false) {
  const { length: depth } = this.parentNames
  this.skippingEndTag = skippingEndTag
  this.skippingLevel = depth
}
/**
 * Captures a name starting at the current offset.
 *
 * The name ends at the first space, '>', '/', line feed, carriage return, tab or '='.
 * If the chunk ends first, the partial name is kept in `tail` and the state is left untouched,
 * so the capture continues on the next chunk.
 *
 * @param nextState - state to switch to once the end of the name has been found
 * @returns the name captured so far, including any part carried over from a previous chunk
 * @throws Error when no name byte is found and nothing was carried over
 */
private getName(nextState: number) {
  const chunk = this.bs
  const from = this.at
  let to = from
  let byte = chunk[to]
  scan: while (byte !== undefined) {
    switch (byte) {
      case 0x20: // ' '
      case 0x3e: // '>'
      case 0x2f: // '/'
      case 0x0a: // line feed
      case 0x0d: // carriage return
      case 0x09: // tab
      case 0x3d: // '='
        break scan
    }
    byte = chunk[++to]
  }
  if (to === from && this.tail.length === 0) {
    throw new Error(`Expecting name (at: ${this.seenLength + this.at})`)
  }
  const name = this.tail.append(chunk.subarray(from, to))
  this.at = to
  if (byte === undefined) {
    // chunk ended inside the name: carry it over
    this.tail = name
  } else {
    this.tail = BS.EMPTY
    this.state = nextState
  }
  return name
}
/**
 * Appends the bytes `[start, end)` of the current chunk to `tail` and moves the offset to `end`.
 * While skipping, nothing is accumulated and `tail` is reset instead.
 */
private capture(start: number, end: number) {
  if (this.isSkipping) {
    this.tail = BS.EMPTY
  } else {
    this.tail = this.tail.append(this.bs.subarray(start, end))
  }
  this.at = end
}
/**
 * Captures content up to and including the next occurrence of byte `b`
 * (the first character of a possible `-->` or `]]>` terminator) and advances the state.
 * When `b` does not occur in the rest of the chunk, the rest is captured and the state is kept.
 */
private startingContentCapture(b: number) {
  const found = this.bs.indexOf(b, this.at)
  if (found < 0) {
    this.capture(this.at, this.bs.length)
    return
  }
  this.capture(this.at, found + 1)
  this.state++
}
/**
 * Checks whether the current byte is `b`: the state moves one step forward if so,
 * one step back otherwise. The byte is captured either way.
 */
private mayBeEndingContentCapture(b: number) {
  const position = this.at
  if (this.bs[position] === b) {
    this.state += 1
  } else {
    this.state -= 1
  }
  this.capture(position, position + 1)
}
/**
 * Decides, on the byte following a doubled `b`, whether the content has ended.
 *
 * - '>' ends the content: the state returns to 0 and the byte is consumed without being captured.
 * - another `b` is captured and the state is kept.
 * - any other byte is captured and the state goes back two steps.
 *
 * @returns `true` only when the content ended and the tokenizer is not skipping
 */
private hasContentCaputreEnded(b: number) {
  const current = this.bs[this.at]
  if (current === 0x3e /*>*/) {
    this.state = 0
    this.at++
    return !this.isSkipping
  }
  this.capture(this.at, this.at + 1)
  if (current !== b) {
    this.state -= 2
  }
  return false
}
/** Returns everything captured so far and resets the capture. */
private flushCapture() {
  const { tail } = this
  this.tail = BS.EMPTY
  return tail
}
/** Returns the captured comment or CDATA content without its last two bytes, and resets the capture. */
private flushCaptureContent() {
  const { tail } = this
  this.tail = BS.EMPTY
  return tail.subarray(0, tail.length - 2)
}
/** Whether tokens are currently skipped: a skipping level is set and the open-tag depth has reached it. */
private get isSkipping() {
  const level = this.skippingLevel
  if (level === -1) return false
  return this.parentNames.length >= level
}
/**
 * Creates the StartTag token for the pending tag (`name`, `atts`, `ns`).
 *
 * A tag that is not self-closing is pushed on `parentNames`. Nothing is returned while skipping.
 * When a namespace resolver is set, the tag is resolved with a resolver derived for it through
 * `forChildTag`. That resolver becomes the current one, and the previous one is stacked for
 * `endTag()` to restore, only when the tag is not self-closing. A self-closing tag never reaches
 * `endTag()`, so installing its resolver would leak its namespace declarations into the following
 * siblings and leave the resolver stack unbalanced.
 *
 * @param selfclosing - `true` when the tag is self-closing
 * @returns the StartTag token, or `undefined` while skipping
 */
private startTag(selfclosing?: true) {
  if (!selfclosing) this.parentNames.push(this.name)
  if (this.isSkipping) return
  if (!this.nsResolver) {
    return new Token.StartTag(
      this.name,
      this.atts,
      this.ns.map(ns => ({ name: ns.name, uri: ns.uri, uriString: ns.uri.toString() })),
      selfclosing
    )
  }
  const tagResolver = this.nsResolver.forChildTag(this.ns)
  if (!selfclosing) {
    // entering the element: its resolver stays in effect until the matching end tag
    this.nsResolverStack.push(this.nsResolver)
    this.nsResolver = tagResolver
  }
  return new Token.StartTag(
    tagResolver.resolveTagName(this.name),
    tagResolver.resolveAtts(this.atts),
    tagResolver.resolveNs(this.ns),
    selfclosing
  )
}
/**
 * Creates the EndTag token for the pending end tag (`name`).
 *
 * Open tag names are popped from `parentNames` up to and including the one that matches
 * (all of them when none matches). Nothing is returned while still skipping. Otherwise skipping
 * is switched off, and the resolver that was in effect before the element was entered is restored.
 * That restore also happens when the end tag itself is swallowed (`skipChildNodes(true)`), so the
 * namespace scope of a skipped element does not leak into what follows.
 *
 * @returns the EndTag token, or `undefined` when the tag is skipped or swallowed
 */
private endTag() {
  while (this.parentNames.length > 0) {
    // should it generate EndTag for unproperly closed tags?
    const prevName = this.parentNames.pop()
    if (prevName && this.name.equals(prevName)) break
  }
  if (this.isSkipping) return
  this.skippingLevel = -1
  // the end tag is resolved in the scope of its own element, i.e. before leaving it
  const elementResolver = this.nsResolver
  if (elementResolver) {
    const outerResolver = this.nsResolverStack.pop()
    // a stray end tag finds an empty stack: keep the current resolver rather than losing it
    if (outerResolver) this.nsResolver = outerResolver
  }
  if (this.skippingEndTag) {
    this.skippingEndTag = false
    return
  }
  if (elementResolver) {
    return new Token.EndTag(elementResolver.resolveTagName(this.name))
  }
  return new Token.EndTag(this.name)
}
/**
 * Advances the offset past spaces, line feeds, carriage returns and tabs.
 *
 * @returns `true` when the end of the chunk was reached, `false` when stopped on another byte
 */
private skipSpaceLike() {
  const chunk = this.bs
  for (;;) {
    const byte = chunk[this.at]
    if (byte === undefined) return true
    if (byte !== 0x20 && byte !== 0xa && byte !== 0xd && byte !== 0x9) return false
    this.at++
  }
}
/**
 * Tests whether an attribute name is a namespace declaration: exactly `xmlns` (or `XMLNS`),
 * or that keyword followed by ':' and a prefix.
 *
 * A name that merely starts with the keyword (e.g. `xmlnsfoo`) is an ordinary attribute and
 * is rejected.
 *
 * @param bs - the attribute name
 */
private isAtXmlns(bs: BS) {
  if (bs.length < 5) return false
  // anything after the keyword must be introduced by ':'
  if (bs.length > 5 && bs[5] !== 0x3a /*:*/) return false
  return this.isAt_xmlns(bs) || this.isAt_XMLNS(bs)
}
/** Tests whether the first five bytes of `bs` spell `xmlns` (lower case). */
private isAt_xmlns(bs: BS) {
  if (bs[0] !== 0x78 /*x*/) return false
  if (bs[1] !== 0x6d /*m*/) return false
  if (bs[2] !== 0x6c /*l*/) return false
  if (bs[3] !== 0x6e /*n*/) return false
  return bs[4] === 0x73 /*s*/
}
/** Tests whether the first five bytes of `bs` spell `XMLNS` (upper case). */
private isAt_XMLNS(bs: BS) {
  if (bs[0] !== 0x58 /*X*/) return false
  if (bs[1] !== 0x4d /*M*/) return false
  if (bs[2] !== 0x4c /*L*/) return false
  if (bs[3] !== 0x4e /*N*/) return false
  return bs[4] === 0x53 /*S*/
}
}