'use strict';

const unicode = require('../common/unicode');
const ERR = require('../common/error-codes');

//Aliases
const $ = unicode.CODE_POINTS;

//Const
//NOTE: default for Preprocessor#bufferWaterline. dropParsedChunk() only trims the
//already-consumed part of the buffer once `pos` exceeds this value (1 << 16 === 65536).
const DEFAULT_BUFFER_WATERLINE = 1 << 16;
|   | 
//Preprocessor
//NOTE: HTML input preprocessing
//(see: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream)
/**
 * Incremental reader over an HTML source string.
 *
 * Input is supplied in chunks via write(); advance() hands out one code point at a time
 * (normalizing CR / CRLF to LF and folding UTF-16 surrogate pairs into a single code point),
 * and retreat() steps back by one previously returned code point.
 *
 * Positions (`pos`, `lastCharPos`, gap positions) are indices into `this.html`, i.e. UTF-16
 * code unit offsets. A "gap" is the index of a code unit that was consumed together with the
 * unit before it (the LF of a CRLF, or the trailing half of a surrogate pair), so retreat()
 * has to step over it as well.
 */
class Preprocessor {
    constructor() {
        //NOTE: buffered source text; null until the first write().
        this.html = null;

        //NOTE: index of the most recently consumed code unit (-1: nothing consumed yet).
        this.pos = -1;
        //NOTE: index of the most recent gap (-1: none).
        this.lastGapPos = -1;
        //NOTE: index of the last code unit currently available in `html`.
        this.lastCharPos = -1;

        //NOTE: previous values of `lastGapPos`, restored one by one during retreat().
        this.gapStack = [];

        //NOTE: set after a CR was returned as LF, so that a directly following LF is dropped.
        this.skipNextNewLine = false;

        this.lastChunkWritten = false;
        this.endOfChunkHit = false;
        this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
    }

    /**
     * Error hook, called with a code from the error-codes module.
     */
    _err() {
        // NOTE: err reporting is noop by default. Enabled by mixin.
    }

    /**
     * Records the current position as a gap, remembering the previous gap position.
     */
    _addGap() {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }

    /**
     * Resolves a surrogate code unit found at the current position.
     *
     * @param {number} cp - surrogate code unit at `this.pos`.
     * @returns {number} the combined code point when `cp` is a leading surrogate directly
     *   followed by a trailing one; `$.EOF` (with `endOfChunkHit` set) when a leading surrogate
     *   is the last unit of a non-final chunk; otherwise `cp` itself, after reporting
     *   `ERR.surrogateInInputStream`.
     */
    _processSurrogate(cp) {
        //NOTE: only a leading (high) surrogate, U+D800..U+DBFF, can open a pair. Without this
        //check a trailing surrogate followed by another trailing surrogate would be combined
        //into a bogus code point.
        const isLeadingSurrogate = cp >= 0xd800 && cp <= 0xdbff;

        if (isLeadingSurrogate) {
            //NOTE: try to peek a surrogate pair
            if (this.pos !== this.lastCharPos) {
                const nextCp = this.html.charCodeAt(this.pos + 1);

                //NOTE(review): isSurrogatePair is assumed to test for a trailing (low)
                //surrogate - confirm against ../common/unicode.
                if (unicode.isSurrogatePair(nextCp)) {
                    //NOTE: we have a surrogate pair. Consume the second unit and recalculate code point.
                    this.pos++;

                    //NOTE: add gap that should be avoided during retreat
                    this._addGap();

                    return unicode.getSurrogatePairCodePoint(cp, nextCp);
                }
            }

            //NOTE: we are at the end of a chunk, therefore we can't infer surrogate pair yet.
            else if (!this.lastChunkWritten) {
                this.endOfChunkHit = true;
                return $.EOF;
            }
        }

        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);

        return cp;
    }

    /**
     * Discards the part of the buffer before the current position once `pos` has grown past
     * `bufferWaterline`. The code unit at `pos` becomes index 0; gap bookkeeping is reset.
     */
    dropParsedChunk() {
        if (this.pos > this.bufferWaterline) {
            this.lastCharPos -= this.pos;
            this.html = this.html.substring(this.pos);
            this.pos = 0;
            this.lastGapPos = -1;
            this.gapStack = [];
        }
    }

    /**
     * Appends a chunk of source text to the buffer.
     *
     * @param {string} chunk - text to append.
     * @param {boolean} isLastChunk - stored as `lastChunkWritten`; when truthy, running past
     *   the end of the buffer is a real end of input rather than "wait for more data".
     */
    write(chunk, isLastChunk) {
        if (this.html) {
            this.html += chunk;
        } else {
            this.html = chunk;
        }

        this.lastCharPos = this.html.length - 1;
        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    /**
     * Splices `chunk` into the buffer directly after the current position, so it is the next
     * text returned by advance().
     *
     * @param {string} chunk - text to insert.
     */
    insertHtmlAtCurrentPos(chunk) {
        this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1, this.html.length);

        this.lastCharPos = this.html.length - 1;
        this.endOfChunkHit = false;
    }

    /**
     * Consumes and returns the next code point.
     *
     * @returns {number} the next code point, or `$.EOF` when the buffer is exhausted. In the
     *   latter case `endOfChunkHit` is set when the last chunk has not been written yet.
     */
    advance() {
        this.pos++;

        if (this.pos > this.lastCharPos) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        let cp = this.html.charCodeAt(this.pos);

        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (this.skipNextNewLine && cp === $.LINE_FEED) {
            this.skipNextNewLine = false;
            this._addGap();
            return this.advance();
        }

        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }

        this.skipNextNewLine = false;

        if (unicode.isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }

        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into detailed performance cost validation.
        const isCommonValidRange =
            (cp > 0x1f && cp < 0x7f) || cp === $.LINE_FEED || cp === $.CARRIAGE_RETURN || (cp > 0x9f && cp < 0xfdd0);

        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }

        return cp;
    }

    /**
     * Reports control characters and noncharacters through _err().
     *
     * @param {number} cp - code point to classify.
     */
    _checkForProblematicCharacters(cp) {
        if (unicode.isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        } else if (unicode.isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    /**
     * Steps back by one code point. When the current position is a recorded gap, the gap's
     * code unit is stepped over too and the previous gap position is restored.
     */
    retreat() {
        if (this.pos === this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop();
            this.pos--;
        }

        this.pos--;
    }
}
|   | 
module.exports = Preprocessor;