// Tokenizer.js (32 KB)
  1. "use strict";
  /**
   * CommonJS/ES-module interop helper.
   *
   * Reuses an `__importDefault` already present on `this` when there is one;
   * otherwise returns `mod` untouched if it is flagged `__esModule`, and wraps
   * anything else as `{ default: mod }`.
   */
  var __importDefault = (this && this.__importDefault) || function (mod) {
      if (mod && mod.__esModule) {
          return mod;
      }
      return { "default": mod };
  };
  5. Object.defineProperty(exports, "__esModule", { value: true });
  6. var decode_codepoint_1 = __importDefault(require("entities/lib/decode_codepoint"));
  7. var entities_json_1 = __importDefault(require("entities/lib/maps/entities.json"));
  8. var legacy_json_1 = __importDefault(require("entities/lib/maps/legacy.json"));
  9. var xml_json_1 = __importDefault(require("entities/lib/maps/xml.json"));
  /**
   * Tells whether `c` is one of the five characters this tokenizer treats as
   * whitespace: space, line feed, tab, form feed or carriage return.
   *
   * @param c A single character.
   * @returns `true` for whitespace, `false` otherwise.
   */
  function whitespace(c) {
      switch (c) {
          case " ":
          case "\n":
          case "\t":
          case "\f":
          case "\r":
              return true;
          default:
              return false;
      }
  }
  /**
   * Builds a state handler that matches a single expected character,
   * case-insensitively.
   *
   * The returned handler receives the tokenizer `t` and the current character
   * `c`. On a match it moves `t` to `SUCCESS`. Otherwise it moves `t` to
   * `FAILURE` and decrements `t._index`, so the same character is looked at
   * again in the new state.
   *
   * @param upper The expected character (upper-case form, if it has one).
   * @param SUCCESS State to enter when the character matches.
   * @param FAILURE State to enter when it does not.
   */
  function ifElseState(upper, SUCCESS, FAILURE) {
      var lower = upper.toLowerCase();
      // For characters without case (e.g. "#") both comparisons are the same test.
      return function (t, c) {
          var matched = c === lower || c === upper;
          if (matched) {
              t._state = SUCCESS;
              return;
          }
          t._state = FAILURE;
          t._index--;
      };
  }
  /**
   * Builds a state handler used while a tag name might still turn out to be
   * one of the special names (script/style).
   *
   * On a case-insensitive match of `upper` the tokenizer moves to
   * `NEXT_STATE`. On a mismatch it falls back to InTagName (3) and decrements
   * `_index` so the character is processed again as an ordinary tag-name
   * character.
   *
   * @param upper The expected character in upper case.
   * @param NEXT_STATE State to enter when the character matches.
   */
  function consumeSpecialNameChar(upper, NEXT_STATE) {
      var lower = upper.toLowerCase();
      return function (t, c) {
          var matched = c === lower || c === upper;
          if (matched) {
              t._state = NEXT_STATE;
              return;
          }
          t._state = 3 /* InTagName */;
          t._index--; // look at this character again as part of the tag name
      };
  }
  // Single-character state handlers, generated from the two factories above.

  // "<![" followed by C, D, A, T, A; any mismatch falls back to InDeclaration (16).
  var stateBeforeCdata1 = ifElseState("C", 23 /* BeforeCdata2 */, 16 /* InDeclaration */);
  var stateBeforeCdata2 = ifElseState("D", 24 /* BeforeCdata3 */, 16 /* InDeclaration */);
  var stateBeforeCdata3 = ifElseState("A", 25 /* BeforeCdata4 */, 16 /* InDeclaration */);
  var stateBeforeCdata4 = ifElseState("T", 26 /* BeforeCdata5 */, 16 /* InDeclaration */);
  var stateBeforeCdata5 = ifElseState("A", 27 /* BeforeCdata6 */, 16 /* InDeclaration */);

  // Opening tag names starting with "sc" / "st"; a mismatch continues as a normal tag name.
  var stateBeforeScript1 = consumeSpecialNameChar("R", 34 /* BeforeScript2 */);
  var stateBeforeScript2 = consumeSpecialNameChar("I", 35 /* BeforeScript3 */);
  var stateBeforeScript3 = consumeSpecialNameChar("P", 36 /* BeforeScript4 */);
  var stateBeforeScript4 = consumeSpecialNameChar("T", 37 /* BeforeScript5 */);
  var stateBeforeStyle1 = consumeSpecialNameChar("Y", 44 /* BeforeStyle2 */);
  var stateBeforeStyle2 = consumeSpecialNameChar("L", 45 /* BeforeStyle3 */);
  var stateBeforeStyle3 = consumeSpecialNameChar("E", 46 /* BeforeStyle4 */);

  // Closing tag names starting with "sc" / "st"; a mismatch falls back to Text (1).
  var stateAfterScript1 = ifElseState("R", 39 /* AfterScript2 */, 1 /* Text */);
  var stateAfterScript2 = ifElseState("I", 40 /* AfterScript3 */, 1 /* Text */);
  var stateAfterScript3 = ifElseState("P", 41 /* AfterScript4 */, 1 /* Text */);
  var stateAfterScript4 = ifElseState("T", 42 /* AfterScript5 */, 1 /* Text */);
  var stateAfterStyle1 = ifElseState("Y", 48 /* AfterStyle2 */, 1 /* Text */);
  var stateAfterStyle2 = ifElseState("L", 49 /* AfterStyle3 */, 1 /* Text */);
  var stateAfterStyle3 = ifElseState("E", 50 /* AfterStyle4 */, 1 /* Text */);

  // Character after "&": "#" starts a numeric entity, anything else a named one.
  var stateBeforeEntity = ifElseState("#", 52 /* BeforeNumericEntity */, 53 /* InNamedEntity */);
  // Character after "&#": "x"/"X" selects hexadecimal, anything else decimal.
  var stateBeforeNumericEntity = ifElseState("X", 55 /* InHexEntity */, 54 /* InNumericEntity */);
  71. var Tokenizer = /** @class */ (function () {
  /**
   * Creates a tokenizer in its initial (Text) state.
   *
   * @param options Optional settings; `xmlMode` and `decodeEntities` are read
   *     and coerced to booleans. A missing `options` object turns both off.
   * @param cbs Callback object that receives the tokenizer events.
   */
  function Tokenizer(options, cbs) {
      /** State the tokenizer is currently in. */
      this._state = 1 /* Text */;
      /** Input that has been written but not yet discarded. */
      this._buffer = "";
      /** Buffer index at which the section currently being read begins. */
      this._sectionStart = 0;
      /** Buffer index of the character currently being looked at. */
      this._index = 0;
      /**
       * Number of characters already dropped from the front of the buffer;
       * added to `_index` to report absolute positions.
       */
      this._bufferOffset = 0;
      /** State to return to after a detour such as entity decoding. */
      this._baseState = 1 /* Text */;
      /** Special parsing mode used inside script and style tags. */
      this._special = 1 /* None */;
      /** `false` while the tokenizer is paused. */
      this._running = true;
      /** `true` once `.end` has been called. */
      this._ended = false;
      this._cbs = cbs;
      this._xmlMode = Boolean(options && options.xmlMode);
      this._decodeEntities = Boolean(options && options.decodeEntities);
  }
  /**
   * Puts the tokenizer back into the state the constructor leaves it in.
   * The callbacks and the `xmlMode` / `decodeEntities` settings are not
   * touched.
   */
  Tokenizer.prototype.reset = function () {
      // State machine
      this._state = 1 /* Text */;
      // Buffer and positions
      this._buffer = "";
      this._sectionStart = 0;
      this._index = 0;
      this._bufferOffset = 0;
      // Modes
      this._baseState = 1 /* Text */;
      this._special = 1 /* None */;
      // Lifecycle flags
      this._running = true;
      this._ended = false;
  };
  /**
   * Handles one character in the Text state (1).
   *
   * "<" flushes any pending text and moves to BeforeTagName (2). "&" flushes
   * as well and moves to BeforeEntity (51), but only when entity decoding is
   * enabled and no special mode is active. Any other character simply stays
   * part of the current section.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateText = function (c) {
      var opensTag = c === "<";
      var opensEntity = !opensTag &&
          this._decodeEntities &&
          this._special === 1 /* None */ &&
          c === "&";
      if (!opensTag && !opensEntity) {
          return;
      }
      // Emit whatever text was collected before this character.
      if (this._index > this._sectionStart) {
          this._cbs.ontext(this._getSection());
      }
      if (opensTag) {
          this._state = 2 /* BeforeTagName */;
      }
      else {
          // Entity handling returns to Text when it is done.
          this._baseState = 1 /* Text */;
          this._state = 51 /* BeforeEntity */;
      }
      this._sectionStart = this._index;
  };
  /**
   * Handles the character that follows a "<" (state BeforeTagName, 2).
   *
   * The checks run in this order:
   * - "/" starts a closing tag (BeforeClosingTagName, 5);
   * - another "<" emits the current section as text and restarts the section;
   * - ">", whitespace, or any character while a special mode is active goes
   *   back to Text (1);
   * - "!" starts a declaration/comment/CDATA (BeforeDeclaration, 15);
   * - "?" starts a processing instruction (InProcessingInstruction, 17);
   * - anything else starts a tag name. Outside XML mode a leading "s"/"S"
   *   goes through BeforeSpecial (31), everything else to InTagName (3).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeTagName = function (c) {
      if (c === "/") {
          this._state = 5 /* BeforeClosingTagName */;
          return;
      }
      if (c === "<") {
          this._cbs.ontext(this._getSection());
          this._sectionStart = this._index;
          return;
      }
      if (c === ">" || this._special !== 1 /* None */ || whitespace(c)) {
          this._state = 1 /* Text */;
          return;
      }
      if (c === "!" || c === "?") {
          this._state = c === "!"
              ? 15 /* BeforeDeclaration */
              : 17 /* InProcessingInstruction */;
          // The section starts after the marker character.
          this._sectionStart = this._index + 1;
          return;
      }
      var maybeSpecial = !this._xmlMode && (c === "s" || c === "S");
      this._state = maybeSpecial ? 31 /* BeforeSpecial */ : 3 /* InTagName */;
      this._sectionStart = this._index;
  };
  /**
   * Handles one character inside an opening tag name (state InTagName, 3).
   *
   * "/", ">" or whitespace ends the name: the name is emitted through
   * `onopentagname`, the state becomes BeforeAttributeName (8) and `_index` is
   * decremented so the terminating character is processed again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInTagName = function (c) {
      var endsName = c === "/" || c === ">" || whitespace(c);
      if (!endsName) {
          return;
      }
      this._emitToken("onopentagname");
      this._state = 8 /* BeforeAttributeName */;
      this._index--;
  };
  /**
   * Handles the characters that follow "</" (state BeforeClosingTagName, 5).
   *
   * Whitespace is skipped and ">" returns to Text (1). Outside a special mode
   * any other character starts the closing tag name (InClosingTagName, 6).
   * Inside a special mode only "s"/"S" continues (BeforeSpecialEnd, 32);
   * everything else returns to Text and is processed again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeClosingTagName = function (c) {
      if (whitespace(c)) {
          return; // ignore
      }
      if (c === ">") {
          this._state = 1 /* Text */;
          return;
      }
      if (this._special === 1 /* None */) {
          this._state = 6 /* InClosingTagName */;
          this._sectionStart = this._index;
          return;
      }
      if (c === "s" || c === "S") {
          this._state = 32 /* BeforeSpecialEnd */;
          return;
      }
      this._state = 1 /* Text */;
      this._index--;
  };
  /**
   * Handles one character inside a closing tag name (state InClosingTagName, 6).
   *
   * ">" or whitespace ends the name: it is emitted through `onclosetag`, the
   * state becomes AfterClosingTagName (7) and `_index` is decremented so the
   * terminating character is processed again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInClosingTagName = function (c) {
      var endsName = c === ">" || whitespace(c);
      if (!endsName) {
          return;
      }
      this._emitToken("onclosetag");
      this._state = 7 /* AfterClosingTagName */;
      this._index--;
  };
  /**
   * Skips everything after a closing tag name until ">" is found
   * (state AfterClosingTagName, 7). On ">" the tokenizer returns to Text (1)
   * and the next section starts right after it.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateAfterClosingTagName = function (c) {
      if (c !== ">") {
          return; // keep skipping
      }
      this._state = 1 /* Text */;
      this._sectionStart = this._index + 1;
  };
  /**
   * Handles one character between attributes of an opening tag
   * (state BeforeAttributeName, 8).
   *
   * ">" ends the tag (`onopentagend`) and returns to Text (1); "/" moves to
   * InSelfClosingTag (4); whitespace is skipped; any other character starts
   * an attribute name (InAttributeName, 9).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeAttributeName = function (c) {
      if (c === ">") {
          this._cbs.onopentagend();
          this._state = 1 /* Text */;
          this._sectionStart = this._index + 1;
          return;
      }
      if (c === "/") {
          this._state = 4 /* InSelfClosingTag */;
          return;
      }
      if (whitespace(c)) {
          return;
      }
      this._state = 9 /* InAttributeName */;
      this._sectionStart = this._index;
  };
  /**
   * Handles the characters that follow a "/" inside an opening tag
   * (state InSelfClosingTag, 4).
   *
   * ">" completes the self-closing tag (`onselfclosingtag`) and returns to
   * Text (1). Whitespace is skipped. Any other character goes back to
   * BeforeAttributeName (8) and is processed again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInSelfClosingTag = function (c) {
      if (c === ">") {
          this._cbs.onselfclosingtag();
          this._state = 1 /* Text */;
          this._sectionStart = this._index + 1;
          return;
      }
      if (whitespace(c)) {
          return;
      }
      this._state = 8 /* BeforeAttributeName */;
      this._index--;
  };
  /**
   * Handles one character inside an attribute name (state InAttributeName, 9).
   *
   * "=", "/", ">" or whitespace ends the name: it is reported through
   * `onattribname`, `_sectionStart` is set to -1, the state becomes
   * AfterAttributeName (10) and `_index` is decremented so the terminating
   * character is processed again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInAttributeName = function (c) {
      var endsName = c === "=" || c === "/" || c === ">" || whitespace(c);
      if (!endsName) {
          return;
      }
      this._cbs.onattribname(this._getSection());
      this._sectionStart = -1;
      this._state = 10 /* AfterAttributeName */;
      this._index--;
  };
  /**
   * Handles the characters that follow an attribute name
   * (state AfterAttributeName, 10).
   *
   * "=" moves on to the value (BeforeAttributeValue, 11) and whitespace is
   * skipped. Every other character means the attribute has no value, so
   * `onattribend` fires. "/" and ">" then go back to BeforeAttributeName (8)
   * and are processed again there; anything else starts the next attribute
   * name (InAttributeName, 9).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateAfterAttributeName = function (c) {
      if (c === "=") {
          this._state = 11 /* BeforeAttributeValue */;
          return;
      }
      if (whitespace(c)) {
          return;
      }
      // Value-less attribute: close it before deciding what comes next.
      this._cbs.onattribend();
      if (c === "/" || c === ">") {
          this._state = 8 /* BeforeAttributeName */;
          this._index--;
      }
      else {
          this._state = 9 /* InAttributeName */;
          this._sectionStart = this._index;
      }
  };
  /**
   * Handles the characters between "=" and an attribute value
   * (state BeforeAttributeValue, 11).
   *
   * A double or single quote opens a quoted value (InAttributeValueDq, 12 /
   * InAttributeValueSq, 13) whose section starts after the quote. Whitespace
   * is skipped. Any other character starts an unquoted value
   * (InAttributeValueNq, 14) and is processed again in that state.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeAttributeValue = function (c) {
      if (c === '"' || c === "'") {
          this._state = c === '"'
              ? 12 /* InAttributeValueDq */
              : 13 /* InAttributeValueSq */;
          this._sectionStart = this._index + 1;
          return;
      }
      if (whitespace(c)) {
          return;
      }
      this._state = 14 /* InAttributeValueNq */;
      this._sectionStart = this._index;
      this._index--; // look at this character again in the new state
  };
  /**
   * Handles one character inside a double-quoted attribute value
   * (state InAttributeValueDq, 12).
   *
   * The closing quote emits the pending data (`onattribdata`), ends the
   * attribute (`onattribend`) and goes back to BeforeAttributeName (8). With
   * entity decoding enabled, "&" emits the pending data and moves to
   * BeforeEntity (51), remembering the current state in `_baseState`.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function (c) {
      if (c === '"') {
          this._emitToken("onattribdata");
          this._cbs.onattribend();
          this._state = 8 /* BeforeAttributeName */;
          return;
      }
      if (!this._decodeEntities || c !== "&") {
          return;
      }
      this._emitToken("onattribdata");
      this._baseState = this._state;
      this._state = 51 /* BeforeEntity */;
      this._sectionStart = this._index;
  };
  /**
   * Handles one character inside a single-quoted attribute value
   * (state InAttributeValueSq, 13).
   *
   * The closing quote emits the pending data (`onattribdata`), ends the
   * attribute (`onattribend`) and goes back to BeforeAttributeName (8). With
   * entity decoding enabled, "&" emits the pending data and moves to
   * BeforeEntity (51), remembering the current state in `_baseState`.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInAttributeValueSingleQuotes = function (c) {
      if (c === "'") {
          this._emitToken("onattribdata");
          this._cbs.onattribend();
          this._state = 8 /* BeforeAttributeName */;
          return;
      }
      if (!this._decodeEntities || c !== "&") {
          return;
      }
      this._emitToken("onattribdata");
      this._baseState = this._state;
      this._state = 51 /* BeforeEntity */;
      this._sectionStart = this._index;
  };
  /**
   * Handles one character inside an unquoted attribute value
   * (state InAttributeValueNq, 14).
   *
   * Whitespace or ">" ends the value: the pending data is emitted
   * (`onattribdata`), the attribute ends (`onattribend`), the state becomes
   * BeforeAttributeName (8) and the terminating character is processed again
   * there. With entity decoding enabled, "&" emits the pending data and moves
   * to BeforeEntity (51), remembering the current state in `_baseState`.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInAttributeValueNoQuotes = function (c) {
      if (whitespace(c) || c === ">") {
          this._emitToken("onattribdata");
          this._cbs.onattribend();
          this._state = 8 /* BeforeAttributeName */;
          this._index--;
          return;
      }
      if (!this._decodeEntities || c !== "&") {
          return;
      }
      this._emitToken("onattribdata");
      this._baseState = this._state;
      this._state = 51 /* BeforeEntity */;
      this._sectionStart = this._index;
  };
  /**
   * Handles the character that follows "<!" (state BeforeDeclaration, 15).
   *
   * "[" may start a CDATA section (BeforeCdata1, 22), "-" may start a comment
   * (BeforeComment, 18), and everything else is treated as a declaration
   * (InDeclaration, 16).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeDeclaration = function (c) {
      if (c === "[") {
          this._state = 22 /* BeforeCdata1 */;
      }
      else if (c === "-") {
          this._state = 18 /* BeforeComment */;
      }
      else {
          this._state = 16 /* InDeclaration */;
      }
  };
  /**
   * Handles one character inside a declaration (state InDeclaration, 16).
   * ">" reports the section through `ondeclaration` and returns to Text (1);
   * every other character is accumulated.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInDeclaration = function (c) {
      if (c !== ">") {
          return;
      }
      this._cbs.ondeclaration(this._getSection());
      this._state = 1 /* Text */;
      this._sectionStart = this._index + 1;
  };
  /**
   * Handles one character inside a processing instruction
   * (state InProcessingInstruction, 17). ">" reports the section through
   * `onprocessinginstruction` and returns to Text (1); every other character
   * is accumulated.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInProcessingInstruction = function (c) {
      if (c !== ">") {
          return;
      }
      this._cbs.onprocessinginstruction(this._getSection());
      this._state = 1 /* Text */;
      this._sectionStart = this._index + 1;
  };
  /**
   * Handles the character that follows "<!-" (state BeforeComment, 18).
   * A second "-" opens a comment (InComment, 19) whose section starts after
   * it; anything else is treated as a declaration (InDeclaration, 16).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeComment = function (c) {
      if (c !== "-") {
          this._state = 16 /* InDeclaration */;
          return;
      }
      this._state = 19 /* InComment */;
      this._sectionStart = this._index + 1;
  };
  /**
   * Handles one character inside a comment (state InComment, 19).
   * A "-" may be the start of the closing "-->" and moves to AfterComment1 (20).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInComment = function (c) {
      if (c !== "-") {
          return;
      }
      this._state = 20 /* AfterComment1 */;
  };
  /**
   * Handles the character after a single "-" inside a comment
   * (state AfterComment1, 20). A second "-" moves to AfterComment2 (21);
   * anything else goes back to InComment (19).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateAfterComment1 = function (c) {
      this._state = c === "-" ? 21 /* AfterComment2 */ : 19 /* InComment */;
  };
  /**
   * Handles the character after "--" inside a comment (state AfterComment2, 21).
   *
   * ">" ends the comment: its text, without the two trailing dashes, is
   * reported through `oncomment` and the tokenizer returns to Text (1).
   * Another "-" keeps the state (so `--->` still terminates), and any other
   * character goes back to InComment (19).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateAfterComment2 = function (c) {
      if (c === "-") {
          return; // stay here, e.g. for `--->`
      }
      if (c !== ">") {
          this._state = 19 /* InComment */;
          return;
      }
      // Drop the two dashes that precede ">".
      this._cbs.oncomment(this._buffer.substring(this._sectionStart, this._index - 2));
      this._state = 1 /* Text */;
      this._sectionStart = this._index + 1;
  };
  /**
   * Handles the character after "<![CDATA" (state BeforeCdata6, 27).
   * "[" opens the CDATA section (InCdata, 28) whose content starts after it.
   * Anything else falls back to InDeclaration (16) and is processed again
   * there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeCdata6 = function (c) {
      if (c !== "[") {
          this._state = 16 /* InDeclaration */;
          this._index--;
          return;
      }
      this._state = 28 /* InCdata */;
      this._sectionStart = this._index + 1;
  };
  /**
   * Handles one character inside a CDATA section (state InCdata, 28).
   * A "]" may be the start of the closing "]]>" and moves to AfterCdata1 (29).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInCdata = function (c) {
      if (c !== "]") {
          return;
      }
      this._state = 29 /* AfterCdata1 */;
  };
  /**
   * Handles the character after a single "]" inside CDATA
   * (state AfterCdata1, 29). A second "]" moves to AfterCdata2 (30);
   * anything else goes back to InCdata (28).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateAfterCdata1 = function (c) {
      this._state = c === "]" ? 30 /* AfterCdata2 */ : 28 /* InCdata */;
  };
  /**
   * Handles the character after "]]" inside CDATA (state AfterCdata2, 30).
   *
   * ">" ends the section: its content, without the two trailing brackets, is
   * reported through `oncdata` and the tokenizer returns to Text (1).
   * Another "]" keeps the state (so `]]]>` still terminates), and any other
   * character goes back to InCdata (28).
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateAfterCdata2 = function (c) {
      if (c === "]") {
          return; // stay here, e.g. for `]]]>`
      }
      if (c !== ">") {
          this._state = 28 /* InCdata */;
          return;
      }
      // Drop the two brackets that precede ">".
      this._cbs.oncdata(this._buffer.substring(this._sectionStart, this._index - 2));
      this._state = 1 /* Text */;
      this._sectionStart = this._index + 1;
  };
  /**
   * Handles the second character of a tag name that began with "s"/"S"
   * (state BeforeSpecial, 31).
   *
   * "c"/"C" continues towards "script" (BeforeScript1, 33) and "t"/"T"
   * towards "style" (BeforeStyle1, 43). Any other character makes this an
   * ordinary tag name: the state becomes InTagName (3) and the character is
   * processed again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeSpecial = function (c) {
      switch (c) {
          case "c":
          case "C":
              this._state = 33 /* BeforeScript1 */;
              break;
          case "t":
          case "T":
              this._state = 43 /* BeforeStyle1 */;
              break;
          default:
              this._state = 3 /* InTagName */;
              this._index--; // look at this character again as part of the tag name
      }
  };
  /**
   * Handles the character after "</s" while a special mode (script/style) is
   * active (state BeforeSpecialEnd, 32).
   *
   * In Script mode "c"/"C" continues towards "</script" (AfterScript1, 38);
   * in Style mode "t"/"T" continues towards "</style" (AfterStyle1, 47).
   * Anything else means this was not the end tag: the tokenizer returns to
   * Text (1) and processes the character again there, so that a "<" at this
   * position (as in `</s</script>`) can still open the real end tag.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeSpecialEnd = function (c) {
      if (this._special === 2 /* Script */ && (c === "c" || c === "C")) {
          this._state = 38 /* AfterScript1 */;
      }
      else if (this._special === 3 /* Style */ &&
          (c === "t" || c === "T")) {
          this._state = 47 /* AfterStyle1 */;
      }
      else {
          this._state = 1 /* Text */;
          this._index--; // look at this character again as text
      }
  };
  /**
   * Handles the character after an opening tag name that reads "script" so
   * far (state BeforeScript5, 37).
   *
   * If the name ends here ("/", ">" or whitespace) Script mode is switched
   * on. In every case the tokenizer continues in InTagName (3) and processes
   * the character again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeScript5 = function (c) {
      var nameEnds = c === "/" || c === ">" || whitespace(c);
      if (nameEnds) {
          this._special = 2 /* Script */;
      }
      this._state = 3 /* InTagName */;
      this._index--; // look at this character again in InTagName
  };
  /**
   * Handles the character after "</script" while in Script mode
   * (state AfterScript5, 42).
   *
   * ">" or whitespace confirms the end tag: Script mode is switched off, the
   * section is rewound by six characters to the start of the name, and the
   * character is processed again in InClosingTagName (6). Anything else means
   * this was not the end tag: the tokenizer returns to Text (1) and processes
   * the character again there, so that a "<" at this position can still open
   * the real end tag.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateAfterScript5 = function (c) {
      if (c === ">" || whitespace(c)) {
          this._special = 1 /* None */;
          this._state = 6 /* InClosingTagName */;
          this._sectionStart = this._index - 6; // length of "script"
          this._index--; // look at this character again in InClosingTagName
      }
      else {
          this._state = 1 /* Text */;
          this._index--; // look at this character again as text
      }
  };
  /**
   * Handles the character after an opening tag name that reads "style" so
   * far (state BeforeStyle4, 46).
   *
   * If the name ends here ("/", ">" or whitespace) Style mode is switched on.
   * In every case the tokenizer continues in InTagName (3) and processes the
   * character again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateBeforeStyle4 = function (c) {
      var nameEnds = c === "/" || c === ">" || whitespace(c);
      if (nameEnds) {
          this._special = 3 /* Style */;
      }
      this._state = 3 /* InTagName */;
      this._index--; // look at this character again in InTagName
  };
  /**
   * Handles the character after "</style" while in Style mode
   * (state AfterStyle4, 50).
   *
   * ">" or whitespace confirms the end tag: Style mode is switched off, the
   * section is rewound by five characters to the start of the name, and the
   * character is processed again in InClosingTagName (6). Anything else means
   * this was not the end tag: the tokenizer returns to Text (1) and processes
   * the character again there, so that a "<" at this position can still open
   * the real end tag.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateAfterStyle4 = function (c) {
      if (c === ">" || whitespace(c)) {
          this._special = 1 /* None */;
          this._state = 6 /* InClosingTagName */;
          this._sectionStart = this._index - 5; // length of "style"
          this._index--; // look at this character again in InClosingTagName
      }
      else {
          this._state = 1 /* Text */;
          this._index--; // look at this character again as text
      }
  };
  /**
   * Resolves a named entity that ended at the current index (typically with
   * a semicolon).
   *
   * The name is the text between the character after `_sectionStart` (the
   * "&") and `_index`. It is looked up in the XML map in XML mode and in the
   * full entity map otherwise. On a hit the decoded value is emitted and the
   * section restarts after the current character; on a miss, or if the name
   * is empty, nothing changes.
   */
  Tokenizer.prototype._parseNamedEntityStrict = function () {
      var nameStart = this._sectionStart + 1; // skip the "&"
      if (nameStart >= this._index) {
          return;
      }
      var name = this._buffer.substring(nameStart, this._index);
      var table = this._xmlMode ? xml_json_1.default : entities_json_1.default;
      if (!Object.prototype.hasOwnProperty.call(table, name)) {
          return;
      }
      this._emitPartial(table[name]);
      this._sectionStart = this._index + 1;
  };
  /**
   * Resolves a legacy entity, i.e. one written without a trailing semicolon.
   *
   * Starting right after the "&" at `_sectionStart`, the longest candidate
   * (at most 6 characters, and no further than `_index`) is tried first and
   * shortened one character at a time down to 2 characters. The first
   * candidate found in the legacy map is emitted and `_sectionStart` is moved
   * past the "&" and the matched name. If nothing matches, nothing changes.
   */
  Tokenizer.prototype._parseLegacyEntity = function () {
      var start = this._sectionStart + 1;
      // Legacy entity names are between 2 and 6 characters long.
      for (var length = Math.min(this._index - start, 6); length >= 2; length--) {
          var candidate = this._buffer.substring(start, start + length);
          if (Object.prototype.hasOwnProperty.call(legacy_json_1.default, candidate)) {
              this._emitPartial(legacy_json_1.default[candidate]);
              this._sectionStart += length + 1;
              return;
          }
      }
  };
  /**
   * Handles one character inside a named entity (state InNamedEntity, 53).
   *
   * - ";" ends the entity: a strict lookup is tried first and, outside XML
   *   mode, a legacy lookup if the name is still unresolved. The tokenizer
   *   then returns to `_baseState`.
   * - ASCII letters and digits are accumulated.
   * - Any other character ends the entity without a semicolon. Outside XML
   *   mode, and only if at least one name character was read, text content
   *   gets a legacy lookup, while attribute values get a strict lookup unless
   *   the character is "=". The tokenizer then returns to `_baseState` and
   *   processes the character again there.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInNamedEntity = function (c) {
      if (c === ";") {
          this._parseNamedEntityStrict();
          // Still unresolved if the strict lookup did not move the section start.
          if (!this._xmlMode && this._sectionStart + 1 < this._index) {
              this._parseLegacyEntity();
          }
          this._state = this._baseState;
          return;
      }
      var isAlphanumeric = (c >= "a" && c <= "z") ||
          (c >= "A" && c <= "Z") ||
          (c >= "0" && c <= "9");
      if (isAlphanumeric) {
          return;
      }
      var hasName = this._sectionStart + 1 !== this._index;
      if (!this._xmlMode && hasName) {
          if (this._baseState === 1 /* Text */) {
              this._parseLegacyEntity();
          }
          else if (c !== "=") {
              this._parseNamedEntityStrict();
          }
      }
      this._state = this._baseState;
      this._index--;
  };
  /**
   * Decodes the numeric entity that ends at the current index and returns to
   * `_baseState`.
   *
   * The digits are the text from `_sectionStart + offset` up to `_index`. If
   * there is at least one digit, the decoded code point is emitted and the
   * section restarts at the terminating character, or after it when `strict`
   * is set. If there are no digits, nothing is emitted and `_sectionStart` is
   * left alone, so the raw "&#" / "&#x" stays in the current section exactly
   * once.
   *
   * @param offset Number of characters before the digits: 2 for "&#", 3 for "&#x".
   * @param base Radix of the digits (10 or 16).
   * @param strict Optional. `true` when the entity was terminated by ";",
   *     which is consumed as part of the entity. Omitted means `false`.
   */
  Tokenizer.prototype._decodeNumericEntity = function (offset, base, strict) {
      var digitsStart = this._sectionStart + offset;
      if (digitsStart !== this._index) {
          var entity = this._buffer.substring(digitsStart, this._index);
          var parsed = parseInt(entity, base);
          this._emitPartial(decode_codepoint_1.default(parsed));
          this._sectionStart = strict ? this._index + 1 : this._index;
      }
      this._state = this._baseState;
  };
  /**
   * Handles one character inside a decimal entity (state InNumericEntity, 54).
   *
   * ";" ends the entity and is consumed with it. Digits are accumulated. Any
   * other character ends the entity as well and is processed again in the
   * base state; in XML mode the entity is then left undecoded.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInNumericEntity = function (c) {
      if (c === ";") {
          this._decodeNumericEntity(2, 10, true);
      }
      else if (c < "0" || c > "9") {
          if (!this._xmlMode) {
              this._decodeNumericEntity(2, 10, false);
          }
          else {
              this._state = this._baseState;
          }
          this._index--;
      }
  };
  /**
   * Handles one character inside a hexadecimal entity (state InHexEntity, 55).
   *
   * ";" ends the entity and is consumed with it. Hex digits are accumulated.
   * Any other character ends the entity as well and is processed again in the
   * base state; in XML mode the entity is then left undecoded.
   *
   * @param c The character at `_index`.
   */
  Tokenizer.prototype._stateInHexEntity = function (c) {
      if (c === ";") {
          this._decodeNumericEntity(3, 16, true);
      }
      else if ((c < "a" || c > "f") &&
          (c < "A" || c > "F") &&
          (c < "0" || c > "9")) {
          if (!this._xmlMode) {
              this._decodeNumericEntity(3, 16, false);
          }
          else {
              this._state = this._baseState;
          }
          this._index--;
      }
  };
  /**
   * Discards the part of the buffer that is no longer needed and keeps
   * `_bufferOffset` in step so absolute positions stay correct.
   *
   * - Negative `_sectionStart`: the whole buffer is dropped.
   * - Paused (`_running` false): nothing is done.
   * - Text state: pending text is emitted, then the whole buffer is dropped.
   * - Other states: if the section has just begun the whole buffer is
   *   dropped; otherwise only the part before the section start is removed.
   *
   * Except in the first two cases `_sectionStart` ends up as 0.
   */
  Tokenizer.prototype._cleanup = function () {
      if (this._sectionStart >= 0) {
          if (!this._running) {
              return;
          }
          var inText = this._state === 1 /* Text */;
          if (!inText && this._sectionStart !== this._index) {
              // In the middle of a token: keep it, drop what precedes it.
              this._buffer = this._buffer.slice(this._sectionStart);
              this._index -= this._sectionStart;
              this._bufferOffset += this._sectionStart;
              this._sectionStart = 0;
              return;
          }
          if (inText && this._sectionStart !== this._index) {
              this._cbs.ontext(this._buffer.slice(this._sectionStart));
          }
      }
      // Nothing in the buffer is needed any more.
      this._buffer = "";
      this._bufferOffset += this._index;
      this._index = 0;
      if (this._sectionStart >= 0) {
          this._sectionStart = 0;
      }
  };
  //TODO make events conditional
  /**
   * Appends `chunk` to the buffer and tokenizes as far as possible.
   *
   * Writing after `.end` has been called is reported through `onerror` and
   * the chunk is ignored, so no further events follow the final `onend`.
   *
   * @param chunk The text to tokenize.
   */
  Tokenizer.prototype.write = function (chunk) {
      if (this._ended) {
          this._cbs.onerror(Error(".write() after done!"));
          return;
      }
      this._buffer += chunk;
      this._parse();
  };
  /**
   * Runs the state machine over the buffer.
   *
   * While the tokenizer is running and characters remain, the handler that
   * belongs to the current state is called with the current character and
   * `_index` then advances by one (handlers that want to see a character
   * again decrement `_index` themselves). An unrecognised state is reported
   * through `onerror`. `_cleanup` is called once the loop stops.
   *
   * The cases are listed in the original order, in which states that are
   * more likely to be hit come first.
   */
  Tokenizer.prototype._parse = function () {
      while (this._index < this._buffer.length && this._running) {
          var c = this._buffer.charAt(this._index);
          switch (this._state) {
              case 1 /* Text */:
                  this._stateText(c);
                  break;
              case 12 /* InAttributeValueDq */:
                  this._stateInAttributeValueDoubleQuotes(c);
                  break;
              case 9 /* InAttributeName */:
                  this._stateInAttributeName(c);
                  break;
              case 19 /* InComment */:
                  this._stateInComment(c);
                  break;
              case 8 /* BeforeAttributeName */:
                  this._stateBeforeAttributeName(c);
                  break;
              case 3 /* InTagName */:
                  this._stateInTagName(c);
                  break;
              case 6 /* InClosingTagName */:
                  this._stateInClosingTagName(c);
                  break;
              case 2 /* BeforeTagName */:
                  this._stateBeforeTagName(c);
                  break;
              case 10 /* AfterAttributeName */:
                  this._stateAfterAttributeName(c);
                  break;
              case 13 /* InAttributeValueSq */:
                  this._stateInAttributeValueSingleQuotes(c);
                  break;
              case 11 /* BeforeAttributeValue */:
                  this._stateBeforeAttributeValue(c);
                  break;
              case 5 /* BeforeClosingTagName */:
                  this._stateBeforeClosingTagName(c);
                  break;
              case 7 /* AfterClosingTagName */:
                  this._stateAfterClosingTagName(c);
                  break;
              case 31 /* BeforeSpecial */:
                  this._stateBeforeSpecial(c);
                  break;
              case 20 /* AfterComment1 */:
                  this._stateAfterComment1(c);
                  break;
              case 14 /* InAttributeValueNq */:
                  this._stateInAttributeValueNoQuotes(c);
                  break;
              case 4 /* InSelfClosingTag */:
                  this._stateInSelfClosingTag(c);
                  break;
              case 16 /* InDeclaration */:
                  this._stateInDeclaration(c);
                  break;
              case 15 /* BeforeDeclaration */:
                  this._stateBeforeDeclaration(c);
                  break;
              case 21 /* AfterComment2 */:
                  this._stateAfterComment2(c);
                  break;
              case 18 /* BeforeComment */:
                  this._stateBeforeComment(c);
                  break;
              case 32 /* BeforeSpecialEnd */:
                  this._stateBeforeSpecialEnd(c);
                  break;
              case 38 /* AfterScript1 */:
                  stateAfterScript1(this, c);
                  break;
              case 39 /* AfterScript2 */:
                  stateAfterScript2(this, c);
                  break;
              case 40 /* AfterScript3 */:
                  stateAfterScript3(this, c);
                  break;
              case 33 /* BeforeScript1 */:
                  stateBeforeScript1(this, c);
                  break;
              case 34 /* BeforeScript2 */:
                  stateBeforeScript2(this, c);
                  break;
              case 35 /* BeforeScript3 */:
                  stateBeforeScript3(this, c);
                  break;
              case 36 /* BeforeScript4 */:
                  stateBeforeScript4(this, c);
                  break;
              case 37 /* BeforeScript5 */:
                  this._stateBeforeScript5(c);
                  break;
              case 41 /* AfterScript4 */:
                  stateAfterScript4(this, c);
                  break;
              case 42 /* AfterScript5 */:
                  this._stateAfterScript5(c);
                  break;
              case 43 /* BeforeStyle1 */:
                  stateBeforeStyle1(this, c);
                  break;
              case 28 /* InCdata */:
                  this._stateInCdata(c);
                  break;
              case 44 /* BeforeStyle2 */:
                  stateBeforeStyle2(this, c);
                  break;
              case 45 /* BeforeStyle3 */:
                  stateBeforeStyle3(this, c);
                  break;
              case 46 /* BeforeStyle4 */:
                  this._stateBeforeStyle4(c);
                  break;
              case 47 /* AfterStyle1 */:
                  stateAfterStyle1(this, c);
                  break;
              case 48 /* AfterStyle2 */:
                  stateAfterStyle2(this, c);
                  break;
              case 49 /* AfterStyle3 */:
                  stateAfterStyle3(this, c);
                  break;
              case 50 /* AfterStyle4 */:
                  this._stateAfterStyle4(c);
                  break;
              case 17 /* InProcessingInstruction */:
                  this._stateInProcessingInstruction(c);
                  break;
              case 53 /* InNamedEntity */:
                  this._stateInNamedEntity(c);
                  break;
              case 22 /* BeforeCdata1 */:
                  stateBeforeCdata1(this, c);
                  break;
              case 51 /* BeforeEntity */:
                  stateBeforeEntity(this, c);
                  break;
              case 23 /* BeforeCdata2 */:
                  stateBeforeCdata2(this, c);
                  break;
              case 24 /* BeforeCdata3 */:
                  stateBeforeCdata3(this, c);
                  break;
              case 29 /* AfterCdata1 */:
                  this._stateAfterCdata1(c);
                  break;
              case 30 /* AfterCdata2 */:
                  this._stateAfterCdata2(c);
                  break;
              case 25 /* BeforeCdata4 */:
                  stateBeforeCdata4(this, c);
                  break;
              case 26 /* BeforeCdata5 */:
                  stateBeforeCdata5(this, c);
                  break;
              case 27 /* BeforeCdata6 */:
                  this._stateBeforeCdata6(c);
                  break;
              case 55 /* InHexEntity */:
                  this._stateInHexEntity(c);
                  break;
              case 54 /* InNumericEntity */:
                  this._stateInNumericEntity(c);
                  break;
              case 52 /* BeforeNumericEntity */:
                  stateBeforeNumericEntity(this, c);
                  break;
              default:
                  this._cbs.onerror(Error("unknown _state"), this._state);
          }
          this._index++;
      }
      this._cleanup();
  };
  /**
   * Pauses the tokenizer by clearing `_running`; the parse loop checks this
   * flag before every character.
   */
  Tokenizer.prototype.pause = function () {
      this._running = false;
  };
  /**
   * Resumes a paused tokenizer: marks it as running, continues parsing if
   * unread characters remain in the buffer, and finishes if `.end` has been
   * called.
   */
  Tokenizer.prototype.resume = function () {
      this._running = true;
      var hasUnreadInput = this._index < this._buffer.length;
      if (hasUnreadInput) {
          this._parse();
      }
      if (this._ended) {
          this._finish();
      }
  };
  /**
   * Signals the end of the input.
   *
   * An optional final `chunk` is written first. The tokenizer is then marked
   * as ended and, unless it is paused, finishes right away (a paused
   * tokenizer finishes when it is resumed).
   *
   * Calling `.end` again is reported through `onerror` and otherwise
   * ignored, so `onend` is not emitted a second time.
   *
   * @param chunk Optional last piece of input.
   */
  Tokenizer.prototype.end = function (chunk) {
      if (this._ended) {
          this._cbs.onerror(Error(".end() after done!"));
          return;
      }
      if (chunk) {
          this.write(chunk);
      }
      this._ended = true;
      if (this._running) {
          this._finish();
      }
  };
  /**
   * Finishes tokenizing: any data left in the current section is passed to
   * `_handleTrailingData`, then `onend` is emitted.
   */
  Tokenizer.prototype._finish = function () {
      var hasTrailingData = this._sectionStart < this._index;
      if (hasTrailingData) {
          this._handleTrailingData();
      }
      this._cbs.onend();
  };
  /**
   * Emits, in the most fitting way, the data that was still pending when the
   * input ended.
   *
   * - Inside CDATA (states 28-30) the rest is reported through `oncdata`.
   * - Inside a comment (states 19-21) it is reported through `oncomment`.
   * - Inside an entity (states 53-55, outside XML mode) the entity is decoded
   *   first; if data is still left afterwards it is handled again as part of
   *   the base state.
   * - Inside a tag (tag name, attribute or closing tag name states) the rest
   *   is dropped.
   * - Everything else is reported through `ontext`.
   */
  Tokenizer.prototype._handleTrailingData = function () {
      var state = this._state;
      var data = this._buffer.slice(this._sectionStart);
      if (state === 28 /* InCdata */ ||
          state === 29 /* AfterCdata1 */ ||
          state === 30 /* AfterCdata2 */) {
          this._cbs.oncdata(data);
          return;
      }
      if (state === 19 /* InComment */ ||
          state === 20 /* AfterComment1 */ ||
          state === 21 /* AfterComment2 */) {
          this._cbs.oncomment(data);
          return;
      }
      var inEntity = state === 53 /* InNamedEntity */ ||
          state === 54 /* InNumericEntity */ ||
          state === 55 /* InHexEntity */;
      if (inEntity && !this._xmlMode) {
          if (state === 53 /* InNamedEntity */) {
              this._parseLegacyEntity();
          }
          else if (state === 54 /* InNumericEntity */) {
              this._decodeNumericEntity(2, 10);
          }
          else {
              this._decodeNumericEntity(3, 16);
          }
          // Whatever the entity did not consume belongs to the base state.
          if (this._sectionStart < this._index) {
              this._state = this._baseState;
              this._handleTrailingData();
          }
          return;
      }
      switch (state) {
          case 3 /* InTagName */:
          case 8 /* BeforeAttributeName */:
          case 11 /* BeforeAttributeValue */:
          case 10 /* AfterAttributeName */:
          case 9 /* InAttributeName */:
          case 13 /* InAttributeValueSq */:
          case 12 /* InAttributeValueDq */:
          case 14 /* InAttributeValueNq */:
          case 6 /* InClosingTagName */:
              // Unfinished tag: ignore the remaining data.
              //TODO add a way to remove current tag
              return;
          default:
              this._cbs.ontext(data);
      }
  };
  /**
   * Returns the position of the current character relative to the start of
   * all input written so far: the number of characters already dropped from
   * the buffer plus the index inside the buffer.
   */
  Tokenizer.prototype.getAbsoluteIndex = function () {
      var discarded = this._bufferOffset;
      return discarded + this._index;
  };
  /**
   * Returns the text of the current section: the buffer contents from
   * `_sectionStart` up to, but not including, `_index`.
   */
  Tokenizer.prototype._getSection = function () {
      var from = this._sectionStart;
      var to = this._index;
      return this._buffer.substring(from, to);
  };
  /**
   * Passes the current section to the callback called `name` and then marks
   * the section as consumed by setting `_sectionStart` to -1.
   *
   * @param name Name of the callback on `_cbs` to invoke.
   */
  Tokenizer.prototype._emitToken = function (name) {
      var callbacks = this._cbs;
      callbacks[name](this._getSection());
      this._sectionStart = -1;
  };
  /**
   * Emits a decoded value to the callback that fits the base state:
   * `ontext` when the base state is Text (1), `onattribdata` otherwise.
   *
   * @param value The decoded text to emit.
   */
  Tokenizer.prototype._emitPartial = function (value) {
      if (this._baseState === 1 /* Text */) {
          this._cbs.ontext(value);
          return;
      }
      this._cbs.onattribdata(value); //TODO implement the new event
  };
  857. return Tokenizer;
  858. }());
  859. exports.default = Tokenizer;