aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2021-04-08 08:42:01 +0200
committer Martin Fischer <martin@push-f.com> 2021-04-08 15:40:37 +0200
commit 57e7eefcbe6fb8c3dc4b01c707be9de4c34963a7 (patch)
tree 6a9d296389bf3023396592c8514ed6712e011c7f
import https://github.com/servo/html5ever
commit d1206daa740305f55a5fa159e43eb33afc359cb4
-rw-r--r-- Cargo.toml 30
-rw-r--r-- LICENSE-APACHE 201
-rw-r--r-- LICENSE-MIT 25
-rw-r--r-- benches/html5ever.rs 81
-rw-r--r-- build.rs 33
-rw-r--r-- data/bench/lipsum-zh.html 19
-rw-r--r-- data/bench/lipsum.html 40
-rw-r--r-- data/bench/medium-fragment.html 24
-rw-r--r-- data/bench/small-fragment.html 7
-rw-r--r-- data/bench/strong.html 1
-rw-r--r-- data/bench/tiny-fragment.html 1
-rw-r--r-- examples/arena.rs 335
-rw-r--r-- examples/capi/tokenize.c 74
-rw-r--r-- examples/noop-tokenize.rs 43
-rw-r--r-- examples/noop-tree-builder.rs 112
-rw-r--r-- examples/print-tree-actions.rs 177
-rw-r--r-- examples/tokenize.rs 103
-rw-r--r-- fuzz/.gitignore 4
-rw-r--r-- fuzz/Cargo.toml 27
-rw-r--r-- fuzz/fuzz_targets/fuzz_document_parse.rs 35
-rw-r--r-- macros/match_token.rs 464
-rw-r--r-- src/driver.rs 137
-rw-r--r-- src/lib.rs 30
-rw-r--r-- src/macros.rs 33
-rw-r--r-- src/serialize/mod.rs 256
-rw-r--r-- src/tokenizer/char_ref/mod.rs 449
-rw-r--r-- src/tokenizer/interface.rs 110
-rw-r--r-- src/tokenizer/mod.rs 1713
-rw-r--r-- src/tokenizer/states.rs 93
-rw-r--r-- src/tree_builder/data.rs 171
-rw-r--r-- src/tree_builder/mod.rs 1681
-rw-r--r-- src/tree_builder/rules.rs 1449
-rw-r--r-- src/tree_builder/tag_sets.rs 115
-rw-r--r-- src/tree_builder/types.rs 95
-rw-r--r-- src/util/str.rs 60
35 files changed, 8228 insertions, 0 deletions
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..3caf8c6
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+
+name = "html5ever"
+version = "0.25.1"
+authors = [ "The html5ever Project Developers" ]
+# SPDX license expression; Cargo deprecates the "/" separator in favour of "OR".
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/servo/html5ever"
+description = "High-performance browser-grade HTML5 parser"
+documentation = "https://docs.rs/html5ever"
+build = "build.rs"
+categories = [ "parser-implementations", "web-programming" ]
+edition = "2018"
+
+[dependencies]
+log = "0.4"
+mac = "0.1"
+markup5ever = { version = "0.10", path = "../markup5ever" }
+
+[dev-dependencies]
+typed-arena = "1.3.0"
+criterion = "0.3"
+
+[build-dependencies]
+quote = "1"
+syn = { version = "1", features = ["extra-traits", "full", "fold"] }
+proc-macro2 = "1"
+
+# benches/html5ever.rs gets its entry point from `criterion_main!`, so the
+# default libtest bench harness is switched off.
+[[bench]]
+name = "html5ever"
+harness = false
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644
index 0000000..16fe87b
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..6e45102
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2014 The html5ever Project Developers
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/benches/html5ever.rs b/benches/html5ever.rs
new file mode 100644
index 0000000..ff20c4f
--- /dev/null
+++ b/benches/html5ever.rs
@@ -0,0 +1,81 @@
+#[macro_use]
+extern crate criterion;
+extern crate html5ever;
+
+use std::fs;
+use std::path::PathBuf;
+
+use criterion::{black_box, Criterion};
+
+use html5ever::tendril::*;
+use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
+
+/// A token sink that throws away everything the tokenizer hands it.
+struct Sink;
+
+impl TokenSink for Sink {
+    type Handle = ();
+
+    fn process_token(&mut self, token: Token, _line: u64) -> TokenSinkResult<Self::Handle> {
+        // The token itself is never inspected. Passing it through `black_box`
+        // stops the optimizer from eliminating the work that produced it.
+        drop(black_box(token));
+        TokenSinkResult::Continue
+    }
+}
+
+/// Registers a criterion benchmark named `html tokenizing <name>` that feeds
+/// the contents of `data/bench/<name>` to the tokenizer in small chunks.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened or read, or if its contents cannot be
+/// reinterpreted as a `StrTendril`.
+fn run_bench(c: &mut Criterion, name: &str) {
+    let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("data/bench/")
+        .join(name);
+    // `expect` directly on the `Result` keeps the underlying I/O error in the
+    // panic message instead of discarding it.
+    let mut file = fs::File::open(&path).expect("can't open file");
+
+    // Read the file and treat it as an infinitely repeating sequence of characters.
+    let mut file_input = ByteTendril::new();
+    file.read_to_tendril(&mut file_input)
+        .expect("can't read file");
+    let file_input: StrTendril = file_input
+        .try_reinterpret()
+        .expect("benchmark input is not valid UTF-8");
+    let size = file_input.len();
+    let mut stream = file_input.chars().cycle();
+
+    // Break the input into chunks of 1024 chars (= a few kB).
+    // This simulates reading from the network.
+    let mut input = vec![];
+    let mut total = 0usize;
+    while total < size {
+        // The by_ref() call is important, otherwise we get wrong results!
+        // See rust-lang/rust#18045.
+        let sz = (size - total).min(1024);
+        input.push(stream.by_ref().take(sz).collect::<String>().to_tendril());
+        total += sz;
+    }
+
+    let test_name = format!("html tokenizing {}", name);
+
+    c.bench_function(&test_name, move |b| {
+        b.iter(|| {
+            let mut tok = Tokenizer::new(Sink, Default::default());
+            let mut buffer = BufferQueue::new();
+            // The queue takes ownership of every chunk pushed into it, and the
+            // same input is replayed on each iteration, so each chunk is cloned.
+            // Only the chunks are cloned, not the `Vec` that holds them.
+            for buf in input.iter().cloned() {
+                buffer.push_back(buf);
+                let _ = tok.feed(&mut buffer);
+            }
+            let _ = tok.feed(&mut buffer);
+            tok.end();
+        })
+    });
+}
+
+/// Registers one tokenizer benchmark for each sample document listed below.
+fn html5ever_benchmark(c: &mut Criterion) {
+    const SAMPLES: [&str; 6] = [
+        "lipsum.html",
+        "lipsum-zh.html",
+        "medium-fragment.html",
+        "small-fragment.html",
+        "tiny-fragment.html",
+        "strong.html",
+    ];
+
+    for &sample in SAMPLES.iter() {
+        run_bench(c, sample);
+    }
+}
+
+criterion_group!(benches, html5ever_benchmark);
+criterion_main!(benches);
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..bfac771
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,33 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::env;
+use std::path::Path;
+use std::thread::Builder;
+
+#[path = "macros/match_token.rs"]
+mod match_token;
+
+/// Build script entry point.
+///
+/// Hands `src/tree_builder/rules.rs` (relative to the crate root) to
+/// `match_token::expand` as the input path, with `rules.rs` inside Cargo's
+/// `OUT_DIR` as the output path, and tells Cargo to rerun the script whenever
+/// the input file changes.
+///
+/// # Panics
+///
+/// Panics if `CARGO_MANIFEST_DIR` or `OUT_DIR` is not set, if the worker
+/// thread cannot be spawned, or if the worker thread itself panics.
+fn main() {
+    // `var_os` rather than `var`: these values are only used as paths, so they
+    // do not need to be valid UTF-8.
+    let manifest_dir = env::var_os("CARGO_MANIFEST_DIR")
+        .expect("CARGO_MANIFEST_DIR is not set; build.rs must be run by Cargo");
+    let out_dir =
+        env::var_os("OUT_DIR").expect("OUT_DIR is not set; build.rs must be run by Cargo");
+
+    let input = Path::new(&manifest_dir).join("src/tree_builder/rules.rs");
+    let output = Path::new(&out_dir).join("rules.rs");
+    println!("cargo:rerun-if-changed={}", input.display());
+
+    // We have stack overflows on Servo's CI, so the expansion runs on its own
+    // thread with an explicitly sized (128 MiB) stack.
+    let handle = Builder::new()
+        .stack_size(128 * 1024 * 1024)
+        .spawn(move || {
+            match_token::expand(&input, &output);
+        })
+        .expect("failed to spawn the match_token expansion thread");
+
+    handle
+        .join()
+        .expect("the match_token expansion thread panicked");
+}
diff --git a/data/bench/lipsum-zh.html b/data/bench/lipsum-zh.html
new file mode 100644
index 0000000..1efe2fa
--- /dev/null
+++ b/data/bench/lipsum-zh.html
@@ -0,0 +1,19 @@
+甀 曒檃檑 糲蘥蠩 櫋瀩, 嗢 剆坲姏 齸圞趲 葠蜄蛖 砎粁 擙樲橚 噅尰崺 廘榙榾 誙 煘煓, 腶 敔耜 逯郹酟 蝪蝩覤 顲鱭鸋, 趍 櫱瀯灂 碄碆碃 矠筸 砫粍 耜僇鄗 搋朠楟 溔 齝囃 槏 鼏噳墺 滭滹漇, 骱 翀胲胵 蝑蝞蝢 鑅鷖
+
+痯 荾莯 驧鬤鸕 梪涫湴, 踙 黈龠懱 椼毸溠 蠬襱覾 滱漮, 耜僇鄗 沀皯竻 饇馦 蒏 斠 墐墆墏 艎艑蔉 貕貔 廑憀慡 嫬廙彯 鳻嶬 跿, 飹勫嫢 熤熡磎 慛 賗跿, 灂瀿 綧 摿斠榱 橀槶澉 碄碆碃 鯦鯢鯡 踾踶輵 鍌鍗鍷 溿 滭滹, 綧 藙藨 蝪蝩覤 渮湸湤, 輗 鰝鰨 犌犐瑆 櫞氌瀙 鵳齖齘 塝 寁崏 摨敹暯 檌檒濦 滭滹漇, 撖 輈鄍 婸媥媕 漦澌潬, 膣 姛帡恦 莃荶衒 昢炾
+
+儮嬼懫 馦騧騜 覛谼貆 墏壾 鋱, 緦 豥趍 翍脝艴 絟缾臮 摲 輴郺 篧糑縒 獧瞝瞣 袀豇貣, 廞 鶄鵳 肒芅邥 泏狔狑 覛谼貆 儋圚墝 滭滹漇 鰝鰨 蔰, 忁曨曣 蝪蝩覤 埱娵徖 萴葂 跬, 緷 巂鞪 晛桼桾 踥踕踛 翣聜蒢 虥諰諨 箄縴儳 磼簎 殠, 銇 烺焆琀 鱐鱍鱕 垽娭屔 齫儽, 蒮 靮傿 烍烚珜 蒝蒧蓏 璈皞緪 圪妀 綧 溮煡煟 轛轝酅 濷瓂癚, 篧糑縒 谾踘遳 讘麡 腶, 鯦鯢鯡 邆錉霋 鋱 蛚袲 鋱鋟鋈 瀷瀹藶 騉鬵 嗢
+
+蝺 鰔鶟 濇燖燏 梪涫湴 齫儽戃, 馺 髬魆 齴讘麡 袟袘觕, 甀瞂硾 鍹餳駷 邆錉霋 曮禷 瑽 虰豖 瀿犨皫 蜬蝁蜠 檹瀔濼 榯, 獝瘝磈 輣鋄銶 抏旲 諃 褌 緳廞徲 轞騹鼚 瘵瘲 媥媕 踙 簎艜薤 鸙讟钃
+
+滘 鐩闤鞿 轞騹鼚 絟缾臮 碃稘, 鮥鴮 輗 渳湥牋 獿譿躐 趉軨鄇 鋑鋡髬 嶜憃撊 磑 棳棔 滜溙 蔏 烺焆琀 鱐鱍鱕 撌斳暩 緅 彃慔 釢髟偛 礯籔羻
+
+鏾鐇闠 擙樲橚 塓塕 慔 笢笣 壾 婸媥媕 奫嫮嫳, 愄揎揇 趡趛踠 瑽 秎穾, 腤萰 蛃袚觙 玝甿虮 濆澓澋 魦 綧 瘱瘵瘲 擙樲橚 瞵瞷矰 璈皞, 腠腶舝 翣聜蒢 魵 潧潣, 慖摲摓 橍殧澞 蟷蠉蟼 摮 嗢嗂 誙賗跿 磏磑禠 蝩覤 穊 鷕黰戄 鼀齕櫌 殔湝 緦, 緁 瘱瘵瘲 鸃鼞欘 窞綆腤 嗼嗹 輷邆 壿 櫱瀯灂 鶭黮齥 鏙闛颾, 眊砎粁 硻禂稢 薢蟌 鋈, 榎榯槄 墂嫫嵾 毄滱漮 豥 髟偛
+
+掭掝 暲 瞵瞷矰 鬄鵊鵙 鍎鞚韕, 齞齝囃 脬舑莕 蔍 嫳嫬 絼綒 縸縩薋 毊灚襳 珝砯砨 嵧 裌覅詵 崸嵀惉 慛 碞碠 蒮 橁橖澭 摨敹暯 罫蓱蒆 嵥嵧 蟷蠉 滆 櫱瀯灂 鶟儹巏 瘑睯碫
+
+滈 簎艜薤 廑憀慡 鑴鱱爧 屼汆, 歅 彔抳 鏾鐇闠 桏毢涒 垽娭屔 磝磢磭 袟袘觕 鍌鍗鍷 鋈 氠洷, 棳棔 雈靮傿 臡虈觿 氃濈瀄 槄 橀槶澉 麷劻穋 嘽 簅縭, 狑玝 垥娀庣 僤凘墈 岯岪弨 摲, 馺骱魡 抩枎殀 迗俀侹 蓪 錛鍆 蔰 暯樧 璸瓁穟 瘑睯碫 濍燂犝, 犵艿邔 獧瞝瞣 馻噈嫶 蝢褗 僣, 嬨嶵 壿 蠝襭譸 痑祣筇 觛詏貁 蜙 珶珸珿 濷瓂癚 箑箖 嗼嗹墋 峷敊浭 阰刲 鄜, 柦柋牬 寁崏庲 礯籔羻 鋍鞎 鉾 跠跬 蜸 勯噚嶢 礌簨繖 醳鏻鐆
+
+蟷蠉蟼 熩熝犚 摓 髽鮛 顤鰩鷎 駍駔鳿 鸃鼞欘 褅 牬玾 殍涾烰 誽賚賧 鴸鼢曘 搋朠 殟 蟼襛 溔 嶵嶯幯 蒘蝆蜪, 蟣襋 溿煔煃 銇韎餀 蹸蹪鏂 摮 踸躽 踣 廦廥彋 鼀齕櫌, 靾鞂 虥諰諨 婸媥媕 毄滱漮 魆 蒛 裧頖 鍆錌雔 枅杺枙 堔埧娾, 蓂蓌蓖 噾噿嚁 洷炟砏 砎粁 鋱, 嬼懫 杍肜阰 麷劻穋 蔊蓴蔖 豥
+
+暕 忀瀸蘌 褣諝趥 髽鮛 滍 噾噿 顤鰩鷎 逯郹酟 樏殣氀 煻獌 蚔趵郚 枲柊氠 鄃鈌鈅 暕, 禖穊稯 鄨鎷闒 鏾鐇闠 蒝蒧 誙 賌輈鄍 鶊鵱鶆 毊灚襳 珋疧 滘 瀗犡礝 簻臗藱 駔鳿 磑, 墐 圩芰敔 婂崥崣 溹溦滜 鍗鍷
diff --git a/data/bench/lipsum.html b/data/bench/lipsum.html
new file mode 100644
index 0000000..27dc14a
--- /dev/null
+++ b/data/bench/lipsum.html
@@ -0,0 +1,40 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer eu arcu varius, fringilla nisi non, pulvinar lorem. Nam et sollicitudin nisi, eget tempus sapien. Suspendisse ac libero velit. Proin semper lacinia posuere. Morbi sollicitudin lacinia urna, eget aliquet risus lobortis sit amet. Fusce rhoncus sodales mauris, a rutrum erat elementum id. Integer nec sapien sit amet nisl convallis vehicula eu eu augue. Etiam nec elit ac nibh lacinia porta. Integer dapibus feugiat magna, eget varius ante vestibulum vel. Vestibulum vitae felis quis est tristique varius quis eget libero. Nullam tincidunt magna eros, nec luctus ante pretium at. Aenean laoreet justo vitae risus fringilla convallis. In malesuada scelerisque lorem, sed luctus tortor varius at. Morbi odio ligula, commodo eu sodales vitae, bibendum eget leo. In odio est, laoreet sit amet eleifend at, placerat in elit.
+
+Nullam ac viverra elit. Vestibulum et massa vel justo bibendum imperdiet. Donec elementum vitae nibh sit amet pellentesque. Ut id fringilla sem, in tincidunt quam. In a dui dignissim, gravida magna in, porta ante. Integer adipiscing porta risus. Nulla facilisi. Cras erat leo, tempor a ligula in, posuere ullamcorper nulla. Maecenas id auctor elit, imperdiet sagittis augue. Curabitur consectetur suscipit lorem porta sollicitudin. Etiam turpis orci, eleifend eu felis in, placerat consequat est. Sed ultrices, tellus ut volutpat venenatis, metus lectus malesuada diam, id ornare risus lectus sed massa. Vivamus mauris diam, lobortis ut interdum eget, porta a elit. Suspendisse potenti.
+
+Donec tincidunt nisi sed mollis feugiat. Mauris ultricies risus non eros feugiat tempor. In aliquam ut nunc id tempor. Curabitur vel elit dolor. Mauris ullamcorper tortor ac nisl feugiat, quis gravida nisl ullamcorper. Pellentesque a ligula quis erat rutrum sollicitudin in a metus. Aliquam ligula massa, cursus in libero a, blandit feugiat tortor. In ac auctor lorem. Ut faucibus leo nec egestas tristique.
+
+Nulla adipiscing consectetur odio, a iaculis eros aliquam at. Nullam dapibus ac ante et convallis. Phasellus tempor arcu velit. Donec adipiscing neque eu molestie mattis. Vestibulum id elit fringilla, ultrices orci eu, rhoncus purus. Mauris ornare nisi massa, et luctus tortor tincidunt vel. Maecenas eu ultrices enim, et varius est. Integer ipsum nunc, suscipit eu dapibus ac, ornare vitae sapien. Vestibulum posuere, nulla sed dictum tempus, magna metus commodo turpis, a aliquet orci tellus eu lectus. Mauris nulla magna, malesuada vitae iaculis ut, facilisis varius sem. In tristique sapien urna, et tristique dolor lacinia non. Suspendisse eu tincidunt eros. Pellentesque dignissim elit vitae purus auctor, non malesuada dolor scelerisque.
+
+Cras commodo tortor at risus ornare euismod a et risus. Sed rutrum, justo vel mollis condimentum, mi elit consectetur mi, non ultricies quam orci mollis sapien. Donec tincidunt, lacus molestie porttitor elementum, enim ligula hendrerit lacus, quis porttitor magna velit sed nisi. Quisque pretium eros id sem posuere consequat id sit amet nunc. Fusce pulvinar commodo ipsum, quis congue tellus faucibus eu. Sed bibendum dolor vitae ante porttitor pretium. Integer id malesuada eros, sed tristique metus. Nunc vitae turpis eu risus sodales vestibulum quis ut magna. In eget metus elit. Donec gravida libero risus, eget tempus erat varius eu. Vestibulum id dignissim sapien. Fusce pretium posuere lacus. Aliquam ac arcu sollicitudin, lacinia tellus vitae, pellentesque tortor. Mauris viverra velit ac lacus egestas sagittis. Duis auctor interdum tincidunt. Aenean eu ullamcorper sapien, sit amet sollicitudin magna.
+
+Nam vel lorem a quam sollicitudin fringilla sit amet quis nibh. Quisque commodo molestie augue. Vivamus ut erat aliquet, gravida ante at, suscipit arcu. Fusce nulla massa, lobortis vel dictum non, vehicula ac lorem. Etiam blandit sodales urna, at aliquet libero dapibus a. Cras odio mauris, porta at enim vitae, aliquam tincidunt libero. Praesent at tortor eu eros cursus consequat vel non elit. Mauris risus urna, sagittis eget turpis eu, malesuada semper nisl. Nunc posuere placerat ligula, in tristique urna pharetra et. Duis consectetur mauris nulla. Etiam auctor tincidunt molestie. Fusce eu faucibus diam, nec fermentum felis. Curabitur non lacinia quam, non luctus neque. Morbi sed ultrices diam.
+
+Fusce accumsan nisl sed nibh fringilla euismod. In ut arcu cursus erat imperdiet porttitor. Pellentesque tempus, nisi quis viverra convallis, eros sem dapibus magna, ut aliquet quam urna vitae dolor. Aenean id tortor turpis. Etiam lacinia arcu lorem, in consectetur arcu placerat sed. Duis non est ornare, dictum mi sit amet, cursus nunc. Suspendisse at venenatis massa. Etiam eget lorem diam. Donec tristique sapien at scelerisque porta. Aenean ornare ligula sed nibh gravida, vel commodo erat ultrices. Donec id enim purus. Vivamus malesuada tristique sapien id tempus. Morbi nec nunc dolor.
+
+Aliquam molestie turpis cursus blandit blandit. Integer imperdiet ullamcorper arcu, a fermentum nisi. Cras hendrerit quam id mollis elementum. Etiam ut erat ac leo posuere aliquet eget non tortor. Nam vel velit sed dui tincidunt gravida eget eget risus. Suspendisse adipiscing sed nulla vel molestie. Aliquam suscipit, sem sed volutpat sagittis, magna enim feugiat erat, pharetra feugiat magna neque a ante. Duis at metus eget leo congue molestie. Vivamus id massa ornare, rutrum ante nec, ullamcorper lacus. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Vestibulum lobortis arcu eu arcu hendrerit convallis. Integer mollis velit at ante consequat, eu pharetra erat venenatis. Integer tincidunt sit amet massa vel hendrerit. Morbi malesuada facilisis augue sed congue. Phasellus porttitor vel mi eu imperdiet. Aenean tincidunt, massa et tristique mollis, nisl metus vulputate est, quis sollicitudin metus ipsum vel felis.
+
+Suspendisse nec feugiat dui. Proin nec lorem semper, dignissim leo et, suscipit turpis. In posuere sem ut blandit scelerisque. Fusce vel ultricies augue, adipiscing pretium lacus. Mauris ac dui non odio convallis pellentesque. Curabitur posuere nec odio ut sodales. Morbi varius risus lacinia, convallis mauris in, tristique turpis.
+
+Vivamus hendrerit justo augue, et molestie ligula aliquam ac. Nunc nec vehicula felis. Donec quam lacus, commodo sollicitudin aliquet eu, aliquam ut leo. Donec vulputate arcu urna, in molestie orci faucibus non. Praesent ut ullamcorper ante. Quisque sollicitudin libero in arcu gravida, quis scelerisque tortor volutpat. Nulla ornare mi ac odio sagittis rutrum. Sed quis sagittis felis. Praesent bibendum orci sed risus elementum, malesuada posuere massa condimentum. Sed velit nunc, pulvinar eu feugiat at, ultrices eu odio. Mauris lacinia ut odio eget ornare. Nullam commodo mollis lorem, ac vehicula justo tristique a.
+
+Morbi est ipsum, egestas a urna sed, aliquet tempus ipsum. In eget fermentum libero. Nullam a sodales dui. Nam imperdiet condimentum luctus. Morbi bibendum at nulla sed aliquam. Quisque nibh nibh, sollicitudin non ullamcorper commodo, viverra non metus. Suspendisse eleifend turpis massa. Cras tortor metus, rutrum sit amet tellus a, sodales suscipit eros. Sed in vulputate ligula. Integer posuere velit sed nisl tristique suscipit. Quisque bibendum eleifend enim in sollicitudin. Phasellus tincidunt orci pretium, molestie felis eu, sodales metus.
+
+Vestibulum consectetur orci ut blandit aliquet. Sed posuere cursus lacus vestibulum posuere. Phasellus ut risus sem. Vivamus et purus non felis pellentesque lacinia. Phasellus aliquam, diam eget vestibulum lobortis, purus tortor porttitor eros, vitae auctor lorem velit a turpis. Integer eu metus vel nisi porta lobortis sollicitudin eget arcu. Maecenas ac blandit dolor. In et sapien ornare, dignissim nulla quis, tempor odio.
+
+Ut nec quam ligula. Ut euismod, nisi nec iaculis faucibus, nisi arcu dignissim neque, a fringilla dolor tellus ut arcu. Curabitur iaculis rhoncus orci sed fermentum. Cras augue elit, eleifend sodales pellentesque ac, varius bibendum nulla. Etiam id diam non purus porta lobortis. Cras fringilla metus in ipsum laoreet placerat. Integer vel quam nec libero varius mattis in non nibh.
+
+Pellentesque adipiscing feugiat neque, vitae imperdiet dui. Duis pharetra elit a dictum laoreet. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nulla vulputate malesuada nisi, vel egestas nulla mollis ut. Nunc faucibus pharetra leo ac ultricies. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus in odio a magna convallis molestie ut at mauris. Morbi bibendum id dui id imperdiet. Curabitur volutpat et erat quis venenatis. Integer tincidunt et felis sed rutrum. Donec vitae porttitor enim. Sed nisi nunc, auctor ac ullamcorper quis, eleifend id metus.
+
+Morbi felis est, tincidunt at eros at, interdum tempor tortor. Nam et semper metus. Vivamus lacinia pulvinar magna, a lacinia ligula condimentum vitae. Donec vitae ullamcorper diam. Aenean auctor mollis tincidunt. Mauris hendrerit eros quis nulla posuere, non mattis tellus venenatis. Fusce et ligula nec arcu consequat pulvinar. Maecenas sagittis odio justo, at ultrices velit aliquet quis. In hac habitasse platea dictumst. Suspendisse viverra nunc vitae lectus bibendum, vel pretium arcu pretium. Curabitur iaculis condimentum magna ac rutrum. Aenean placerat massa nunc, id vehicula magna vulputate eget. Integer dignissim nunc in enim bibendum consequat vitae id leo. Mauris quis aliquam quam. Suspendisse vel fringilla purus. Mauris sodales dui vitae lacus pellentesque tincidunt a eget nunc.
+
+Nullam imperdiet vestibulum magna nec dictum. Vestibulum scelerisque vestibulum congue. Phasellus fermentum pulvinar elit, eget fringilla arcu vestibulum sed. Mauris pretium nulla in consectetur cursus. Cras malesuada est vulputate hendrerit bibendum. Aenean a tristique diam, ac convallis ipsum. Nunc ac justo ut ante tristique pulvinar. Donec ornare leo sed iaculis rutrum. Integer tincidunt vestibulum massa scelerisque accumsan. Maecenas malesuada, orci at tincidunt faucibus, ipsum velit condimentum odio, vitae cursus risus justo vel orci. Interdum et malesuada fames ac ante ipsum primis in faucibus. Vivamus eu tincidunt leo. Nam a faucibus ipsum, in convallis ligula. Fusce urna lorem, iaculis ut pharetra a, laoreet a mauris. Maecenas molestie justo enim, vitae tincidunt nulla dictum quis.
+
+Ut ac purus ut velit feugiat tincidunt nec sit amet lorem. Mauris nulla sapien, rhoncus a condimentum et, tincidunt ut enim. Nullam eu rhoncus ante. Proin eget erat est. Vivamus suscipit fringilla metus, ut scelerisque urna. Vivamus id porta nibh, ac tincidunt nisl. Vivamus commodo tincidunt turpis a molestie. Phasellus nec interdum enim. Cras accumsan tristique massa.
+
+Cras vitae blandit dolor. Sed purus sem, pharetra sed orci eu, fermentum porttitor magna. Morbi dictum gravida sodales. Pellentesque varius non quam in ullamcorper. Sed in mauris sit amet sapien tempus gravida. Aliquam suscipit nulla a risus ullamcorper, et pharetra leo pharetra. Pellentesque neque lectus, molestie et eros id, consequat sagittis arcu. Nullam suscipit ipsum id lacus tincidunt sollicitudin. Fusce eget leo non massa tempor scelerisque ut a enim. Vestibulum a elementum ligula. Aliquam vehicula semper nibh nec imperdiet. Interdum et malesuada fames ac ante ipsum primis in faucibus. Etiam pretium ante eget lectus rutrum auctor.
+
+Sed pharetra quam metus. Aenean ac rutrum arcu. Donec sit amet pharetra nulla, vitae porttitor eros. Nullam accumsan cursus dolor, ut sodales magna tincidunt quis. Quisque egestas pellentesque velit id fringilla. Duis vel nisi libero. Vivamus ultrices ligula vel tempor lacinia. Cras dictum ut nunc vel suscipit. Duis convallis tortor varius consectetur tempor. Maecenas sed pharetra quam. Nunc malesuada risus justo, et vehicula quam placerat at. Vestibulum non orci eu felis viverra convallis.
+
+Nulla accumsan ultrices ligula, id commodo odio interdum sed. Fusce sit amet varius tortor. Integer non mattis eros. Curabitur vulputate massa non ante lacinia sodales. Aenean a feugiat ligula. Fusce ultricies molestie lectus auctor dignissim. Duis eu lorem feugiat, varius quam vel, volutpat magna. Pellentesque nec nisl ut lorem interdum condimentum scelerisque eu purus. Vestibulum porttitor elementum lectus quis lobortis. Vestibulum non sem ultricies, elementum risus non, aliquet ipsum. Phasellus pellentesque lacinia purus et tristique. Aenean lacinia, mi vel rutrum dapibus, nibh lacus hendrerit velit, ac faucibus massa erat sodales dui. Etiam in enim varius, auctor risus vel, blandit quam.
+
diff --git a/data/bench/medium-fragment.html b/data/bench/medium-fragment.html
new file mode 100644
index 0000000..570bef2
--- /dev/null
+++ b/data/bench/medium-fragment.html
@@ -0,0 +1,24 @@
+<h2><span class="mw-headline" id="History">History</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="http://en.wikipedia.org/w/index.php?title=UTF-8&amp;action=edit&amp;section=1" title="Edit section: History">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
+<p>By early 1992 the search was on for a good byte-stream encoding of multi-byte character sets. The draft <a href="http://en.wikipedia.org/wiki/Universal_Character_Set" title="Universal Character Set">ISO 10646</a> standard contained a non-required <a href="http://en.wikipedia.org/wiki/Addendum" title="Addendum">annex</a> called <a href="http://en.wikipedia.org/wiki/UTF-1" title="UTF-1">UTF-1</a>
+ that provided a byte-stream encoding of its 32-bit code points. This
+encoding was not satisfactory on performance grounds, but did introduce
+the notion that bytes in the range of 0–127 continue representing the
+ASCII characters in UTF, thereby providing backward compatibility with
+ASCII.</p>
+<p>In July 1992, the <a href="http://en.wikipedia.org/wiki/X/Open" title="X/Open">X/Open</a> committee XoJIG was looking for a better encoding. Dave Prosser of <a href="http://en.wikipedia.org/wiki/Unix_System_Laboratories" title="Unix System Laboratories">Unix System Laboratories</a>
+ submitted a proposal for one that had faster implementation
+characteristics and introduced the improvement that 7-bit ASCII
+characters would <i>only</i> represent themselves; all multibyte
+sequences would include only bytes where the high bit was set. This
+original proposal, FSS-UTF (File System Safe UCS Transformation Format),
+ was similar in concept to UTF-8, but lacked the crucial property of <a href="http://en.wikipedia.org/wiki/Self-synchronizing_code" title="Self-synchronizing code">self-synchronization</a>.<sup id="cite_ref-pikeviacambridge_7-0" class="reference"><a href="#cite_note-pikeviacambridge-7"><span>[</span>7<span>]</span></a></sup><sup id="cite_ref-8" class="reference"><a href="#cite_note-8"><span>[</span>8<span>]</span></a></sup></p>
+<p>In August 1992, this proposal was circulated by an <a href="http://en.wikipedia.org/wiki/IBM" title="IBM">IBM</a> X/Open representative to interested parties. <a href="http://en.wikipedia.org/wiki/Ken_Thompson" title="Ken Thompson">Ken Thompson</a> of the <a href="http://en.wikipedia.org/wiki/Plan_9_from_Bell_Labs" title="Plan 9 from Bell Labs">Plan 9</a> <a href="http://en.wikipedia.org/wiki/Operating_system" title="Operating system">operating system</a> group at <a href="http://en.wikipedia.org/wiki/Bell_Labs" title="Bell Labs">Bell Labs</a>
+ then made a small but crucial modification to the encoding, making it
+very slightly less bit-efficient than the previous proposal but allowing
+ it to be <a href="http://en.wikipedia.org/wiki/Self-synchronizing_code" title="Self-synchronizing code">self-synchronizing</a>,
+ meaning that it was no longer necessary to read from the beginning of
+the string to find code point boundaries. Thompson's design was outlined
+ on September 2, 1992, on a placemat in a New Jersey diner with <a href="http://en.wikipedia.org/wiki/Rob_Pike" title="Rob Pike">Rob Pike</a>. In the following days, Pike and Thompson implemented it and updated <a href="http://en.wikipedia.org/wiki/Plan_9_from_Bell_Labs" title="Plan 9 from Bell Labs">Plan 9</a> to use it throughout, and then communicated their success back to X/Open.<sup id="cite_ref-pikeviacambridge_7-1" class="reference"><a href="#cite_note-pikeviacambridge-7"><span>[</span>7<span>]</span></a></sup></p>
+<p>UTF-8 was first officially presented at the <a href="http://en.wikipedia.org/wiki/USENIX" title="USENIX">USENIX</a> conference in <a href="http://en.wikipedia.org/wiki/San_Diego" title="San Diego">San Diego</a>, from January 25 to 29, 1993.</p>
+<p>Google reported that in 2008 UTF-8 (misleadingly labelled "Unicode") became the most common encoding for HTML files.<sup id="cite_ref-markdavis_9-0" class="reference"><a href="#cite_note-markdavis-9"><span>[</span>9<span>]</span></a></sup><sup id="cite_ref-davidgoodger_10-0" class="reference"><a href="#cite_note-davidgoodger-10"><span>[</span>10<span>]</span></a></sup></p>
+<h2><span class="mw-headline" id="Description">Description</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="http://en.wikipedia.org/w/index.php?title=UTF-8&amp;action=edit&amp;section=2" title="Edit section: Description">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
diff --git a/data/bench/small-fragment.html b/data/bench/small-fragment.html
new file mode 100644
index 0000000..a0b9643
--- /dev/null
+++ b/data/bench/small-fragment.html
@@ -0,0 +1,7 @@
+<p>In July 1992, the <a href="http://en.wikipedia.org/wiki/X/Open" title="X/Open">X/Open</a> committee XoJIG was looking for a better encoding. Dave Prosser of <a href="http://en.wikipedia.org/wiki/Unix_System_Laboratories" title="Unix System Laboratories">Unix System Laboratories</a>
+ submitted a proposal for one that had faster implementation
+characteristics and introduced the improvement that 7-bit ASCII
+characters would <i>only</i> represent themselves; all multibyte
+sequences would include only bytes where the high bit was set. This
+original proposal, FSS-UTF (File System Safe UCS Transformation Format),
+ was similar in concept to UTF-8, but lacked the crucial property of <a href="http://en.wikipedia.org/wiki/Self-synchronizing_code" title="Self-synchronizing code">self-synchronization</a>.
diff --git a/data/bench/strong.html b/data/bench/strong.html
new file mode 100644
index 0000000..0ef665e
--- /dev/null
+++ b/data/bench/strong.html
@@ -0,0 +1 @@
+<strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong><strong> \ No newline at end of file
diff --git a/data/bench/tiny-fragment.html b/data/bench/tiny-fragment.html
new file mode 100644
index 0000000..7ce5354
--- /dev/null
+++ b/data/bench/tiny-fragment.html
@@ -0,0 +1 @@
+<p>Hello, world!</p>
diff --git a/examples/arena.rs b/examples/arena.rs
new file mode 100644
index 0000000..1b59ae1
--- /dev/null
+++ b/examples/arena.rs
@@ -0,0 +1,335 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate html5ever;
+extern crate typed_arena;
+
+use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
+use html5ever::tendril::{StrTendril, TendrilSink};
+use html5ever::{parse_document, Attribute, ExpandedName, QualName};
+use std::borrow::Cow;
+use std::cell::{Cell, RefCell};
+use std::collections::HashSet;
+use std::io::{self, Read};
+use std::ptr;
+
+/// Reads all of stdin and parses it as an HTML document into a fresh arena.
+///
+/// Panics if stdin cannot be read. The resulting tree is discarded.
+fn main() {
+    let mut input = Vec::new();
+    let mut stdin = io::stdin();
+    stdin.read_to_end(&mut input).unwrap();
+    let node_arena = typed_arena::Arena::new();
+    html5ever_parse_slice_into_arena(&input, &node_arena);
+}
+
+/// Parses `bytes` as a UTF-8 encoded HTML document, allocating every node in
+/// `arena`, and returns the output of the sink (its document node, see
+/// `Sink::finish`).
+fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> {
+    // The document root is allocated up front so the sink can hand it out.
+    let document: Ref<'a> = arena.alloc(Node::new(NodeData::Document));
+    let sink = Sink {
+        arena,
+        document,
+        quirks_mode: QuirksMode::NoQuirks,
+    };
+    let parser = parse_document(sink, Default::default());
+    parser.from_utf8().one(bytes)
+}
+
+/// Shared reference to the typed arena in which all nodes are allocated.
+type Arena<'arena> = &'arena typed_arena::Arena<Node<'arena>>;
+
+/// Shared reference to a node that lives in the arena.
+type Ref<'arena> = &'arena Node<'arena>;
+
+/// Optional link to another node that can be updated through a shared reference.
+type Link<'arena> = Cell<Option<Ref<'arena>>>;
+
+/// Tree sink that builds the document out of arena-allocated nodes.
+struct Sink<'arena> {
+ // Arena in which every new node is allocated.
+ arena: Arena<'arena>,
+ // Root node handed out by `get_document` and returned by `finish`.
+ document: Ref<'arena>,
+ // Last quirks mode reported through `set_quirks_mode`.
+ quirks_mode: QuirksMode,
+}
+
+/// A tree node with links to its parent, its siblings and its first and last
+/// child. All links live in `Cell`s so they can be rewired through `&self`.
+pub struct Node<'arena> {
+ parent: Link<'arena>,
+ next_sibling: Link<'arena>,
+ previous_sibling: Link<'arena>,
+ first_child: Link<'arena>,
+ last_child: Link<'arena>,
+ // Kind-specific payload of the node.
+ data: NodeData<'arena>,
+}
+
+/// Payload of a `Node`, one variant per kind of node.
+pub enum NodeData<'arena> {
+ // Document root; also used for the contents node of a template element.
+ Document,
+ Doctype {
+ name: StrTendril,
+ public_id: StrTendril,
+ system_id: StrTendril,
+ },
+ Text {
+ // In a `RefCell` so adjacent text can be appended to an existing node.
+ contents: RefCell<StrTendril>,
+ },
+ Comment {
+ contents: StrTendril,
+ },
+ Element {
+ name: QualName,
+ // In a `RefCell` so missing attributes can be added after creation.
+ attrs: RefCell<Vec<Attribute>>,
+ // `Some` only for elements created with the `template` flag set.
+ template_contents: Option<Ref<'arena>>,
+ // Copied from the element flags at creation time.
+ mathml_annotation_xml_integration_point: bool,
+ },
+ ProcessingInstruction {
+ target: StrTendril,
+ contents: StrTendril,
+ },
+}
+
+impl<'arena> Node<'arena> {
+ /// Creates a node holding `data` with no parent, siblings or children.
+ fn new(data: NodeData<'arena>) -> Self {
+ Node {
+ parent: Cell::new(None),
+ previous_sibling: Cell::new(None),
+ next_sibling: Cell::new(None),
+ first_child: Cell::new(None),
+ last_child: Cell::new(None),
+ data: data,
+ }
+ }
+
+ /// Unlinks this node from its parent and siblings and stitches the
+ /// neighbours back together. The node's own children are left untouched.
+ fn detach(&self) {
+ // `take` clears this node's own links while remembering the old values.
+ let parent = self.parent.take();
+ let previous_sibling = self.previous_sibling.take();
+ let next_sibling = self.next_sibling.take();
+
+ // Fix the backward link: either the next sibling's, or, if this node
+ // was the last child, the parent's `last_child`.
+ if let Some(next_sibling) = next_sibling {
+ next_sibling.previous_sibling.set(previous_sibling);
+ } else if let Some(parent) = parent {
+ parent.last_child.set(previous_sibling);
+ }
+
+ // Fix the forward link: either the previous sibling's, or, if this node
+ // was the first child, the parent's `first_child`.
+ if let Some(previous_sibling) = previous_sibling {
+ previous_sibling.next_sibling.set(next_sibling);
+ } else if let Some(parent) = parent {
+ parent.first_child.set(next_sibling);
+ }
+ }
+
+ /// Moves `new_child` to the end of this node's child list, detaching it
+ /// from its current position first.
+ fn append(&'arena self, new_child: &'arena Self) {
+ new_child.detach();
+ new_child.parent.set(Some(self));
+ // `take` empties `last_child`; it is set again at the end.
+ if let Some(last_child) = self.last_child.take() {
+ new_child.previous_sibling.set(Some(last_child));
+ debug_assert!(last_child.next_sibling.get().is_none());
+ last_child.next_sibling.set(Some(new_child));
+ } else {
+ // No last child means there must be no first child either.
+ debug_assert!(self.first_child.get().is_none());
+ self.first_child.set(Some(new_child));
+ }
+ self.last_child.set(Some(new_child));
+ }
+
+ /// Moves `new_sibling` so that it directly precedes this node, detaching
+ /// it from its current position first.
+ fn insert_before(&'arena self, new_sibling: &'arena Self) {
+ new_sibling.detach();
+ new_sibling.parent.set(self.parent.get());
+ new_sibling.next_sibling.set(Some(self));
+ // `take` empties `previous_sibling`; it is set again at the end.
+ if let Some(previous_sibling) = self.previous_sibling.take() {
+ new_sibling.previous_sibling.set(Some(previous_sibling));
+ debug_assert!(ptr::eq::<Node>(
+ previous_sibling.next_sibling.get().unwrap(),
+ self
+ ));
+ previous_sibling.next_sibling.set(Some(new_sibling));
+ } else if let Some(parent) = self.parent.get() {
+ // This node was the first child, so the new sibling takes that role.
+ debug_assert!(ptr::eq::<Node>(parent.first_child.get().unwrap(), self));
+ parent.first_child.set(Some(new_sibling));
+ }
+ self.previous_sibling.set(Some(new_sibling));
+ }
+}
+
+impl<'arena> Sink<'arena> {
+    /// Allocates a new, unlinked node holding `data` in the arena.
+    fn new_node(&self, data: NodeData<'arena>) -> Ref<'arena> {
+        let node = Node::new(data);
+        self.arena.alloc(node)
+    }
+
+    /// Shared insertion logic for `append` and `append_before_sibling`.
+    ///
+    /// A node child is passed straight to `append`. A text child is merged
+    /// into the node returned by `previous` when that node is a text node;
+    /// otherwise a new text node is created and passed to `append`.
+    /// `previous` is only called for text children.
+    fn append_common<P, A>(&self, child: NodeOrText<Ref<'arena>>, previous: P, append: A)
+    where
+        P: FnOnce() -> Option<Ref<'arena>>,
+        A: FnOnce(Ref<'arena>),
+    {
+        let text = match child {
+            NodeOrText::AppendNode(node) => return append(node),
+            NodeOrText::AppendText(text) => text,
+        };
+
+        // Append to an existing text node if we have one.
+        if let Some(neighbour) = previous() {
+            if let NodeData::Text { ref contents } = neighbour.data {
+                contents.borrow_mut().push_tendril(&text);
+                return;
+            }
+        }
+
+        let text_node = self.new_node(NodeData::Text {
+            contents: RefCell::new(text),
+        });
+        append(text_node)
+    }
+}
+
+/// `TreeSink` implementation that builds the tree directly out of arena nodes;
+/// handles are plain node references.
+impl<'arena> TreeSink for Sink<'arena> {
+ type Handle = Ref<'arena>;
+ type Output = Ref<'arena>;
+
+ /// Returns the document node as the result of parsing.
+ fn finish(self) -> Ref<'arena> {
+ self.document
+ }
+
+ /// Parse errors are ignored.
+ fn parse_error(&mut self, _: Cow<'static, str>) {}
+
+ fn get_document(&mut self) -> Ref<'arena> {
+ self.document
+ }
+
+ fn set_quirks_mode(&mut self, mode: QuirksMode) {
+ self.quirks_mode = mode;
+ }
+
+ /// Two handles are the same node if they point to the same address.
+ fn same_node(&self, x: &Ref<'arena>, y: &Ref<'arena>) -> bool {
+ ptr::eq::<Node>(*x, *y)
+ }
+
+ /// Returns the expanded name of an element; panics for any other node kind.
+ fn elem_name<'a>(&self, target: &'a Ref<'arena>) -> ExpandedName<'a> {
+ match target.data {
+ NodeData::Element { ref name, .. } => name.expanded(),
+ _ => panic!("not an element!"),
+ }
+ }
+
+ /// Returns the contents node of a template element; panics if `target`
+ /// is not an element with template contents.
+ fn get_template_contents(&mut self, target: &Ref<'arena>) -> Ref<'arena> {
+ if let NodeData::Element {
+ template_contents: Some(ref contents),
+ ..
+ } = target.data
+ {
+ contents
+ } else {
+ panic!("not a template element!")
+ }
+ }
+
+ /// Returns the flag stored at element creation; panics for non-elements.
+ fn is_mathml_annotation_xml_integration_point(&self, target: &Ref<'arena>) -> bool {
+ if let NodeData::Element {
+ mathml_annotation_xml_integration_point,
+ ..
+ } = target.data
+ {
+ mathml_annotation_xml_integration_point
+ } else {
+ panic!("not an element!")
+ }
+ }
+
+ /// Allocates an element node. When `flags.template` is set, a separate
+ /// `Document` node is allocated to hold the template contents.
+ fn create_element(
+ &mut self,
+ name: QualName,
+ attrs: Vec<Attribute>,
+ flags: ElementFlags,
+ ) -> Ref<'arena> {
+ self.new_node(NodeData::Element {
+ name,
+ attrs: RefCell::new(attrs),
+ template_contents: if flags.template {
+ Some(self.new_node(NodeData::Document))
+ } else {
+ None
+ },
+ mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point,
+ })
+ }
+
+ fn create_comment(&mut self, text: StrTendril) -> Ref<'arena> {
+ self.new_node(NodeData::Comment { contents: text })
+ }
+
+ fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Ref<'arena> {
+ self.new_node(NodeData::ProcessingInstruction {
+ target: target,
+ contents: data,
+ })
+ }
+
+ /// Appends `child` as the last child of `parent`; text is merged into a
+ /// trailing text node when there is one.
+ fn append(&mut self, parent: &Ref<'arena>, child: NodeOrText<Ref<'arena>>) {
+ self.append_common(
+ child,
+ || parent.last_child.get(),
+ |new_node| parent.append(new_node),
+ )
+ }
+
+ /// Inserts `child` directly before `sibling`; text is merged into the
+ /// preceding text node when there is one.
+ fn append_before_sibling(&mut self, sibling: &Ref<'arena>, child: NodeOrText<Ref<'arena>>) {
+ self.append_common(
+ child,
+ || sibling.previous_sibling.get(),
+ |new_node| sibling.insert_before(new_node),
+ )
+ }
+
+ /// Inserts before `element` if it has a parent, otherwise appends to
+ /// `prev_element`.
+ fn append_based_on_parent_node(
+ &mut self,
+ element: &Ref<'arena>,
+ prev_element: &Ref<'arena>,
+ child: NodeOrText<Ref<'arena>>,
+ ) {
+ if element.parent.get().is_some() {
+ self.append_before_sibling(element, child)
+ } else {
+ self.append(prev_element, child)
+ }
+ }
+
+ /// Appends a new doctype node as the last child of the document.
+ fn append_doctype_to_document(
+ &mut self,
+ name: StrTendril,
+ public_id: StrTendril,
+ system_id: StrTendril,
+ ) {
+ self.document.append(self.new_node(NodeData::Doctype {
+ name,
+ public_id,
+ system_id,
+ }))
+ }
+
+ /// Adds every attribute from `attrs` whose name the element does not
+ /// already have; panics if `target` is not an element.
+ fn add_attrs_if_missing(&mut self, target: &Ref<'arena>, attrs: Vec<Attribute>) {
+ let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
+ attrs.borrow_mut()
+ } else {
+ panic!("not an element")
+ };
+
+ // The name set is computed once, before any new attribute is added, so
+ // duplicates inside `attrs` itself are not filtered out.
+ let existing_names = existing
+ .iter()
+ .map(|e| e.name.clone())
+ .collect::<HashSet<_>>();
+ existing.extend(
+ attrs
+ .into_iter()
+ .filter(|attr| !existing_names.contains(&attr.name)),
+ );
+ }
+
+ fn remove_from_parent(&mut self, target: &Ref<'arena>) {
+ target.detach()
+ }
+
+ /// Moves all children of `node` to the end of `new_parent`'s child list,
+ /// keeping their order.
+ fn reparent_children(&mut self, node: &Ref<'arena>, new_parent: &Ref<'arena>) {
+ let mut next_child = node.first_child.get();
+ while let Some(child) = next_child {
+ debug_assert!(ptr::eq::<Node>(child.parent.get().unwrap(), *node));
+ // Read the next sibling first: `append` detaches `child`, which
+ // clears its sibling links.
+ next_child = child.next_sibling.get();
+ new_parent.append(child)
+ }
+ }
+}
diff --git a/examples/capi/tokenize.c b/examples/capi/tokenize.c
new file mode 100644
index 0000000..8c8cdd4
--- /dev/null
+++ b/examples/capi/tokenize.c
@@ -0,0 +1,74 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#include <stdio.h>
+
+#include "html5ever.h"
+
+// Writes the NUL-terminated string `str` to stdout, without a trailing newline.
+void put_str(const char *str) {
+    fputs(str, stdout);
+}
+
+// Writes the `text.len` bytes starting at `text.data` to stdout.
+void put_buf(struct h5e_buf text) {
+    // One item per byte; the same bytes reach stdout either way.
+    fwrite(text.data, 1, text.len, stdout);
+}
+
+// Token callback for character data: prints the text on its own line.
+// `user` is not used.
+void do_chars(void *user, struct h5e_buf text) {
+    (void)user;
+    put_str("CHARS : ");
+    put_buf(text);
+    put_str("\n");
+}
+
+// Token callback for a start tag: prints the tag name, followed by `/` when
+// the tag is self-closing. `user` and `num_attrs` are not used.
+void do_start_tag(void *user, struct h5e_buf name, int self_closing, size_t num_attrs) {
+    (void)user;
+    (void)num_attrs;
+    put_str("TAG : <");
+    put_buf(name);
+    if (self_closing != 0) {
+        put_str("/");
+    }
+    put_str(">\n");
+}
+
+// Token callback for one attribute: prints `name="value"` on its own line.
+// The value is written as-is, without escaping. `user` is not used.
+void do_tag_attr(void *user, struct h5e_buf name, struct h5e_buf value) {
+    (void)user;
+    put_str(" ATTR: ");
+    put_buf(name);
+    put_str("=\"");
+    put_buf(value);
+    put_str("\"\n");
+}
+
+// Token callback for an end tag: prints the tag name on its own line.
+// `user` is not used.
+void do_end_tag(void *user, struct h5e_buf name) {
+    (void)user;
+    put_str("TAG : </");
+    put_buf(name);
+    put_str(">\n");
+}
+
+// Callback table wiring the token handlers defined in this file. Members of
+// `struct h5e_token_ops` that are not listed here are zero-initialized.
+struct h5e_token_ops ops = {
+ .do_chars = do_chars,
+ .do_start_tag = do_start_tag,
+ .do_tag_attr = do_tag_attr,
+ .do_end_tag = do_end_tag,
+};
+
+// Token sink handed to the tokenizer: the callback table above plus a NULL
+// user pointer (none of the callbacks in this file read it).
+struct h5e_token_sink sink = {
+ .ops = &ops,
+ .user = NULL,
+};
+
+// Tokenizes the HTML fragment given as the first command-line argument and
+// prints the tokens through `sink`. Returns 1 with a usage message when the
+// argument is missing, 0 otherwise.
+int main(int argc, char *argv[]) {
+    struct h5e_tokenizer *tok;
+
+    if (argc < 2) {
+        printf("Usage: %s 'HTML fragment'\n", argv[0]);
+        return 1;
+    }
+
+    tok = h5e_tokenizer_new(&sink);
+    h5e_tokenizer_feed(tok, h5e_buf_from_cstr(argv[1]));
+    h5e_tokenizer_end(tok);
+    h5e_tokenizer_free(tok);
+    return 0;
+}
diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs
new file mode 100644
index 0000000..d6c62f1
--- /dev/null
+++ b/examples/noop-tokenize.rs
@@ -0,0 +1,43 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// Run a single benchmark once. For use with profiling tools.
+
+extern crate html5ever;
+
+use std::default::Default;
+use std::io;
+
+use html5ever::tendril::*;
+use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
+
+/// Token sink that stores every token it receives.
+struct Sink(Vec<Token>);
+
+impl TokenSink for Sink {
+    type Handle = ();
+
+    /// Stores the token and tells the tokenizer to continue.
+    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
+        // The token is never inspected; keeping it around makes sure the
+        // tokenizer's work is not optimized out entirely.
+        let Sink(tokens) = self;
+        tokens.push(token);
+        TokenSinkResult::Continue
+    }
+}
+
+/// Reads all of stdin, tokenizes it once and discards the result.
+///
+/// Panics if stdin cannot be read or the input cannot be reinterpreted as a
+/// string tendril.
+fn main() {
+    let mut raw = ByteTendril::new();
+    io::stdin().read_to_tendril(&mut raw).unwrap();
+    let text = raw.try_reinterpret().unwrap();
+
+    let mut queue = BufferQueue::new();
+    queue.push_back(text);
+
+    let mut tokenizer = Tokenizer::new(Sink(Vec::new()), Default::default());
+    let _ = tokenizer.feed(&mut queue);
+    // The whole input must have been consumed by the single `feed` call.
+    assert!(queue.is_empty());
+    tokenizer.end();
+}
diff --git a/examples/noop-tree-builder.rs b/examples/noop-tree-builder.rs
new file mode 100644
index 0000000..0775449
--- /dev/null
+++ b/examples/noop-tree-builder.rs
@@ -0,0 +1,112 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#[macro_use]
+extern crate html5ever;
+
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::default::Default;
+use std::io;
+
+use html5ever::parse_document;
+use html5ever::tendril::*;
+use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
+use html5ever::{Attribute, ExpandedName, QualName};
+
+/// Tree sink that builds no tree: handles are plain integer ids.
+struct Sink {
+ // Id that the next call to `get_id` will return.
+ next_id: usize,
+ // Names of the elements created so far, keyed by their id.
+ names: HashMap<usize, QualName>,
+}
+
+impl Sink {
+    /// Returns a fresh id and advances the counter by two.
+    ///
+    /// The skipped id (`id + 1`) is what `get_template_contents` returns for
+    /// a template element with this id.
+    fn get_id(&mut self) -> usize {
+        let current = self.next_id;
+        self.next_id = current + 2;
+        current
+    }
+}
+
+/// `TreeSink` implementation that records element names and otherwise ignores
+/// every tree operation.
+impl TreeSink for Sink {
+    type Handle = usize;
+    type Output = Self;
+
+    /// Returns the sink itself as the result of parsing.
+    fn finish(self) -> Self {
+        self
+    }
+
+    /// The document always has id 0; `get_id` never hands that id out
+    /// because `main` starts the counter at 1.
+    fn get_document(&mut self) -> usize {
+        0
+    }
+
+    /// Returns `target + 1` for an HTML `template` element, the id that
+    /// `get_id` skipped when the element was created. Panics otherwise.
+    fn get_template_contents(&mut self, target: &usize) -> usize {
+        // `target` is already a reference; pass it on without re-borrowing.
+        match self.names.get(target).map(|n| n.expanded()) {
+            Some(expanded_name!(html "template")) => *target + 1,
+            _ => panic!("not a template element"),
+        }
+    }
+
+    fn same_node(&self, x: &usize, y: &usize) -> bool {
+        x == y
+    }
+
+    /// Returns the expanded name recorded for `target`; panics if no element
+    /// with that id was created.
+    fn elem_name(&self, target: &usize) -> ExpandedName {
+        self.names.get(target).expect("not an element").expanded()
+    }
+
+    /// Assigns a fresh id and remembers the element's name under it.
+    fn create_element(&mut self, name: QualName, _: Vec<Attribute>, _: ElementFlags) -> usize {
+        let id = self.get_id();
+        self.names.insert(id, name);
+        id
+    }
+
+    fn create_comment(&mut self, _text: StrTendril) -> usize {
+        self.get_id()
+    }
+
+    /// Processing instructions are not supported by this example.
+    fn create_pi(&mut self, _target: StrTendril, _value: StrTendril) -> usize {
+        unimplemented!()
+    }
+
+    fn append_before_sibling(&mut self, _sibling: &usize, _new_node: NodeOrText<usize>) {}
+
+    fn append_based_on_parent_node(
+        &mut self,
+        _element: &usize,
+        _prev_element: &usize,
+        _new_node: NodeOrText<usize>,
+    ) {
+    }
+
+    fn parse_error(&mut self, _msg: Cow<'static, str>) {}
+    fn set_quirks_mode(&mut self, _mode: QuirksMode) {}
+    fn append(&mut self, _parent: &usize, _child: NodeOrText<usize>) {}
+
+    fn append_doctype_to_document(&mut self, _: StrTendril, _: StrTendril, _: StrTendril) {}
+
+    /// Only checks that `target` is a known element; the attributes are dropped.
+    fn add_attrs_if_missing(&mut self, target: &usize, _attrs: Vec<Attribute>) {
+        assert!(self.names.contains_key(target), "not an element");
+    }
+
+    fn remove_from_parent(&mut self, _target: &usize) {}
+    fn reparent_children(&mut self, _node: &usize, _new_parent: &usize) {}
+    fn mark_script_already_started(&mut self, _node: &usize) {}
+}
+
+/// Parses an HTML document from stdin with the no-op sink.
+///
+/// Panics if reading from stdin fails.
+fn main() {
+    let stdin = io::stdin();
+    let mut input = stdin.lock();
+    // Id 0 is the document (see `get_document`), so element ids start at 1.
+    let sink = Sink {
+        names: HashMap::new(),
+        next_id: 1,
+    };
+    parse_document(sink, Default::default())
+        .from_utf8()
+        .read_from(&mut input)
+        .unwrap();
+}
diff --git a/examples/print-tree-actions.rs b/examples/print-tree-actions.rs
new file mode 100644
index 0000000..dbb6c6e
--- /dev/null
+++ b/examples/print-tree-actions.rs
@@ -0,0 +1,177 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#[macro_use]
+extern crate html5ever;
+
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::default::Default;
+use std::io;
+
+use html5ever::parse_document;
+use html5ever::tendril::*;
+use html5ever::tree_builder::{
+ AppendNode, AppendText, ElementFlags, NodeOrText, QuirksMode, TreeSink,
+};
+use html5ever::{Attribute, ExpandedName, QualName};
+
+/// Tree sink that prints every tree-builder action; handles are integer ids.
+struct Sink {
+ // Id that the next call to `get_id` will return.
+ next_id: usize,
+ // Names of the elements created so far, keyed by their id.
+ names: HashMap<usize, QualName>,
+}
+
+impl Sink {
+    /// Returns a fresh id and advances the counter by two.
+    ///
+    /// The skipped id (`id + 1`) is what `get_template_contents` returns for
+    /// a template element with this id.
+    fn get_id(&mut self) -> usize {
+        let current = self.next_id;
+        self.next_id = current + 2;
+        current
+    }
+}
+
+/// `TreeSink` implementation that prints each action to stdout instead of
+/// building a tree. Only element names are remembered.
+impl TreeSink for Sink {
+    type Handle = usize;
+    type Output = Self;
+
+    /// Returns the sink itself as the result of parsing.
+    fn finish(self) -> Self {
+        self
+    }
+
+    fn parse_error(&mut self, msg: Cow<'static, str>) {
+        println!("Parse error: {}", msg);
+    }
+
+    /// The document always has id 0.
+    fn get_document(&mut self) -> usize {
+        0
+    }
+
+    /// Returns `target + 1` for an HTML `template` element, the id that
+    /// `get_id` skipped when the element was created. Panics otherwise.
+    fn get_template_contents(&mut self, target: &usize) -> usize {
+        match self.names.get(target).map(|n| n.expanded()) {
+            Some(expanded_name!(html "template")) => *target + 1,
+            _ => panic!("not a template element"),
+        }
+    }
+
+    fn set_quirks_mode(&mut self, mode: QuirksMode) {
+        println!("Set quirks mode to {:?}", mode);
+    }
+
+    fn same_node(&self, x: &usize, y: &usize) -> bool {
+        x == y
+    }
+
+    /// Returns the expanded name recorded for `target`; panics if no element
+    /// with that id was created.
+    fn elem_name(&self, target: &usize) -> ExpandedName {
+        self.names.get(target).expect("not an element").expanded()
+    }
+
+    /// Assigns a fresh id, reports it and remembers the element's name.
+    fn create_element(&mut self, name: QualName, _: Vec<Attribute>, _: ElementFlags) -> usize {
+        let id = self.get_id();
+        println!("Created {:?} as {}", name, id);
+        self.names.insert(id, name);
+        id
+    }
+
+    fn create_comment(&mut self, text: StrTendril) -> usize {
+        let id = self.get_id();
+        println!("Created comment \"{}\" as {}", escape_default(&text), id);
+        id
+    }
+
+    /// Processing instructions are not supported by this example.
+    fn create_pi(&mut self, _target: StrTendril, _value: StrTendril) -> usize {
+        unimplemented!()
+    }
+
+    fn append(&mut self, parent: &usize, child: NodeOrText<usize>) {
+        match child {
+            AppendText(t) => println!("Append text to {}: \"{}\"", parent, escape_default(&t)),
+            AppendNode(n) => println!("Append node {} to {}", n, parent),
+        }
+    }
+
+    fn append_before_sibling(&mut self, sibling: &usize, new_node: NodeOrText<usize>) {
+        match new_node {
+            AppendText(t) => println!("Append text before {}: \"{}\"", sibling, escape_default(&t)),
+            AppendNode(n) => println!("Append node {} before {}", n, sibling),
+        }
+    }
+
+    /// Always reports an insertion before `element`; this sink does not
+    /// track parents, so `_prev_element` is never used.
+    fn append_based_on_parent_node(
+        &mut self,
+        element: &Self::Handle,
+        _prev_element: &Self::Handle,
+        child: NodeOrText<Self::Handle>,
+    ) {
+        self.append_before_sibling(element, child);
+    }
+
+    fn append_doctype_to_document(
+        &mut self,
+        name: StrTendril,
+        public_id: StrTendril,
+        system_id: StrTendril,
+    ) {
+        println!("Append doctype: {} {} {}", name, public_id, system_id);
+    }
+
+    /// Prints every attribute passed in; panics if `target` is not a known
+    /// element. Nothing is stored.
+    fn add_attrs_if_missing(&mut self, target: &usize, attrs: Vec<Attribute>) {
+        assert!(self.names.contains_key(target), "not an element");
+        println!("Add missing attributes to {}:", target);
+        for attr in attrs {
+            println!(" {:?} = {}", attr.name, attr.value);
+        }
+    }
+
+    fn associate_with_form(
+        &mut self,
+        _target: &usize,
+        _form: &usize,
+        _nodes: (&usize, Option<&usize>),
+    ) {
+        // No form owner support.
+    }
+
+    fn remove_from_parent(&mut self, target: &usize) {
+        println!("Remove {} from parent", target);
+    }
+
+    fn reparent_children(&mut self, node: &usize, new_parent: &usize) {
+        println!("Move children from {} to {}", node, new_parent);
+    }
+
+    fn mark_script_already_started(&mut self, node: &usize) {
+        println!("Mark script {} as already started", node);
+    }
+
+    fn set_current_line(&mut self, line_number: u64) {
+        println!("Set current line to {}", line_number);
+    }
+
+    fn pop(&mut self, elem: &usize) {
+        println!("Popped element {}", elem);
+    }
+}
+
+// FIXME: Copy of str::escape_default from std, which is currently unstable
+/// Returns `s` with every character replaced by its `char::escape_default`
+/// escape sequence.
+pub fn escape_default(s: &str) -> String {
+    let mut escaped = String::new();
+    for c in s.chars() {
+        escaped.extend(c.escape_default());
+    }
+    escaped
+}
+
+/// Parses an HTML document from stdin, printing every tree-builder action.
+///
+/// Panics if reading from stdin fails.
+fn main() {
+    let stdin = io::stdin();
+    let mut input = stdin.lock();
+    // Id 0 is the document (see `get_document`), so element ids start at 1.
+    let sink = Sink {
+        names: HashMap::new(),
+        next_id: 1,
+    };
+    parse_document(sink, Default::default())
+        .from_utf8()
+        .read_from(&mut input)
+        .unwrap();
+}
diff --git a/examples/tokenize.rs b/examples/tokenize.rs
new file mode 100644
index 0000000..039ffb7
--- /dev/null
+++ b/examples/tokenize.rs
@@ -0,0 +1,103 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate html5ever;
+
+use std::default::Default;
+use std::io;
+
+use html5ever::tendril::*;
+use html5ever::tokenizer::BufferQueue;
+use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
+use html5ever::tokenizer::{
+ ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
+};
+
+/// Token sink that pretty-prints tokens to stdout.
+#[derive(Copy, Clone)]
+struct TokenPrinter {
+ // True while a run of characters is being printed, i.e. an opening quote
+ // has been written and its closing quote is still pending.
+ in_char_run: bool,
+}
+
+impl TokenPrinter {
+    /// Records whether the next output is a character. Opens a quoted run
+    /// when one starts and closes it (with a newline) when one ends.
+    fn is_char(&mut self, is_char: bool) {
+        let was_char = self.in_char_run;
+        if is_char && !was_char {
+            print!("CHAR : \"");
+        } else if was_char && !is_char {
+            println!("\"");
+        }
+        self.in_char_run = is_char;
+    }
+
+    /// Prints one character, escaped, as part of the current character run.
+    fn do_char(&mut self, c: char) {
+        self.is_char(true);
+        print!("{}", c.escape_default());
+    }
+}
+
+impl TokenSink for TokenPrinter {
+    type Handle = ();
+
+    /// Prints `token` and tells the tokenizer to continue.
+    ///
+    /// Character tokens extend the current quoted run; every other token
+    /// closes the run first and is printed on its own line.
+    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
+        match token {
+            CharacterTokens(text) => text.chars().for_each(|c| self.do_char(c)),
+            NullCharacterToken => self.do_char('\0'),
+            TagToken(tag) => {
+                self.is_char(false);
+                // This is not proper HTML serialization, of course.
+                match tag.kind {
+                    StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name),
+                    EndTag => print!("TAG : <\x1b[31m/{}\x1b[0m", tag.name),
+                }
+                for attr in &tag.attrs {
+                    print!(
+                        " \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'",
+                        attr.name.local, attr.value
+                    );
+                }
+                if tag.self_closing {
+                    print!(" \x1b[31m/\x1b[0m");
+                }
+                println!(">");
+            },
+            ParseError(err) => {
+                self.is_char(false);
+                println!("ERROR: {}", err);
+            },
+            _ => {
+                self.is_char(false);
+                println!("OTHER: {:?}", token);
+            },
+        }
+        TokenSinkResult::Continue
+    }
+}
+
+/// Tokenizes the HTML read from stdin and prints every token to stdout.
+///
+/// Panics if stdin cannot be read or the input cannot be reinterpreted as a
+/// string tendril.
+fn main() {
+    let sink = TokenPrinter { in_char_run: false };
+    let mut chunk = ByteTendril::new();
+    io::stdin().read_to_tendril(&mut chunk).unwrap();
+    let mut input = BufferQueue::new();
+    input.push_back(chunk.try_reinterpret().unwrap());
+
+    let mut tok = Tokenizer::new(
+        sink,
+        TokenizerOpts {
+            profile: true,
+            ..Default::default()
+        },
+    );
+    let _ = tok.feed(&mut input);
+    // The whole input must have been consumed by the single `feed` call.
+    assert!(input.is_empty());
+    tok.end();
+    // Close a trailing character run on the tokenizer's own sink.
+    // `TokenPrinter` is `Copy`, so the local `sink` is an untouched copy whose
+    // `in_char_run` is still `false`; calling `is_char` on it prints nothing.
+    tok.sink.is_char(false);
+}
diff --git a/fuzz/.gitignore b/fuzz/.gitignore
new file mode 100644
index 0000000..572e03b
--- /dev/null
+++ b/fuzz/.gitignore
@@ -0,0 +1,4 @@
+
+target
+corpus
+artifacts
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
new file mode 100644
index 0000000..b4dcbe1
--- /dev/null
+++ b/fuzz/Cargo.toml
@@ -0,0 +1,27 @@
+
+[package]
+name = "html5ever-fuzz"
+version = "0.0.0"
+authors = ["David Korczynski <david@adalogics.com>"]
+publish = false
+edition = "2018"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.3"
+
+[dependencies.html5ever]
+path = ".."
+
+[dependencies.markup5ever_rcdom]
+path = "../../rcdom/"
+
+# Prevent this from interfering with workspaces
+[workspace]
+members = ["."]
+
+[[bin]]
+name = "fuzz_document_parse"
+path = "fuzz_targets/fuzz_document_parse.rs"
diff --git a/fuzz/fuzz_targets/fuzz_document_parse.rs b/fuzz/fuzz_targets/fuzz_document_parse.rs
new file mode 100644
index 0000000..17840de
--- /dev/null
+++ b/fuzz/fuzz_targets/fuzz_document_parse.rs
@@ -0,0 +1,35 @@
+#![no_main]
+use libfuzzer_sys::fuzz_target;
+
+use std::io::BufReader;
+use html5ever::driver::ParseOpts;
+use markup5ever_rcdom::{RcDom, SerializableHandle};
+use html5ever::tendril::TendrilSink;
+use html5ever::tree_builder::TreeBuilderOpts;
+use html5ever::{parse_document, serialize};
+
+// Target inspired by the Rust-Fuzz project
+// https://github.com/rust-fuzz/targets
+fuzz_target!(|data: &[u8]| {
+    // Default options everywhere except that the doctype is dropped.
+    let tree_builder = TreeBuilderOpts {
+        drop_doctype: true,
+        ..Default::default()
+    };
+    let opts = ParseOpts {
+        tree_builder,
+        ..Default::default()
+    };
+
+    let parsed = parse_document(RcDom::default(), opts)
+        .from_utf8()
+        .read_from(&mut BufReader::new(data));
+
+    // A read error just ends this run.
+    let dom = match parsed {
+        Ok(dom) => dom,
+        Err(_) => return,
+    };
+
+    // Serialize the parsed document into a sink that discards everything;
+    // serialization errors are ignored.
+    let document: SerializableHandle = dom.document.into();
+    let mut out = std::io::sink();
+    let _ = serialize(&mut out, &document, Default::default());
+});
diff --git a/macros/match_token.rs b/macros/match_token.rs
new file mode 100644
index 0000000..7d73519
--- /dev/null
+++ b/macros/match_token.rs
@@ -0,0 +1,464 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+/*!
+
+Implements the `match_token!()` macro for use by the HTML tree builder
+in `src/tree_builder/rules.rs`.
+
+
+## Example
+
+```rust
+match_token!(token {
+ CommentToken(text) => 1,
+
+ tag @ <base> <link> <meta> => 2,
+
+ </head> => 3,
+
+ </body> </html> </br> => else,
+
+ tag @ </_> => 4,
+
+ token => 5,
+})
+```
+
+
+## Syntax
+
+Because of the simplistic parser, the macro invocation must
+start with exactly `match_token!(token {` (with whitespace as specified)
+and end with exactly `})`.
+
+The left-hand side of each match arm is an optional `name @` binding, followed by
+
+ - an ordinary Rust pattern that starts with an identifier or an underscore, or
+
+ - a sequence of HTML tag names as identifiers, each inside "<...>" or "</...>"
+ to match an open or close tag respectively, or
+
+ - a "wildcard tag" "<_>" or "</_>" to match all open tags or all close tags
+ respectively.
+
+The right-hand side is either an expression or the keyword `else`.
+
+Note that this syntax does not support guards or pattern alternation like
+`Foo | Bar`. This is not a fundamental limitation; it's done for implementation
+simplicity.
+
+
+## Semantics
+
+Ordinary Rust patterns match as usual. If present, the `name @` binding has
+the usual meaning.
+
+A sequence of named tags matches any of those tags. A single sequence can
+contain both open and close tags. If present, the `name @` binding binds (by
+move) the `Tag` struct, not the outer `Token`. That is, a match arm like
+
+```rust
+tag @ <html> <head> => ...
+```
+
+expands to something like
+
+```rust
+TagToken(tag @ Tag { name: local_name!("html"), kind: StartTag })
+| TagToken(tag @ Tag { name: local_name!("head"), kind: StartTag }) => ...
+```
+
+A wildcard tag matches any tag of the appropriate kind, *unless* it was
+previously matched with an `else` right-hand side (more on this below).
+
+The expansion of this macro reorders code somewhat, to satisfy various
+restrictions arising from moves. However it provides the semantics of in-order
+matching, by enforcing the following restrictions on its input:
+
+ - The last pattern must be a variable or the wildcard "_". In other words
+ it must match everything.
+
+ - Otherwise, ordinary Rust patterns and specific-tag patterns cannot appear
+ after wildcard tag patterns.
+
+ - No tag name may appear more than once.
+
+ - A wildcard tag pattern may not occur in the same arm as any other tag.
+ "<_> <html> => ..." and "<_> </_> => ..." are both forbidden.
+
+ - The right-hand side "else" may only appear with specific-tag patterns.
+ It means that these specific tags should be handled by the last,
+ catch-all case arm, rather than by any wildcard tag arm. This situation
+ is common in the HTML5 syntax.
+*/
+
+use quote::quote;
+use syn::{braced, parse_quote, Token};
+
+use proc_macro2::TokenStream;
+use quote::ToTokens;
+use std::collections::HashSet;
+use std::fs::File;
+use std::io::{Read, Write};
+use std::path::Path;
+use syn::ext::IdentExt;
+use syn::fold::Fold;
+use syn::parse::{Parse, ParseStream, Result};
+
+/// Expands every `match_token!` invocation found in the Rust source file
+/// `from` and writes the resulting code to `to`.
+///
+/// # Panics
+///
+/// Panics if `from` cannot be read, if its contents do not parse as a Rust
+/// file, if a `match_token!` body is malformed, or if `to` cannot be written.
+/// I/O failures name the offending path.
+pub fn expand(from: &Path, to: &Path) {
+    let mut source = String::new();
+    File::open(from)
+        .and_then(|mut file| file.read_to_string(&mut source))
+        .unwrap_or_else(|err| panic!("failed to read {}: {}", from.display(), err));
+    let ast = syn::parse_file(&source).expect("Parsing rules.rs module");
+    let mut m = MatchTokenParser {};
+    let ast = m.fold_file(ast);
+    // Insert line breaks after `{ ` and before ` }` in the stringified tokens.
+    let code = ast
+        .into_token_stream()
+        .to_string()
+        .replace("{ ", "{\n")
+        .replace(" }", "\n}");
+    File::create(to)
+        .and_then(|mut file| file.write_all(code.as_bytes()))
+        .unwrap_or_else(|err| panic!("failed to write {}: {}", to.display(), err));
+}
+
+/// Syntax-tree folder that replaces `match_token!` invocations with their
+/// expansion; the work happens in its `Fold` implementation.
+struct MatchTokenParser {}
+
+/// Parsed contents of a `match_token!` invocation.
+struct MatchToken {
+ // Identifier of the value being matched on.
+ ident: syn::Ident,
+ // The arms, in the order they were parsed.
+ arms: Vec<MatchTokenArm>,
+}
+
+/// One arm of a `match_token!` invocation: `[name @] lhs => rhs`.
+struct MatchTokenArm {
+ // The optional `name @` binding in front of the left-hand side.
+ binding: Option<syn::Ident>,
+ lhs: LHS,
+ rhs: RHS,
+}
+
+/// Left-hand side of an arm.
+enum LHS {
+ // A sequence of tag patterns such as `<html> </body>` or `<_>`.
+ Tags(Vec<Tag>),
+ // An ordinary Rust pattern.
+ Pattern(syn::Pat),
+}
+
+/// Right-hand side of an arm.
+enum RHS {
+ // An expression (a braced block is stored as a block expression).
+ Expression(syn::Expr),
+ // The keyword `else`.
+ Else,
+}
+
+/// Whether a tag pattern was written as `<...>` (start) or `</...>` (end).
+#[derive(PartialEq, Eq, Hash, Clone)]
+enum TagKind {
+ StartTag,
+ EndTag,
+}
+
+/// A single tag pattern such as `<html>`, `</body>` or `<_>`.
+// Option is None if wildcard
+#[derive(PartialEq, Eq, Hash, Clone)]
+pub struct Tag {
+ kind: TagKind,
+ // `None` when the tag was written with the wildcard name `_`.
+ name: Option<syn::Ident>,
+}
+
+/// Parses one tag pattern: `<name>`, `</name>`, `<_>` or `</_>`.
+impl Parse for Tag {
+    fn parse(input: ParseStream) -> Result<Self> {
+        input.parse::<Token![<]>()?;
+        // A `/` directly after `<` marks an end tag.
+        let is_end_tag = input.parse::<Option<Token![/]>>()?.is_some();
+        let ident = input.call(syn::Ident::parse_any)?;
+        input.parse::<Token![>]>()?;
+
+        let kind = if is_end_tag {
+            TagKind::EndTag
+        } else {
+            TagKind::StartTag
+        };
+        // The wildcard name `_` is represented as `None`.
+        let name = if ident == "_" { None } else { Some(ident) };
+        Ok(Tag { kind, name })
+    }
+}
+
+/// Parses the left-hand side of an arm: tag patterns when the input starts
+/// with `<`, an ordinary Rust pattern otherwise.
+impl Parse for LHS {
+    fn parse(input: ParseStream) -> Result<Self> {
+        if !input.peek(Token![<]) {
+            return input.parse::<syn::Pat>().map(LHS::Pattern);
+        }
+
+        // Collect tags until the `=>` that ends the left-hand side.
+        let mut tags = Vec::new();
+        while !input.peek(Token![=>]) {
+            tags.push(input.parse::<Tag>()?);
+        }
+        Ok(LHS::Tags(tags))
+    }
+}
+
+/// Parses one arm: `[name @] lhs => rhs`, where `rhs` is a braced block, the
+/// keyword `else`, or any other expression.
+///
+/// Every parse failure is returned as a `syn` error rather than causing a
+/// panic inside the parser.
+impl Parse for MatchTokenArm {
+    fn parse(input: ParseStream) -> Result<Self> {
+        // An `@` as the second token means the arm starts with `name @`.
+        let binding = if input.peek2(Token![@]) {
+            let binding = input.parse::<syn::Ident>()?;
+            input.parse::<Token![@]>()?;
+            Some(binding)
+        } else {
+            None
+        };
+        let lhs = input.parse::<LHS>()?;
+        input.parse::<Token![=>]>()?;
+        let rhs = if input.peek(syn::token::Brace) {
+            // Braced body: the trailing comma is optional.
+            let block = input.parse::<syn::Block>()?;
+            let block = syn::ExprBlock {
+                attrs: vec![],
+                label: None,
+                block,
+            };
+            input.parse::<Option<Token![,]>>()?;
+            RHS::Expression(syn::Expr::Block(block))
+        } else if input.peek(Token![else]) {
+            // `else` must be followed by a comma.
+            input.parse::<Token![else]>()?;
+            input.parse::<Token![,]>()?;
+            RHS::Else
+        } else {
+            let expr = input.parse::<syn::Expr>()?;
+            input.parse::<Option<Token![,]>>()?;
+            RHS::Expression(expr)
+        };
+
+        Ok(MatchTokenArm { binding, lhs, rhs })
+    }
+}
+
+/// Parses `ident { arm* }`, the token stream inside `match_token!(...)`.
+impl Parse for MatchToken {
+    fn parse(input: ParseStream) -> Result<Self> {
+        let ident: syn::Ident = input.parse()?;
+
+        let body;
+        braced!(body in input);
+
+        // Consume arms until the braced body is exhausted.
+        let mut arms = Vec::new();
+        while !body.is_empty() {
+            arms.push(body.parse::<MatchTokenArm>()?);
+        }
+
+        Ok(MatchToken { ident, arms })
+    }
+}
+
+/// Expands the token stream of one `match_token!` invocation into the
+/// expression it stands for.
+///
+/// Panics if the tokens are not a valid `match_token!` body or if the
+/// generated code does not parse as an expression.
+pub fn expand_match_token(body: &TokenStream) -> syn::Expr {
+    let parsed: MatchToken = syn::parse2(body.clone()).unwrap();
+    let expansion = expand_match_token_macro(parsed);
+    syn::parse2(expansion).unwrap()
+}
+
+/// Builds the `match` expression that a parsed `match_token!` invocation
+/// expands to.
+///
+/// All arms but the last are translated in order: ordinary patterns and
+/// specific-tag arms are emitted directly, wildcard-tag arms are collected for
+/// the generated catch-all arm, and tags with an `else` right-hand side become
+/// patterns that switch the wildcard arms off.
+///
+/// # Panics
+///
+/// Panics if there are no arms, or if the arms violate one of the restrictions
+/// checked below (duplicate tags, patterns after wildcard tags, misuse of
+/// `else`, an invalid last arm).
+fn expand_match_token_macro(match_token: MatchToken) -> TokenStream {
+ let mut arms = match_token.arms;
+ let to_be_matched = match_token.ident;
+ // Handle the last arm specially at the end.
+ let last_arm = arms.pop().unwrap();
+
+ // Tags we've seen, used for detecting duplicates.
+ let mut seen_tags: HashSet<Tag> = HashSet::new();
+
+ // Case arms for wildcard matching. We collect these and
+ // emit them later.
+ let mut wildcards_patterns: Vec<TokenStream> = Vec::new();
+ let mut wildcards_expressions: Vec<syn::Expr> = Vec::new();
+
+ // Tags excluded (by an 'else' RHS) from wildcard matching.
+ let mut wild_excluded_patterns: Vec<TokenStream> = Vec::new();
+
+ // Token fragments of the directly emitted arms, concatenated in order.
+ let mut arms_code = Vec::new();
+
+ for MatchTokenArm { binding, lhs, rhs } in arms {
+ // Build Rust syntax for the `name @` binding, if any.
+ let binding = match binding {
+ Some(ident) => quote!(#ident @),
+ None => quote!(),
+ };
+
+ match (lhs, rhs) {
+ (LHS::Pattern(_), RHS::Else) => {
+ panic!("'else' may not appear with an ordinary pattern")
+ },
+
+ // ordinary pattern => expression
+ (LHS::Pattern(pat), RHS::Expression(expr)) => {
+ if !wildcards_patterns.is_empty() {
+ panic!(
+ "ordinary patterns may not appear after wildcard tags {:?} {:?}",
+ pat, expr
+ );
+ }
+ arms_code.push(quote!(#binding #pat => #expr,))
+ },
+
+ // <tag> <tag> ... => else
+ (LHS::Tags(tags), RHS::Else) => {
+ for tag in tags {
+ if !seen_tags.insert(tag.clone()) {
+ panic!("duplicate tag");
+ }
+ if tag.name.is_none() {
+ panic!("'else' may not appear with a wildcard tag");
+ }
+ // The exclusion pattern is built without the arm's binding.
+ wild_excluded_patterns.push(make_tag_pattern(&TokenStream::new(), tag));
+ }
+ },
+
+ // <_> => expression
+ // <tag> <tag> ... => expression
+ (LHS::Tags(tags), RHS::Expression(expr)) => {
+ // Is this arm a tag wildcard?
+ // `None` if we haven't processed the first tag yet.
+ let mut wildcard = None;
+ for tag in tags {
+ if !seen_tags.insert(tag.clone()) {
+ panic!("duplicate tag");
+ }
+
+ match tag.name {
+ // <tag>
+ Some(_) => {
+ if !wildcards_patterns.is_empty() {
+ panic!("specific tags may not appear after wildcard tags");
+ }
+
+ if wildcard == Some(true) {
+ panic!("wildcard tags must appear alone");
+ }
+
+ if wildcard.is_some() {
+ // Push the delimiter `|` if it's not the first tag.
+ arms_code.push(quote!( | ))
+ }
+ arms_code.push(make_tag_pattern(&binding, tag));
+
+ wildcard = Some(false);
+ },
+
+ // <_>
+ None => {
+ if wildcard.is_some() {
+ panic!("wildcard tags must appear alone");
+ }
+ wildcard = Some(true);
+ wildcards_patterns.push(make_tag_pattern(&binding, tag));
+ wildcards_expressions.push(expr.clone());
+ },
+ }
+ }
+
+ match wildcard {
+ None => panic!("[internal macro error] tag arm with no tags"),
+ Some(false) => arms_code.push(quote!( => #expr,)),
+ Some(true) => {}, // codegen for wildcards is deferred
+ }
+ },
+ }
+ }
+
+ // Time to process the last, catch-all arm. We will generate something like
+ //
+ // last_arm_token => {
+ // let enable_wildcards = match last_arm_token {
+ // TagToken(Tag { kind: EndTag, name: local_name!("body"), .. }) => false,
+ // TagToken(Tag { kind: EndTag, name: local_name!("html"), .. }) => false,
+ // // ...
+ // _ => true,
+ // };
+ //
+ // match (enable_wildcards, last_arm_token) {
+ // (true, TagToken(name @ Tag { kind: StartTag, .. }))
+ // => ..., // wildcard action for start tags
+ //
+ // (true, TagToken(name @ Tag { kind: EndTag, .. }))
+ // => ..., // wildcard action for end tags
+ //
+ // (_, token) => ... // using the pattern from that last arm
+ // }
+ // }
+
+ let MatchTokenArm { binding, lhs, rhs } = last_arm;
+
+ // The last arm must be a plain `pattern => expression` without a binding.
+ let (last_pat, last_expr) = match (binding, lhs, rhs) {
+ (Some(_), _, _) => panic!("the last arm cannot have an @-binding"),
+ (None, LHS::Tags(_), _) => panic!("the last arm cannot have tag patterns"),
+ (None, _, RHS::Else) => panic!("the last arm cannot use 'else'"),
+ (None, LHS::Pattern(p), RHS::Expression(e)) => (p, e),
+ };
+
+ quote! {
+ match #to_be_matched {
+ #(
+ #arms_code
+ )*
+ last_arm_token => {
+ let enable_wildcards = match last_arm_token {
+ #(
+ #wild_excluded_patterns => false,
+ )*
+ _ => true,
+ };
+ match (enable_wildcards, last_arm_token) {
+ #(
+ (true, #wildcards_patterns) => #wildcards_expressions,
+ )*
+ (_, #last_pat) => #last_expr,
+ }
+ }
+ }
+ }
+}
+
+/// Replaces `match_token!` invocations, in statement and in expression
+/// position, with their expansion; everything else is folded by the default
+/// `syn::fold` functions.
+impl Fold for MatchTokenParser {
+    fn fold_stmt(&mut self, stmt: syn::Stmt) -> syn::Stmt {
+        if let syn::Stmt::Item(syn::Item::Macro(syn::ItemMacro { ref mac, .. })) = stmt {
+            if mac.path == parse_quote!(match_token) {
+                // The expansion itself is folded as well.
+                let expanded = syn::Stmt::Expr(expand_match_token(&mac.tokens));
+                return syn::fold::fold_stmt(self, expanded);
+            }
+        }
+
+        syn::fold::fold_stmt(self, stmt)
+    }
+
+    fn fold_expr(&mut self, expr: syn::Expr) -> syn::Expr {
+        if let syn::Expr::Macro(syn::ExprMacro { ref mac, .. }) = expr {
+            if mac.path == parse_quote!(match_token) {
+                // The expansion itself is folded as well.
+                let expanded = expand_match_token(&mac.tokens);
+                return syn::fold::fold_expr(self, expanded);
+            }
+        }
+
+        syn::fold::fold_expr(self, expr)
+    }
+}
+
+/// Builds the pattern `TagToken(<binding> Tag { kind: ..., [name: ...,] .. })`
+/// for one tag; the `name` field is left out for a wildcard tag.
+fn make_tag_pattern(binding: &TokenStream, tag: Tag) -> TokenStream {
+    let Tag { kind, name } = tag;
+
+    let kind = match kind {
+        TagKind::StartTag => quote!(crate::tokenizer::StartTag),
+        TagKind::EndTag => quote!(crate::tokenizer::EndTag),
+    };
+
+    let name_field = match name {
+        Some(ident) => {
+            // Interpolated as a string literal inside `local_name!(...)`.
+            let tag_name = ident.to_string();
+            quote!(name: local_name!(#tag_name),)
+        },
+        None => quote!(),
+    };
+
+    quote! {
+        crate::tree_builder::types::TagToken(#binding crate::tokenizer::Tag { kind: #kind, #name_field .. })
+    }
+}
diff --git a/src/driver.rs b/src/driver.rs
new file mode 100644
index 0000000..26db9b8
--- /dev/null
+++ b/src/driver.rs
@@ -0,0 +1,137 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! High-level interface to the parser.
+
+use crate::buffer_queue::BufferQueue;
+use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
+use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
+use crate::{Attribute, QualName};
+
+use std::borrow::Cow;
+
+use crate::tendril;
+use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
+use crate::tendril::StrTendril;
+
+/// All-encompassing options struct for the parser.
+///
+/// Bundles the tokenizer options and the tree builder options; `Default`
+/// is derived, so `ParseOpts::default()` uses the default of each part.
+#[derive(Clone, Default)]
+pub struct ParseOpts {
+ /// Tokenizer options.
+ pub tokenizer: TokenizerOpts,
+
+ /// Tree builder options.
+ pub tree_builder: TreeBuilderOpts,
+}
+
+/// Parse an HTML document.
+///
+/// Builds a tree builder around `sink` and a tokenizer around that tree
+/// builder, and returns them wrapped in a [`Parser`] with an empty input
+/// buffer.
+///
+/// The returned value implements `tendril::TendrilSink`
+/// so that Unicode input may be provided incrementally,
+/// or all at once with the `one` method.
+///
+/// If your input is bytes, use `Parser::from_utf8`.
+pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink>
+where
+    Sink: TreeSink,
+{
+    let ParseOpts {
+        tokenizer,
+        tree_builder,
+    } = opts;
+    let builder = TreeBuilder::new(sink, tree_builder);
+    Parser {
+        tokenizer: Tokenizer::new(builder, tokenizer),
+        input_buffer: BufferQueue::new(),
+    }
+}
+
+/// Parse an HTML fragment.
+///
+/// Creates the context element from `context_name` and `context_attrs` in
+/// `sink`, then delegates to `parse_fragment_for_element` without a form
+/// element.
+///
+/// The returned value implements `tendril::TendrilSink`
+/// so that Unicode input may be provided incrementally,
+/// or all at once with the `one` method.
+///
+/// If your input is bytes, use `Parser::from_utf8`.
+pub fn parse_fragment<Sink>(
+    mut sink: Sink,
+    opts: ParseOpts,
+    context_name: QualName,
+    context_attrs: Vec<Attribute>,
+) -> Parser<Sink>
+where
+    Sink: TreeSink,
+{
+    let context_element = create_element(&mut sink, context_name, context_attrs);
+    let form_element = None;
+    parse_fragment_for_element(sink, opts, context_element, form_element)
+}
+
+/// Like `parse_fragment`, but with an existing context element
+/// and optionally a form element.
+///
+/// The tokenizer's initial state is taken from the tree builder
+/// (`tokenizer_state_for_context_elem`); all other tokenizer options come
+/// from `opts.tokenizer`.
+pub fn parse_fragment_for_element<Sink>(
+    sink: Sink,
+    opts: ParseOpts,
+    context_element: Sink::Handle,
+    form_element: Option<Sink::Handle>,
+) -> Parser<Sink>
+where
+    Sink: TreeSink,
+{
+    let ParseOpts {
+        tokenizer,
+        tree_builder,
+    } = opts;
+    let builder =
+        TreeBuilder::new_for_fragment(sink, context_element, form_element, tree_builder);
+    let initial_state = Some(builder.tokenizer_state_for_context_elem());
+    let tokenizer_opts = TokenizerOpts {
+        initial_state,
+        ..tokenizer
+    };
+    Parser {
+        tokenizer: Tokenizer::new(builder, tokenizer_opts),
+        input_buffer: BufferQueue::new(),
+    }
+}
+
+/// An HTML parser,
+/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
+pub struct Parser<Sink>
+where
+ Sink: TreeSink,
+{
+ /// The tokenizer; its token sink is the tree builder that wraps `Sink`.
+ pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
+ /// Input that has been received and is handed to the tokenizer.
+ pub input_buffer: BufferQueue,
+}
+
+/// Feeds incoming string tendrils to the tokenizer and, on `finish`, returns
+/// the output of the underlying tree sink.
+impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
+ // Queues `t` and runs the tokenizer over the buffered input.
+ fn process(&mut self, t: StrTendril) {
+ self.input_buffer.push_back(t);
+ // FIXME: Properly support </script> somehow.
+ // Keep feeding for as long as the tokenizer reports `Script`; the
+ // script result itself is discarded.
+ while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
+ }
+
+ // FIXME: Is it too noisy to report every character decoding error?
+ // Forwards the error description to the tree sink as a parse error.
+ fn error(&mut self, desc: Cow<'static, str>) {
+ self.tokenizer.sink.sink.parse_error(desc)
+ }
+
+ type Output = Sink::Output;
+
+ // Feeds the remaining buffered input, signals the end of input to the
+ // tokenizer and returns the tree sink's result.
+ fn finish(mut self) -> Self::Output {
+ // FIXME: Properly support </script> somehow.
+ while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
+ // Panics if the tokenizer left input in the buffer.
+ assert!(self.input_buffer.is_empty());
+ self.tokenizer.end();
+ self.tokenizer.sink.sink.finish()
+ }
+}
+
+impl<Sink: TreeSink> Parser<Sink> {
+ /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
+ ///
+ /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
+ /// Decoding is lossy, like `String::from_utf8_lossy`.
+ // Clippy expects `from_*` methods not to take `self`; this one consumes
+ // the parser on purpose, hence the lint exemption.
+ #[allow(clippy::wrong_self_convention)]
+ pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
+ Utf8LossyDecoder::new(self)
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..65fadaa
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,30 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#![crate_name = "html5ever"]
+#![crate_type = "dylib"]
+#![cfg_attr(test, deny(warnings))]
+#![allow(unused_parens)]
+
+pub use driver::{parse_document, parse_fragment, ParseOpts, Parser};
+pub use markup5ever::*;
+
+pub use serialize::serialize;
+
+#[macro_use]
+mod macros;
+
+mod util {
+ pub mod str;
+}
+
+pub mod driver;
+pub mod serialize;
+pub mod tokenizer;
+pub mod tree_builder;
diff --git a/src/macros.rs b/src/macros.rs
new file mode 100644
index 0000000..558a4a9
--- /dev/null
+++ b/src/macros.rs
@@ -0,0 +1,33 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+/// Evaluates to the value inside `Some`, or to `$else_block` when `$opt` is
+/// `None`. Because the block is expanded in place, it may `return` from the
+/// enclosing function.
+macro_rules! unwrap_or_else {
+ ($opt:expr, $else_block:block) => {
+ match $opt {
+ None => $else_block,
+ Some(x) => x,
+ }
+ };
+}
+
+/// Evaluates to the value inside `Some`; when `$opt` is `None`, returns
+/// `$retval` from the enclosing function instead.
+macro_rules! unwrap_or_return {
+ ($opt:expr, $retval:expr) => {
+ unwrap_or_else!($opt, { return $retval })
+ };
+}
+
+/// Evaluates `$e` and yields `(result, dt)`, where `dt` is the time the
+/// evaluation took according to `std::time::Instant`, in nanoseconds (`u64`).
+macro_rules! time {
+ ($e:expr) => {{
+ let now = ::std::time::Instant::now();
+ let result = $e;
+ let d = now.elapsed();
+ // Whole seconds converted to nanoseconds plus the sub-second remainder.
+ let dt = d.as_secs() * 1_000_000_000 + u64::from(d.subsec_nanos());
+ (result, dt)
+ }};
+}
diff --git a/src/serialize/mod.rs b/src/serialize/mod.rs
new file mode 100644
index 0000000..3a57b47
--- /dev/null
+++ b/src/serialize/mod.rs
@@ -0,0 +1,256 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use log::warn;
+pub use markup5ever::serialize::{AttrRef, Serialize, Serializer, TraversalScope};
+use markup5ever::{local_name, namespace_url, ns};
+use std::default::Default;
+use std::io::{self, Write};
+
+use crate::{LocalName, QualName};
+
+/// Serializes `node` as HTML into `writer`.
+///
+/// A fresh `HtmlSerializer` is created from `opts`, and `node` is asked to
+/// serialize itself into it using `opts.traversal_scope`.
+pub fn serialize<Wr, T>(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()>
+where
+    Wr: Write,
+    T: Serialize,
+{
+    let scope = opts.traversal_scope.clone();
+    let mut serializer = HtmlSerializer::new(writer, opts);
+    node.serialize(&mut serializer, scope)
+}
+
+/// Options for `serialize` and `HtmlSerializer`.
+#[derive(Clone)]
+pub struct SerializeOpts {
+ /// Is scripting enabled?
+ ///
+ /// When `true`, text inside `<noscript>` is written without escaping.
+ pub scripting_enabled: bool,
+
+ /// Serialize the root node? Default: ChildrenOnly
+ pub traversal_scope: TraversalScope,
+
+ /// If the serializer is asked to serialize an invalid tree, the default
+ /// behavior is to panic in the event that an `end_elem` is created without a
+ /// matching `start_elem`. Setting this to true will prevent those panics by
+ /// creating a default parent on the element stack. No extra start elem will
+ /// actually be written. Default: false
+ pub create_missing_parent: bool,
+}
+
+/// Defaults: scripting enabled, children-only traversal, and no automatic
+/// creation of a missing parent.
+impl Default for SerializeOpts {
+    fn default() -> Self {
+        let traversal_scope = TraversalScope::ChildrenOnly(None);
+        Self {
+            scripting_enabled: true,
+            traversal_scope,
+            create_missing_parent: false,
+        }
+    }
+}
+
+/// Per-element state kept on the serializer's stack.
+#[derive(Default)]
+struct ElemInfo {
+ // Local name of the element, set only for elements in the HTML namespace
+ // (or from the `ChildrenOnly(Some(..))` traversal scope for the root entry).
+ html_name: Option<LocalName>,
+ // When set, child elements and this element's end tag are not written.
+ ignore_children: bool
+}
+
+/// `Serializer` implementation that writes HTML text to `writer`.
+pub struct HtmlSerializer<Wr: Write> {
+ /// Destination of the serialized output.
+ pub writer: Wr,
+ opts: SerializeOpts,
+ // One entry per open element; the last entry is the current parent.
+ stack: Vec<ElemInfo>,
+}
+
+/// Returns the local name of `name`, logging a warning when its namespace is
+/// not HTML, MathML or SVG.
+fn tagname(name: &QualName) -> LocalName {
+    let expected_namespace = match name.ns {
+        ns!(html) | ns!(mathml) | ns!(svg) => true,
+        _ => false,
+    };
+    if !expected_namespace {
+        // FIXME(#122)
+        warn!("node with weird namespace {:?}", name.ns);
+    }
+
+    name.local.clone()
+}
+
+impl<Wr: Write> HtmlSerializer<Wr> {
+    /// Creates a serializer writing to `writer`.
+    ///
+    /// The element stack starts with one entry; it carries the tag name of the
+    /// traversal root when the scope is `ChildrenOnly(Some(..))`.
+    pub fn new(writer: Wr, opts: SerializeOpts) -> Self {
+        let html_name = if let TraversalScope::ChildrenOnly(Some(ref root)) = opts.traversal_scope
+        {
+            Some(tagname(root))
+        } else {
+            None
+        };
+        let root_info = ElemInfo {
+            html_name,
+            ignore_children: false,
+        };
+        HtmlSerializer {
+            writer,
+            opts,
+            stack: vec![root_info],
+        }
+    }
+
+    /// Returns the innermost open element.
+    ///
+    /// With an empty stack this pushes a default entry when
+    /// `create_missing_parent` is set, and panics otherwise.
+    fn parent(&mut self) -> &mut ElemInfo {
+        if self.stack.is_empty() {
+            if !self.opts.create_missing_parent {
+                panic!("no parent ElemInfo")
+            }
+            warn!("ElemInfo stack empty, creating new parent");
+            self.stack.push(ElemInfo::default());
+        }
+        self.stack.last_mut().unwrap()
+    }
+
+    /// Writes `text` with `&` and U+00A0 escaped, plus `"` in attribute mode
+    /// or `<` and `>` outside of it.
+    fn write_escaped(&mut self, text: &str, attr_mode: bool) -> io::Result<()> {
+        for ch in text.chars() {
+            let entity: &[u8] = match ch {
+                '&' => b"&amp;",
+                '\u{00A0}' => b"&nbsp;",
+                '"' if attr_mode => b"&quot;",
+                '<' if !attr_mode => b"&lt;",
+                '>' if !attr_mode => b"&gt;",
+                _ => {
+                    // Every other character is written unchanged.
+                    self.writer.write_fmt(format_args!("{}", ch))?;
+                    continue;
+                },
+            };
+            self.writer.write_all(entity)?;
+        }
+        Ok(())
+    }
+}
+
+/// HTML output for each serializer event.
+impl<Wr: Write> Serializer for HtmlSerializer<Wr> {
+ // Writes the start tag `<name attr="value" ...>` and pushes an `ElemInfo`
+ // for the element. Nothing is written when the parent ignores children.
+ fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()>
+ where
+ AttrIter: Iterator<Item = AttrRef<'a>>,
+ {
+ // Only elements in the HTML namespace record their name.
+ let html_name = match name.ns {
+ ns!(html) => Some(name.local.clone()),
+ _ => None,
+ };
+
+ // Still push an entry so that the matching `end_elem` pops it again.
+ if self.parent().ignore_children {
+ self.stack.push(ElemInfo {
+ html_name,
+ ignore_children: true,
+ });
+ return Ok(());
+ }
+
+ self.writer.write_all(b"<")?;
+ self.writer.write_all(tagname(&name).as_bytes())?;
+ for (name, value) in attrs {
+ self.writer.write_all(b" ")?;
+
+ // Write the prefix that corresponds to the attribute's namespace.
+ match name.ns {
+ ns!() => (),
+ ns!(xml) => self.writer.write_all(b"xml:")?,
+ ns!(xmlns) => {
+ // A bare `xmlns` attribute gets no prefix.
+ if name.local != local_name!("xmlns") {
+ self.writer.write_all(b"xmlns:")?;
+ }
+ },
+ ns!(xlink) => self.writer.write_all(b"xlink:")?,
+ ref ns => {
+ // FIXME(#122)
+ warn!("attr with weird namespace {:?}", ns);
+ self.writer.write_all(b"unknown_namespace:")?;
+ },
+ }
+
+ self.writer.write_all(name.local.as_bytes())?;
+ self.writer.write_all(b"=\"")?;
+ self.write_escaped(value, true)?;
+ self.writer.write_all(b"\"")?;
+ }
+ self.writer.write_all(b">")?;
+
+ // For these HTML elements, child elements and the end tag are not
+ // written.
+ let ignore_children = name.ns == ns!(html) &&
+ match name.local {
+ local_name!("area") |
+ local_name!("base") |
+ local_name!("basefont") |
+ local_name!("bgsound") |
+ local_name!("br") |
+ local_name!("col") |
+ local_name!("embed") |
+ local_name!("frame") |
+ local_name!("hr") |
+ local_name!("img") |
+ local_name!("input") |
+ local_name!("keygen") |
+ local_name!("link") |
+ local_name!("meta") |
+ local_name!("param") |
+ local_name!("source") |
+ local_name!("track") |
+ local_name!("wbr") => true,
+ _ => false,
+ };
+
+ self.stack.push(ElemInfo {
+ html_name,
+ ignore_children,
+ });
+
+ Ok(())
+ }
+
+ // Pops the element's `ElemInfo` and writes `</name>`, unless the popped
+ // entry has `ignore_children` set. Panics on an empty stack unless
+ // `create_missing_parent` is set.
+ fn end_elem(&mut self, name: QualName) -> io::Result<()> {
+ let info = match self.stack.pop() {
+ Some(info) => info,
+ None if self.opts.create_missing_parent => {
+ warn!("missing ElemInfo, creating default.");
+ Default::default()
+ },
+ _ => panic!("no ElemInfo"),
+ };
+ if info.ignore_children {
+ return Ok(());
+ }
+
+ self.writer.write_all(b"</")?;
+ self.writer.write_all(tagname(&name).as_bytes())?;
+ self.writer.write_all(b">")
+ }
+
+ // Writes a text node. Text is written unescaped inside the HTML elements
+ // listed below, and inside `noscript` when scripting is enabled.
+ fn write_text(&mut self, text: &str) -> io::Result<()> {
+ let escape = match self.parent().html_name {
+ Some(local_name!("style")) |
+ Some(local_name!("script")) |
+ Some(local_name!("xmp")) |
+ Some(local_name!("iframe")) |
+ Some(local_name!("noembed")) |
+ Some(local_name!("noframes")) |
+ Some(local_name!("plaintext")) => false,
+
+ Some(local_name!("noscript")) => !self.opts.scripting_enabled,
+
+ _ => true,
+ };
+
+ if escape {
+ self.write_escaped(text, false)
+ } else {
+ self.writer.write_all(text.as_bytes())
+ }
+ }
+
+ // Writes `<!--text-->`; `text` is not escaped.
+ fn write_comment(&mut self, text: &str) -> io::Result<()> {
+ self.writer.write_all(b"<!--")?;
+ self.writer.write_all(text.as_bytes())?;
+ self.writer.write_all(b"-->")
+ }
+
+ // Writes `<!DOCTYPE name>`.
+ fn write_doctype(&mut self, name: &str) -> io::Result<()> {
+ self.writer.write_all(b"<!DOCTYPE ")?;
+ self.writer.write_all(name.as_bytes())?;
+ self.writer.write_all(b">")
+ }
+
+ // Writes `<?target data>`; neither part is escaped.
+ fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> {
+ self.writer.write_all(b"<?")?;
+ self.writer.write_all(target.as_bytes())?;
+ self.writer.write_all(b" ")?;
+ self.writer.write_all(data.as_bytes())?;
+ self.writer.write_all(b">")
+ }
+}
diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs
new file mode 100644
index 0000000..a52485d
--- /dev/null
+++ b/src/tokenizer/char_ref/mod.rs
@@ -0,0 +1,449 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use super::{TokenSink, Tokenizer};
+use crate::buffer_queue::BufferQueue;
+use crate::data;
+use crate::tendril::StrTendril;
+use crate::util::str::is_ascii_alnum;
+
+use log::debug;
+use mac::format_if;
+use std::borrow::Cow::Borrowed;
+use std::char::from_u32;
+
+use self::State::*;
+pub use self::Status::*;
+
+//§ tokenizing-character-references
+/// Result of tokenizing one character reference: zero, one or two characters.
+pub struct CharRef {
+ /// The resulting character(s)
+ pub chars: [char; 2],
+
+ /// How many slots in `chars` are valid?
+ pub num_chars: u8,
+}
+
+/// Outcome of a single `CharRefTokenizer::step` call.
+pub enum Status {
+ // No input character was available.
+ Stuck,
+ // The tokenizer advanced but has no result yet.
+ Progress,
+ // A result is available via `get_result`.
+ Done,
+}
+
+/// Internal state of the character reference tokenizer; `step` dispatches on
+/// it.
+#[derive(Debug)]
+enum State {
+ Begin,
+ // Entered after a `#` has been seen.
+ Octothorpe,
+ Numeric(u32), // base
+ NumericSemicolon,
+ Named,
+ BogusName,
+}
+
+/// Incremental tokenizer for a single character reference.
+pub struct CharRefTokenizer {
+ state: State,
+ // Extra character that ends the reference immediately (see `new`).
+ addnl_allowed: Option<char>,
+ // Set once tokenizing has finished.
+ result: Option<CharRef>,
+
+ // Numeric references: value accumulated so far.
+ num: u32,
+ // Set when the accumulated value exceeded 0x10FFFF.
+ num_too_big: bool,
+ // Whether at least one digit has been consumed.
+ seen_digit: bool,
+ // The `x` or `X` that introduced a hexadecimal reference, if any.
+ hex_marker: Option<char>,
+
+ // Named references: characters consumed so far.
+ name_buf_opt: Option<StrTendril>,
+ // Most recent full match found in the named-entity table.
+ name_match: Option<(u32, u32)>,
+ // Length of `name_buf` at the time of that match.
+ name_len: usize,
+}
+
+impl CharRefTokenizer {
+    /// Creates a tokenizer in the `Begin` state with no result.
+    // NB: We assume that we have an additional allowed character iff we're
+    // tokenizing in an attribute value.
+    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
+        CharRefTokenizer {
+            addnl_allowed,
+            state: Begin,
+            result: None,
+            // Numeric reference bookkeeping.
+            num: 0,
+            num_too_big: false,
+            seen_digit: false,
+            hex_marker: None,
+            // Named reference bookkeeping.
+            name_buf_opt: None,
+            name_match: None,
+            name_len: 0,
+        }
+    }
+
+    // A CharRefTokenizer can only tokenize one character reference,
+    // so this method consumes the tokenizer.
+    /// Returns the result; panics if tokenizing has not finished.
+    pub fn get_result(self) -> CharRef {
+        match self.result {
+            Some(char_ref) => char_ref,
+            None => panic!("get_result called before done"),
+        }
+    }
+
+    /// Shared access to the name buffer; panics if it has not been created.
+    fn name_buf(&self) -> &StrTendril {
+        match self.name_buf_opt {
+            Some(ref buf) => buf,
+            None => panic!("name_buf missing in named character reference"),
+        }
+    }
+
+    /// Mutable access to the name buffer; panics if it has not been created.
+    fn name_buf_mut(&mut self) -> &mut StrTendril {
+        match self.name_buf_opt {
+            Some(ref mut buf) => buf,
+            None => panic!("name_buf missing in named character reference"),
+        }
+    }
+
+    /// Stores the final result and reports `Done`.
+    fn finish_with(&mut self, chars: [char; 2], num_chars: u8) -> Status {
+        self.result = Some(CharRef { chars, num_chars });
+        Done
+    }
+
+    /// Finishes with an empty result (no character reference).
+    fn finish_none(&mut self) -> Status {
+        self.finish_with(['\0', '\0'], 0)
+    }
+
+    /// Finishes with the single character `c`.
+    fn finish_one(&mut self, c: char) -> Status {
+        self.finish_with([c, '\0'], 1)
+    }
+}
+
+impl CharRefTokenizer {
+ /// Runs one step of the state machine.
+ ///
+ /// Returns `Done` immediately once a result exists; otherwise dispatches
+ /// on the current state and returns that handler's status.
+ pub fn step<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ if self.result.is_some() {
+ return Done;
+ }
+
+ debug!("char ref tokenizer stepping in state {:?}", self.state);
+ match self.state {
+ Begin => self.do_begin(tokenizer, input),
+ Octothorpe => self.do_octothorpe(tokenizer, input),
+ Numeric(base) => self.do_numeric(tokenizer, input, base),
+ NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
+ Named => self.do_named(tokenizer, input),
+ BogusName => self.do_bogus_name(tokenizer, input),
+ }
+ }
+
+ // `Begin` state: decide from the next character (peeked, not consumed)
+ // whether there is no reference, a numeric one, or a named one.
+ // Returns `Stuck` when no character is available.
+ fn do_begin<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ match unwrap_or_return!(tokenizer.peek(input), Stuck) {
+ // Whitespace, `<`, `&` or the additional allowed character: finish
+ // with an empty result.
+ '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
+ c if Some(c) == self.addnl_allowed => self.finish_none(),
+
+ '#' => {
+ tokenizer.discard_char(input);
+ self.state = Octothorpe;
+ Progress
+ },
+
+ // Anything else starts a named reference; the character itself is
+ // left in the input.
+ _ => {
+ self.state = Named;
+ self.name_buf_opt = Some(StrTendril::new());
+ Progress
+ },
+ }
+ }
+
+ // `Octothorpe` state: an `x` or `X` selects hexadecimal (and is consumed
+ // and remembered); any other character selects decimal and stays in the
+ // input. Returns `Stuck` when no character is available.
+ fn do_octothorpe<Sink: TokenSink>(
+     &mut self,
+     tokenizer: &mut Tokenizer<Sink>,
+     input: &mut BufferQueue,
+ ) -> Status {
+     let next = unwrap_or_return!(tokenizer.peek(input), Stuck);
+     if next == 'x' || next == 'X' {
+         tokenizer.discard_char(input);
+         self.hex_marker = Some(next);
+         self.state = Numeric(16);
+     } else {
+         self.hex_marker = None;
+         self.state = Numeric(10);
+     }
+     Progress
+ }
+
+ // `Numeric(base)` state: consume one digit of the given base and fold it
+ // into `self.num`. A non-digit either ends the number (if a digit was
+ // seen) or aborts the reference. Returns `Stuck` when no character is
+ // available.
+ fn do_numeric<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ base: u32,
+ ) -> Status {
+ let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+ match c.to_digit(base) {
+ Some(n) => {
+ tokenizer.discard_char(input);
+ // Wrapping arithmetic: overflow is tolerated because
+ // `num_too_big` records that the value is unusable.
+ self.num = self.num.wrapping_mul(base);
+ if self.num > 0x10FFFF {
+ // We might overflow, and the character is definitely invalid.
+ // We still parse digits and semicolon, but don't use the result.
+ self.num_too_big = true;
+ }
+ self.num = self.num.wrapping_add(n);
+ self.seen_digit = true;
+ Progress
+ },
+
+ // No digit at all: put back what was consumed and give up.
+ None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
+
+ // End of the digits; the non-digit is left in the input.
+ None => {
+ self.state = NumericSemicolon;
+ Progress
+ },
+ }
+ }
+
+ // `NumericSemicolon` state: consume a terminating `;` if present,
+ // otherwise report a parse error, then produce the numeric result.
+ // Returns `Stuck` when no character is available.
+ fn do_numeric_semicolon<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) -> Status {
+ match unwrap_or_return!(tokenizer.peek(input), Stuck) {
+ ';' => tokenizer.discard_char(input),
+ _ => tokenizer.emit_error(Borrowed(
+ "Semicolon missing after numeric character reference",
+ )),
+ };
+ self.finish_numeric(tokenizer)
+ }
+
+ // Gives up on a numeric reference that has no digits: pushes the consumed
+ // `#` (and the hex marker, if any) back onto the front of the input,
+ // reports a parse error and finishes with an empty result.
+ fn unconsume_numeric<Sink: TokenSink>(
+     &mut self,
+     tokenizer: &mut Tokenizer<Sink>,
+     input: &mut BufferQueue,
+ ) -> Status {
+     let mut consumed = StrTendril::from_char('#');
+     if let Some(marker) = self.hex_marker {
+         consumed.push_char(marker);
+     }
+
+     input.push_front(consumed);
+     tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
+     self.finish_none()
+ }
+
+ // Maps the accumulated number to the resulting character, reports a parse
+ // error for the values flagged below, and finishes with that character.
+ fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
+ fn conv(n: u32) -> char {
+ from_u32(n).expect("invalid char missed by error handling cases")
+ }
+
+ // `error` marks values that produce a parse error.
+ let (c, error) = match self.num {
+ // Out of range, zero, or a surrogate: U+FFFD.
+ n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
+ 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
+
+ // 0x80..=0x9F: use the replacement table entry where there is one.
+ 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
+ Some(c) => (c, true),
+ None => (conv(self.num), true),
+ },
+
+ // These values are kept as they are but still reported.
+ 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
+
+ // Values whose low 16 bits are 0xFFFE or 0xFFFF.
+ n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
+
+ n => (conv(n), false),
+ };
+
+ if error {
+ let msg = format_if!(
+ tokenizer.opts.exact_errors,
+ "Invalid numeric character reference",
+ "Invalid numeric character reference value 0x{:06X}",
+ self.num
+ );
+ tokenizer.emit_error(msg);
+ }
+
+ self.finish_one(c)
+ }
+
+ /// Consume one more character of a named character reference and look the
+ /// name read so far up in `data::NAMED_ENTITIES`.
+ ///
+ /// Returns `Stuck` when no character is available, `Progress` while the
+ /// name is still a match or a prefix of one, and otherwise whatever
+ /// `finish_named` returns.
+ fn do_named<Sink: TokenSink>(
+     &mut self,
+     tokenizer: &mut Tokenizer<Sink>,
+     input: &mut BufferQueue,
+ ) -> Status {
+     let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+     self.name_buf_mut().push_char(c);
+
+     let found = match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
+         // We have either a full match or a prefix of one.
+         Some(&m) => m,
+         // Can't continue the match.
+         None => return self.finish_named(tokenizer, input, Some(c)),
+     };
+
+     if found.0 != 0 {
+         // We have a full match, but there might be a longer one to come.
+         self.name_match = Some(found);
+         self.name_len = self.name_buf().len();
+     }
+     // Otherwise we just have a prefix match.
+     Progress
+ }
+
+ /// Report a parse error for the name currently held in the name buffer.
+ /// The name is only included in the message when `exact_errors` is set.
+ fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
+     tokenizer.emit_error(format_if!(
+         tokenizer.opts.exact_errors,
+         "Invalid character reference",
+         "Invalid character reference &{}",
+         self.name_buf()
+     ));
+ }
+
+ /// Take the name buffer and push all of it back onto the front of `input`.
+ ///
+ /// Panics if the name buffer has already been taken.
+ fn unconsume_name(&mut self, input: &mut BufferQueue) {
+     let consumed = self.name_buf_opt.take().unwrap();
+     input.push_front(consumed);
+ }
+
+ /// Finish a named character reference once the name cannot be extended.
+ ///
+ /// `end_char` is the character that ended the match, or `None` at EOF.
+ /// Without a recorded match this either switches to `BogusName` (returning
+ /// `Progress`) or unconsumes the whole name and finishes with no result.
+ /// With a match, the characters read past the match are pushed back onto
+ /// `input` and the result is stored, unless the attribute rule quoted
+ /// below requires unconsuming everything.
+ fn finish_named<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ end_char: Option<char>,
+ ) -> Status {
+ match self.name_match {
+ None => {
+ match end_char {
+ Some(c) if is_ascii_alnum(c) => {
+ // Keep looking for a semicolon, to determine whether
+ // we emit a parse error.
+ self.state = BogusName;
+ return Progress;
+ },
+
+ // Check length because &; is not a parse error.
+ Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
+
+ _ => (),
+ }
+ self.unconsume_name(input);
+ self.finish_none()
+ },
+
+ Some((c1, c2)) => {
+ // We have a complete match, but we may have consumed
+ // additional characters into self.name_buf. Usually
+ // at least one, but several in cases like
+ //
+ // &not => match for U+00AC
+ // &noti => valid prefix for &notin
+ // &notit => can't continue match
+
+ let name_len = self.name_len;
+ assert!(name_len > 0);
+ // Last character of the matched name itself.
+ let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
+
+ // There might not be a next character after the match, if
+ // we had a full match and then hit EOF.
+ let next_after = if name_len == self.name_buf().len() {
+ None
+ } else {
+ Some(self.name_buf()[name_len..].chars().next().unwrap())
+ };
+
+ // "If the character reference is being consumed as part of an
+ // attribute, and the last character matched is not a U+003B
+ // SEMICOLON character (;), and the next character is either a
+ // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
+ // character, then, for historical reasons, all the characters
+ // that were matched after the U+0026 AMPERSAND character (&)
+ // must be unconsumed, and nothing is returned. However, if
+ // this next character is in fact a U+003D EQUALS SIGN
+ // character (=), then this is a parse error"
+
+ // `addnl_allowed` being `Some` is what marks the attribute case here.
+ let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
+ (_, ';', _) => false,
+ (Some(_), _, Some('=')) => {
+ tokenizer.emit_error(Borrowed(
+ "Equals sign after character reference in attribute",
+ ));
+ true
+ },
+ (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
+ _ => {
+ tokenizer.emit_error(Borrowed(
+ "Character reference does not end with semicolon",
+ ));
+ false
+ },
+ };
+
+ if unconsume_all {
+ self.unconsume_name(input);
+ self.finish_none()
+ } else {
+ // Give back only what was read beyond the matched name.
+ input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+ self.result = Some(CharRef {
+ chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
+ // A zero second code point means a single-character result.
+ num_chars: if c2 == 0 { 1 } else { 2 },
+ });
+ Done
+ }
+ },
+ }
+ }
+
+ /// Keep consuming an unrecognised name while it stays ASCII alphanumeric.
+ /// On the first other character the name is unconsumed and the reference
+ /// finishes without a result; a terminating `;` also emits a parse error.
+ fn do_bogus_name<Sink: TokenSink>(
+     &mut self,
+     tokenizer: &mut Tokenizer<Sink>,
+     input: &mut BufferQueue,
+ ) -> Status {
+     let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+     self.name_buf_mut().push_char(c);
+
+     if is_ascii_alnum(c) {
+         return Progress;
+     }
+     if c == ';' {
+         self.emit_name_error(tokenizer);
+     }
+
+     self.unconsume_name(input);
+     self.finish_none()
+ }
+
+ /// Bring the character reference tokenizer to a conclusion at end of input.
+ ///
+ /// Loops until `self.result` is set, handling the current state on each
+ /// pass: pending text is pushed back onto `input` where the reference
+ /// cannot be completed, and parse errors are emitted through `tokenizer`.
+ pub fn end_of_file<Sink: TokenSink>(
+ &mut self,
+ tokenizer: &mut Tokenizer<Sink>,
+ input: &mut BufferQueue,
+ ) {
+ while self.result.is_none() {
+ match self.state {
+ Begin => drop(self.finish_none()),
+
+ // "&#" or "&#x" with no digits: give the characters back.
+ Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
+
+ Numeric(_) | NumericSemicolon => {
+ tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
+ self.finish_numeric(tokenizer);
+ },
+
+ // `None` tells `finish_named` that no character ended the name.
+ Named => drop(self.finish_named(tokenizer, input, None)),
+
+ BogusName => {
+ self.unconsume_name(input);
+ self.finish_none();
+ },
+
+ Octothorpe => {
+ input.push_front(StrTendril::from_slice("#"));
+ tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
+ self.finish_none();
+ },
+ }
+ }
+ }
+}
diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs
new file mode 100644
index 0000000..22d11be
--- /dev/null
+++ b/src/tokenizer/interface.rs
@@ -0,0 +1,110 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use crate::interface::Attribute;
+use crate::tendril::StrTendril;
+use crate::tokenizer::states;
+use crate::LocalName;
+use std::borrow::Cow;
+
+pub use self::TagKind::{EndTag, StartTag};
+pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
+pub use self::Token::{EOFToken, NullCharacterToken, ParseError};
+
+/// A `DOCTYPE` token.
+///
+/// The `Default` value is the empty doctype: no name, no identifiers, and
+/// `force_quirks` unset. `Doctype::new()` returns the same value.
+// FIXME: already exists in Servo DOM
+#[derive(PartialEq, Eq, Clone, Debug, Default)]
+pub struct Doctype {
+    /// The doctype name, if one was seen.
+    pub name: Option<StrTendril>,
+    /// The public identifier, if one was seen.
+    pub public_id: Option<StrTendril>,
+    /// The system identifier, if one was seen.
+    pub system_id: Option<StrTendril>,
+    /// Whether the force-quirks flag is set for this doctype.
+    pub force_quirks: bool,
+}
+
+impl Doctype {
+    /// Create an empty doctype: every optional field is `None` and
+    /// `force_quirks` is `false`.
+    pub fn new() -> Doctype {
+        Doctype::default()
+    }
+}
+
+/// Whether a tag token is a start tag or an end tag.
+#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
+pub enum TagKind {
+ StartTag,
+ EndTag,
+}
+
+/// A tag token.
+#[derive(PartialEq, Eq, Clone, Debug)]
+pub struct Tag {
+ /// Start tag or end tag.
+ pub kind: TagKind,
+ /// The tag name.
+ pub name: LocalName,
+ /// Whether the tag was written as self-closing.
+ pub self_closing: bool,
+ /// The tag's attributes.
+ pub attrs: Vec<Attribute>,
+}
+
+impl Tag {
+    /// Are the tags equivalent when we don't care about attribute order?
+    /// Also ignores the self-closing flag.
+    ///
+    /// Kind and name must match exactly. The attribute lists are compared
+    /// as sorted copies, so neither `self` nor `other` is modified.
+    pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
+        if (self.kind != other.kind) || (self.name != other.name) {
+            return false;
+        }
+
+        // Lists of different lengths can never compare equal after sorting,
+        // so skip the clone-and-sort work in that case.
+        if self.attrs.len() != other.attrs.len() {
+            return false;
+        }
+
+        let mut self_attrs = self.attrs.clone();
+        let mut other_attrs = other.attrs.clone();
+        self_attrs.sort();
+        other_attrs.sort();
+
+        self_attrs == other_attrs
+    }
+}
+
+/// A token produced by the tokenizer and handed to a `TokenSink`.
+#[derive(PartialEq, Eq, Debug)]
+pub enum Token {
+ /// A `DOCTYPE` token.
+ DoctypeToken(Doctype),
+ /// A start or end tag.
+ TagToken(Tag),
+ /// A comment, carrying its text.
+ CommentToken(StrTendril),
+ /// A run of one or more characters.
+ CharacterTokens(StrTendril),
+ /// A single U+0000 character.
+ NullCharacterToken,
+ /// End of input.
+ EOFToken,
+ /// A parse error, carrying its message.
+ ParseError(Cow<'static, str>),
+}
+
+/// What a `TokenSink` asks the tokenizer to do after processing a token.
+#[derive(Debug, PartialEq)]
+#[must_use]
+pub enum TokenSinkResult<Handle> {
+ /// Carry on tokenizing normally.
+ Continue,
+ /// Hand the given handle back to the caller of the tokenizer.
+ Script(Handle),
+ /// Switch the tokenizer to the plaintext state.
+ Plaintext,
+ /// Switch the tokenizer to the raw data state of the given kind.
+ RawData(states::RawKind),
+}
+
+/// Types which can receive tokens from the tokenizer.
+pub trait TokenSink {
+ /// Handle type carried by `TokenSinkResult::Script`.
+ type Handle;
+
+ /// Process a token. `line_number` is the tokenizer's current line.
+ fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle>;
+
+ /// Signal sink that tokenization reached the end. Does nothing by default.
+ fn end(&mut self) {}
+
+ /// Used in the markup declaration open state. By default, this always
+ /// returns false and thus all CDATA sections are tokenized as bogus
+ /// comments.
+ /// <https://html.spec.whatwg.org/multipage/#markup-declaration-open-state>
+ fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
+ false
+ }
+}
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
new file mode 100644
index 0000000..267fdf3
--- /dev/null
+++ b/src/tokenizer/mod.rs
@@ -0,0 +1,1713 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! The HTML5 tokenizer.
+
+pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
+pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
+pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
+pub use self::interface::{TokenSink, TokenSinkResult};
+
+use self::states::{DoctypeIdKind, Public, System};
+use self::states::{DoubleEscaped, Escaped};
+use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
+use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
+
+use self::char_ref::{CharRef, CharRefTokenizer};
+
+use crate::util::str::lower_ascii_letter;
+
+use log::debug;
+use mac::{_tt_as_expr_hack, format_if, matches};
+use markup5ever::{namespace_url, ns, small_char_set};
+use std::borrow::Cow::{self, Borrowed};
+use std::collections::BTreeMap;
+use std::default::Default;
+use std::mem::replace;
+
+pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
+use crate::tendril::StrTendril;
+use crate::{Attribute, LocalName, QualName, SmallCharSet};
+
+mod char_ref;
+mod interface;
+pub mod states;
+
+/// Outcome of a single step of the tokenizer state machine.
+pub enum ProcessResult<Handle> {
+ /// Step again immediately.
+ Continue,
+ /// Stop stepping for now.
+ Suspend,
+ /// Stop and hand this handle back to the caller.
+ Script(Handle),
+}
+
+/// Result of feeding input to the tokenizer.
+#[must_use]
+pub enum TokenizerResult<Handle> {
+ /// The state machine ran until it suspended.
+ Done,
+ /// The token sink returned a script handle; tokenizing stopped early.
+ Script(Handle),
+}
+
+/// Append `c` to the string inside `opt_str`, creating a one-character
+/// string first when the option is still `None`.
+fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
+    if let Some(existing) = opt_str.as_mut() {
+        existing.push_char(c);
+    } else {
+        *opt_str = Some(StrTendril::from_char(c));
+    }
+}
+
+/// Tokenizer options, with an impl for `Default`.
+#[derive(Clone)]
+pub struct TokenizerOpts {
+ /// Report all parse errors described in the spec, at some
+ /// performance penalty? Default: false
+ pub exact_errors: bool,
+
+ /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
+ /// of the stream? Default: true
+ pub discard_bom: bool,
+
+ /// Keep a record of how long we spent in each state? Printed
+ /// when `end()` is called. Default: false
+ pub profile: bool,
+
+ /// Initial state override. Only the test runner should use
+ /// a non-`None` value! When `None`, the tokenizer starts in the
+ /// data state. Default: `None`
+ pub initial_state: Option<states::State>,
+
+ /// Last start tag. Only the test runner should use a
+ /// non-`None` value! Default: `None`
+ ///
+ /// FIXME: Can't use Tendril because we want TokenizerOpts
+ /// to be Send.
+ pub last_start_tag_name: Option<String>,
+}
+
+impl Default for TokenizerOpts {
+    /// Defaults: BOM discarding on, every other switch off, and no
+    /// test-runner overrides.
+    fn default() -> Self {
+        Self {
+            exact_errors: false,
+            discard_bom: true,
+            profile: false,
+            initial_state: None,
+            last_start_tag_name: None,
+        }
+    }
+}
+
+/// The HTML tokenizer.
+pub struct Tokenizer<Sink> {
+ /// Options controlling the behavior of the tokenizer.
+ opts: TokenizerOpts,
+
+ /// Destination for tokens we emit.
+ pub sink: Sink,
+
+ /// The abstract machine state as described in the spec.
+ state: states::State,
+
+ /// Are we at the end of the file, once buffers have been processed
+ /// completely? This affects whether we will wait for lookahead or not.
+ at_eof: bool,
+
+ /// Tokenizer for character references, if we're tokenizing
+ /// one at the moment.
+ char_ref_tokenizer: Option<Box<CharRefTokenizer>>,
+
+ /// Current input character. Just consumed, may reconsume.
+ current_char: char,
+
+ /// Should we reconsume the current input character?
+ reconsume: bool,
+
+ /// Did we just consume \r, translating it to \n? In that case we need
+ /// to ignore the next character if it's \n.
+ ignore_lf: bool,
+
+ /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
+ /// beginning of the stream.
+ discard_bom: bool,
+
+ /// Current tag kind.
+ current_tag_kind: TagKind,
+
+ /// Current tag name.
+ current_tag_name: StrTendril,
+
+ /// Current tag is self-closing?
+ current_tag_self_closing: bool,
+
+ /// Current tag attributes.
+ current_tag_attrs: Vec<Attribute>,
+
+ /// Current attribute name.
+ current_attr_name: StrTendril,
+
+ /// Current attribute value.
+ current_attr_value: StrTendril,
+
+ /// Current comment.
+ current_comment: StrTendril,
+
+ /// Current doctype token.
+ current_doctype: Doctype,
+
+ /// Last start tag name, for use in checking "appropriate end tag".
+ last_start_tag_name: Option<LocalName>,
+
+ /// The "temporary buffer" mentioned in the spec.
+ temp_buf: StrTendril,
+
+ /// Record of how many ns we spent in each state, if profiling is enabled.
+ state_profile: BTreeMap<states::State, u64>,
+
+ /// Record of how many ns we spent in the token sink.
+ time_in_sink: u64,
+
+ /// Current line number: starts at 1 and is incremented for every `\n`
+ /// seen after input preprocessing. Passed to the sink with each token.
+ current_line: u64,
+}
+
+impl<Sink: TokenSink> Tokenizer<Sink> {
+ /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
+ ///
+ /// `opts.last_start_tag_name` is moved out of `opts` and converted to a
+ /// `LocalName`. The initial state is `opts.initial_state`, falling back
+ /// to the data state when it is `None`.
+ pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
+     let last_start_tag_name = opts
+         .last_start_tag_name
+         .take()
+         .map(|name| LocalName::from(name.as_str()));
+     let discard_bom = opts.discard_bom;
+     let state = opts.initial_state.unwrap_or(states::Data);
+
+     Tokenizer {
+         opts,
+         sink,
+         state,
+         char_ref_tokenizer: None,
+         at_eof: false,
+         current_char: '\0',
+         reconsume: false,
+         ignore_lf: false,
+         discard_bom,
+         current_tag_kind: StartTag,
+         current_tag_name: StrTendril::new(),
+         current_tag_self_closing: false,
+         current_tag_attrs: Vec::new(),
+         current_attr_name: StrTendril::new(),
+         current_attr_value: StrTendril::new(),
+         current_comment: StrTendril::new(),
+         current_doctype: Doctype::new(),
+         last_start_tag_name,
+         temp_buf: StrTendril::new(),
+         state_profile: BTreeMap::new(),
+         time_in_sink: 0,
+         current_line: 1,
+     }
+ }
+
+ /// Feed an input string into the tokenizer.
+ ///
+ /// When BOM discarding is enabled, a `U+FEFF` is dropped only if it is the
+ /// very first character fed. Later calls leave `U+FEFF` in the input.
+ ///
+ /// Returns `Done` immediately when `input` is empty; otherwise returns
+ /// whatever running the state machine yields.
+ pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
+     if input.is_empty() {
+         return TokenizerResult::Done;
+     }
+
+     if self.discard_bom {
+         match input.peek() {
+             Some(c) => {
+                 if c == '\u{feff}' {
+                     input.next();
+                 }
+                 // The first character of the stream has now been inspected.
+                 // Clear the flag so a U+FEFF at the start of a later chunk
+                 // is kept rather than silently dropped.
+                 self.discard_bom = false;
+             },
+             None => return TokenizerResult::Done,
+         }
+     }
+
+     self.run(input)
+ }
+
+ /// Switch the tokenizer to the plaintext state.
+ pub fn set_plaintext_state(&mut self) {
+ self.state = states::Plaintext;
+ }
+
+ /// Pass `token` and the current line number to the sink and return the
+ /// sink's answer. With profiling on, the time spent in the sink is added
+ /// to `time_in_sink`.
+ fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
+     if !self.opts.profile {
+         return self.sink.process_token(token, self.current_line);
+     }
+
+     let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
+     self.time_in_sink += dt;
+     ret
+ }
+
+ /// Pass `token` to the sink for tokens after which the sink must not
+ /// redirect the tokenizer.
+ ///
+ /// Panics if the sink returns anything other than
+ /// `TokenSinkResult::Continue`.
+ fn process_token_and_continue(&mut self, token: Token) {
+ assert!(matches!(
+ self.process_token(token),
+ TokenSinkResult::Continue
+ ));
+ }
+
+ //§ preprocessing-the-input-stream
+ // Get the next input character, which might be the character
+ // 'c' that we already consumed from the buffers.
+ //
+ // Applies newline normalization (\r and \r\n both become \n), counts
+ // lines, optionally reports bad characters, and records the result as
+ // the current input character. Returns None if a \n had to be skipped
+ // and no further input is available.
+ fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
+ if self.ignore_lf {
+ self.ignore_lf = false;
+ // This \n is the second half of a \r\n pair; skip it.
+ if c == '\n' {
+ c = unwrap_or_return!(input.next(), None);
+ }
+ }
+
+ if c == '\r' {
+ self.ignore_lf = true;
+ c = '\n';
+ }
+
+ if c == '\n' {
+ self.current_line += 1;
+ }
+
+ // Bad-character reporting is only done when exact errors are requested.
+ if self.opts.exact_errors &&
+ match c as u32 {
+ 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
+ n if (n & 0xFFFE) == 0xFFFE => true,
+ _ => false,
+ }
+ {
+ let msg = format!("Bad character {}", c);
+ self.emit_error(Cow::Owned(msg));
+ }
+
+ debug!("got character {}", c);
+ self.current_char = c;
+ Some(c)
+ }
+
+ //§ tokenization
+ // Get the next input character, if one is available. A pending
+ // reconsume hands back the current character instead of reading input.
+ fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
+     if self.reconsume {
+         self.reconsume = false;
+         return Some(self.current_char);
+     }
+
+     let raw = input.next()?;
+     self.get_preprocessed_char(raw, input)
+ }
+
+ /// Pop either a single character from `set` or a run of characters not
+ /// in `set` from the front of `input`, preprocessing single characters.
+ fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
+     // Bail to the slow path for various corner cases.
+     // This means that `FromSet` can contain characters not in the set!
+     // It shouldn't matter because the fallback `FromSet` case should
+     // always do the same thing as the `NotFromSet` case.
+     let slow_path = self.opts.exact_errors || self.reconsume || self.ignore_lf;
+     if slow_path {
+         return self.get_char(input).map(FromSet);
+     }
+
+     let d = input.pop_except_from(set);
+     debug!("got characters {:?}", d);
+     if let Some(FromSet(c)) = d {
+         self.get_preprocessed_char(c, input).map(FromSet)
+     } else {
+         // NB: We don't set self.current_char for a run of characters not
+         // in the set. It shouldn't matter for the codepaths that use
+         // this.
+         d
+     }
+ }
+
+ // Check if the next characters match `pat` under the comparison `eq`
+ // (callers pass either an exact or an ASCII case-insensitive byte
+ // comparison). See BufferQueue::eat.
+ //
+ // Returns Some(matched) once the answer is known. Returns None when the
+ // input queue cannot decide yet; the pending input is then stashed in
+ // the temporary buffer and pushed back in front of the input on the
+ // next call. At EOF an undecided match is reported as Some(false).
+ //
+ // NB: this doesn't do input stream preprocessing or set the current input
+ // character.
+ fn eat(
+ &mut self,
+ input: &mut BufferQueue,
+ pat: &str,
+ eq: fn(&u8, &u8) -> bool,
+ ) -> Option<bool> {
+ // Restore anything stashed by a previous undecided call.
+ input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
+ match input.eat(pat, eq) {
+ None if self.at_eof => Some(false),
+ None => {
+ // Drain the remaining input into the temporary buffer.
+ while let Some(c) = input.next() {
+ self.temp_buf.push_char(c);
+ }
+ None
+ },
+ Some(matched) => Some(matched),
+ }
+ }
+
+ /// Run the state machine for as long as we can.
+ ///
+ /// Steps repeatedly until a step suspends, in which case `Done` is
+ /// returned, or until a step yields a script handle, which is returned
+ /// as `Script`. With profiling enabled, the time of each step, minus the
+ /// time spent in the sink, is added to the state the step started in.
+ fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
+     if self.opts.profile {
+         loop {
+             let state = self.state;
+             let old_sink = self.time_in_sink;
+             let (run, mut dt) = time!(self.step(input));
+             // Sink time is accounted for separately in `time_in_sink`.
+             dt -= self.time_in_sink - old_sink;
+             // Single lookup: start the state at zero if it is new, then add.
+             *self.state_profile.entry(state).or_insert(0) += dt;
+             match run {
+                 ProcessResult::Continue => (),
+                 ProcessResult::Suspend => break,
+                 ProcessResult::Script(node) => return TokenizerResult::Script(node),
+             }
+         }
+     } else {
+         loop {
+             match self.step(input) {
+                 ProcessResult::Continue => (),
+                 ProcessResult::Suspend => break,
+                 ProcessResult::Script(node) => return TokenizerResult::Script(node),
+             }
+         }
+     }
+     TokenizerResult::Done
+ }
+
+ /// Emit a parse error about the current input character. The character
+ /// and state are only included in the message when `exact_errors` is set.
+ fn bad_char_error(&mut self) {
+     self.emit_error(format_if!(
+         self.opts.exact_errors,
+         "Bad character",
+         "Saw {} in state {:?}",
+         self.current_char,
+         self.state
+     ));
+ }
+
+ /// Emit a parse error about an unexpected end of input. The state is
+ /// only included in the message when `exact_errors` is set.
+ fn bad_eof_error(&mut self) {
+     self.emit_error(format_if!(
+         self.opts.exact_errors,
+         "Unexpected EOF",
+         "Saw EOF in state {:?}",
+         self.state
+     ));
+ }
+
+ /// Emit a single character to the sink. U+0000 gets its own token kind;
+ /// anything else is sent as a one-character `CharacterTokens`.
+ fn emit_char(&mut self, c: char) {
+     let token = if c == '\0' {
+         NullCharacterToken
+     } else {
+         CharacterTokens(StrTendril::from_char(c))
+     };
+     self.process_token_and_continue(token);
+ }
+
+ // Emit a run of characters to the sink as a single token.
+ // The string must not contain '\0'!
+ fn emit_chars(&mut self, b: StrTendril) {
+ self.process_token_and_continue(CharacterTokens(b));
+ }
+
+ /// Finish the tag being built and send it to the sink.
+ ///
+ /// The pending attribute is finalized first. A start tag's name is
+ /// remembered as the last start tag; an end tag with attributes or a
+ /// self-closing flag produces parse errors. The sink's answer may then
+ /// override the tokenizer state.
+ fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
+ self.finish_attribute();
+
+ let name = LocalName::from(&*self.current_tag_name);
+ self.current_tag_name.clear();
+
+ match self.current_tag_kind {
+ StartTag => {
+ self.last_start_tag_name = Some(name.clone());
+ },
+ EndTag => {
+ if !self.current_tag_attrs.is_empty() {
+ self.emit_error(Borrowed("Attributes on an end tag"));
+ }
+ if self.current_tag_self_closing {
+ self.emit_error(Borrowed("Self-closing end tag"));
+ }
+ },
+ }
+
+ // Moves the attributes into the token and leaves an empty list behind.
+ let token = TagToken(Tag {
+ kind: self.current_tag_kind,
+ name,
+ self_closing: self.current_tag_self_closing,
+ attrs: replace(&mut self.current_tag_attrs, vec![]),
+ });
+
+ match self.process_token(token) {
+ TokenSinkResult::Continue => ProcessResult::Continue,
+ TokenSinkResult::Plaintext => {
+ self.state = states::Plaintext;
+ ProcessResult::Continue
+ },
+ TokenSinkResult::Script(node) => {
+ self.state = states::Data;
+ ProcessResult::Script(node)
+ },
+ TokenSinkResult::RawData(kind) => {
+ self.state = states::RawData(kind);
+ ProcessResult::Continue
+ },
+ }
+ }
+
+ /// Emit the contents of the temporary buffer as character tokens,
+ /// leaving the buffer empty.
+ fn emit_temp_buf(&mut self) {
+ // FIXME: Make sure that clearing on emit is spec-compatible.
+ let buf = replace(&mut self.temp_buf, StrTendril::new());
+ self.emit_chars(buf);
+ }
+
+ /// Empty the temporary buffer.
+ fn clear_temp_buf(&mut self) {
+ // Do this without a new allocation.
+ self.temp_buf.clear();
+ }
+
+ /// Send the comment being built to the sink, leaving the current comment
+ /// empty.
+ fn emit_current_comment(&mut self) {
+ let comment = replace(&mut self.current_comment, StrTendril::new());
+ self.process_token_and_continue(CommentToken(comment));
+ }
+
+ /// Reset the name, self-closing flag and attributes of the current tag.
+ /// The tag kind is left unchanged.
+ fn discard_tag(&mut self) {
+ self.current_tag_name.clear();
+ self.current_tag_self_closing = false;
+ self.current_tag_attrs = vec![];
+ }
+
+ /// Start a new tag of the given kind whose name begins with `c`,
+ /// discarding whatever tag was being built.
+ fn create_tag(&mut self, kind: TagKind, c: char) {
+ self.discard_tag();
+ self.current_tag_name.push_char(c);
+ self.current_tag_kind = kind;
+ }
+
+ /// Is the current tag an end tag whose name equals the last start tag's
+ /// name? Always false when no start tag name has been recorded.
+ fn have_appropriate_end_tag(&self) -> bool {
+     self.last_start_tag_name.as_ref().map_or(false, |last| {
+         (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last)
+     })
+ }
+
+ /// Finalize the pending attribute, if any, then start a new attribute
+ /// whose name begins with `c`.
+ fn create_attribute(&mut self, c: char) {
+ self.finish_attribute();
+
+ self.current_attr_name.push_char(c);
+ }
+
+ /// Finalize the attribute being built and add it to the current tag.
+ ///
+ /// Does nothing when no attribute name is pending. If the tag already
+ /// has an attribute of that name, a parse error is emitted and the new
+ /// attribute is dropped. Name and value buffers end up empty either way.
+ fn finish_attribute(&mut self) {
+ if self.current_attr_name.is_empty() {
+ return;
+ }
+
+ // Check for a duplicate attribute.
+ // FIXME: the spec says we should error as soon as the name is finished.
+ // FIXME: linear time search, do we care?
+ let dup = {
+ let name = &*self.current_attr_name;
+ self.current_tag_attrs
+ .iter()
+ .any(|a| &*a.name.local == name)
+ };
+
+ if dup {
+ self.emit_error(Borrowed("Duplicate attribute"));
+ self.current_attr_name.clear();
+ self.current_attr_value.clear();
+ } else {
+ let name = LocalName::from(&*self.current_attr_name);
+ self.current_attr_name.clear();
+ self.current_tag_attrs.push(Attribute {
+ // The tree builder will adjust the namespace if necessary.
+ // This only happens in foreign elements.
+ name: QualName::new(None, ns!(), name),
+ value: replace(&mut self.current_attr_value, StrTendril::new()),
+ });
+ }
+ }
+
+ /// Send the doctype being built to the sink and replace it with a fresh,
+ /// empty one.
+ fn emit_current_doctype(&mut self) {
+ let doctype = replace(&mut self.current_doctype, Doctype::new());
+ self.process_token_and_continue(DoctypeToken(doctype));
+ }
+
+ /// Borrow the public or system identifier of the current doctype,
+ /// selected by `kind`.
+ fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<StrTendril> {
+ match kind {
+ Public => &mut self.current_doctype.public_id,
+ System => &mut self.current_doctype.system_id,
+ }
+ }
+
+ /// Make the selected doctype identifier present but empty: an existing
+ /// string is cleared in place, a missing one becomes an empty string.
+ fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
+     let id = self.doctype_id(kind);
+     if let Some(existing) = id.as_mut() {
+         existing.clear();
+     } else {
+         *id = Some(StrTendril::new());
+     }
+ }
+
+ /// Begin tokenizing a character reference by installing a fresh
+ /// `CharRefTokenizer`; `step` delegates to it while it is present.
+ fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
+ // NB: The char ref tokenizer assumes we have an additional allowed
+ // character iff we're tokenizing in an attribute value.
+ self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
+ }
+
+ /// Send the end-of-file token to the sink.
+ fn emit_eof(&mut self) {
+ self.process_token_and_continue(EOFToken);
+ }
+
+ /// Look at the next character without consuming it. With a reconsume
+ /// pending, that is the current character; otherwise it is the raw next
+ /// character of `input`, with no preprocessing applied.
+ fn peek(&mut self, input: &BufferQueue) -> Option<char> {
+ if self.reconsume {
+ Some(self.current_char)
+ } else {
+ input.peek()
+ }
+ }
+
+ /// Consume the next character and throw it away.
+ fn discard_char(&mut self, input: &mut BufferQueue) {
+ self.get_char(input);
+ }
+
+ /// Report a parse error to the sink as a `ParseError` token.
+ fn emit_error(&mut self, error: Cow<'static, str>) {
+ self.process_token_and_continue(ParseError(error));
+ }
+}
+//§ END
+
+// Shorthand for common state machine behaviors.
+//
+// Each arm maps one command of the form `$me: <command> <args>` onto a
+// single method call or field update on the tokenizer `$me`.
+macro_rules! shorthand (
+ ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); );
+ ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); );
+ ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); );
+ ( $me:ident : discard_tag ) => ( $me.discard_tag(); );
+ ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input); );
+ ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); );
+ ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); );
+ ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); );
+ ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); );
+ ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); );
+ ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); );
+ ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); );
+ ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); );
+ ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); );
+ ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); );
+ ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); );
+ ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); );
+ ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); );
+ ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); );
+ ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k); );
+ ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true; );
+ ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype(); );
+ ( $me:ident : error ) => ( $me.bad_char_error(); );
+ ( $me:ident : error_eof ) => ( $me.bad_eof_error(); );
+);
+
+// Tracing of tokenizer actions. This adds significant bloat and compile time,
+// so it's behind a cfg flag.
+//
+// Logs the command text at debug level, then runs it through `shorthand!`.
+#[cfg(trace_tokenizer)]
+macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
+    // `stringify!` yields a `&str`, so plain `{}` formatting is what we want.
+    debug!(" {}", stringify!($($cmds)*));
+    // Forward the bound identifier itself; a fragment specifier is only
+    // valid in the matcher, not in the expansion.
+    shorthand!($me: $($cmds)*);
+}));
+
+// Without tracing, a command is simply forwarded to `shorthand!`.
+#[cfg(not(trace_tokenizer))]
+macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
+
+// A little DSL for sequencing shorthand actions.
+//
+// Commands are separated by `;`. Every command except the last is a
+// shorthand action; the last may instead be one of the terminal commands
+// below, each of which returns from the enclosing function.
+macro_rules! go (
+ // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
+ // We have to tell the parser how much lookahead we need.
+
+ ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
+ ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
+ ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
+ ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });
+
+ // These can only come at the end.
+
+ // Switch state (with up to two nested state arguments) and continue.
+ ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; });
+ ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; });
+ ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });
+
+ // Same as `to`, but the current character is processed again.
+ ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
+ ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
+ ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });
+
+ ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
+ ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });
+
+ // We have a default next state after emitting a tag, but the sink can override.
+ ( $me:ident : emit_tag $s:ident ) => ({
+ $me.state = states::$s;
+ return $me.emit_current_tag();
+ });
+
+ ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });
+
+ // If nothing else matched, it's a single command
+ ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );
+
+ // or nothing.
+ ( $me:ident : ) => (());
+);
+
+// Run the `go!` commands only if `$x` matches one of the given patterns;
+// otherwise do nothing.
+macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
+ match $x {
+ $($pats)|+ => go!($me: $($cmds)*),
+ _ => (),
+ }
+));
+
+// Get the next character, or return `ProcessResult::Suspend` from the
+// enclosing function if none is available.
+// This is a macro because it can cause early return
+// from the function where it is used.
+macro_rules! get_char ( ($me:expr, $input:expr) => (
+ unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
+));
+
+// Peek at the next character, or return `ProcessResult::Suspend` from the
+// enclosing function if none is available.
+macro_rules! peek ( ($me:expr, $input:expr) => (
+ unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
+));
+
+// Call `pop_except_from` with the given set, or return
+// `ProcessResult::Suspend` from the enclosing function if it yields nothing.
+macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
+ unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
+));
+
+// ASCII case-insensitive `eat`; returns `ProcessResult::Suspend` from the
+// enclosing function while the match cannot be decided yet.
+macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
+ unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
+));
+
+// Exact (case-sensitive) `eat`; returns `ProcessResult::Suspend` from the
+// enclosing function while the match cannot be decided yet.
+macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
+ unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
+));
+
+impl<Sink: TokenSink> Tokenizer<Sink> {
+ // Run the state machine for a while.
+ // Return `ProcessResult::Continue` if we should be immediately re-invoked
+ // (this just simplifies control flow vs. break / continue).
+ #[allow(clippy::never_loop)]
+ fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
+ if self.char_ref_tokenizer.is_some() {
+ return self.step_char_ref_tokenizer(input);
+ }
+
+ debug!("processing in state {:?}", self.state);
+ match self.state {
+ //§ data-state
+ states::Data => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\0'),
+ FromSet('&') => go!(self: consume_char_ref),
+ FromSet('<') => go!(self: to TagOpen),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ rcdata-state
+ states::RawData(Rcdata) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('&') => go!(self: consume_char_ref),
+ FromSet('<') => go!(self: to RawLessThanSign Rcdata),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ rawtext-state
+ states::RawData(Rawtext) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('<') => go!(self: to RawLessThanSign Rawtext),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ script-data-state
+ states::RawData(ScriptData) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('<') => go!(self: to RawLessThanSign ScriptData),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ script-data-escaped-state
+ states::RawData(ScriptDataEscaped(Escaped)) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
+ FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ script-data-double-escaped-state
+ states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
+ FromSet('<') => {
+ go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
+ },
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ plaintext-state
+ states::Plaintext => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
+ FromSet('\0') => go!(self: error; emit '\u{fffd}'),
+ FromSet(c) => go!(self: emit c),
+ NotFromSet(b) => self.emit_chars(b),
+ }
+ },
+
+ //§ tag-open-state
+ states::TagOpen => loop {
+ match get_char!(self, input) {
+ '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
+ '/' => go!(self: to EndTagOpen),
+ '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_tag StartTag cl; to TagName),
+ None => go!(self: error; emit '<'; reconsume Data),
+ },
+ }
+ },
+
+ //§ end-tag-open-state
+ states::EndTagOpen => loop {
+ match get_char!(self, input) {
+ '>' => go!(self: error; to Data),
+ '\0' => {
+ go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
+ },
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_tag EndTag cl; to TagName),
+ None => go!(self: error; clear_comment; push_comment c; to BogusComment),
+ },
+ }
+ },
+
+ //§ tag-name-state
+ states::TagName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
+ '/' => go!(self: to SelfClosingStartTag),
+ '>' => go!(self: emit_tag Data),
+ '\0' => go!(self: error; push_tag '\u{fffd}'),
+ c => go!(self: push_tag (c.to_ascii_lowercase())),
+ }
+ },
+
+ //§ script-data-escaped-less-than-sign-state
+ states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
+ match get_char!(self, input) {
+ '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
+ to ScriptDataEscapeStart DoubleEscaped),
+ None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
+ },
+ }
+ },
+
+ //§ script-data-double-escaped-less-than-sign-state
+ states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
+ match get_char!(self, input) {
+ '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
+ _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
+ }
+ },
+
+ //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
+ // otherwise
+ states::RawLessThanSign(kind) => loop {
+ match get_char!(self, input) {
+ '/' => go!(self: clear_temp; to RawEndTagOpen kind),
+ '!' if kind == ScriptData => {
+ go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
+ },
+ _ => go!(self: emit '<'; reconsume RawData kind),
+ }
+ },
+
+ //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
+ states::RawEndTagOpen(kind) => loop {
+ let c = get_char!(self, input);
+ match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
+ None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
+ }
+ },
+
+ //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
+ states::RawEndTagName(kind) => loop {
+ let c = get_char!(self, input);
+ if self.have_appropriate_end_tag() {
+ match c {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
+ '/' => go!(self: to SelfClosingStartTag),
+ '>' => go!(self: emit_tag Data),
+ _ => (),
+ }
+ }
+
+ match lower_ascii_letter(c) {
+ Some(cl) => go!(self: push_tag cl; push_temp c),
+ None => {
+ go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
+ },
+ }
+ },
+
+ //§ script-data-double-escape-start-state
+ states::ScriptDataEscapeStart(DoubleEscaped) => loop {
+ let c = get_char!(self, input);
+ match c {
+ '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
+ let esc = if &*self.temp_buf == "script" {
+ DoubleEscaped
+ } else {
+ Escaped
+ };
+ go!(self: emit c; to RawData ScriptDataEscaped esc);
+ },
+ _ => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: push_temp cl; emit c),
+ None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
+ },
+ }
+ },
+
+ //§ script-data-escape-start-state
+ states::ScriptDataEscapeStart(Escaped) => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
+ _ => go!(self: reconsume RawData ScriptData),
+ }
+ },
+
+ //§ script-data-escape-start-dash-state
+ states::ScriptDataEscapeStartDash => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
+ _ => go!(self: reconsume RawData ScriptData),
+ }
+ },
+
+ //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
+ states::ScriptDataEscapedDash(kind) => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
+ '<' => {
+ if kind == DoubleEscaped {
+ go!(self: emit '<');
+ }
+ go!(self: to RawLessThanSign ScriptDataEscaped kind);
+ },
+ '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
+ c => go!(self: emit c; to RawData ScriptDataEscaped kind),
+ }
+ },
+
+ //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
+ states::ScriptDataEscapedDashDash(kind) => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: emit '-'),
+ '<' => {
+ if kind == DoubleEscaped {
+ go!(self: emit '<');
+ }
+ go!(self: to RawLessThanSign ScriptDataEscaped kind);
+ },
+ '>' => go!(self: emit '>'; to RawData ScriptData),
+ '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
+ c => go!(self: emit c; to RawData ScriptDataEscaped kind),
+ }
+ },
+
+ //§ script-data-double-escape-end-state
+ states::ScriptDataDoubleEscapeEnd => loop {
+ let c = get_char!(self, input);
+ match c {
+ '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
+ let esc = if &*self.temp_buf == "script" {
+ Escaped
+ } else {
+ DoubleEscaped
+ };
+ go!(self: emit c; to RawData ScriptDataEscaped esc);
+ },
+ _ => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: push_temp cl; emit c),
+ None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
+ },
+ }
+ },
+
+ //§ before-attribute-name-state
+ states::BeforeAttributeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '/' => go!(self: to SelfClosingStartTag),
+ '>' => go!(self: emit_tag Data),
+ '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_attr cl; to AttributeName),
+ None => {
+ go_match!(self: c,
+ '"' , '\'' , '<' , '=' => error);
+ go!(self: create_attr c; to AttributeName);
+ },
+ },
+ }
+ },
+
+ //§ attribute-name-state
+ states::AttributeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
+ '/' => go!(self: to SelfClosingStartTag),
+ '=' => go!(self: to BeforeAttributeValue),
+ '>' => go!(self: emit_tag Data),
+ '\0' => go!(self: error; push_name '\u{fffd}'),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: push_name cl),
+ None => {
+ go_match!(self: c,
+ '"' , '\'' , '<' => error);
+ go!(self: push_name c);
+ },
+ },
+ }
+ },
+
+ //§ after-attribute-name-state
+ states::AfterAttributeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '/' => go!(self: to SelfClosingStartTag),
+ '=' => go!(self: to BeforeAttributeValue),
+ '>' => go!(self: emit_tag Data),
+ '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
+ c => match lower_ascii_letter(c) {
+ Some(cl) => go!(self: create_attr cl; to AttributeName),
+ None => {
+ go_match!(self: c,
+ '"' , '\'' , '<' => error);
+ go!(self: create_attr c; to AttributeName);
+ },
+ },
+ }
+ },
+
+ //§ before-attribute-value-state
+ // Use peek so we can handle the first attr character along with the rest,
+ // hopefully in the same zero-copy buffer.
+ states::BeforeAttributeValue => loop {
+ match peek!(self, input) {
+ '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
+ '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
+ '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
+ '\0' => {
+ go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
+ },
+ '>' => go!(self: discard_char input; error; emit_tag Data),
+ _ => go!(self: to AttributeValue Unquoted),
+ }
+ },
+
+ //§ attribute-value-(double-quoted)-state
+ states::AttributeValue(DoubleQuoted) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
+ FromSet('"') => go!(self: to AfterAttributeValueQuoted),
+ FromSet('&') => go!(self: consume_char_ref '"'),
+ FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
+ FromSet(c) => go!(self: push_value c),
+ NotFromSet(ref b) => go!(self: append_value b),
+ }
+ },
+
+ //§ attribute-value-(single-quoted)-state
+ states::AttributeValue(SingleQuoted) => loop {
+ match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
+ FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
+ FromSet('&') => go!(self: consume_char_ref '\''),
+ FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
+ FromSet(c) => go!(self: push_value c),
+ NotFromSet(ref b) => go!(self: append_value b),
+ }
+ },
+
+ //§ attribute-value-(unquoted)-state
+ states::AttributeValue(Unquoted) => loop {
+ match pop_except_from!(
+ self,
+ input,
+ small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
+ ) {
+ FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
+ go!(self: to BeforeAttributeName)
+ },
+ FromSet('&') => go!(self: consume_char_ref '>'),
+ FromSet('>') => go!(self: emit_tag Data),
+ FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
+ FromSet(c) => {
+ go_match!(self: c,
+ '"' , '\'' , '<' , '=' , '`' => error);
+ go!(self: push_value c);
+ },
+ NotFromSet(ref b) => go!(self: append_value b),
+ }
+ },
+
+ //§ after-attribute-value-(quoted)-state
+ states::AfterAttributeValueQuoted => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
+ '/' => go!(self: to SelfClosingStartTag),
+ '>' => go!(self: emit_tag Data),
+ _ => go!(self: error; reconsume BeforeAttributeName),
+ }
+ },
+
+ //§ self-closing-start-tag-state
+ states::SelfClosingStartTag => loop {
+ match get_char!(self, input) {
+ '>' => {
+ self.current_tag_self_closing = true;
+ go!(self: emit_tag Data);
+ },
+ _ => go!(self: error; reconsume BeforeAttributeName),
+ }
+ },
+
+ //§ comment-start-state
+ states::CommentStart => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: to CommentStartDash),
+ '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
+ '>' => go!(self: error; emit_comment; to Data),
+ c => go!(self: push_comment c; to Comment),
+ }
+ },
+
+ //§ comment-start-dash-state
+ states::CommentStartDash => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: to CommentEnd),
+ '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
+ '>' => go!(self: error; emit_comment; to Data),
+ c => go!(self: push_comment '-'; push_comment c; to Comment),
+ }
+ },
+
+ //§ comment-state
+ states::Comment => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: to CommentEndDash),
+ '\0' => go!(self: error; push_comment '\u{fffd}'),
+ c => go!(self: push_comment c),
+ }
+ },
+
+ //§ comment-end-dash-state
+ states::CommentEndDash => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: to CommentEnd),
+ '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
+ c => go!(self: push_comment '-'; push_comment c; to Comment),
+ }
+ },
+
+ //§ comment-end-state
+ states::CommentEnd => loop {
+ match get_char!(self, input) {
+ '>' => go!(self: emit_comment; to Data),
+ '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
+ '!' => go!(self: error; to CommentEndBang),
+ '-' => go!(self: error; push_comment '-'),
+ c => go!(self: error; append_comment "--"; push_comment c; to Comment),
+ }
+ },
+
+ //§ comment-end-bang-state
+ states::CommentEndBang => loop {
+ match get_char!(self, input) {
+ '-' => go!(self: append_comment "--!"; to CommentEndDash),
+ '>' => go!(self: emit_comment; to Data),
+ '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
+ c => go!(self: append_comment "--!"; push_comment c; to Comment),
+ }
+ },
+
+ //§ doctype-state
+ states::Doctype => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
+ _ => go!(self: error; reconsume BeforeDoctypeName),
+ }
+ },
+
+ //§ before-doctype-name-state
+ states::BeforeDoctypeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '\0' => {
+ go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
+ },
+ '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
+ c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
+ to DoctypeName),
+ }
+ },
+
+ //§ doctype-name-state
+ states::DoctypeName => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
+ '>' => go!(self: emit_doctype; to Data),
+ '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
+ c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
+ }
+ },
+
+ //§ after-doctype-name-state
+ states::AfterDoctypeName => loop {
+ if eat!(self, input, "public") {
+ go!(self: to AfterDoctypeKeyword Public);
+ } else if eat!(self, input, "system") {
+ go!(self: to AfterDoctypeKeyword System);
+ } else {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '>' => go!(self: emit_doctype; to Data),
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ }
+ },
+
+ //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
+ states::AfterDoctypeKeyword(kind) => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
+ '"' => {
+ go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
+ },
+ '\'' => {
+ go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
+ },
+ '>' => go!(self: error; force_quirks; emit_doctype; to Data),
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ },
+
+ //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
+ states::BeforeDoctypeIdentifier(kind) => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
+ '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
+ '>' => go!(self: error; force_quirks; emit_doctype; to Data),
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ },
+
+ //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
+ states::DoctypeIdentifierDoubleQuoted(kind) => loop {
+ match get_char!(self, input) {
+ '"' => go!(self: to AfterDoctypeIdentifier kind),
+ '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
+ '>' => go!(self: error; force_quirks; emit_doctype; to Data),
+ c => go!(self: push_doctype_id kind c),
+ }
+ },
+
+ //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
+ states::DoctypeIdentifierSingleQuoted(kind) => loop {
+ match get_char!(self, input) {
+ '\'' => go!(self: to AfterDoctypeIdentifier kind),
+ '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
+ '>' => go!(self: error; force_quirks; emit_doctype; to Data),
+ c => go!(self: push_doctype_id kind c),
+ }
+ },
+
+ //§ after-doctype-public-identifier-state
+ states::AfterDoctypeIdentifier(Public) => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => {
+ go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
+ },
+ '>' => go!(self: emit_doctype; to Data),
+ '"' => {
+ go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
+ },
+ '\'' => {
+ go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
+ },
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ },
+
+ //§ after-doctype-system-identifier-state
+ states::AfterDoctypeIdentifier(System) => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '>' => go!(self: emit_doctype; to Data),
+ _ => go!(self: error; to BogusDoctype),
+ }
+ },
+
+ //§ between-doctype-public-and-system-identifiers-state
+ states::BetweenDoctypePublicAndSystemIdentifiers => loop {
+ match get_char!(self, input) {
+ '\t' | '\n' | '\x0C' | ' ' => (),
+ '>' => go!(self: emit_doctype; to Data),
+ '"' => {
+ go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
+ },
+ '\'' => {
+ go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
+ },
+ _ => go!(self: error; force_quirks; to BogusDoctype),
+ }
+ },
+
+ //§ bogus-doctype-state
+ states::BogusDoctype => loop {
+ match get_char!(self, input) {
+ '>' => go!(self: emit_doctype; to Data),
+ _ => (),
+ }
+ },
+
+ //§ bogus-comment-state
+ states::BogusComment => loop {
+ match get_char!(self, input) {
+ '>' => go!(self: emit_comment; to Data),
+ '\0' => go!(self: push_comment '\u{fffd}'),
+ c => go!(self: push_comment c),
+ }
+ },
+
+ //§ markup-declaration-open-state
+ states::MarkupDeclarationOpen => loop {
+ if eat_exact!(self, input, "--") {
+ go!(self: clear_comment; to CommentStart);
+ } else if eat!(self, input, "doctype") {
+ go!(self: to Doctype);
+ } else {
+ if self
+ .sink
+ .adjusted_current_node_present_but_not_in_html_namespace()
+ {
+ if eat_exact!(self, input, "[CDATA[") {
+ go!(self: clear_temp; to CdataSection);
+ }
+ }
+ go!(self: error; to BogusComment);
+ }
+ },
+
+ //§ cdata-section-state
+ states::CdataSection => loop {
+ match get_char!(self, input) {
+ ']' => go!(self: to CdataSectionBracket),
+ '\0' => go!(self: emit_temp; emit '\0'),
+ c => go!(self: push_temp c),
+ }
+ },
+
+ //§ cdata-section-bracket
+ states::CdataSectionBracket => match get_char!(self, input) {
+ ']' => go!(self: to CdataSectionEnd),
+ _ => go!(self: push_temp ']'; reconsume CdataSection),
+ },
+
+ //§ cdata-section-end
+ states::CdataSectionEnd => loop {
+ match get_char!(self, input) {
+ ']' => go!(self: push_temp ']'),
+ '>' => go!(self: emit_temp; to Data),
+ _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
+ }
+ },
+ //§ END
+ }
+ }
+
+ fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> { // Runs one step of the pending character-reference sub-tokenizer.
+ // FIXME HACK: Take and replace the tokenizer so we don't
+ // double-mut-borrow self. This is why it's boxed.
+ let mut tok = self.char_ref_tokenizer.take().unwrap(); // panics if no sub-tokenizer is pending
+ let outcome = tok.step(self, input);
+ // Translate the sub-tokenizer's status into a ProcessResult for the caller.
+ let progress = match outcome {
+ char_ref::Done => { // finished: consume the result; `char_ref_tokenizer` stays None
+ self.process_char_ref(tok.get_result());
+ return ProcessResult::Continue;
+ },
+
+ char_ref::Stuck => ProcessResult::Suspend, // the caller is suspended while the sub-tokenizer is stuck
+ char_ref::Progress => ProcessResult::Continue,
+ };
+ // Not done yet: put the sub-tokenizer back so a later call can resume it.
+ self.char_ref_tokenizer = Some(tok);
+ progress
+ }
+
+ fn process_char_ref(&mut self, char_ref: CharRef) { // Dispatches the characters of a finished character reference based on the current state.
+ let CharRef {
+ mut chars,
+ mut num_chars,
+ } = char_ref;
+ // A result carrying zero characters is replaced by a single literal '&'.
+ if num_chars == 0 {
+ chars[0] = '&';
+ num_chars = 1;
+ }
+ // Only the first `num_chars` entries of `chars` are used.
+ for i in 0..num_chars {
+ let c = chars[i as usize];
+ match self.state {
+ states::Data | states::RawData(states::Rcdata) => go!(self: emit c), // text states: handed to `go!`'s `emit`
+
+ states::AttributeValue(_) => go!(self: push_value c), // attribute-value states: handed to `push_value`
+
+ _ => panic!( // every other state is treated as an internal error
+ "state {:?} should not be reachable in process_char_ref",
+ self.state
+ ),
+ }
+ }
+ }
+
+ /// Indicate that we have reached the end of the input.
+ pub fn end(&mut self) { // Panics (see the asserts below) if the final `run` is not `Done` or leaves input behind.
+ // Handle EOF in the char ref sub-tokenizer, if there is one.
+ // Do this first because it might un-consume stuff.
+ let mut input = BufferQueue::new();
+ match self.char_ref_tokenizer.take() {
+ None => (),
+ Some(mut tok) => {
+ tok.end_of_file(self, &mut input);
+ self.process_char_ref(tok.get_result());
+ },
+ }
+
+ // Process all remaining buffered input.
+ // If we're waiting for lookahead, we're not gonna get it.
+ self.at_eof = true;
+ assert!(matches!(self.run(&mut input), TokenizerResult::Done));
+ assert!(input.is_empty());
+ // Apply the per-state EOF handling repeatedly until `eof_step` returns Suspend.
+ loop {
+ match self.eof_step() {
+ ProcessResult::Continue => (),
+ ProcessResult::Suspend => break,
+ ProcessResult::Script(_) => unreachable!(), // a script result from eof_step is treated as impossible
+ }
+ }
+ // Notify the sink that tokenizing is finished, then optionally print the timing profile.
+ self.sink.end();
+
+ if self.opts.profile {
+ self.dump_profile();
+ }
+ }
+
+ fn dump_profile(&self) { // Prints the per-state timing profile to stdout, largest time first.
+ let mut results: Vec<(states::State, u64)> =
+ self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
+ results.sort_by(|&(_, x), &(_, y)| y.cmp(&x)); // comparing y to x sorts in descending order of time
+ // Sum of all per-state times; used as the denominator for the percentages below.
+ let total: u64 = results
+ .iter()
+ .map(|&(_, t)| t)
+ .fold(0, ::std::ops::Add::add);
+ println!("\nTokenizer profile, in nanoseconds");
+ println!("\n{:12} total in token sink", self.time_in_sink);
+ println!("\n{:12} total in tokenizer", total);
+ // One row per state: time, share of the tokenizer total, state name.
+ for (k, v) in results.into_iter() {
+ let pct = 100.0 * (v as f64) / (total as f64);
+ println!("{:12} {:4.1}% {:?}", v, pct, k);
+ }
+ }
+
+ fn eof_step(&mut self) -> ProcessResult<Sink::Handle> { // Handles end-of-file for the current state only; `end` calls this until it returns Suspend.
+ debug!("processing EOF in state {:?}", self.state);
+ match self.state {
+ states::Data | // plain text states: nothing is pending, so go straight to `eof`
+ states::RawData(Rcdata) |
+ states::RawData(Rawtext) |
+ states::RawData(ScriptData) |
+ states::Plaintext => go!(self: eof), // presumably emits the EOF token and stops the loop in `end` (`go!` is defined elsewhere)
+ // EOF inside a tag or inside escaped script data: report it and fall back to Data.
+ states::TagName |
+ states::RawData(ScriptDataEscaped(_)) |
+ states::BeforeAttributeName |
+ states::AttributeName |
+ states::AfterAttributeName |
+ states::BeforeAttributeValue |
+ states::AttributeValue(_) |
+ states::AfterAttributeValueQuoted |
+ states::SelfClosingStartTag |
+ states::ScriptDataEscapedDash(_) |
+ states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
+ // The already-consumed "<" / "</" is emitted as text.
+ states::TagOpen => go!(self: error_eof; emit '<'; to Data),
+
+ states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),
+ // This arm must stay above the generic RawLessThanSign(kind) arm, which would otherwise match first.
+ states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
+ go!(self: to RawData ScriptDataEscaped DoubleEscaped) // no '<' emitted here: the double-escaped data state emits it before entering this state
+ },
+
+ states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),
+
+ states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),
+
+ states::RawEndTagName(kind) => {
+ go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
+ },
+ // NOTE(review): for kind == Escaped the non-EOF path reconsumes in RawData ScriptData; here EOF lands in ScriptDataEscaped (which then reports error_eof) - confirm this is intended.
+ states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
+
+ states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
+
+ states::ScriptDataDoubleEscapeEnd => {
+ go!(self: to RawData ScriptDataEscaped DoubleEscaped)
+ },
+ // EOF inside a comment: report it and emit the comment collected so far.
+ states::CommentStart |
+ states::CommentStartDash |
+ states::Comment |
+ states::CommentEndDash |
+ states::CommentEnd |
+ states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
+ // EOF before any doctype name was read: a doctype is created here, flagged force-quirks and emitted.
+ states::Doctype | states::BeforeDoctypeName => {
+ go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
+ },
+ // EOF later in a doctype: no create_doctype here; flag force-quirks and emit.
+ states::DoctypeName |
+ states::AfterDoctypeName |
+ states::AfterDoctypeKeyword(_) |
+ states::BeforeDoctypeIdentifier(_) |
+ states::DoctypeIdentifierDoubleQuoted(_) |
+ states::DoctypeIdentifierSingleQuoted(_) |
+ states::AfterDoctypeIdentifier(_) |
+ states::BetweenDoctypePublicAndSystemIdentifiers => {
+ go!(self: error_eof; force_quirks; emit_doctype; to Data)
+ },
+ // Bogus doctype/comment: emitted without an additional error.
+ states::BogusDoctype => go!(self: emit_doctype; to Data),
+
+ states::BogusComment => go!(self: emit_comment; to Data),
+
+ states::MarkupDeclarationOpen => go!(self: error; to BogusComment),
+ // CDATA: emit the buffered text; the two bracket states first push the pending ']' back into the temp buffer.
+ states::CdataSection => go!(self: emit_temp; error_eof; to Data),
+
+ states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
+
+ states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
+ }
+ }
+}
+
+#[cfg(test)]
+#[allow(non_snake_case)]
+mod test {
+ use super::option_push; // private items
+ use crate::tendril::{SliceExt, StrTendril};
+
+ use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+
+ use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
+ use super::interface::{EndTag, StartTag, Tag, TagKind};
+ use super::interface::{TagToken, Token};
+
+ use markup5ever::buffer_queue::BufferQueue;
+ use std::mem::replace;
+
+ use crate::LocalName;
+
+ // LinesMatch implements the TokenSink trait. It is used for testing to see
+ // if current_line is being updated when process_token is called. The lines
+ // vector records each pushed token together with the line number it was reported on.
+ struct LinesMatch {
+ tokens: Vec<Token>, // CharacterTokens flushed out of current_str by finish_str
+ current_str: StrTendril, // character data accumulated since the last flush
+ lines: Vec<(Token, u64)>, // (token, line number) pairs, in the order they were pushed
+ }
+
+ impl LinesMatch {
+ fn new() -> LinesMatch { // starts with no tokens, no pending text and no recorded lines
+ LinesMatch {
+ tokens: vec![],
+ current_str: StrTendril::new(),
+ lines: vec![],
+ }
+ }
+ // Flush any pending text, then record the token with its line number.
+ fn push(&mut self, token: Token, line_number: u64) {
+ self.finish_str();
+ self.lines.push((token, line_number));
+ }
+ // Move pending text (if any) out of current_str into tokens as a single CharacterTokens.
+ fn finish_str(&mut self) {
+ if self.current_str.len() > 0 {
+ let s = replace(&mut self.current_str, StrTendril::new()); // leaves current_str empty
+ self.tokens.push(CharacterTokens(s));
+ }
+ }
+ }
+
+ impl TokenSink for LinesMatch {
+ type Handle = ();
+ // Accumulates character data, records every other token with its line number, and always returns Continue.
+ fn process_token(
+ &mut self,
+ token: Token,
+ line_number: u64,
+ ) -> TokenSinkResult<Self::Handle> {
+ match token {
+ CharacterTokens(b) => { // text is buffered, not recorded in `lines`
+ self.current_str.push_slice(&b);
+ },
+
+ NullCharacterToken => { // buffered as a literal NUL character
+ self.current_str.push_char('\0');
+ },
+
+ ParseError(_) => { // any parse error fails the test
+ panic!("unexpected parse error");
+ },
+
+ TagToken(mut t) => {
+ // The spec seems to indicate that one can emit
+ // erroneous end tags with attrs, but the test
+ // cases don't contain them.
+ match t.kind {
+ EndTag => { // end tags are normalized: no self-closing flag, no attributes
+ t.self_closing = false;
+ t.attrs = vec![];
+ },
+ _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)), // other tags: attributes sorted by name
+ }
+ self.push(TagToken(t), line_number);
+ },
+
+ EOFToken => (), // EOF is ignored
+
+ _ => self.push(token, line_number), // every remaining token kind is recorded unchanged
+ }
+ TokenSinkResult::Continue
+ }
+ }
+
+ // Feed the input chunks through a Tokenizer whose sink is a LinesMatch and
+ // return the (token, line number) pairs that the sink recorded
+ fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
+ let sink = LinesMatch::new();
+ let mut tok = Tokenizer::new(sink, opts);
+ let mut buffer = BufferQueue::new();
+ for chunk in input.into_iter() { // each chunk is queued and fed separately
+ buffer.push_back(chunk);
+ let _ = tok.feed(&mut buffer); // the result of feed is deliberately ignored
+ }
+ tok.end();
+ tok.sink.lines
+ }
+
+ // Create a tag token of the given kind, with no attributes and self_closing set to false
+ fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
+ let name = LocalName::from(&*token); // the tendril's text becomes the tag name
+ let token = TagToken(Tag {
+ kind: tagkind,
+ name,
+ self_closing: false,
+ attrs: vec![],
+ });
+ token
+ }
+
+ #[test]
+ fn push_to_None_gives_singleton() { // option_push on None must yield Some("x")
+ let mut s: Option<StrTendril> = None;
+ option_push(&mut s, 'x');
+ assert_eq!(s, Some("x".to_tendril()));
+ }
+
+ #[test]
+ fn push_to_empty_appends() { // option_push on Some(empty tendril) must yield Some("x")
+ let mut s: Option<StrTendril> = Some(StrTendril::new());
+ option_push(&mut s, 'x');
+ assert_eq!(s, Some("x".to_tendril()));
+ }
+
+ #[test]
+ fn push_to_nonempty_appends() { // option_push on Some("y") must append, yielding Some("yx")
+ let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
+ option_push(&mut s, 'x');
+ assert_eq!(s, Some("yx".to_tendril()));
+ }
+
+ #[test]
+ fn check_lines() { // four tags separated by "\n" must be reported on lines 1 to 4
+ let opts = TokenizerOpts {
+ exact_errors: false,
+ discard_bom: true,
+ profile: false,
+ initial_state: None,
+ last_start_tag_name: None,
+ };
+ let vector = vec![ // one input chunk per line, each ending in "\n"
+ StrTendril::from("<a>\n"),
+ StrTendril::from("<b>\n"),
+ StrTendril::from("</b>\n"),
+ StrTendril::from("</a>\n"),
+ ];
+ let expected = vec![ // (tag token, expected line number)
+ (create_tag(StrTendril::from("a"), StartTag), 1),
+ (create_tag(StrTendril::from("b"), StartTag), 2),
+ (create_tag(StrTendril::from("b"), EndTag), 3),
+ (create_tag(StrTendril::from("a"), EndTag), 4),
+ ];
+ let results = tokenize(vector, opts);
+ assert_eq!(results, expected);
+ }
+
+ #[test]
+ fn check_lines_with_new_line() { // same as check_lines but with "\r\n" endings: line numbers must still be 1 to 4
+ let opts = TokenizerOpts {
+ exact_errors: false,
+ discard_bom: true,
+ profile: false,
+ initial_state: None,
+ last_start_tag_name: None,
+ };
+ let vector = vec![ // one input chunk per line, each ending in "\r\n"
+ StrTendril::from("<a>\r\n"),
+ StrTendril::from("<b>\r\n"),
+ StrTendril::from("</b>\r\n"),
+ StrTendril::from("</a>\r\n"),
+ ];
+ let expected = vec![ // (tag token, expected line number)
+ (create_tag(StrTendril::from("a"), StartTag), 1),
+ (create_tag(StrTendril::from("b"), StartTag), 2),
+ (create_tag(StrTendril::from("b"), EndTag), 3),
+ (create_tag(StrTendril::from("a"), EndTag), 4),
+ ];
+ let results = tokenize(vector, opts);
+ assert_eq!(results, expected);
+ }
+}
diff --git a/src/tokenizer/states.rs b/src/tokenizer/states.rs
new file mode 100644
index 0000000..d455e9a
--- /dev/null
+++ b/src/tokenizer/states.rs
@@ -0,0 +1,93 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Tokenizer states.
+//!
+//! This is public for use by the tokenizer tests. Other library
+//! users should not have to care about this.
+
+pub use self::AttrValueKind::*;
+pub use self::DoctypeIdKind::*;
+pub use self::RawKind::*;
+pub use self::ScriptEscapeKind::*;
+pub use self::State::*;
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum ScriptEscapeKind { // Selects between the "escaped" and "double escaped" variants of the script-data states.
+ Escaped, // e.g. script-data-escaped-state
+ DoubleEscaped, // e.g. script-data-double-escaped-state
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum DoctypeIdKind { // Selects which doctype identifier a doctype-identifier state refers to.
+ Public, // the *-doctype-public-* states
+ System, // the *-doctype-system-* states
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum RawKind { // Parameter of the Raw* states; one variant per family of raw-text states.
+ Rcdata, // rcdata-* states
+ Rawtext, // rawtext-* states
+ ScriptData, // script-data-* states
+ ScriptDataEscaped(ScriptEscapeKind), // script-data-escaped-* / script-data-double-escaped-* states
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum AttrValueKind { // Quoting style handled by an AttributeValue state.
+ Unquoted, // attribute-value-(unquoted)-state
+ SingleQuoted, // attribute-value-(single-quoted)-state
+ DoubleQuoted, // attribute-value-(double-quoted)-state
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
+pub enum State { // Tokenizer states; several spec states that differ only by kind share one parameterized variant.
+ Data,
+ Plaintext,
+ TagOpen,
+ EndTagOpen,
+ TagName,
+ RawData(RawKind), // the text state of each RawKind, e.g. script-data-state
+ RawLessThanSign(RawKind), // the *-less-than-sign-state of each RawKind
+ RawEndTagOpen(RawKind), // the *-end-tag-open-state of each RawKind
+ RawEndTagName(RawKind), // the *-end-tag-name-state of each RawKind
+ ScriptDataEscapeStart(ScriptEscapeKind), // script-data-escape-start-state / script-data-double-escape-start-state
+ ScriptDataEscapeStartDash,
+ ScriptDataEscapedDash(ScriptEscapeKind), // script-data-escaped-dash-state / script-data-double-escaped-dash-state
+ ScriptDataEscapedDashDash(ScriptEscapeKind), // script-data-escaped-dash-dash-state / script-data-double-escaped-dash-dash-state
+ ScriptDataDoubleEscapeEnd,
+ BeforeAttributeName,
+ AttributeName,
+ AfterAttributeName,
+ BeforeAttributeValue,
+ AttributeValue(AttrValueKind), // attribute-value-(double-quoted|single-quoted|unquoted)-state
+ AfterAttributeValueQuoted,
+ SelfClosingStartTag,
+ BogusComment,
+ MarkupDeclarationOpen,
+ CommentStart,
+ CommentStartDash,
+ Comment,
+ CommentEndDash,
+ CommentEnd,
+ CommentEndBang,
+ Doctype,
+ BeforeDoctypeName,
+ DoctypeName,
+ AfterDoctypeName,
+ AfterDoctypeKeyword(DoctypeIdKind), // after-doctype-public-keyword-state / after-doctype-system-keyword-state
+ BeforeDoctypeIdentifier(DoctypeIdKind), // before-doctype-public-identifier-state / before-doctype-system-identifier-state
+ DoctypeIdentifierDoubleQuoted(DoctypeIdKind), // doctype-(public|system)-identifier-(double-quoted)-state
+ DoctypeIdentifierSingleQuoted(DoctypeIdKind), // doctype-(public|system)-identifier-(single-quoted)-state
+ AfterDoctypeIdentifier(DoctypeIdKind), // after-doctype-public-identifier-state / after-doctype-system-identifier-state
+ BetweenDoctypePublicAndSystemIdentifiers,
+ BogusDoctype,
+ CdataSection,
+ CdataSectionBracket, // a single ']' has been seen inside a CDATA section
+ CdataSectionEnd, // "]]" has been seen inside a CDATA section
+}
diff --git a/src/tree_builder/data.rs b/src/tree_builder/data.rs
new file mode 100644
index 0000000..9d51a71
--- /dev/null
+++ b/src/tree_builder/data.rs
@@ -0,0 +1,171 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use crate::interface::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};
+use crate::tendril::StrTendril;
+use crate::tokenizer::Doctype;
+
+// Lookup tables consulted by `doctype_error_and_quirks`, which compares them
+// against the ASCII-lowercased public/system identifiers of the DOCTYPE.
+//
+// These should all be lowercase, for ASCII-case-insensitive matching.
+
+// A public identifier starting with any of these prefixes selects `Quirks`.
+static QUIRKY_PUBLIC_PREFIXES: &'static [&'static str] = &[
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
+ "-//as//dtd html 3.0 aswedit + extensions//",
+ "-//ietf//dtd html 2.0 level 1//",
+ "-//ietf//dtd html 2.0 level 2//",
+ "-//ietf//dtd html 2.0 strict level 1//",
+ "-//ietf//dtd html 2.0 strict level 2//",
+ "-//ietf//dtd html 2.0 strict//",
+ "-//ietf//dtd html 2.0//",
+ "-//ietf//dtd html 2.1e//",
+ "-//ietf//dtd html 3.0//",
+ "-//ietf//dtd html 3.2 final//",
+ "-//ietf//dtd html 3.2//",
+ "-//ietf//dtd html 3//",
+ "-//ietf//dtd html level 0//",
+ "-//ietf//dtd html level 1//",
+ "-//ietf//dtd html level 2//",
+ "-//ietf//dtd html level 3//",
+ "-//ietf//dtd html strict level 0//",
+ "-//ietf//dtd html strict level 1//",
+ "-//ietf//dtd html strict level 2//",
+ "-//ietf//dtd html strict level 3//",
+ "-//ietf//dtd html strict//",
+ "-//ietf//dtd html//",
+ "-//metrius//dtd metrius presentational//",
+ "-//microsoft//dtd internet explorer 2.0 html strict//",
+ "-//microsoft//dtd internet explorer 2.0 html//",
+ "-//microsoft//dtd internet explorer 2.0 tables//",
+ "-//microsoft//dtd internet explorer 3.0 html strict//",
+ "-//microsoft//dtd internet explorer 3.0 html//",
+ "-//microsoft//dtd internet explorer 3.0 tables//",
+ "-//netscape comm. corp.//dtd html//",
+ "-//netscape comm. corp.//dtd strict html//",
+ "-//o'reilly and associates//dtd html 2.0//",
+ "-//o'reilly and associates//dtd html extended 1.0//",
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//",
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
+ "-//spyglass//dtd html 2.0 extended//",
+ "-//sq//dtd html 2.0 hotmetal + extensions//",
+ "-//sun microsystems corp.//dtd hotjava html//",
+ "-//sun microsystems corp.//dtd hotjava strict html//",
+ "-//w3c//dtd html 3 1995-03-24//",
+ "-//w3c//dtd html 3.2 draft//",
+ "-//w3c//dtd html 3.2 final//",
+ "-//w3c//dtd html 3.2//",
+ "-//w3c//dtd html 3.2s draft//",
+ "-//w3c//dtd html 4.0 frameset//",
+ "-//w3c//dtd html 4.0 transitional//",
+ "-//w3c//dtd html experimental 19960712//",
+ "-//w3c//dtd html experimental 970421//",
+ "-//w3c//dtd w3 html//",
+ "-//w3o//dtd w3 html 3.0//",
+ "-//webtechs//dtd mozilla html 2.0//",
+ "-//webtechs//dtd mozilla html//",
+];
+
+// A public identifier equal to one of these (whole-string match) selects `Quirks`.
+static QUIRKY_PUBLIC_MATCHES: &'static [&'static str] = &[
+ "-//w3o//dtd w3 html strict 3.0//en//",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html",
+];
+
+// A system identifier equal to one of these (whole-string match) selects `Quirks`.
+static QUIRKY_SYSTEM_MATCHES: &'static [&'static str] =
+ &["http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"];
+
+// A public identifier starting with any of these prefixes selects `LimitedQuirks`.
+static LIMITED_QUIRKY_PUBLIC_PREFIXES: &'static [&'static str] = &[
+ "-//w3c//dtd xhtml 1.0 frameset//",
+ "-//w3c//dtd xhtml 1.0 transitional//",
+];
+
+// A public identifier starting with any of these prefixes selects `Quirks`
+// when there is no system identifier and `LimitedQuirks` when there is one.
+static HTML4_PUBLIC_PREFIXES: &'static [&'static str] = &[
+ "-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//",
+];
+
+/// Classify a DOCTYPE token for the tree builder.
+///
+/// Returns `(err, quirk)`:
+///
+/// * `err` is `false` only when the name is exactly `html` and the
+///   public/system identifier pair is one of the combinations accepted below
+///   (compared case-sensitively); otherwise it is `true`.
+/// * `quirk` is the quirks mode chosen from `force_quirks`, the name,
+///   `iframe_srcdoc`, and the ASCII-lowercased identifiers.
+pub fn doctype_error_and_quirks(doctype: &Doctype, iframe_srcdoc: bool) -> (bool, QuirksMode) {
+    /// Borrow an optional tendril as an optional string slice.
+    fn as_str(id: &Option<StrTendril>) -> Option<&str> {
+        id.as_deref()
+    }
+
+    // FIXME: We could do something asymptotically faster here.
+    // But there aren't many strings, and this happens at most once per parse.
+    fn has_listed_prefix(prefixes: &[&str], id: &str) -> bool {
+        prefixes.iter().any(|&prefix| id.starts_with(prefix))
+    }
+
+    let name = as_str(&doctype.name);
+    let public = as_str(&doctype.public_id);
+    let system = as_str(&doctype.system_id);
+
+    let is_html = name == Some("html");
+
+    // Identifier pairs that do not count as a parse error (exact match).
+    let ids_accepted = match (public, system) {
+        (None, None) | (None, Some("about:legacy-compat")) => true,
+        (Some("-//W3C//DTD HTML 4.0//EN"), None) |
+        (
+            Some("-//W3C//DTD HTML 4.0//EN"),
+            Some("http://www.w3.org/TR/REC-html40/strict.dtd"),
+        ) => true,
+        (Some("-//W3C//DTD HTML 4.01//EN"), None) |
+        (
+            Some("-//W3C//DTD HTML 4.01//EN"),
+            Some("http://www.w3.org/TR/html4/strict.dtd"),
+        ) => true,
+        (
+            Some("-//W3C//DTD XHTML 1.0 Strict//EN"),
+            Some("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"),
+        ) => true,
+        (
+            Some("-//W3C//DTD XHTML 1.1//EN"),
+            Some("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"),
+        ) => true,
+        _ => false,
+    };
+    let err = !(is_html && ids_accepted);
+
+    // Quirks-mode matches are case-insensitive.
+    let public_lowered = public.map(str::to_ascii_lowercase);
+    let system_lowered = system.map(str::to_ascii_lowercase);
+    let public_id = public_lowered.as_deref();
+    let system_id = system_lowered.as_deref();
+
+    let quirk = if doctype.force_quirks || !is_html {
+        Quirks
+    } else if iframe_srcdoc {
+        NoQuirks
+    } else if public_id.map_or(false, |p| QUIRKY_PUBLIC_MATCHES.contains(&p)) ||
+        system_id.map_or(false, |s| QUIRKY_SYSTEM_MATCHES.contains(&s))
+    {
+        Quirks
+    } else {
+        match public_id {
+            Some(p) if has_listed_prefix(QUIRKY_PUBLIC_PREFIXES, p) => Quirks,
+            Some(p) if has_listed_prefix(LIMITED_QUIRKY_PUBLIC_PREFIXES, p) => LimitedQuirks,
+            Some(p) if has_listed_prefix(HTML4_PUBLIC_PREFIXES, p) => {
+                if system_id.is_none() {
+                    Quirks
+                } else {
+                    LimitedQuirks
+                }
+            },
+            _ => NoQuirks,
+        }
+    };
+
+    (err, quirk)
+}
diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs
new file mode 100644
index 0000000..a6fa8bf
--- /dev/null
+++ b/src/tree_builder/mod.rs
@@ -0,0 +1,1681 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#![allow(warnings)]
+
+//! The HTML5 tree builder.
+
+pub use crate::interface::{create_element, ElementFlags, NextParserState, Tracer, TreeSink};
+pub use crate::interface::{AppendNode, AppendText, Attribute, NodeOrText};
+pub use crate::interface::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};
+
+use self::types::*;
+
+use crate::tendril::StrTendril;
+use crate::{ExpandedName, LocalName, Namespace, QualName};
+
+use crate::tokenizer;
+use crate::tokenizer::states as tok_state;
+use crate::tokenizer::{Doctype, EndTag, StartTag, Tag, TokenSink, TokenSinkResult};
+
+use crate::util::str::is_ascii_whitespace;
+
+use std::borrow::Cow::Borrowed;
+use std::collections::VecDeque;
+use std::default::Default;
+use std::iter::{Enumerate, Rev};
+use std::mem::replace;
+use std::{fmt, slice};
+
+use crate::tokenizer::states::{RawData, RawKind};
+use crate::tree_builder::tag_sets::*;
+use crate::tree_builder::types::*;
+use crate::util::str::to_escaped_string;
+use log::{debug, log_enabled, warn, Level};
+use mac::{_tt_as_expr_hack, format_if, matches};
+
+pub use self::PushFlag::*;
+
+#[macro_use]
+mod tag_sets;
+
+mod data;
+mod types;
+
+include!(concat!(env!("OUT_DIR"), "/rules.rs"));
+
+/// Tree builder options, with an impl for Default.
+///
+/// The struct is `Copy`; `TreeBuilder::new` keeps a copy and also reads
+/// `quirks_mode` from it to seed its own quirks mode.
+#[derive(Copy, Clone)]
+pub struct TreeBuilderOpts {
+ /// Report all parse errors described in the spec, at some
+ /// performance penalty? Default: false
+ pub exact_errors: bool,
+
+ /// Is scripting enabled? Default: true
+ pub scripting_enabled: bool,
+
+ /// Is this an `iframe srcdoc` document? Default: false
+ pub iframe_srcdoc: bool,
+
+ /// Should we drop the DOCTYPE (if any) from the tree? Default: false
+ pub drop_doctype: bool,
+
+ /// Obsolete, ignored. Default: false
+ pub ignore_missing_rules: bool,
+
+ /// Initial TreeBuilder quirks mode. Default: NoQuirks
+ pub quirks_mode: QuirksMode,
+}
+
+impl Default for TreeBuilderOpts {
+    /// Default options: scripting enabled, `NoQuirks`, every other flag off.
+    fn default() -> Self {
+        Self {
+            scripting_enabled: true,
+            quirks_mode: NoQuirks,
+            exact_errors: false,
+            iframe_srcdoc: false,
+            drop_doctype: false,
+            ignore_missing_rules: false,
+        }
+    }
+}
+
+/// The HTML tree builder.
+///
+/// Holds the parser state (insertion mode, stack of open elements, list of
+/// active formatting elements, element pointers, flags) and forwards tree
+/// modifications to `sink`.
+pub struct TreeBuilder<Handle, Sink> {
+ /// Options controlling the behavior of the tree builder.
+ opts: TreeBuilderOpts,
+
+ /// Consumer of tree modifications.
+ pub sink: Sink,
+
+ /// Insertion mode.
+ mode: InsertionMode,
+
+ /// Original insertion mode, used by Text and InTableText modes.
+ orig_mode: Option<InsertionMode>,
+
+ /// Stack of template insertion modes.
+ template_modes: Vec<InsertionMode>,
+
+ /// Pending table character tokens.
+ pending_table_text: Vec<(SplitStatus, StrTendril)>,
+
+ /// Quirks mode as set by the parser.
+ /// FIXME: can scripts etc. change this?
+ quirks_mode: QuirksMode,
+
+ /// The document node, which is created by the sink.
+ doc_handle: Handle,
+
+ /// Stack of open elements, most recently added at end.
+ open_elems: Vec<Handle>,
+
+ /// List of active formatting elements.
+ active_formatting: Vec<FormatEntry<Handle>>,
+
+ //§ the-element-pointers
+ /// Head element pointer.
+ head_elem: Option<Handle>,
+
+ /// Form element pointer.
+ form_elem: Option<Handle>,
+ //§ END
+ /// Frameset-ok flag.
+ frameset_ok: bool,
+
+ /// Ignore a following U+000A LINE FEED?
+ ignore_lf: bool,
+
+ /// Is foster parenting enabled?
+ foster_parenting: bool,
+
+ /// The context element for the fragment parsing algorithm.
+ /// `Some` only for builders created with `new_for_fragment`.
+ context_elem: Option<Handle>,
+
+ /// Track current line
+ // The constructors initialise this to 1; `process_token` compares it with
+ // the line number supplied alongside each token.
+ current_line: u64,
+ // WARNING: If you add new fields that contain Handles, you
+ // must add them to trace_handles() below to preserve memory
+ // safety!
+ //
+ // FIXME: Auto-generate the trace hooks like Servo does.
+}
+
+impl<Handle, Sink> TreeBuilder<Handle, Sink>
+where
+ Handle: Clone,
+ Sink: TreeSink<Handle = Handle>,
+{
+    /// Build a tree builder that reports every tree modification to `sink`.
+    ///
+    /// Parsing starts in the `Initial` insertion mode with empty stacks; the
+    /// quirks mode is seeded from `opts.quirks_mode`.
+    ///
+    /// The tree builder is also a `TokenSink`.
+    pub fn new(mut sink: Sink, opts: TreeBuilderOpts) -> TreeBuilder<Handle, Sink> {
+        let doc_handle = sink.get_document();
+        let quirks_mode = opts.quirks_mode;
+        TreeBuilder {
+            opts,
+            sink,
+            doc_handle,
+            quirks_mode,
+            mode: Initial,
+            orig_mode: None,
+            template_modes: Vec::new(),
+            pending_table_text: Vec::new(),
+            open_elems: Vec::new(),
+            active_formatting: Vec::new(),
+            head_elem: None,
+            form_elem: None,
+            context_elem: None,
+            frameset_ok: true,
+            ignore_lf: false,
+            foster_parenting: false,
+            current_line: 1,
+        }
+    }
+
+    /// Build a tree builder for fragment parsing that reports every tree
+    /// modification to `sink`.
+    ///
+    /// `context_elem` is the fragment's context element and `form_elem` the
+    /// initial form element pointer. When the context element is an HTML
+    /// `template`, the stack of template insertion modes starts with
+    /// `InTemplate`.
+    ///
+    /// The tree builder is also a `TokenSink`.
+    pub fn new_for_fragment(
+        mut sink: Sink,
+        context_elem: Handle,
+        form_elem: Option<Handle>,
+        opts: TreeBuilderOpts,
+    ) -> TreeBuilder<Handle, Sink> {
+        let doc_handle = sink.get_document();
+        let template_modes =
+            if sink.elem_name(&context_elem) == expanded_name!(html "template") {
+                vec![InTemplate]
+            } else {
+                Vec::new()
+            };
+        let quirks_mode = opts.quirks_mode;
+
+        let mut builder = TreeBuilder {
+            opts,
+            sink,
+            doc_handle,
+            quirks_mode,
+            template_modes,
+            form_elem,
+            context_elem: Some(context_elem),
+            mode: Initial,
+            orig_mode: None,
+            pending_table_text: Vec::new(),
+            open_elems: Vec::new(),
+            active_formatting: Vec::new(),
+            head_elem: None,
+            frameset_ok: true,
+            ignore_lf: false,
+            foster_parenting: false,
+            current_line: 1,
+        };
+
+        // https://html.spec.whatwg.org/multipage/#parsing-html-fragments
+        // 5. Let root be a new html element with no attributes.
+        // 6. Append the element root to the Document node created above.
+        // 7. Set up the parser's stack of open elements so that it contains just the single element root.
+        builder.create_root(vec![]);
+        // 10. Reset the parser's insertion mode appropriately.
+        builder.mode = builder.reset_insertion_mode();
+
+        builder
+    }
+
+    // https://html.spec.whatwg.org/multipage/#concept-frag-parse-context
+    // Step 4. Set the state of the HTML parser's tokenization stage as follows:
+    //
+    // Returns the tokenizer state to start in, chosen from the context
+    // element's name. Non-HTML context elements and unlisted names give `Data`.
+    // Panics with "no context element" when this is not a fragment parser.
+    pub fn tokenizer_state_for_context_elem(&self) -> tok_state::State {
+        let context = self.context_elem.as_ref().expect("no context element");
+        let expanded = self.sink.elem_name(context);
+        if *expanded.ns != ns!(html) {
+            return tok_state::Data;
+        }
+
+        match *expanded.local {
+            local_name!("title") | local_name!("textarea") => tok_state::RawData(tok_state::Rcdata),
+
+            local_name!("style") |
+            local_name!("xmp") |
+            local_name!("iframe") |
+            local_name!("noembed") |
+            local_name!("noframes") => tok_state::RawData(tok_state::Rawtext),
+
+            // `noscript` is raw text only while scripting is enabled; otherwise
+            // it falls through to the `Data` default below.
+            local_name!("noscript") if self.opts.scripting_enabled => {
+                tok_state::RawData(tok_state::Rawtext)
+            },
+
+            local_name!("script") => tok_state::RawData(tok_state::ScriptData),
+
+            local_name!("plaintext") => tok_state::Plaintext,
+
+            _ => tok_state::Data,
+        }
+    }
+
+    /// Report every `Handle` held in the tree builder's internal state to
+    /// `tracer` via `trace_handle`. This is intended to support
+    /// garbage-collected DOMs.
+    ///
+    /// Visits, in order: the document handle, the stack of open elements, the
+    /// element entries of the active formatting list, then the head, form and
+    /// context element pointers.
+    pub fn trace_handles(&self, tracer: &Tracer<Handle = Handle>) {
+        tracer.trace_handle(&self.doc_handle);
+
+        for handle in self.open_elems.iter() {
+            tracer.trace_handle(handle);
+        }
+
+        for entry in self.active_formatting.iter() {
+            // Markers carry no handle.
+            if let Element(ref handle, _) = *entry {
+                tracer.trace_handle(handle);
+            }
+        }
+
+        let pointers = self
+            .head_elem
+            .iter()
+            .chain(self.form_elem.iter())
+            .chain(self.context_elem.iter());
+        for handle in pointers {
+            tracer.trace_handle(handle);
+        }
+    }
+
+    /// Debugging aid: print the stack of open elements and the list of active
+    /// formatting elements to stdout, prefixed by `label`.
+    ///
+    /// Panics if any listed element is not in the HTML namespace.
+    #[allow(dead_code)]
+    fn dump_state(&self, label: String) {
+        // Prints " <local name>" for an HTML element; anything else panics.
+        let print_local = |handle: &Handle| {
+            let name = self.sink.elem_name(handle);
+            match *name.ns {
+                ns!(html) => print!(" {}", name.local),
+                _ => panic!(),
+            }
+        };
+
+        println!("dump_state on {}", label);
+
+        print!(" open_elems:");
+        for node in &self.open_elems {
+            print_local(node);
+        }
+        println!("");
+
+        print!(" active_formatting:");
+        for entry in &self.active_formatting {
+            match *entry {
+                Marker => print!(" Marker"),
+                Element(ref handle, _) => print_local(handle),
+            }
+        }
+        println!("");
+    }
+
+    /// Log the token about to be processed and the insertion mode, at debug
+    /// level. The token is only escaped when debug logging is enabled.
+    fn debug_step(&self, mode: InsertionMode, token: &Token) {
+        if !log_enabled!(Level::Debug) {
+            return;
+        }
+        debug!(
+            "processing {} in insertion mode {:?}",
+            to_escaped_string(token),
+            mode
+        );
+    }
+
+ /// Run `token` through the tree-construction rules until nothing is left to
+ /// process, then report to the tokenizer what to do next.
+ ///
+ /// Each iteration dispatches the current token to `step_foreign` or `step`
+ /// and acts on the returned `ProcessResult`: the token may be finished,
+ /// reprocessed (possibly in another insertion mode), split into whitespace
+ /// and non-whitespace runs, or turned into a request for a tokenizer state
+ /// change (`Script`, `Plaintext`, `RawData`).
+ ///
+ /// NOTE(review): `unwrap_or_return!` is defined elsewhere; it presumably
+ /// yields the `Some` value or returns its second argument from this
+ /// function — confirm.
+ fn process_to_completion(&mut self, mut token: Token) -> TokenSinkResult<Handle> {
+ // Queue of additional tokens yet to be processed.
+ // This stays empty in the common case where we don't split whitespace.
+ let mut more_tokens = VecDeque::new();
+
+ loop {
+ // Recorded before the token is moved into `step`/`step_foreign`: a
+ // self-closing start tag that finishes with plain `Done` (rather than
+ // `DoneAckSelfClosing`) is reported as a parse error below.
+ let should_have_acknowledged_self_closing_flag = matches!(
+ token,
+ TagToken(Tag {
+ self_closing: true,
+ kind: StartTag,
+ ..
+ })
+ );
+ let result = if self.is_foreign(&token) {
+ self.step_foreign(token)
+ } else {
+ let mode = self.mode;
+ self.step(mode, token)
+ };
+ match result {
+ Done => {
+ if should_have_acknowledged_self_closing_flag {
+ self.sink
+ .parse_error(Borrowed("Unacknowledged self-closing tag"));
+ }
+ token = unwrap_or_return!(
+ more_tokens.pop_front(),
+ tokenizer::TokenSinkResult::Continue
+ );
+ },
+ DoneAckSelfClosing => {
+ token = unwrap_or_return!(
+ more_tokens.pop_front(),
+ tokenizer::TokenSinkResult::Continue
+ );
+ },
+ // Switch insertion mode and run the same token again.
+ Reprocess(m, t) => {
+ self.mode = m;
+ token = t;
+ },
+ ReprocessForeign(t) => {
+ token = t;
+ },
+ // Peel the leading run (all whitespace or all non-whitespace) off
+ // `buf`, process it now and queue whatever remains.
+ SplitWhitespace(mut buf) => {
+ let p = buf.pop_front_char_run(is_ascii_whitespace);
+ let (first, is_ws) = unwrap_or_return!(p, tokenizer::TokenSinkResult::Continue);
+ let status = if is_ws { Whitespace } else { NotWhitespace };
+ token = CharacterTokens(status, first);
+
+ if buf.len32() > 0 {
+ more_tokens.push_back(CharacterTokens(NotSplit, buf));
+ }
+ },
+ // The remaining results hand control back to the tokenizer; the
+ // assertions check that no queued tokens would be dropped.
+ Script(node) => {
+ assert!(more_tokens.is_empty());
+ return tokenizer::TokenSinkResult::Script(node);
+ },
+ ToPlaintext => {
+ assert!(more_tokens.is_empty());
+ return tokenizer::TokenSinkResult::Plaintext;
+ },
+ ToRawData(k) => {
+ assert!(more_tokens.is_empty());
+ return tokenizer::TokenSinkResult::RawData(k);
+ },
+ }
+ }
+ }
+
+ /// Are we parsing a HTML fragment?
+ ///
+ /// True exactly when a context element is set, which only
+ /// `new_for_fragment` does.
+ pub fn is_fragment(&self) -> bool {
+ self.context_elem.is_some()
+ }
+
+    /// https://html.spec.whatwg.org/multipage/#appropriate-place-for-inserting-a-node
+    ///
+    /// The target is `override_target` when given, otherwise the current node.
+    /// Unless foster parenting applies (flag set and the target is one of
+    /// `table`, `tbody`, `tfoot`, `thead`, `tr`), the node goes in as the last
+    /// child of the target, or of its template contents for a `template`.
+    fn appropriate_place_for_insertion(
+        &mut self,
+        override_target: Option<Handle>,
+    ) -> InsertionPoint<Handle> {
+        use self::tag_sets::*;
+
+        declare_tag_set!(foster_target = "table" "tbody" "tfoot" "thead" "tr");
+        let target = match override_target {
+            Some(handle) => handle,
+            None => self.current_node().clone(),
+        };
+
+        let fostering = self.foster_parenting && self.elem_in(&target, foster_target);
+        if !fostering {
+            // No foster parenting: inside a template the contents node is the
+            // parent, otherwise (the common case) the target itself.
+            let parent = if self.html_elem_named(&target, local_name!("template")) {
+                self.sink.get_template_contents(&target)
+            } else {
+                target
+            };
+            return LastChild(parent);
+        }
+
+        // Foster parenting: walk the stack from the current node downwards and
+        // stop at the first template or table.
+        let mut ancestors = self.open_elems.iter().rev().peekable();
+        while let Some(elem) = ancestors.next() {
+            if self.html_elem_named(elem, local_name!("template")) {
+                return LastChild(self.sink.get_template_contents(elem));
+            }
+            if self.html_elem_named(elem, local_name!("table")) {
+                // The entry just below the table on the stack.
+                let prev_element = (*ancestors.peek().unwrap()).clone();
+                return TableFosterParenting {
+                    element: elem.clone(),
+                    prev_element,
+                };
+            }
+        }
+
+        LastChild(self.html_elem().clone())
+    }
+
+    /// Insert `child` at `insertion_point` by calling the matching sink
+    /// operation for each kind of insertion point.
+    fn insert_at(&mut self, insertion_point: InsertionPoint<Handle>, child: NodeOrText<Handle>) {
+        match insertion_point {
+            LastChild(parent) => self.sink.append(&parent, child),
+            BeforeSibling(next) => self.sink.append_before_sibling(&next, child),
+            TableFosterParenting {
+                element: table,
+                prev_element: below_table,
+            } => {
+                // The sink decides between the two candidates.
+                self.sink
+                    .append_based_on_parent_node(&table, &below_table, child)
+            },
+        }
+    }
+}
+
+impl<Handle, Sink> TokenSink for TreeBuilder<Handle, Sink>
+where
+ Handle: Clone,
+ Sink: TreeSink<Handle = Handle>,
+{
+ type Handle = Handle;
+
+    /// Process one token coming from the tokenizer.
+    ///
+    /// Parse errors are forwarded to the sink and DOCTYPE tokens are handled
+    /// here; every other token is converted to the tree builder's own `Token`
+    /// type and run through `process_to_completion`.
+    ///
+    /// `line_number` is the line reported with the token. The sink is told
+    /// about it (`set_current_line`) only when it differs from the line
+    /// recorded for the previous token.
+    fn process_token(
+        &mut self,
+        token: tokenizer::Token,
+        line_number: u64,
+    ) -> TokenSinkResult<Handle> {
+        if line_number != self.current_line {
+            // Remember the line we just reported. Without this the comparison
+            // is always made against the initial value and the sink is
+            // notified again for every token past the first line.
+            self.current_line = line_number;
+            self.sink.set_current_line(line_number);
+        }
+        // The flag only ever applies to the token immediately following.
+        let ignore_lf = replace(&mut self.ignore_lf, false);
+
+        // Handle `ParseError` and `DoctypeToken`; convert everything else to the local `Token` type.
+        let token = match token {
+            tokenizer::ParseError(e) => {
+                self.sink.parse_error(e);
+                return tokenizer::TokenSinkResult::Continue;
+            },
+
+            tokenizer::DoctypeToken(dt) => {
+                if self.mode != Initial {
+                    // A DOCTYPE anywhere but at the very start is an error and
+                    // is otherwise ignored.
+                    self.sink.parse_error(format_if!(
+                        self.opts.exact_errors,
+                        "DOCTYPE in body",
+                        "DOCTYPE in insertion mode {:?}",
+                        self.mode
+                    ));
+                    return tokenizer::TokenSinkResult::Continue;
+                }
+
+                let (err, quirk) = data::doctype_error_and_quirks(&dt, self.opts.iframe_srcdoc);
+                if err {
+                    self.sink.parse_error(format_if!(
+                        self.opts.exact_errors,
+                        "Bad DOCTYPE",
+                        "Bad DOCTYPE: {:?}",
+                        dt
+                    ));
+                }
+                let Doctype {
+                    name,
+                    public_id,
+                    system_id,
+                    force_quirks: _,
+                } = dt;
+                if !self.opts.drop_doctype {
+                    // Missing parts are passed to the sink as empty strings.
+                    self.sink.append_doctype_to_document(
+                        name.unwrap_or_else(StrTendril::new),
+                        public_id.unwrap_or_else(StrTendril::new),
+                        system_id.unwrap_or_else(StrTendril::new),
+                    );
+                }
+                self.set_quirks_mode(quirk);
+
+                self.mode = BeforeHtml;
+                return tokenizer::TokenSinkResult::Continue;
+            },
+
+            tokenizer::TagToken(x) => TagToken(x),
+            tokenizer::CommentToken(x) => CommentToken(x),
+            tokenizer::NullCharacterToken => NullCharacterToken,
+            tokenizer::EOFToken => EOFToken,
+
+            tokenizer::CharacterTokens(mut x) => {
+                // Drop the single leading line feed the previous token asked
+                // us to ignore; an emptied buffer produces no token at all.
+                if ignore_lf && x.starts_with('\n') {
+                    x.pop_front(1);
+                }
+                if x.is_empty() {
+                    return tokenizer::TokenSinkResult::Continue;
+                }
+                CharacterTokens(NotSplit, x)
+            },
+        };
+
+        self.process_to_completion(token)
+    }
+
+    /// Finish parsing: empty the stack of open elements, notifying the sink of
+    /// each pop from the current node down to the root.
+    fn end(&mut self) {
+        while let Some(elem) = self.open_elems.pop() {
+            self.sink.pop(&elem);
+        }
+    }
+
+    /// True when the stack of open elements is non-empty and the adjusted
+    /// current node is outside the HTML namespace.
+    fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
+        if self.open_elems.is_empty() {
+            return false;
+        }
+        let name = self.sink.elem_name(self.adjusted_current_node());
+        *name.ns != ns!(html)
+    }
+}
+
+/// Return the bottom entry (index 0) of a stack of open elements.
+///
+/// Panics if `open_elems` is empty.
+pub fn html_elem<Handle>(open_elems: &[Handle]) -> &Handle {
+ &open_elems[0]
+}
+
+/// Iterator over the element entries of the active formatting list, walking
+/// backwards from the end and yielding `(index, handle, tag)`.
+pub struct ActiveFormattingIter<'a, Handle: 'a> {
+    iter: Rev<Enumerate<slice::Iter<'a, FormatEntry<Handle>>>>,
+}
+
+impl<'a, Handle> Iterator for ActiveFormattingIter<'a, Handle> {
+    type Item = (usize, &'a Handle, &'a Tag);
+
+    /// Yields the next element entry; returns `None` on reaching a marker or
+    /// the start of the list.
+    fn next(&mut self) -> Option<(usize, &'a Handle, &'a Tag)> {
+        match self.iter.next()? {
+            (_, &Marker) => None,
+            (index, &Element(ref handle, ref tag)) => Some((index, handle, tag)),
+        }
+    }
+}
+
+/// Flag passed to `insert_element` (e.g. `Push` in `reconstruct_formatting`).
+///
+/// NOTE(review): presumably selects whether the newly inserted element is
+/// also pushed onto the stack of open elements — confirm in `insert_element`.
+pub enum PushFlag {
+ Push,
+ NoPush,
+}
+
+/// Where `adoption_agency` places the new formatting entry in the list of
+/// active formatting elements.
+enum Bookmark<Handle> {
+ /// Overwrite the entry belonging to this handle.
+ Replace(Handle),
+ /// Insert right after the entry belonging to this handle (the old
+ /// formatting element's entry is then removed).
+ InsertAfter(Handle),
+}
+
+// Build a `QualName` from literals.
+//
+// * `qualname!("", local)` (note the comma): no prefix, the empty namespace
+//   (`ns!()`), and the given local name.
+// * `qualname!(prefix ns local)` (no commas): the given prefix, namespace and
+//   local name.
+macro_rules! qualname {
+ ("", $local:tt) => {
+ QualName {
+ prefix: None,
+ ns: ns!(),
+ local: local_name!($local),
+ }
+ };
+ ($prefix: tt $ns:tt $local:tt) => {
+ QualName {
+ prefix: Some(namespace_prefix!($prefix)),
+ ns: ns!($ns),
+ local: local_name!($local),
+ }
+ };
+}
+
+#[doc(hidden)]
+impl<Handle, Sink> TreeBuilder<Handle, Sink>
+where
+ Handle: Clone,
+ Sink: TreeSink<Handle = Handle>,
+{
+    /// Report `_thing` as an unexpected token to the sink and finish the
+    /// current token with `Done`.
+    fn unexpected<T: fmt::Debug>(&mut self, _thing: &T) -> ProcessResult<Handle> {
+        let message = format_if!(
+            self.opts.exact_errors,
+            "Unexpected token",
+            "Unexpected token {} in insertion mode {:?}",
+            to_escaped_string(_thing),
+            self.mode
+        );
+        self.sink.parse_error(message);
+        Done
+    }
+
+    /// Panic unless `node` is an HTML element with local name `name`.
+    fn assert_named(&mut self, node: &Handle, name: LocalName) {
+        let is_expected = self.html_elem_named(node, name);
+        assert!(is_expected);
+    }
+
+    /// Walk the active formatting elements backwards from the end, yielding
+    /// each entry with its index in the list, and stop at the last marker (or
+    /// at the beginning when there is no marker).
+    fn active_formatting_end_to_marker<'a>(&'a self) -> ActiveFormattingIter<'a, Handle> {
+        let iter = self.active_formatting.iter().enumerate().rev();
+        ActiveFormattingIter { iter }
+    }
+
+    /// Index of the first active formatting entry whose element is the same
+    /// node as `element` (as judged by the sink); markers never match.
+    fn position_in_active_formatting(&self, element: &Handle) -> Option<usize> {
+        self.active_formatting.iter().position(|entry| match *entry {
+            Element(ref handle, _) => self.sink.same_node(handle, element),
+            Marker => false,
+        })
+    }
+
+ /// Record `mode` as the builder's quirks mode and pass it on to the sink.
+ fn set_quirks_mode(&mut self, mode: QuirksMode) {
+ self.quirks_mode = mode;
+ self.sink.set_quirks_mode(mode);
+ }
+
+ /// Currently does nothing beyond finishing the current token with `Done`.
+ fn stop_parsing(&mut self) -> ProcessResult<Handle> {
+ Done
+ }
+
+ //§ parsing-elements-that-contain-only-text
+    // Enter the `Text` insertion mode, remembering the mode we came from in
+    // `orig_mode`, and ask the tokenizer to switch to the raw-data state `k`.
+    // The latter only takes effect after the current / next
+    // `process_token` of a start tag returns!
+    fn to_raw_text_mode(&mut self, k: RawKind) -> ProcessResult<Handle> {
+        let previous_mode = replace(&mut self.mode, Text);
+        self.orig_mode = Some(previous_mode);
+        ToRawData(k)
+    }
+
+ // The generic raw text / RCDATA parsing algorithm.
+ // Inserts an element for `tag`, then switches to `Text` mode and requests
+ // the raw-data tokenizer state `k` (see `to_raw_text_mode`).
+ fn parse_raw_data(&mut self, tag: Tag, k: RawKind) -> ProcessResult<Handle> {
+ self.insert_element_for(tag);
+ self.to_raw_text_mode(k)
+ }
+ //§ END
+
+ /// The current node: the last entry of the stack of open elements.
+ ///
+ /// Panics with "no current element" if the stack is empty.
+ fn current_node(&self) -> &Handle {
+ self.open_elems.last().expect("no current element")
+ }
+
+    /// The adjusted current node: the context element when one is set and the
+    /// stack of open elements holds exactly one entry, otherwise the current
+    /// node.
+    fn adjusted_current_node(&self) -> &Handle {
+        match self.context_elem {
+            Some(ref context) if self.open_elems.len() == 1 => context,
+            _ => self.current_node(),
+        }
+    }
+
+    /// Does the current node's expanded name belong to the tag set `set`?
+    fn current_node_in<TagSet>(&self, set: TagSet) -> bool
+    where
+        TagSet: Fn(ExpandedName) -> bool,
+    {
+        self.elem_in(self.current_node(), set)
+    }
+
+ // Insert at the "appropriate place for inserting a node".
+ // `override_target`, when given, replaces the current node as the target
+ // (see `appropriate_place_for_insertion`).
+ fn insert_appropriately(&mut self, child: NodeOrText<Handle>, override_target: Option<Handle>) {
+ let insertion_point = self.appropriate_place_for_insertion(override_target);
+ self.insert_at(insertion_point, child);
+ }
+
+ /// Run the adoption agency algorithm for an end tag named `subject`.
+ ///
+ /// The numbered comments follow the step numbers of the algorithm in the
+ /// HTML specification (NOTE(review): numbering as of when this code was
+ /// written — confirm against the current spec text).
+ ///
+ /// NOTE(review): `unwrap_or_return!` / `unwrap_or_else!` are defined
+ /// elsewhere; they presumably yield the `Some` value, or otherwise run the
+ /// given block and return from this function / fall into the block —
+ /// confirm.
+ fn adoption_agency(&mut self, subject: LocalName) {
+ // 1.
+ if self.current_node_named(subject.clone()) {
+ if self
+ .position_in_active_formatting(self.current_node())
+ .is_none()
+ {
+ self.pop();
+ return;
+ }
+ }
+
+ // 2. 3. 4.
+ for _ in 0..8 {
+ // 5.
+ let (fmt_elem_index, fmt_elem, fmt_elem_tag) = unwrap_or_return!(
+ // We clone the Handle and Tag so they don't cause an immutable borrow of self.
+ self.active_formatting_end_to_marker()
+ .filter(|&(_, _, tag)| tag.name == subject)
+ .next()
+ .map(|(i, h, t)| (i, h.clone(), t.clone())),
+ {
+ self.process_end_tag_in_body(Tag {
+ kind: EndTag,
+ name: subject,
+ self_closing: false,
+ attrs: vec![],
+ });
+ }
+ );
+
+ // The formatting element must still be on the stack of open elements;
+ // if not, report it and drop its entry from the active formatting list.
+ let fmt_elem_stack_index = unwrap_or_return!(
+ self.open_elems
+ .iter()
+ .rposition(|n| self.sink.same_node(n, &fmt_elem)),
+ {
+ self.sink
+ .parse_error(Borrowed("Formatting element not open"));
+ self.active_formatting.remove(fmt_elem_index);
+ }
+ );
+
+ // 7.
+ if !self.in_scope(default_scope, |n| self.sink.same_node(&n, &fmt_elem)) {
+ self.sink
+ .parse_error(Borrowed("Formatting element not in scope"));
+ return;
+ }
+
+ // 8.
+ if !self.sink.same_node(self.current_node(), &fmt_elem) {
+ self.sink
+ .parse_error(Borrowed("Formatting element not current node"));
+ }
+
+ // 9.
+ // First "special" element at or above the formatting element's stack
+ // position, searching towards the current node.
+ let (furthest_block_index, furthest_block) = unwrap_or_return!(
+ self.open_elems
+ .iter()
+ .enumerate()
+ .skip(fmt_elem_stack_index)
+ .filter(|&(_, open_element)| self.elem_in(open_element, special_tag))
+ .next()
+ .map(|(i, h)| (i, h.clone())),
+ // 10.
+ {
+ self.open_elems.truncate(fmt_elem_stack_index);
+ self.active_formatting.remove(fmt_elem_index);
+ }
+ );
+
+ // 11.
+ let common_ancestor = self.open_elems[fmt_elem_stack_index - 1].clone();
+
+ // 12.
+ let mut bookmark = Bookmark::Replace(fmt_elem.clone());
+
+ // 13.
+ let mut node;
+ let mut node_index = furthest_block_index;
+ let mut last_node = furthest_block.clone();
+
+ // 13.1.
+ let mut inner_counter = 0;
+ loop {
+ // 13.2.
+ inner_counter += 1;
+
+ // 13.3.
+ node_index -= 1;
+ node = self.open_elems[node_index].clone();
+
+ // 13.4.
+ if self.sink.same_node(&node, &fmt_elem) {
+ break;
+ }
+
+ // 13.5.
+ if inner_counter > 3 {
+ self.position_in_active_formatting(&node)
+ .map(|position| self.active_formatting.remove(position));
+ self.open_elems.remove(node_index);
+ continue;
+ }
+
+ let node_formatting_index = unwrap_or_else!(
+ self.position_in_active_formatting(&node),
+ // 13.6.
+ {
+ self.open_elems.remove(node_index);
+ continue;
+ }
+ );
+
+ // 13.7.
+ let tag = match self.active_formatting[node_formatting_index] {
+ Element(ref h, ref t) => {
+ assert!(self.sink.same_node(h, &node));
+ t.clone()
+ },
+ Marker => panic!("Found marker during adoption agency"),
+ };
+ // FIXME: Is there a way to avoid cloning the attributes twice here (once on their
+ // own, once as part of t.clone() above)?
+ let new_element = create_element(
+ &mut self.sink,
+ QualName::new(None, ns!(html), tag.name.clone()),
+ tag.attrs.clone(),
+ );
+ self.open_elems[node_index] = new_element.clone();
+ self.active_formatting[node_formatting_index] = Element(new_element.clone(), tag);
+ node = new_element;
+
+ // 13.8.
+ if self.sink.same_node(&last_node, &furthest_block) {
+ bookmark = Bookmark::InsertAfter(node.clone());
+ }
+
+ // 13.9.
+ self.sink.remove_from_parent(&last_node);
+ self.sink.append(&node, AppendNode(last_node.clone()));
+
+ // 13.10.
+ last_node = node.clone();
+
+ // 13.11.
+ }
+
+ // 14.
+ self.sink.remove_from_parent(&last_node);
+ self.insert_appropriately(AppendNode(last_node.clone()), Some(common_ancestor));
+
+ // 15.
+ // FIXME: Is there a way to avoid cloning the attributes twice here (once on their own,
+ // once as part of t.clone() above)?
+ let new_element = create_element(
+ &mut self.sink,
+ QualName::new(None, ns!(html), fmt_elem_tag.name.clone()),
+ fmt_elem_tag.attrs.clone(),
+ );
+ let new_entry = Element(new_element.clone(), fmt_elem_tag);
+
+ // 16.
+ self.sink.reparent_children(&furthest_block, &new_element);
+
+ // 17.
+ self.sink
+ .append(&furthest_block, AppendNode(new_element.clone()));
+
+ // 18.
+ // FIXME: We could probably get rid of the position_in_active_formatting() calls here
+ // if we had a more clever Bookmark representation.
+ match bookmark {
+ Bookmark::Replace(to_replace) => {
+ let index = self
+ .position_in_active_formatting(&to_replace)
+ .expect("bookmark not found in active formatting elements");
+ self.active_formatting[index] = new_entry;
+ },
+ Bookmark::InsertAfter(previous) => {
+ let index = self
+ .position_in_active_formatting(&previous)
+ .expect("bookmark not found in active formatting elements") +
+ 1;
+ self.active_formatting.insert(index, new_entry);
+ let old_index = self
+ .position_in_active_formatting(&fmt_elem)
+ .expect("formatting element not found in active formatting elements");
+ self.active_formatting.remove(old_index);
+ },
+ }
+
+ // 19.
+ self.remove_from_stack(&fmt_elem);
+ let new_furthest_block_index = self
+ .open_elems
+ .iter()
+ .position(|n| self.sink.same_node(n, &furthest_block))
+ .expect("furthest block missing from open element stack");
+ self.open_elems
+ .insert(new_furthest_block_index + 1, new_element);
+
+ // 20.
+ }
+ }
+
+ /// Push a clone of `elem` onto the stack of open elements.
+ fn push(&mut self, elem: &Handle) {
+ self.open_elems.push(elem.clone());
+ }
+
+ /// Pop the current node off the stack of open elements, notify the sink,
+ /// and return the popped handle.
+ ///
+ /// Panics with "no current element" if the stack is empty.
+ fn pop(&mut self) -> Handle {
+ let elem = self.open_elems.pop().expect("no current element");
+ self.sink.pop(&elem);
+ elem
+ }
+
+    /// Remove `elem` from the stack of open elements, searching from the
+    /// current node downwards, and notify the sink of the pop. Does nothing if
+    /// `elem` is not on the stack.
+    fn remove_from_stack(&mut self, elem: &Handle) {
+        let found = self
+            .open_elems
+            .iter()
+            .rposition(|open| self.sink.same_node(elem, open));
+        if let Some(index) = found {
+            self.open_elems.remove(index);
+            self.sink.pop(elem);
+        }
+    }
+
+    /// Is `entry` a marker, or an element that is currently on the stack of
+    /// open elements?
+    fn is_marker_or_open(&self, entry: &FormatEntry<Handle>) -> bool {
+        let node = match *entry {
+            Marker => return true,
+            Element(ref node, _) => node,
+        };
+        self.open_elems
+            .iter()
+            .rev()
+            .any(|open| self.sink.same_node(open, node))
+    }
+
+    /// Reconstruct the active formatting elements.
+    ///
+    /// Nothing happens when the list is empty or its last entry is a marker or
+    /// an open element. Otherwise every entry after the last marker/open entry
+    /// is re-created: a new element is inserted (and pushed) for its tag and
+    /// the entry is replaced by the new element.
+    fn reconstruct_formatting(&mut self) {
+        match self.active_formatting.last() {
+            None => return,
+            Some(last) if self.is_marker_or_open(last) => return,
+            Some(_) => {},
+        }
+
+        // Rewind: start just after the closest earlier entry that is a marker
+        // or still open, or at the very beginning if there is none.
+        let last_index = self.active_formatting.len() - 1;
+        let mut entry_index = self.active_formatting[..last_index]
+            .iter()
+            .rposition(|entry| self.is_marker_or_open(entry))
+            .map_or(0, |found| found + 1);
+
+        // Advance: re-create each entry up to and including the last one.
+        loop {
+            let tag = match self.active_formatting[entry_index] {
+                Element(_, ref t) => t.clone(),
+                Marker => panic!("Found marker during formatting element reconstruction"),
+            };
+
+            // FIXME: Is there a way to avoid cloning the attributes twice here (once on their own,
+            // once as part of t.clone() above)?
+            let new_element =
+                self.insert_element(Push, ns!(html), tag.name.clone(), tag.attrs.clone());
+            self.active_formatting[entry_index] = Element(new_element, tag);
+
+            entry_index += 1;
+            if entry_index == self.active_formatting.len() {
+                break;
+            }
+        }
+    }
+
+ /// Get the first element on the stack, which will be the <html> element.
+ ///
+ /// Panics if the stack of open elements is empty.
+ fn html_elem(&self) -> &Handle {
+ &self.open_elems[0]
+ }
+
+    /// Get the second element on the stack, if it's a HTML body element.
+    ///
+    /// Returns `None` when the stack has fewer than two entries or the second
+    /// entry is something other than an HTML `body`.
+    fn body_elem(&self) -> Option<&Handle> {
+        self.open_elems
+            .get(1)
+            .filter(|node| self.html_elem_named(node, local_name!("body")))
+    }
+
+    /// Signal an error depending on the state of the stack of open elements at
+    /// the end of the body.
+    ///
+    /// At most one parse error is reported: for the first open element
+    /// (searching from the bottom of the stack) that is not in the allowed
+    /// set.
+    fn check_body_end(&mut self) {
+        declare_tag_set!(body_end_ok =
+            "dd" "dt" "li" "optgroup" "option" "p" "rp" "rt" "tbody" "td" "tfoot" "th"
+            "thead" "tr" "body" "html");
+
+        // FIXME: Do we keep checking after finding one bad tag?
+        // The spec suggests not.
+        let first_error = self
+            .open_elems
+            .iter()
+            .map(|elem| self.sink.elem_name(elem))
+            .find(|&name| !body_end_ok(name))
+            .map(|name| {
+                format_if!(
+                    self.opts.exact_errors,
+                    "Unexpected open tag at end of body",
+                    "Unexpected open tag {:?} at end of body",
+                    name
+                )
+            });
+
+        if let Some(error) = first_error {
+            self.sink.parse_error(error);
+        }
+    }
+
+ /// Walks the stack of open elements from the top down. Returns `true` as
+ /// soon as `pred` accepts a node, and `false` as soon as a node's name is
+ /// in the `scope` set (`pred` is checked first for each node).
+ fn in_scope<TagSet, Pred>(&self, scope: TagSet, pred: Pred) -> bool
+ where
+     TagSet: Fn(ExpandedName) -> bool,
+     Pred: Fn(Handle) -> bool,
+ {
+     self.open_elems
+         .iter()
+         .rev()
+         .find_map(|node| {
+             if pred(node.clone()) {
+                 Some(true)
+             } else if scope(self.sink.elem_name(node)) {
+                 Some(false)
+             } else {
+                 None
+             }
+         })
+         // supposed to be impossible, because <html> is always in scope
+         .unwrap_or(false)
+ }
+
+ /// Returns whether the expanded name of `elem` belongs to the tag set `set`.
+ fn elem_in<TagSet>(&self, elem: &Handle, set: TagSet) -> bool
+ where
+     TagSet: Fn(ExpandedName) -> bool,
+ {
+     let expanded = self.sink.elem_name(elem);
+     set(expanded)
+ }
+
+ /// Returns whether `elem` is in the HTML namespace and has local name `name`.
+ fn html_elem_named(&self, elem: &Handle, name: LocalName) -> bool {
+     let ExpandedName { ns, local } = self.sink.elem_name(elem);
+     *ns == ns!(html) && *local == name
+ }
+
+ /// Returns whether any element on the stack of open elements is an HTML
+ /// element with local name `name`.
+ fn in_html_elem_named(&self, name: LocalName) -> bool {
+     for elem in self.open_elems.iter() {
+         if self.html_elem_named(elem, name.clone()) {
+             return true;
+         }
+     }
+     false
+ }
+
+ /// Returns whether the current node is an HTML element with local name `name`.
+ fn current_node_named(&self, name: LocalName) -> bool {
+     let top = self.current_node();
+     self.html_elem_named(top, name)
+ }
+
+ /// Like `in_scope`, with the predicate "is an HTML element named `name`".
+ fn in_scope_named<TagSet>(&self, scope: TagSet, name: LocalName) -> bool
+ where
+     TagSet: Fn(ExpandedName) -> bool,
+ {
+     self.in_scope(scope, |elem| {
+         let wanted = name.clone();
+         self.html_elem_named(&elem, wanted)
+     })
+ }
+
+ //§ closing-elements-that-have-implied-end-tags
+ /// Pops elements off the stack of open elements for as long as the top
+ /// element's name is in `set`. Stops when the stack is empty.
+ fn generate_implied_end<TagSet>(&mut self, set: TagSet)
+ where
+     TagSet: Fn(ExpandedName) -> bool,
+ {
+     while let Some(top) = self.open_elems.last() {
+         if !set(self.sink.elem_name(top)) {
+             return;
+         }
+         self.pop();
+     }
+ }
+
+ /// Generates implied end tags using `cursory_implied_end`, but never pops
+ /// an HTML element whose local name is `except`.
+ fn generate_implied_end_except(&mut self, except: LocalName) {
+     self.generate_implied_end(|p| {
+         let is_excepted = *p.ns == ns!(html) && *p.local == except;
+         !is_excepted && cursory_implied_end(p)
+     });
+ }
+ //§ END
+
+ // Pop elements until the current element is in the set.
+ // The element that satisfies `pred` is left on the stack.
+ fn pop_until_current<TagSet>(&mut self, pred: TagSet)
+ where
+     TagSet: Fn(ExpandedName) -> bool,
+ {
+     while !self.current_node_in(|name| pred(name)) {
+         self.open_elems.pop();
+     }
+ }
+
+ // Pop elements until an element from the set has been popped. Returns the
+ // number of elements popped.
+ //
+ // Note: if the stack runs out before a match is found, the final (failed)
+ // pop attempt is included in the returned count.
+ fn pop_until<P>(&mut self, pred: P) -> usize
+ where
+     P: Fn(ExpandedName) -> bool,
+ {
+     let mut attempts = 1;
+     while let Some(elem) = self.open_elems.pop() {
+         if pred(self.sink.elem_name(&elem)) {
+             break;
+         }
+         attempts += 1;
+     }
+     attempts
+ }
+
+ /// Pops elements until an HTML element with local name `name` has been
+ /// popped; returns the count reported by `pop_until`.
+ fn pop_until_named(&mut self, name: LocalName) -> usize {
+     self.pop_until(|elem_name| {
+         let is_html = *elem_name.ns == ns!(html);
+         is_html && *elem_name.local == name
+     })
+ }
+
+ // Pop elements until one with the specified name has been popped.
+ // Signal an error if it was not the first one.
+ fn expect_to_close(&mut self, name: LocalName) {
+     let popped = self.pop_until_named(name.clone());
+     if popped == 1 {
+         return;
+     }
+
+     let error = format_if!(
+         self.opts.exact_errors,
+         "Unexpected open element",
+         "Unexpected open element while closing {:?}",
+         name
+     );
+     self.sink.parse_error(error);
+ }
+
+ /// Closes a `p` element: generates implied end tags (for everything in
+ /// `cursory_implied_end` except `p` itself), then pops up to and including
+ /// the `p`, reporting an error if it was not the current node.
+ fn close_p_element(&mut self) {
+     declare_tag_set!(implied_except_p = [cursory_implied_end] - "p");
+
+     let target = local_name!("p");
+     self.generate_implied_end(implied_except_p);
+     self.expect_to_close(target);
+ }
+
+ /// Closes a `p` element, but only if one is in button scope.
+ fn close_p_element_in_button_scope(&mut self) {
+     if !self.in_scope_named(button_scope, local_name!("p")) {
+         return;
+     }
+     self.close_p_element();
+ }
+
+ // Check <input> tags for type=hidden
+ //
+ // Looks at the first un-namespaced `type` attribute only; the value is
+ // compared ASCII case-insensitively against "hidden".
+ fn is_type_hidden(&self, tag: &Tag) -> bool {
+     let type_attr = tag
+         .attrs
+         .iter()
+         .find(|&at| at.name.expanded() == expanded_name!("", "type"));
+
+     type_attr.map_or(false, |at| (&*at.value).eq_ignore_ascii_case("hidden"))
+ }
+
+ /// Processes `token` using the `InBody` rules with the `foster_parenting`
+ /// flag set for the duration of that step, and returns the step's result.
+ fn foster_parent_in_body(&mut self, token: Token) -> ProcessResult<Handle> {
+     warn!("foster parenting not implemented");
+
+     self.foster_parenting = true;
+     let outcome = self.step(InBody, token);
+     // FIXME: what if outcome is Reprocess?
+     self.foster_parenting = false;
+
+     outcome
+ }
+
+ /// Handles a character token seen in a table-related insertion mode.
+ ///
+ /// If the current node is one of `table`, `tbody`, `tfoot`, `thead` or
+ /// `tr`, the current mode is saved in `orig_mode` and the token is
+ /// reprocessed in `InTableText`. Otherwise a parse error is reported and
+ /// the token is handled via `foster_parent_in_body`.
+ fn process_chars_in_table(&mut self, token: Token) -> ProcessResult<Handle> {
+     declare_tag_set!(table_outer = "table" "tbody" "tfoot" "thead" "tr");
+
+     if !self.current_node_in(table_outer) {
+         self.sink.parse_error(format_if!(
+             self.opts.exact_errors,
+             "Unexpected characters in table",
+             "Unexpected characters {} in table",
+             to_escaped_string(&token)
+         ));
+         return self.foster_parent_in_body(token);
+     }
+
+     assert!(self.pending_table_text.is_empty());
+     self.orig_mode = Some(self.mode);
+     Reprocess(InTableText, token)
+ }
+
+ // https://html.spec.whatwg.org/multipage/#reset-the-insertion-mode-appropriately
+ //
+ // Computes (but does not assign) the insertion mode implied by the stack of
+ // open elements. The stack is walked from the top down; the first HTML
+ // element with a recognised name decides the result. If nothing decides,
+ // the result is `InBody`.
+ fn reset_insertion_mode(&mut self) -> InsertionMode {
+ for (i, mut node) in self.open_elems.iter().enumerate().rev() {
+ // `last` is true for the bottom-most stack entry.
+ let last = i == 0usize;
+ // For the bottom-most entry, examine the context element instead,
+ // when one is set.
+ if let (true, Some(ctx)) = (last, self.context_elem.as_ref()) {
+ node = ctx;
+ }
+ // Only elements in the HTML namespace are considered; others are skipped.
+ let name = match self.sink.elem_name(node) {
+ ExpandedName {
+ ns: &ns!(html),
+ local,
+ } => local,
+ _ => continue,
+ };
+ match *name {
+ local_name!("select") => {
+ // Look at the entries below the select, nearest first: a template
+ // ends the search, a table means the select is inside a table.
+ for ancestor in self.open_elems[0..i].iter().rev() {
+ if self.html_elem_named(ancestor, local_name!("template")) {
+ return InSelect;
+ } else if self.html_elem_named(ancestor, local_name!("table")) {
+ return InSelectInTable;
+ }
+ }
+ return InSelect;
+ },
+ local_name!("td") | local_name!("th") => {
+ // When `last` is true this arm decides nothing and the loop ends.
+ if !last {
+ return InCell;
+ }
+ },
+ local_name!("tr") => return InRow,
+ local_name!("tbody") | local_name!("thead") | local_name!("tfoot") => {
+ return InTableBody;
+ },
+ local_name!("caption") => return InCaption,
+ local_name!("colgroup") => return InColumnGroup,
+ local_name!("table") => return InTable,
+ // Panics if the stack of template insertion modes is empty.
+ local_name!("template") => return *self.template_modes.last().unwrap(),
+ local_name!("head") => {
+ // When `last` is true this arm decides nothing and the loop ends.
+ if !last {
+ return InHead;
+ }
+ },
+ local_name!("body") => return InBody,
+ local_name!("frameset") => return InFrameset,
+ // For <html> the answer depends on whether a head element has been recorded.
+ local_name!("html") => match self.head_elem {
+ None => return BeforeHead,
+ Some(_) => return AfterHead,
+ },
+
+ _ => (),
+ }
+ }
+ // Nothing on the stack decided the mode.
+ InBody
+ }
+
+ /// Closes the current table cell: generates implied end tags, pops up to
+ /// and including a `td_th` element (reporting an error if that was not the
+ /// current node), then clears the active formatting list to the last marker.
+ fn close_the_cell(&mut self) {
+     self.generate_implied_end(cursory_implied_end);
+
+     let popped = self.pop_until(td_th);
+     if popped != 1 {
+         let error = Borrowed("expected to close <td> or <th> with cell");
+         self.sink.parse_error(error);
+     }
+
+     self.clear_active_formatting_to_marker();
+ }
+
+ /// Inserts `text` at the appropriate place and returns `Done`.
+ fn append_text(&mut self, text: StrTendril) -> ProcessResult<Handle> {
+     let child = AppendText(text);
+     self.insert_appropriately(child, None);
+     Done
+ }
+
+ /// Creates a comment node from `text`, inserts it at the appropriate place
+ /// and returns `Done`.
+ fn append_comment(&mut self, text: StrTendril) -> ProcessResult<Handle> {
+     let node = self.sink.create_comment(text);
+     let child = AppendNode(node);
+     self.insert_appropriately(child, None);
+     Done
+ }
+
+ /// Creates a comment node from `text`, appends it to the document handle
+ /// and returns `Done`.
+ fn append_comment_to_doc(&mut self, text: StrTendril) -> ProcessResult<Handle> {
+     let node = self.sink.create_comment(text);
+     let child = AppendNode(node);
+     self.sink.append(&self.doc_handle, child);
+     Done
+ }
+
+ /// Creates a comment node from `text`, appends it to the element returned
+ /// by `html_elem(&self.open_elems)` and returns `Done`.
+ fn append_comment_to_html(&mut self, text: StrTendril) -> ProcessResult<Handle> {
+     // Resolve the parent first, as the original does, then build the node.
+     let parent = html_elem(&self.open_elems);
+     let node = self.sink.create_comment(text);
+     let child = AppendNode(node);
+     self.sink.append(parent, child);
+     Done
+ }
+
+ //§ creating-and-inserting-nodes
+ /// Creates the root `html` element with the given attributes, pushes it
+ /// onto the stack of open elements and appends it to the document.
+ fn create_root(&mut self, attrs: Vec<Attribute>) {
+     let html_name = QualName::new(None, ns!(html), local_name!("html"));
+     let root = create_element(&mut self.sink, html_name, attrs);
+
+     self.push(&root);
+     self.sink.append(&self.doc_handle, AppendNode(root));
+     // FIXME: application cache selection algorithm
+ }
+
+ // https://html.spec.whatwg.org/multipage/#create-an-element-for-the-token
+ //
+ // Creates an element named `name` in namespace `ns` with attributes
+ // `attrs`, associates it with the current form element when the conditions
+ // below hold, inserts it at the appropriate place, and pushes it onto the
+ // stack of open elements when `push` is `Push`. Returns the new handle.
+ fn insert_element(
+     &mut self,
+     push: PushFlag,
+     ns: Namespace,
+     name: LocalName,
+     attrs: Vec<Attribute>,
+ ) -> Handle {
+     declare_tag_set!(form_associatable =
+         "button" "fieldset" "input" "object"
+         "output" "select" "textarea" "img");
+
+     declare_tag_set!(listed = [form_associatable] - "img");
+
+     // Step 7.
+     let qname = QualName::new(None, ns, name);
+     // Look for a `form` attribute before `create_element` takes ownership
+     // of the attributes, so the whole vector does not have to be cloned.
+     let has_form_attr = attrs
+         .iter()
+         .any(|a| a.name.expanded() == expanded_name!("", "form"));
+     let elem = create_element(&mut self.sink, qname.clone(), attrs);
+
+     let insertion_point = self.appropriate_place_for_insertion(None);
+     let (node1, node2) = match insertion_point {
+         LastChild(ref p) | BeforeSibling(ref p) => (p.clone(), None),
+         TableFosterParenting {
+             ref element,
+             ref prev_element,
+         } => (element.clone(), Some(prev_element.clone())),
+     };
+
+     // Step 12.
+     // Associate only form-associatable elements, and skip listed elements
+     // that carry an explicit `form` attribute.
+     let wants_form_owner =
+         form_associatable(qname.expanded()) && !(listed(qname.expanded()) && has_form_attr);
+     if wants_form_owner {
+         // A form element must be recorded, and no template may be open.
+         if let Some(form) = self.form_elem.as_ref() {
+             if !self.in_html_elem_named(local_name!("template")) {
+                 self.sink
+                     .associate_with_form(&elem, form, (&node1, node2.as_ref()));
+             }
+         }
+     }
+
+     self.insert_at(insertion_point, AppendNode(elem.clone()));
+
+     match push {
+         Push => self.push(&elem),
+         NoPush => (),
+     }
+     // FIXME: Remove from the stack if we can't append?
+     elem
+ }
+
+ /// Inserts an HTML element for `tag` and pushes it onto the stack of open
+ /// elements.
+ fn insert_element_for(&mut self, tag: Tag) -> Handle {
+     let Tag { name, attrs, .. } = tag;
+     self.insert_element(Push, ns!(html), name, attrs)
+ }
+
+ /// Inserts an HTML element for `tag` without pushing it onto the stack of
+ /// open elements.
+ fn insert_and_pop_element_for(&mut self, tag: Tag) -> Handle {
+     let Tag { name, attrs, .. } = tag;
+     self.insert_element(NoPush, ns!(html), name, attrs)
+ }
+
+ /// Inserts and pushes an attribute-less HTML element named `name`.
+ fn insert_phantom(&mut self, name: LocalName) -> Handle {
+     let no_attrs = Vec::new();
+     self.insert_element(Push, ns!(html), name, no_attrs)
+ }
+ //§ END
+
+ /// Inserts an HTML element for `tag` and records it in the list of active
+ /// formatting elements.
+ ///
+ /// Before inserting, entries yielded by `active_formatting_end_to_marker`
+ /// that are equivalent to `tag` (modulo attribute order) are counted; if
+ /// there are three or more, the one yielded last is removed from the list.
+ fn create_formatting_element_for(&mut self, tag: Tag) -> Handle {
+     // FIXME: This really wants unit tests.
+     let (matches, last_seen) = self
+         .active_formatting_end_to_marker()
+         .filter(|&(_, _, old_tag)| tag.equiv_modulo_attr_order(old_tag))
+         .fold((0usize, None), |(count, _), (i, _, _)| (count + 1, Some(i)));
+
+     if matches >= 3 {
+         let index = last_seen.expect("matches with no index");
+         self.active_formatting.remove(index);
+     }
+
+     let name = tag.name.clone();
+     let attrs = tag.attrs.clone();
+     let elem = self.insert_element(Push, ns!(html), name, attrs);
+     self.active_formatting.push(Element(elem.clone(), tag));
+     elem
+ }
+
+ /// Pops entries off the list of active formatting elements until a marker
+ /// has been popped or the list is empty.
+ fn clear_active_formatting_to_marker(&mut self) {
+     // Keep popping while the popped entry is an element; a marker or an
+     // empty list ends the loop.
+     while let Some(Element(..)) = self.active_formatting.pop() {}
+ }
+
+ /// Handles an end tag in the body that has no dedicated rule.
+ ///
+ /// Searches the stack from the top for an HTML element named like `tag`.
+ /// If a `special_tag` element is met first, a parse error is reported and
+ /// nothing is popped. Otherwise implied end tags are generated and the
+ /// stack is truncated so that the matching element and everything above it
+ /// are removed.
+ fn process_end_tag_in_body(&mut self, tag: Tag) {
+     // Look back for a matching open element.
+     let mut match_idx = None;
+     for idx in (0..self.open_elems.len()).rev() {
+         let elem = &self.open_elems[idx];
+         if self.html_elem_named(elem, tag.name.clone()) {
+             match_idx = Some(idx);
+             break;
+         }
+
+         if self.elem_in(elem, special_tag) {
+             let error = Borrowed("Found special tag while closing generic tag");
+             self.sink.parse_error(error);
+             return;
+         }
+     }
+
+     let match_idx = match match_idx {
+         Some(idx) => idx,
+         None => {
+             // I believe this is impossible, because the root
+             // <html> element is in special_tag.
+             self.unexpected(&tag);
+             return;
+         },
+     };
+
+     self.generate_implied_end_except(tag.name.clone());
+
+     let top_idx = self.open_elems.len() - 1;
+     if match_idx != top_idx {
+         // mis-nested tags
+         self.unexpected(&tag);
+     }
+     self.open_elems.truncate(match_idx);
+ }
+
+ /// Handles an `a` start tag while another `a` is still in the list of
+ /// active formatting elements (up to the last marker).
+ ///
+ /// Does nothing if there is no such entry. Otherwise it reports `tag` as
+ /// unexpected, runs the adoption agency for `a`, and removes the old node
+ /// from the active formatting list and from the stack of open elements.
+ fn handle_misnested_a_tags(&mut self, tag: &Tag) {
+     let node = match self
+         .active_formatting_end_to_marker()
+         .find(|&(_, n, _)| self.html_elem_named(n, local_name!("a")))
+     {
+         Some((_, n, _)) => n.clone(),
+         None => return,
+     };
+
+     self.unexpected(tag);
+     self.adoption_agency(local_name!("a"));
+
+     // The adoption agency may already have removed the entry.
+     if let Some(index) = self.position_in_active_formatting(&node) {
+         self.active_formatting.remove(index);
+     }
+     self.remove_from_stack(&node);
+ }
+
+ //§ tree-construction
+ /// Decides whether `token` is to be treated as foreign content.
+ ///
+ /// Returns `false` for an EOF token, when the stack of open elements is
+ /// empty, when the adjusted current node is in the HTML namespace, and for
+ /// the token kinds accepted at the integration points checked below.
+ /// Returns `true` otherwise.
+ fn is_foreign(&mut self, token: &Token) -> bool {
+ if let EOFToken = *token {
+ return false;
+ }
+
+ if self.open_elems.is_empty() {
+ return false;
+ }
+
+ let name = self.sink.elem_name(self.adjusted_current_node());
+ if let ns!(html) = *name.ns {
+ return false;
+ }
+
+ // MathML text integration point: characters, and start tags other than
+ // mglyph / malignmark, are not foreign.
+ if mathml_text_integration_point(name) {
+ match *token {
+ CharacterTokens(..) | NullCharacterToken => return false,
+ TagToken(Tag {
+ kind: StartTag,
+ ref name,
+ ..
+ }) if !matches!(*name, local_name!("mglyph") | local_name!("malignmark")) => {
+ return false;
+ },
+ _ => (),
+ }
+ }
+
+ // SVG HTML integration point: characters and any start tag are not foreign.
+ if svg_html_integration_point(name) {
+ match *token {
+ CharacterTokens(..) | NullCharacterToken => return false,
+ TagToken(Tag { kind: StartTag, .. }) => return false,
+ _ => (),
+ }
+ }
+
+ // MathML annotation-xml: an <svg> start tag is never foreign here; for
+ // characters and other start tags the sink decides whether the node is
+ // an integration point.
+ if let expanded_name!(mathml "annotation-xml") = name {
+ match *token {
+ TagToken(Tag {
+ kind: StartTag,
+ name: local_name!("svg"),
+ ..
+ }) => return false,
+ CharacterTokens(..) | NullCharacterToken | TagToken(Tag { kind: StartTag, .. }) => {
+ return !self
+ .sink
+ .is_mathml_annotation_xml_integration_point(self.adjusted_current_node());
+ },
+ _ => {},
+ };
+ }
+
+ true
+ }
+ //§ END
+
+ /// Inserts a foreign element for `tag` in namespace `ns`, after adjusting
+ /// its attributes (MathML or SVG specific first, then the generic foreign
+ /// adjustments). A self-closing tag is inserted without being pushed and
+ /// yields `DoneAckSelfClosing`; otherwise the element is pushed and the
+ /// result is `Done`.
+ fn enter_foreign(&mut self, mut tag: Tag, ns: Namespace) -> ProcessResult<Handle> {
+     if ns == ns!(mathml) {
+         self.adjust_mathml_attributes(&mut tag);
+     } else if ns == ns!(svg) {
+         self.adjust_svg_attributes(&mut tag);
+     }
+     self.adjust_foreign_attributes(&mut tag);
+
+     let (push, outcome) = if tag.self_closing {
+         (NoPush, DoneAckSelfClosing)
+     } else {
+         (Push, Done)
+     };
+     self.insert_element(push, ns, tag.name, tag.attrs);
+     outcome
+ }
+
+ /// Replaces an all-lowercase SVG tag name with its mixed-case form.
+ /// Names that are not in the table are left untouched.
+ fn adjust_svg_tag_name(&mut self, tag: &mut Tag) {
+     let canonical = match tag.name {
+         local_name!("altglyph") => local_name!("altGlyph"),
+         local_name!("altglyphdef") => local_name!("altGlyphDef"),
+         local_name!("altglyphitem") => local_name!("altGlyphItem"),
+         local_name!("animatecolor") => local_name!("animateColor"),
+         local_name!("animatemotion") => local_name!("animateMotion"),
+         local_name!("animatetransform") => local_name!("animateTransform"),
+         local_name!("clippath") => local_name!("clipPath"),
+         local_name!("feblend") => local_name!("feBlend"),
+         local_name!("fecolormatrix") => local_name!("feColorMatrix"),
+         local_name!("fecomponenttransfer") => local_name!("feComponentTransfer"),
+         local_name!("fecomposite") => local_name!("feComposite"),
+         local_name!("feconvolvematrix") => local_name!("feConvolveMatrix"),
+         local_name!("fediffuselighting") => local_name!("feDiffuseLighting"),
+         local_name!("fedisplacementmap") => local_name!("feDisplacementMap"),
+         local_name!("fedistantlight") => local_name!("feDistantLight"),
+         local_name!("fedropshadow") => local_name!("feDropShadow"),
+         local_name!("feflood") => local_name!("feFlood"),
+         local_name!("fefunca") => local_name!("feFuncA"),
+         local_name!("fefuncb") => local_name!("feFuncB"),
+         local_name!("fefuncg") => local_name!("feFuncG"),
+         local_name!("fefuncr") => local_name!("feFuncR"),
+         local_name!("fegaussianblur") => local_name!("feGaussianBlur"),
+         local_name!("feimage") => local_name!("feImage"),
+         local_name!("femerge") => local_name!("feMerge"),
+         local_name!("femergenode") => local_name!("feMergeNode"),
+         local_name!("femorphology") => local_name!("feMorphology"),
+         local_name!("feoffset") => local_name!("feOffset"),
+         local_name!("fepointlight") => local_name!("fePointLight"),
+         local_name!("fespecularlighting") => local_name!("feSpecularLighting"),
+         local_name!("fespotlight") => local_name!("feSpotLight"),
+         local_name!("fetile") => local_name!("feTile"),
+         local_name!("feturbulence") => local_name!("feTurbulence"),
+         local_name!("foreignobject") => local_name!("foreignObject"),
+         local_name!("glyphref") => local_name!("glyphRef"),
+         local_name!("lineargradient") => local_name!("linearGradient"),
+         local_name!("radialgradient") => local_name!("radialGradient"),
+         local_name!("textpath") => local_name!("textPath"),
+         // Not a name that needs adjusting.
+         _ => return,
+     };
+     tag.name = canonical;
+ }
+
+ /// Calls `map` with the local name of every attribute of `tag`, in order,
+ /// and replaces the attribute's qualified name whenever `map` returns
+ /// `Some`. Attribute values are not touched.
+ fn adjust_attributes<F>(&mut self, tag: &mut Tag, mut map: F)
+ where
+     F: FnMut(LocalName) -> Option<QualName>,
+ {
+     for attr in tag.attrs.iter_mut() {
+         let local = attr.name.local.clone();
+         if let Some(replacement) = map(local) {
+             attr.name = replacement;
+         }
+     }
+ }
+
+ /// Replaces all-lowercase SVG attribute names on `tag` with their
+ /// mixed-case forms. Attributes not in the table are left untouched.
+ fn adjust_svg_attributes(&mut self, tag: &mut Tag) {
+     self.adjust_attributes(tag, |k| {
+         let adjusted = match k {
+             local_name!("attributename") => qualname!("", "attributeName"),
+             local_name!("attributetype") => qualname!("", "attributeType"),
+             local_name!("basefrequency") => qualname!("", "baseFrequency"),
+             local_name!("baseprofile") => qualname!("", "baseProfile"),
+             local_name!("calcmode") => qualname!("", "calcMode"),
+             local_name!("clippathunits") => qualname!("", "clipPathUnits"),
+             local_name!("diffuseconstant") => qualname!("", "diffuseConstant"),
+             local_name!("edgemode") => qualname!("", "edgeMode"),
+             local_name!("filterunits") => qualname!("", "filterUnits"),
+             local_name!("glyphref") => qualname!("", "glyphRef"),
+             local_name!("gradienttransform") => qualname!("", "gradientTransform"),
+             local_name!("gradientunits") => qualname!("", "gradientUnits"),
+             local_name!("kernelmatrix") => qualname!("", "kernelMatrix"),
+             local_name!("kernelunitlength") => qualname!("", "kernelUnitLength"),
+             local_name!("keypoints") => qualname!("", "keyPoints"),
+             local_name!("keysplines") => qualname!("", "keySplines"),
+             local_name!("keytimes") => qualname!("", "keyTimes"),
+             local_name!("lengthadjust") => qualname!("", "lengthAdjust"),
+             local_name!("limitingconeangle") => qualname!("", "limitingConeAngle"),
+             local_name!("markerheight") => qualname!("", "markerHeight"),
+             local_name!("markerunits") => qualname!("", "markerUnits"),
+             local_name!("markerwidth") => qualname!("", "markerWidth"),
+             local_name!("maskcontentunits") => qualname!("", "maskContentUnits"),
+             local_name!("maskunits") => qualname!("", "maskUnits"),
+             local_name!("numoctaves") => qualname!("", "numOctaves"),
+             local_name!("pathlength") => qualname!("", "pathLength"),
+             local_name!("patterncontentunits") => qualname!("", "patternContentUnits"),
+             local_name!("patterntransform") => qualname!("", "patternTransform"),
+             local_name!("patternunits") => qualname!("", "patternUnits"),
+             local_name!("pointsatx") => qualname!("", "pointsAtX"),
+             local_name!("pointsaty") => qualname!("", "pointsAtY"),
+             local_name!("pointsatz") => qualname!("", "pointsAtZ"),
+             local_name!("preservealpha") => qualname!("", "preserveAlpha"),
+             local_name!("preserveaspectratio") => qualname!("", "preserveAspectRatio"),
+             local_name!("primitiveunits") => qualname!("", "primitiveUnits"),
+             local_name!("refx") => qualname!("", "refX"),
+             local_name!("refy") => qualname!("", "refY"),
+             local_name!("repeatcount") => qualname!("", "repeatCount"),
+             local_name!("repeatdur") => qualname!("", "repeatDur"),
+             local_name!("requiredextensions") => qualname!("", "requiredExtensions"),
+             local_name!("requiredfeatures") => qualname!("", "requiredFeatures"),
+             local_name!("specularconstant") => qualname!("", "specularConstant"),
+             local_name!("specularexponent") => qualname!("", "specularExponent"),
+             local_name!("spreadmethod") => qualname!("", "spreadMethod"),
+             local_name!("startoffset") => qualname!("", "startOffset"),
+             local_name!("stddeviation") => qualname!("", "stdDeviation"),
+             local_name!("stitchtiles") => qualname!("", "stitchTiles"),
+             local_name!("surfacescale") => qualname!("", "surfaceScale"),
+             local_name!("systemlanguage") => qualname!("", "systemLanguage"),
+             local_name!("tablevalues") => qualname!("", "tableValues"),
+             local_name!("targetx") => qualname!("", "targetX"),
+             local_name!("targety") => qualname!("", "targetY"),
+             local_name!("textlength") => qualname!("", "textLength"),
+             local_name!("viewbox") => qualname!("", "viewBox"),
+             local_name!("viewtarget") => qualname!("", "viewTarget"),
+             local_name!("xchannelselector") => qualname!("", "xChannelSelector"),
+             local_name!("ychannelselector") => qualname!("", "yChannelSelector"),
+             local_name!("zoomandpan") => qualname!("", "zoomAndPan"),
+             // Not an attribute that needs adjusting.
+             _ => return None,
+         };
+         Some(adjusted)
+     });
+ }
+
+ /// Renames the MathML attribute `definitionurl` to `definitionURL`; all
+ /// other attributes are left untouched.
+ fn adjust_mathml_attributes(&mut self, tag: &mut Tag) {
+     self.adjust_attributes(tag, |k| {
+         if k == local_name!("definitionurl") {
+             Some(qualname!("", "definitionURL"))
+         } else {
+             None
+         }
+     });
+ }
+
+ /// Gives the `xlink:*`, `xml:*`, `xmlns` and `xmlns:xlink` attributes of
+ /// `tag` their prefixed, namespaced qualified names. Other attributes are
+ /// left untouched.
+ fn adjust_foreign_attributes(&mut self, tag: &mut Tag) {
+     self.adjust_attributes(tag, |k| {
+         let adjusted = match k {
+             local_name!("xlink:actuate") => qualname!("xlink" xlink "actuate"),
+             local_name!("xlink:arcrole") => qualname!("xlink" xlink "arcrole"),
+             local_name!("xlink:href") => qualname!("xlink" xlink "href"),
+             local_name!("xlink:role") => qualname!("xlink" xlink "role"),
+             local_name!("xlink:show") => qualname!("xlink" xlink "show"),
+             local_name!("xlink:title") => qualname!("xlink" xlink "title"),
+             local_name!("xlink:type") => qualname!("xlink" xlink "type"),
+             local_name!("xml:base") => qualname!("xml" xml "base"),
+             local_name!("xml:lang") => qualname!("xml" xml "lang"),
+             local_name!("xml:space") => qualname!("xml" xml "space"),
+             local_name!("xmlns") => qualname!("" xmlns "xmlns"),
+             local_name!("xmlns:xlink") => qualname!("xmlns" xmlns "xlink"),
+             // Not an attribute that needs adjusting.
+             _ => return None,
+         };
+         Some(adjusted)
+     });
+ }
+
+ /// Inserts a start tag as a foreign element in the namespace of the
+ /// adjusted current node, after adjusting its tag name (SVG only) and
+ /// attributes. A self-closing tag is inserted without being pushed and
+ /// yields `DoneAckSelfClosing`; otherwise the element is pushed and the
+ /// result is `Done`.
+ fn foreign_start_tag(&mut self, mut tag: Tag) -> ProcessResult<Handle> {
+     let current_ns = self.sink.elem_name(self.adjusted_current_node()).ns.clone();
+
+     if current_ns == ns!(mathml) {
+         self.adjust_mathml_attributes(&mut tag);
+     } else if current_ns == ns!(svg) {
+         self.adjust_svg_tag_name(&mut tag);
+         self.adjust_svg_attributes(&mut tag);
+     }
+     self.adjust_foreign_attributes(&mut tag);
+
+     // FIXME(#118): <script /> in SVG
+     let (push, outcome) = if tag.self_closing {
+         (NoPush, DoneAckSelfClosing)
+     } else {
+         (Push, Done)
+     };
+     self.insert_element(push, current_ns, tag.name, tag.attrs);
+     outcome
+ }
+
+ /// Handles a start tag that is not allowed in foreign content.
+ ///
+ /// A parse error is always reported. When parsing a fragment the tag is
+ /// inserted as a foreign element anyway. Otherwise elements are popped (at
+ /// least one) until the current node is an HTML element or an integration
+ /// point, and the tag is handed back for reprocessing.
+ fn unexpected_start_tag_in_foreign_content(&mut self, tag: Tag) -> ProcessResult<Handle> {
+     self.unexpected(&tag);
+
+     if self.is_fragment() {
+         return self.foreign_start_tag(tag);
+     }
+
+     loop {
+         self.pop();
+         let back_in_html = self.current_node_in(|n| {
+             *n.ns == ns!(html) ||
+                 mathml_text_integration_point(n) ||
+                 svg_html_integration_point(n)
+         });
+         if back_in_html {
+             break;
+         }
+     }
+     ReprocessForeign(TagToken(tag))
+ }
+}
diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs
new file mode 100644
index 0000000..bdc8afd
--- /dev/null
+++ b/src/tree_builder/rules.rs
@@ -0,0 +1,1449 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// The tree builder rules, as a single, enormous nested match expression.
+
+use markup5ever::{expanded_name, local_name, namespace_prefix, namespace_url, ns};
+use crate::tokenizer::states::{Plaintext, Rawtext, Rcdata, ScriptData};
+use crate::tree_builder::tag_sets::*;
+use crate::tree_builder::types::*;
+
+use std::borrow::ToOwned;
+
+use crate::tendril::SliceExt;
+
+ /// Returns `true` if `x` contains at least one character for which
+ /// `is_ascii_whitespace` is false. An empty string yields `false`.
+ fn any_not_whitespace(x: &StrTendril) -> bool {
+     // FIXME: this might be much faster as a byte scan
+     let only_whitespace = x.chars().all(|c| is_ascii_whitespace(c));
+     !only_whitespace
+ }
+
+ /// Returns the last element of `open_elems`, i.e. the current node.
+ ///
+ /// Panics with "no current element" if the slice is empty.
+ fn current_node<Handle>(open_elems: &[Handle]) -> &Handle {
+     match open_elems.last() {
+         Some(node) => node,
+         None => panic!("no current element"),
+     }
+ }
+
+#[doc(hidden)]
+impl<Handle, Sink> TreeBuilder<Handle, Sink>
+where
+ Handle: Clone,
+ Sink: TreeSink<Handle = Handle>,
+{
+ fn step(&mut self, mode: InsertionMode, token: Token) -> ProcessResult<Handle> {
+ self.debug_step(mode, &token);
+
+ match mode {
+ //§ the-initial-insertion-mode
+ Initial => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, _) => Done,
+ CommentToken(text) => self.append_comment_to_doc(text),
+ token => {
+ if !self.opts.iframe_srcdoc {
+ self.unexpected(&token);
+ self.set_quirks_mode(Quirks);
+ }
+ Reprocess(BeforeHtml, token)
+ }
+ }),
+
+ //§ the-before-html-insertion-mode
+ BeforeHtml => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, _) => Done,
+ CommentToken(text) => self.append_comment_to_doc(text),
+
+ tag @ <html> => {
+ self.create_root(tag.attrs);
+ self.mode = BeforeHead;
+ Done
+ }
+
+ </head> </body> </html> </br> => else,
+
+ tag @ </_> => self.unexpected(&tag),
+
+ token => {
+ self.create_root(vec!());
+ Reprocess(BeforeHead, token)
+ }
+ }),
+
+ //§ the-before-head-insertion-mode
+ BeforeHead => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, _) => Done,
+ CommentToken(text) => self.append_comment(text),
+
+ <html> => self.step(InBody, token),
+
+ tag @ <head> => {
+ self.head_elem = Some(self.insert_element_for(tag));
+ self.mode = InHead;
+ Done
+ }
+
+ </head> </body> </html> </br> => else,
+
+ tag @ </_> => self.unexpected(&tag),
+
+ token => {
+ self.head_elem = Some(self.insert_phantom(local_name!("head")));
+ Reprocess(InHead, token)
+ }
+ }),
+
+ //§ parsing-main-inhead
+ InHead => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, text) => self.append_text(text),
+ CommentToken(text) => self.append_comment(text),
+
+ <html> => self.step(InBody, token),
+
+ tag @ <base> <basefont> <bgsound> <link> <meta> => {
+ // FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
+ self.insert_and_pop_element_for(tag);
+ DoneAckSelfClosing
+ }
+
+ tag @ <title> => {
+ self.parse_raw_data(tag, Rcdata)
+ }
+
+ tag @ <noframes> <style> <noscript> => {
+ if (!self.opts.scripting_enabled) && (tag.name == local_name!("noscript")) {
+ self.insert_element_for(tag);
+ self.mode = InHeadNoscript;
+ Done
+ } else {
+ self.parse_raw_data(tag, Rawtext)
+ }
+ }
+
+ tag @ <script> => {
+ let elem = create_element(
+ &mut self.sink, QualName::new(None, ns!(html), local_name!("script")),
+ tag.attrs);
+ if self.is_fragment() {
+ self.sink.mark_script_already_started(&elem);
+ }
+ self.insert_appropriately(AppendNode(elem.clone()), None);
+ self.open_elems.push(elem);
+ self.to_raw_text_mode(ScriptData)
+ }
+
+ </head> => {
+ self.pop();
+ self.mode = AfterHead;
+ Done
+ }
+
+ </body> </html> </br> => else,
+
+ tag @ <template> => {
+ self.insert_element_for(tag);
+ self.active_formatting.push(Marker);
+ self.frameset_ok = false;
+ self.mode = InTemplate;
+ self.template_modes.push(InTemplate);
+ Done
+ }
+
+ tag @ </template> => {
+ if !self.in_html_elem_named(local_name!("template")) {
+ self.unexpected(&tag);
+ } else {
+ self.generate_implied_end(thorough_implied_end);
+ self.expect_to_close(local_name!("template"));
+ self.clear_active_formatting_to_marker();
+ self.template_modes.pop();
+ self.mode = self.reset_insertion_mode();
+ }
+ Done
+ }
+
+ <head> => self.unexpected(&token),
+ tag @ </_> => self.unexpected(&tag),
+
+ token => {
+ self.pop();
+ Reprocess(AfterHead, token)
+ }
+ }),
+
+ //§ parsing-main-inheadnoscript
+ InHeadNoscript => match_token!(token {
+ <html> => self.step(InBody, token),
+
+ </noscript> => {
+ self.pop();
+ self.mode = InHead;
+ Done
+ },
+
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, _) => self.step(InHead, token),
+
+ CommentToken(_) => self.step(InHead, token),
+
+ <basefont> <bgsound> <link> <meta> <noframes> <style>
+ => self.step(InHead, token),
+
+ </br> => else,
+
+ <head> <noscript> => self.unexpected(&token),
+ tag @ </_> => self.unexpected(&tag),
+
+ token => {
+ self.unexpected(&token);
+ self.pop();
+ Reprocess(InHead, token)
+ },
+ }),
+
+ //§ the-after-head-insertion-mode
+ AfterHead => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, text) => self.append_text(text),
+ CommentToken(text) => self.append_comment(text),
+
+ <html> => self.step(InBody, token),
+
+ tag @ <body> => {
+ self.insert_element_for(tag);
+ self.frameset_ok = false;
+ self.mode = InBody;
+ Done
+ }
+
+ tag @ <frameset> => {
+ self.insert_element_for(tag);
+ self.mode = InFrameset;
+ Done
+ }
+
+ <base> <basefont> <bgsound> <link> <meta>
+ <noframes> <script> <style> <template> <title> => {
+ self.unexpected(&token);
+ let head = self.head_elem.as_ref().expect("no head element").clone();
+ self.push(&head);
+ let result = self.step(InHead, token);
+ self.remove_from_stack(&head);
+ result
+ }
+
+ </template> => self.step(InHead, token),
+
+ </body> </html> </br> => else,
+
+ <head> => self.unexpected(&token),
+ tag @ </_> => self.unexpected(&tag),
+
+ token => {
+ self.insert_phantom(local_name!("body"));
+ Reprocess(InBody, token)
+ }
+ }),
+
+ //§ parsing-main-inbody
+ InBody => match_token!(token {
+ NullCharacterToken => self.unexpected(&token),
+
+ CharacterTokens(_, text) => {
+ self.reconstruct_formatting();
+ if any_not_whitespace(&text) {
+ self.frameset_ok = false;
+ }
+ self.append_text(text)
+ }
+
+ CommentToken(text) => self.append_comment(text),
+
+ tag @ <html> => {
+ self.unexpected(&tag);
+ if !self.in_html_elem_named(local_name!("template")) {
+ let top = html_elem(&self.open_elems);
+ self.sink.add_attrs_if_missing(top, tag.attrs);
+ }
+ Done
+ }
+
+ <base> <basefont> <bgsound> <link> <meta> <noframes>
+ <script> <style> <template> <title> </template> => {
+ self.step(InHead, token)
+ }
+
+ tag @ <body> => {
+ self.unexpected(&tag);
+ match self.body_elem().cloned() {
+ Some(ref node) if self.open_elems.len() != 1 &&
+ !self.in_html_elem_named(local_name!("template")) => {
+ self.frameset_ok = false;
+ self.sink.add_attrs_if_missing(node, tag.attrs)
+ },
+ _ => {}
+ }
+ Done
+ }
+
+ tag @ <frameset> => {
+ self.unexpected(&tag);
+ if !self.frameset_ok { return Done; }
+
+ let body = unwrap_or_return!(self.body_elem(), Done).clone();
+ self.sink.remove_from_parent(&body);
+
+ // FIXME: can we get here in the fragment case?
+ // What to do with the first element then?
+ self.open_elems.truncate(1);
+ self.insert_element_for(tag);
+ self.mode = InFrameset;
+ Done
+ }
+
+ EOFToken => {
+ if !self.template_modes.is_empty() {
+ self.step(InTemplate, token)
+ } else {
+ self.check_body_end();
+ self.stop_parsing()
+ }
+ }
+
+ </body> => {
+ if self.in_scope_named(default_scope, local_name!("body")) {
+ self.check_body_end();
+ self.mode = AfterBody;
+ } else {
+ self.sink.parse_error(Borrowed("</body> with no <body> in scope"));
+ }
+ Done
+ }
+
+ </html> => {
+ if self.in_scope_named(default_scope, local_name!("body")) {
+ self.check_body_end();
+ Reprocess(AfterBody, token)
+ } else {
+ self.sink.parse_error(Borrowed("</html> with no <body> in scope"));
+ Done
+ }
+ }
+
+ tag @ <address> <article> <aside> <blockquote> <center> <details> <dialog>
+ <dir> <div> <dl> <fieldset> <figcaption> <figure> <footer> <header>
+ <hgroup> <main> <nav> <ol> <p> <section> <summary> <ul> => {
+ self.close_p_element_in_button_scope();
+ self.insert_element_for(tag);
+ Done
+ }
+
+ tag @ <menu> => {
+ self.close_p_element_in_button_scope();
+ self.insert_element_for(tag);
+ Done
+ }
+
+ tag @ <h1> <h2> <h3> <h4> <h5> <h6> => {
+ self.close_p_element_in_button_scope();
+ if self.current_node_in(heading_tag) {
+ self.sink.parse_error(Borrowed("nested heading tags"));
+ self.pop();
+ }
+ self.insert_element_for(tag);
+ Done
+ }
+
+ tag @ <pre> <listing> => {
+ self.close_p_element_in_button_scope();
+ self.insert_element_for(tag);
+ self.ignore_lf = true;
+ self.frameset_ok = false;
+ Done
+ }
+
+ tag @ <form> => {
+ if self.form_elem.is_some() &&
+ !self.in_html_elem_named(local_name!("template")) {
+ self.sink.parse_error(Borrowed("nested forms"));
+ } else {
+ self.close_p_element_in_button_scope();
+ let elem = self.insert_element_for(tag);
+ if !self.in_html_elem_named(local_name!("template")) {
+ self.form_elem = Some(elem);
+ }
+ }
+ Done
+ }
+
+ tag @ <li> <dd> <dt> => {
+ declare_tag_set!(close_list = "li");
+ declare_tag_set!(close_defn = "dd" "dt");
+ declare_tag_set!(extra_special = [special_tag] - "address" "div" "p");
+ let list = match tag.name {
+ local_name!("li") => true,
+ local_name!("dd") | local_name!("dt") => false,
+ _ => unreachable!(),
+ };
+
+ self.frameset_ok = false;
+
+ let mut to_close = None;
+ for node in self.open_elems.iter().rev() {
+ let name = self.sink.elem_name(node);
+ let can_close = if list {
+ close_list(name)
+ } else {
+ close_defn(name)
+ };
+ if can_close {
+ to_close = Some(name.local.clone());
+ break;
+ }
+ if extra_special(name) {
+ break;
+ }
+ }
+
+ match to_close {
+ Some(name) => {
+ self.generate_implied_end_except(name.clone());
+ self.expect_to_close(name);
+ }
+ None => (),
+ }
+
+ self.close_p_element_in_button_scope();
+ self.insert_element_for(tag);
+ Done
+ }
+
+ tag @ <plaintext> => {
+ self.close_p_element_in_button_scope();
+ self.insert_element_for(tag);
+ ToPlaintext
+ }
+
+ tag @ <button> => {
+ if self.in_scope_named(default_scope, local_name!("button")) {
+ self.sink.parse_error(Borrowed("nested buttons"));
+ self.generate_implied_end(cursory_implied_end);
+ self.pop_until_named(local_name!("button"));
+ }
+ self.reconstruct_formatting();
+ self.insert_element_for(tag);
+ self.frameset_ok = false;
+ Done
+ }
+
+ tag @ </address> </article> </aside> </blockquote> </button> </center>
+ </details> </dialog> </dir> </div> </dl> </fieldset> </figcaption>
+ </figure> </footer> </header> </hgroup> </listing> </main> </menu>
+ </nav> </ol> </pre> </section> </summary> </ul> => {
+ if !self.in_scope_named(default_scope, tag.name.clone()) {
+ self.unexpected(&tag);
+ } else {
+ self.generate_implied_end(cursory_implied_end);
+ self.expect_to_close(tag.name);
+ }
+ Done
+ }
+
+ </form> => {
+ if !self.in_html_elem_named(local_name!("template")) {
+ // Can't use unwrap_or_return!() due to rust-lang/rust#16617.
+ let node = match self.form_elem.take() {
+ None => {
+ self.sink.parse_error(Borrowed("Null form element pointer on </form>"));
+ return Done;
+ }
+ Some(x) => x,
+ };
+ if !self.in_scope(default_scope, |n| self.sink.same_node(&node, &n)) {
+ self.sink.parse_error(Borrowed("Form element not in scope on </form>"));
+ return Done;
+ }
+ self.generate_implied_end(cursory_implied_end);
+ let current = self.current_node().clone();
+ self.remove_from_stack(&node);
+ if !self.sink.same_node(&current, &node) {
+ self.sink.parse_error(Borrowed("Bad open element on </form>"));
+ }
+ } else {
+ if !self.in_scope_named(default_scope, local_name!("form")) {
+ self.sink.parse_error(Borrowed("Form element not in scope on </form>"));
+ return Done;
+ }
+ self.generate_implied_end(cursory_implied_end);
+ if !self.current_node_named(local_name!("form")) {
+ self.sink.parse_error(Borrowed("Bad open element on </form>"));
+ }
+ self.pop_until_named(local_name!("form"));
+ }
+ Done
+ }
+
+ </p> => {
+ if !self.in_scope_named(button_scope, local_name!("p")) {
+ self.sink.parse_error(Borrowed("No <p> tag to close"));
+ self.insert_phantom(local_name!("p"));
+ }
+ self.close_p_element();
+ Done
+ }
+
+ tag @ </li> </dd> </dt> => {
+ let in_scope = if tag.name == local_name!("li") {
+ self.in_scope_named(list_item_scope, tag.name.clone())
+ } else {
+ self.in_scope_named(default_scope, tag.name.clone())
+ };
+ if in_scope {
+ self.generate_implied_end_except(tag.name.clone());
+ self.expect_to_close(tag.name);
+ } else {
+ self.sink.parse_error(Borrowed("No matching tag to close"));
+ }
+ Done
+ }
+
+ tag @ </h1> </h2> </h3> </h4> </h5> </h6> => {
+ if self.in_scope(default_scope, |n| self.elem_in(&n, heading_tag)) {
+ self.generate_implied_end(cursory_implied_end);
+ if !self.current_node_named(tag.name) {
+ self.sink.parse_error(Borrowed("Closing wrong heading tag"));
+ }
+ self.pop_until(heading_tag);
+ } else {
+ self.sink.parse_error(Borrowed("No heading tag to close"));
+ }
+ Done
+ }
+
+ tag @ <a> => {
+ self.handle_misnested_a_tags(&tag);
+ self.reconstruct_formatting();
+ self.create_formatting_element_for(tag);
+ Done
+ }
+
+ tag @ <b> <big> <code> <em> <font> <i> <s> <small> <strike> <strong> <tt> <u> => {
+ self.reconstruct_formatting();
+ self.create_formatting_element_for(tag);
+ Done
+ }
+
+ tag @ <nobr> => {
+ self.reconstruct_formatting();
+ if self.in_scope_named(default_scope, local_name!("nobr")) {
+ self.sink.parse_error(Borrowed("Nested <nobr>"));
+ self.adoption_agency(local_name!("nobr"));
+ self.reconstruct_formatting();
+ }
+ self.create_formatting_element_for(tag);
+ Done
+ }
+
+ tag @ </a> </b> </big> </code> </em> </font> </i> </nobr>
+ </s> </small> </strike> </strong> </tt> </u> => {
+ self.adoption_agency(tag.name);
+ Done
+ }
+
+ tag @ <applet> <marquee> <object> => {
+ self.reconstruct_formatting();
+ self.insert_element_for(tag);
+ self.active_formatting.push(Marker);
+ self.frameset_ok = false;
+ Done
+ }
+
+ tag @ </applet> </marquee> </object> => {
+ if !self.in_scope_named(default_scope, tag.name.clone()) {
+ self.unexpected(&tag);
+ } else {
+ self.generate_implied_end(cursory_implied_end);
+ self.expect_to_close(tag.name);
+ self.clear_active_formatting_to_marker();
+ }
+ Done
+ }
+
+ tag @ <table> => {
+ if self.quirks_mode != Quirks {
+ self.close_p_element_in_button_scope();
+ }
+ self.insert_element_for(tag);
+ self.frameset_ok = false;
+ self.mode = InTable;
+ Done
+ }
+
+ tag @ </br> => {
+ self.unexpected(&tag);
+ self.step(InBody, TagToken(Tag {
+ kind: StartTag,
+ attrs: vec!(),
+ ..tag
+ }))
+ }
+
+ tag @ <area> <br> <embed> <img> <keygen> <wbr> <input> => {
+ let keep_frameset_ok = match tag.name {
+ local_name!("input") => self.is_type_hidden(&tag),
+ _ => false,
+ };
+ self.reconstruct_formatting();
+ self.insert_and_pop_element_for(tag);
+ if !keep_frameset_ok {
+ self.frameset_ok = false;
+ }
+ DoneAckSelfClosing
+ }
+
+ tag @ <param> <source> <track> => {
+ self.insert_and_pop_element_for(tag);
+ DoneAckSelfClosing
+ }
+
+ tag @ <hr> => {
+ self.close_p_element_in_button_scope();
+ self.insert_and_pop_element_for(tag);
+ self.frameset_ok = false;
+ DoneAckSelfClosing
+ }
+
+ tag @ <image> => {
+ self.unexpected(&tag);
+ self.step(InBody, TagToken(Tag {
+ name: local_name!("img"),
+ ..tag
+ }))
+ }
+
+ tag @ <textarea> => {
+ self.ignore_lf = true;
+ self.frameset_ok = false;
+ self.parse_raw_data(tag, Rcdata)
+ }
+
+ tag @ <xmp> => {
+ self.close_p_element_in_button_scope();
+ self.reconstruct_formatting();
+ self.frameset_ok = false;
+ self.parse_raw_data(tag, Rawtext)
+ }
+
+ tag @ <iframe> => {
+ self.frameset_ok = false;
+ self.parse_raw_data(tag, Rawtext)
+ }
+
+ tag @ <noembed> => {
+ self.parse_raw_data(tag, Rawtext)
+ }
+
+ // <noscript> handled in wildcard case below
+
+ tag @ <select> => {
+ self.reconstruct_formatting();
+ self.insert_element_for(tag);
+ self.frameset_ok = false;
+ // NB: mode == InBody but possibly self.mode != mode, if
+ // we're processing "as in the rules for InBody".
+ self.mode = match self.mode {
+ InTable | InCaption | InTableBody
+ | InRow | InCell => InSelectInTable,
+ _ => InSelect,
+ };
+ Done
+ }
+
+ tag @ <optgroup> <option> => {
+ if self.current_node_named(local_name!("option")) {
+ self.pop();
+ }
+ self.reconstruct_formatting();
+ self.insert_element_for(tag);
+ Done
+ }
+
+ tag @ <rb> <rtc> => {
+ if self.in_scope_named(default_scope, local_name!("ruby")) {
+ self.generate_implied_end(cursory_implied_end);
+ }
+ if !self.current_node_named(local_name!("ruby")) {
+ self.unexpected(&tag);
+ }
+ self.insert_element_for(tag);
+ Done
+ }
+
+ tag @ <rp> <rt> => {
+ if self.in_scope_named(default_scope, local_name!("ruby")) {
+ self.generate_implied_end_except(local_name!("rtc"));
+ }
+ if !self.current_node_named(local_name!("rtc")) && !self.current_node_named(local_name!("ruby")) {
+ self.unexpected(&tag);
+ }
+ self.insert_element_for(tag);
+ Done
+ }
+
+ tag @ <math> => self.enter_foreign(tag, ns!(mathml)),
+
+ tag @ <svg> => self.enter_foreign(tag, ns!(svg)),
+
+ <caption> <col> <colgroup> <frame> <head>
+ <tbody> <td> <tfoot> <th> <thead> <tr> => {
+ self.unexpected(&token);
+ Done
+ }
+
+ tag @ <_> => {
+ if self.opts.scripting_enabled && tag.name == local_name!("noscript") {
+ self.parse_raw_data(tag, Rawtext)
+ } else {
+ self.reconstruct_formatting();
+ self.insert_element_for(tag);
+ Done
+ }
+ }
+
+ tag @ </_> => {
+ self.process_end_tag_in_body(tag);
+ Done
+ }
+
+ // FIXME: This should be unreachable, but match_token requires a
+ // catch-all case.
+ _ => panic!("impossible case in InBody mode"),
+ }),
+
+ //§ parsing-main-incdata
+ Text => match_token!(token {
+ CharacterTokens(_, text) => self.append_text(text),
+
+ EOFToken => {
+ self.unexpected(&token);
+ if self.current_node_named(local_name!("script")) {
+ let current = current_node(&self.open_elems);
+ self.sink.mark_script_already_started(current);
+ }
+ self.pop();
+ Reprocess(self.orig_mode.take().unwrap(), token)
+ }
+
+ tag @ </_> => {
+ let node = self.pop();
+ self.mode = self.orig_mode.take().unwrap();
+ if tag.name == local_name!("script") {
+ return Script(node);
+ }
+ Done
+ }
+
+ // The spec doesn't say what to do here.
+ // Other tokens are impossible?
+ _ => panic!("impossible case in Text mode"),
+ }),
+
+ //§ parsing-main-intable
+ InTable => match_token!(token {
+ // FIXME: hack, should implement pat | pat for match_token instead
+ NullCharacterToken => self.process_chars_in_table(token),
+
+ CharacterTokens(..) => self.process_chars_in_table(token),
+
+ CommentToken(text) => self.append_comment(text),
+
+ tag @ <caption> => {
+ self.pop_until_current(table_scope);
+ self.active_formatting.push(Marker);
+ self.insert_element_for(tag);
+ self.mode = InCaption;
+ Done
+ }
+
+ tag @ <colgroup> => {
+ self.pop_until_current(table_scope);
+ self.insert_element_for(tag);
+ self.mode = InColumnGroup;
+ Done
+ }
+
+ <col> => {
+ self.pop_until_current(table_scope);
+ self.insert_phantom(local_name!("colgroup"));
+ Reprocess(InColumnGroup, token)
+ }
+
+ tag @ <tbody> <tfoot> <thead> => {
+ self.pop_until_current(table_scope);
+ self.insert_element_for(tag);
+ self.mode = InTableBody;
+ Done
+ }
+
+ <td> <th> <tr> => {
+ self.pop_until_current(table_scope);
+ self.insert_phantom(local_name!("tbody"));
+ Reprocess(InTableBody, token)
+ }
+
+ <table> => {
+ self.unexpected(&token);
+ if self.in_scope_named(table_scope, local_name!("table")) {
+ self.pop_until_named(local_name!("table"));
+ Reprocess(self.reset_insertion_mode(), token)
+ } else {
+ Done
+ }
+ }
+
+ </table> => {
+ if self.in_scope_named(table_scope, local_name!("table")) {
+ self.pop_until_named(local_name!("table"));
+ self.mode = self.reset_insertion_mode();
+ } else {
+ self.unexpected(&token);
+ }
+ Done
+ }
+
+ </body> </caption> </col> </colgroup> </html>
+ </tbody> </td> </tfoot> </th> </thead> </tr> =>
+ self.unexpected(&token),
+
+ <style> <script> <template> </template>
+ => self.step(InHead, token),
+
+ tag @ <input> => {
+ self.unexpected(&tag);
+ if self.is_type_hidden(&tag) {
+ self.insert_and_pop_element_for(tag);
+ DoneAckSelfClosing
+ } else {
+ self.foster_parent_in_body(TagToken(tag))
+ }
+ }
+
+ tag @ <form> => {
+ self.unexpected(&tag);
+ if !self.in_html_elem_named(local_name!("template")) && self.form_elem.is_none() {
+ self.form_elem = Some(self.insert_and_pop_element_for(tag));
+ }
+ Done
+ }
+
+ EOFToken => self.step(InBody, token),
+
+ token => {
+ self.unexpected(&token);
+ self.foster_parent_in_body(token)
+ }
+ }),
+
+ //§ parsing-main-intabletext
+ InTableText => match_token!(token {
+ NullCharacterToken => self.unexpected(&token),
+
+ CharacterTokens(split, text) => {
+ self.pending_table_text.push((split, text));
+ Done
+ }
+
+ token => {
+ let pending = replace(&mut self.pending_table_text, vec!());
+ let contains_nonspace = pending.iter().any(|&(split, ref text)| {
+ match split {
+ Whitespace => false,
+ NotWhitespace => true,
+ NotSplit => any_not_whitespace(text),
+ }
+ });
+
+ if contains_nonspace {
+ self.sink.parse_error(Borrowed("Non-space table text"));
+ for (split, text) in pending.into_iter() {
+ match self.foster_parent_in_body(CharacterTokens(split, text)) {
+ Done => (),
+ _ => panic!("not prepared to handle this!"),
+ }
+ }
+ } else {
+ for (_, text) in pending.into_iter() {
+ self.append_text(text);
+ }
+ }
+
+ Reprocess(self.orig_mode.take().unwrap(), token)
+ }
+ }),
+
+ //§ parsing-main-incaption
+ InCaption => match_token!(token {
+ tag @ <caption> <col> <colgroup> <tbody> <td> <tfoot>
+ <th> <thead> <tr> </table> </caption> => {
+ if self.in_scope_named(table_scope, local_name!("caption")) {
+ self.generate_implied_end(cursory_implied_end);
+ self.expect_to_close(local_name!("caption"));
+ self.clear_active_formatting_to_marker();
+ match tag {
+ Tag { kind: EndTag, name: local_name!("caption"), .. } => {
+ self.mode = InTable;
+ Done
+ }
+ _ => Reprocess(InTable, TagToken(tag))
+ }
+ } else {
+ self.unexpected(&tag);
+ Done
+ }
+ }
+
+ </body> </col> </colgroup> </html> </tbody>
+ </td> </tfoot> </th> </thead> </tr> => self.unexpected(&token),
+
+ token => self.step(InBody, token),
+ }),
+
+ //§ parsing-main-incolgroup
+ InColumnGroup => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, text) => self.append_text(text),
+ CommentToken(text) => self.append_comment(text),
+
+ <html> => self.step(InBody, token),
+
+ tag @ <col> => {
+ self.insert_and_pop_element_for(tag);
+ DoneAckSelfClosing
+ }
+
+ </colgroup> => {
+ if self.current_node_named(local_name!("colgroup")) {
+ self.pop();
+ self.mode = InTable;
+ } else {
+ self.unexpected(&token);
+ }
+ Done
+ }
+
+ </col> => self.unexpected(&token),
+
+ <template> </template> => self.step(InHead, token),
+
+ EOFToken => self.step(InBody, token),
+
+ token => {
+ if self.current_node_named(local_name!("colgroup")) {
+ self.pop();
+ Reprocess(InTable, token)
+ } else {
+ self.unexpected(&token)
+ }
+ }
+ }),
+
+ //§ parsing-main-intbody
+ InTableBody => match_token!(token {
+ tag @ <tr> => {
+ self.pop_until_current(table_body_context);
+ self.insert_element_for(tag);
+ self.mode = InRow;
+ Done
+ }
+
+ <th> <td> => {
+ self.unexpected(&token);
+ self.pop_until_current(table_body_context);
+ self.insert_phantom(local_name!("tr"));
+ Reprocess(InRow, token)
+ }
+
+ tag @ </tbody> </tfoot> </thead> => {
+ if self.in_scope_named(table_scope, tag.name.clone()) {
+ self.pop_until_current(table_body_context);
+ self.pop();
+ self.mode = InTable;
+ } else {
+ self.unexpected(&tag);
+ }
+ Done
+ }
+
+ <caption> <col> <colgroup> <tbody> <tfoot> <thead> </table> => {
+ declare_tag_set!(table_outer = "table" "tbody" "tfoot");
+ if self.in_scope(table_scope, |e| self.elem_in(&e, table_outer)) {
+ self.pop_until_current(table_body_context);
+ self.pop();
+ Reprocess(InTable, token)
+ } else {
+ self.unexpected(&token)
+ }
+ }
+
+ </body> </caption> </col> </colgroup> </html> </td> </th> </tr>
+ => self.unexpected(&token),
+
+ token => self.step(InTable, token),
+ }),
+
+ //§ parsing-main-intr
+ InRow => match_token!(token {
+ tag @ <th> <td> => {
+ self.pop_until_current(table_row_context);
+ self.insert_element_for(tag);
+ self.mode = InCell;
+ self.active_formatting.push(Marker);
+ Done
+ }
+
+ </tr> => {
+ if self.in_scope_named(table_scope, local_name!("tr")) {
+ self.pop_until_current(table_row_context);
+ let node = self.pop();
+ self.assert_named(&node, local_name!("tr"));
+ self.mode = InTableBody;
+ } else {
+ self.unexpected(&token);
+ }
+ Done
+ }
+
+ <caption> <col> <colgroup> <tbody> <tfoot> <thead> <tr> </table> => {
+ if self.in_scope_named(table_scope, local_name!("tr")) {
+ self.pop_until_current(table_row_context);
+ let node = self.pop();
+ self.assert_named(&node, local_name!("tr"));
+ Reprocess(InTableBody, token)
+ } else {
+ self.unexpected(&token)
+ }
+ }
+
+ tag @ </tbody> </tfoot> </thead> => {
+ if self.in_scope_named(table_scope, tag.name.clone()) {
+ if self.in_scope_named(table_scope, local_name!("tr")) {
+ self.pop_until_current(table_row_context);
+ let node = self.pop();
+ self.assert_named(&node, local_name!("tr"));
+ Reprocess(InTableBody, TagToken(tag))
+ } else {
+ Done
+ }
+ } else {
+ self.unexpected(&tag)
+ }
+ }
+
+ </body> </caption> </col> </colgroup> </html> </td> </th>
+ => self.unexpected(&token),
+
+ token => self.step(InTable, token),
+ }),
+
+ //§ parsing-main-intd
+ InCell => match_token!(token {
+ tag @ </td> </th> => {
+ if self.in_scope_named(table_scope, tag.name.clone()) {
+ self.generate_implied_end(cursory_implied_end);
+ self.expect_to_close(tag.name);
+ self.clear_active_formatting_to_marker();
+ self.mode = InRow;
+ } else {
+ self.unexpected(&tag);
+ }
+ Done
+ }
+
+ <caption> <col> <colgroup> <tbody> <td> <tfoot> <th> <thead> <tr> => {
+ if self.in_scope(table_scope, |n| self.elem_in(&n, td_th)) {
+ self.close_the_cell();
+ Reprocess(InRow, token)
+ } else {
+ self.unexpected(&token)
+ }
+ }
+
+ </body> </caption> </col> </colgroup> </html>
+ => self.unexpected(&token),
+
+ tag @ </table> </tbody> </tfoot> </thead> </tr> => {
+ if self.in_scope_named(table_scope, tag.name.clone()) {
+ self.close_the_cell();
+ Reprocess(InRow, TagToken(tag))
+ } else {
+ self.unexpected(&tag)
+ }
+ }
+
+ token => self.step(InBody, token),
+ }),
+
+ //§ parsing-main-inselect
+ InSelect => match_token!(token {
+ NullCharacterToken => self.unexpected(&token),
+ CharacterTokens(_, text) => self.append_text(text),
+ CommentToken(text) => self.append_comment(text),
+
+ <html> => self.step(InBody, token),
+
+ tag @ <option> => {
+ if self.current_node_named(local_name!("option")) {
+ self.pop();
+ }
+ self.insert_element_for(tag);
+ Done
+ }
+
+ tag @ <optgroup> => {
+ if self.current_node_named(local_name!("option")) {
+ self.pop();
+ }
+ if self.current_node_named(local_name!("optgroup")) {
+ self.pop();
+ }
+ self.insert_element_for(tag);
+ Done
+ }
+
+ </optgroup> => {
+ if self.open_elems.len() >= 2
+ && self.current_node_named(local_name!("option"))
+ && self.html_elem_named(&self.open_elems[self.open_elems.len() - 2],
+ local_name!("optgroup")) {
+ self.pop();
+ }
+ if self.current_node_named(local_name!("optgroup")) {
+ self.pop();
+ } else {
+ self.unexpected(&token);
+ }
+ Done
+ }
+
+ </option> => {
+ if self.current_node_named(local_name!("option")) {
+ self.pop();
+ } else {
+ self.unexpected(&token);
+ }
+ Done
+ }
+
+ tag @ <select> </select> => {
+ let in_scope = self.in_scope_named(select_scope, local_name!("select"));
+
+ if !in_scope || tag.kind == StartTag {
+ self.unexpected(&tag);
+ }
+
+ if in_scope {
+ self.pop_until_named(local_name!("select"));
+ self.mode = self.reset_insertion_mode();
+ }
+ Done
+ }
+
+ <input> <keygen> <textarea> => {
+ self.unexpected(&token);
+ if self.in_scope_named(select_scope, local_name!("select")) {
+ self.pop_until_named(local_name!("select"));
+ Reprocess(self.reset_insertion_mode(), token)
+ } else {
+ Done
+ }
+ }
+
+ <script> <template> </template> => self.step(InHead, token),
+
+ EOFToken => self.step(InBody, token),
+
+ token => self.unexpected(&token),
+ }),
+
+ //§ parsing-main-inselectintable
+ InSelectInTable => match_token!(token {
+ <caption> <table> <tbody> <tfoot> <thead> <tr> <td> <th> => {
+ self.unexpected(&token);
+ self.pop_until_named(local_name!("select"));
+ Reprocess(self.reset_insertion_mode(), token)
+ }
+
+ tag @ </caption> </table> </tbody> </tfoot> </thead> </tr> </td> </th> => {
+ self.unexpected(&tag);
+ if self.in_scope_named(table_scope, tag.name.clone()) {
+ self.pop_until_named(local_name!("select"));
+ Reprocess(self.reset_insertion_mode(), TagToken(tag))
+ } else {
+ Done
+ }
+ }
+
+ token => self.step(InSelect, token),
+ }),
+
+ //§ parsing-main-intemplate
+ InTemplate => match_token!(token {
+ CharacterTokens(_, _) => self.step(InBody, token),
+ CommentToken(_) => self.step(InBody, token),
+
+ <base> <basefont> <bgsound> <link> <meta> <noframes> <script>
+ <style> <template> <title> </template> => {
+ self.step(InHead, token)
+ }
+
+ <caption> <colgroup> <tbody> <tfoot> <thead> => {
+ self.template_modes.pop();
+ self.template_modes.push(InTable);
+ Reprocess(InTable, token)
+ }
+
+ <col> => {
+ self.template_modes.pop();
+ self.template_modes.push(InColumnGroup);
+ Reprocess(InColumnGroup, token)
+ }
+
+ <tr> => {
+ self.template_modes.pop();
+ self.template_modes.push(InTableBody);
+ Reprocess(InTableBody, token)
+ }
+
+ <td> <th> => {
+ self.template_modes.pop();
+ self.template_modes.push(InRow);
+ Reprocess(InRow, token)
+ }
+
+ EOFToken => {
+ if !self.in_html_elem_named(local_name!("template")) {
+ self.stop_parsing()
+ } else {
+ self.unexpected(&token);
+ self.pop_until_named(local_name!("template"));
+ self.clear_active_formatting_to_marker();
+ self.template_modes.pop();
+ self.mode = self.reset_insertion_mode();
+ Reprocess(self.reset_insertion_mode(), token)
+ }
+ }
+
+ tag @ <_> => {
+ self.template_modes.pop();
+ self.template_modes.push(InBody);
+ Reprocess(InBody, TagToken(tag))
+ }
+
+ token => self.unexpected(&token),
+ }),
+
+ //§ parsing-main-afterbody
+ AfterBody => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, _) => self.step(InBody, token),
+ CommentToken(text) => self.append_comment_to_html(text),
+
+ <html> => self.step(InBody, token),
+
+ </html> => {
+ if self.is_fragment() {
+ self.unexpected(&token);
+ } else {
+ self.mode = AfterAfterBody;
+ }
+ Done
+ }
+
+ EOFToken => self.stop_parsing(),
+
+ token => {
+ self.unexpected(&token);
+ Reprocess(InBody, token)
+ }
+ }),
+
+ //§ parsing-main-inframeset
+ InFrameset => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, text) => self.append_text(text),
+ CommentToken(text) => self.append_comment(text),
+
+ <html> => self.step(InBody, token),
+
+ tag @ <frameset> => {
+ self.insert_element_for(tag);
+ Done
+ }
+
+ </frameset> => {
+ if self.open_elems.len() == 1 {
+ self.unexpected(&token);
+ } else {
+ self.pop();
+ if !self.is_fragment() && !self.current_node_named(local_name!("frameset")) {
+ self.mode = AfterFrameset;
+ }
+ }
+ Done
+ }
+
+ tag @ <frame> => {
+ self.insert_and_pop_element_for(tag);
+ DoneAckSelfClosing
+ }
+
+ <noframes> => self.step(InHead, token),
+
+ EOFToken => {
+ if self.open_elems.len() != 1 {
+ self.unexpected(&token);
+ }
+ self.stop_parsing()
+ }
+
+ token => self.unexpected(&token),
+ }),
+
+ //§ parsing-main-afterframeset
+ AfterFrameset => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, text) => self.append_text(text),
+ CommentToken(text) => self.append_comment(text),
+
+ <html> => self.step(InBody, token),
+
+ </html> => {
+ self.mode = AfterAfterFrameset;
+ Done
+ }
+
+ <noframes> => self.step(InHead, token),
+
+ EOFToken => self.stop_parsing(),
+
+ token => self.unexpected(&token),
+ }),
+
+ //§ the-after-after-body-insertion-mode
+ AfterAfterBody => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, _) => self.step(InBody, token),
+ CommentToken(text) => self.append_comment_to_doc(text),
+
+ <html> => self.step(InBody, token),
+
+ EOFToken => self.stop_parsing(),
+
+ token => {
+ self.unexpected(&token);
+ Reprocess(InBody, token)
+ }
+ }),
+
+ //§ the-after-after-frameset-insertion-mode
+ AfterAfterFrameset => match_token!(token {
+ CharacterTokens(NotSplit, text) => SplitWhitespace(text),
+ CharacterTokens(Whitespace, _) => self.step(InBody, token),
+ CommentToken(text) => self.append_comment_to_doc(text),
+
+ <html> => self.step(InBody, token),
+
+ EOFToken => self.stop_parsing(),
+
+ <noframes> => self.step(InHead, token),
+
+ token => self.unexpected(&token),
+ }),
+ //§ END
+ }
+ }
+ /// Handles one token using the foreign-content rules and returns the resulting `ProcessResult`.
+ fn step_foreign(&mut self, token: Token) -> ProcessResult<Handle> {
+ match_token!(token {
+ NullCharacterToken => {
+ self.unexpected(&token);
+ self.append_text("\u{fffd}".to_tendril()) // U+FFFD is appended in place of the null character
+ }
+ // Text is appended as-is; text containing a non-whitespace character also clears `frameset_ok`.
+ CharacterTokens(_, text) => {
+ if any_not_whitespace(&text) {
+ self.frameset_ok = false;
+ }
+ self.append_text(text)
+ }
+
+ CommentToken(text) => self.append_comment(text),
+ // The start tags listed here are routed to `unexpected_start_tag_in_foreign_content`.
+ tag @ <b> <big> <blockquote> <body> <br> <center> <code> <dd> <div> <dl>
+ <dt> <em> <embed> <h1> <h2> <h3> <h4> <h5> <h6> <head> <hr> <i>
+ <img> <li> <listing> <menu> <meta> <nobr> <ol> <p> <pre> <ruby>
+ <s> <small> <span> <strong> <strike> <sub> <sup> <table> <tt>
+ <u> <ul> <var> => self.unexpected_start_tag_in_foreign_content(tag),
+ // <font> is treated the same way only if it carries a `color`, `face` or `size` attribute.
+ tag @ <font> => {
+ let unexpected = tag.attrs.iter().any(|attr| {
+ matches!(attr.name.expanded(),
+ expanded_name!("", "color") |
+ expanded_name!("", "face") |
+ expanded_name!("", "size"))
+ });
+ if unexpected {
+ self.unexpected_start_tag_in_foreign_content(tag)
+ } else {
+ self.foreign_start_tag(tag)
+ }
+ }
+ // Every other start tag goes to `foreign_start_tag`.
+ tag @ <_> => self.foreign_start_tag(tag),
+
+ // FIXME(#118): </script> in SVG
+ // End tags: scan `open_elems` from its last entry towards index 0.
+ tag @ </_> => {
+ let mut first = true; // true only while the last entry of `open_elems` is being examined
+ let mut stack_idx = self.open_elems.len() - 1; // NOTE(review): underflows if `open_elems` is empty; presumably impossible here, confirm
+ loop {
+ if stack_idx == 0 { // reached the bottom entry without a match: the tag is dropped
+ return Done;
+ }
+
+ let html;
+ let eq;
+ {
+ let node_name = self.sink.elem_name(&self.open_elems[stack_idx]);
+ html = *node_name.ns == ns!(html); // is this open element in the HTML namespace?
+ eq = node_name.local.eq_ignore_ascii_case(&tag.name); // ASCII-case-insensitive name comparison
+ }
+ if !first && html { // past the first entry, an HTML element hands the tag to `step` in the current mode
+ let mode = self.mode;
+ return self.step(mode, TagToken(tag));
+ }
+
+ if eq {
+ self.open_elems.truncate(stack_idx); // removes the matching element and every entry after it
+ return Done;
+ }
+
+ if first { // the first examined element did not match: reported once
+ self.unexpected(&tag);
+ first = false;
+ }
+ stack_idx -= 1;
+ }
+ }
+
+ // FIXME: This should be unreachable, but match_token requires a
+ // catch-all case.
+ _ => panic!("impossible case in foreign content"),
+ })
+ }
+}
diff --git a/src/tree_builder/tag_sets.rs b/src/tree_builder/tag_sets.rs
new file mode 100644
index 0000000..377b34c
--- /dev/null
+++ b/src/tree_builder/tag_sets.rs
@@ -0,0 +1,115 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Various sets of HTML tag names, and macros for declaring them.
+
+use crate::ExpandedName;
+use mac::{_tt_as_expr_hack, matches};
+use markup5ever::{expanded_name, local_name, namespace_prefix, namespace_url, ns};
+// Expands to a `match` on `$param`: each listed HTML tag name yields `$b`, any other name is passed to `$supr`.
+macro_rules! declare_tag_set_impl ( ($param:ident, $b:ident, $supr:ident, $($tag:tt)+) => (
+ match $param {
+ $( expanded_name!(html $tag) => $b, )+
+ p => $supr(p),
+ }
+));
+// Parses a set description: `[base] - tags` maps the tags to false, `[base] + tags` to true, a bare list uses `empty_set` as base.
+macro_rules! declare_tag_set_body (
+ ($param:ident = [$supr:ident] - $($tag:tt)+)
+ => ( declare_tag_set_impl!($param, false, $supr, $($tag)+) );
+
+ ($param:ident = [$supr:ident] + $($tag:tt)+)
+ => ( declare_tag_set_impl!($param, true, $supr, $($tag)+) );
+
+ ($param:ident = $($tag:tt)+)
+ => ( declare_tag_set_impl!($param, true, empty_set, $($tag)+) );
+);
+// Defines `fn $name(p: crate::ExpandedName) -> bool` (optionally `pub`) from the set description that follows `=`.
+macro_rules! declare_tag_set (
+ (pub $name:ident = $($toks:tt)+) => (
+ pub fn $name(p: crate::ExpandedName) -> bool {
+ declare_tag_set_body!(p = $($toks)+)
+ }
+ );
+
+ ($name:ident = $($toks:tt)+) => (
+ fn $name(p: crate::ExpandedName) -> bool {
+ declare_tag_set_body!(p = $($toks)+)
+ }
+ );
+);
+/// The set that contains no names: returns `false` for every argument.
+#[inline(always)]
+pub fn empty_set(_: ExpandedName) -> bool {
+ false
+}
+#[inline(always)]
+pub fn full_set(_: ExpandedName) -> bool { // the set that contains every name: returns `true` for every argument
+ true
+}
+// The HTML-namespace names that `default_scope` accepts.
+declare_tag_set!(pub html_default_scope =
+ "applet" "caption" "html" "table" "td" "th" "marquee" "object" "template");
+
+/// Returns `true` if `name` is in `html_default_scope`, or is accepted by
+/// `mathml_text_integration_point` or `svg_html_integration_point`.
+///
+/// The three checks run in that order and stop at the first that succeeds.
+#[inline(always)]
+pub fn default_scope(name: ExpandedName) -> bool {
+    if html_default_scope(name) {
+        return true;
+    }
+    if mathml_text_integration_point(name) {
+        return true;
+    }
+    svg_html_integration_point(name)
+}
+// Further scope sets: two extend `default_scope`, `table_scope` is a bare list, `select_scope` is every name but two.
+declare_tag_set!(pub list_item_scope = [default_scope] + "ol" "ul");
+declare_tag_set!(pub button_scope = [default_scope] + "button");
+declare_tag_set!(pub table_scope = "html" "table" "template");
+declare_tag_set!(pub select_scope = [full_set] - "optgroup" "option");
+// Sets passed to `pop_until_current` by the table-body and row rules, plus the two cell names.
+declare_tag_set!(pub table_body_context = "tbody" "tfoot" "thead" "template" "html");
+declare_tag_set!(pub table_row_context = "tr" "template" "html");
+declare_tag_set!(pub td_th = "td" "th");
+// Implied-end-tag sets (see `generate_implied_end`); the thorough set is the cursory set plus eight table-related names.
+declare_tag_set!(pub cursory_implied_end =
+ "dd" "dt" "li" "option" "optgroup" "p" "rb" "rp" "rt" "rtc");
+
+declare_tag_set!(pub thorough_implied_end = [cursory_implied_end]
+ + "caption" "colgroup" "tbody" "td" "tfoot" "th" "thead" "tr");
+// The six heading names `h1` through `h6`.
+declare_tag_set!(pub heading_tag = "h1" "h2" "h3" "h4" "h5" "h6");
+// The `special_tag` set; the <li>/<dd>/<dt> rule stops its stack scan at these names, minus "address", "div" and "p".
+declare_tag_set!(pub special_tag =
+ "address" "applet" "area" "article" "aside" "base" "basefont" "bgsound" "blockquote" "body"
+ "br" "button" "caption" "center" "col" "colgroup" "dd" "details" "dir" "div" "dl" "dt" "embed"
+ "fieldset" "figcaption" "figure" "footer" "form" "frame" "frameset" "h1" "h2" "h3" "h4" "h5"
+ "h6" "head" "header" "hgroup" "hr" "html" "iframe" "img" "input" "isindex" "li" "link"
+ "listing" "main" "marquee" "menu" "meta" "nav" "noembed" "noframes" "noscript"
+ "object" "ol" "p" "param" "plaintext" "pre" "script" "section" "select" "source" "style"
+ "summary" "table" "tbody" "td" "template" "textarea" "tfoot" "th" "thead" "title" "tr" "track"
+ "ul" "wbr" "xmp");
+//§ END
+/// Returns `true` for the MathML-namespace names `mi`, `mo`, `mn`, `ms` and `mtext`.
+pub fn mathml_text_integration_point(p: ExpandedName) -> bool {
+ matches!(
+ p,
+ expanded_name!(mathml "mi") |
+ expanded_name!(mathml "mo") |
+ expanded_name!(mathml "mn") |
+ expanded_name!(mathml "ms") |
+ expanded_name!(mathml "mtext")
+ )
+}
+/// Returns `true` for the SVG-namespace names `foreignObject`, `desc` and `title`.
+/// https://html.spec.whatwg.org/multipage/#html-integration-point
+pub fn svg_html_integration_point(p: ExpandedName) -> bool {
+ // annotation-xml is handled in another place
+ matches!(
+ p,
+ expanded_name!(svg "foreignObject") |
+ expanded_name!(svg "desc") |
+ expanded_name!(svg "title")
+ )
+}
diff --git a/src/tree_builder/types.rs b/src/tree_builder/types.rs
new file mode 100644
index 0000000..e47d69b
--- /dev/null
+++ b/src/tree_builder/types.rs
@@ -0,0 +1,95 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Types used within the tree builder code. Not exported to users.
+
+use crate::tokenizer::states::RawKind;
+use crate::tokenizer::Tag;
+
+use crate::tendril::StrTendril;
+
+pub use self::FormatEntry::*;
+pub use self::InsertionMode::*;
+pub use self::InsertionPoint::*;
+pub use self::ProcessResult::*;
+pub use self::SplitStatus::*;
+pub use self::Token::*;
+/// Insertion modes of the tree builder; the mode is what `step` is called with and what `self.mode` holds.
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+pub enum InsertionMode {
+ Initial,
+ BeforeHtml,
+ BeforeHead,
+ InHead,
+ InHeadNoscript,
+ AfterHead,
+ InBody,
+ Text,
+ InTable,
+ InTableText,
+ InCaption,
+ InColumnGroup,
+ InTableBody,
+ InRow,
+ InCell,
+ InSelect,
+ InSelectInTable,
+ InTemplate,
+ AfterBody,
+ InFrameset,
+ AfterFrameset,
+ AfterAfterBody,
+ AfterAfterFrameset,
+}
+/// Whitespace classification attached to `CharacterTokens`.
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+pub enum SplitStatus {
+ NotSplit, // not classified yet; users inspect the text itself (e.g. via `any_not_whitespace`)
+ Whitespace, // treated as containing no non-space characters
+ NotWhitespace, // treated as containing non-space characters
+}
+
+/// A subset/refinement of `tokenizer::Token`. Everything else is handled
+/// specially at the beginning of `process_token`.
+#[derive(PartialEq, Eq, Clone, Debug)]
+pub enum Token {
+ TagToken(Tag), // a start or end tag (distinguished by `Tag::kind`)
+ CommentToken(StrTendril), // comment text
+ CharacterTokens(SplitStatus, StrTendril), // text together with its whitespace classification
+ NullCharacterToken, // a null character, kept apart from ordinary text
+ EOFToken, // end of input
+}
+/// Outcome returned by the tree builder's token-processing functions such as `step` and `step_foreign`.
+pub enum ProcessResult<Handle> {
+ Done,
+ DoneAckSelfClosing, // NOTE(review): presumably also acknowledges the tag's self-closing flag; confirm in the caller
+ SplitWhitespace(StrTendril), // returned for `NotSplit` text by modes that treat whitespace separately
+ Reprocess(InsertionMode, Token), // presumably: handle the token again under the given mode; confirm in the caller
+ ReprocessForeign(Token),
+ Script(Handle), // returned by the `Text` mode rule for </script>, carrying the popped element
+ ToPlaintext, // returned by the <plaintext> start-tag rule
+ ToRawData(RawKind),
+}
+/// An entry of the `active_formatting` list.
+pub enum FormatEntry<Handle> {
+ Element(Handle, Tag), // presumably a formatting element plus the tag that created it; confirm at the push sites
+ Marker, // pushed e.g. after inserting <applet>/<marquee>/<object>, <caption> and table cells
+}
+/// Describes where a node is to be inserted.
+pub enum InsertionPoint<Handle> {
+ /// Insert as last child in this parent.
+ LastChild(Handle),
+ /// Insert before this following sibling.
+ BeforeSibling(Handle),
+ /// Insertion point is decided based on existence of element's parent node.
+ TableFosterParenting {
+ element: Handle,
+ prev_element: Handle,
+ },
+}
diff --git a/src/util/str.rs b/src/util/str.rs
new file mode 100644
index 0000000..b2eb41a
--- /dev/null
+++ b/src/util/str.rs
@@ -0,0 +1,60 @@
+// Copyright 2014-2017 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use mac::{_tt_as_expr_hack, matches};
+use std::fmt;
+
+/// Render `x` with its `Debug` formatting and escape every character of that
+/// rendering with `char::escape_default`.
+///
+/// The escaping happens while the value is being formatted, so only the
+/// returned `String` is allocated; no intermediate unescaped copy is built.
+///
+/// # Panics
+///
+/// Panics if the `Debug` implementation of `T` reports an error, exactly as
+/// `format!` would.
+pub fn to_escaped_string<T: fmt::Debug>(x: &T) -> String {
+    /// Formatting sink that stores the escaped form of everything written.
+    struct Escaped(String);
+
+    impl fmt::Write for Escaped {
+        fn write_str(&mut self, s: &str) -> fmt::Result {
+            // `escape_default` works per character, so escaping piecewise
+            // yields the same text as escaping the finished string.
+            self.0.extend(s.chars().flat_map(char::escape_default));
+            Ok(())
+        }
+    }
+
+    let mut out = Escaped(String::new());
+    // The sink itself never fails; an error can only originate from `T`'s
+    // `Debug` implementation.
+    fmt::Write::write_fmt(&mut out, format_args!("{:?}", x))
+        .expect("a formatting trait implementation returned an error");
+    out.0
+}
+
+/// Map an ASCII letter to its lowercase form.
+///
+/// Returns `Some` holding the lowercase letter when `c` lies in `a..=z` or
+/// `A..=Z`, and `None` for every other character, non-ASCII ones included.
+pub fn lower_ascii_letter(c: char) -> Option<char> {
+    if c.is_ascii_alphabetic() {
+        Some(c.to_ascii_lowercase())
+    } else {
+        None
+    }
+}
+
+/// Report whether `c` is an ASCII digit or an ASCII letter of either case.
+pub fn is_ascii_alnum(c: char) -> bool {
+    c.is_ascii_alphanumeric()
+}
+
+/// Report whether `c` is one of the five ASCII whitespace characters: tab,
+/// line feed, form feed, carriage return or space. These are the characters
+/// that tree construction modes treat specially.
+pub fn is_ascii_whitespace(c: char) -> bool {
+    // Fully qualified so it is obvious that the inherent `char` method is
+    // called rather than this function.
+    char::is_ascii_whitespace(&c)
+}
+
+// Unit tests for `lower_ascii_letter` and `is_ascii_alnum`. Each `test_eq!`
+// invocation is given a test name, an expression and the value it must equal.
+// NOTE(review): `is_ascii_whitespace` and `to_escaped_string` are not covered here.
+#[cfg(test)]
+// Some test names embed `A` and `None`, hence the lint exemption.
+#[allow(non_snake_case)]
+mod test {
+ use super::{is_ascii_alnum, lower_ascii_letter};
+ use mac::test_eq;
+
+ // lower_ascii_letter: lowercase input, uppercase input, symbol, non-ASCII.
+ test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a'));
+ test_eq!(lower_letter_A_is_a, lower_ascii_letter('A'), Some('a'));
+ test_eq!(lower_letter_symbol_is_None, lower_ascii_letter('!'), None);
+ test_eq!(
+ lower_letter_nonascii_is_None,
+ lower_ascii_letter('\u{a66e}'),
+ None
+ );
+
+ // is_ascii_alnum: letters of both cases, a digit, a symbol, non-ASCII.
+ test_eq!(is_alnum_a, is_ascii_alnum('a'), true);
+ test_eq!(is_alnum_A, is_ascii_alnum('A'), true);
+ test_eq!(is_alnum_1, is_ascii_alnum('1'), true);
+ test_eq!(is_not_alnum_symbol, is_ascii_alnum('!'), false);
+ test_eq!(is_not_alnum_nonascii, is_ascii_alnum('\u{a66e}'), false);
+}