diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml index c18f5205..e1d3e967 100644 --- a/html5ever/Cargo.toml +++ b/html5ever/Cargo.toml @@ -13,13 +13,16 @@ readme = "../README.md" rust-version.workspace = true [features] +default = ["encoding"] trace_tokenizer = [] +encoding = ["dep:encoding_rs", "markup5ever/encoding"] [dependencies] log = "0.4" mac = "0.1" markup5ever = { version = "0.15", path = "../markup5ever" } match_token = { workspace = true } +encoding_rs = { version = "0.8", optional = true } [dev-dependencies] criterion = "0.5" diff --git a/html5ever/examples/noop-tokenize.rs b/html5ever/examples/noop-tokenize.rs index a95404df..773b30e2 100644 --- a/html5ever/examples/noop-tokenize.rs +++ b/html5ever/examples/noop-tokenize.rs @@ -15,7 +15,8 @@ use std::cell::RefCell; use std::io; use html5ever::tendril::*; -use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer}; +use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer}; +use markup5ever::buffer_queue::BufferQueue; /// In our case, our sink only contains a tokens vector struct Sink(RefCell<Vec<Token>>); diff --git a/html5ever/examples/tokenize.rs b/html5ever/examples/tokenize.rs index ba984d8f..f1368604 100644 --- a/html5ever/examples/tokenize.rs +++ b/html5ever/examples/tokenize.rs @@ -13,11 +13,11 @@ use std::cell::Cell; use std::io; use html5ever::tendril::*; -use html5ever::tokenizer::BufferQueue; use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken}; use html5ever::tokenizer::{ ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, }; +use markup5ever::buffer_queue::BufferQueue; #[derive(Clone)] struct TokenPrinter { diff --git a/html5ever/src/driver.rs b/html5ever/src/driver.rs index 6a151ee3..1f66ebca 100644 --- a/html5ever/src/driver.rs +++ b/html5ever/src/driver.rs @@ -10,10 +10,10 @@ //! High-level interface to the parser. use crate::buffer_queue::BufferQueue; -use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult}; +use crate::tokenizer::{Tokenizer, TokenizerOpts}; use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink}; use crate::{Attribute, QualName}; - +use markup5ever::TokenizerResult; use std::borrow::Cow; use crate::tendril; diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs index 2f3c6c66..d13d59ee 100644 --- a/html5ever/src/tokenizer/char_ref/mod.rs +++ b/html5ever/src/tokenizer/char_ref/mod.rs @@ -8,12 +8,12 @@ // except according to those terms. use super::{TokenSink, Tokenizer}; -use crate::buffer_queue::BufferQueue; use crate::data; use crate::tendril::StrTendril; use log::debug; use mac::format_if; +use markup5ever::buffer_queue::BufferQueue; use std::borrow::Cow::Borrowed; use std::char::from_u32; diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs index edc6afb9..91b33634 100644 --- a/html5ever/src/tokenizer/interface.rs +++ b/html5ever/src/tokenizer/interface.rs @@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> { Script(Handle), Plaintext, RawData(states::RawKind), + #[cfg(feature = "encoding")] + MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding), } /// Types which can receive tokens from the tokenizer.
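For context, a minimal sink against the updated imports looks like the sketch below. The `Collector` type and the `main` wiring are illustrative, not part of this change; `process_token`'s `line_number` parameter is the pre-existing html5ever signature:

```rust
use std::cell::RefCell;

use html5ever::tendril::*;
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use markup5ever::buffer_queue::BufferQueue;

// Collects every token it is handed; Handle = () because this sink never yields scripts.
struct Collector(RefCell<Vec<Token>>);

impl TokenSink for Collector {
    type Handle = ();

    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
        self.0.borrow_mut().push(token);
        TokenSinkResult::Continue
    }
}

fn main() {
    let tokenizer = Tokenizer::new(Collector(RefCell::new(Vec::new())), TokenizerOpts::default());
    let input = BufferQueue::default();
    input.push_back("<p>hello</p>".to_tendril());
    // `feed` now returns a #[must_use] TokenizerResult instead of ().
    let _ = tokenizer.feed(&input);
    tokenizer.end();
}
```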
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index 9edeccb7..e974f871 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; use self::char_ref::{CharRef, CharRefTokenizer}; use crate::util::str::lower_ascii_letter; - use log::{debug, trace}; use mac::format_if; -use markup5ever::{namespace_url, ns, small_char_set}; +use markup5ever::{ + buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult, + TokenizerResult, +}; use std::borrow::Cow::{self, Borrowed}; use std::cell::{Cell, RefCell, RefMut}; use std::collections::BTreeMap; -use std::mem; +use std::{iter, mem}; -pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; +pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult}; use crate::tendril::StrTendril; use crate::{Attribute, LocalName, QualName, SmallCharSet}; @@ -43,13 +45,8 @@ pub enum ProcessResult<Handle> { Continue, Suspend, Script(Handle), -} - -#[must_use] -#[derive(Debug)] -pub enum TokenizerResult<Handle> { - Done, - Script(Handle), + #[cfg(feature = "encoding")] + MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding), } fn option_push(opt_str: &mut Option<StrTendril>, c: char) { @@ -364,6 +361,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(node) => return TokenizerResult::Script(node), + #[cfg(feature = "encoding")] + ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => { + return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) + }, } } } else { @@ -372,6 +373,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(node) => return TokenizerResult::Script(node), + #[cfg(feature = "encoding")] + ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => { + return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) + }, } } } @@ -452,6 +457,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> { self.state.set(states::RawData(kind)); ProcessResult::Continue }, + #[cfg(feature = "encoding")] + TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => { + ProcessResult::MaybeChangeEncodingAndStartOver(encoding) + }, } } @@ -1455,6 +1464,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(_) => unreachable!(), + #[cfg(feature = "encoding")] + ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(), } } @@ -1582,13 +1593,24 @@ } } +impl<Sink> InputSink for Tokenizer<Sink> +where + Sink: TokenSink, +{ + type Handle = Sink::Handle; + + fn feed(&self, input: &BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> { + iter::from_fn(|| self.feed(input).into()) + } +} + #[cfg(test)] #[allow(non_snake_case)] mod test { use super::option_push; // private items - use crate::tendril::{SliceExt, StrTendril}; - use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; + use crate::tendril::{SliceExt, StrTendril}; + use crate::LocalName; use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; use super::interface::{EndTag, StartTag, Tag, TagKind}; @@ -1597,8 +1619,6 @@ mod test { use markup5ever::buffer_queue::BufferQueue; use std::cell::RefCell; - use crate::LocalName; - // LinesMatch implements the TokenSink trait. It is used for testing to see // if current_line is being updated when process_token is called. The lines // vector is a collection of the line numbers that each token is on.
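With `ProcessResult` and `TokenizerResult` gaining the `MaybeChangeEncodingAndStartOver` variant, a loop that drives the tokenizer has one more case to consider. A sketch, assuming the `encoding` feature is enabled (it is on by default per the Cargo.toml change above); the `drive` helper is hypothetical, and restarting is the caller's responsibility:

```rust
use html5ever::tokenizer::{TokenSink, Tokenizer};
use markup5ever::buffer_queue::BufferQueue;
use markup5ever::TokenizerResult;

fn drive<Sink: TokenSink>(tokenizer: &Tokenizer<Sink>, input: &BufferQueue) {
    loop {
        match tokenizer.feed(input) {
            // All currently available input was consumed; wait for more.
            TokenizerResult::Done => break,
            TokenizerResult::Script(_handle) => {
                // Execute the script (it may document.write() into `input`), then resume.
            },
            TokenizerResult::MaybeChangeEncodingAndStartOver(_encoding) => {
                // Discard the tree built so far and re-decode the raw bytes with `_encoding`.
                break;
            },
        }
    }
}
```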
diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index c58cfc50..ab47adc7 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -9,9 +9,7 @@ //! The HTML5 tree builder. -pub use crate::interface::{ - create_element, ElemName, ElementFlags, NextParserState, Tracer, TreeSink, -}; +pub use crate::interface::{create_element, ElemName, ElementFlags, Tracer, TreeSink}; pub use crate::interface::{AppendNode, AppendText, Attribute, NodeOrText}; pub use crate::interface::{LimitedQuirks, NoQuirks, Quirks, QuirksMode}; @@ -394,6 +392,10 @@ where assert!(more_tokens.is_empty()); return tokenizer::TokenSinkResult::RawData(k); }, + #[cfg(feature = "encoding")] + MaybeChangeEncodingAndStartOver(encoding) => { + return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding); + }, } } } diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs index e56a3ab5..7169f2ec 100644 --- a/html5ever/src/tree_builder/rules.rs +++ b/html5ever/src/tree_builder/rules.rs @@ -10,21 +10,24 @@ // The tree builder rules, as a single, enormous nested match expression. use crate::interface::Quirks; -use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData}; +use crate::tokenizer::states::{Rawtext, Rcdata}; use crate::tokenizer::TagKind::{EndTag, StartTag}; use crate::tree_builder::tag_sets::*; use crate::tree_builder::types::*; -use crate::tree_builder::{ - create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder, - TreeSink, -}; -use crate::QualName; -use markup5ever::{expanded_name, local_name, namespace_url, ns}; +use crate::tree_builder::RawKind::ScriptData; +use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink}; + +use markup5ever::interface::create_element; +use markup5ever::interface::NodeOrText::AppendNode; +use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName}; use std::borrow::Cow::Borrowed; use crate::tendril::SliceExt; use match_token::match_token; +#[cfg(feature = "encoding")] +use encoding_rs::Encoding; + fn any_not_whitespace(x: &StrTendril) -> bool { // FIXME: this might be much faster as a byte scan x.chars().any(|c| !c.is_ascii_whitespace()) } @@ -113,8 +116,21 @@ where <html> => self.step(InBody, token), - tag @ <base> <basefont> <bgsound> <link> <meta> => { - // FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type"> + tag @ <meta> => { + // FIXME: handle <meta http-equiv="Content-Type"> + #[cfg(feature = "encoding")] + if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) { + if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) { + self.insert_and_pop_element_for(tag); + return MaybeChangeEncodingAndStartOver(encoding); + } + } + + self.insert_and_pop_element_for(tag); + DoneAckSelfClosing + }, + + tag @ <base> <basefont> <bgsound> <link> => { self.insert_and_pop_element_for(tag); DoneAckSelfClosing } diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs index f6bb588d..82d0c7dd 100644 --- a/html5ever/src/tree_builder/types.rs +++ b/html5ever/src/tree_builder/types.rs @@ -77,6 +77,8 @@ pub(crate) enum ProcessResult<Handle> { Script(Handle), ToPlaintext, ToRawData(RawKind), + #[cfg(feature = "encoding")] + MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding), } pub(crate) enum FormatEntry { diff --git a/markup5ever/Cargo.toml b/markup5ever/Cargo.toml index 285ab5fd..cc95091b 100644 --- a/markup5ever/Cargo.toml +++ b/markup5ever/Cargo.toml @@ -14,11 +14,15 @@ rust-version.workspace = true [lib] path = "lib.rs" +[features] +encoding = ["dep:encoding_rs"] +
[dependencies] string_cache = "0.8" phf = "0.11" tendril = "0.4" log = "0.4" +encoding_rs = { version = "0.8", optional = true } [build-dependencies] string_cache_codegen = "0.5.4" diff --git a/markup5ever/encoding.rs b/markup5ever/encoding.rs new file mode 100644 index 00000000..e8ad8d1b --- /dev/null +++ b/markup5ever/encoding.rs @@ -0,0 +1,133 @@ +// Copyright 2014-2025 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED}; +use tendril::{fmt::Bytes, Tendril}; + +use crate::buffer_queue::BufferQueue; + +/// <https://html.spec.whatwg.org/#concept-encoding-confidence> +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Confidence { + Tentative, + Certain, + Irrelevant, +} + +pub struct Decoder { + inner: encoding_rs::Decoder, + confidence: Confidence, +} + +impl Decoder { + pub fn new(encoding: &'static Encoding, confidence: Confidence) -> Self { + Self { + inner: encoding.new_decoder(), + confidence, + } + } + + pub fn confidence(&self) -> Confidence { + self.confidence + } + + /// Returns `None` if the encoding should not be changed and `Some(encoding)` if the current encoding + /// should be changed to `encoding` + pub fn change_the_encoding_to( + &mut self, + mut new_encoding: &'static Encoding, + ) -> Option<&'static Encoding> { + let current_encoding = self.inner.encoding(); + // Step 1. If the encoding that is already being used to interpret the input stream is UTF-16BE/LE, + // then set the confidence to certain and return. The new encoding is ignored; if it was anything + // but the same encoding, then it would be clearly incorrect. + if current_encoding == UTF_16BE || current_encoding == UTF_16LE { + self.confidence = Confidence::Certain; + return None; + } + + // Step 2. If the new encoding is UTF-16BE/LE, then change it to UTF-8. + if new_encoding == UTF_16BE || new_encoding == UTF_16LE { + new_encoding = UTF_8; + } + + // Step 3. If the new encoding is x-user-defined, then change it to windows-1252. + if new_encoding == X_USER_DEFINED { + new_encoding = WINDOWS_1252; + } + + // Step 4. If the new encoding is identical or equivalent to the encoding that is already being used to interpret + // the input stream, then set the confidence to certain and return. This happens when the encoding information found + // in the file matches what the encoding sniffing algorithm determined to be the encoding, and in the second pass + // through the parser if the first pass found that the encoding sniffing algorithm described in the earlier section + // failed to find the right encoding. + if current_encoding == new_encoding { + self.confidence = Confidence::Certain; + return None; + } + + // Step 5. If all the bytes up to the last byte converted by the current decoder have the same + // Unicode interpretations in both the current encoding and the new encoding, and if the user agent + // supports changing the converter on the fly, then the user agent may change to the new converter + // for the encoding on the fly. Set the document's character encoding and the encoding used to convert + // the input stream to the new encoding, set the confidence to certain, and return. + // NOTE: We don't support changing the converter on the fly + + // Step 6.
Otherwise, restart the navigate algorithm, with historyHandling set to "replace" and + // other inputs kept the same, but this time skip the encoding sniffing algorithm and instead just + // set the encoding to the new encoding and the confidence to certain. Whenever possible, this should + // be done without actually contacting the network layer (the bytes should be re-parsed from memory), + // even if, e.g., the document is marked as not being cacheable. If this is not possible and contacting + // the network layer would involve repeating a request that uses a method other than `GET`, then instead + // set the confidence to certain and ignore the new encoding. The resource will be misinterpreted. + // User agents may notify the user of the situation, to aid in application development. + Some(new_encoding) + } + + /// Decode the given chunk with the current encoding. The result will be pushed to the end + /// of the input stream. + pub fn decode(&mut self, chunk: &[u8], last: bool, output: &BufferQueue) { + let mut remaining = chunk; + loop { + let mut out: Tendril<Bytes> = Tendril::new(); + let max_len = self + .inner + .max_utf8_buffer_length_without_replacement(remaining.len()) + .unwrap_or(8192) + .min(8192); + + // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement is going to initialize + // part of the buffer. We are only going to access the initialized segment. + unsafe { + out.push_uninitialized(max_len as u32); + } + + let (result, bytes_read, bytes_written) = self + .inner + .decode_to_utf8_without_replacement(&remaining, &mut out, last); + + if bytes_written > 0 { + let bytes_chunk = out.subtendril(0, bytes_written as u32); + + // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement writes valid utf8 + let utf8_chunk = unsafe { bytes_chunk.reinterpret_without_validating() }; + output.push_back(utf8_chunk); + } + + if matches!(result, DecoderResult::Malformed(_, _)) { + output.push_back("\u{FFFD}".into()); + } + + remaining = &remaining[bytes_read..]; + if remaining.is_empty() { + return; + } + } + } +} diff --git a/markup5ever/input_stream.rs b/markup5ever/input_stream.rs new file mode 100644 index 00000000..4d07bf40 --- /dev/null +++ b/markup5ever/input_stream.rs @@ -0,0 +1,136 @@ +use std::cell::RefCell; + +use encoding_rs::Encoding; +use tendril::StrTendril; + +use crate::buffer_queue::BufferQueue; +use crate::encoding::{Confidence, Decoder}; + +/// <https://html.spec.whatwg.org/#input-stream> +/// +/// Internally the `InputStream` keeps track of the current +/// [insertion point](https://html.spec.whatwg.org/#insertion-point) by using +/// two separate buffers. +pub struct InputStream { + input: BufferQueue, + decoder: RefCell<Decoder>, +} + +impl InputStream { + fn new(encoding: &'static Encoding) -> Self { + Self { + input: Default::default(), + decoder: RefCell::new(Decoder::new(encoding, Confidence::Tentative)), + } + } + + pub fn append(&self, data: StrTendril) { + self.input.push_back(data); + } + + pub fn append_bytes(&self, data: &[u8]) { + self.decoder + .borrow_mut() + .decode(data, false, &self.input); + } + + pub fn code_points(&self) -> &BufferQueue { + &self.input + } + + /// Attempt to switch to another encoding. + /// + /// If the encoding was switched then the new encoding is returned. Note that the new encoding may be + /// different from the one that this function was called with.
+ pub fn maybe_switch_encoding(&self, encoding: &'static Encoding) -> Option<&'static Encoding> { + if self.decoder.borrow().confidence() == Confidence::Tentative { + if let Some(new_encoding) = self.decoder.borrow_mut().change_the_encoding_to(encoding) { + return Some(new_encoding); + } + } + None + } + + /// Move any input that is left in the decoding stage to the end of the input stream + pub fn finish_decoding_input(&self) { + self.decoder + .borrow_mut() + .decode(&[], true, &self.input); + } + + /// Remove all input from the stream + pub fn clear(&self) { + self.input.clear(); + } + + /// Swap the contents of the pending input queue with the provided queue + pub fn swap_input_queue(&self, other: &BufferQueue) { + self.input.swap(other); + } +} + +pub struct DecodingParser<Sink> { + input_stream: InputStream, + input_sink: Sink, +} + +impl<Sink> DecodingParser<Sink> +where + Sink: InputSink, +{ + pub fn new(sink: Sink, document_encoding: &'static Encoding) -> Self { + Self { + input_stream: InputStream::new(document_encoding), + input_sink: sink, + } + } + + pub fn sink(&self) -> &Sink { + &self.input_sink + } + + pub fn input_stream(&self) -> &InputStream { + &self.input_stream + } + + /// Return an iterator that can be used to drive the parser + pub fn parse(&self) -> impl Iterator<Item = ParserAction<Sink::Handle>> + '_ { + self.input_sink + .feed(self.input_stream.code_points()) + .filter_map(|sink_result| match sink_result { + InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)), + InputSinkResult::MaybeStartOverWithEncoding(encoding) => self + .input_stream + .maybe_switch_encoding(encoding) + .map(ParserAction::StartOverWithEncoding), + }) + } +} + +pub enum ParserAction<Handle> { + HandleScript(Handle), + StartOverWithEncoding(&'static Encoding), +} + +pub enum InputSinkResult<Handle> { + HandleScript(Handle), + MaybeStartOverWithEncoding(&'static Encoding), +} + +pub trait InputSink { + type Handle; + + fn feed(&self, input: &BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>>; +} + +impl<T> ParserAction<T> { + pub fn map_script<U, F>(self, f: F) -> ParserAction<U> + where + F: FnOnce(T) -> U, + { + match self { + Self::HandleScript(script) => ParserAction::HandleScript(f(script)), + Self::StartOverWithEncoding(encoding) => ParserAction::StartOverWithEncoding(encoding), + } + } +} diff --git a/markup5ever/interface/mod.rs b/markup5ever/interface/mod.rs index cfe5d86f..f011080f 100644 --- a/markup5ever/interface/mod.rs +++ b/markup5ever/interface/mod.rs @@ -12,8 +12,10 @@ use std::cell::Ref; use std::fmt; use tendril::StrTendril; +use crate::InputSinkResult; + pub use self::tree_builder::{create_element, AppendNode, AppendText, ElementFlags, NodeOrText}; -pub use self::tree_builder::{ElemName, NextParserState, Tracer, TreeSink}; +pub use self::tree_builder::{ElemName, Tracer, TreeSink}; pub use self::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode}; use super::{LocalName, Namespace, Prefix}; @@ -60,6 +62,26 @@ impl fmt::Debug for ExpandedName<'_> { } } +#[must_use] +#[derive(Debug)] +pub enum TokenizerResult<Handle> { + Done, + Script(Handle), + MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding), +} + +impl<Handle> From<TokenizerResult<Handle>> for Option<InputSinkResult<Handle>> { + fn from(value: TokenizerResult<Handle>) -> Self { + match value { + TokenizerResult::Script(handle) => Some(InputSinkResult::HandleScript(handle)), + TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) => { + Some(InputSinkResult::MaybeStartOverWithEncoding(encoding)) + }, + TokenizerResult::Done => None, + } + } +} + /// Helper to quickly create an expanded name.
/// /// Can be used with no namespace as `expanded_name!("", "some_name")` diff --git a/markup5ever/interface/tree_builder.rs b/markup5ever/interface/tree_builder.rs index 8083a727..5d7a82e7 100644 --- a/markup5ever/interface/tree_builder.rs +++ b/markup5ever/interface/tree_builder.rs @@ -43,17 +43,6 @@ pub enum QuirksMode { NoQuirks, } -/// Whether to interrupt further parsing of the current input until -/// the next explicit resumption of the tokenizer, or continue without -/// any interruption. -#[derive(PartialEq, Eq, Copy, Clone, Hash, Debug)] -pub enum NextParserState { - /// Stop further parsing. - Suspend, - /// Continue without interruptions. - Continue, -} - /// Special properties of an element, useful for tagging elements with this information. #[derive(Default)] #[non_exhaustive] @@ -256,11 +245,6 @@ pub trait TreeSink { /// Called whenever the line number changes. fn set_current_line(&self, _line_number: u64) {} - /// Indicate that a `script` element is complete. - fn complete_script(&self, _node: &Self::Handle) -> NextParserState { - NextParserState::Continue - } - fn allow_declarative_shadow_roots(&self, _intended_parent: &Self::Handle) -> bool { true } diff --git a/markup5ever/lib.rs b/markup5ever/lib.rs index a04b8f15..c76ce0ed 100644 --- a/markup5ever/lib.rs +++ b/markup5ever/lib.rs @@ -45,6 +45,13 @@ mod util { pub mod smallcharset; } -pub use interface::{Attribute, ExpandedName, QualName}; +pub use interface::{Attribute, ExpandedName, QualName, TokenizerResult}; pub use util::smallcharset::SmallCharSet; pub use util::*; + +#[cfg(feature = "encoding")] +pub mod encoding; + +mod input_stream; + +pub use input_stream::{DecodingParser, InputSink, InputSinkResult, InputStream, ParserAction}; diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs index 95a571e2..0a25a833 100644 --- a/markup5ever/util/buffer_queue.rs +++ b/markup5ever/util/buffer_queue.rs @@ -18,9 +18,12 @@ //! //! [`BufferQueue`]: struct.BufferQueue.html -use std::{cell::RefCell, collections::VecDeque, mem}; +use std::{cell::RefCell, collections::VecDeque, fmt, mem}; -use tendril::StrTendril; +use tendril::{ + fmt::{Bytes, SliceFormat, UTF8}, + Atomicity, NonAtomic, StrTendril, Tendril, +}; pub use self::SetResult::{FromSet, NotFromSet}; use crate::util::smallcharset::SmallCharSet; @@ -38,18 +41,30 @@ pub enum SetResult { NotFromSet(StrTendril), } -/// A queue of owned string buffers, which supports incrementally consuming characters. +/// A queue of tendrils, which supports incrementally consuming characters. /// /// Internally it uses [`VecDeque`] and has the same complexity properties. /// /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html #[derive(Debug)] -pub struct BufferQueue { +pub struct BufferQueue<F = UTF8, A = NonAtomic> +where + F: SliceFormat + Default, + <F as SliceFormat>::Slice: fmt::Debug, + A: Atomicity, +{ /// Buffers to process. - buffers: RefCell<VecDeque<StrTendril>>, + buffers: RefCell<VecDeque<Tendril<F, A>>>, } -impl Default for BufferQueue { +pub type ByteBufferQueue = BufferQueue<Bytes>; + +impl<F, A> Default for BufferQueue<F, A> +where + F: SliceFormat + Default, + <F as SliceFormat>::Slice: fmt::Debug, + A: Atomicity, +{ /// Create an empty BufferQueue.
#[inline] fn default() -> Self { @@ -59,7 +74,17 @@ impl Default for BufferQueue { } } -impl BufferQueue { +impl<F, A> BufferQueue<F, A> +where + F: SliceFormat + Default, + <F as SliceFormat>::Slice: fmt::Debug, + A: Atomicity, +{ + /// Swap the contents of the two buffers + pub fn swap(&self, other: &Self) { + mem::swap(&mut self.buffers.borrow_mut(), &mut other.buffers.borrow_mut()); + } + /// Returns whether the queue is empty. #[inline] pub fn is_empty(&self) -> bool { @@ -68,14 +93,14 @@ impl BufferQueue { /// Get the buffer at the beginning of the queue. #[inline] - pub fn pop_front(&self) -> Option<StrTendril> { + pub fn pop_front(&self) -> Option<Tendril<F, A>> { self.buffers.borrow_mut().pop_front() } /// Add a buffer to the beginning of the queue. /// /// If the buffer is empty, it will be skipped. - pub fn push_front(&self, buf: StrTendril) { + pub fn push_front(&self, buf: Tendril<F, A>) { if buf.len32() == 0 { return; } @@ -85,13 +110,27 @@ impl BufferQueue { /// Add a buffer to the end of the queue. /// /// If the buffer is empty, it will be skipped. - pub fn push_back(&self, buf: StrTendril) { + pub fn push_back(&self, buf: Tendril<F, A>) { if buf.len32() == 0 { return; } self.buffers.borrow_mut().push_back(buf); } + pub fn insert(&self, index: usize, buffer: Tendril<F, A>) { + if buffer.len32() == 0 { + return; + } + + self.buffers.borrow_mut().insert(index, buffer); + } + + pub fn clear(&self) { + self.buffers.borrow_mut().clear(); + } +} + +impl BufferQueue { /// Look at the next available character without removing it, if the queue is not empty. pub fn peek(&self) -> Option<char> { debug_assert!( @@ -236,11 +275,11 @@ result } - pub fn replace_with(&self, other: BufferQueue) { + pub fn replace_with(&self, other: Self) { let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take()); } - pub fn swap_with(&self, other: &BufferQueue) { + pub fn swap_with(&self, other: &Self) { mem::swap( &mut *self.buffers.borrow_mut(), &mut *other.buffers.borrow_mut(), ) } } +impl<F, A> IntoIterator for BufferQueue<F, A> +where + F: SliceFormat + Default, + <F as SliceFormat>::Slice: fmt::Debug, + A: Atomicity, +{ + type Item = Tendril<F, A>; + type IntoIter = <VecDeque<Tendril<F, A>> as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.buffers.into_inner().into_iter() + } +} + #[cfg(test)] #[allow(non_snake_case)] mod test { diff --git a/rcdom/tests/html-serializer.rs b/rcdom/tests/html-serializer.rs index 2c5e6f62..f45c9b83 100644 --- a/rcdom/tests/html-serializer.rs +++ b/rcdom/tests/html-serializer.rs @@ -68,7 +68,7 @@ impl Serialize for Tokens { fn tokenize_and_serialize(input: StrTendril) -> StrTendril { let input = { - let q = ::html5ever::tokenizer::BufferQueue::default(); + let q = markup5ever::buffer_queue::BufferQueue::default(); q.push_front(input); q }; diff --git a/rcdom/tests/html-tokenizer.rs b/rcdom/tests/html-tokenizer.rs index 8de2c3e6..c935c371 100644 --- a/rcdom/tests/html-tokenizer.rs +++ b/rcdom/tests/html-tokenizer.rs @@ -14,12 +14,12 @@ use html5ever::tendril::*; use html5ever::tokenizer::states::{ CdataSection, Data, Plaintext, RawData, Rawtext, Rcdata, ScriptData, }; -use html5ever::tokenizer::BufferQueue; use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token}; use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag}; use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; use html5ever::{namespace_url, ns, Attribute, LocalName, QualName}; +use
markup5ever::buffer_queue::BufferQueue; use serde_json::{Map, Value}; use std::cell::RefCell; use std::ffi::OsStr; diff --git a/rcdom/tests/xml-tokenizer.rs b/rcdom/tests/xml-tokenizer.rs index 5c33f2e8..e6160e6f 100644 --- a/rcdom/tests/xml-tokenizer.rs +++ b/rcdom/tests/xml-tokenizer.rs @@ -14,6 +14,7 @@ use std::env; use std::ffi::OsStr; use std::io::Read; use std::path::Path; +use xml5ever::tokenizer::ProcessResult; use util::find_tests::foreach_xml5lib_test; use util::runner::{run_all, Test}; @@ -91,7 +92,9 @@ impl TokenLogger { } impl TokenSink for TokenLogger { - fn process_token(&self, token: Token) { + type Handle = (); + + fn process_token(&self, token: Token) -> ProcessResult<()> { match token { CharacterTokens(b) => { self.current_str.borrow_mut().push_slice(&b); @@ -123,7 +126,8 @@ impl TokenSink for TokenLogger { EOFToken => (), _ => self.push(token), - } + }; + ProcessResult::Continue } } @@ -134,9 +138,9 @@ fn tokenize_xml(input: Vec<StrTendril>, opts: XmlTokenizerOpts) -> Vec<Token> { for chunk in input.into_iter() { buf.push_back(chunk); - tok.feed(&buf); + let _ = tok.feed(&buf); } - tok.feed(&buf); + let _ = tok.feed(&buf); tok.end(); tok.sink.get_tokens() } @@ -274,9 +278,11 @@ fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token> { for tok in js.as_array().unwrap().iter() { match *tok { Value::String(ref s) if &s[..] == "ParseError" => { - sink.process_token(ParseError(Borrowed(""))) + let _ = sink.process_token(ParseError(Borrowed(""))); + }, + _ => { + let _ = sink.process_token(json_to_token(tok)); }, - _ => sink.process_token(json_to_token(tok)), } } sink.get_tokens() diff --git a/xml5ever/benches/xml5ever.rs b/xml5ever/benches/xml5ever.rs index f909486e..3b1e78be 100644 --- a/xml5ever/benches/xml5ever.rs +++ b/xml5ever/benches/xml5ever.rs @@ -10,15 +10,18 @@ use criterion::{black_box, Criterion}; use markup5ever::buffer_queue::BufferQueue; use xml5ever::tendril::*; -use xml5ever::tokenizer::{Token, TokenSink, XmlTokenizer}; +use xml5ever::tokenizer::{ProcessResult, Token, TokenSink, XmlTokenizer}; struct Sink; impl TokenSink for Sink { - fn process_token(&self, token: Token) { + type Handle = (); + + fn process_token(&self, token: Token) -> ProcessResult<()> { // Don't use the token, but make sure we don't get // optimized out entirely. black_box(token); + ProcessResult::Continue } } @@ -58,9 +61,9 @@ fn run_bench(c: &mut Criterion, name: &str) { // necessary since our iterator consumes the underlying buffer.
for buf in input.clone().into_iter() { buffer.push_back(buf); - tok.feed(&buffer); + let _ = tok.feed(&buffer); } - tok.feed(&buffer); + let _ = tok.feed(&buffer); tok.end(); }) }); diff --git a/xml5ever/examples/simple_xml_tokenizer.rs b/xml5ever/examples/simple_xml_tokenizer.rs index 8cc930dd..662432bf 100644 --- a/xml5ever/examples/simple_xml_tokenizer.rs +++ b/xml5ever/examples/simple_xml_tokenizer.rs @@ -16,7 +16,7 @@ use std::io; use markup5ever::buffer_queue::BufferQueue; use xml5ever::tendril::{ByteTendril, ReadExt}; -use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken}; +use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, ProcessResult, TagToken}; use xml5ever::tokenizer::{CommentToken, PIToken, Pi}; use xml5ever::tokenizer::{Doctype, DoctypeToken, EOFToken}; use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer}; @@ -24,7 +24,9 @@ use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer}; struct SimpleTokenPrinter; impl TokenSink for SimpleTokenPrinter { - fn process_token(&self, token: Token) { + type Handle = (); + + fn process_token(&self, token: Token) -> ProcessResult<()> { match token { CharacterTokens(b) => { println!("TEXT: {}", &*b); @@ -55,7 +57,8 @@ impl TokenSink for SimpleTokenPrinter { }) => { println!(""); }, - } + }; + ProcessResult::Continue } } @@ -76,6 +79,6 @@ fn main() { input_buffer.push_back(input.try_reinterpret().unwrap()); // Here we create and run tokenizer let tok = XmlTokenizer::new(sink, Default::default()); - tok.feed(&input_buffer); + let _ = tok.feed(&input_buffer); tok.end(); } diff --git a/xml5ever/examples/xml_tokenizer.rs b/xml5ever/examples/xml_tokenizer.rs index a9115811..2ae52da0 100644 --- a/xml5ever/examples/xml_tokenizer.rs +++ b/xml5ever/examples/xml_tokenizer.rs @@ -17,7 +17,7 @@ use std::io; use markup5ever::buffer_queue::BufferQueue; use xml5ever::tendril::{ByteTendril, ReadExt}; -use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken}; +use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, ProcessResult, TagToken}; use xml5ever::tokenizer::{EmptyTag, EndTag, ShortTag, StartTag}; use xml5ever::tokenizer::{PIToken, Pi}; use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer, XmlTokenizerOpts}; @@ -44,7 +44,9 @@ impl TokenPrinter { } impl TokenSink for TokenPrinter { - fn process_token(&self, token: Token) { + type Handle = (); + + fn process_token(&self, token: Token) -> ProcessResult<()> { match token { CharacterTokens(b) => { for c in b.chars() { @@ -84,7 +86,9 @@ impl TokenSink for TokenPrinter { self.is_char(false); println!("OTHER: {token:?}"); }, - } + }; + + ProcessResult::Continue } } @@ -105,7 +109,7 @@ fn main() { ..Default::default() }, ); - tok.feed(&input_buffer); + let _ = tok.feed(&input_buffer); tok.end(); tok.sink.is_char(false); } diff --git a/xml5ever/src/driver.rs b/xml5ever/src/driver.rs index 0245431d..061c1ce6 100644 --- a/xml5ever/src/driver.rs +++ b/xml5ever/src/driver.rs @@ -63,7 +63,8 @@ impl TendrilSink for XmlParser { fn process(&mut self, t: StrTendril) { self.input_buffer.push_back(t); - self.tokenizer.feed(&self.input_buffer); + // FIXME: Properly support somehow. + let _ = self.tokenizer.feed(&self.input_buffer); } // FIXME: Is it too noisy to report every character decoding error? 
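xml5ever's pull-based driver cannot restart on its own yet (hence the FIXME above), but a caller that owns the tokenizer can drain the new result by hand. A sketch — the `drain` helper is illustrative, not an API added here:

```rust
use markup5ever::buffer_queue::BufferQueue;
use markup5ever::TokenizerResult;
use xml5ever::tokenizer::{TokenSink, XmlTokenizer};

fn drain<Sink: TokenSink>(tokenizer: &XmlTokenizer<Sink>, input: &BufferQueue) {
    loop {
        match tokenizer.feed(input) {
            TokenizerResult::Done => break,
            // xml5ever surfaces <script> elements but has no way to execute them,
            // so a plain driver can only acknowledge them and continue.
            TokenizerResult::Script(_handle) => continue,
            // The XML tokenizer never asks for an encoding restart; be defensive anyway.
            _ => break,
        }
    }
}
```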
diff --git a/xml5ever/src/tokenizer/interface.rs b/xml5ever/src/tokenizer/interface.rs index 802eef33..e4dda107 100644 --- a/xml5ever/src/tokenizer/interface.rs +++ b/xml5ever/src/tokenizer/interface.rs @@ -10,14 +10,13 @@ use std::borrow::Cow; use crate::tendril::StrTendril; +use crate::tokenizer::ProcessResult; use crate::{Attribute, QualName}; pub use self::TagKind::{EmptyTag, EndTag, ShortTag, StartTag}; pub use self::Token::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; pub use self::Token::{CommentToken, DoctypeToken, PIToken, TagToken}; -use super::states; - /// Tag kind denotes which kind of tag did we encounter. #[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] pub enum TagKind { @@ -108,16 +107,12 @@ pub enum Token { /// Types which can receive tokens from the tokenizer. pub trait TokenSink { + /// Handle to a DOM script element + type Handle; + /// Process a token. - fn process_token(&self, token: Token); + fn process_token(&self, token: Token) -> ProcessResult<Self::Handle>; /// Signal to the sink that parsing has ended. fn end(&self) {} - - /// The tokenizer will call this after emitting any start tag. - /// This allows the tree builder to change the tokenizer's state. - /// By default no state changes occur. - fn query_state_change(&self) -> Option<states::XmlState> { - None - } } diff --git a/xml5ever/src/tokenizer/mod.rs b/xml5ever/src/tokenizer/mod.rs index ec8248bb..398118e2 100644 --- a/xml5ever/src/tokenizer/mod.rs +++ b/xml5ever/src/tokenizer/mod.rs @@ -23,13 +23,17 @@ use crate::tendril::StrTendril; use crate::{buffer_queue, Attribute, QualName, SmallCharSet}; use log::debug; use mac::{format_if, unwrap_or_return}; -use markup5ever::{local_name, namespace_prefix, namespace_url, ns, small_char_set}; +use markup5ever::{ + buffer_queue::BufferQueue, local_name, namespace_prefix, namespace_url, ns, small_char_set, + InputSink, InputSinkResult, TokenizerResult, +}; use std::borrow::Cow::{self, Borrowed}; use std::cell::{Cell, RefCell, RefMut}; use std::collections::BTreeMap; +use std::iter; use std::mem::replace; -use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; +use self::buffer_queue::{FromSet, NotFromSet, SetResult}; use self::char_ref::{CharRef, CharRefTokenizer}; use self::qname::QualNameTokenizer; use self::states::XmlState; @@ -201,9 +205,9 @@ impl<Sink: TokenSink> XmlTokenizer<Sink> { } /// Feed an input string into the tokenizer. - pub fn feed(&self, input: &BufferQueue) { + pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> { if input.is_empty() { - return; + return TokenizerResult::Done; } if self.discard_bom.get() { @@ -212,19 +216,20 @@ input.next(); } } else { - return; + return TokenizerResult::Done; } }; - self.run(input); + self.run(input) } - fn process_token(&self, token: Token) { + fn process_token(&self, token: Token) -> ProcessResult<Sink::Handle> { if self.opts.profile { - let (_, dt) = time!(self.sink.process_token(token)); + let (result, dt) = time!(self.sink.process_token(token)); self.time_in_sink.set(self.time_in_sink.get() + dt); + result } else { - self.sink.process_token(token); + self.sink.process_token(token) } } @@ -317,7 +322,7 @@ impl<Sink: TokenSink> XmlTokenizer<Sink> { } /// Run the state machine for as long as we can.
- pub fn run(&self, input: &BufferQueue) { + pub fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> { if self.opts.profile { loop { let state = self.state.get(); @@ -335,12 +340,20 @@ // do this here because of borrow shenanigans self.state_profile.borrow_mut().insert(state, dt); } - if !run { - break; + match run { + ProcessResult::Continue => continue, + ProcessResult::Done => return TokenizerResult::Done, + ProcessResult::Script(handle) => return TokenizerResult::Script(handle), } } } else { - while self.step(input) {} + loop { + match self.step(input) { + ProcessResult::Continue => continue, + ProcessResult::Done => return TokenizerResult::Done, + ProcessResult::Script(handle) => return TokenizerResult::Script(handle), + } + } } } @@ -394,27 +407,27 @@ }))); } - fn emit_short_tag(&self) { + fn emit_short_tag(&self) -> ProcessResult<Sink::Handle> { self.current_tag_kind.set(ShortTag); *self.current_tag_name.borrow_mut() = StrTendril::new(); - self.emit_current_tag(); + self.emit_current_tag() } - fn emit_empty_tag(&self) { + fn emit_empty_tag(&self) -> ProcessResult<Sink::Handle> { self.current_tag_kind.set(EmptyTag); - self.emit_current_tag(); + self.emit_current_tag() } fn set_empty_tag(&self) { self.current_tag_kind.set(EmptyTag); } - fn emit_start_tag(&self) { + fn emit_start_tag(&self) -> ProcessResult<Sink::Handle> { self.current_tag_kind.set(StartTag); - self.emit_current_tag(); + self.emit_current_tag() } - fn emit_current_tag(&self) { + fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> { self.finish_attribute(); let qname = process_qname(replace( @@ -441,12 +454,8 @@ name: qname, attrs: self.current_tag_attrs.take(), }); - self.process_token(token); - match self.sink.query_state_change() { - None => (), - Some(s) => self.state.set(s), - } + self.process_token(token) } // The string must not contain '\0'! @@ -455,12 +464,12 @@ } // Emits the current Processing Instruction - fn emit_pi(&self) { + fn emit_pi(&self) -> ProcessResult<<Sink as TokenSink>::Handle> { let token = PIToken(Pi { target: replace(&mut *self.current_pi_target.borrow_mut(), StrTendril::new()), data: replace(&mut *self.current_pi_data.borrow_mut(), StrTendril::new()), }); - self.process_token(token); + self.process_token(token) } fn consume_char_ref(&self, addnl_allowed: Option<char>) { @@ -576,50 +585,45 @@ macro_rules! go ( // These can only come at the end.
- ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return true; }); - ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return true; }); - ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return true; }); + ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; }); + ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; }); + ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; }); ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); }); ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); }); ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); }); - ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return true; }); - ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return true; }); + ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; }); + ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; }); // We have a default next state after emitting a tag, but the sink can override. ( $me:ident : emit_tag $s:ident ) => ({ $me.state.set(states::$s); - $me.emit_current_tag(); - return true; + return $me.emit_current_tag(); }); // We have a special when dealing with empty and short tags in Xml ( $me:ident : emit_short_tag $s:ident ) => ({ $me.state.set(states::$s); - $me.emit_short_tag(); - return true; + return $me.emit_short_tag(); }); ( $me:ident : emit_empty_tag $s:ident ) => ({ $me.state.set(states::$s); - $me.emit_empty_tag(); - return true; + return $me.emit_empty_tag(); }); ( $me:ident : emit_start_tag $s:ident ) => ({ $me.state.set(states::$s); - $me.emit_start_tag(); - return true; + return $me.emit_start_tag(); }); ( $me:ident : emit_pi $s:ident ) => ({ $me.state.set(states::$s); - $me.emit_pi(); - return true; + return $me.emit_pi(); }); - ( $me:ident : eof ) => ({ $me.emit_eof(); return false; }); + ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Done; }); // If nothing else matched, it's a single command ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) ); @@ -631,23 +635,32 @@ // This is a macro because it can cause early return // from the function where it is used. macro_rules! get_char ( ($me:expr, $input:expr) => ( - unwrap_or_return!($me.get_char($input), false) + unwrap_or_return!($me.get_char($input), ProcessResult::Done) )); macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => ( - unwrap_or_return!($me.pop_except_from($input, $set), false) + unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Done) )); macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => ( - unwrap_or_return!($me.eat($input, $pat), false) + unwrap_or_return!($me.eat($input, $pat), ProcessResult::Done) )); +/// The result of a single tokenization operation +pub enum ProcessResult<Handle> { + /// The tokenizer needs more input before it can continue + Done, + /// The tokenizer can be invoked again immediately + Continue, + /// The tokenizer encountered a script element that must be executed + /// before tokenization can continue + Script(Handle), +} + impl<Sink: TokenSink> XmlTokenizer<Sink> { // Run the state machine for a while.
- // Return true if we should be immediately re-invoked - // (this just simplifies control flow vs. break / continue). #[allow(clippy::never_loop)] - fn step(&self, input: &BufferQueue) -> bool { + fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> { if self.char_ref_tokenizer.borrow().is_some() { return self.step_char_ref_tokenizer(input); } @@ -656,7 +669,7 @@ match self.state.get() { XmlState::Quiescent => { self.state.set(XmlState::Data); - false + ProcessResult::Done }, //§ data-state XmlState::Data => loop { @@ -1100,10 +1113,12 @@ // Process all remaining buffered input. // If we're waiting for lookahead, we're not gonna get it. self.at_eof.set(true); - self.run(&input); + let _ = self.run(&input); - while self.eof_step() { - // loop + loop { + if !matches!(self.eof_step(), ProcessResult::Continue) { + break; + } } self.sink.end(); @@ -1145,7 +1160,7 @@ } } - fn eof_step(&self) -> bool { + fn eof_step(&self) -> ProcessResult<Sink::Handle> { debug!("processing EOF in state {:?}", self.state.get()); match self.state.get() { XmlState::Data | XmlState::Quiescent => go!(self: eof), @@ -1220,18 +1235,18 @@ } } - fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> bool { + fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> { let mut tok = self.char_ref_tokenizer.take().unwrap(); let outcome = tok.step(self, input); let progress = match outcome { char_ref::Done => { self.process_char_ref(tok.get_result()); - return true; + return ProcessResult::Continue; }, - char_ref::Stuck => false, - char_ref::Progress => true, + char_ref::Stuck => ProcessResult::Done, + char_ref::Progress => ProcessResult::Continue, }; *self.char_ref_tokenizer.borrow_mut() = Some(tok); @@ -1286,6 +1301,17 @@ } } +impl<Sink> InputSink for XmlTokenizer<Sink> +where + Sink: TokenSink, +{ + type Handle = Sink::Handle; + + fn feed(&self, input: &BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> { + iter::from_fn(|| self.feed(input).into()) + } +} + #[cfg(test)] mod test { diff --git a/xml5ever/src/tree_builder/mod.rs b/xml5ever/src/tree_builder/mod.rs index 4c4e0c49..5274bb60 100644 --- a/xml5ever/src/tree_builder/mod.rs +++ b/xml5ever/src/tree_builder/mod.rs @@ -20,12 +20,11 @@ use std::collections::{BTreeMap, HashSet, VecDeque}; use std::fmt::{Debug, Error, Formatter}; use std::mem; -pub use self::interface::{ElemName, NextParserState, NodeOrText, Tracer, TreeSink}; +pub use self::interface::{ElemName, NodeOrText, Tracer, TreeSink}; use self::types::*; use crate::interface::{self, create_element, AppendNode, Attribute, QualName}; use crate::interface::{AppendText, ExpandedName}; -use crate::tokenizer::states::Quiescent; -use crate::tokenizer::{self, EndTag, StartTag, Tag, TokenSink}; +use crate::tokenizer::{self, EndTag, ProcessResult, StartTag, Tag, TokenSink}; use crate::tokenizer::{Doctype, EmptyTag, Pi, ShortTag}; use crate::{LocalName, Namespace, Prefix}; @@ -182,9 +181,6 @@ pub struct XmlTreeBuilder<Handle, Sink> { /// The document node, which is created by the sink. doc_handle: Handle, - /// Next state change for the tokenizer, if any. - next_tokenizer_state: Cell<Option<tokenizer::states::XmlState>>, - /// Stack of open elements, most recently added at end.
open_elems: RefCell<Vec<Handle>>, @@ -214,7 +210,6 @@ where _opts: opts, sink, doc_handle, - next_tokenizer_state: Cell::new(None), open_elems: RefCell::new(vec![]), curr_elem: RefCell::new(None), namespace_stack: RefCell::new(NamespaceMapStack::new()), @@ -376,7 +371,10 @@ where } } - fn process_to_completion(&self, mut token: Token) { + fn process_to_completion( + &self, + mut token: Token, + ) -> ProcessResult<<Self as TokenSink>::Handle> { // Queue of additional tokens yet to be processed. // This stays empty in the common case where we don't split whitespace. let mut more_tokens = VecDeque::new(); @@ -386,13 +384,17 @@ #[allow(clippy::unused_unit)] match self.step(phase, token) { - Done => { - token = unwrap_or_return!(more_tokens.pop_front(), ()); + XmlProcessResult::Done => { + token = unwrap_or_return!(more_tokens.pop_front(), ProcessResult::Continue); }, - Reprocess(m, t) => { + XmlProcessResult::Reprocess(m, t) => { self.phase.set(m); token = t; }, + XmlProcessResult::Script(node) => { + assert!(more_tokens.is_empty()); + return ProcessResult::Script(node); + }, } } } @@ -403,12 +405,14 @@ where Handle: Clone, Sink: TreeSink<Handle = Handle>, { - fn process_token(&self, token: tokenizer::Token) { + type Handle = Handle; + + fn process_token(&self, token: tokenizer::Token) -> ProcessResult<Handle> { // Handle `ParseError` and `DoctypeToken`; convert everything else to the local `Token` type. let token = match token { tokenizer::ParseError(e) => { self.sink.parse_error(e); - return; + return ProcessResult::Done; }, tokenizer::DoctypeToken(d) => Doctype(d), @@ -420,7 +424,7 @@ tokenizer::CharacterTokens(x) => Characters(x), }; - self.process_to_completion(token); + self.process_to_completion(token) } fn end(&self) { @@ -428,10 +432,6 @@ where self.sink.pop(&node); } } - - fn query_state_change(&self) -> Option<tokenizer::states::XmlState> { - self.next_tokenizer_state.take() - } } fn current_node(open_elems: &[Handle]) -> &Handle { @@ -456,13 +456,13 @@ where self.sink.append(target, child); } - fn insert_tag(&self, tag: Tag) -> XmlProcessResult { + fn insert_tag(&self, tag: Tag) -> XmlProcessResult<Handle> { let child = create_element(&self.sink, tag.name, tag.attrs); self.insert_appropriately(AppendNode(child.clone())); self.add_to_open_elems(child) } - fn append_tag(&self, tag: Tag) -> XmlProcessResult { + fn append_tag(&self, tag: Tag) -> XmlProcessResult<Handle> { let child = create_element(&self.sink, tag.name, tag.attrs); self.insert_appropriately(AppendNode(child.clone())); self.sink.pop(&child); @@ -477,19 +477,19 @@ child } - fn add_to_open_elems(&self, el: Handle) -> XmlProcessResult { + fn add_to_open_elems(&self, el: Handle) -> XmlProcessResult<Handle> { self.open_elems.borrow_mut().push(el); Done } - fn append_comment_to_doc(&self, text: StrTendril) -> XmlProcessResult { + fn append_comment_to_doc(&self, text: StrTendril) -> XmlProcessResult<Handle> { let comment = self.sink.create_comment(text); self.sink.append(&self.doc_handle, AppendNode(comment)); Done } - fn append_comment_to_tag(&self, text: StrTendril) -> XmlProcessResult { + fn append_comment_to_tag(&self, text: StrTendril) -> XmlProcessResult<Handle> { let open_elems = self.open_elems.borrow(); let target = current_node(&open_elems); let comment = self.sink.create_comment(text); @@ -497,7 +497,7 @@ Done } - fn append_doctype_to_doc(&self, doctype: Doctype) -> XmlProcessResult { + fn append_doctype_to_doc(&self, doctype: Doctype) -> XmlProcessResult<Handle> { fn get_tendril(opt: Option<StrTendril>) -> StrTendril { match opt { Some(expr) => expr, @@ -512,13 +512,13 @@ Done } - fn append_pi_to_doc(&self, pi: Pi) ->
XmlProcessResult { + fn append_pi_to_doc(&self, pi: Pi) -> XmlProcessResult<Handle> { let pi = self.sink.create_pi(pi.target, pi.data); self.sink.append(&self.doc_handle, AppendNode(pi)); Done } - fn append_pi_to_tag(&self, pi: Pi) -> XmlProcessResult { + fn append_pi_to_tag(&self, pi: Pi) -> XmlProcessResult<Handle> { let open_elems = self.open_elems.borrow(); let target = current_node(&open_elems); let pi = self.sink.create_pi(pi.target, pi.data); @@ -526,7 +526,7 @@ Done } - fn append_text(&self, chars: StrTendril) -> XmlProcessResult { + fn append_text(&self, chars: StrTendril) -> XmlProcessResult<Handle> { self.insert_appropriately(AppendText(chars)); Done } @@ -538,8 +538,7 @@ .any(|a| self.sink.elem_name(a).expanded() == tag.name.expanded()) } - // Pop elements until an element from the set has been popped. Returns the - // number of elements popped. + // Pop elements until an element from the set has been popped. fn pop_until<P>
(&self, pred: P) where P: Fn(ExpandedName) -> bool, { @@ -560,7 +559,7 @@ set(self.sink.elem_name(&self.current_node()).expanded()) } - fn close_tag(&self, tag: Tag) -> XmlProcessResult { + fn close_tag(&self, tag: Tag) -> XmlProcessResult<Handle> { debug!( "Close tag: current_node.name {:?} \n Current tag {:?}", self.sink.elem_name(&self.current_node()), @@ -597,18 +596,10 @@ node } - fn stop_parsing(&self) -> XmlProcessResult { + fn stop_parsing(&self) -> XmlProcessResult<Handle> { warn!("stop_parsing for XML5 not implemented, full speed ahead!"); Done } - - fn complete_script(&self) { - let open_elems = self.open_elems.borrow(); - let current = current_node(&open_elems); - if self.sink.complete_script(current) == NextParserState::Suspend { - self.next_tokenizer_state.set(Some(Quiescent)); - } - } } fn any_not_whitespace(x: &StrTendril) -> bool { @@ -622,7 +613,7 @@ where Handle: Clone, Sink: TreeSink<Handle = Handle>, { - fn step(&self, mode: XmlPhase, token: Token) -> XmlProcessResult { + fn step(&self, mode: XmlPhase, token: Token) -> XmlProcessResult<<Self as TokenSink>::Handle> { self.debug_step(mode, &token); match mode { @@ -716,8 +707,9 @@ }; if tag.name.local == local_name!("script") { self.insert_tag(tag.clone()); - self.complete_script(); - self.close_tag(tag) + let script = current_node(&self.open_elems.borrow()).clone(); + self.close_tag(tag); + XmlProcessResult::Script(script) } else { self.append_tag(tag) } @@ -737,7 +729,12 @@ tag }; if tag.name.local == local_name!("script") { - self.complete_script(); + let script = current_node(&self.open_elems.borrow()).clone(); + self.close_tag(tag); + if self.no_open_elems() { + self.phase.set(End); + } + return XmlProcessResult::Script(script); } let retval = self.close_tag(tag); if self.no_open_elems() { diff --git a/xml5ever/src/tree_builder/types.rs b/xml5ever/src/tree_builder/types.rs index 4c031abe..fbc9c0ac 100644 --- a/xml5ever/src/tree_builder/types.rs +++ b/xml5ever/src/tree_builder/types.rs @@ -34,7 +34,8 @@ pub enum Token { Eof, } -pub enum XmlProcessResult { +pub enum XmlProcessResult<Handle> { Done, Reprocess(XmlPhase, Token), + Script(Handle), }
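End to end, the new pieces compose roughly as follows. A sketch of byte-level parsing with restart handling — `parse_document` and its restart strategy are illustrative; the types and methods are the ones introduced in this diff:

```rust
use encoding_rs::UTF_8;
use markup5ever::{DecodingParser, InputSink, ParserAction};

// `sink` is any InputSink, e.g. an html5ever Tokenizer wrapping a TreeBuilder.
fn parse_document<Sink: InputSink>(sink: Sink, bytes: &[u8]) {
    // Confidence starts out Tentative, so a later <meta charset> may override UTF-8.
    let parser = DecodingParser::new(sink, UTF_8);
    parser.input_stream().append_bytes(bytes);
    parser.input_stream().finish_decoding_input();

    for action in parser.parse() {
        match action {
            ParserAction::HandleScript(_script) => {
                // Execute the script before pulling further tokens.
            },
            ParserAction::StartOverWithEncoding(encoding) => {
                // The sniffed encoding was wrong: drop the tree built so far,
                // then re-run this function decoding `bytes` with `encoding`.
                let _ = encoding;
                return;
            },
        }
    }
}
```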