Handle decoding of input in html5ever #590

Draft · wants to merge 2 commits into main
3 changes: 3 additions & 0 deletions html5ever/Cargo.toml
@@ -13,13 +13,16 @@ readme = "../README.md"
rust-version.workspace = true

[features]
default = ["encoding"]
trace_tokenizer = []
encoding = ["dep:encoding_rs", "markup5ever/encoding"]

[dependencies]
log = "0.4"
mac = "0.1"
markup5ever = { version = "0.15", path = "../markup5ever" }
match_token = { workspace = true }
encoding_rs = { version = "0.8", optional = true }

[dev-dependencies]
criterion = "0.5"
3 changes: 2 additions & 1 deletion html5ever/examples/noop-tokenize.rs
@@ -15,7 +15,8 @@ use std::cell::RefCell;
use std::io;

use html5ever::tendril::*;
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
use markup5ever::buffer_queue::BufferQueue;

/// In our case, our sink only contains a tokens vector
struct Sink(RefCell<Vec<Token>>);
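Note for reviewers: both tokenizer examples (this one and tokenize.rs below) now import BufferQueue from markup5ever::buffer_queue rather than through the html5ever::tokenizer re-export that this diff trims. A minimal sketch of the updated feed pattern, assuming the current &self-based Tokenizer::feed API and default options; the tokenize_str helper name is illustrative, not part of this diff:

```rust
use html5ever::tendril::SliceExt;
use html5ever::tokenizer::{TokenSink, Tokenizer, TokenizerOpts};
use markup5ever::buffer_queue::BufferQueue;

// Feed a single in-memory chunk through the tokenizer and flush it.
fn tokenize_str<S: TokenSink>(sink: S, html: &str) {
    let tokenizer = Tokenizer::new(sink, TokenizerOpts::default());
    let input = BufferQueue::default();
    input.push_back(html.to_tendril());
    let _ = tokenizer.feed(&input); // TokenizerResult is #[must_use]
    tokenizer.end();
}
```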
2 changes: 1 addition & 1 deletion html5ever/examples/tokenize.rs
@@ -13,11 +13,11 @@ use std::cell::Cell;
use std::io;

use html5ever::tendril::*;
use html5ever::tokenizer::BufferQueue;
use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
use html5ever::tokenizer::{
ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};
use markup5ever::buffer_queue::BufferQueue;

#[derive(Clone)]
struct TokenPrinter {
4 changes: 2 additions & 2 deletions html5ever/src/driver.rs
@@ -10,10 +10,10 @@
//! High-level interface to the parser.

use crate::buffer_queue::BufferQueue;
use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
use crate::tokenizer::{Tokenizer, TokenizerOpts};
use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
use crate::{Attribute, QualName};

use markup5ever::TokenizerResult;
use std::borrow::Cow;

use crate::tendril;
2 changes: 1 addition & 1 deletion html5ever/src/tokenizer/char_ref/mod.rs
@@ -8,12 +8,12 @@
// except according to those terms.

use super::{TokenSink, Tokenizer};
use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;

use log::debug;
use mac::format_if;
use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;

2 changes: 2 additions & 0 deletions html5ever/src/tokenizer/interface.rs
@@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> {
Script(Handle),
Plaintext,
RawData(states::RawKind),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

/// Types which can receive tokens from the tokenizer.
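For sink implementors, the new variant is the hook for asking the driver to re-decode the input and restart. A hedged sketch of a sink that would use it; the sniff_charset helper is hypothetical, and the variant only exists with the default "encoding" feature enabled:

```rust
use encoding_rs::Encoding;
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult};

// Hypothetical helper: pull a charset out of a token, e.g. from <meta charset=...>.
fn sniff_charset(_token: &Token) -> Option<&'static Encoding> {
    None // real sniffing elided in this sketch
}

struct EncodingSniffer;

impl TokenSink for EncodingSniffer {
    type Handle = ();

    fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
        if let Some(encoding) = sniff_charset(&token) {
            // Ask the tokenizer (and ultimately the caller) to start over
            // with the detected encoding.
            return TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
        }
        TokenSinkResult::Continue
    }
}
```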
50 changes: 35 additions & 15 deletions html5ever/src/tokenizer/mod.rs
@@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
use self::char_ref::{CharRef, CharRefTokenizer};

use crate::util::str::lower_ascii_letter;

use log::{debug, trace};
use mac::format_if;
use markup5ever::{namespace_url, ns, small_char_set};
use markup5ever::{
buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult,
TokenizerResult,
};
use std::borrow::Cow::{self, Borrowed};
use std::cell::{Cell, RefCell, RefMut};
use std::collections::BTreeMap;
use std::mem;
use std::{iter, mem};

pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
use crate::tendril::StrTendril;
use crate::{Attribute, LocalName, QualName, SmallCharSet};

@@ -43,13 +45,8 @@ pub enum ProcessResult<Handle> {
Continue,
Suspend,
Script(Handle),
}

#[must_use]
#[derive(Debug)]
pub enum TokenizerResult<Handle> {
Done,
Script(Handle),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -364,6 +361,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}
} else {
@@ -372,6 +373,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}
}
@@ -452,6 +457,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.state.set(states::RawData(kind));
ProcessResult::Continue
},
#[cfg(feature = "encoding")]
TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => {
ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}

@@ -1455,6 +1464,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(_) => unreachable!(),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
}
}

@@ -1582,13 +1593,24 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
}

impl<Sink> InputSink for Tokenizer<Sink>
where
Sink: TokenSink,
{
type Handle = Sink::Handle;

fn feed(&self, input: &BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> {
iter::from_fn(|| self.feed(input).into())
}
}

#[cfg(test)]
#[allow(non_snake_case)]
mod test {
use super::option_push; // private items
use crate::tendril::{SliceExt, StrTendril};

use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use crate::tendril::{SliceExt, StrTendril};
use crate::LocalName;

use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -1597,8 +1619,6 @@ mod test {
use markup5ever::buffer_queue::BufferQueue;
use std::cell::RefCell;

use crate::LocalName;

// LinesMatch implements the TokenSink trait. It is used for testing to see
// if current_line is being updated when process_token is called. The lines
// vector is a collection of the line numbers that each token is on.
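Putting the tokenizer-level pieces together, the flow I'd expect a byte-level caller to implement is: decode with an initial encoding guess, feed, and on MaybeChangeEncodingAndStartOver discard the work, re-decode the original bytes, and parse again. A sketch under those assumptions (requires the default "encoding" feature; BOM sniffing and Script handling elided; the sink is rebuilt via Clone; tokenize_bytes is an illustrative helper, not part of this diff):

```rust
use encoding_rs::{Encoding, UTF_8};
use html5ever::tendril::SliceExt;
use html5ever::tokenizer::{TokenSink, Tokenizer, TokenizerOpts};
use markup5ever::buffer_queue::BufferQueue;
use markup5ever::TokenizerResult;

fn tokenize_bytes<S: TokenSink + Clone>(sink: S, bytes: &[u8]) {
    let mut encoding: &'static Encoding = UTF_8; // initial guess
    loop {
        // Re-decode the full input with the current guess.
        let (text, _actual, _had_errors) = encoding.decode(bytes);
        let tokenizer = Tokenizer::new(sink.clone(), TokenizerOpts::default());
        let input = BufferQueue::default();
        input.push_back(text.as_ref().to_tendril());
        match tokenizer.feed(&input) {
            TokenizerResult::MaybeChangeEncodingAndStartOver(new_encoding)
                if new_encoding != encoding =>
            {
                // Discard everything parsed so far and start over.
                encoding = new_encoding;
            },
            _ => {
                tokenizer.end();
                break;
            },
        }
    }
}
```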
8 changes: 5 additions & 3 deletions html5ever/src/tree_builder/mod.rs
@@ -9,9 +9,7 @@

//! The HTML5 tree builder.

pub use crate::interface::{
create_element, ElemName, ElementFlags, NextParserState, Tracer, TreeSink,
};
pub use crate::interface::{create_element, ElemName, ElementFlags, Tracer, TreeSink};
pub use crate::interface::{AppendNode, AppendText, Attribute, NodeOrText};
pub use crate::interface::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};

@@ -394,6 +392,10 @@ where
assert!(more_tokens.is_empty());
return tokenizer::TokenSinkResult::RawData(k);
},
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(encoding) => {
return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
},
}
}
}
34 changes: 25 additions & 9 deletions html5ever/src/tree_builder/rules.rs
@@ -10,21 +10,24 @@
// The tree builder rules, as a single, enormous nested match expression.

use crate::interface::Quirks;
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
use crate::tokenizer::states::{Rawtext, Rcdata};
use crate::tokenizer::TagKind::{EndTag, StartTag};
use crate::tree_builder::tag_sets::*;
use crate::tree_builder::types::*;
use crate::tree_builder::{
create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
TreeSink,
};
use crate::QualName;
use markup5ever::{expanded_name, local_name, namespace_url, ns};
use crate::tree_builder::RawKind::ScriptData;
use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};

use markup5ever::interface::create_element;
use markup5ever::interface::NodeOrText::AppendNode;
use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
use std::borrow::Cow::Borrowed;

use crate::tendril::SliceExt;
use match_token::match_token;

#[cfg(feature = "encoding")]
use encoding_rs::Encoding;

fn any_not_whitespace(x: &StrTendril) -> bool {
// FIXME: this might be much faster as a byte scan
x.chars().any(|c| !c.is_ascii_whitespace())
@@ -113,8 +116,21 @@ where

<html> => self.step(InBody, token),

tag @ <base> <basefont> <bgsound> <link> <meta> => {
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
tag @ <meta> => {
// FIXME: handle <meta http-equiv="Content-Type">
#[cfg(feature = "encoding")]
if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) {
if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) {
self.insert_and_pop_element_for(tag);
return MaybeChangeEncodingAndStartOver(encoding);
}
}

self.insert_and_pop_element_for(tag);
DoneAckSelfClosing
},

tag @ <base> <basefont> <bgsound> <link> => {
self.insert_and_pop_element_for(tag);
DoneAckSelfClosing
}
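Worth noting for reviewers: Encoding::for_label does the Encoding Standard label resolution, so the lookup is ASCII-case-insensitive, tolerant of surrounding whitespace, and returns None for unknown labels, in which case the meta tag falls through to the normal insert-and-pop path above. A small illustration of the encoding_rs behavior this relies on, as I understand the label rules:

```rust
use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};

fn main() {
    assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
    assert_eq!(Encoding::for_label(b"  UTF-8  "), Some(UTF_8)); // trimmed, case-insensitive
    assert_eq!(Encoding::for_label(b"latin1"), Some(WINDOWS_1252)); // legacy label
    assert_eq!(Encoding::for_label(b"not-a-charset"), None);
}
```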
2 changes: 2 additions & 0 deletions html5ever/src/tree_builder/types.rs
@@ -77,6 +77,8 @@ pub(crate) enum ProcessResult<Handle> {
Script(Handle),
ToPlaintext,
ToRawData(RawKind),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

pub(crate) enum FormatEntry<Handle> {
4 changes: 4 additions & 0 deletions markup5ever/Cargo.toml
@@ -14,11 +14,15 @@ rust-version.workspace = true
[lib]
path = "lib.rs"

[features]
encoding = ["dep:encoding_rs"]

[dependencies]
string_cache = "0.8"
phf = "0.11"
tendril = "0.4"
log = "0.4"
encoding_rs = { version = "0.8", optional = true }

[build-dependencies]
string_cache_codegen = "0.5.4"