Skip to content

Commit f4d8f88

Browse files
committed
Implement a decoding tokenizer
Signed-off-by: Simon Wülker <[email protected]>
1 parent 31a2c31 commit f4d8f88

File tree

17 files changed

+468
-36
lines changed

17 files changed

+468
-36
lines changed

html5ever/Cargo.toml

+3
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ readme = "../README.md"
1313
rust-version.workspace = true
1414

1515
[features]
16+
default = ["encoding"]
1617
trace_tokenizer = []
18+
encoding = ["dep:encoding_rs", "markup5ever/encoding"]
1719

1820
[dependencies]
1921
log = "0.4"
2022
mac = "0.1"
2123
markup5ever = { version = "0.15", path = "../markup5ever" }
2224
match_token = { workspace = true }
25+
encoding_rs = { version = "0.8", optional = true }
2326

2427
[dev-dependencies]
2528
criterion = "0.5"

html5ever/examples/noop-tokenize.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ use std::cell::RefCell;
1515
use std::io;
1616

1717
use html5ever::tendril::*;
18-
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
18+
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
19+
use markup5ever::buffer_queue::BufferQueue;
1920

2021
/// In our case, our sink only contains a tokens vector
2122
struct Sink(RefCell<Vec<Token>>);

html5ever/src/driver.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99

1010
//! High-level interface to the parser.
1111
12-
use crate::buffer_queue::BufferQueue;
12+
use markup5ever::buffer_queue::BufferQueue;
13+
1314
use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
1415
use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
1516
use crate::{Attribute, QualName};

html5ever/src/tokenizer/char_ref/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
// except according to those terms.
99

1010
use super::{TokenSink, Tokenizer};
11-
use crate::buffer_queue::BufferQueue;
1211
use crate::data;
1312
use crate::tendril::StrTendril;
1413

1514
use log::debug;
1615
use mac::format_if;
16+
use markup5ever::buffer_queue::BufferQueue;
1717
use std::borrow::Cow::Borrowed;
1818
use std::char::from_u32;
1919

html5ever/src/tokenizer/interface.rs

+2
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> {
7777
Script(Handle),
7878
Plaintext,
7979
RawData(states::RawKind),
80+
#[cfg(feature = "encoding")]
81+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
8082
}
8183

8284
/// Types which can receive tokens from the tokenizer.

html5ever/src/tokenizer/mod.rs

+47-8
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,16 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
2222
use self::char_ref::{CharRef, CharRefTokenizer};
2323

2424
use crate::util::str::lower_ascii_letter;
25-
2625
use log::{debug, trace};
2726
use mac::format_if;
28-
use markup5ever::{namespace_url, ns, small_char_set};
27+
use markup5ever::buffer_queue::BufferQueue;
28+
use markup5ever::{namespace_url, ns, small_char_set, InputSink, InputSinkResult};
2929
use std::borrow::Cow::{self, Borrowed};
3030
use std::cell::{Cell, RefCell, RefMut};
3131
use std::collections::BTreeMap;
32-
use std::mem;
32+
use std::{iter, mem};
3333

34-
pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
34+
pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
3535
use crate::tendril::StrTendril;
3636
use crate::{Attribute, LocalName, QualName, SmallCharSet};
3737

@@ -43,13 +43,17 @@ pub enum ProcessResult<Handle> {
4343
Continue,
4444
Suspend,
4545
Script(Handle),
46+
#[cfg(feature = "encoding")]
47+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
4648
}
4749

4850
/// Result handed back to the caller by the tokenizer's feed loop.
///
/// `#[must_use]`: the compiler warns if a caller discards this value.
#[must_use]
4951
#[derive(Debug)]
5052
pub enum TokenizerResult<Handle> {
5153
/// Nothing for the caller to act on.
/// NOTE(review): presumably means the available input was fully processed — confirm
/// against the body of `feed`, which is not shown here.
Done,
5254
/// Produced when a processing step yields `ProcessResult::Script(node)`; carries that
/// node handle.
Script(Handle),
55+
// The variant below exists only when the `encoding` feature is enabled, so every
// `match` on this enum must gate the corresponding arm the same way.
#[cfg(feature = "encoding")]
56+
/// Produced when a processing step yields
/// `ProcessResult::MaybeChangeEncodingAndStartOver(encoding)`; carries that encoding.
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
5357
}
5458

5559
fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -364,6 +368,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
364368
ProcessResult::Continue => (),
365369
ProcessResult::Suspend => break,
366370
ProcessResult::Script(node) => return TokenizerResult::Script(node),
371+
#[cfg(feature = "encoding")]
372+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
373+
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
374+
},
367375
}
368376
}
369377
} else {
@@ -372,6 +380,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
372380
ProcessResult::Continue => (),
373381
ProcessResult::Suspend => break,
374382
ProcessResult::Script(node) => return TokenizerResult::Script(node),
383+
#[cfg(feature = "encoding")]
384+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
385+
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
386+
},
375387
}
376388
}
377389
}
@@ -452,6 +464,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
452464
self.state.set(states::RawData(kind));
453465
ProcessResult::Continue
454466
},
467+
#[cfg(feature = "encoding")]
468+
TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => {
469+
ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
470+
},
455471
}
456472
}
457473

@@ -1455,6 +1471,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
14551471
ProcessResult::Continue => (),
14561472
ProcessResult::Suspend => break,
14571473
ProcessResult::Script(_) => unreachable!(),
1474+
#[cfg(feature = "encoding")]
1475+
ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
14581476
}
14591477
}
14601478

@@ -1582,13 +1600,36 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
15821600
}
15831601
}
15841602

1603+
impl<Sink> InputSink for Tokenizer<Sink>
1604+
where
1605+
Sink: TokenSink,
1606+
{
1607+
type Handle = Sink::Handle;
1608+
1609+
fn feed(&self, input: &BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> {
1610+
iter::from_fn(|| self.feed(input).into())
1611+
}
1612+
}
1613+
1614+
impl<Handle> From<TokenizerResult<Handle>> for Option<InputSinkResult<Handle>> {
1615+
fn from(value: TokenizerResult<Handle>) -> Self {
1616+
match value {
1617+
TokenizerResult::Script(handle) => Some(InputSinkResult::HandleScript(handle)),
1618+
TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) => {
1619+
Some(InputSinkResult::MaybeStartOverWithEncoding(encoding))
1620+
},
1621+
TokenizerResult::Done => None,
1622+
}
1623+
}
1624+
}
1625+
15851626
#[cfg(test)]
15861627
#[allow(non_snake_case)]
15871628
mod test {
15881629
use super::option_push; // private items
1589-
use crate::tendril::{SliceExt, StrTendril};
1590-
15911630
use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
1631+
use crate::tendril::{SliceExt, StrTendril};
1632+
use crate::LocalName;
15921633

15931634
use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
15941635
use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -1597,8 +1638,6 @@ mod test {
15971638
use markup5ever::buffer_queue::BufferQueue;
15981639
use std::cell::RefCell;
15991640

1600-
use crate::LocalName;
1601-
16021641
// LinesMatch implements the TokenSink trait. It is used for testing to see
16031642
// if current_line is being updated when process_token is called. The lines
16041643
// vector is a collection of the line numbers that each token is on.

html5ever/src/tree_builder/mod.rs

+4
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,10 @@ where
394394
assert!(more_tokens.is_empty());
395395
return tokenizer::TokenSinkResult::RawData(k);
396396
},
397+
#[cfg(feature = "encoding")]
398+
MaybeChangeEncodingAndStartOver(encoding) => {
399+
return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
400+
},
397401
}
398402
}
399403
}

html5ever/src/tree_builder/rules.rs

+25-9
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,24 @@
1010
// The tree builder rules, as a single, enormous nested match expression.
1111

1212
use crate::interface::Quirks;
13-
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
13+
use crate::tokenizer::states::{Rawtext, Rcdata};
1414
use crate::tokenizer::TagKind::{EndTag, StartTag};
1515
use crate::tree_builder::tag_sets::*;
1616
use crate::tree_builder::types::*;
17-
use crate::tree_builder::{
18-
create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
19-
TreeSink,
20-
};
21-
use crate::QualName;
22-
use markup5ever::{expanded_name, local_name, namespace_url, ns};
17+
use crate::tree_builder::RawKind::ScriptData;
18+
use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};
19+
20+
use markup5ever::interface::create_element;
21+
use markup5ever::interface::NodeOrText::AppendNode;
22+
use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
2323
use std::borrow::Cow::Borrowed;
2424

2525
use crate::tendril::SliceExt;
2626
use match_token::match_token;
2727

28+
#[cfg(feature = "encoding")]
29+
use encoding_rs::Encoding;
30+
2831
fn any_not_whitespace(x: &StrTendril) -> bool {
2932
// FIXME: this might be much faster as a byte scan
3033
x.chars().any(|c| !c.is_ascii_whitespace())
@@ -113,8 +116,21 @@ where
113116

114117
<html> => self.step(InBody, token),
115118

116-
tag @ <base> <basefont> <bgsound> <link> <meta> => {
117-
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
119+
tag @ <meta> => {
    // FIXME: handle <meta http-equiv="Content-Type">

    // With the `encoding` feature, a `charset` attribute whose value is a known
    // encoding label makes this step insert the element and then report
    // `MaybeChangeEncodingAndStartOver` instead of acknowledging the tag.
    //
    // Attributes coming from the HTML tokenizer live in the empty namespace, so the
    // lookup has to use `ns!()`; comparing against `ns!(html)` never matches and the
    // encoding change would never be requested.
    #[cfg(feature = "encoding")]
    if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(), local_name!("charset"))) {
        if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) {
            self.insert_and_pop_element_for(tag);
            return MaybeChangeEncodingAndStartOver(encoding);
        }
    }

    // No usable charset declaration (or the feature is disabled): treat it like any
    // other void metadata element.
    self.insert_and_pop_element_for(tag);
    DoneAckSelfClosing
},
132+
133+
tag @ <base> <basefont> <bgsound> <link> => {
118134
self.insert_and_pop_element_for(tag);
119135
DoneAckSelfClosing
120136
}

html5ever/src/tree_builder/types.rs

+2
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ pub(crate) enum ProcessResult<Handle> {
7777
Script(Handle),
7878
ToPlaintext,
7979
ToRawData(RawKind),
80+
#[cfg(feature = "encoding")]
81+
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
8082
}
8183

8284
pub(crate) enum FormatEntry<Handle> {

markup5ever/Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,15 @@ rust-version.workspace = true
1414
[lib]
1515
path = "lib.rs"
1616

17+
[features]
18+
encoding = ["dep:encoding_rs"]
19+
1720
[dependencies]
1821
string_cache = "0.8"
1922
phf = "0.11"
2023
tendril = "0.4"
2124
log = "0.4"
25+
encoding_rs = { version = "0.8", optional = true }
2226

2327
[build-dependencies]
2428
string_cache_codegen = "0.5.4"

0 commit comments

Comments
 (0)