Skip to content

Commit af87c8d

Browse files
authored
Add ascii fast path for unicode_word_indices and unicode_words (#147)
* add benchmark
* add ascii fastpath
* add test case IP
* add log to benches
* restore iterators
* add backwards iterator
* restore test
* replace Box with Enum
* add comments with reference to the spec
* remove unused alloc
* readd Debug derive
* use import
* remove pub
1 parent 9e3f88c commit af87c8d

File tree

8 files changed

+426
-42
lines changed

8 files changed

+426
-42
lines changed

Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
2424
[dev-dependencies]
2525
quickcheck = "0.7"
2626
criterion = "0.5"
27+
proptest = "1.7.0"
2728

2829
[[bench]]
2930
name = "chars"
@@ -36,3 +37,8 @@ harness = false
3637
[[bench]]
3738
name = "word_bounds"
3839
harness = false
40+
41+
[[bench]]
42+
name = "unicode_word_indices"
43+
harness = false
44+

benches/chars.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
4141
for file in FILES {
4242
group.bench_with_input(
4343
BenchmarkId::new("grapheme", file),
44-
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
44+
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
4545
|b, content| b.iter(|| grapheme(content)),
4646
);
4747
}
4848

4949
for file in FILES {
5050
group.bench_with_input(
5151
BenchmarkId::new("scalar", file),
52-
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
52+
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
5353
|b, content| b.iter(|| scalar(content)),
5454
);
5555
}

benches/texts/log.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later

benches/unicode_word_indices.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
2+
3+
use std::fs;
4+
use unicode_segmentation::UnicodeSegmentation;
5+
6+
/// Base names (no extension) of the text fixtures this benchmark runs over;
/// each is resolved as `benches/texts/{name}.txt` by `bench_all`.
/// Entries that are commented out are currently excluded from the run.
const FILES: &[&str] = &[
7+
"log", //"arabic",
8+
"english",
9+
//"hindi",
10+
"japanese",
11+
//"korean",
12+
//"mandarin",
13+
//"russian",
14+
//"source_code",
15+
];
16+
17+
/// Benchmark body: iterates every item produced by
/// `text.unicode_word_indices()` and hands each one to `black_box`, so the
/// optimizer cannot discard the iteration.
///
/// NOTE(review): despite the name, this walks word indices, not graphemes —
/// the name looks carried over from the sibling benches; confirm before renaming.
#[inline(always)]
18+
fn grapheme(text: &str) {
19+
for w in text.unicode_word_indices() {
20+
black_box(w);
21+
}
22+
}
23+
24+
/// Registers one benchmark per entry of `FILES` in the
/// "unicode_word_indices" group.
///
/// For each entry the fixture `benches/texts/{file}.txt` is read into a
/// `String`, the group's throughput is set to the input length in bytes, and
/// `grapheme` is measured against that input.
///
/// # Panics
///
/// Panics via `unwrap` if a fixture file cannot be read as a string.
///
/// NOTE(review): `group.finish()` is never called explicitly here; criterion
/// recommends calling it — confirm that relying on drop is intended.
fn bench_all(c: &mut Criterion) {
25+
let mut group = c.benchmark_group("unicode_word_indices");
26+
27+
for file in FILES {
28+
let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap();
29+
// Report results relative to the number of input bytes processed.
group.throughput(criterion::Throughput::Bytes(input.len() as u64));
30+
group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
31+
b.iter(|| grapheme(content))
32+
});
33+
}
34+
}
35+
36+
criterion_group!(benches, bench_all);
37+
criterion_main!(benches);

benches/word_bounds.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
2727
for file in FILES {
2828
group.bench_with_input(
2929
BenchmarkId::new("grapheme", file),
30-
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
30+
&fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(),
3131
|b, content| b.iter(|| grapheme(content)),
3232
);
3333
}

benches/words.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
4141
for file in FILES {
4242
group.bench_with_input(
4343
BenchmarkId::new("grapheme", file),
44-
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
44+
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
4545
|b, content| b.iter(|| grapheme(content)),
4646
);
4747
}
4848

4949
for file in FILES {
5050
group.bench_with_input(
5151
BenchmarkId::new("scalar", file),
52-
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
52+
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
5353
|b, content| b.iter(|| scalar(content)),
5454
);
5555
}

src/lib.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,16 @@
5656
)]
5757
#![no_std]
5858

59+
#[cfg(test)]
60+
extern crate std;
61+
5962
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
6063
pub use grapheme::{GraphemeIndices, Graphemes};
6164
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
6265
pub use tables::UNICODE_VERSION;
63-
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
66+
pub use word::{UWordBoundIndices, UWordBounds};
67+
68+
use crate::word::{UnicodeWordIndices, UnicodeWords};
6469

6570
mod grapheme;
6671
mod sentence;
@@ -248,7 +253,7 @@ pub trait UnicodeSegmentation {
248253

249254
impl UnicodeSegmentation for str {
250255
#[inline]
251-
fn graphemes(&self, is_extended: bool) -> Graphemes {
256+
fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
252257
grapheme::new_graphemes(self, is_extended)
253258
}
254259

@@ -258,32 +263,32 @@ impl UnicodeSegmentation for str {
258263
}
259264

260265
#[inline]
261-
fn unicode_words(&self) -> UnicodeWords {
266+
fn unicode_words(&self) -> UnicodeWords<'_> {
262267
word::new_unicode_words(self)
263268
}
264269

265270
#[inline]
266-
fn unicode_word_indices(&self) -> UnicodeWordIndices {
271+
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
267272
word::new_unicode_word_indices(self)
268273
}
269274

270275
#[inline]
271-
fn split_word_bounds(&self) -> UWordBounds {
276+
fn split_word_bounds(&self) -> UWordBounds<'_> {
272277
word::new_word_bounds(self)
273278
}
274279

275280
#[inline]
276-
fn split_word_bound_indices(&self) -> UWordBoundIndices {
281+
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
277282
word::new_word_bound_indices(self)
278283
}
279284

280285
#[inline]
281-
fn unicode_sentences(&self) -> UnicodeSentences {
286+
fn unicode_sentences(&self) -> UnicodeSentences<'_> {
282287
sentence::new_unicode_sentences(self)
283288
}
284289

285290
#[inline]
286-
fn split_sentence_bounds(&self) -> USentenceBounds {
291+
fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
287292
sentence::new_sentence_bounds(self)
288293
}
289294

0 commit comments

Comments
 (0)