Add ASCII fast path for unicode_word_indices and unicode_words (#147)

PSeitz-dd · web-flow · commit af87c8d331b8 · 2025-07-28T12:15:42.000+05:30
* add benchmark

* add ASCII fast path

* add test case IP

* add log to benches

* restore iterators

* add backwards iterator

* restore test

* replace Box with Enum

* add comments with reference to the spec

* remove unused alloc

* re-add Debug derive

* use import

* remove pub
diff --git a/Cargo.toml b/Cargo.toml
@@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
 [dev-dependencies]
 quickcheck = "0.7"
 criterion = "0.5"
+proptest = "1.7.0"
 
 [[bench]]
 name = "chars"
@@ -36,3 +37,8 @@ harness = false
 [[bench]]
 name = "word_bounds"
 harness = false
+
+[[bench]]
+name = "unicode_word_indices"
+harness = false
+
diff --git a/benches/chars.rs b/benches/chars.rs
@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
 
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("scalar", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| scalar(content)),
         );
     }
diff --git a/benches/texts/log.txt b/benches/texts/log.txt
@@ -0,0 +1 @@
+2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
diff --git a/benches/unicode_word_indices.rs b/benches/unicode_word_indices.rs
@@ -0,0 +1,37 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "log", //"arabic",
+    "english",
+    //"hindi",
+    "japanese",
+    //"korean",
+    //"mandarin",
+    //"russian",
+    //"source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for w in text.unicode_word_indices() {
+        black_box(w);
+    }
+}
+
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("unicode_word_indices");
+
+    for file in FILES {
+        let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap();
+        group.throughput(criterion::Throughput::Bytes(input.len() as u64));
+        group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
+            b.iter(|| grapheme(content))
+        });
+    }
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
@@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
diff --git a/benches/words.rs b/benches/words.rs
@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
 
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("scalar", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| scalar(content)),
         );
     }
diff --git a/src/lib.rs b/src/lib.rs
@@ -56,11 +56,16 @@
 )]
 #![no_std]
 
+#[cfg(test)]
+extern crate std;
+
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use grapheme::{GraphemeIndices, Graphemes};
 pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
 pub use tables::UNICODE_VERSION;
-pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
+pub use word::{UWordBoundIndices, UWordBounds};
+
+use crate::word::{UnicodeWordIndices, UnicodeWords};
 
 mod grapheme;
 mod sentence;
@@ -248,7 +253,7 @@ pub trait UnicodeSegmentation {
 
 impl UnicodeSegmentation for str {
     #[inline]
-    fn graphemes(&self, is_extended: bool) -> Graphemes {
+    fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
         grapheme::new_graphemes(self, is_extended)
     }
 
@@ -258,32 +263,32 @@ impl UnicodeSegmentation for str {
     }
 
     #[inline]
-    fn unicode_words(&self) -> UnicodeWords {
+    fn unicode_words(&self) -> UnicodeWords<'_> {
         word::new_unicode_words(self)
     }
 
     #[inline]
-    fn unicode_word_indices(&self) -> UnicodeWordIndices {
+    fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
         word::new_unicode_word_indices(self)
     }
 
     #[inline]
-    fn split_word_bounds(&self) -> UWordBounds {
+    fn split_word_bounds(&self) -> UWordBounds<'_> {
         word::new_word_bounds(self)
     }
 
     #[inline]
-    fn split_word_bound_indices(&self) -> UWordBoundIndices {
+    fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
         word::new_word_bound_indices(self)
     }
 
     #[inline]
-    fn unicode_sentences(&self) -> UnicodeSentences {
+    fn unicode_sentences(&self) -> UnicodeSentences<'_> {
         sentence::new_unicode_sentences(self)
     }
 
     #[inline]
-    fn split_sentence_bounds(&self) -> USentenceBounds {
+    fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
         sentence::new_sentence_bounds(self)
     }
 
diff --git a/src/word.rs b/src/word.rs

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+2018-07-12 13:59:01 UTC \| ERROR \| (worker.go:131 in process) \| Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later`
Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {`
`27`	`27`	`for file in FILES {`
`28`	`28`	`group.bench_with_input(`
`29`	`29`	`BenchmarkId::new("grapheme", file),`
`30`		`- &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),`
	`30`	`+ &fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(),`
`31`	`31`	`\|b, content\| b.iter(\|\| grapheme(content)),`
`32`	`32`	`);`
`33`	`33`	`}`