From a367365e22546684781fddb0eb6647aabb9971d0 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Tue, 14 Feb 2023 10:37:15 +0100 Subject: [PATCH 01/12] core: convert Pattern<'a> into Pattern<H: Haystack> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a Haystack trait describing something that can be searched in and make core::str::Pattern (and related types) generic on that trait. This will allow Pattern to be used for types other than str (most notably OsStr). This somewhat follows the Pattern API 2.0 design. While that design is apparently abandoned (?), it is somewhat helpful when going for patterns on OsStr, so I’m going with it unless someone tells me otherwise. ;) For now leave Pattern, Haystack et al in core::str::pattern. Since they are no longer str-specific, I’ll move them to core::pattern in future commit. This one leaves them in place to make the diff smaller. --- library/alloc/src/str.rs | 4 +- library/alloc/src/string.rs | 8 +- library/alloc/tests/str.rs | 4 +- library/core/src/str/iter.rs | 92 +++---- library/core/src/str/mod.rs | 55 ++-- library/core/src/str/pattern.rs | 248 ++++++++++++------ tests/rustdoc/async-fn.rs | 6 +- .../bound/assoc-fn-bound-root-obligation.rs | 2 +- .../assoc-fn-bound-root-obligation.stderr | 2 +- 9 files changed, 251 insertions(+), 170 deletions(-) diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 8497740990443..f09ebcc154d85 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -268,7 +268,7 @@ impl str { without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn replace<'a, P: Pattern<'a>>(&'a self, from: P, to: &str) -> String { + pub fn replace<'a, P: Pattern<&'a str>>(&'a self, from: P, to: &str) -> String { let mut result = String::new(); let mut last_end = 0; for (start, part) in self.match_indices(from) { @@ -308,7 +308,7 @@ impl str { #[must_use = "this returns the replaced string as a new 
allocation, \ without modifying the original"] #[stable(feature = "str_replacen", since = "1.16.0")] - pub fn replacen<'a, P: Pattern<'a>>(&'a self, pat: P, to: &str, count: usize) -> String { + pub fn replacen<'a, P: Pattern<&'a str>>(&'a self, pat: P, to: &str, count: usize) -> String { // Hope to reduce the times of re-allocation let mut result = String::with_capacity(32); let mut last_end = 0; diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index b9ef76c109abf..618f360c35c5c 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -1371,7 +1371,7 @@ impl String { #[unstable(feature = "string_remove_matches", reason = "new API", issue = "72826")] pub fn remove_matches<'a, P>(&'a mut self, pat: P) where - P: for<'x> Pattern<'x>, + P: for<'x> Pattern<&'x str>, { use core::str::pattern::Searcher; @@ -2174,10 +2174,10 @@ impl<'a> Extend> for String { reason = "API not fully fleshed out and ready to be stabilized", issue = "27721" )] -impl<'a, 'b> Pattern<'a> for &'b String { - type Searcher = <&'b str as Pattern<'a>>::Searcher; +impl<'a, 'b> Pattern<&'a str> for &'b String { + type Searcher = <&'b str as Pattern<&'a str>>::Searcher; - fn into_searcher(self, haystack: &'a str) -> <&'b str as Pattern<'a>>::Searcher { + fn into_searcher(self, haystack: &'a str) -> <&'b str as Pattern<&'a str>>::Searcher { self[..].into_searcher(haystack) } diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index c1dbbde08b6b9..12a05028fcd9e 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1891,7 +1891,7 @@ mod pattern { fn cmp_search_to_vec<'a>( rev: bool, - pat: impl Pattern<'a, Searcher: ReverseSearcher<'a>>, + pat: impl Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, haystack: &'a str, right: Vec, ) { @@ -2155,7 +2155,7 @@ fn different_str_pattern_forwarding_lifetimes() { fn foo<'a, P>(p: P) where - for<'b> &'b P: Pattern<'a>, + for<'b> &'b P: Pattern<&'a str>, { for _ in 0..3 { 
"asdf".find(&p); diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 772c3605562cf..22d6b49feab6f 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -361,7 +361,7 @@ macro_rules! derive_pattern_clone { (clone $t:ident with |$s:ident| $e:expr) => { impl<'a, P> Clone for $t<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { let $s = self; @@ -374,7 +374,7 @@ macro_rules! derive_pattern_clone { /// This macro generates two public iterator structs /// wrapping a private internal one that makes use of the `Pattern` API. /// -/// For all patterns `P: Pattern<'a>` the following items will be +/// For all patterns `P: Pattern<&'a str>` the following items will be /// generated (generics omitted): /// /// struct $forward_iterator($internal_iterator); @@ -434,12 +434,14 @@ macro_rules! generate_pattern_iterators { } => { $(#[$forward_iterator_attribute])* $(#[$common_stability_attribute])* - pub struct $forward_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + pub struct $forward_iterator<'a, P: Pattern<&'a str>>( + pub(super) $internal_iterator<'a, P> + ); $(#[$common_stability_attribute])* impl<'a, P> fmt::Debug for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple(stringify!($forward_iterator)) @@ -449,7 +451,7 @@ macro_rules! generate_pattern_iterators { } $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> Iterator for $forward_iterator<'a, P> { + impl<'a, P: Pattern<&'a str>> Iterator for $forward_iterator<'a, P> { type Item = $iterty; #[inline] @@ -461,7 +463,7 @@ macro_rules! 
generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Clone for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { $forward_iterator(self.0.clone()) @@ -470,12 +472,14 @@ macro_rules! generate_pattern_iterators { $(#[$reverse_iterator_attribute])* $(#[$common_stability_attribute])* - pub struct $reverse_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + pub struct $reverse_iterator<'a, P: Pattern<&'a str>>( + pub(super) $internal_iterator<'a, P> + ); $(#[$common_stability_attribute])* impl<'a, P> fmt::Debug for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple(stringify!($reverse_iterator)) @@ -487,7 +491,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Iterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { type Item = $iterty; @@ -500,7 +504,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Clone for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { $reverse_iterator(self.0.clone()) @@ -508,12 +512,12 @@ macro_rules! 
generate_pattern_iterators { } #[stable(feature = "fused", since = "1.26.0")] - impl<'a, P: Pattern<'a>> FusedIterator for $forward_iterator<'a, P> {} + impl<'a, P: Pattern<&'a str>> FusedIterator for $forward_iterator<'a, P> {} #[stable(feature = "fused", since = "1.26.0")] impl<'a, P> FusedIterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, {} generate_pattern_iterators!($($t)* with $(#[$common_stability_attribute])*, @@ -528,7 +532,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> DoubleEndedIterator for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { #[inline] fn next_back(&mut self) -> Option<$iterty> { @@ -539,7 +543,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> DoubleEndedIterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { #[inline] fn next_back(&mut self) -> Option<$iterty> { @@ -559,7 +563,7 @@ derive_pattern_clone! 
{ with |s| SplitInternal { matcher: s.matcher.clone(), ..*s } } -pub(super) struct SplitInternal<'a, P: Pattern<'a>> { +pub(super) struct SplitInternal<'a, P: Pattern<&'a str>> { pub(super) start: usize, pub(super) end: usize, pub(super) matcher: P::Searcher, @@ -569,7 +573,7 @@ pub(super) struct SplitInternal<'a, P: Pattern<'a>> { impl<'a, P> fmt::Debug for SplitInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SplitInternal") @@ -582,7 +586,7 @@ where } } -impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitInternal<'a, P> { #[inline] fn get_end(&mut self) -> Option<&'a str> { if !self.finished { @@ -639,7 +643,7 @@ impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { if self.finished { return None; @@ -676,7 +680,7 @@ impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { #[inline] fn next_back_inclusive(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { if self.finished { return None; @@ -746,7 +750,7 @@ generate_pattern_iterators! { delegate double ended; } -impl<'a, P: Pattern<'a>> Split<'a, P> { +impl<'a, P: Pattern<&'a str>> Split<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -769,7 +773,7 @@ impl<'a, P: Pattern<'a>> Split<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplit<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplit<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -810,7 +814,7 @@ generate_pattern_iterators! { delegate double ended; } -impl<'a, P: Pattern<'a>> SplitTerminator<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitTerminator<'a, P> { /// Returns remainder of the split string. 
/// /// If the iterator is empty, returns `None`. @@ -833,7 +837,7 @@ impl<'a, P: Pattern<'a>> SplitTerminator<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplitTerminator<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplitTerminator<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -861,7 +865,7 @@ derive_pattern_clone! { with |s| SplitNInternal { iter: s.iter.clone(), ..*s } } -pub(super) struct SplitNInternal<'a, P: Pattern<'a>> { +pub(super) struct SplitNInternal<'a, P: Pattern<&'a str>> { pub(super) iter: SplitInternal<'a, P>, /// The number of splits remaining pub(super) count: usize, @@ -869,7 +873,7 @@ pub(super) struct SplitNInternal<'a, P: Pattern<'a>> { impl<'a, P> fmt::Debug for SplitNInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SplitNInternal") @@ -879,7 +883,7 @@ where } } -impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitNInternal<'a, P> { #[inline] fn next(&mut self) -> Option<&'a str> { match self.count { @@ -898,7 +902,7 @@ impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { match self.count { 0 => None, @@ -937,7 +941,7 @@ generate_pattern_iterators! { delegate single ended; } -impl<'a, P: Pattern<'a>> SplitN<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitN<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -960,7 +964,7 @@ impl<'a, P: Pattern<'a>> SplitN<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplitN<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplitN<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -988,18 +992,18 @@ derive_pattern_clone! 
{ with |s| MatchIndicesInternal(s.0.clone()) } -pub(super) struct MatchIndicesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); +pub(super) struct MatchIndicesInternal<'a, P: Pattern<&'a str>>(pub(super) P::Searcher); impl<'a, P> fmt::Debug for MatchIndicesInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("MatchIndicesInternal").field(&self.0).finish() } } -impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> MatchIndicesInternal<'a, P> { #[inline] fn next(&mut self) -> Option<(usize, &'a str)> { self.0 @@ -1011,7 +1015,7 @@ impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<(usize, &'a str)> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { self.0 .next_match_back() @@ -1043,18 +1047,18 @@ derive_pattern_clone! { with |s| MatchesInternal(s.0.clone()) } -pub(super) struct MatchesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); +pub(super) struct MatchesInternal<'a, P: Pattern<&'a str>>(pub(super) P::Searcher); impl<'a, P> fmt::Debug for MatchesInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("MatchesInternal").field(&self.0).finish() } } -impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> MatchesInternal<'a, P> { #[inline] fn next(&mut self) -> Option<&'a str> { // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. @@ -1067,7 +1071,7 @@ impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. 
self.0.next_match_back().map(|(a, b)| unsafe { @@ -1213,7 +1217,7 @@ pub struct SplitAsciiWhitespace<'a> { /// /// [`split_inclusive`]: str::split_inclusive #[stable(feature = "split_inclusive", since = "1.51.0")] -pub struct SplitInclusive<'a, P: Pattern<'a>>(pub(super) SplitInternal<'a, P>); +pub struct SplitInclusive<'a, P: Pattern<&'a str>>(pub(super) SplitInternal<'a, P>); #[stable(feature = "split_whitespace", since = "1.1.0")] impl<'a> Iterator for SplitWhitespace<'a> { @@ -1335,7 +1339,7 @@ impl<'a> SplitAsciiWhitespace<'a> { } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str>> Iterator for SplitInclusive<'a, P> { type Item = &'a str; #[inline] @@ -1345,7 +1349,7 @@ impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SplitInclusive").field("0", &self.0).finish() } @@ -1353,14 +1357,14 @@ impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, // FIXME(#26925) Remove in favor of `#[derive(Clone)]` #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: Clone>> Clone for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str, Searcher: Clone>> Clone for SplitInclusive<'a, P> { fn clone(&self) -> Self { SplitInclusive(self.0.clone()) } } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator +impl<'a, P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>> DoubleEndedIterator for SplitInclusive<'a, P> { #[inline] @@ -1370,9 +1374,9 @@ impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator 
} #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a>> FusedIterator for SplitInclusive<'a, P> {} +impl<'a, P: Pattern<&'a str>> FusedIterator for SplitInclusive<'a, P> {} -impl<'a, P: Pattern<'a>> SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitInclusive<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 66fa9cf6f64c0..92d970b08245c 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -1060,7 +1060,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pub fn contains<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> bool { pat.is_contained_in(self) } @@ -1086,7 +1086,7 @@ impl str { /// assert!(!bananas.starts_with("nana")); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - pub fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pub fn starts_with<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> bool { pat.is_prefix_of(self) } @@ -1114,7 +1114,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] pub fn ends_with<'a, P>(&'a self, pat: P) -> bool where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { pat.is_suffix_of(self) } @@ -1163,7 +1163,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { + pub fn find<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Option { pat.into_searcher(self).next_match().map(|(i, _)| i) } @@ -1211,7 +1211,7 @@ impl str { #[inline] pub fn rfind<'a, P>(&'a self, pat: P) -> Option where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { pat.into_searcher(self).next_match_back().map(|(i, _)| i) } @@ -1331,7 +1331,7 @@ impl str { /// 
[`split_whitespace`]: str::split_whitespace #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> { + pub fn split<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Split<'a, P> { Split(SplitInternal { start: 0, end: self.len(), @@ -1371,7 +1371,7 @@ impl str { /// ``` #[stable(feature = "split_inclusive", since = "1.51.0")] #[inline] - pub fn split_inclusive<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitInclusive<'a, P> { + pub fn split_inclusive<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> SplitInclusive<'a, P> { SplitInclusive(SplitInternal { start: 0, end: self.len(), @@ -1428,7 +1428,7 @@ impl str { #[inline] pub fn rsplit<'a, P>(&'a self, pat: P) -> RSplit<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplit(self.split(pat).0) } @@ -1477,7 +1477,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> { + pub fn split_terminator<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> SplitTerminator<'a, P> { SplitTerminator(SplitInternal { allow_trailing_empty: false, ..self.split(pat).0 }) } @@ -1525,7 +1525,7 @@ impl str { #[inline] pub fn rsplit_terminator<'a, P>(&'a self, pat: P) -> RSplitTerminator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplitTerminator(self.split_terminator(pat).0) } @@ -1578,7 +1578,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn splitn<'a, P: Pattern<'a>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { + pub fn splitn<'a, P: Pattern<&'a str>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { SplitN(SplitNInternal { iter: self.split(pat).0, count: n }) } @@ -1629,7 +1629,7 @@ impl str { #[inline] pub fn rsplitn<'a, P>(&'a self, n: usize, pat: P) -> RSplitN<'a, P> where - P: 
Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplitN(self.splitn(n, pat).0) } @@ -1647,7 +1647,10 @@ impl str { /// ``` #[stable(feature = "str_split_once", since = "1.52.0")] #[inline] - pub fn split_once<'a, P: Pattern<'a>>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> { + pub fn split_once<'a, P: Pattern<&'a str>>( + &'a self, + delimiter: P, + ) -> Option<(&'a str, &'a str)> { let (start, end) = delimiter.into_searcher(self).next_match()?; // SAFETY: `Searcher` is known to return valid indices. unsafe { Some((self.get_unchecked(..start), self.get_unchecked(end..))) } @@ -1667,7 +1670,7 @@ impl str { #[inline] pub fn rsplit_once<'a, P>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { let (start, end) = delimiter.into_searcher(self).next_match_back()?; // SAFETY: `Searcher` is known to return valid indices. 
@@ -1707,7 +1710,7 @@ impl str { /// ``` #[stable(feature = "str_matches", since = "1.2.0")] #[inline] - pub fn matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> Matches<'a, P> { + pub fn matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Matches<'a, P> { Matches(MatchesInternal(pat.into_searcher(self))) } @@ -1745,7 +1748,7 @@ impl str { #[inline] pub fn rmatches<'a, P>(&'a self, pat: P) -> RMatches<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RMatches(self.matches(pat).0) } @@ -1789,7 +1792,7 @@ impl str { /// ``` #[stable(feature = "str_match_indices", since = "1.5.0")] #[inline] - pub fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { + pub fn match_indices<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> MatchIndices<'a, P> { MatchIndices(MatchIndicesInternal(pat.into_searcher(self))) } @@ -1833,7 +1836,7 @@ impl str { #[inline] pub fn rmatch_indices<'a, P>(&'a self, pat: P) -> RMatchIndices<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RMatchIndices(self.match_indices(pat).0) } @@ -2050,7 +2053,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] pub fn trim_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { let mut i = 0; let mut j = 0; @@ -2097,7 +2100,7 @@ impl str { #[must_use = "this returns the trimmed string as a new slice, \ without modifying the original"] #[stable(feature = "trim_direction", since = "1.30.0")] - pub fn trim_start_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + pub fn trim_start_matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> &'a str { let mut i = self.len(); let mut matcher = pat.into_searcher(self); if let Some((a, _)) = matcher.next_reject() { @@ -2130,7 +2133,7 @@ impl str { #[must_use = "this returns the 
remaining substring as a new slice, \ without modifying the original"] #[stable(feature = "str_strip", since = "1.45.0")] - pub fn strip_prefix<'a, P: Pattern<'a>>(&'a self, prefix: P) -> Option<&'a str> { + pub fn strip_prefix<'a, P: Pattern<&'a str>>(&'a self, prefix: P) -> Option<&'a str> { prefix.strip_prefix_of(self) } @@ -2159,8 +2162,8 @@ impl str { #[stable(feature = "str_strip", since = "1.45.0")] pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a str> where - P: Pattern<'a>, -

<P as Pattern<'a>>::Searcher: ReverseSearcher<'a>, + P: Pattern<&'a str>, +

<P as Pattern<&'a str>>::Searcher: ReverseSearcher<&'a str>, { suffix.strip_suffix_of(self) } @@ -2203,7 +2206,7 @@ impl str { #[stable(feature = "trim_direction", since = "1.30.0")] pub fn trim_end_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { let mut j = 0; let mut matcher = pat.into_searcher(self); @@ -2247,7 +2250,7 @@ impl str { note = "superseded by `trim_start_matches`", suggestion = "trim_start_matches" )] - pub fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + pub fn trim_left_matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> &'a str { self.trim_start_matches(pat) } @@ -2292,7 +2295,7 @@ impl str { )] pub fn trim_right_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { self.trim_end_matches(pat) } diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index e3a464a1c51a9..9f6eec256bbe0 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -41,17 +41,18 @@ use crate::cmp; use crate::cmp::Ordering; use crate::fmt; +use crate::ops::Range; use crate::slice::memchr; // Pattern -/// A string pattern. +/// A pattern which can be matched against a [`Haystack`]. /// -/// A `Pattern<'a>` expresses that the implementing type -/// can be used as a string pattern for searching in a [`&'a str`][str]. +/// A `Pattern<H>` expresses that the implementing type can be used as a pattern +/// for searching in an `H`. /// -/// For example, both `'a'` and `"aa"` are patterns that -/// would match at index `1` in the string `"baaaab"`. +/// For example, character `'a'` and string `"aa"` are patterns that would match +/// at index `1` in the string `"baaaab"`. 
/// /// The trait itself acts as a builder for an associated /// [`Searcher`] type, which does the actual work of finding @@ -96,86 +97,126 @@ use crate::slice::memchr; /// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); /// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); /// ``` -pub trait Pattern<'a>: Sized { - /// Associated searcher for this pattern - type Searcher: Searcher<'a>; +pub trait Pattern<H: Haystack>: Sized { + /// Associated searcher for this pattern. + type Searcher: Searcher<H>; /// Constructs the associated searcher from /// `self` and the `haystack` to search in. - fn into_searcher(self, haystack: &'a str) -> Self::Searcher; + fn into_searcher(self, haystack: H) -> Self::Searcher; - /// Checks whether the pattern matches anywhere in the haystack + /// Checks whether the pattern matches anywhere in the haystack. #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { + fn is_contained_in(self, haystack: H) -> bool { self.into_searcher(haystack).next_match().is_some() } - /// Checks whether the pattern matches at the front of the haystack + /// Checks whether the pattern matches at the front of the haystack. #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _)) + fn is_prefix_of(self, haystack: H) -> bool { + matches!(self.into_searcher(haystack).next(), SearchStep::Match(..)) } - /// Checks whether the pattern matches at the back of the haystack + /// Checks whether the pattern matches at the back of the haystack. #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool + fn is_suffix_of(self, haystack: H) -> bool where - Self::Searcher: ReverseSearcher<'a>, + Self::Searcher: ReverseSearcher<H>, { - matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j) + matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(..)) } - /// Removes the pattern from the front of haystack, if it matches. 
+ /// Removes the pattern from the front of a haystack, if it matches. #[inline] - fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { - if let SearchStep::Match(start, len) = self.into_searcher(haystack).next() { - debug_assert_eq!( - start, 0, + fn strip_prefix_of(self, haystack: H) -> Option<H> { + if let SearchStep::Match(start, pos) = self.into_searcher(haystack).next() { + // This cannot be debug_assert_eq because StartCursor isn’t Debug. + debug_assert!( + start == haystack.cursor_at_front(), "The first search step from Searcher \ - must include the first character" + must include the first character" ); + let end = haystack.cursor_at_back(); // SAFETY: `Searcher` is known to return valid indices. - unsafe { Some(haystack.get_unchecked(len..)) } + Some(unsafe { haystack.get_unchecked(pos..end) }) } else { None } } - /// Removes the pattern from the back of haystack, if it matches. + /// Removes the pattern from the back of a haystack, if it matches. #[inline] - fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> + fn strip_suffix_of(self, haystack: H) -> Option<H> where - Self::Searcher: ReverseSearcher<'a>, + Self::Searcher: ReverseSearcher<H>, { - if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() { - debug_assert_eq!( - end, - haystack.len(), + if let SearchStep::Match(pos, end) = self.into_searcher(haystack).next_back() { + // This cannot be debug_assert_eq because StartCursor isn’t Debug. + debug_assert!( + end == haystack.cursor_at_back(), "The first search step from ReverseSearcher \ - must include the last character" + must include the last character" ); + let start = haystack.cursor_at_front(); // SAFETY: `Searcher` is known to return valid indices. - unsafe { Some(haystack.get_unchecked(..start)) } + Some(unsafe { haystack.get_unchecked(start..pos) }) } else { None } } } +// Haystack + +/// A type which can be searched in using a [`Pattern`]. 
+/// +/// The trait is used in combination with [`Pattern`] trait to express a pattern +/// that can be used to search for elements in given haystack. +pub trait Haystack: Sized + Copy { + /// A cursor representing position in the haystack or its end. + type Cursor: Copy + PartialEq; + + /// Returns cursor pointing at the beginning of the haystack. + fn cursor_at_front(self) -> Self::Cursor; + + /// Returns cursor pointing at the end of the haystack. + fn cursor_at_back(self) -> Self::Cursor; + + /// Returns whether the haystack is empty. + fn is_empty(self) -> bool { + self.cursor_at_front() == self.cursor_at_back() + } + + /// Returns portions of the haystack indicated by the cursor range. + /// + /// # Safety + /// + /// Range’s start and end must be valid haystack split positions. + /// Furthermore, start mustn’t point at position after end. + /// + /// A valid split positions are: + /// - the front of the haystack (as returned by + /// [`cursor_at_front()`][Self::cursor_at_front], + /// - the back of the haystack (as returned by + /// [`cursor_at_back()`][Self::cursor_at_back] or + /// - any cursor returned by a [`Searcher`] or [`ReverseSearcher`]. + unsafe fn get_unchecked(self, range: Range<Self::Cursor>) -> Self; +} + // Searcher /// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. #[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub enum SearchStep { +pub enum SearchStep<T = usize> { /// Expresses that a match of the pattern has been found at /// `haystack[a..b]`. - Match(usize, usize), + Match(T, T), /// Expresses that `haystack[a..b]` has been rejected as a possible match /// of the pattern. /// /// Note that there might be more than one `Reject` between two `Match`es, /// there is no requirement for them to be combined into one. - Reject(usize, usize), + Reject(T, T), /// Expresses that every byte of the haystack has been visited, ending /// the iteration. 
Done, @@ -193,11 +234,11 @@ pub enum SearchStep { /// [`next()`][Searcher::next] methods are required to lie on valid utf8 /// boundaries in the haystack. This enables consumers of this trait to /// slice the haystack without additional runtime checks. -pub unsafe trait Searcher<'a> { +pub unsafe trait Searcher<H: Haystack> { /// Getter for the underlying string to be searched in /// /// Will always return the same [`&str`][str]. - fn haystack(&self) -> &'a str; + fn haystack(&self) -> H; /// Performs the next search step starting from the front. /// @@ -220,7 +261,7 @@ pub unsafe trait Searcher<'a> { /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` /// might produce the stream /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` - fn next(&mut self) -> SearchStep; + fn next(&mut self) -> SearchStep<H::Cursor>; /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next]. /// @@ -229,7 +270,7 @@ pub unsafe trait Searcher<'a> { /// `(start_match, end_match)`, where start_match is the index of where /// the match begins, and end_match is the index after the end of the match. #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { + fn next_match(&mut self) -> Option<(H::Cursor, H::Cursor)> { loop { match self.next() { SearchStep::Match(a, b) => return Some((a, b)), @@ -245,7 +286,7 @@ pub unsafe trait Searcher<'a> { /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges /// of this and [`next_match`][Searcher::next_match] will overlap. #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { + fn next_reject(&mut self) -> Option<(H::Cursor, H::Cursor)> { loop { match self.next() { SearchStep::Reject(a, b) => return Some((a, b)), @@ -270,7 +311,7 @@ pub unsafe trait Searcher<'a> { /// /// For the reason why this trait is marked unsafe, see the /// parent trait [`Searcher`]. 
-pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { +pub unsafe trait ReverseSearcher: Searcher { /// Performs the next search step starting from the back. /// /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` @@ -292,12 +333,12 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` /// might produce the stream /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`. - fn next_back(&mut self) -> SearchStep; + fn next_back(&mut self) -> SearchStep; /// Finds the next [`Match`][SearchStep::Match] result. /// See [`next_back()`][ReverseSearcher::next_back]. #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { + fn next_match_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { loop { match self.next_back() { SearchStep::Match(a, b) => return Some((a, b)), @@ -310,7 +351,7 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// Finds the next [`Reject`][SearchStep::Reject] result. /// See [`next_back()`][ReverseSearcher::next_back]. #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)> { + fn next_reject_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { loop { match self.next_back() { SearchStep::Reject(a, b) => return Some((a, b)), @@ -342,13 +383,41 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// `(&str)::Searcher` is not a `DoubleEndedSearcher` because /// the pattern `"aa"` in the haystack `"aaa"` matches as either /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. 
-pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} +pub trait DoubleEndedSearcher: ReverseSearcher {} + +///////////////////////////////////////////////////////////////////////////// +// Impl for Haystack +///////////////////////////////////////////////////////////////////////////// + +impl<'a> Haystack for &'a str { + type Cursor = usize; + + #[inline(always)] + fn cursor_at_front(self) -> usize { + 0 + } + #[inline(always)] + fn cursor_at_back(self) -> usize { + self.len() + } + + #[inline(always)] + fn is_empty(self) -> bool { + self.is_empty() + } + + #[inline(always)] + unsafe fn get_unchecked(self, range: Range) -> Self { + // SAFETY: Caller promises position is a character boundary. + unsafe { self.get_unchecked(range) } + } +} ///////////////////////////////////////////////////////////////////////////// // Impl for char ///////////////////////////////////////////////////////////////////////////// -/// Associated type for `>::Searcher`. +/// Associated type for `>::Searcher`. 
#[derive(Clone, Debug)] pub struct CharSearcher<'a> { haystack: &'a str, @@ -375,7 +444,7 @@ pub struct CharSearcher<'a> { utf8_encoded: [u8; 4], } -unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { +unsafe impl<'a> Searcher<&'a str> for CharSearcher<'a> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -453,7 +522,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { // let next_reject use the default implementation from the Searcher trait } -unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { +unsafe impl<'a> ReverseSearcher<&'a str> for CharSearcher<'a> { #[inline] fn next_back(&mut self) -> SearchStep { let old_finger = self.finger_back; @@ -527,7 +596,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { // let next_reject_back use the default implementation from the Searcher trait } -impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} +impl<'a> DoubleEndedSearcher<&'a str> for CharSearcher<'a> {} /// Searches for chars that are equal to a given [`char`]. 
/// @@ -536,7 +605,7 @@ impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} /// ``` /// assert_eq!("Hello world".find('o'), Some(4)); /// ``` -impl<'a> Pattern<'a> for char { +impl<'a> Pattern<&'a str> for char { type Searcher = CharSearcher<'a>; #[inline] @@ -576,7 +645,7 @@ impl<'a> Pattern<'a> for char { #[inline] fn is_suffix_of(self, haystack: &'a str) -> bool where - Self::Searcher: ReverseSearcher<'a>, + Self::Searcher: ReverseSearcher<&'a str>, { self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack) } @@ -584,7 +653,7 @@ impl<'a> Pattern<'a> for char { #[inline] fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> where - Self::Searcher: ReverseSearcher<'a>, + Self::Searcher: ReverseSearcher<&'a str>, { self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack) } @@ -639,7 +708,7 @@ struct MultiCharEqSearcher<'a, C: MultiCharEq> { char_indices: super::CharIndices<'a>, } -impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { +impl<'a, C: MultiCharEq> Pattern<&'a str> for MultiCharEqPattern { type Searcher = MultiCharEqSearcher<'a, C>; #[inline] @@ -648,7 +717,7 @@ impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { } } -unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> Searcher<&'a str> for MultiCharEqSearcher<'a, C> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -673,7 +742,7 @@ unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { } } -unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> ReverseSearcher<&'a str> for MultiCharEqSearcher<'a, C> { #[inline] fn next_back(&mut self) -> SearchStep { let s = &mut self.char_indices; @@ -693,7 +762,7 @@ unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, } } -impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} +impl<'a, C: MultiCharEq> DoubleEndedSearcher<&'a str> 
for MultiCharEqSearcher<'a, C> {} ///////////////////////////////////////////////////////////////////////////// @@ -724,7 +793,7 @@ macro_rules! pattern_methods { #[inline] fn is_suffix_of(self, haystack: &'a str) -> bool where - $t: ReverseSearcher<'a>, + $t: ReverseSearcher<&'a str>, { ($pmap)(self).is_suffix_of(haystack) } @@ -732,7 +801,7 @@ macro_rules! pattern_methods { #[inline] fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> where - $t: ReverseSearcher<'a>, + $t: ReverseSearcher<&'a str>, { ($pmap)(self).strip_suffix_of(haystack) } @@ -774,16 +843,16 @@ macro_rules! searcher_methods { }; } -/// Associated type for `<[char; N] as Pattern<'a>>::Searcher`. +/// Associated type for `<[char; N] as Pattern<&'a str>>::Searcher`. #[derive(Clone, Debug)] pub struct CharArraySearcher<'a, const N: usize>( - as Pattern<'a>>::Searcher, + as Pattern<&'a str>>::Searcher, ); -/// Associated type for `<&[char; N] as Pattern<'a>>::Searcher`. +/// Associated type for `<&[char; N] as Pattern<&'a str>>::Searcher`. #[derive(Clone, Debug)] pub struct CharArrayRefSearcher<'a, 'b, const N: usize>( - as Pattern<'a>>::Searcher, + as Pattern<&'a str>>::Searcher, ); /// Searches for chars that are equal to any of the [`char`]s in the array. 
@@ -794,15 +863,15 @@ pub struct CharArrayRefSearcher<'a, 'b, const N: usize>( /// assert_eq!("Hello world".find(['l', 'l']), Some(2)); /// assert_eq!("Hello world".find(['l', 'l']), Some(2)); /// ``` -impl<'a, const N: usize> Pattern<'a> for [char; N] { +impl<'a, const N: usize> Pattern<&'a str> for [char; N] { pattern_methods!(CharArraySearcher<'a, N>, MultiCharEqPattern, CharArraySearcher); } -unsafe impl<'a, const N: usize> Searcher<'a> for CharArraySearcher<'a, N> { +unsafe impl<'a, const N: usize> Searcher<&'a str> for CharArraySearcher<'a, N> { searcher_methods!(forward); } -unsafe impl<'a, const N: usize> ReverseSearcher<'a> for CharArraySearcher<'a, N> { +unsafe impl<'a, const N: usize> ReverseSearcher<&'a str> for CharArraySearcher<'a, N> { searcher_methods!(reverse); } @@ -814,15 +883,15 @@ unsafe impl<'a, const N: usize> ReverseSearcher<'a> for CharArraySearcher<'a, N> /// assert_eq!("Hello world".find(&['l', 'l']), Some(2)); /// assert_eq!("Hello world".find(&['l', 'l']), Some(2)); /// ``` -impl<'a, 'b, const N: usize> Pattern<'a> for &'b [char; N] { +impl<'a, 'b, const N: usize> Pattern<&'a str> for &'b [char; N] { pattern_methods!(CharArrayRefSearcher<'a, 'b, N>, MultiCharEqPattern, CharArrayRefSearcher); } -unsafe impl<'a, 'b, const N: usize> Searcher<'a> for CharArrayRefSearcher<'a, 'b, N> { +unsafe impl<'a, 'b, const N: usize> Searcher<&'a str> for CharArrayRefSearcher<'a, 'b, N> { searcher_methods!(forward); } -unsafe impl<'a, 'b, const N: usize> ReverseSearcher<'a> for CharArrayRefSearcher<'a, 'b, N> { +unsafe impl<'a, 'b, const N: usize> ReverseSearcher<&'a str> for CharArrayRefSearcher<'a, 'b, N> { searcher_methods!(reverse); } @@ -832,19 +901,21 @@ unsafe impl<'a, 'b, const N: usize> ReverseSearcher<'a> for CharArrayRefSearcher // Todo: Change / Remove due to ambiguity in meaning. -/// Associated type for `<&[char] as Pattern<'a>>::Searcher`. +/// Associated type for `<&[char] as Pattern<&'a str>>::Searcher`. 
#[derive(Clone, Debug)] -pub struct CharSliceSearcher<'a, 'b>( as Pattern<'a>>::Searcher); +pub struct CharSliceSearcher<'a, 'b>( + as Pattern<&'a str>>::Searcher, +); -unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { +unsafe impl<'a, 'b> Searcher<&'a str> for CharSliceSearcher<'a, 'b> { searcher_methods!(forward); } -unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> { +unsafe impl<'a, 'b> ReverseSearcher<&'a str> for CharSliceSearcher<'a, 'b> { searcher_methods!(reverse); } -impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} +impl<'a, 'b> DoubleEndedSearcher<&'a str> for CharSliceSearcher<'a, 'b> {} /// Searches for chars that are equal to any of the [`char`]s in the slice. /// @@ -854,7 +925,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} /// assert_eq!("Hello world".find(&['l', 'l'] as &[_]), Some(2)); /// assert_eq!("Hello world".find(&['l', 'l'][..]), Some(2)); /// ``` -impl<'a, 'b> Pattern<'a> for &'b [char] { +impl<'a, 'b> Pattern<&'a str> for &'b [char] { pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); } @@ -862,9 +933,9 @@ impl<'a, 'b> Pattern<'a> for &'b [char] { // Impl for F: FnMut(char) -> bool ///////////////////////////////////////////////////////////////////////////// -/// Associated type for `>::Searcher`. +/// Associated type for `>::Searcher`. 
#[derive(Clone)] -pub struct CharPredicateSearcher<'a, F>( as Pattern<'a>>::Searcher) +pub struct CharPredicateSearcher<'a, F>( as Pattern<&'a str>>::Searcher) where F: FnMut(char) -> bool; @@ -879,21 +950,24 @@ where .finish() } } -unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F> +unsafe impl<'a, F> Searcher<&'a str> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool, { searcher_methods!(forward); } -unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F> +unsafe impl<'a, F> ReverseSearcher<&'a str> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool, { searcher_methods!(reverse); } -impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {} +impl<'a, F> DoubleEndedSearcher<&'a str> for CharPredicateSearcher<'a, F> where + F: FnMut(char) -> bool +{ +} /// Searches for [`char`]s that match the given predicate. /// @@ -903,7 +977,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: Fn /// assert_eq!("Hello world".find(char::is_uppercase), Some(0)); /// assert_eq!("Hello world".find(|c| "aeiou".contains(c)), Some(1)); /// ``` -impl<'a, F> Pattern<'a> for F +impl<'a, F> Pattern<&'a str> for F where F: FnMut(char) -> bool, { @@ -915,7 +989,7 @@ where ///////////////////////////////////////////////////////////////////////////// /// Delegates to the `&str` impl. 
-impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { +impl<'a, 'b, 'c> Pattern<&'a str> for &'c &'b str { pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); } @@ -933,7 +1007,7 @@ impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { /// ``` /// assert_eq!("Hello world".find("world"), Some(6)); /// ``` -impl<'a, 'b> Pattern<'a> for &'b str { +impl<'a, 'b> Pattern<&'a str> for &'b str { type Searcher = StrSearcher<'a, 'b>; #[inline] @@ -1008,7 +1082,7 @@ impl<'a, 'b> Pattern<'a> for &'b str { ///////////////////////////////////////////////////////////////////////////// #[derive(Clone, Debug)] -/// Associated type for `<&str as Pattern<'a>>::Searcher`. +/// Associated type for `<&str as Pattern<&'a str>>::Searcher`. pub struct StrSearcher<'a, 'b> { haystack: &'a str, needle: &'b str, @@ -1059,7 +1133,7 @@ impl<'a, 'b> StrSearcher<'a, 'b> { } } -unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { +unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -1149,7 +1223,7 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { } } -unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { +unsafe impl<'a, 'b> ReverseSearcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn next_back(&mut self) -> SearchStep { match self.searcher { diff --git a/tests/rustdoc/async-fn.rs b/tests/rustdoc/async-fn.rs index 70bcbcb6ff44a..3f641473d308a 100644 --- a/tests/rustdoc/async-fn.rs +++ b/tests/rustdoc/async-fn.rs @@ -46,7 +46,7 @@ impl Foo { pub async fn mut_self(mut self, mut first: usize) {} } -pub trait Pattern<'a> {} +pub trait Pattern<&'a str> {} pub trait Trait {} // @has async_fn/fn.const_generics.html @@ -91,5 +91,5 @@ impl Foo { // @has - '//pre[@class="rust item-decl"]' "pub async fn named<'a, 'b>(foo: &'a str) -> &'b str" pub async fn named<'a, 'b>(foo: &'a str) -> &'b str {} // @has async_fn/fn.named_trait.html -// @has - '//pre[@class="rust item-decl"]' "pub async fn named_trait<'a, 'b>(foo: 
impl Pattern<'a>) -> impl Pattern<'b>" -pub async fn named_trait<'a, 'b>(foo: impl Pattern<'a>) -> impl Pattern<'b> {} +// @has - '//pre[@class="rust item-decl"]' "pub async fn named_trait<'a, 'b>(foo: impl Pattern<&'a str>) -> impl Pattern<'b>" +pub async fn named_trait<'a, 'b>(foo: impl Pattern<&'a str>) -> impl Pattern<'b> {} diff --git a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs index f9a9347641143..8a047a082c4a4 100644 --- a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs +++ b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs @@ -3,7 +3,7 @@ fn strip_lf(s: &str) -> &str { //~^ ERROR expected a `FnMut<(char,)>` closure, found `u8` //~| NOTE expected an `FnMut<(char,)>` closure, found `u8` //~| HELP the trait `FnMut<(char,)>` is not implemented for `u8` - //~| HELP the following other types implement trait `Pattern<'a>`: + //~| HELP the following other types implement trait `Pattern<&'a str>`: //~| NOTE required for `u8` to implement `Pattern<'_>` } diff --git a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr index ce9ab2d811ae1..e97aaa6834309 100644 --- a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr +++ b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr @@ -5,7 +5,7 @@ LL | s.strip_suffix(b'\n').unwrap_or(s) | ^^^^^^^^^^^^ expected an `FnMut<(char,)>` closure, found `u8` | = help: the trait `FnMut<(char,)>` is not implemented for `u8` - = help: the following other types implement trait `Pattern<'a>`: + = help: the following other types implement trait `Pattern<&'a str>`: &'b String &'b [char; N] &'b [char] From 9ba42ae40e4e458e94c850e09ae74294dc590d4c Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Tue, 14 Feb 2023 22:34:37 +0100 Subject: [PATCH 02/12] core: move Pattern et al to core::pattern module Pattern is no longer str-specific, so move it from core::str::pattern 
module to a new core::pattern module. This introduces no changes in behaviour or implementation. Just moves stuff around and adjusts documentation. --- library/alloc/src/str.rs | 2 +- library/alloc/src/string.rs | 4 +- library/alloc/tests/str.rs | 8 +- library/core/src/lib.rs | 1 + library/core/src/pattern.rs | 362 ++++++++++++++++++++++++++++++ library/core/src/str/iter.rs | 3 +- library/core/src/str/mod.rs | 4 +- library/core/src/str/pattern.rs | 385 +++----------------------------- library/core/tests/pattern.rs | 2 +- library/std/src/lib.rs | 2 + 10 files changed, 404 insertions(+), 369 deletions(-) create mode 100644 library/core/src/pattern.rs diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index f09ebcc154d85..b6134256a7c78 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -10,8 +10,8 @@ use core::borrow::{Borrow, BorrowMut}; use core::iter::FusedIterator; use core::mem; +use core::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use core::ptr; -use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use core::unicode::conversions; use crate::borrow::ToOwned; diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 618f360c35c5c..da58797ad11d4 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -55,9 +55,9 @@ use core::ops::AddAssign; #[cfg(not(no_global_oom_handling))] use core::ops::Bound::{Excluded, Included, Unbounded}; use core::ops::{self, Index, IndexMut, Range, RangeBounds}; +use core::pattern::Pattern; use core::ptr; use core::slice; -use core::str::pattern::Pattern; #[cfg(not(no_global_oom_handling))] use core::str::Utf8Chunks; @@ -1373,7 +1373,7 @@ impl String { where P: for<'x> Pattern<&'x str>, { - use core::str::pattern::Searcher; + use core::pattern::Searcher; let rejections = { let mut searcher = pat.into_searcher(self); diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 
12a05028fcd9e..57c7ad955e43a 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1868,14 +1868,14 @@ fn test_repeat() { } mod pattern { - use std::str::pattern::SearchStep::{self, Done, Match, Reject}; - use std::str::pattern::{Pattern, ReverseSearcher, Searcher}; + use core::pattern::SearchStep::{self, Done, Match, Reject}; + use core::pattern::{Pattern, ReverseSearcher, Searcher}; macro_rules! make_test { ($name:ident, $p:expr, $h:expr, [$($e:expr,)*]) => { #[allow(unused_imports)] mod $name { - use std::str::pattern::SearchStep::{Match, Reject}; + use core::pattern::SearchStep::{Match, Reject}; use super::{cmp_search_to_vec}; #[test] fn fwd() { @@ -2151,7 +2151,7 @@ generate_iterator_test! { #[test] fn different_str_pattern_forwarding_lifetimes() { - use std::str::pattern::Pattern; + use core::pattern::Pattern; fn foo<'a, P>(p: P) where diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index ed0c05a686319..79670a0d60adb 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -362,6 +362,7 @@ pub mod sync; pub mod fmt; pub mod hash; +pub mod pattern; pub mod slice; pub mod str; pub mod time; diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs new file mode 100644 index 0000000000000..9fb50680a4058 --- /dev/null +++ b/library/core/src/pattern.rs @@ -0,0 +1,362 @@ +//! The Pattern API. +//! +//! The Pattern API provides a generic mechanism for using different pattern +//! types when searching through different objects. +//! +//! For more details, see the traits [`Pattern`], [`Haystack`], [`Searcher`], +//! [`ReverseSearcher`] and [`DoubleEndedSearcher`]. Although this API is +//! unstable, it is exposed via stable methods on corresponding haystack types. +//! +//! # Examples +//! +//! [`Pattern<&str>`] is [implemented][pattern-impls] in the stable API for +//! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures +//! implementing `FnMut(char) -> bool`. +//! +//! ``` +//! 
let s = "Can you find a needle in a haystack?"; +//! +//! // &str pattern +//! assert_eq!(s.find("you"), Some(4)); +//! // char pattern +//! assert_eq!(s.find('n'), Some(2)); +//! // array of chars pattern +//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u']), Some(1)); +//! // slice of chars pattern +//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1)); +//! // closure pattern +//! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35)); +//! ``` +//! +//! [pattern-impls]: Pattern#implementors + +#![unstable( + feature = "pattern", + reason = "API not fully fleshed out and ready to be stabilized", + issue = "27721" +)] + +use crate::ops::Range; + +/// A pattern which can be matched against a [`Haystack`]. +/// +/// A `Pattern` expresses that the implementing type can be used as a pattern +/// for searching in an `H`. For example, character `'a'` and string `"aa"` are +/// patterns that would match at index `1` in the string `"baaaab"`. +/// +/// The trait itself acts as a builder for an associated [`Searcher`] type, +/// which does the actual work of finding occurrences of the pattern in +/// a string. +/// +/// Depending on the type of the haystack and the pattern, the semantics of the +/// pattern can change. The table below describes some of those behaviours for +/// a [`&str`][str] haystack. 
+/// +/// | Pattern type | Match condition | +/// |--------------------------|-------------------------------------------| +/// | `&str` | is substring | +/// | `char` | is contained in string | +/// | `&[char]` | any char in slice is contained in string | +/// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | +/// +/// # Examples +/// +/// ``` +/// // &str pattern matching &str +/// assert_eq!("abaaa".find("ba"), Some(1)); +/// assert_eq!("abaaa".find("bac"), None); +/// +/// // char pattern matching &str +/// assert_eq!("abaaa".find('a'), Some(0)); +/// assert_eq!("abaaa".find('b'), Some(1)); +/// assert_eq!("abaaa".find('c'), None); +/// +/// // &[char; N] pattern matching &str +/// assert_eq!("ab".find(&['b', 'a']), Some(0)); +/// assert_eq!("abaaa".find(&['a', 'z']), Some(0)); +/// assert_eq!("abaaa".find(&['c', 'd']), None); +/// +/// // &[char] pattern matching &str +/// assert_eq!("ab".find(&['b', 'a'][..]), Some(0)); +/// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0)); +/// assert_eq!("abaaa".find(&['c', 'd'][..]), None); +/// +/// // FnMut(char) -> bool pattern matching &str +/// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); +/// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); +/// ``` +pub trait Pattern: Sized { + /// Associated searcher for this pattern. + type Searcher: Searcher; + + /// Constructs the associated searcher from `self` and the `haystack` to + /// search in. + fn into_searcher(self, haystack: H) -> Self::Searcher; + + /// Checks whether the pattern matches anywhere in the haystack. + fn is_contained_in(self, haystack: H) -> bool { + self.into_searcher(haystack).next_match().is_some() + } + + /// Checks whether the pattern matches at the front of the haystack. + fn is_prefix_of(self, haystack: H) -> bool { + matches!(self.into_searcher(haystack).next(), SearchStep::Match(..)) + } + + /// Checks whether the pattern matches at the back of the haystack. 
+ fn is_suffix_of(self, haystack: H) -> bool + where + Self::Searcher: ReverseSearcher<H>, + { + matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(..)) + } + + /// Removes the pattern from the front of the haystack, if it matches. + fn strip_prefix_of(self, haystack: H) -> Option<H> { + if let SearchStep::Match(start, pos) = self.into_searcher(haystack).next() { + // This cannot be debug_assert_eq because `H::Cursor` needn’t be `Debug`. + debug_assert!( + start == haystack.cursor_at_front(), + "The first search step from Searcher \ + must include the first character" + ); + let end = haystack.cursor_at_back(); + // SAFETY: `Searcher` is known to return valid indices. + Some(unsafe { haystack.get_unchecked(pos..end) }) + } else { + None + } + } + + /// Removes the pattern from the back of the haystack, if it matches. + fn strip_suffix_of(self, haystack: H) -> Option<H> + where + Self::Searcher: ReverseSearcher<H>, + { + if let SearchStep::Match(pos, end) = self.into_searcher(haystack).next_back() { + // This cannot be debug_assert_eq because `H::Cursor` needn’t be `Debug`. + debug_assert!( + end == haystack.cursor_at_back(), + "The first search step from ReverseSearcher \ + must include the last character" + ); + let start = haystack.cursor_at_front(); + // SAFETY: `ReverseSearcher` is known to return valid indices. + Some(unsafe { haystack.get_unchecked(start..pos) }) + } else { + None + } + } +} + +/// A type which can be searched in using a [`Pattern`]. +/// +/// The trait is used in combination with the [`Pattern`] trait to express +/// a pattern that can be used to search for elements in a given haystack. +pub trait Haystack: Sized + Copy { + /// A cursor representing position in the haystack or its end. + type Cursor: Copy + PartialEq; + + /// Returns cursor pointing at the beginning of the haystack. + fn cursor_at_front(self) -> Self::Cursor; + + /// Returns cursor pointing at the end of the haystack.
+ fn cursor_at_back(self) -> Self::Cursor; + + /// Returns whether the haystack is empty. + fn is_empty(self) -> bool; + + /// Returns the portion of the haystack indicated by the cursor range. + /// + /// # Safety + /// + /// Range’s start and end must be valid haystack split positions. + /// Furthermore, start mustn’t point at a position after end. + /// + /// Valid split positions are: + /// - the front of the haystack (as returned by + /// [`cursor_at_front()`][Self::cursor_at_front]), + /// - the back of the haystack (as returned by + /// [`cursor_at_back()`][Self::cursor_at_back]) or + /// - any cursor returned by a [`Searcher`] or [`ReverseSearcher`]. + unsafe fn get_unchecked(self, range: Range<Self::Cursor>) -> Self; +} + +/// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum SearchStep<T = usize> { + /// Expresses that a match of the pattern has been found at + /// `haystack[a..b]`. + Match(T, T), + /// Expresses that `haystack[a..b]` has been rejected as a possible match of + /// the pattern. + /// + /// Note that there might be more than one `Reject` between two `Match`es, + /// there is no requirement for them to be combined into one. + Reject(T, T), + /// Expresses that every element of the haystack has been visited, ending + /// the iteration. + Done, +} + +/// A searcher for a pattern. +/// +/// This trait provides methods for searching for non-overlapping matches of +/// a pattern starting from the front of a haystack `H`. +/// +/// It will be implemented by associated `Searcher` types of the [`Pattern`] +/// trait. +/// +/// The trait is marked unsafe because the indices returned by the +/// [`next()`][Searcher::next] methods are required to lie on valid haystack +/// split positions. This enables consumers of this trait to slice the haystack +/// without additional runtime checks.
+pub unsafe trait Searcher<H: Haystack> { + /// Getter for the underlying haystack to be searched in. + /// + /// Will always return the same haystack that was used when creating the + /// searcher. + fn haystack(&self) -> H; + + /// Performs the next search step starting from the front. + /// + /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches + /// the pattern. + /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can + /// not match the pattern, even partially. + /// - Returns [`Done`][SearchStep::Done] if every element of the haystack has + /// been visited. + /// + /// The stream of [`Match`][SearchStep::Match] and + /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] + /// will contain index ranges that are adjacent, non-overlapping, + /// covering the whole haystack, and lying on valid haystack split positions. + /// + /// A [`Match`][SearchStep::Match] result needs to contain the whole matched + /// pattern, however [`Reject`][SearchStep::Reject] results may be split up + /// into arbitrarily many adjacent fragments. Both ranges may have zero length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` might + /// produce the stream `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, + /// 8)]` + fn next(&mut self) -> SearchStep<H::Cursor>; + + /// Finds the next [`Match`][SearchStep::Match] result. See + /// [`next()`][Searcher::next]. + /// + /// Unlike [`next()`][Searcher::next], there is no guarantee that the + /// returned ranges of this and [`next_reject`][Searcher::next_reject] will + /// overlap. This will return `(start_match, end_match)`, where start_match + /// is the index of where the match begins, and end_match is the index after + /// the end of the match.
+ fn next_match(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + + /// Finds the next [`Reject`][SearchStep::Reject] result. See + /// [`next()`][Searcher::next] and [`next_match()`][Searcher::next_match]. + /// + /// Unlike [`next()`][Searcher::next], there is no guarantee that the + /// returned ranges of this and [`next_match`][Searcher::next_match] will + /// overlap. + fn next_reject(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } +} + +/// A reverse searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping matches of +/// a pattern starting from the back of a haystack `H`. +/// +/// It will be implemented by associated [`Searcher`] types of the [`Pattern`] +/// trait if the pattern supports searching for it from the back. +/// +/// The index ranges returned by this trait are not required to exactly match +/// those of the forward search in reverse. +/// +/// For the reason why this trait is marked unsafe, see the parent trait +/// [`Searcher`]. +pub unsafe trait ReverseSearcher: Searcher { + /// Performs the next search step starting from the back. + /// + /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` + /// matches the pattern. + /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` + /// can not match the pattern, even partially. + /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack + /// has been visited + /// + /// The stream of [`Match`][SearchStep::Match] and + /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] + /// will contain index ranges that are adjacent, non-overlapping, covering + /// the whole haystack, and laying on utf8 boundaries. 
+ /// + /// A [`Match`][SearchStep::Match] result needs to contain the whole matched + /// pattern, however [`Reject`][SearchStep::Reject] results may be split up + /// into arbitrary many adjacent fragments. Both ranges may have zero + /// length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` might + /// produce the stream `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, + /// 1)]`. + fn next_back(&mut self) -> SearchStep; + + /// Finds the next [`Match`][SearchStep::Match] result. + /// See [`next_back()`][ReverseSearcher::next_back]. + fn next_match_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next_back() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + + /// Finds the next [`Reject`][SearchStep::Reject] result. + /// See [`next_back()`][ReverseSearcher::next_back]. + fn next_reject_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next_back() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } +} + +/// A marker trait to express that a [`ReverseSearcher`] can be used for +/// a [`DoubleEndedIterator`] implementation. +/// +/// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need to follow +/// these conditions: +/// +/// - All results of `next()` need to be identical to the results of +/// `next_back()` in reverse order. +/// - `next()` and `next_back()` need to behave as the two ends of a range of +/// values, that is they can not "walk past each other". +/// +/// # Examples +/// +/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a [`char`] +/// only requires looking at one at a time, which behaves the same from both +/// ends. 
+/// +/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because the pattern `"aa"` +/// in the haystack `"aaa"` matches as either `"[aa]a"` or `"a[aa]"`, depending +/// from which side it is searched. +pub trait DoubleEndedSearcher: ReverseSearcher {} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 22d6b49feab6f..f5acc6e5441a1 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -7,11 +7,10 @@ use crate::iter::{Copied, Filter, FusedIterator, Map, TrustedLen}; use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce}; use crate::ops::Try; use crate::option; +use crate::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use crate::slice::{self, Split as SliceSplit}; use super::from_utf8_unchecked; -use super::pattern::Pattern; -use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; use super::validations::{next_code_point, next_code_point_reverse}; use super::LinesMap; use super::{BytesIsNotEmpty, UnsafeBytesToStr}; diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 92d970b08245c..2d0e4da1bd520 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -13,12 +13,10 @@ mod iter; mod traits; mod validations; -use self::pattern::Pattern; -use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; - use crate::ascii; use crate::char::{self, EscapeDebugExtArgs}; use crate::mem; +use crate::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use crate::slice::{self, SliceIndex}; pub mod pattern; diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index 9f6eec256bbe0..4183b82413238 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -1,36 +1,47 @@ -//! The string Pattern API. +//! [The Pattern API] implementation for searching in `&str`. //! -//! The Pattern API provides a generic mechanism for using different pattern -//! 
types when searching through a string. +//! The implementation provides generic mechanism for using different pattern +//! types when searching through a string. Although this API is unstable, it is +//! exposed via stable APIs on the [`str`] type. //! -//! For more details, see the traits [`Pattern`], [`Searcher`], -//! [`ReverseSearcher`], and [`DoubleEndedSearcher`]. +//! Depending on the type of the pattern, the behaviour of methods like +//! [`str::find`] and [`str::contains`] can change. The table below describes +//! some of those behaviours. //! -//! Although this API is unstable, it is exposed via stable APIs on the -//! [`str`] type. +//! | Pattern type | Match condition | +//! |--------------------------|-------------------------------------------| +//! | `&str` | is substring | +//! | `char` | is contained in string | +//! | `&[char]` | any char in slice is contained in string | +//! | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | +//! | `&&str` | is substring | +//! | `&String` | is substring | //! //! # Examples //! -//! [`Pattern`] is [implemented][pattern-impls] in the stable API for -//! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures -//! implementing `FnMut(char) -> bool`. -//! //! ``` //! let s = "Can you find a needle in a haystack?"; //! //! // &str pattern //! assert_eq!(s.find("you"), Some(4)); +//! assert_eq!(s.find("thou"), None); +//! //! // char pattern //! assert_eq!(s.find('n'), Some(2)); -//! // array of chars pattern +//! assert_eq!(s.find('N'), None); +//! +//! // Array of chars pattern and slices thereof //! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u']), Some(1)); -//! // slice of chars pattern //! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1)); -//! // closure pattern +//! assert_eq!(s.find(&['q', 'v', 'x']), None); +//! +//! // Predicate closure //! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35)); +//! 
assert_eq!(s.find(|c: char| c.is_lowercase()), Some(1)); +//! assert_eq!(s.find(|c: char| !c.is_ascii()), None); //! ``` //! -//! [pattern-impls]: Pattern#implementors +//! [The Pattern API]: crate::pattern #![unstable( feature = "pattern", @@ -42,349 +53,11 @@ use crate::cmp; use crate::cmp::Ordering; use crate::fmt; use crate::ops::Range; +use crate::pattern::{ + DoubleEndedSearcher, Haystack, Pattern, ReverseSearcher, SearchStep, Searcher, +}; use crate::slice::memchr; -// Pattern - -/// A pattern which can be matched against a [`Haystack`]. -/// -/// A `Pattern` expresses that the implementing type can be used as a pattern -/// for searching in an `H`. -/// -/// For example, character `'a'` and string `"aa"` are patterns that would match -/// at index `1` in the string `"baaaab"`. -/// -/// The trait itself acts as a builder for an associated -/// [`Searcher`] type, which does the actual work of finding -/// occurrences of the pattern in a string. -/// -/// Depending on the type of the pattern, the behaviour of methods like -/// [`str::find`] and [`str::contains`] can change. The table below describes -/// some of those behaviours. 
-/// -/// | Pattern type | Match condition | -/// |--------------------------|-------------------------------------------| -/// | `&str` | is substring | -/// | `char` | is contained in string | -/// | `&[char]` | any char in slice is contained in string | -/// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | -/// | `&&str` | is substring | -/// | `&String` | is substring | -/// -/// # Examples -/// -/// ``` -/// // &str -/// assert_eq!("abaaa".find("ba"), Some(1)); -/// assert_eq!("abaaa".find("bac"), None); -/// -/// // char -/// assert_eq!("abaaa".find('a'), Some(0)); -/// assert_eq!("abaaa".find('b'), Some(1)); -/// assert_eq!("abaaa".find('c'), None); -/// -/// // &[char; N] -/// assert_eq!("ab".find(&['b', 'a']), Some(0)); -/// assert_eq!("abaaa".find(&['a', 'z']), Some(0)); -/// assert_eq!("abaaa".find(&['c', 'd']), None); -/// -/// // &[char] -/// assert_eq!("ab".find(&['b', 'a'][..]), Some(0)); -/// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0)); -/// assert_eq!("abaaa".find(&['c', 'd'][..]), None); -/// -/// // FnMut(char) -> bool -/// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); -/// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); -/// ``` -pub trait Pattern: Sized { - /// Associated searcher for this pattern. - type Searcher: Searcher; - - /// Constructs the associated searcher from - /// `self` and the `haystack` to search in. - fn into_searcher(self, haystack: H) -> Self::Searcher; - - /// Checks whether the pattern matches anywhere in the haystack. - #[inline] - fn is_contained_in(self, haystack: H) -> bool { - self.into_searcher(haystack).next_match().is_some() - } - - /// Checks whether the pattern matches at the front of the haystack. - #[inline] - fn is_prefix_of(self, haystack: H) -> bool { - matches!(self.into_searcher(haystack).next(), SearchStep::Match(..)) - } - - /// Checks whether the pattern matches at the back of the haystack. 
- #[inline] - fn is_suffix_of(self, haystack: H) -> bool - where - Self::Searcher: ReverseSearcher, - { - matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(..)) - } - - /// Removes the pattern from the front of a haystack, if it matches. - #[inline] - fn strip_prefix_of(self, haystack: H) -> Option { - if let SearchStep::Match(start, pos) = self.into_searcher(haystack).next() { - // This cannot be debug_assert_eq because StartCursor isn’t Debug. - debug_assert!( - start == haystack.cursor_at_front(), - "The first search step from Searcher \ - must include the first character" - ); - let end = haystack.cursor_at_back(); - // SAFETY: `Searcher` is known to return valid indices. - Some(unsafe { haystack.get_unchecked(pos..end) }) - } else { - None - } - } - - /// Removes the pattern from the back of a haystack, if it matches. - #[inline] - fn strip_suffix_of(self, haystack: H) -> Option - where - Self::Searcher: ReverseSearcher, - { - if let SearchStep::Match(pos, end) = self.into_searcher(haystack).next_back() { - // This cannot be debug_assert_eq because StartCursor isn’t Debug. - debug_assert!( - end == haystack.cursor_at_back(), - "The first search step from ReverseSearcher \ - must include the last character" - ); - let start = haystack.cursor_at_front(); - // SAFETY: `Searcher` is known to return valid indices. - Some(unsafe { haystack.get_unchecked(start..pos) }) - } else { - None - } - } -} - -// Haystack - -/// A type which can be searched in using a [`Pattern`]. -/// -/// The trait is used in combination with [`Pattern`] trait to express a pattern -/// that can be used to search for elements in given haystack. -pub trait Haystack: Sized + Copy { - /// A cursor representing position in the haystack or its end. - type Cursor: Copy + PartialEq; - - /// Returns cursor pointing at the beginning of the haystack. - fn cursor_at_front(self) -> Self::Cursor; - - /// Returns cursor pointing at the end of the haystack. 
- fn cursor_at_back(self) -> Self::Cursor; - - /// Returns whether the haystack is empty. - fn is_empty(self) -> bool { - self.cursor_at_front() == self.cursor_at_back() - } - - /// Returns portions of the haystack indicated by the cursor range. - /// - /// # Safety - /// - /// Range’s start and end must be valid haystack split positions. - /// Furthermore, start mustn’t point at position after end. - /// - /// A valid split positions are: - /// - the front of the haystack (as returned by - /// [`cursor_at_front()`][Self::cursor_at_front], - /// - the back of the haystack (as returned by - /// [`cursor_at_back()`][Self::cursor_at_back] or - /// - any cursor returned by a [`Searcher`] or [`ReverseSearcher`]. - unsafe fn get_unchecked(self, range: Range) -> Self; -} - -// Searcher - -/// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. -#[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub enum SearchStep { - /// Expresses that a match of the pattern has been found at - /// `haystack[a..b]`. - Match(T, T), - /// Expresses that `haystack[a..b]` has been rejected as a possible match - /// of the pattern. - /// - /// Note that there might be more than one `Reject` between two `Match`es, - /// there is no requirement for them to be combined into one. - Reject(T, T), - /// Expresses that every byte of the haystack has been visited, ending - /// the iteration. - Done, -} - -/// A searcher for a string pattern. -/// -/// This trait provides methods for searching for non-overlapping -/// matches of a pattern starting from the front (left) of a string. -/// -/// It will be implemented by associated `Searcher` -/// types of the [`Pattern`] trait. -/// -/// The trait is marked unsafe because the indices returned by the -/// [`next()`][Searcher::next] methods are required to lie on valid utf8 -/// boundaries in the haystack. This enables consumers of this trait to -/// slice the haystack without additional runtime checks. 
-pub unsafe trait Searcher { - /// Getter for the underlying string to be searched in - /// - /// Will always return the same [`&str`][str]. - fn haystack(&self) -> H; - - /// Performs the next search step starting from the front. - /// - /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches - /// the pattern. - /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can - /// not match the pattern, even partially. - /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack has - /// been visited. - /// - /// The stream of [`Match`][SearchStep::Match] and - /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] - /// will contain index ranges that are adjacent, non-overlapping, - /// covering the whole haystack, and laying on utf8 boundaries. - /// - /// A [`Match`][SearchStep::Match] result needs to contain the whole matched - /// pattern, however [`Reject`][SearchStep::Reject] results may be split up - /// into arbitrary many adjacent fragments. Both ranges may have zero length. - /// - /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` - /// might produce the stream - /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` - fn next(&mut self) -> SearchStep; - - /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next]. - /// - /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges - /// of this and [`next_reject`][Searcher::next_reject] will overlap. This will return - /// `(start_match, end_match)`, where start_match is the index of where - /// the match begins, and end_match is the index after the end of the match. - #[inline] - fn next_match(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } - - /// Finds the next [`Reject`][SearchStep::Reject] result. 
See [`next()`][Searcher::next] - /// and [`next_match()`][Searcher::next_match]. - /// - /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges - /// of this and [`next_match`][Searcher::next_match] will overlap. - #[inline] - fn next_reject(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } -} - -/// A reverse searcher for a string pattern. -/// -/// This trait provides methods for searching for non-overlapping -/// matches of a pattern starting from the back (right) of a string. -/// -/// It will be implemented by associated [`Searcher`] -/// types of the [`Pattern`] trait if the pattern supports searching -/// for it from the back. -/// -/// The index ranges returned by this trait are not required -/// to exactly match those of the forward search in reverse. -/// -/// For the reason why this trait is marked unsafe, see the -/// parent trait [`Searcher`]. -pub unsafe trait ReverseSearcher: Searcher { - /// Performs the next search step starting from the back. - /// - /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` - /// matches the pattern. - /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` - /// can not match the pattern, even partially. - /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack - /// has been visited - /// - /// The stream of [`Match`][SearchStep::Match] and - /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] - /// will contain index ranges that are adjacent, non-overlapping, - /// covering the whole haystack, and laying on utf8 boundaries. - /// - /// A [`Match`][SearchStep::Match] result needs to contain the whole matched - /// pattern, however [`Reject`][SearchStep::Reject] results may be split up - /// into arbitrary many adjacent fragments. Both ranges may have zero length. 
- /// - /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` - /// might produce the stream - /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`. - fn next_back(&mut self) -> SearchStep; - - /// Finds the next [`Match`][SearchStep::Match] result. - /// See [`next_back()`][ReverseSearcher::next_back]. - #[inline] - fn next_match_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next_back() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } - - /// Finds the next [`Reject`][SearchStep::Reject] result. - /// See [`next_back()`][ReverseSearcher::next_back]. - #[inline] - fn next_reject_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next_back() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } -} - -/// A marker trait to express that a [`ReverseSearcher`] -/// can be used for a [`DoubleEndedIterator`] implementation. -/// -/// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need -/// to follow these conditions: -/// -/// - All results of `next()` need to be identical -/// to the results of `next_back()` in reverse order. -/// - `next()` and `next_back()` need to behave as -/// the two ends of a range of values, that is they -/// can not "walk past each other". -/// -/// # Examples -/// -/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a -/// [`char`] only requires looking at one at a time, which behaves the same -/// from both ends. -/// -/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because -/// the pattern `"aa"` in the haystack `"aaa"` matches as either -/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. 
-pub trait DoubleEndedSearcher: ReverseSearcher {} - ///////////////////////////////////////////////////////////////////////////// // Impl for Haystack ///////////////////////////////////////////////////////////////////////////// diff --git a/library/core/tests/pattern.rs b/library/core/tests/pattern.rs index d4bec996d89a1..0e943bd80ec7f 100644 --- a/library/core/tests/pattern.rs +++ b/library/core/tests/pattern.rs @@ -1,4 +1,4 @@ -use std::str::pattern::*; +use std::pattern::*; // This macro makes it easier to write // tests that do a series of iterations diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 318a46d1b637e..5bbdc1e0d984c 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -491,6 +491,8 @@ pub use core::mem; pub use core::ops; #[stable(feature = "rust1", since = "1.0.0")] pub use core::option; +#[unstable(feature = "pattern", issue = "27721")] +pub use core::pattern; #[stable(feature = "pin", since = "1.33.0")] pub use core::pin; #[stable(feature = "rust1", since = "1.0.0")] From dedea7badd9e412f93cc388fce61ae496071033d Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Mon, 20 Feb 2023 03:15:14 +0100 Subject: [PATCH 03/12] core: introduce internal core::pattern::{Split,SplitN} types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce core::pattern::Split and core::pattern::SplitN internal types which can be used to implement iterators splitting haystack into parts. Convert str’s Split-family of iterators to use them. In the future, more haystacks will use those internal types.
Co-authored-by: Peter Jaszkowiak --- library/core/src/pattern.rs | 242 +++++++++++++++++++++++++++++++++++ library/core/src/str/iter.rs | 206 +++++------------------------ library/core/src/str/mod.rs | 22 +--- 3 files changed, 281 insertions(+), 189 deletions(-) diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs index 9fb50680a4058..3ab2fec3806b3 100644 --- a/library/core/src/pattern.rs +++ b/library/core/src/pattern.rs @@ -36,6 +36,8 @@ issue = "27721" )] +use crate::fmt; +use crate::mem::replace; use crate::ops::Range; /// A pattern which can be matched against a [`Haystack`]. @@ -360,3 +362,243 @@ pub unsafe trait ReverseSearcher: Searcher { /// in the haystack `"aaa"` matches as either `"[aa]a"` or `"a[aa]"`, depending /// from which side it is searched. pub trait DoubleEndedSearcher: ReverseSearcher {} + +////////////////////////////////////////////////////////////////////////////// +// Internal Split and SplitN implementations +////////////////////////////////////////////////////////////////////////////// + +/// Helper type for implementing split iterators. +/// +/// It’s a generic type which works with any [`Haystack`] and [`Searcher`] over +/// that haystack. Intended usage is to create a newtype wrapping this type +/// which implements iterator interface on top of [`next_fwd`][Split::next_fwd] +/// or [`next_bwd`][Split::next_bwd] methods. +/// +/// Note that unless `S` implements [`DoubleEndedSearcher`] trait, it’s +/// incorrect to use this type to implement a double ended iterator. +/// +/// For an example of this type in use, see [`core::str::Split`]. +#[unstable(feature = "pattern_internals", issue = "none")] +pub struct Split> { + /// Start of the region of the haystack yet to be examined. + start: H::Cursor, + /// End of the region of the haystack yet to be examined. + end: H::Cursor, + /// Searcher returning matches of the delimiter pattern.
+ searcher: S, + /// Whether to return an empty part if there’s delimiter at the end of the + /// haystack. + allow_trailing_empty: bool, + /// Whether splitting has finished. + finished: bool, +} + +/// Helper type for implementing split iterators with a split limit. +/// +/// It’s like [`Split`] but limits number of parts the haystack will be split +/// into. +#[unstable(feature = "pattern_internals", issue = "none")] +pub struct SplitN> { + /// Inner split implementation. + inner: Split, + /// Maximum number of parts the haystack can be split into. + limit: usize, +} + +impl + Clone> Clone for Split { + fn clone(&self) -> Self { + Self { searcher: self.searcher.clone(), ..*self } + } +} + +impl + Clone> Clone for SplitN { + fn clone(&self) -> Self { + Self { inner: self.inner.clone(), ..*self } + } +} + +impl fmt::Debug for Split +where + H: Haystack, + S: Searcher + fmt::Debug, +{ + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Split") + .field("start", &self.start) + .field("end", &self.end) + .field("searcher", &self.searcher) + .field("allow_trailing_empty", &self.allow_trailing_empty) + .field("finished", &self.finished) + .finish() + } +} + +impl fmt::Debug for SplitN +where + H: Haystack, + S: Searcher + fmt::Debug, +{ + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("SplitN").field("inner", &self.inner).field("limit", &self.limit).finish() + } +} + +impl> Split { + /// Creates a new object configured without a limit and with + /// `allow_trailing_empty` option disabled. + /// + /// To set `allow_trailing_empty`, use + /// [`with_allow_trailing_empty()`][Self::with_allow_trailing_empty] method. + /// To set split limit, use [`with_limit()`][Self::with_limit] method. 
+ pub fn new(searcher: S) -> Self { + let haystack = searcher.haystack(); + Self { + searcher, + start: haystack.cursor_at_front(), + end: haystack.cursor_at_back(), + allow_trailing_empty: false, + finished: false, + } + } + + /// Changes splits limit from unlimited to given value. + /// + /// The limit specifies maximum number of parts haystack will be split into. + pub fn with_limit(self, limit: usize) -> SplitN { + SplitN { inner: self, limit } + } + + /// Enables allow_trailing_empty option. + /// + /// If enabled (which is not the default), if the haystack is empty or + /// terminated by a pattern match, the last haystack part returned will be + /// empty. Otherwise, the last empty split is not returned. + pub fn with_allow_trailing_empty(mut self) -> Self { + self.allow_trailing_empty = true; + self + } +} + +impl> Split { + /// Returns next part of the haystack or `None` if splitting is done. + /// + /// If `INCLUSIVE` is `true`, returned value will include the matching + /// pattern. + pub fn next_fwd(&mut self) -> Option { + if self.finished { + return None; + } + let haystack = self.searcher.haystack(); + if let Some((start, end)) = self.searcher.next_match() { + let range = self.start..(if INCLUSIVE { end } else { start }); + self.start = end; + // SAFETY: self.start and self.end come from Haystack or Searcher + // and thus are guaranteed to be valid split positions. + Some(unsafe { haystack.get_unchecked(range) }) + } else { + self.get_end() + } + } + + /// Returns next looking from back of the haystack part of the haystack or + /// `None` if splitting is done. + /// + /// If `INCLUSIVE` is `true`, returned value will include the matching + /// pattern. 
+ pub fn next_bwd(&mut self) -> Option + where + S: ReverseSearcher, + { + if self.finished { + return None; + } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + if let Some(elt) = self.next_bwd::() { + if !elt.is_empty() { + return Some(elt); + } + } + if self.finished { + return None; + } + } + + let range = if let Some((start, end)) = self.searcher.next_match_back() { + end..replace(&mut self.end, if INCLUSIVE { end } else { start }) + } else { + self.finished = true; + self.start..self.end + }; + // SAFETY: All indices come from Haystack or Searcher which guarantee + // that they are valid split positions. + Some(unsafe { self.searcher.haystack().get_unchecked(range) }) + } + + /// Returns remaining part of the haystack that hasn’t been processed yet. + pub fn remainder(&self) -> Option { + (!self.finished).then(|| { + // SAFETY: self.start and self.end come from Haystack or Searcher + // and thus are guaranteed to be valid split positions. + unsafe { self.searcher.haystack().get_unchecked(self.start..self.end) } + }) + } + + /// Returns the final haystack part. + /// + /// Sets `finished` flag so any further calls to this or other methods will + /// return `None`. + fn get_end(&mut self) -> Option { + if !self.finished { + self.finished = true; + if self.allow_trailing_empty || self.start != self.end { + // SAFETY: self.start and self.end come from Haystack or + // Searcher and thus are guaranteed to be valid split positions. + return Some(unsafe { + self.searcher.haystack().get_unchecked(self.start..self.end) + }); + } + } + None + } +} + +impl> SplitN { + /// Returns next part of the haystack or `None` if splitting is done. + /// + /// If `INCLUSIVE` is `true`, returned value will include the matching + /// pattern. + pub fn next_fwd(&mut self) -> Option { + match self.dec_limit()? 
{ + 0 => self.inner.get_end(), + _ => self.inner.next_fwd::(), + } + } + + /// Returns next looking from back of the haystack part of the haystack or + /// `None` if splitting is done. + /// + /// If `INCLUSIVE` is `true`, returned value will include the matching + /// pattern. + pub fn next_bwd(&mut self) -> Option + where + S: ReverseSearcher, + { + match self.dec_limit()? { + 0 => self.inner.get_end(), + _ => self.inner.next_bwd::(), + } + } + + /// Returns remaining part of the haystack that hasn’t been processed yet. + pub fn remainder(&self) -> Option { + self.inner.remainder() + } + + /// Decrements limit and returns its new value or None if it’s already zero. + fn dec_limit(&mut self) -> Option { + self.limit = self.limit.checked_sub(1)?; + Some(self.limit) + } +} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index f5acc6e5441a1..a3016194dc001 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -557,17 +557,28 @@ macro_rules! generate_pattern_iterators { } => {} } -derive_pattern_clone! 
{ - clone SplitInternal - with |s| SplitInternal { matcher: s.matcher.clone(), ..*s } +pub(super) struct SplitInternal<'a, P: Pattern<&'a str>>( + core::pattern::Split<&'a str, P::Searcher>, +); + +impl<'a, P: Pattern<&'a str>> SplitInternal<'a, P> { + pub(super) fn new(haystack: &'a str, pattern: P) -> Self { + Self(core::pattern::Split::new(pattern.into_searcher(haystack))) + } + + pub(super) fn with_allow_trailing_empty(self) -> Self { + Self(self.0.with_allow_trailing_empty()) + } + + pub(super) fn with_limit(self, count: usize) -> SplitNInternal<'a, P> { + SplitNInternal(self.0.with_limit(count)) + } } -pub(super) struct SplitInternal<'a, P: Pattern<&'a str>> { - pub(super) start: usize, - pub(super) end: usize, - pub(super) matcher: P::Searcher, - pub(super) allow_trailing_empty: bool, - pub(super) finished: bool, +impl<'a, P: Pattern<&'a str, Searcher: Clone>> Clone for SplitInternal<'a, P> { + fn clone(&self) -> Self { + Self(self.0.clone()) + } } impl<'a, P> fmt::Debug for SplitInternal<'a, P> @@ -575,159 +586,35 @@ where P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitInternal") - .field("start", &self.start) - .field("end", &self.end) - .field("matcher", &self.matcher) - .field("allow_trailing_empty", &self.allow_trailing_empty) - .field("finished", &self.finished) - .finish() + self.0.fmt(f) } } impl<'a, P: Pattern<&'a str>> SplitInternal<'a, P> { - #[inline] - fn get_end(&mut self) -> Option<&'a str> { - if !self.finished { - self.finished = true; - - if self.allow_trailing_empty || self.end - self.start > 0 { - // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. 
- let string = unsafe { self.matcher.haystack().get_unchecked(self.start..self.end) }; - return Some(string); - } - } - - None - } - - #[inline] fn next(&mut self) -> Option<&'a str> { - if self.finished { - return None; - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match() { - // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. - Some((a, b)) => unsafe { - let elt = haystack.get_unchecked(self.start..a); - self.start = b; - Some(elt) - }, - None => self.get_end(), - } + self.0.next_fwd::() } - #[inline] fn next_inclusive(&mut self) -> Option<&'a str> { - if self.finished { - return None; - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match() { - // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, - // and self.start is either the start of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. - Some((_, b)) => unsafe { - let elt = haystack.get_unchecked(self.start..b); - self.start = b; - Some(elt) - }, - None => self.get_end(), - } + self.0.next_fwd::() } - #[inline] fn next_back(&mut self) -> Option<&'a str> where P::Searcher: ReverseSearcher<&'a str>, { - if self.finished { - return None; - } - - if !self.allow_trailing_empty { - self.allow_trailing_empty = true; - match self.next_back() { - Some(elt) if !elt.is_empty() => return Some(elt), - _ => { - if self.finished { - return None; - } - } - } - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match_back() { - // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. - Some((a, b)) => unsafe { - let elt = haystack.get_unchecked(b..self.end); - self.end = a; - Some(elt) - }, - // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. 
- None => unsafe { - self.finished = true; - Some(haystack.get_unchecked(self.start..self.end)) - }, - } + self.0.next_bwd::() } - #[inline] fn next_back_inclusive(&mut self) -> Option<&'a str> where P::Searcher: ReverseSearcher<&'a str>, { - if self.finished { - return None; - } - - if !self.allow_trailing_empty { - self.allow_trailing_empty = true; - match self.next_back_inclusive() { - Some(elt) if !elt.is_empty() => return Some(elt), - _ => { - if self.finished { - return None; - } - } - } - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match_back() { - // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, - // and self.end is either the end of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. - Some((_, b)) => unsafe { - let elt = haystack.get_unchecked(b..self.end); - self.end = b; - Some(elt) - }, - // SAFETY: self.start is either the start of the original string, - // or start of a substring that represents the part of the string that hasn't - // iterated yet. Either way, it is guaranteed to lie on unicode boundary. - // self.end is either the end of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. - None => unsafe { - self.finished = true; - Some(haystack.get_unchecked(self.start..self.end)) - }, - } + self.0.next_bwd::() } - #[inline] fn remainder(&self) -> Option<&'a str> { - // `Self::get_end` doesn't change `self.start` - if self.finished { - return None; - } - - // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. - Some(unsafe { self.matcher.haystack().get_unchecked(self.start..self.end) }) + self.0.remainder() } } @@ -861,41 +748,26 @@ impl<'a, P: Pattern<&'a str>> RSplitTerminator<'a, P> { derive_pattern_clone! 
{ clone SplitNInternal - with |s| SplitNInternal { iter: s.iter.clone(), ..*s } + with |s| SplitNInternal(s.0.clone()) } -pub(super) struct SplitNInternal<'a, P: Pattern<&'a str>> { - pub(super) iter: SplitInternal<'a, P>, - /// The number of splits remaining - pub(super) count: usize, -} +pub(super) struct SplitNInternal<'a, P: Pattern<&'a str>>( + core::pattern::SplitN<&'a str, P::Searcher>, +); impl<'a, P> fmt::Debug for SplitNInternal<'a, P> where P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitNInternal") - .field("iter", &self.iter) - .field("count", &self.count) - .finish() + self.0.fmt(f) } } impl<'a, P: Pattern<&'a str>> SplitNInternal<'a, P> { #[inline] fn next(&mut self) -> Option<&'a str> { - match self.count { - 0 => None, - 1 => { - self.count = 0; - self.iter.get_end() - } - _ => { - self.count -= 1; - self.iter.next() - } - } + self.0.next_fwd::() } #[inline] @@ -903,22 +775,12 @@ impl<'a, P: Pattern<&'a str>> SplitNInternal<'a, P> { where P::Searcher: ReverseSearcher<&'a str>, { - match self.count { - 0 => None, - 1 => { - self.count = 0; - self.iter.get_end() - } - _ => { - self.count -= 1; - self.iter.next_back() - } - } + self.0.next_bwd::() } #[inline] fn remainder(&self) -> Option<&'a str> { - self.iter.remainder() + self.0.remainder() } } diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 2d0e4da1bd520..3be15d932a0a7 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -72,8 +72,8 @@ pub use iter::SplitInclusive; pub use validations::{next_code_point, utf8_char_width}; use iter::MatchIndicesInternal; +use iter::MatchesInternal; use iter::SplitInternal; -use iter::{MatchesInternal, SplitNInternal}; #[inline(never)] #[cold] @@ -1330,13 +1330,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn split<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Split<'a, P> { - Split(SplitInternal { - start: 
0, - end: self.len(), - matcher: pat.into_searcher(self), - allow_trailing_empty: true, - finished: false, - }) + Split(SplitInternal::new(self, pat).with_allow_trailing_empty()) } /// An iterator over substrings of this string slice, separated by @@ -1370,13 +1364,7 @@ impl str { #[stable(feature = "split_inclusive", since = "1.51.0")] #[inline] pub fn split_inclusive<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> SplitInclusive<'a, P> { - SplitInclusive(SplitInternal { - start: 0, - end: self.len(), - matcher: pat.into_searcher(self), - allow_trailing_empty: false, - finished: false, - }) + SplitInclusive(SplitInternal::new(self, pat)) } /// An iterator over substrings of the given string slice, separated by @@ -1476,7 +1464,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn split_terminator<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> SplitTerminator<'a, P> { - SplitTerminator(SplitInternal { allow_trailing_empty: false, ..self.split(pat).0 }) + SplitTerminator(SplitInternal::new(self, pat)) } /// An iterator over substrings of `self`, separated by characters @@ -1577,7 +1565,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn splitn<'a, P: Pattern<&'a str>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { - SplitN(SplitNInternal { iter: self.split(pat).0, count: n }) + SplitN(self.split(pat).0.with_limit(n)) } /// An iterator over substrings of this string slice, separated by a From d01dc9368c263e36f8653c0578b56efa00fa4295 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 22 Feb 2023 16:12:08 +0100 Subject: [PATCH 04/12] core: add core::pattern::EmptyNeedleSearcher internal type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce core::pattern::EmptyNeedleSearcher internal type which implements logic for matching an empty pattern against a haystack. Convert core::str::pattern::StrSearcher to use it. 
In the future more implementations will take advantage of it. Also adapt and rework TwoWayStrategy into an internal SearchResult trait which abstracts differences between Searcher’s next, next_match and next_reject methods. It makes it simpler to write a single generic method implementing optimised versions of all those calls. --- library/core/src/pattern.rs | 215 ++++++++++++++++++++++++++- library/core/src/str/pattern.rs | 256 ++++++++++---------------------- 2 files changed, 294 insertions(+), 177 deletions(-) diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs index 3ab2fec3806b3..835ca0459eea3 100644 --- a/library/core/src/pattern.rs +++ b/library/core/src/pattern.rs @@ -37,7 +37,7 @@ )] use crate::fmt; -use crate::mem::replace; +use crate::mem::{replace, take}; use crate::ops::Range; /// A pattern which can be matched against a [`Haystack`]. @@ -201,6 +201,84 @@ pub enum SearchStep { Done, } +/// Possible return type of a search. +/// +/// It abstracts differences between `next`, `next_match` and `next_reject` +/// methods. Depending on return type an implementation for those functions +/// will generate matches and rejects, only matches or only rejects. +#[unstable(feature = "pattern_internals", issue = "none")] +pub trait SearchResult: Sized + sealed::Sealed { + /// Value indicating searching has finished. + const DONE: Self; + + /// Returns value describing a match or `None` if this implementation + /// doesn’t care about matches. + fn matching(start: T, end: T) -> Option; + + /// Returns value describing a reject or `None` if this implementation + /// doesn’t care about rejects. + fn rejecting(start: T, end: T) -> Option; +} + +/// A wrapper for result type which only carries information about matches. +#[unstable(feature = "pattern_internals", issue = "none")] +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub struct MatchOnly(pub Option<(T, T)>); + +/// A wrapper for result type which only carries information about rejects.
+#[unstable(feature = "pattern_internals", issue = "none")] +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub struct RejectOnly(pub Option<(T, T)>); + +impl SearchResult for SearchStep { + const DONE: Self = SearchStep::Done; + + #[inline(always)] + fn matching(s: T, e: T) -> Option { + Some(SearchStep::Match(s, e)) + } + + #[inline(always)] + fn rejecting(s: T, e: T) -> Option { + Some(SearchStep::Reject(s, e)) + } +} + +impl SearchResult for MatchOnly { + const DONE: Self = Self(None); + + #[inline(always)] + fn matching(s: T, e: T) -> Option { + Some(Self(Some((s, e)))) + } + + #[inline(always)] + fn rejecting(_s: T, _e: T) -> Option { + None + } +} + +impl SearchResult for RejectOnly { + const DONE: Self = Self(None); + + #[inline(always)] + fn matching(_s: T, _e: T) -> Option { + None + } + + #[inline(always)] + fn rejecting(s: T, e: T) -> Option { + Some(Self(Some((s, e)))) + } +} + +mod sealed { + pub trait Sealed {} + impl Sealed for super::SearchStep {} + impl Sealed for super::MatchOnly {} + impl Sealed for super::RejectOnly {} +} + /// A searcher for a string pattern. /// /// This trait provides methods for searching for non-overlapping matches of @@ -363,6 +441,141 @@ pub unsafe trait ReverseSearcher: Searcher { /// from which side it is searched. pub trait DoubleEndedSearcher: ReverseSearcher {} +////////////////////////////////////////////////////////////////////////////// +// Internal EmptyNeedleSearcher helper +////////////////////////////////////////////////////////////////////////////// + +/// Helper for implementing searchers looking for empty patterns. +/// +/// An empty pattern matches around every element of a haystack. For example, +/// within a `&str` it matches around every character. (This includes at the +/// beginning and end of the string). +/// +/// This struct helps implement searchers for empty patterns for various +/// haystacks. 
The only requirement is a function which advances the start +/// position or end position of the haystack range. +/// +/// # Examples +/// +/// ``` +/// #![feature(pattern, pattern_internals)] +/// use core::pattern::{EmptyNeedleSearcher, SearchStep}; +/// +/// let haystack = "fóó"; +/// let mut searcher = EmptyNeedleSearcher::new(haystack); +/// let advance = |range: core::ops::Range| { +/// range.start + haystack[range].chars().next().unwrap().len_utf8() +/// }; +/// let steps = core::iter::from_fn(|| { +/// match searcher.next_fwd(advance) { +/// SearchStep::Done => None, +/// step => Some(step) +/// } +/// }).collect::>(); +/// assert_eq!(&[ +/// SearchStep::Match(0, 0), +/// SearchStep::Reject(0, 1), +/// SearchStep::Match(1, 1), +/// SearchStep::Reject(1, 3), +/// SearchStep::Match(3, 3), +/// SearchStep::Reject(3, 5), +/// SearchStep::Match(5, 5), +/// ], steps.as_slice()); +/// ``` +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[unstable(feature = "pattern_internals", issue = "none")] +pub struct EmptyNeedleSearcher { + start: T, + end: T, + is_match_fwd: bool, + is_match_bwd: bool, + // Needed in case of an empty haystack, see #85462 + is_finished: bool, +} + +impl EmptyNeedleSearcher { + /// Creates a new empty needle searcher for given haystack. + /// + /// The haystack is used to initialise the range of valid cursors positions. + pub fn new>(haystack: H) -> Self { + Self { + start: haystack.cursor_at_front(), + end: haystack.cursor_at_back(), + is_match_bwd: true, + is_match_fwd: true, + is_finished: false, + } + } + + /// Returns next search result. + /// + /// The callback function is used to advance the **start** of the range the + /// searcher is working on. It is passed the current range of cursor + /// positions that weren’t visited yet and it must return the new start + /// cursor position. It’s never called with an empty range. 
For some + /// haystacks the callback may be as simple as a closure returning the start + /// incremented by one; others might require looking for a new valid + /// boundary. + pub fn next_fwd, F>(&mut self, advance_fwd: F) -> R + where + F: FnOnce(crate::ops::Range) -> T, + { + if self.is_finished { + return R::DONE; + } + if take(&mut self.is_match_fwd) { + if let Some(ret) = R::matching(self.start, self.start) { + return ret; + } + } + if self.start < self.end { + let pos = self.start; + self.start = advance_fwd(self.start..self.end); + if let Some(ret) = R::rejecting(pos, self.start) { + self.is_match_fwd = true; + return ret; + } + return R::matching(self.start, self.start).unwrap(); + } + self.is_finished = true; + R::DONE + } + + /// Returns next search result. + /// + /// The callback function is used to advance the **end** of the range the + /// searcher is working on backwards. It is passed the current range of + /// cursor positions that weren’t visited yet and it must return the new end + /// cursor position. It’s never called with an empty range. For some + /// haystacks the callback may be as simple as a closure returning the end + /// decremented by one; others might require looking for a new valid + /// boundary. 
+ pub fn next_bwd, F>(&mut self, advance_bwd: F) -> R + where + F: FnOnce(crate::ops::Range) -> T, + { + if self.is_finished { + return R::DONE; + } + if take(&mut self.is_match_bwd) { + if let Some(ret) = R::matching(self.end, self.end) { + return ret; + } + } + if self.start < self.end { + let pos = self.end; + self.end = advance_bwd(self.start..self.end); + if let Some(ret) = R::rejecting(self.end, pos) { + self.is_match_bwd = true; + return ret; + } + return R::matching(self.end, self.end).unwrap(); + } + self.is_finished = true; + R::DONE + } +} + ////////////////////////////////////////////////////////////////////////////// // Internal Split and SplitN implementations ////////////////////////////////////////////////////////////////////////////// diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index 4183b82413238..bb6232d92eecd 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -54,7 +54,8 @@ use crate::cmp::Ordering; use crate::fmt; use crate::ops::Range; use crate::pattern::{ - DoubleEndedSearcher, Haystack, Pattern, ReverseSearcher, SearchStep, Searcher, + DoubleEndedSearcher, Haystack, MatchOnly, Pattern, ReverseSearcher, SearchResult, SearchStep, + Searcher, }; use crate::slice::memchr; @@ -765,43 +766,36 @@ pub struct StrSearcher<'a, 'b> { #[derive(Clone, Debug)] enum StrSearcherImpl { - Empty(EmptyNeedle), + Empty(core::pattern::EmptyNeedleSearcher), TwoWay(TwoWaySearcher), } -#[derive(Clone, Debug)] -struct EmptyNeedle { - position: usize, - end: usize, - is_match_fw: bool, - is_match_bw: bool, - // Needed in case of an empty haystack, see #85462 - is_finished: bool, -} - impl<'a, 'b> StrSearcher<'a, 'b> { fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { - if needle.is_empty() { - StrSearcher { - haystack, - needle, - searcher: StrSearcherImpl::Empty(EmptyNeedle { - position: 0, - end: haystack.len(), - is_match_fw: true, - is_match_bw: true, - is_finished: false, - 
}), - } + let searcher = if needle.is_empty() { + StrSearcherImpl::Empty(core::pattern::EmptyNeedleSearcher::new(haystack)) } else { - StrSearcher { - haystack, - needle, - searcher: StrSearcherImpl::TwoWay(TwoWaySearcher::new( - needle.as_bytes(), - haystack.len(), - )), - } + StrSearcherImpl::TwoWay(TwoWaySearcher::new(needle.as_bytes(), haystack.len())) + }; + StrSearcher { haystack, needle, searcher } + } + + fn fwd_char(haystack: &str, pos: usize) -> usize { + pos + super::utf8_char_width(haystack.as_bytes()[pos]) + } + + fn bwd_char(haystack: &str, pos: usize) -> usize { + // Note: we are guaranteed to operate on valid UTF-8 thus we will never + // need to go further than four bytes back. + let bytes = haystack.as_bytes(); + if bytes[pos - 1].is_utf8_char_boundary() { + pos - 1 + } else if bytes[pos - 2].is_utf8_char_boundary() { + pos - 2 + } else if bytes[pos - 3].is_utf8_char_boundary() { + pos - 3 + } else { + pos - 4 } } } @@ -816,24 +810,7 @@ unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> { fn next(&mut self) -> SearchStep { match self.searcher { StrSearcherImpl::Empty(ref mut searcher) => { - if searcher.is_finished { - return SearchStep::Done; - } - // empty needle rejects every char and matches every empty string between them - let is_match = searcher.is_match_fw; - searcher.is_match_fw = !searcher.is_match_fw; - let pos = searcher.position; - match self.haystack[pos..].chars().next() { - _ if is_match => SearchStep::Match(pos, pos), - None => { - searcher.is_finished = true; - SearchStep::Done - } - Some(ch) => { - searcher.position += ch.len_utf8(); - SearchStep::Reject(pos, searcher.position) - } - } + searcher.next_fwd(|range| Self::fwd_char(self.haystack, range.start)) } StrSearcherImpl::TwoWay(ref mut searcher) => { // TwoWaySearcher produces valid *Match* indices that split at char boundaries @@ -845,11 +822,7 @@ unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> { return SearchStep::Done; } let is_long = 
searcher.memory == usize::MAX; - match searcher.next::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - is_long, - ) { + match searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), is_long) { SearchStep::Reject(a, mut b) => { // skip to next char boundary while !self.haystack.is_char_boundary(b) { @@ -867,29 +840,23 @@ unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { match self.searcher { - StrSearcherImpl::Empty(..) => loop { - match self.next() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - SearchStep::Reject(..) => {} - } - }, + StrSearcherImpl::Empty(ref mut searcher) => { + searcher + .next_fwd::(|range| Self::fwd_char(self.haystack, range.start)) + .0 + } StrSearcherImpl::TwoWay(ref mut searcher) => { let is_long = searcher.memory == usize::MAX; // write out `true` and `false` cases to encourage the compiler // to specialize the two cases separately. 
if is_long { - searcher.next::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - true, - ) + searcher + .next::(self.haystack.as_bytes(), self.needle.as_bytes(), true) + .0 } else { - searcher.next::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - false, - ) + searcher + .next::(self.haystack.as_bytes(), self.needle.as_bytes(), false) + .0 } } } @@ -901,34 +868,15 @@ unsafe impl<'a, 'b> ReverseSearcher<&'a str> for StrSearcher<'a, 'b> { fn next_back(&mut self) -> SearchStep { match self.searcher { StrSearcherImpl::Empty(ref mut searcher) => { - if searcher.is_finished { - return SearchStep::Done; - } - let is_match = searcher.is_match_bw; - searcher.is_match_bw = !searcher.is_match_bw; - let end = searcher.end; - match self.haystack[..end].chars().next_back() { - _ if is_match => SearchStep::Match(end, end), - None => { - searcher.is_finished = true; - SearchStep::Done - } - Some(ch) => { - searcher.end -= ch.len_utf8(); - SearchStep::Reject(searcher.end, end) - } - } + searcher.next_bwd(|range| Self::bwd_char(self.haystack, range.end)) } StrSearcherImpl::TwoWay(ref mut searcher) => { if searcher.end == 0 { return SearchStep::Done; } let is_long = searcher.memory == usize::MAX; - match searcher.next_back::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - is_long, - ) { + match searcher.next_back(self.haystack.as_bytes(), self.needle.as_bytes(), is_long) + { SearchStep::Reject(mut a, b) => { // skip to next char boundary while !self.haystack.is_char_boundary(a) { @@ -946,28 +894,30 @@ unsafe impl<'a, 'b> ReverseSearcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)> { match self.searcher { - StrSearcherImpl::Empty(..) => loop { - match self.next_back() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - SearchStep::Reject(..) 
=> {} - } - }, + StrSearcherImpl::Empty(ref mut searcher) => { + searcher + .next_bwd::(|range| Self::bwd_char(self.haystack, range.end)) + .0 + } StrSearcherImpl::TwoWay(ref mut searcher) => { let is_long = searcher.memory == usize::MAX; // write out `true` and `false`, like `next_match` if is_long { - searcher.next_back::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - true, - ) + searcher + .next_back::( + self.haystack.as_bytes(), + self.needle.as_bytes(), + true, + ) + .0 } else { - searcher.next_back::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - false, - ) + searcher + .next_back::( + self.haystack.as_bytes(), + self.needle.as_bytes(), + false, + ) + .0 } } } @@ -1155,10 +1105,7 @@ impl TwoWaySearcher { // How far we can jump when we encounter a mismatch is all based on the fact // that (u, v) is a critical factorization for the needle. #[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output - where - S: TwoWayStrategy, - { + fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> R { // `next()` uses `self.position` as its cursor let old_pos = self.position; let needle_last = needle.len() - 1; @@ -1170,12 +1117,14 @@ impl TwoWaySearcher { Some(&b) => b, None => { self.position = haystack.len(); - return S::rejecting(old_pos, self.position); + return R::rejecting(old_pos, self.position).unwrap_or(R::DONE); } }; - if S::use_early_reject() && old_pos != self.position { - return S::rejecting(old_pos, self.position); + if old_pos != self.position { + if let Some(ret) = R::rejecting(old_pos, self.position) { + return ret; + } } // Quickly skip by large portions unrelated to our substring @@ -1221,7 +1170,7 @@ impl TwoWaySearcher { self.memory = 0; // set to needle.len() - self.period for overlapping matches } - return S::matching(match_pos, match_pos + needle.len()); + return R::matching(match_pos, match_pos + needle.len()).unwrap(); } } @@ -1238,10 +1187,12 @@ impl TwoWaySearcher { // 
To search in reverse through the haystack, we search forward through // a reversed haystack with a reversed needle, matching first u' and then v'. #[inline] - fn next_back(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output - where - S: TwoWayStrategy, - { + fn next_back( + &mut self, + haystack: &[u8], + needle: &[u8], + long_period: bool, + ) -> R { // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` // are independent. let old_end = self.end; @@ -1254,12 +1205,14 @@ impl TwoWaySearcher { Some(&b) => b, None => { self.end = 0; - return S::rejecting(0, old_end); + return R::rejecting(0, old_end).unwrap_or(R::DONE); } }; - if S::use_early_reject() && old_end != self.end { - return S::rejecting(self.end, old_end); + if old_end != self.end { + if let Some(ret) = R::rejecting(self.end, old_end) { + return ret; + } } // Quickly skip by large portions unrelated to our substring @@ -1307,7 +1260,7 @@ impl TwoWaySearcher { self.memory_back = needle.len(); } - return S::matching(match_pos, match_pos + needle.len()); + return R::matching(match_pos, match_pos + needle.len()).unwrap(); } } @@ -1410,55 +1363,6 @@ impl TwoWaySearcher { } } -// TwoWayStrategy allows the algorithm to either skip non-matches as quickly -// as possible, or to work in a mode where it emits Rejects relatively quickly. 
-trait TwoWayStrategy { - type Output; - fn use_early_reject() -> bool; - fn rejecting(a: usize, b: usize) -> Self::Output; - fn matching(a: usize, b: usize) -> Self::Output; -} - -/// Skip to match intervals as quickly as possible -enum MatchOnly {} - -impl TwoWayStrategy for MatchOnly { - type Output = Option<(usize, usize)>; - - #[inline] - fn use_early_reject() -> bool { - false - } - #[inline] - fn rejecting(_a: usize, _b: usize) -> Self::Output { - None - } - #[inline] - fn matching(a: usize, b: usize) -> Self::Output { - Some((a, b)) - } -} - -/// Emit Rejects regularly -enum RejectAndMatch {} - -impl TwoWayStrategy for RejectAndMatch { - type Output = SearchStep; - - #[inline] - fn use_early_reject() -> bool { - true - } - #[inline] - fn rejecting(a: usize, b: usize) -> Self::Output { - SearchStep::Reject(a, b) - } - #[inline] - fn matching(a: usize, b: usize) -> Self::Output { - SearchStep::Match(a, b) - } -} - /// SIMD search for short needles based on /// Wojciech Muła's "SIMD-friendly algorithms for substring searching"[0] /// From 4149fcd25e19935c28b1b647b2338495e0a059a1 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Fri, 17 Feb 2023 14:28:11 +0100 Subject: [PATCH 05/12] core: add try_next_code_point{,_reverse} internal functions --- library/core/src/str/mod.rs | 4 +- library/core/src/str/validations.rs | 241 ++++++++++++++++++++-------- 2 files changed, 175 insertions(+), 70 deletions(-) diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 3be15d932a0a7..a3c1a0e80e2ea 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -69,7 +69,9 @@ pub use iter::SplitAsciiWhitespace; pub use iter::SplitInclusive; #[unstable(feature = "str_internals", issue = "none")] -pub use validations::{next_code_point, utf8_char_width}; +pub use validations::{ + next_code_point, try_next_code_point, try_next_code_point_reverse, utf8_char_width, +}; use iter::MatchIndicesInternal; use iter::MatchesInternal; diff --git 
a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 2acef432f2063..b4a183711a1ee 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -120,6 +120,87 @@ const fn contains_nonascii(x: usize) -> bool { (x & NONASCII_MASK) != 0 } +/// Reads the first code point out of a byte slice validating whether it’s +/// valid. +/// +/// This is different than [`next_code_point`] in that it doesn’t assume +/// argument is well-formed UTF-8-like string. Together with the character its +/// encoded length is returned. +/// +/// If front of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that +/// includes a WTF-8 encoded surrogate) returns `None`. +/// +/// ``` +/// #![feature(str_internals)] +/// use core::str::try_next_code_point; +/// +/// assert_eq!(Some(('f', 1)), try_next_code_point(b"foo".as_ref())); +/// assert_eq!(Some(('Ż', 2)), try_next_code_point("Żółw".as_bytes())); +/// assert_eq!(None, try_next_code_point(b"\xffoo".as_ref())); +/// ``` +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub const fn try_next_code_point(bytes: &[u8]) -> Option<(char, usize)> { + let first = match bytes.first() { + Some(&byte) => byte, + None => return None, + }; + let (value, length) = if first < 0x80 { + (first as u32, 1) + } else if let Ok((cp, len)) = try_finish_byte_sequence(first, bytes, 0) { + (cp, len) + } else { + return None; + }; + // SAFETY: We’ve just verified value is correct Unicode scalar value. + // Either ASCII (first branch of the if-else-if-else) or non-ASCII Unicode + // character (second branch). + Some((unsafe { char::from_u32_unchecked(value) }, length)) +} + +/// Reads the last code point out of a byte slice validating whether it’s +/// valid. +/// +/// This is different than `next_code_point_reverse` in that it doesn’t assume +/// argument is well-formed UTF-8-like string. Together with the character its +/// encoded length is returned. 
+/// +/// If back of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that +/// includes a WTF-8 encoded surrogate) returns `None`. +/// +/// ``` +/// #![feature(str_internals)] +/// use core::str::try_next_code_point_reverse; +/// +/// assert_eq!(Some(('o', 1)), try_next_code_point_reverse(b"foo".as_ref())); +/// assert_eq!(Some(('‽', 3)), try_next_code_point_reverse("Uh‽".as_bytes())); +/// assert_eq!(None, try_next_code_point_reverse(b"foo\xff".as_ref())); +/// ``` +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub const fn try_next_code_point_reverse(bytes: &[u8]) -> Option<(char, usize)> { + let mut n = 1; + let limit = bytes.len(); + let limit = if limit < 4 { limit } else { 4 }; // not .min(4) because of const + while n <= limit && !bytes[bytes.len() - n].is_utf8_char_boundary() { + n += 1; + } + if n <= limit { + // It’s not clear to me why, but range indexing isn’t const here, + // i.e. `&bytes[bytes.len() - n..]` doesn’t compile. Because of that + // I’m resorting to unsafe block with from_raw_parts. + // SAFETY: n ≤ limit ≤ bytes.len() thus bytes.len() - n ≥ 0 and we + // have n remaining bytes. + let bytes = unsafe { crate::slice::from_raw_parts(bytes.as_ptr().add(bytes.len() - n), n) }; + if let Some((chr, len)) = try_next_code_point(bytes) { + if n == len { + return Some((chr, len)); + } + } + } + None +} + /// Walks through `v` checking that it's a valid UTF-8 sequence, /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. #[inline(always)] @@ -134,78 +215,13 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { let align = v.as_ptr().align_offset(usize_bytes); while index < len { - let old_offset = index; - macro_rules! err { - ($error_len: expr) => { - return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }) - }; - } - - macro_rules! next { - () => {{ - index += 1; - // we needed data, but there was none: error! 
- if index >= len { - err!(None) - } - v[index] - }}; - } - + let valid_up_to = index; let first = v[index]; if first >= 128 { - let w = utf8_char_width(first); - // 2-byte encoding is for codepoints \u{0080} to \u{07ff} - // first C2 80 last DF BF - // 3-byte encoding is for codepoints \u{0800} to \u{ffff} - // first E0 A0 80 last EF BF BF - // excluding surrogates codepoints \u{d800} to \u{dfff} - // ED A0 80 to ED BF BF - // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff - // first F0 90 80 80 last F4 8F BF BF - // - // Use the UTF-8 syntax from the RFC - // - // https://tools.ietf.org/html/rfc3629 - // UTF8-1 = %x00-7F - // UTF8-2 = %xC2-DF UTF8-tail - // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - // %xF4 %x80-8F 2( UTF8-tail ) - match w { - 2 => { - if next!() as i8 >= -64 { - err!(Some(1)) - } - } - 3 => { - match (first, next!()) { - (0xE0, 0xA0..=0xBF) - | (0xE1..=0xEC, 0x80..=0xBF) - | (0xED, 0x80..=0x9F) - | (0xEE..=0xEF, 0x80..=0xBF) => {} - _ => err!(Some(1)), - } - if next!() as i8 >= -64 { - err!(Some(2)) - } - } - 4 => { - match (first, next!()) { - (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} - _ => err!(Some(1)), - } - if next!() as i8 >= -64 { - err!(Some(2)) - } - if next!() as i8 >= -64 { - err!(Some(3)) - } - } - _ => err!(Some(1)), + match try_finish_byte_sequence(first, v, index) { + Ok((_value, length)) => index += length, + Err(error_len) => return Err(Utf8Error { valid_up_to, error_len }), } - index += 1; } else { // Ascii case, try to skip forward quickly. // When the pointer is aligned, read 2 words of data per iteration @@ -241,6 +257,93 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { Ok(()) } +/// Try to finish a UTF-8 byte sequence. +/// +/// Assumes that `bytes[index] == first` and that `first >= 128`, i.e.
that +/// `index` points at the beginning of a non-ASCII UTF-8 sequence in `bytes`. +/// +/// If the byte sequence at the index is correct, returns decoded code point and +/// length of the sequence. If it was invalid returns number of invalid bytes +/// or None if read was cut short. +#[inline(always)] +#[rustc_const_unstable(feature = "str_internals", issue = "none")] +const fn try_finish_byte_sequence( + first: u8, + bytes: &[u8], + index: usize, +) -> Result<(u32, usize), Option> { + macro_rules! get { + (raw $offset:expr) => { + if index + $offset < bytes.len() { + bytes[index + $offset] + } else { + return Err(None) + } + }; + (cont $offset:expr) => {{ + let byte = get!(raw $offset); + if !utf8_is_cont_byte(byte) { + return Err(Some($offset as u8)) + } + byte + }} + } + + // 2-byte encoding is for codepoints \u{0080} to \u{07ff} + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u{0800} to \u{ffff} + // first E0 A0 80 last EF BF BF + // excluding surrogates codepoints \u{d800} to \u{dfff} + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u{10000} to \u{10ffff} + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match utf8_char_width(first) { + 2 => { + let second = get!(cont 1); + let value = utf8_first_byte(first, 2); + let value = utf8_acc_cont_byte(value, second); + Ok((value, 2)) + } + 3 => { + let second = get!(raw 1); + match (first, second) { + (0xE0, 0xA0..=0xBF) + | (0xE1..=0xEC, 0x80..=0xBF) + | (0xED, 0x80..=0x9F) + | (0xEE..=0xEF, 0x80..=0xBF) => {} + _ => return Err(Some(1)), + } + let value = utf8_first_byte(first, 3); + let value = utf8_acc_cont_byte(value, second);
+ let value = utf8_acc_cont_byte(value, get!(cont 2)); + Ok((value, 3)) + } + 4 => { + let second = get!(raw 1); + match (first, second) { + (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} + _ => return Err(Some(1)), + } + let value = utf8_first_byte(first, 4); + let value = utf8_acc_cont_byte(value, second); + let value = utf8_acc_cont_byte(value, get!(cont 2)); + let value = utf8_acc_cont_byte(value, get!(cont 3)); + Ok((value, 4)) + } + _ => Err(Some(1)), + } +} + // https://tools.ietf.org/html/rfc3629 const UTF8_CHAR_WIDTH: &[u8; 256] = &[ // 1 2 3 4 5 6 7 8 9 A B C D E F From cc9bf610bbf1c99d947910f9fdb966e7b7e24ed7 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Mon, 27 Feb 2023 17:34:55 +0100 Subject: [PATCH 06/12] core: refactor tests/pattern.rs tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Firstly, combine functions and results lists into a single list with ‘function => result’ pairs. This makes it easier to match function with its result. Secondly, eliminate InRange step so that it’s easier to notice series of matches or rejects. --- library/core/tests/pattern.rs | 462 ++++++++++++++++++---------------- 1 file changed, 246 insertions(+), 216 deletions(-) diff --git a/library/core/tests/pattern.rs b/library/core/tests/pattern.rs index 0e943bd80ec7f..866c1375d92ca 100644 --- a/library/core/tests/pattern.rs +++ b/library/core/tests/pattern.rs @@ -3,10 +3,10 @@ use std::pattern::*; // This macro makes it easier to write // tests that do a series of iterations macro_rules! 
search_asserts { - ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => { + ($haystack:expr, $needle:expr, $testname:literal, $($func:ident => $result:expr),*) => { let mut searcher = $needle.into_searcher($haystack); - let arr = [$( Step::from(searcher.$func()) ),*]; - assert_eq!(&arr[..], &$result, $testname); + let arr = [$( searcher.$func().into_step(stringify!($func)) ),*]; + assert_eq!(&arr[..], &[$($result),*], $testname); } } @@ -17,26 +17,31 @@ enum Step { // be the same length for easy alignment Matches(usize, usize), Rejects(usize, usize), - InRange(usize, usize), Done, } -use self::Step::*; +use Step::*; -impl From for Step { - fn from(x: SearchStep) -> Self { - match x { - SearchStep::Match(a, b) => Matches(a, b), - SearchStep::Reject(a, b) => Rejects(a, b), +trait IntoStep { + fn into_step(self, method_name: &str) -> Step; +} + +impl IntoStep for SearchStep { + fn into_step(self, _name: &str) -> Step { + match self { + SearchStep::Match(s, e) => Matches(s, e), + SearchStep::Reject(s, e) => Rejects(s, e), SearchStep::Done => Done, } } } -impl From> for Step { - fn from(x: Option<(usize, usize)>) -> Self { - match x { - Some((a, b)) => InRange(a, b), +impl IntoStep for Option<(usize, usize)> { + fn into_step(self, method_name: &str) -> Step { + let is_reject = method_name.starts_with("next_reject"); + match self { + Some((s, e)) if is_reject => Rejects(s, e), + Some((s, e)) => Matches(s, e), None => Done, } } @@ -54,93 +59,74 @@ fn test_simple_iteration() { "abcdeabcd", 'a', "forward iteration for ASCII string", - // a b c d e a b c d EOF - [next, next, next, next, next, next, next, next, next, next], - [ - Matches(0, 1), - Rejects(1, 2), - Rejects(2, 3), - Rejects(3, 4), - Rejects(4, 5), - Matches(5, 6), - Rejects(6, 7), - Rejects(7, 8), - Rejects(8, 9), - Done - ] + next => Matches(0, 1), + next => Rejects(1, 2), + next => Rejects(2, 3), + next => Rejects(3, 4), + next => Rejects(4, 5), + next => Matches(5, 6), + next => 
Rejects(6, 7), + next => Rejects(7, 8), + next => Rejects(8, 9), + next => Done ); search_asserts!( "abcdeabcd", 'a', "reverse iteration for ASCII string", - // d c b a e d c b a EOF - [ - next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, - next_back, next_back - ], - [ - Rejects(8, 9), - Rejects(7, 8), - Rejects(6, 7), - Matches(5, 6), - Rejects(4, 5), - Rejects(3, 4), - Rejects(2, 3), - Rejects(1, 2), - Matches(0, 1), - Done - ] + next_back => Rejects(8, 9), + next_back => Rejects(7, 8), + next_back => Rejects(6, 7), + next_back => Matches(5, 6), + next_back => Rejects(4, 5), + next_back => Rejects(3, 4), + next_back => Rejects(2, 3), + next_back => Rejects(1, 2), + next_back => Matches(0, 1), + next_back => Done ); search_asserts!( "我爱我的猫", '我', "forward iteration for Chinese string", - // 我 愛 我 的 貓 EOF - [next, next, next, next, next, next], - [Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done] + next => Matches(0, 3), + next => Rejects(3, 6), + next => Matches(6, 9), + next => Rejects(9, 12), + next => Rejects(12, 15), + next => Done ); search_asserts!( "我的猫说meow", 'm', "forward iteration for mixed string", - // 我 的 猫 说 m e o w EOF - [next, next, next, next, next, next, next, next, next], - [ - Rejects(0, 3), - Rejects(3, 6), - Rejects(6, 9), - Rejects(9, 12), - Matches(12, 13), - Rejects(13, 14), - Rejects(14, 15), - Rejects(15, 16), - Done - ] + next => Rejects(0, 3), + next => Rejects(3, 6), + next => Rejects(6, 9), + next => Rejects(9, 12), + next => Matches(12, 13), + next => Rejects(13, 14), + next => Rejects(14, 15), + next => Rejects(15, 16), + next => Done ); search_asserts!( "我的猫说meow", '猫', "reverse iteration for mixed string", - // w o e m 说 猫 的 我 EOF - [ - next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, - next_back - ], - [ - Rejects(15, 16), - Rejects(14, 15), - Rejects(13, 14), - Rejects(12, 13), - Rejects(9, 12), - Matches(6, 9), - 
Rejects(3, 6), - Rejects(0, 3), - Done - ] + next_back => Rejects(15, 16), + next_back => Rejects(14, 15), + next_back => Rejects(13, 14), + next_back => Rejects(12, 13), + next_back => Rejects(9, 12), + next_back => Matches(6, 9), + next_back => Rejects(3, 6), + next_back => Rejects(0, 3), + next_back => Done ); } @@ -150,46 +136,43 @@ fn test_simple_search() { "abcdeabcdeabcde", 'a', "next_match for ASCII string", - [next_match, next_match, next_match, next_match], - [InRange(0, 1), InRange(5, 6), InRange(10, 11), Done] + next_match => Matches(0, 1), + next_match => Matches(5, 6), + next_match => Matches(10, 11), + next_match => Done ); search_asserts!( "abcdeabcdeabcde", 'a', "next_match_back for ASCII string", - [next_match_back, next_match_back, next_match_back, next_match_back], - [InRange(10, 11), InRange(5, 6), InRange(0, 1), Done] + next_match_back => Matches(10, 11), + next_match_back => Matches(5, 6), + next_match_back => Matches(0, 1), + next_match_back => Done ); search_asserts!( "abcdeab", 'a', "next_reject for ASCII string", - [next_reject, next_reject, next_match, next_reject, next_reject], - [InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done] + next_reject => Rejects(1, 2), + next_reject => Rejects(2, 3), + next_match => Matches(5, 6), + next_reject => Rejects(6, 7), + next_reject => Done ); search_asserts!( "abcdeabcdeabcde", 'a', "next_reject_back for ASCII string", - [ - next_reject_back, - next_reject_back, - next_match_back, - next_reject_back, - next_reject_back, - next_reject_back - ], - [ - InRange(14, 15), - InRange(13, 14), - InRange(10, 11), - InRange(9, 10), - InRange(8, 9), - InRange(7, 8) - ] + next_reject_back => Rejects(14, 15), + next_reject_back => Rejects(13, 14), + next_match_back => Matches(10, 11), + next_reject_back => Rejects(9, 10), + next_reject_back => Rejects(8, 9), + next_reject_back => Rejects(7, 8) ); } @@ -207,38 +190,31 @@ const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a"; #[test] fn test_stress_indices() { 
// this isn't really a test, more of documentation on the indices of each character in the stresstest string - search_asserts!( STRESS, - 'x', + |_| true, "Indices of characters in stress test", - [ - next, next, next, next, next, next, next, next, next, next, next, next, next, next, - next, next, next, next, next, next, next - ], - [ - Rejects(0, 2), // Á - Rejects(2, 3), // a - Rejects(3, 7), // 🁀 - Rejects(7, 8), // b - Rejects(8, 10), // Á - Rejects(10, 13), // ꁁ - Rejects(13, 14), // f - Rejects(14, 15), // g - Rejects(15, 19), // 😀 - Rejects(19, 22), // 각 - Rejects(22, 25), // ก - Rejects(25, 28), // ᘀ - Rejects(28, 31), // 각 - Rejects(31, 32), // a - Rejects(32, 34), // Á - Rejects(34, 37), // 각 - Rejects(37, 40), // ꁁ - Rejects(40, 43), // ก - Rejects(43, 47), // 😀 - Rejects(47, 48), // a - Done - ] + next => Matches(0, 2), // Á + next => Matches(2, 3), // a + next => Matches(3, 7), // 🁀 + next => Matches(7, 8), // b + next => Matches(8, 10), // Á + next => Matches(10, 13), // ꁁ + next => Matches(13, 14), // f + next => Matches(14, 15), // g + next => Matches(15, 19), // 😀 + next => Matches(19, 22), // 각 + next => Matches(22, 25), // ก + next => Matches(25, 28), // ᘀ + next => Matches(28, 31), // 각 + next => Matches(31, 32), // a + next => Matches(32, 34), // Á + next => Matches(34, 37), // 각 + next => Matches(37, 40), // ꁁ + next => Matches(40, 43), // ก + next => Matches(43, 47), // 😀 + next => Matches(47, 48), // a + next => Done ); } @@ -248,96 +224,113 @@ fn test_forward_search_shared_bytes() { STRESS, 'Á', "Forward search for two-byte Latin character", - [next_match, next_match, next_match, next_match], - [InRange(0, 2), InRange(8, 10), InRange(32, 34), Done] + next_match => Matches(0, 2), + next_match => Matches(8, 10), + next_match => Matches(32, 34), + next_match => Done ); search_asserts!( STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works", - [next_match, next, next_match, next, next_match, next, next_match], - 
[ - InRange(0, 2), - Rejects(2, 3), - InRange(8, 10), - Rejects(10, 13), - InRange(32, 34), - Rejects(34, 37), - Done - ] + next_match => Matches(0, 2), + next => Rejects(2, 3), + next_match => Matches(8, 10), + next => Rejects(10, 13), + next_match => Matches(32, 34), + next => Rejects(34, 37), + next_match => Done ); search_asserts!( STRESS, '각', "Forward search for three-byte Hangul character", - [next_match, next, next_match, next_match, next_match], - [InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done] + next_match => Matches(19, 22), + next => Rejects(22, 25), + next_match => Matches(28, 31), + next_match => Matches(34, 37), + next_match => Done ); search_asserts!( STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works", - [next_match, next, next_match, next, next_match, next, next_match], - [ - InRange(19, 22), - Rejects(22, 25), - InRange(28, 31), - Rejects(31, 32), - InRange(34, 37), - Rejects(37, 40), - Done - ] + next_match => Matches(19, 22), + next => Rejects(22, 25), + next_match => Matches(28, 31), + next => Rejects(31, 32), + next_match => Matches(34, 37), + next => Rejects(37, 40), + next_match => Done ); search_asserts!( STRESS, 'ก', "Forward search for three-byte Thai character", - [next_match, next, next_match, next, next_match], - [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + next_match => Matches(22, 25), + next => Rejects(25, 28), + next_match => Matches(40, 43), + next => Rejects(43, 47), + next_match => Done ); search_asserts!( STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + next_match => Matches(22, 25), + next => Rejects(25, 28), + next_match => Matches(40, 43), + next => Rejects(43, 47), + next_match => Done ); search_asserts!( STRESS, '😁', "Forward search for four-byte emoji", - 
[next_match, next, next_match, next, next_match], - [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + next_match => Matches(15, 19), + next => Rejects(19, 22), + next_match => Matches(43, 47), + next => Rejects(47, 48), + next_match => Done ); search_asserts!( STRESS, '😁', "Forward search for four-byte emoji; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + next_match => Matches(15, 19), + next => Rejects(19, 22), + next_match => Matches(43, 47), + next => Rejects(47, 48), + next_match => Done ); search_asserts!( STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes", - [next_match, next, next_match, next, next_match], - [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + next_match => Matches(10, 13), + next => Rejects(13, 14), + next_match => Matches(37, 40), + next => Rejects(40, 43), + next_match => Done ); search_asserts!( STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + next_match => Matches(10, 13), + next => Rejects(13, 14), + next_match => Matches(37, 40), + next => Rejects(40, 43), + next_match => Done ); } @@ -347,96 +340,112 @@ fn test_reverse_search_shared_bytes() { STRESS, 'Á', "Reverse search for two-byte Latin character", - [next_match_back, next_match_back, next_match_back, next_match_back], - [InRange(32, 34), InRange(8, 10), InRange(0, 2), Done] + next_match_back => Matches(32, 34), + next_match_back => Matches(8, 10), + next_match_back => Matches(0, 2), + next_match_back => Done ); search_asserts!( STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back, 
next_back], - [InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done] + next_match_back => Matches(32, 34), + next_back => Rejects(31, 32), + next_match_back => Matches(8, 10), + next_back => Rejects(7, 8), + next_match_back => Matches(0, 2), + next_back => Done ); search_asserts!( STRESS, '각', "Reverse search for three-byte Hangul character", - [next_match_back, next_back, next_match_back, next_match_back, next_match_back], - [InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done] + next_match_back => Matches(34, 37), + next_back => Rejects(32, 34), + next_match_back => Matches(28, 31), + next_match_back => Matches(19, 22), + next_match_back => Done ); search_asserts!( STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works", - [ - next_match_back, - next_back, - next_match_back, - next_back, - next_match_back, - next_back, - next_match_back - ], - [ - InRange(34, 37), - Rejects(32, 34), - InRange(28, 31), - Rejects(25, 28), - InRange(19, 22), - Rejects(15, 19), - Done - ] + next_match_back => Matches(34, 37), + next_back => Rejects(32, 34), + next_match_back => Matches(28, 31), + next_back => Rejects(25, 28), + next_match_back => Matches(19, 22), + next_back => Rejects(15, 19), + next_match_back => Done ); search_asserts!( STRESS, 'ก', "Reverse search for three-byte Thai character", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + next_match_back => Matches(40, 43), + next_back => Rejects(37, 40), + next_match_back => Matches(22, 25), + next_back => Rejects(19, 22), + next_match_back => Done ); search_asserts!( STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + 
next_match_back => Matches(40, 43), + next_back => Rejects(37, 40), + next_match_back => Matches(22, 25), + next_back => Rejects(19, 22), + next_match_back => Done ); search_asserts!( STRESS, '😁', "Reverse search for four-byte emoji", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + next_match_back => Matches(43, 47), + next_back => Rejects(40, 43), + next_match_back => Matches(15, 19), + next_back => Rejects(14, 15), + next_match_back => Done ); search_asserts!( STRESS, '😁', "Reverse search for four-byte emoji; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + next_match_back => Matches(43, 47), + next_back => Rejects(40, 43), + next_match_back => Matches(15, 19), + next_back => Rejects(14, 15), + next_match_back => Done ); search_asserts!( STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + next_match_back => Matches(37, 40), + next_back => Rejects(34, 37), + next_match_back => Matches(10, 13), + next_back => Rejects(8, 10), + next_match_back => Done ); search_asserts!( STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + next_match_back => Matches(37, 40), + next_back => Rejects(34, 37), + next_match_back => Matches(10, 13), + next_back => Rejects(8, 10), + next_match_back => Done ); } @@ -448,56 +457,77 @@ fn double_ended_regression_test() { "abcdeabcdeabcde", 'a', "alternating double ended search", - [next_match, 
next_match_back, next_match, next_match_back], - [InRange(0, 1), InRange(10, 11), InRange(5, 6), Done] + next_match => Matches(0, 1), + next_match_back => Matches(10, 11), + next_match => Matches(5, 6), + next_match_back => Done ); search_asserts!( "abcdeabcdeabcde", 'a', "triple double ended search for a", - [next_match, next_match_back, next_match_back, next_match_back], - [InRange(0, 1), InRange(10, 11), InRange(5, 6), Done] + next_match => Matches(0, 1), + next_match_back => Matches(10, 11), + next_match_back => Matches(5, 6), + next_match_back => Done ); search_asserts!( "abcdeabcdeabcde", 'd', "triple double ended search for d", - [next_match, next_match_back, next_match_back, next_match_back], - [InRange(3, 4), InRange(13, 14), InRange(8, 9), Done] + next_match => Matches(3, 4), + next_match_back => Matches(13, 14), + next_match_back => Matches(8, 9), + next_match_back => Done ); search_asserts!( STRESS, 'Á', "Double ended search for two-byte Latin character", - [next_match, next_match_back, next_match, next_match_back], - [InRange(0, 2), InRange(32, 34), InRange(8, 10), Done] + next_match => Matches(0, 2), + next_match_back => Matches(32, 34), + next_match => Matches(8, 10), + next_match_back => Done ); search_asserts!( STRESS, '각', "Reverse double ended search for three-byte Hangul character", - [next_match_back, next_back, next_match, next, next_match_back, next_match], - [InRange(34, 37), Rejects(32, 34), InRange(19, 22), Rejects(22, 25), InRange(28, 31), Done] + next_match_back => Matches(34, 37), + next_back => Rejects(32, 34), + next_match => Matches(19, 22), + next => Rejects(22, 25), + next_match_back => Matches(28, 31), + next_match => Done ); search_asserts!( STRESS, 'ก', "Double ended search for three-byte Thai character", - [next_match, next_back, next, next_match_back, next_match], - [InRange(22, 25), Rejects(47, 48), Rejects(25, 28), InRange(40, 43), Done] + next_match => Matches(22, 25), + next_back => Rejects(47, 48), + next => Rejects(25, 
28), + next_match_back => Matches(40, 43), + next_match => Done ); search_asserts!( STRESS, '😁', "Double ended search for four-byte emoji", - [next_match_back, next, next_match, next_back, next_match], - [InRange(43, 47), Rejects(0, 2), InRange(15, 19), Rejects(40, 43), Done] + next_match_back => Matches(43, 47), + next => Rejects(0, 2), + next_match => Matches(15, 19), + next_back => Rejects(40, 43), + next_match => Done ); search_asserts!( STRESS, 'ꁁ', "Double ended search for three-byte Yi character with repeated bytes", - [next_match, next, next_match_back, next_back, next_match], - [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(34, 37), Done] + next_match => Matches(10, 13), + next => Rejects(13, 14), + next_match_back => Matches(37, 40), + next_back => Rejects(34, 37), + next_match => Done ); } From c9dce1ba0b93c92f725f4a870dd639e3c7d02adf Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Mon, 20 Feb 2023 00:02:51 +0100 Subject: [PATCH 07/12] core: add internal core::str_bytes module handling string-like slices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new core::str_bytes module with types and functions which handle string-like bytes slices. String-like means that the code treats UTF-8 byte sequences as characters within such slices but doesn’t assume that the slices are well-formed. A `str` is trivially a bytes sequence that the module can handle but so is OsStr (which is WTF-8 on Windows and unstructured bytes on Unix). Move a bunch of code (most notably the implementation of the two-way string-matching algorithm) from core::str to core::str_bytes. Note that this likely introduces a regression in some of the str function performance (since the new code cannot assume well-formed UTF-8). This is going to be rectified by a following commit which will make it again possible for the code to assume bytes format. This is not done in this commit to keep it smaller.
--- library/alloc/tests/str.rs | 6 +- library/core/src/lib.rs | 2 + library/core/src/pattern.rs | 16 + library/core/src/str/pattern.rs | 816 ++---------------- library/core/src/str_bytes.rs | 1390 +++++++++++++++++++++++++++++++ library/core/tests/pattern.rs | 126 ++- 6 files changed, 1520 insertions(+), 836 deletions(-) create mode 100644 library/core/src/str_bytes.rs diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 57c7ad955e43a..8334c79e5cad5 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1972,7 +1972,7 @@ mod pattern { str_searcher_multibyte_haystack, " ", "├──", - [Reject(0, 3), Reject(3, 6), Reject(6, 9),] + [Reject(0, 9),] ); make_test!( str_searcher_empty_needle_multibyte_haystack, @@ -2008,13 +2008,13 @@ mod pattern { char_searcher_multibyte_haystack, ' ', "├──", - [Reject(0, 3), Reject(3, 6), Reject(6, 9),] + [Reject(0, 9),] ); make_test!( char_searcher_short_haystack, '\u{1F4A9}', "* \t", - [Reject(0, 1), Reject(1, 2), Reject(2, 3),] + [Reject(0, 3),] ); // See #85462 diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 79670a0d60adb..eae36759e6807 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -365,6 +365,8 @@ pub mod hash; pub mod pattern; pub mod slice; pub mod str; +#[allow(missing_docs)] +pub mod str_bytes; pub mod time; pub mod unicode; diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs index 835ca0459eea3..8fe632426cb66 100644 --- a/library/core/src/pattern.rs +++ b/library/core/src/pattern.rs @@ -211,6 +211,19 @@ pub trait SearchResult: Sized + sealed::Sealed { /// Value indicating searching has finished. const DONE: Self; + /// Whether search should return reject as soon as possible. + /// + /// For example, if a search can quickly determine that the very next + /// position cannot be where a next match starts, it should return a reject + /// with that position. 
This is an optimisation which allows the algorithm + /// to not waste time looking for the next match if caller is only + /// interested in the next position of a reject. + /// + /// If this is `true`, [`rejecting()`][Self::rejecting] is guaranteed to + /// return `Some` and if this is `false`, [`matching()`][Self::matching] is + /// guaranteed to return `Some`. + const USE_EARLY_REJECT: bool; + /// Returns value describing a match or `None` if this implementation /// doesn’t care about matches. fn matching(start: T, end: T) -> Option; @@ -232,6 +245,7 @@ pub struct RejectOnly(pub Option<(T, T)>); impl SearchResult for SearchStep { const DONE: Self = SearchStep::Done; + const USE_EARLY_REJECT: bool = false; #[inline(always)] fn matching(s: T, e: T) -> Option { @@ -246,6 +260,7 @@ impl SearchResult for SearchStep { impl SearchResult for MatchOnly { const DONE: Self = Self(None); + const USE_EARLY_REJECT: bool = false; #[inline(always)] fn matching(s: T, e: T) -> Option { @@ -260,6 +275,7 @@ impl SearchResult for MatchOnly { impl SearchResult for RejectOnly { const DONE: Self = Self(None); + const USE_EARLY_REJECT: bool = true; #[inline(always)] fn matching(_s: T, _e: T) -> Option { diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index bb6232d92eecd..fa3b9b4a04244 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -49,15 +49,13 @@ issue = "27721" )] -use crate::cmp; use crate::cmp::Ordering; use crate::fmt; use crate::ops::Range; use crate::pattern::{ - DoubleEndedSearcher, Haystack, MatchOnly, Pattern, ReverseSearcher, SearchResult, SearchStep, - Searcher, + DoubleEndedSearcher, Haystack, Pattern, ReverseSearcher, SearchStep, Searcher, }; -use crate::slice::memchr; +use crate::str_bytes; ///////////////////////////////////////////////////////////////////////////// // Impl for Haystack @@ -93,181 +91,48 @@ impl<'a> Haystack for &'a str { /// Associated type for `>::Searcher`. 
#[derive(Clone, Debug)] -pub struct CharSearcher<'a> { - haystack: &'a str, - // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` - // This invariant can be broken *within* next_match and next_match_back, however - // they must exit with fingers on valid code point boundaries. - /// `finger` is the current byte index of the forward search. - /// Imagine that it exists before the byte at its index, i.e. - /// `haystack[finger]` is the first byte of the slice we must inspect during - /// forward searching - finger: usize, - /// `finger_back` is the current byte index of the reverse search. - /// Imagine that it exists after the byte at its index, i.e. - /// haystack[finger_back - 1] is the last byte of the slice we must inspect during - /// forward searching (and thus the first byte to be inspected when calling next_back()). - finger_back: usize, - /// The character being searched for - needle: char, - - // safety invariant: `utf8_size` must be less than 5 - /// The number of bytes `needle` takes up when encoded in utf8. - utf8_size: usize, - /// A utf8 encoded copy of the `needle` - utf8_encoded: [u8; 4], +pub struct CharSearcher<'a>(str_bytes::CharSearcher<'a>); + +impl<'a> CharSearcher<'a> { + fn new(haystack: &'a str, chr: char) -> Self { + Self(str_bytes::CharSearcher::new(str_bytes::Bytes::from(haystack), chr)) + } } unsafe impl<'a> Searcher<&'a str> for CharSearcher<'a> { #[inline] fn haystack(&self) -> &'a str { - self.haystack + // SAFETY: self.0’s haystack was created from &str thus it is valid + // UTF-8. + unsafe { super::from_utf8_unchecked(self.0.haystack().as_bytes()) } } #[inline] fn next(&mut self) -> SearchStep { - let old_finger = self.finger; - // SAFETY: 1-4 guarantee safety of `get_unchecked` - // 1. `self.finger` and `self.finger_back` are kept on unicode boundaries - // (this is invariant) - // 2. `self.finger >= 0` since it starts at 0 and only increases - // 3. 
`self.finger < self.finger_back` because otherwise the char `iter` - // would return `SearchStep::Done` - // 4. `self.finger` comes before the end of the haystack because `self.finger_back` - // starts at the end and only decreases - let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) }; - let mut iter = slice.chars(); - let old_len = iter.iter.len(); - if let Some(ch) = iter.next() { - // add byte offset of current character - // without re-encoding as utf-8 - self.finger += old_len - iter.iter.len(); - if ch == self.needle { - SearchStep::Match(old_finger, self.finger) - } else { - SearchStep::Reject(old_finger, self.finger) - } - } else { - SearchStep::Done - } + self.0.next() } #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { - loop { - // get the haystack after the last character found - let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?; - // the last byte of the utf8 encoded needle - // SAFETY: we have an invariant that `utf8_size < 5` - let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; - if let Some(index) = memchr::memchr(last_byte, bytes) { - // The new finger is the index of the byte we found, - // plus one, since we memchr'd for the last byte of the character. - // - // Note that this doesn't always give us a finger on a UTF8 boundary. - // If we *didn't* find our character - // we may have indexed to the non-last byte of a 3-byte or 4-byte character. - // We can't just skip to the next valid starting byte because a character like - // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find - // the second byte when searching for the third. - // - // However, this is totally okay. While we have the invariant that - // self.finger is on a UTF8 boundary, this invariant is not relied upon - // within this method (it is relied upon in CharSearcher::next()). 
- // - // We only exit this method when we reach the end of the string, or if we - // find something. When we find something the `finger` will be set - // to a UTF8 boundary. - self.finger += index + 1; - if self.finger >= self.utf8_size { - let found_char = self.finger - self.utf8_size; - if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - return Some((found_char, self.finger)); - } - } - } - } else { - // found nothing, exit - self.finger = self.finger_back; - return None; - } - } + self.0.next_match() + } + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() } - - // let next_reject use the default implementation from the Searcher trait } unsafe impl<'a> ReverseSearcher<&'a str> for CharSearcher<'a> { #[inline] fn next_back(&mut self) -> SearchStep { - let old_finger = self.finger_back; - // SAFETY: see the comment for next() above - let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) }; - let mut iter = slice.chars(); - let old_len = iter.iter.len(); - if let Some(ch) = iter.next_back() { - // subtract byte offset of current character - // without re-encoding as utf-8 - self.finger_back -= old_len - iter.iter.len(); - if ch == self.needle { - SearchStep::Match(self.finger_back, old_finger) - } else { - SearchStep::Reject(self.finger_back, old_finger) - } - } else { - SearchStep::Done - } + self.0.next_back() } #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)> { - let haystack = self.haystack.as_bytes(); - loop { - // get the haystack up to but not including the last character searched - let bytes = haystack.get(self.finger..self.finger_back)?; - // the last byte of the utf8 encoded needle - // SAFETY: we have an invariant that `utf8_size < 5` - let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; - if let Some(index) = memchr::memrchr(last_byte, bytes) { - // we searched a slice 
that was offset by self.finger, - // add self.finger to recoup the original index - let index = self.finger + index; - // memrchr will return the index of the byte we wish to - // find. In case of an ASCII character, this is indeed - // were we wish our new finger to be ("after" the found - // char in the paradigm of reverse iteration). For - // multibyte chars we need to skip down by the number of more - // bytes they have than ASCII - let shift = self.utf8_size - 1; - if index >= shift { - let found_char = index - shift; - if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - // move finger to before the character found (i.e., at its start index) - self.finger_back = found_char; - return Some((self.finger_back, self.finger_back + self.utf8_size)); - } - } - } - // We can't use finger_back = index - size + 1 here. If we found the last char - // of a different-sized character (or the middle byte of a different character) - // we need to bump the finger_back down to `index`. This similarly makes - // `finger_back` have the potential to no longer be on a boundary, - // but this is OK since we only exit this function on a boundary - // or when the haystack has been searched completely. - // - // Unlike next_match this does not - // have the problem of repeated bytes in utf-8 because - // we're searching for the last byte, and we can only have - // found the last byte when searching in reverse. 
- self.finger_back = index; - } else { - self.finger_back = self.finger; - // found nothing, exit - return None; - } - } + self.0.next_match_back() + } + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() } - - // let next_reject_back use the default implementation from the Searcher trait } impl<'a> DoubleEndedSearcher<&'a str> for CharSearcher<'a> {} @@ -278,32 +143,19 @@ impl<'a> DoubleEndedSearcher<&'a str> for CharSearcher<'a> {} /// /// ``` /// assert_eq!("Hello world".find('o'), Some(4)); +/// assert_eq!("Hello world".find('x'), None); /// ``` impl<'a> Pattern<&'a str> for char { type Searcher = CharSearcher<'a>; #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - let mut utf8_encoded = [0; 4]; - let utf8_size = self.encode_utf8(&mut utf8_encoded).len(); - CharSearcher { - haystack, - finger: 0, - finger_back: haystack.len(), - needle: self, - utf8_size, - utf8_encoded, - } + CharSearcher::new(haystack, self) } #[inline] fn is_contained_in(self, haystack: &'a str) -> bool { - if (self as u32) < 128 { - haystack.as_bytes().contains(&(self as u8)) - } else { - let mut buffer = [0u8; 4]; - self.encode_utf8(&mut buffer).is_contained_in(haystack) - } + self.is_contained_in(str_bytes::Bytes::from(haystack)) } #[inline] @@ -313,23 +165,27 @@ impl<'a> Pattern<&'a str> for char { #[inline] fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { - self.encode_utf8(&mut [0u8; 4]).strip_prefix_of(haystack) + self.strip_prefix_of(str_bytes::Bytes::from(haystack)).map(|bytes| { + // SAFETY: Bytes were created from &str and Bytes never splits + // inside of UTF-8 bytes sequences thus `bytes` is still valid + // UTF-8. 
+ unsafe { super::from_utf8_unchecked(bytes.as_bytes()) } + }) } #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool - where - Self::Searcher: ReverseSearcher<&'a str>, - { - self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack) + fn is_suffix_of(self, haystack: &'a str) -> bool { + self.is_suffix_of(str_bytes::Bytes::from(haystack)) } #[inline] - fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> - where - Self::Searcher: ReverseSearcher<&'a str>, - { - self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack) + fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> { + self.strip_suffix_of(str_bytes::Bytes::from(haystack)).map(|bytes| { + // SAFETY: Bytes were created from &str and Bytes never splits + // inside of UTF-8 bytes sequences thus `bytes` is still valid + // UTF-8. + unsafe { super::from_utf8_unchecked(bytes.as_bytes()) } + }) } } @@ -757,609 +613,51 @@ impl<'a, 'b> Pattern<&'a str> for &'b str { #[derive(Clone, Debug)] /// Associated type for `<&str as Pattern<&'a str>>::Searcher`. 
-pub struct StrSearcher<'a, 'b> { - haystack: &'a str, - needle: &'b str, - - searcher: StrSearcherImpl, -} - -#[derive(Clone, Debug)] -enum StrSearcherImpl { - Empty(core::pattern::EmptyNeedleSearcher), - TwoWay(TwoWaySearcher), -} +pub struct StrSearcher<'a, 'b>(crate::str_bytes::StrSearcher<'a, 'b>); impl<'a, 'b> StrSearcher<'a, 'b> { fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { - let searcher = if needle.is_empty() { - StrSearcherImpl::Empty(core::pattern::EmptyNeedleSearcher::new(haystack)) - } else { - StrSearcherImpl::TwoWay(TwoWaySearcher::new(needle.as_bytes(), haystack.len())) - }; - StrSearcher { haystack, needle, searcher } - } - - fn fwd_char(haystack: &str, pos: usize) -> usize { - pos + super::utf8_char_width(haystack.as_bytes()[pos]) - } - - fn bwd_char(haystack: &str, pos: usize) -> usize { - // Note: we are guaranteed to operate on valid UTF-8 thus we will never - // need to go further than four bytes back. - let bytes = haystack.as_bytes(); - if bytes[pos - 1].is_utf8_char_boundary() { - pos - 1 - } else if bytes[pos - 2].is_utf8_char_boundary() { - pos - 2 - } else if bytes[pos - 3].is_utf8_char_boundary() { - pos - 3 - } else { - pos - 4 - } + let haystack = crate::str_bytes::Bytes::from(haystack); + Self(crate::str_bytes::StrSearcher::new(haystack, needle)) } } unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn haystack(&self) -> &'a str { - self.haystack + let bytes = self.0.haystack().as_bytes(); + // SAFETY: self.0.haystack() was created from a &str. 
+ unsafe { crate::str::from_utf8_unchecked(bytes) } } #[inline] fn next(&mut self) -> SearchStep { - match self.searcher { - StrSearcherImpl::Empty(ref mut searcher) => { - searcher.next_fwd(|range| Self::fwd_char(self.haystack, range.start)) - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - // TwoWaySearcher produces valid *Match* indices that split at char boundaries - // as long as it does correct matching and that haystack and needle are - // valid UTF-8 - // *Rejects* from the algorithm can fall on any indices, but we will walk them - // manually to the next character boundary, so that they are utf-8 safe. - if searcher.position == self.haystack.len() { - return SearchStep::Done; - } - let is_long = searcher.memory == usize::MAX; - match searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), is_long) { - SearchStep::Reject(a, mut b) => { - // skip to next char boundary - while !self.haystack.is_char_boundary(b) { - b += 1; - } - searcher.position = cmp::max(b, searcher.position); - SearchStep::Reject(a, b) - } - otherwise => otherwise, - } - } - } + self.0.next() } #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { - match self.searcher { - StrSearcherImpl::Empty(ref mut searcher) => { - searcher - .next_fwd::(|range| Self::fwd_char(self.haystack, range.start)) - .0 - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - let is_long = searcher.memory == usize::MAX; - // write out `true` and `false` cases to encourage the compiler - // to specialize the two cases separately. 
- if is_long { - searcher - .next::(self.haystack.as_bytes(), self.needle.as_bytes(), true) - .0 - } else { - searcher - .next::(self.haystack.as_bytes(), self.needle.as_bytes(), false) - .0 - } - } - } + self.0.next_match() + } + + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() } } unsafe impl<'a, 'b> ReverseSearcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn next_back(&mut self) -> SearchStep { - match self.searcher { - StrSearcherImpl::Empty(ref mut searcher) => { - searcher.next_bwd(|range| Self::bwd_char(self.haystack, range.end)) - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - if searcher.end == 0 { - return SearchStep::Done; - } - let is_long = searcher.memory == usize::MAX; - match searcher.next_back(self.haystack.as_bytes(), self.needle.as_bytes(), is_long) - { - SearchStep::Reject(mut a, b) => { - // skip to next char boundary - while !self.haystack.is_char_boundary(a) { - a -= 1; - } - searcher.end = cmp::min(a, searcher.end); - SearchStep::Reject(a, b) - } - otherwise => otherwise, - } - } - } + self.0.next_back() } #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)> { - match self.searcher { - StrSearcherImpl::Empty(ref mut searcher) => { - searcher - .next_bwd::(|range| Self::bwd_char(self.haystack, range.end)) - .0 - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - let is_long = searcher.memory == usize::MAX; - // write out `true` and `false`, like `next_match` - if is_long { - searcher - .next_back::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - true, - ) - .0 - } else { - searcher - .next_back::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - false, - ) - .0 - } - } - } + self.0.next_match_back() } -} - -/// The internal state of the two-way substring search algorithm. 
-#[derive(Clone, Debug)] -struct TwoWaySearcher { - // constants - /// critical factorization index - crit_pos: usize, - /// critical factorization index for reversed needle - crit_pos_back: usize, - period: usize, - /// `byteset` is an extension (not part of the two way algorithm); - /// it's a 64-bit "fingerprint" where each set bit `j` corresponds - /// to a (byte & 63) == j present in the needle. - byteset: u64, - - // variables - position: usize, - end: usize, - /// index into needle before which we have already matched - memory: usize, - /// index into needle after which we have already matched - memory_back: usize, -} - -/* - This is the Two-Way search algorithm, which was introduced in the paper: - Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. - - Here's some background information. - - A *word* is a string of symbols. The *length* of a word should be a familiar - notion, and here we denote it for any word x by |x|. - (We also allow for the possibility of the *empty word*, a word of length zero). - - If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a - *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. - For example, both 1 and 2 are periods for the string "aa". As another example, - the only period of the string "abcd" is 4. - - We denote by period(x) the *smallest* period of x (provided that x is non-empty). - This is always well-defined since every non-empty word x has at least one period, - |x|. We sometimes call this *the period* of x. - - If u, v and x are words such that x = uv, where uv is the concatenation of u and - v, then we say that (u, v) is a *factorization* of x. - - Let (u, v) be a factorization for a word x. 
Then if w is a non-empty word such - that both of the following hold - - - either w is a suffix of u or u is a suffix of w - - either w is a prefix of v or v is a prefix of w - - then w is said to be a *repetition* for the factorization (u, v). - - Just to unpack this, there are four possibilities here. Let w = "abc". Then we - might have: - - - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") - - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") - - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") - - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") - - Note that the word vu is a repetition for any factorization (u,v) of x = uv, - so every factorization has at least one repetition. - - If x is a string and (u, v) is a factorization for x, then a *local period* for - (u, v) is an integer r such that there is some word w such that |w| = r and w is - a repetition for (u, v). - - We denote by local_period(u, v) the smallest local period of (u, v). We sometimes - call this *the local period* of (u, v). Provided that x = uv is non-empty, this - is well-defined (because each non-empty word has at least one factorization, as - noted above). - - It can be proven that the following is an equivalent definition of a local period - for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for - all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are - defined. (i.e., i > 0 and i + r < |x|). - - Using the above reformulation, it is easy to prove that - - 1 <= local_period(u, v) <= period(uv) - A factorization (u, v) of x such that local_period(u,v) = period(x) is called a - *critical factorization*. - - The algorithm hinges on the following theorem, which is stated without proof: - - **Critical Factorization Theorem** Any word x has at least one critical - factorization (u, v) such that |u| < period(x). - - The purpose of maximal_suffix is to find such a critical factorization. 
- - If the period is short, compute another factorization x = u' v' to use - for reverse search, chosen instead so that |v'| < period(x). - -*/ -impl TwoWaySearcher { - fn new(needle: &[u8], end: usize) -> TwoWaySearcher { - let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); - let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); - - let (crit_pos, period) = if crit_pos_false > crit_pos_true { - (crit_pos_false, period_false) - } else { - (crit_pos_true, period_true) - }; - - // A particularly readable explanation of what's going on here can be found - // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically - // see the code for "Algorithm CP" on p. 323. - // - // What's going on is we have some critical factorization (u, v) of the - // needle, and we want to determine whether u is a suffix of - // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use - // "Algorithm CP2", which is optimized for when the period of the needle - // is large. - if needle[..crit_pos] == needle[period..period + crit_pos] { - // short period case -- the period is exact - // compute a separate critical factorization for the reversed needle - // x = u' v' where |v'| < period(x). - // - // This is sped up by the period being known already. - // Note that a case like x = "acba" may be factored exactly forwards - // (crit_pos = 1, period = 3) while being factored with approximate - // period in reverse (crit_pos = 2, period = 2). We use the given - // reverse factorization but keep the exact period. 
- let crit_pos_back = needle.len() - - cmp::max( - TwoWaySearcher::reverse_maximal_suffix(needle, period, false), - TwoWaySearcher::reverse_maximal_suffix(needle, period, true), - ); - - TwoWaySearcher { - crit_pos, - crit_pos_back, - period, - byteset: Self::byteset_create(&needle[..period]), - - position: 0, - end, - memory: 0, - memory_back: needle.len(), - } - } else { - // long period case -- we have an approximation to the actual period, - // and don't use memorization. - // - // Approximate the period by lower bound max(|u|, |v|) + 1. - // The critical factorization is efficient to use for both forward and - // reverse search. - - TwoWaySearcher { - crit_pos, - crit_pos_back: crit_pos, - period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, - byteset: Self::byteset_create(needle), - - position: 0, - end, - memory: usize::MAX, // Dummy value to signify that the period is long - memory_back: usize::MAX, - } - } - } - - #[inline] - fn byteset_create(bytes: &[u8]) -> u64 { - bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a) - } - - #[inline] - fn byteset_contains(&self, byte: u8) -> bool { - (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 - } - - // One of the main ideas of Two-Way is that we factorize the needle into - // two halves, (u, v), and begin trying to find v in the haystack by scanning - // left to right. If v matches, we try to match u by scanning right to left. - // How far we can jump when we encounter a mismatch is all based on the fact - // that (u, v) is a critical factorization for the needle. - #[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> R { - // `next()` uses `self.position` as its cursor - let old_pos = self.position; - let needle_last = needle.len() - 1; - 'search: loop { - // Check that we have room to search in - // position + needle_last can not overflow if we assume slices - // are bounded by isize's range. 
- let tail_byte = match haystack.get(self.position + needle_last) { - Some(&b) => b, - None => { - self.position = haystack.len(); - return R::rejecting(old_pos, self.position).unwrap_or(R::DONE); - } - }; - - if old_pos != self.position { - if let Some(ret) = R::rejecting(old_pos, self.position) { - return ret; - } - } - - // Quickly skip by large portions unrelated to our substring - if !self.byteset_contains(tail_byte) { - self.position += needle.len(); - if !long_period { - self.memory = 0; - } - continue 'search; - } - - // See if the right part of the needle matches - let start = - if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) }; - for i in start..needle.len() { - if needle[i] != haystack[self.position + i] { - self.position += i - self.crit_pos + 1; - if !long_period { - self.memory = 0; - } - continue 'search; - } - } - - // See if the left part of the needle matches - let start = if long_period { 0 } else { self.memory }; - for i in (start..self.crit_pos).rev() { - if needle[i] != haystack[self.position + i] { - self.position += self.period; - if !long_period { - self.memory = needle.len() - self.period; - } - continue 'search; - } - } - - // We have found a match! - let match_pos = self.position; - - // Note: add self.period instead of needle.len() to have overlapping matches - self.position += needle.len(); - if !long_period { - self.memory = 0; // set to needle.len() - self.period for overlapping matches - } - - return R::matching(match_pos, match_pos + needle.len()).unwrap(); - } - } - - // Follows the ideas in `next()`. - // - // The definitions are symmetrical, with period(x) = period(reverse(x)) - // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) - // is a critical factorization, so is (reverse(v), reverse(u)). - // - // For the reverse case we have computed a critical factorization x = u' v' - // (field `crit_pos_back`). 
We need |u| < period(x) for the forward case and - // thus |v'| < period(x) for the reverse. - // - // To search in reverse through the haystack, we search forward through - // a reversed haystack with a reversed needle, matching first u' and then v'. - #[inline] - fn next_back( - &mut self, - haystack: &[u8], - needle: &[u8], - long_period: bool, - ) -> R { - // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` - // are independent. - let old_end = self.end; - 'search: loop { - // Check that we have room to search in - // end - needle.len() will wrap around when there is no more room, - // but due to slice length limits it can never wrap all the way back - // into the length of haystack. - let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) { - Some(&b) => b, - None => { - self.end = 0; - return R::rejecting(0, old_end).unwrap_or(R::DONE); - } - }; - - if old_end != self.end { - if let Some(ret) = R::rejecting(self.end, old_end) { - return ret; - } - } - - // Quickly skip by large portions unrelated to our substring - if !self.byteset_contains(front_byte) { - self.end -= needle.len(); - if !long_period { - self.memory_back = needle.len(); - } - continue 'search; - } - - // See if the left part of the needle matches - let crit = if long_period { - self.crit_pos_back - } else { - cmp::min(self.crit_pos_back, self.memory_back) - }; - for i in (0..crit).rev() { - if needle[i] != haystack[self.end - needle.len() + i] { - self.end -= self.crit_pos_back - i; - if !long_period { - self.memory_back = needle.len(); - } - continue 'search; - } - } - - // See if the right part of the needle matches - let needle_end = if long_period { needle.len() } else { self.memory_back }; - for i in self.crit_pos_back..needle_end { - if needle[i] != haystack[self.end - needle.len() + i] { - self.end -= self.period; - if !long_period { - self.memory_back = self.period; - } - continue 'search; - } - } - - // We have found a match! 
- let match_pos = self.end - needle.len(); - // Note: sub self.period instead of needle.len() to have overlapping matches - self.end -= needle.len(); - if !long_period { - self.memory_back = needle.len(); - } - - return R::matching(match_pos, match_pos + needle.len()).unwrap(); - } - } - - // Compute the maximal suffix of `arr`. - // - // The maximal suffix is a possible critical factorization (u, v) of `arr`. - // - // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the - // period of v. - // - // `order_greater` determines if lexical order is `<` or `>`. Both - // orders must be computed -- the ordering with the largest `i` gives - // a critical factorization. - // - // For long period cases, the resulting period is not exact (it is too short). - #[inline] - fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) { - let mut left = 0; // Corresponds to i in the paper - let mut right = 1; // Corresponds to j in the paper - let mut offset = 0; // Corresponds to k in the paper, but starting at 0 - // to match 0-based indexing. - let mut period = 1; // Corresponds to p in the paper - - while let Some(&a) = arr.get(right + offset) { - // `left` will be inbounds when `right` is. - let b = arr[left + offset]; - if (a < b && !order_greater) || (a > b && order_greater) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1; - offset = 0; - period = right - left; - } else if a == b { - // Advance through repetition of the current period. - if offset + 1 == period { - right += offset + 1; - offset = 0; - } else { - offset += 1; - } - } else { - // Suffix is larger, start over from current location. - left = right; - right += 1; - offset = 0; - period = 1; - } - } - (left, period) - } - - // Compute the maximal suffix of the reverse of `arr`. - // - // The maximal suffix is a possible critical factorization (u', v') of `arr`. 
- // - // Returns `i` where `i` is the starting index of v', from the back; - // returns immediately when a period of `known_period` is reached. - // - // `order_greater` determines if lexical order is `<` or `>`. Both - // orders must be computed -- the ordering with the largest `i` gives - // a critical factorization. - // - // For long period cases, the resulting period is not exact (it is too short). - fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize { - let mut left = 0; // Corresponds to i in the paper - let mut right = 1; // Corresponds to j in the paper - let mut offset = 0; // Corresponds to k in the paper, but starting at 0 - // to match 0-based indexing. - let mut period = 1; // Corresponds to p in the paper - let n = arr.len(); - - while right + offset < n { - let a = arr[n - (1 + right + offset)]; - let b = arr[n - (1 + left + offset)]; - if (a < b && !order_greater) || (a > b && order_greater) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1; - offset = 0; - period = right - left; - } else if a == b { - // Advance through repetition of the current period. - if offset + 1 == period { - right += offset + 1; - offset = 0; - } else { - offset += 1; - } - } else { - // Suffix is larger, start over from current location. - left = right; - right += 1; - offset = 0; - period = 1; - } - if period == known_period { - break; - } - } - debug_assert!(period <= known_period); - left + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() } } diff --git a/library/core/src/str_bytes.rs b/library/core/src/str_bytes.rs new file mode 100644 index 0000000000000..047faf2005a12 --- /dev/null +++ b/library/core/src/str_bytes.rs @@ -0,0 +1,1390 @@ +//! Module provides pattern matching features for string-like bytes slice. +//! +//! A ‘string-like bytes slice’ means that types and functions here try to +//! 
interpret bytes slices as well-formed WTF-8 but don’t assume it is and treat +//! bytes in invalid portions of the slices as characters for the purpose of +//! deciding where character boundaries lie. This can be demonstrated by how +//! empty pattern is matched (since empty patterns match character boundaries): +//! +//! ``` +//! #![feature(pattern, pattern_internals, str_internals)] +//! use core::pattern::{Pattern, Searcher}; +//! use core::str_bytes::Bytes; +//! +//! let data = ["Żółw".as_bytes(), &b"\xff\xff\xff"[..], "🕴".as_bytes()].concat(); +//! let mut searcher = "".into_searcher(Bytes::from(data.as_slice())); +//! let next = move || searcher.next_match().map(|(x, _)| x); +//! let boundaries = core::iter::from_fn(next).collect::>(); +//! assert_eq!(&[0, 2, 4, 6, 7, 8, 9, 10, 14][..], &boundaries[..]); +//! ``` +#![unstable(feature = "str_internals", issue = "none")] + +use crate::cmp; +use crate::mem::take; +use crate::ops; +use crate::pattern; +use crate::pattern::{Haystack, MatchOnly, RejectOnly, SearchStep, Searcher}; +use crate::str::{try_next_code_point, try_next_code_point_reverse}; + +type OptRange = Option<(usize, usize)>; +type Range = ops::Range; + +//////////////////////////////////////////////////////////////////////////////// +// Bytes wrapper +//////////////////////////////////////////////////////////////////////////////// + +/// A reference to a string-like bytes slice. +/// +/// ‘String-like’ refers to the fact that parts of the data are valid WTF-8 and +/// when we split the slice we don’t want to split well-formed WTF-8 bytes +/// sequences. This is in a sense a generalisation of a `&str` which allows +/// portions of the buffer to be ill-formed while preserving correctness of +/// existing well-formed parts. 
+#[derive(Copy, Clone, Debug)] +pub struct Bytes<'a>(&'a [u8]); + +impl<'a> Bytes<'a> { + pub fn as_bytes(self) -> &'a [u8] { + self.0 + } + + pub fn len(self) -> usize { + self.0.len() + } + + pub fn is_empty(self) -> bool { + self.0.is_empty() + } + + /// Adjusts range’s start position forward so it points at a potential valid + /// WTF-8 byte sequence. + /// + /// `range` represents a possibly invalid range within the bytes; + /// furthermore, `range.start` must be non-zero. This method returns a new + /// start index which is a valid split position. If `range` is already + /// valid, the method simply returns `range.start`. + /// + /// When dealing with ill-formed WTF-8 sequences, this is not guaranteed to + /// advance position byte at a time. If you need to be able to advance + /// position byte at a time use `advance_range_start` instead. + fn adjust_position_fwd(self, range: Range) -> usize { + range.start + + self.as_bytes()[range.clone()] + .iter() + .take_while(|chr| !chr.is_utf8_char_boundary()) + .count() + } + + /// Adjusts position backward so that it points at the closest potential + /// valid WTF-8 sequence. + /// + /// `range` represents a possibly invalid range within the bytes, + /// furthermore `range.end` must be less than bytes’ length. This method + /// returns a new end index which is a valid split position. If `range` is + /// already valid, the method simply returns `range.end`. + /// + /// When dealing with ill-formed WTF-8 sequences, this is not guaranteed to + /// advance position byte at a time. If you need to be able to advance + /// position character at a time use `advance_range_end` instead. + fn adjust_position_bwd(self, range: Range) -> usize { + range.end + - self.as_bytes()[range.start..range.end + 1] + .iter() + .rev() + .take_while(|chr| !chr.is_utf8_char_boundary()) + .count() + } + + /// Given a valid range update its start so it falls on the next character + /// boundary. + /// + /// `range` must be non-empty. 
If it starts with a valid WTF-8 sequence, + /// this method returns position past that sequence. Otherwise, it returns + /// `range.start + 1`. In other words, well-formed WTF-8 byte sequences are + /// skipped in one go while ill-formed sequences are skipped byte-by-byte. + fn advance_range_start(self, range: Range) -> usize { + assert!(!range.is_empty()); + match try_next_code_point(&self.0[range.clone()]) { + Some((_, len)) => range.start + len, + None => range.end.min(range.start + 1), + } + } + + /// Given a valid range update its end so it falls on the previous + /// character boundary. + /// + /// `range` must be non-empty. If it ends with a valid WTF-8 sequence, this + /// method returns position of the start of that sequence. Otherwise, it + /// returns `range.end - 1`. In other words, well-formed WTF-8 byte + /// sequences are skipped in one go while ill-formed sequences are skipped + /// byte-by-byte. + fn advance_range_end(self, range: Range) -> usize { + assert!(!range.is_empty()); + match try_next_code_point_reverse(&self.0[range.clone()]) { + Some((_, len)) => range.end - len, + None => range.end - 1, + } + } + + /// Returns valid UTF-8 character at the front of the slice. + /// + /// If slice doesn’t start with a valid UTF-8 sequence, returns `None`. + /// Otherwise returns decoded character and its UTF-8 encoding’s length. + /// WTF-8 sequences which encode surrogates are considered invalid. + fn get_first_code_point(self) -> Option<(char, usize)> { + try_next_code_point(&self.0) + } + + /// Returns valid UTF-8 character at the end of the slice. + /// + /// If slice doesn’t end with a valid UTF-8 sequence, returns `None`. + /// Otherwise returns decoded character and its UTF-8 encoding’s length. + /// WTF-8 sequences which encode surrogates are considered invalid. + fn get_last_code_point(&self) -> Option<(char, usize)> { + try_next_code_point_reverse(&self.0) + } + + /// Looks for the next UTF-8-encoded character in the slice. 
+ /// + /// WTF-8 sequences which encode surrogates are considered invalid. + /// + /// Returns position of the match, decoded character and UTF-8 length of + /// that character. + fn find_code_point_fwd(self, range: Range) -> Option<(usize, char, usize)> { + let bytes = &self.as_bytes()[range.clone()]; + (0..bytes.len()) + .filter_map(|pos| { + let (chr, len) = try_next_code_point(&bytes[pos..])?; + Some((range.start + pos, chr, len)) + }) + .next() + } + + /// Looks backwards for the next UTF-8 encoded character in the slice. + /// + /// WTF-8 sequences which encode surrogates are considered invalid. + /// + /// Returns position of the match, decoded character and UTF-8 length of + /// that character. + fn find_code_point_bwd(&self, range: Range) -> Option<(usize, char, usize)> { + let bytes = &self.as_bytes()[range.clone()]; + (0..bytes.len()) + .rev() + .filter_map(|pos| { + let (chr, len) = try_next_code_point(&bytes[pos..])?; + Some((range.start + pos, chr, len)) + }) + .next() + } +} + +impl<'a> From<&'a [u8]> for Bytes<'a> { + #[inline] + fn from(val: &'a [u8]) -> Self { + Self(val) + } +} + +impl<'a> From<&'a str> for Bytes<'a> { + #[inline] + fn from(val: &'a str) -> Self { + Self(val.as_bytes()) + } +} + +trait SearchResult: crate::pattern::SearchResult { + /// Adjusts reject’s start position backwards to make sure it doesn’t fall + /// within well-formed WTF-8 sequence. + /// + /// Doesn’t move the start position past `begin`. If position was adjusted, + /// updates `*out` as well. + fn adjust_reject_start_bwd(self, bytes: Bytes<'_>, begin: usize, out: &mut usize) -> Self; + + /// Adjusts reject’s end position forwards to make sure it doesn’t fall + /// within well-formed WTF-8 sequence. + /// + /// Doesn’t move the end position past `len`. If position was adjusted, + /// updates `*out` as well. 
+ fn adjust_reject_end_fwd(self, bytes: Bytes<'_>, len: usize, out: &mut usize) -> Self; +} + +impl SearchResult for SearchStep { + fn adjust_reject_start_bwd(mut self, bytes: Bytes<'_>, begin: usize, out: &mut usize) -> Self { + if let SearchStep::Reject(ref mut start, _) = self { + *start = bytes.adjust_position_bwd(begin..*start); + *out = *start; + } + self + } + fn adjust_reject_end_fwd(mut self, bytes: Bytes<'_>, len: usize, out: &mut usize) -> Self { + if let SearchStep::Reject(_, ref mut end) = self { + *end = bytes.adjust_position_fwd(*end..len); + *out = *end; + } + self + } +} + +impl SearchResult for MatchOnly { + fn adjust_reject_start_bwd(self, _bytes: Bytes<'_>, _begin: usize, _out: &mut usize) -> Self { + self + } + fn adjust_reject_end_fwd(self, _bytes: Bytes<'_>, _end: usize, _out: &mut usize) -> Self { + self + } +} + +impl SearchResult for RejectOnly { + fn adjust_reject_start_bwd(mut self, bytes: Bytes<'_>, begin: usize, out: &mut usize) -> Self { + if let RejectOnly(Some((ref mut start, _))) = self { + *start = bytes.adjust_position_bwd(begin..*start); + *out = *start; + } + self + } + fn adjust_reject_end_fwd(mut self, bytes: Bytes<'_>, len: usize, out: &mut usize) -> Self { + if let RejectOnly(Some((_, ref mut end))) = self { + *end = bytes.adjust_position_fwd(*end..len); + *out = *end; + } + self + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Impl for Haystack +//////////////////////////////////////////////////////////////////////////////// + +impl Haystack for Bytes<'_> { + type Cursor = usize; + + fn cursor_at_front(self) -> Self::Cursor { + 0 + } + fn cursor_at_back(self) -> Self::Cursor { + self.0.len() + } + fn is_empty(self) -> bool { + self.0.is_empty() + } + unsafe fn get_unchecked(self, range: Range) -> Self { + Self(if cfg!(debug_assertions) { + self.0.get(range).unwrap() + } else { + // SAFETY: Caller promises cursor is a valid split position. 
+ unsafe { self.0.get_unchecked(range) } + }) + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Impl Pattern for char +//////////////////////////////////////////////////////////////////////////////// + +impl<'hs> pattern::Pattern> for char { + type Searcher = CharSearcher<'hs>; + + fn into_searcher(self, haystack: Bytes<'hs>) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_contained_in(self, haystack: Bytes<'hs>) -> bool { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).is_contained_in(haystack) + } + + fn is_prefix_of(self, haystack: Bytes<'hs>) -> bool { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).is_prefix_of(haystack) + } + fn strip_prefix_of(self, haystack: Bytes<'hs>) -> Option> { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).strip_prefix_of(haystack) + } + + fn is_suffix_of(self, haystack: Bytes<'hs>) -> bool { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).is_suffix_of(haystack) + } + fn strip_suffix_of(self, haystack: Bytes<'hs>) -> Option> { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).strip_suffix_of(haystack) + } +} + +/// Like `chr.encode_utf8(&mut buf)` but casts result to `&str`. +/// +/// This is useful because we have Pattern impl for &str but not for &mut str. +fn encode_utf8(chr: char, buf: &mut [u8; 4]) -> &str { + chr.encode_utf8(buf) +} + +#[derive(Clone, Debug)] +pub struct CharSearcher<'hs> { + haystack: Bytes<'hs>, + state: CharSearcherState, +} + +#[derive(Clone, Debug)] +struct CharSearcherState { + /// Not yet processed range of the haystack. + range: crate::ops::Range, + /// Needle the searcher is looking for within the haystack. + needle: CharBuffer, + /// If `true` and `range` is non-empty, `haystack[range]` starts with the + /// needle. + is_match_fwd: bool, + /// If `true` and `range` is non-empty, `haystack[range]` ends with the + /// needle. 
+ is_match_bwd: bool, +} + +impl<'hs> CharSearcher<'hs> { + #[inline] + pub fn new(haystack: Bytes<'hs>, chr: char) -> Self { + Self { haystack, state: CharSearcherState::new(haystack.len(), chr) } + } +} + +unsafe impl<'hs> pattern::Searcher> for CharSearcher<'hs> { + fn haystack(&self) -> Bytes<'hs> { + self.haystack + } + + fn next(&mut self) -> SearchStep { + self.state.next_fwd(self.haystack) + } + fn next_match(&mut self) -> OptRange { + self.state.next_fwd::(self.haystack).0 + } + fn next_reject(&mut self) -> OptRange { + self.state.next_fwd::(self.haystack).0 + } +} + +unsafe impl<'hs> pattern::ReverseSearcher> for CharSearcher<'hs> { + fn next_back(&mut self) -> SearchStep { + self.state.next_bwd(self.haystack) + } + fn next_match_back(&mut self) -> OptRange { + self.state.next_bwd::(self.haystack).0 + } + fn next_reject_back(&mut self) -> OptRange { + self.state.next_bwd::(self.haystack).0 + } +} + +impl<'hs> pattern::DoubleEndedSearcher> for CharSearcher<'hs> {} + +impl CharSearcherState { + fn new(haystack_len: usize, chr: char) -> Self { + Self { + range: 0..haystack_len, + needle: CharBuffer::new(chr), + is_match_fwd: false, + is_match_bwd: false, + } + } + + fn find_match_fwd(&mut self, haystack: Bytes<'_>) -> OptRange { + let start = if take(&mut self.is_match_fwd) { + (!self.range.is_empty()).then_some(self.range.start) + } else { + // SAFETY: self.range is valid range of haystack. + let bytes = unsafe { haystack.get_unchecked(self.range.clone()) }; + // SAFETY: self.needle encodes a single character. + unsafe { naive::find_match_fwd(bytes.as_bytes(), self.needle.as_str()) } + .map(|pos| pos + self.range.start) + }?; + Some((start, start + self.needle.len())) + } + + fn next_reject_fwd(&mut self, haystack: Bytes<'_>) -> OptRange { + if take(&mut self.is_match_fwd) { + if self.range.is_empty() { + return None; + } + self.range.start += self.needle.len() + } + // SAFETY: self.range is valid range of haystack. 
+ let bytes = unsafe { haystack.get_unchecked(self.range.clone()) }; + if let Some(pos) = naive::find_reject_fwd(bytes.as_bytes(), self.needle.as_str()) { + let pos = pos + self.range.start; + let end = haystack.advance_range_start(pos..self.range.end); + self.range.start = end; + Some((pos, end)) + } else { + self.range.start = self.range.end; + None + } + } + + fn next_fwd(&mut self, haystack: Bytes<'_>) -> R { + if R::USE_EARLY_REJECT { + match self.next_reject_fwd(haystack) { + Some((start, end)) => R::rejecting(start, end).unwrap(), + None => R::DONE, + } + } else if let Some((start, end)) = self.find_match_fwd(haystack) { + if self.range.start < start { + if let Some(res) = R::rejecting(self.range.start, start) { + self.range.start = start; + self.is_match_fwd = true; + return res; + } + } + self.range.start = end; + R::matching(start, end).unwrap() + } else if self.range.is_empty() { + R::DONE + } else { + let start = self.range.start; + self.range.start = self.range.end; + R::rejecting(start, self.range.end).unwrap_or(R::DONE) + } + } + + fn find_match_bwd(&mut self, haystack: Bytes<'_>) -> OptRange { + let start = if take(&mut self.is_match_bwd) { + (!self.range.is_empty()).then(|| self.range.end - self.needle.len()) + } else { + // SAFETY: self.range is valid range of haystack. + let bytes = unsafe { haystack.get_unchecked(self.range.clone()) }; + // SAFETY: self.needle encodes a single character. + unsafe { naive::find_match_bwd(bytes.as_bytes(), self.needle.as_str()) } + .map(|pos| pos + self.range.start) + }?; + Some((start, start + self.needle.len())) + } + + fn next_reject_bwd(&mut self, haystack: Bytes<'_>) -> OptRange { + if take(&mut self.is_match_bwd) { + if self.range.is_empty() { + return None; + } + self.range.end -= self.needle.len(); + } + // SAFETY: self.range is valid range of haystack. 
+ let bytes = unsafe { haystack.get_unchecked(self.range.clone()) }; + if let Some(end) = naive::find_reject_bwd(bytes.as_bytes(), self.needle.as_str()) { + let end = end + self.range.start; + let start = haystack.advance_range_end(self.range.start..end); + self.range.end = start; + Some((start, end)) + } else { + self.range.end = self.range.start; + None + } + } + + fn next_bwd(&mut self, haystack: Bytes<'_>) -> R { + if R::USE_EARLY_REJECT { + match self.next_reject_bwd(haystack) { + Some((start, end)) => R::rejecting(start, end).unwrap(), + None => R::DONE, + } + } else if let Some((start, end)) = self.find_match_bwd(haystack) { + if end < self.range.end { + if let Some(res) = R::rejecting(end, self.range.end) { + self.range.end = end; + self.is_match_bwd = true; + return res; + } + } + self.range.end = start; + R::matching(start, end).unwrap() + } else if self.range.is_empty() { + R::DONE + } else { + let end = self.range.end; + self.range.end = self.range.start; + R::rejecting(self.range.start, end).unwrap_or(R::DONE) + } + } +} + +#[derive(Clone, Debug)] +struct CharBuffer([u8; 4], crate::num::NonZeroU8); + +impl CharBuffer { + fn new(chr: char) -> Self { + let mut buf = [0; 4]; + let len = chr.encode_utf8(&mut buf).len(); + // SAFETY: `len` is length of a single character UTF-8 sequence. + let len = unsafe { crate::num::NonZeroU8::new_unchecked(len as u8) }; + Self(buf, len) + } + + fn len(&self) -> usize { + usize::from(self.1.get()) + } + + fn as_str(&self) -> &str { + // SAFETY: `self.0` is UTF-8 encoding of a single character and `self.1` + // is its length. See `new` constructor. + unsafe { crate::str::from_utf8_unchecked(self.0.get_unchecked(..self.len())) } + } +} + +mod naive { + use crate::slice::memchr; + + /// Looks forwards for the next position of needle within haystack. + /// + /// Safety: `needle` must consist of a single character. 
+ pub(super) unsafe fn find_match_fwd(haystack: &[u8], needle: &str) -> Option { + debug_assert!(!needle.is_empty()); + // SAFETY: Caller promises needle is non-empty. + let (&last_byte, head) = unsafe { needle.as_bytes().split_last().unwrap_unchecked() }; + let mut start = 0; + while haystack.len() - start > head.len() { + // SAFETY: + // 1. `start` is initialised to `self.start` and only ever increased + // thus `self.start ≤ start`. + // 2. We've checked `start + head.len() < haystack.len()`. + let bytes = unsafe { haystack.get_unchecked(start + head.len()..) }; + if let Some(index) = memchr::memchr(last_byte, bytes) { + // `start + index + head.len()` is the index of the last byte + // thus `start + index` is the index of the first byte. + let pos = start + index; + // SAFETY: Since we’ve started our search with head.len() + // offset, we know we have at least head.len() bytes in buffer. + if unsafe { haystack.get_unchecked(pos..pos + head.len()) } == head { + return Some(pos); + } + start += index + 1; + } else { + break; + } + } + None + } + + /// Looks backwards for the next position of needle within haystack. + /// + /// Safety: `needle` must consist of a single character. + pub(super) unsafe fn find_match_bwd(haystack: &[u8], needle: &str) -> Option { + // SAFETY: Caller promises needle is non-empty. + let (&first_byte, tail) = unsafe { needle.as_bytes().split_first().unwrap_unchecked() }; + let mut end = haystack.len(); + while end > tail.len() { + // SAFETY: + // 1. `end` is initialised to `haystack.len()` and only ever + // decreased thus `end ≤ haystack.len()`. + // 2. We've checked `end > tail.len()`. + let bytes = unsafe { haystack.get_unchecked(..end - tail.len()) }; + if let Some(pos) = memchr::memrchr(first_byte, bytes) { + // SAFETY: Since we’ve stopped our search with tail.len() + // offset, we know we have at least tail.len() bytes in buffer + // after position of the byte we’ve found. 
+ if unsafe { haystack.get_unchecked(pos + 1..pos + 1 + tail.len()) } == tail { + return Some(pos); + } + end = pos; + } else { + break; + } + } + None + } + + /// Looks forwards for the next position where needle stops matching. + /// + /// Returns start of the next reject or `None` if there is no reject. + pub(super) fn find_reject_fwd(haystack: &[u8], needle: &str) -> Option { + let count = + haystack.chunks(needle.len()).take_while(|&slice| slice == needle.as_bytes()).count(); + let start = count * needle.len(); + (start < haystack.len()).then_some(start) + } + + /// Looks backwards for the next position where needle stops matching. + /// + /// Returns end of the next reject or `None` if there is no reject. + pub(super) fn find_reject_bwd(haystack: &[u8], needle: &str) -> Option { + debug_assert!(!needle.is_empty()); + let count = + haystack.rchunks(needle.len()).take_while(|&slice| slice == needle.as_bytes()).count(); + let end = haystack.len() - count * needle.len(); + (end > 0).then_some(end) + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Impl Pattern for FnMut(char) and FnMut(Result) +//////////////////////////////////////////////////////////////////////////////// + +impl<'hs, F: FnMut(char) -> bool> pattern::Pattern> for F { + type Searcher = PredicateSearcher<'hs, F>; + + fn into_searcher(self, haystack: Bytes<'hs>) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_prefix_of(mut self, haystack: Bytes<'hs>) -> bool { + haystack.get_first_code_point().map_or(false, |(chr, _)| self(chr)) + } + fn strip_prefix_of(mut self, haystack: Bytes<'hs>) -> Option> { + let (chr, len) = haystack.get_first_code_point()?; + // SAFETY: We’ve just checked slice starts with len-byte long + // well-formed sequence. 
+ self(chr).then(|| unsafe { haystack.get_unchecked(len..haystack.len()) }) + } + + fn is_suffix_of(mut self, haystack: Bytes<'hs>) -> bool { + haystack.get_last_code_point().map_or(false, |(chr, _)| self(chr)) + } + fn strip_suffix_of(mut self, haystack: Bytes<'hs>) -> Option> { + let (chr, len) = haystack.get_last_code_point()?; + let len = haystack.len() - len; + // SAFETY: We’ve just checked slice ends with len-byte long well-formed + // sequence. + self(chr).then(|| unsafe { haystack.get_unchecked(0..len) }) + } +} + +#[derive(Clone, Debug)] +pub struct PredicateSearcher<'hs, F> { + haystack: Bytes<'hs>, + pred: F, + start: usize, + end: usize, + fwd_match_len: u8, + bwd_match_len: u8, +} + +impl<'hs, F> PredicateSearcher<'hs, F> { + fn new(haystack: Bytes<'hs>, pred: F) -> Self { + Self { haystack, pred, start: 0, end: haystack.len(), fwd_match_len: 0, bwd_match_len: 0 } + } +} + +impl<'hs, F: FnMut(char) -> bool> PredicateSearcher<'hs, F> { + fn find_match_fwd(&mut self) -> Option<(usize, usize)> { + let mut start = self.start; + while start < self.end { + let (idx, chr, len) = self.haystack.find_code_point_fwd(start..self.end)?; + if (self.pred)(chr) { + return Some((idx, len)); + } + start = idx + len; + } + None + } + + fn find_match_bwd(&mut self) -> Option<(usize, usize)> { + let mut end = self.end; + while self.start < end { + let (idx, chr, len) = self.haystack.find_code_point_bwd(self.start..end)?; + if (self.pred)(chr) { + return Some((idx, len)); + } + end = idx; + } + None + } + + fn next_fwd(&mut self) -> R { + while self.start < self.end { + if self.fwd_match_len == 0 { + let (pos, len) = self.find_match_fwd().unwrap_or((self.end, 0)); + self.fwd_match_len = len as u8; + if pos != self.start { + let start = self.start; + self.start = pos; + if let Some(ret) = R::rejecting(start, pos) { + return ret; + } else if pos >= self.end { + break; + } + } + } + + let pos = self.start; + self.start += usize::from(take(&mut self.fwd_match_len)); + if let 
Some(ret) = R::matching(pos, self.start) { + return ret; + } + } + R::DONE + } + + fn next_bwd(&mut self) -> R { + while self.start < self.end { + if self.bwd_match_len == 0 { + let (pos, len) = self.find_match_bwd().unwrap_or((self.start, 0)); + self.bwd_match_len = len as u8; + let pos = pos + len; + let end = self.end; + if pos != self.end { + self.end = pos; + if let Some(ret) = R::rejecting(pos, end) { + return ret; + } else if self.start >= self.end { + break; + } + } + } + + let end = self.end; + self.end -= usize::from(take(&mut self.bwd_match_len)); + if let Some(ret) = R::matching(self.end, end) { + return ret; + } + } + R::DONE + } +} + +unsafe impl<'hs, F: FnMut(char) -> bool> Searcher> for PredicateSearcher<'hs, F> { + fn haystack(&self) -> Bytes<'hs> { + self.haystack + } + fn next(&mut self) -> SearchStep { + self.next_fwd() + } + fn next_match(&mut self) -> OptRange { + self.next_fwd::().0 + } + fn next_reject(&mut self) -> OptRange { + self.next_fwd::().0 + } +} + +unsafe impl<'hs, F: FnMut(char) -> bool> pattern::ReverseSearcher> + for PredicateSearcher<'hs, F> +{ + fn next_back(&mut self) -> SearchStep { + self.next_bwd() + } + fn next_match_back(&mut self) -> OptRange { + self.next_bwd::().0 + } + fn next_reject_back(&mut self) -> OptRange { + self.next_bwd::().0 + } +} + +impl<'hs, F: FnMut(char) -> bool> pattern::DoubleEndedSearcher> + for PredicateSearcher<'hs, F> +{ +} + +//////////////////////////////////////////////////////////////////////////////// +// Impl Pattern for &str +//////////////////////////////////////////////////////////////////////////////// + +impl<'hs, 'p> pattern::Pattern> for &'p str { + type Searcher = StrSearcher<'hs, 'p>; + + fn into_searcher(self, haystack: Bytes<'hs>) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_prefix_of(self, haystack: Bytes<'hs>) -> bool { + haystack.as_bytes().starts_with(self.as_bytes()) + } + fn strip_prefix_of(self, haystack: Bytes<'hs>) -> Option> { + 
haystack.as_bytes().strip_prefix(self.as_bytes()).map(Bytes) + } + + fn is_suffix_of(self, haystack: Bytes<'hs>) -> bool { + haystack.as_bytes().ends_with(self.as_bytes()) + } + fn strip_suffix_of(self, haystack: Bytes<'hs>) -> Option> { + haystack.as_bytes().strip_suffix(self.as_bytes()).map(Bytes) + } +} + +#[derive(Clone, Debug)] +pub struct StrSearcher<'hs, 'p> { + haystack: Bytes<'hs>, + state: StrSearcherInner<'p>, +} + +impl<'hs, 'p> StrSearcher<'hs, 'p> { + pub fn new(haystack: Bytes<'hs>, needle: &'p str) -> Self { + let state = StrSearcherInner::new(haystack, needle); + Self { haystack, state } + } +} + +unsafe impl<'hs, 'p> Searcher> for StrSearcher<'hs, 'p> { + fn haystack(&self) -> Bytes<'hs> { + self.haystack + } + fn next(&mut self) -> SearchStep { + self.state.next_fwd(self.haystack) + } + fn next_match(&mut self) -> OptRange { + self.state.next_fwd::(self.haystack).0 + } + fn next_reject(&mut self) -> OptRange { + self.state.next_fwd::(self.haystack).0 + } +} + +unsafe impl<'hs, 'p> pattern::ReverseSearcher> for StrSearcher<'hs, 'p> { + fn next_back(&mut self) -> SearchStep { + self.state.next_bwd(self.haystack) + } + fn next_match_back(&mut self) -> OptRange { + self.state.next_bwd::(self.haystack).0 + } + fn next_reject_back(&mut self) -> OptRange { + self.state.next_bwd::(self.haystack).0 + } +} + +#[derive(Clone, Debug)] +enum StrSearcherInner<'p> { + Empty(EmptySearcherState), + Char(CharSearcherState), + Str(StrSearcherState<'p>), +} + +impl<'p> StrSearcherInner<'p> { + fn new(haystack: Bytes<'_>, needle: &'p str) -> Self { + let mut chars = needle.chars(); + let chr = match chars.next() { + Some(chr) => chr, + None => return Self::Empty(EmptySearcherState::new(haystack)), + }; + if chars.next().is_none() { + Self::Char(CharSearcherState::new(haystack.len(), chr)) + } else { + Self::Str(StrSearcherState::new(haystack, needle)) + } + } + + fn next_fwd(&mut self, haystack: Bytes<'_>) -> R { + match self { + Self::Empty(state) => 
state.next_fwd::(haystack), + Self::Char(state) => state.next_fwd::(haystack), + Self::Str(state) => state.next_fwd::(haystack), + } + } + + fn next_bwd(&mut self, haystack: Bytes<'_>) -> R { + match self { + Self::Empty(state) => state.next_bwd::(haystack), + Self::Char(state) => state.next_bwd::(haystack), + Self::Str(state) => state.next_bwd::(haystack), + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Empty needle searching +//////////////////////////////////////////////////////////////////////////////// + +/// Empty needle rejects every character and matches every character boundary. +/// +/// A character is either a well-formed WTF-8 bytes sequence or a single byte +/// whichever is longer. +#[derive(Clone, Debug)] +struct EmptySearcherState(pattern::EmptyNeedleSearcher); + +impl EmptySearcherState { + fn new(haystack: Bytes<'_>) -> Self { + Self(pattern::EmptyNeedleSearcher::new(haystack)) + } + + fn next_fwd(&mut self, bytes: Bytes<'_>) -> R { + self.0.next_fwd(|range| bytes.advance_range_start(range)) + } + + fn next_bwd(&mut self, bytes: Bytes<'_>) -> R { + self.0.next_bwd(|range| bytes.advance_range_end(range)) + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Full substring search +//////////////////////////////////////////////////////////////////////////////// + +/// A substring search. 
+#[derive(Clone, Debug)] +struct StrSearcherState<'p> { + needle: &'p str, + searcher: TwoWaySearcher, +} + +impl<'p> StrSearcherState<'p> { + fn new(haystack: Bytes<'_>, needle: &'p str) -> Self { + let searcher = TwoWaySearcher::new(haystack.len(), needle.as_bytes()); + Self { needle, searcher } + } + + fn next_fwd(&mut self, bytes: Bytes<'_>) -> R { + if self.searcher.position >= bytes.len() { + return R::DONE; + } + if self.searcher.memory == usize::MAX { + self.searcher.next_fwd::(bytes.0, self.needle.as_bytes(), true) + } else { + self.searcher.next_fwd::(bytes.0, self.needle.as_bytes(), false) + } + .adjust_reject_end_fwd(bytes, bytes.len(), &mut self.searcher.position) + } + + fn next_bwd(&mut self, bytes: Bytes<'_>) -> R { + if self.searcher.end == 0 { + return R::DONE; + } + if self.searcher.memory == usize::MAX { + self.searcher.next_bwd::(bytes.0, self.needle.as_bytes(), true) + } else { + self.searcher.next_bwd::(bytes.0, self.needle.as_bytes(), false) + } + .adjust_reject_start_bwd(bytes, 0, &mut self.searcher.end) + } +} + +/// The internal state of the two-way substring search algorithm. +#[derive(Clone, Debug)] +struct TwoWaySearcher { + // constants + /// critical factorization index + crit_pos: usize, + /// critical factorization index for reversed needle + crit_pos_back: usize, + period: usize, + /// `byteset` is an extension (not part of the two way algorithm); + /// it's a 64-bit "fingerprint" where each set bit `j` corresponds + /// to a (byte & 63) == j present in the needle. + byteset: u64, + + // variables + position: usize, + end: usize, + /// index into needle before which we have already matched + memory: usize, + /// index into needle after which we have already matched + memory_back: usize, +} + +/* + This is the Two-Way search algorithm, which was introduced in the paper: + Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. + + Here's some background information. 
+ + A *word* is a string of symbols. The *length* of a word should be a familiar + notion, and here we denote it for any word x by |x|. + (We also allow for the possibility of the *empty word*, a word of length zero). + + If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a + *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. + For example, both 1 and 2 are periods for the string "aa". As another example, + the only period of the string "abcd" is 4. + + We denote by period(x) the *smallest* period of x (provided that x is non-empty). + This is always well-defined since every non-empty word x has at least one period, + |x|. We sometimes call this *the period* of x. + + If u, v and x are words such that x = uv, where uv is the concatenation of u and + v, then we say that (u, v) is a *factorization* of x. + + Let (u, v) be a factorization for a word x. Then if w is a non-empty word such + that both of the following hold + + - either w is a suffix of u or u is a suffix of w + - either w is a prefix of v or v is a prefix of w + + then w is said to be a *repetition* for the factorization (u, v). + + Just to unpack this, there are four possibilities here. Let w = "abc". Then we + might have: + + - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") + - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") + - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") + - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") + + Note that the word vu is a repetition for any factorization (u,v) of x = uv, + so every factorization has at least one repetition. + + If x is a string and (u, v) is a factorization for x, then a *local period* for + (u, v) is an integer r such that there is some word w such that |w| = r and w is + a repetition for (u, v). + + We denote by local_period(u, v) the smallest local period of (u, v). We sometimes + call this *the local period* of (u, v). 
Provided that x = uv is non-empty, this + is well-defined (because every factorization has at least one repetition, as + noted above). + + It can be proven that the following is an equivalent definition of a local period + for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for + all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are + defined. (i.e., i >= 0 and i + r < |x|). + + Using the above reformulation, it is easy to prove that + + 1 <= local_period(u, v) <= period(uv) + + A factorization (u, v) of x such that local_period(u,v) = period(x) is called a + *critical factorization*. + + The algorithm hinges on the following theorem, which is stated without proof: + + **Critical Factorization Theorem** Any word x has at least one critical + factorization (u, v) such that |u| < period(x). + + The purpose of maximal_suffix is to find such a critical factorization. + + If the period is short, compute another factorization x = u' v' to use + for reverse search, chosen instead so that |v'| < period(x). + +*/ +impl TwoWaySearcher { + fn new(haystack_len: usize, needle: &[u8]) -> TwoWaySearcher { + let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); + let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); + + let (crit_pos, period) = if crit_pos_false > crit_pos_true { + (crit_pos_false, period_false) + } else { + (crit_pos_true, period_true) + }; + + // A particularly readable explanation of what's going on here can be found + // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically + // see the code for "Algorithm CP" on p. 323. + // + // What's going on is we have some critical factorization (u, v) of the + // needle, and we want to determine whether u is a suffix of + // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use + // "Algorithm CP2", which is optimized for when the period of the needle + // is large.
+ if needle[..crit_pos] == needle[period..period + crit_pos] { + // short period case -- the period is exact + // compute a separate critical factorization for the reversed needle + // x = u' v' where |v'| < period(x). + // + // This is sped up by the period being known already. + // Note that a case like x = "acba" may be factored exactly forwards + // (crit_pos = 1, period = 3) while being factored with approximate + // period in reverse (crit_pos = 2, period = 2). We use the given + // reverse factorization but keep the exact period. + let crit_pos_back = needle.len() + - cmp::max( + TwoWaySearcher::reverse_maximal_suffix(needle, period, false), + TwoWaySearcher::reverse_maximal_suffix(needle, period, true), + ); + + TwoWaySearcher { + crit_pos, + crit_pos_back, + period, + byteset: Self::byteset_create(&needle[..period]), + + position: 0, + end: haystack_len, + memory: 0, + memory_back: needle.len(), + } + } else { + // long period case -- we have an approximation to the actual period, + // and don't use memorization. + // + // Approximate the period by lower bound max(|u|, |v|) + 1. + // The critical factorization is efficient to use for both forward and + // reverse search. + + TwoWaySearcher { + crit_pos, + crit_pos_back: crit_pos, + period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, + byteset: Self::byteset_create(needle), + + position: 0, + end: haystack_len, + memory: usize::MAX, // Dummy value to signify that the period is long + memory_back: usize::MAX, + } + } + } + + #[inline] + fn byteset_create(bytes: &[u8]) -> u64 { + bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a) + } + + #[inline] + fn byteset_contains(&self, byte: u8) -> bool { + (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 + } + + // One of the main ideas of Two-Way is that we factorize the needle into + // two halves, (u, v), and begin trying to find v in the haystack by scanning + // left to right. If v matches, we try to match u by scanning right to left. 
+ // How far we can jump when we encounter a mismatch is all based on the fact + // that (u, v) is a critical factorization for the needle. + #[inline] + fn next_fwd( + &mut self, + haystack: &[u8], + needle: &[u8], + long_period: bool, + ) -> R { + // `next()` uses `self.position` as its cursor + let old_pos = self.position; + let needle_last = needle.len() - 1; + 'search: loop { + // Check that we have room to search in + // position + needle_last can not overflow if we assume slices + // are bounded by isize's range. + let tail_byte = match haystack.get(self.position + needle_last) { + Some(&b) => b, + None => { + self.position = haystack.len(); + return R::rejecting(old_pos, self.position).unwrap_or(R::DONE); + } + }; + + if old_pos != self.position { + if let Some(ret) = R::rejecting(old_pos, self.position) { + return ret; + } + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(tail_byte) { + self.position += needle.len(); + if !long_period { + self.memory = 0; + } + continue 'search; + } + + // See if the right part of the needle matches + let start = + if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) }; + for i in start..needle.len() { + if needle[i] != haystack[self.position + i] { + self.position += i - self.crit_pos + 1; + if !long_period { + self.memory = 0; + } + continue 'search; + } + } + + // See if the left part of the needle matches + let start = if long_period { 0 } else { self.memory }; + for i in (start..self.crit_pos).rev() { + if needle[i] != haystack[self.position + i] { + self.position += self.period; + if !long_period { + self.memory = needle.len() - self.period; + } + continue 'search; + } + } + + // We have found a match! 
+ let match_pos = self.position; + + // Note: add self.period instead of needle.len() to have overlapping matches + self.position += needle.len(); + if !long_period { + self.memory = 0; // set to needle.len() - self.period for overlapping matches + } + + if let Some(ret) = R::matching(match_pos, match_pos + needle.len()) { + return ret; + } + } + } + + // Follows the ideas in `next()`. + // + // The definitions are symmetrical, with period(x) = period(reverse(x)) + // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) + // is a critical factorization, so is (reverse(v), reverse(u)). + // + // For the reverse case we have computed a critical factorization x = u' v' + // (field `crit_pos_back`). We need |u| < period(x) for the forward case and + // thus |v'| < period(x) for the reverse. + // + // To search in reverse through the haystack, we search forward through + // a reversed haystack with a reversed needle, matching first u' and then v'. + #[inline] + fn next_bwd( + &mut self, + haystack: &[u8], + needle: &[u8], + long_period: bool, + ) -> R { + // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` + // are independent. + let old_end = self.end; + 'search: loop { + // Check that we have room to search in + // end - needle.len() will wrap around when there is no more room, + // but due to slice length limits it can never wrap all the way back + // into the length of haystack. 
+ let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) { + Some(&b) => b, + None => { + self.end = 0; + return R::rejecting(0, old_end).unwrap_or(R::DONE); + } + }; + + if old_end != self.end { + if let Some(ret) = R::rejecting(self.end, old_end) { + return ret; + } + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(front_byte) { + self.end -= needle.len(); + if !long_period { + self.memory_back = needle.len(); + } + continue 'search; + } + + // See if the left part of the needle matches + let crit = if long_period { + self.crit_pos_back + } else { + cmp::min(self.crit_pos_back, self.memory_back) + }; + for i in (0..crit).rev() { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.crit_pos_back - i; + if !long_period { + self.memory_back = needle.len(); + } + continue 'search; + } + } + + // See if the right part of the needle matches + let needle_end = if long_period { needle.len() } else { self.memory_back }; + for i in self.crit_pos_back..needle_end { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.period; + if !long_period { + self.memory_back = self.period; + } + continue 'search; + } + } + + // We have found a match! + let match_pos = self.end - needle.len(); + // Note: sub self.period instead of needle.len() to have overlapping matches + self.end -= needle.len(); + if !long_period { + self.memory_back = needle.len(); + } + + if let Some(ret) = R::matching(match_pos, match_pos + needle.len()) { + return ret; + } + } + } + + // Compute the maximal suffix of `arr`. + // + // The maximal suffix is a possible critical factorization (u, v) of `arr`. + // + // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the + // period of v. + // + // `order_greater` determines if lexical order is `<` or `>`. Both + // orders must be computed -- the ordering with the largest `i` gives + // a critical factorization. 
+ // + // For long period cases, the resulting period is not exact (it is too short). + #[inline] + fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) { + let mut left = 0; // Corresponds to i in the paper + let mut right = 1; // Corresponds to j in the paper + let mut offset = 0; // Corresponds to k in the paper, but starting at 0 + // to match 0-based indexing. + let mut period = 1; // Corresponds to p in the paper + + while let Some(&a) = arr.get(right + offset) { + // `left` will be inbounds when `right` is. + let b = arr[left + offset]; + if (a < b && !order_greater) || (a > b && order_greater) { + // Suffix is smaller, period is entire prefix so far. + right += offset + 1; + offset = 0; + period = right - left; + } else if a == b { + // Advance through repetition of the current period. + if offset + 1 == period { + right += offset + 1; + offset = 0; + } else { + offset += 1; + } + } else { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 0; + period = 1; + } + } + (left, period) + } + + // Compute the maximal suffix of the reverse of `arr`. + // + // The maximal suffix is a possible critical factorization (u', v') of `arr`. + // + // Returns `i` where `i` is the starting index of v', from the back; + // returns immediately when a period of `known_period` is reached. + // + // `order_greater` determines if lexical order is `<` or `>`. Both + // orders must be computed -- the ordering with the largest `i` gives + // a critical factorization. + // + // For long period cases, the resulting period is not exact (it is too short). + fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize { + let mut left = 0; // Corresponds to i in the paper + let mut right = 1; // Corresponds to j in the paper + let mut offset = 0; // Corresponds to k in the paper, but starting at 0 + // to match 0-based indexing. 
+ let mut period = 1; // Corresponds to p in the paper + let n = arr.len(); + + while right + offset < n { + let a = arr[n - (1 + right + offset)]; + let b = arr[n - (1 + left + offset)]; + if (a < b && !order_greater) || (a > b && order_greater) { + // Suffix is smaller, period is entire prefix so far. + right += offset + 1; + offset = 0; + period = right - left; + } else if a == b { + // Advance through repetition of the current period. + if offset + 1 == period { + right += offset + 1; + offset = 0; + } else { + offset += 1; + } + } else { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 0; + period = 1; + } + if period == known_period { + break; + } + } + debug_assert!(period <= known_period); + left + } +} diff --git a/library/core/tests/pattern.rs b/library/core/tests/pattern.rs index 866c1375d92ca..e92c30bbdd735 100644 --- a/library/core/tests/pattern.rs +++ b/library/core/tests/pattern.rs @@ -60,14 +60,9 @@ fn test_simple_iteration() { 'a', "forward iteration for ASCII string", next => Matches(0, 1), - next => Rejects(1, 2), - next => Rejects(2, 3), - next => Rejects(3, 4), - next => Rejects(4, 5), + next => Rejects(1, 5), next => Matches(5, 6), - next => Rejects(6, 7), - next => Rejects(7, 8), - next => Rejects(8, 9), + next => Rejects(6, 9), next => Done ); @@ -75,14 +70,9 @@ fn test_simple_iteration() { "abcdeabcd", 'a', "reverse iteration for ASCII string", - next_back => Rejects(8, 9), - next_back => Rejects(7, 8), - next_back => Rejects(6, 7), + next_back => Rejects(6, 9), next_back => Matches(5, 6), - next_back => Rejects(4, 5), - next_back => Rejects(3, 4), - next_back => Rejects(2, 3), - next_back => Rejects(1, 2), + next_back => Rejects(1, 5), next_back => Matches(0, 1), next_back => Done ); @@ -94,8 +84,7 @@ fn test_simple_iteration() { next => Matches(0, 3), next => Rejects(3, 6), next => Matches(6, 9), - next => Rejects(9, 12), - next => Rejects(12, 15), + next => Rejects(9, 15), next => Done ); 
@@ -103,14 +92,9 @@ fn test_simple_iteration() { "我的猫说meow", 'm', "forward iteration for mixed string", - next => Rejects(0, 3), - next => Rejects(3, 6), - next => Rejects(6, 9), - next => Rejects(9, 12), + next => Rejects(0, 12), next => Matches(12, 13), - next => Rejects(13, 14), - next => Rejects(14, 15), - next => Rejects(15, 16), + next => Rejects(13, 16), next => Done ); @@ -118,14 +102,9 @@ fn test_simple_iteration() { "我的猫说meow", '猫', "reverse iteration for mixed string", - next_back => Rejects(15, 16), - next_back => Rejects(14, 15), - next_back => Rejects(13, 14), - next_back => Rejects(12, 13), - next_back => Rejects(9, 12), + next_back => Rejects(9, 16), next_back => Matches(6, 9), - next_back => Rejects(3, 6), - next_back => Rejects(0, 3), + next_back => Rejects(0, 6), next_back => Done ); } @@ -235,11 +214,11 @@ fn test_forward_search_shared_bytes() { 'Á', "Forward search for two-byte Latin character; check if next() still works", next_match => Matches(0, 2), - next => Rejects(2, 3), + next => Rejects(2, 8), next_match => Matches(8, 10), - next => Rejects(10, 13), + next => Rejects(10, 32), next_match => Matches(32, 34), - next => Rejects(34, 37), + next => Rejects(34, 48), next_match => Done ); @@ -248,7 +227,7 @@ fn test_forward_search_shared_bytes() { '각', "Forward search for three-byte Hangul character", next_match => Matches(19, 22), - next => Rejects(22, 25), + next => Rejects(22, 28), next_match => Matches(28, 31), next_match => Matches(34, 37), next_match => Done @@ -259,11 +238,11 @@ fn test_forward_search_shared_bytes() { '각', "Forward search for three-byte Hangul character; check if next() still works", next_match => Matches(19, 22), - next => Rejects(22, 25), + next => Rejects(22, 28), next_match => Matches(28, 31), - next => Rejects(31, 32), + next => Rejects(31, 34), next_match => Matches(34, 37), - next => Rejects(37, 40), + next => Rejects(37, 48), next_match => Done ); @@ -272,9 +251,9 @@ fn test_forward_search_shared_bytes() { 'ก', 
"Forward search for three-byte Thai character", next_match => Matches(22, 25), - next => Rejects(25, 28), + next => Rejects(25, 40), next_match => Matches(40, 43), - next => Rejects(43, 47), + next => Rejects(43, 48), next_match => Done ); @@ -283,9 +262,9 @@ fn test_forward_search_shared_bytes() { 'ก', "Forward search for three-byte Thai character; check if next() still works", next_match => Matches(22, 25), - next => Rejects(25, 28), + next => Rejects(25, 40), next_match => Matches(40, 43), - next => Rejects(43, 47), + next => Rejects(43, 48), next_match => Done ); @@ -294,7 +273,7 @@ fn test_forward_search_shared_bytes() { '😁', "Forward search for four-byte emoji", next_match => Matches(15, 19), - next => Rejects(19, 22), + next => Rejects(19, 43), next_match => Matches(43, 47), next => Rejects(47, 48), next_match => Done @@ -305,7 +284,7 @@ fn test_forward_search_shared_bytes() { '😁', "Forward search for four-byte emoji; check if next() still works", next_match => Matches(15, 19), - next => Rejects(19, 22), + next => Rejects(19, 43), next_match => Matches(43, 47), next => Rejects(47, 48), next_match => Done @@ -316,9 +295,9 @@ fn test_forward_search_shared_bytes() { 'ꁁ', "Forward search for three-byte Yi character with repeated bytes", next_match => Matches(10, 13), - next => Rejects(13, 14), + next => Rejects(13, 37), next_match => Matches(37, 40), - next => Rejects(40, 43), + next => Rejects(40, 48), next_match => Done ); @@ -327,9 +306,9 @@ fn test_forward_search_shared_bytes() { 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works", next_match => Matches(10, 13), - next => Rejects(13, 14), + next => Rejects(13, 37), next_match => Matches(37, 40), - next => Rejects(40, 43), + next => Rejects(40, 48), next_match => Done ); } @@ -351,9 +330,9 @@ fn test_reverse_search_shared_bytes() { 'Á', "Reverse search for two-byte Latin character; check if next_back() still works", next_match_back => Matches(32, 34), - next_back 
=> Rejects(31, 32), + next_back => Rejects(10, 32), next_match_back => Matches(8, 10), - next_back => Rejects(7, 8), + next_back => Rejects(2, 8), next_match_back => Matches(0, 2), next_back => Done ); @@ -363,7 +342,7 @@ fn test_reverse_search_shared_bytes() { '각', "Reverse search for three-byte Hangul character", next_match_back => Matches(34, 37), - next_back => Rejects(32, 34), + next_back => Rejects(31, 34), next_match_back => Matches(28, 31), next_match_back => Matches(19, 22), next_match_back => Done @@ -374,11 +353,11 @@ fn test_reverse_search_shared_bytes() { '각', "Reverse search for three-byte Hangul character; check if next_back() still works", next_match_back => Matches(34, 37), - next_back => Rejects(32, 34), + next_back => Rejects(31, 34), next_match_back => Matches(28, 31), - next_back => Rejects(25, 28), + next_back => Rejects(22, 28), next_match_back => Matches(19, 22), - next_back => Rejects(15, 19), + next_back => Rejects(0, 19), next_match_back => Done ); @@ -387,9 +366,9 @@ fn test_reverse_search_shared_bytes() { 'ก', "Reverse search for three-byte Thai character", next_match_back => Matches(40, 43), - next_back => Rejects(37, 40), + next_back => Rejects(25, 40), next_match_back => Matches(22, 25), - next_back => Rejects(19, 22), + next_back => Rejects(0, 22), next_match_back => Done ); @@ -398,9 +377,9 @@ fn test_reverse_search_shared_bytes() { 'ก', "Reverse search for three-byte Thai character; check if next_back() still works", next_match_back => Matches(40, 43), - next_back => Rejects(37, 40), + next_back => Rejects(25, 40), next_match_back => Matches(22, 25), - next_back => Rejects(19, 22), + next_back => Rejects(0, 22), next_match_back => Done ); @@ -409,9 +388,9 @@ fn test_reverse_search_shared_bytes() { '😁', "Reverse search for four-byte emoji", next_match_back => Matches(43, 47), - next_back => Rejects(40, 43), + next_back => Rejects(19, 43), next_match_back => Matches(15, 19), - next_back => Rejects(14, 15), + next_back => Rejects(0, 
15), next_match_back => Done ); @@ -420,9 +399,9 @@ fn test_reverse_search_shared_bytes() { '😁', "Reverse search for four-byte emoji; check if next_back() still works", next_match_back => Matches(43, 47), - next_back => Rejects(40, 43), + next_back => Rejects(19, 43), next_match_back => Matches(15, 19), - next_back => Rejects(14, 15), + next_back => Rejects(0, 15), next_match_back => Done ); @@ -431,9 +410,9 @@ fn test_reverse_search_shared_bytes() { 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes", next_match_back => Matches(37, 40), - next_back => Rejects(34, 37), + next_back => Rejects(13, 37), next_match_back => Matches(10, 13), - next_back => Rejects(8, 10), + next_back => Rejects(0, 10), next_match_back => Done ); @@ -442,9 +421,9 @@ fn test_reverse_search_shared_bytes() { 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works", next_match_back => Matches(37, 40), - next_back => Rejects(34, 37), + next_back => Rejects(13, 37), next_match_back => Matches(10, 13), - next_back => Rejects(8, 10), + next_back => Rejects(0, 10), next_match_back => Done ); } @@ -494,9 +473,9 @@ fn double_ended_regression_test() { '각', "Reverse double ended search for three-byte Hangul character", next_match_back => Matches(34, 37), - next_back => Rejects(32, 34), + next_back => Rejects(31, 34), next_match => Matches(19, 22), - next => Rejects(22, 25), + next => Rejects(22, 28), next_match_back => Matches(28, 31), next_match => Done ); @@ -505,8 +484,8 @@ fn double_ended_regression_test() { 'ก', "Double ended search for three-byte Thai character", next_match => Matches(22, 25), - next_back => Rejects(47, 48), - next => Rejects(25, 28), + next_back => Rejects(43, 48), + next => Rejects(25, 40), next_match_back => Matches(40, 43), next_match => Done ); @@ -515,9 +494,9 @@ fn double_ended_regression_test() { '😁', "Double ended search for four-byte emoji", next_match_back => Matches(43, 47), - next => Rejects(0, 2), + 
next => Rejects(0, 15), next_match => Matches(15, 19), - next_back => Rejects(40, 43), + next_back => Rejects(19, 43), next_match => Done ); search_asserts!( @@ -525,9 +504,8 @@ fn double_ended_regression_test() { 'ꁁ', "Double ended search for three-byte Yi character with repeated bytes", next_match => Matches(10, 13), - next => Rejects(13, 14), + next => Rejects(13, 37), next_match_back => Matches(37, 40), - next_back => Rejects(34, 37), - next_match => Done + next_back => Done ); } From fdb0aa6c876282a42b6c45dd3d8df9c0d1545f33 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Mon, 20 Feb 2023 13:57:36 +0100 Subject: [PATCH 08/12] core: add concept of Flavour to core::str_bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since core::str_bytes module cannot assume byte slices it deals with are well-formed UTF-8 (or even WTF-8), the code must be defensive and accept invalid sequences. This eliminates optimisations which would be otherwise possible. Introduce a `Flavour` trait which tags `Bytes` type with information about the byte sequence. For example, if a `Bytes` object is created from `&str` it’s tagged with `Utf8` flavour which gives the code freedom to assume data is well-formed UTF-8. This brings back all the optimisations removed in previous commit. 
--- library/core/src/str/mod.rs | 1 + library/core/src/str/pattern.rs | 26 +- library/core/src/str/validations.rs | 2 +- library/core/src/str_bytes.rs | 623 +++++++++++++++++++++------- 4 files changed, 484 insertions(+), 168 deletions(-) diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index a3c1a0e80e2ea..594e07a266141 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -68,6 +68,7 @@ pub use iter::SplitAsciiWhitespace; #[stable(feature = "split_inclusive", since = "1.51.0")] pub use iter::SplitInclusive; +pub(crate) use validations::next_code_point_reverse; #[unstable(feature = "str_internals", issue = "none")] pub use validations::{ next_code_point, try_next_code_point, try_next_code_point_reverse, utf8_char_width, diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index fa3b9b4a04244..ca53e1ae8268b 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -91,7 +91,7 @@ impl<'a> Haystack for &'a str { /// Associated type for `>::Searcher`. #[derive(Clone, Debug)] -pub struct CharSearcher<'a>(str_bytes::CharSearcher<'a>); +pub struct CharSearcher<'a>(str_bytes::CharSearcher<'a, str_bytes::Utf8>); impl<'a> CharSearcher<'a> { fn new(haystack: &'a str, chr: char) -> Self { @@ -102,9 +102,7 @@ impl<'a> CharSearcher<'a> { unsafe impl<'a> Searcher<&'a str> for CharSearcher<'a> { #[inline] fn haystack(&self) -> &'a str { - // SAFETY: self.0’s haystack was created from &str thus it is valid - // UTF-8. 
- unsafe { super::from_utf8_unchecked(self.0.haystack().as_bytes()) } + self.0.haystack().into() } #[inline] fn next(&mut self) -> SearchStep { @@ -165,12 +163,7 @@ impl<'a> Pattern<&'a str> for char { #[inline] fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { - self.strip_prefix_of(str_bytes::Bytes::from(haystack)).map(|bytes| { - // SAFETY: Bytes were created from &str and Bytes never splits - // inside of UTF-8 bytes sequences thus `bytes` is still valid - // UTF-8. - unsafe { super::from_utf8_unchecked(bytes.as_bytes()) } - }) + self.strip_prefix_of(str_bytes::Bytes::from(haystack)).map(<&str>::from) } #[inline] @@ -180,12 +173,7 @@ impl<'a> Pattern<&'a str> for char { #[inline] fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> { - self.strip_suffix_of(str_bytes::Bytes::from(haystack)).map(|bytes| { - // SAFETY: Bytes were created from &str and Bytes never splits - // inside of UTF-8 bytes sequences thus `bytes` is still valid - // UTF-8. - unsafe { super::from_utf8_unchecked(bytes.as_bytes()) } - }) + self.strip_suffix_of(str_bytes::Bytes::from(haystack)).map(<&str>::from) } } @@ -613,7 +601,7 @@ impl<'a, 'b> Pattern<&'a str> for &'b str { #[derive(Clone, Debug)] /// Associated type for `<&str as Pattern<&'a str>>::Searcher`. -pub struct StrSearcher<'a, 'b>(crate::str_bytes::StrSearcher<'a, 'b>); +pub struct StrSearcher<'a, 'b>(crate::str_bytes::StrSearcher<'a, 'b, crate::str_bytes::Utf8>); impl<'a, 'b> StrSearcher<'a, 'b> { fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { @@ -625,9 +613,7 @@ impl<'a, 'b> StrSearcher<'a, 'b> { unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn haystack(&self) -> &'a str { - let bytes = self.0.haystack().as_bytes(); - // SAFETY: self.0.haystack() was created from a &str. 
- unsafe { crate::str::from_utf8_unchecked(bytes) } + self.0.haystack().into() } #[inline] diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index b4a183711a1ee..c66a06837f39f 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -76,7 +76,7 @@ pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> /// /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[inline] -pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option +pub(crate) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option where I: DoubleEndedIterator, { diff --git a/library/core/src/str_bytes.rs b/library/core/src/str_bytes.rs index 047faf2005a12..bc781e345b5b1 100644 --- a/library/core/src/str_bytes.rs +++ b/library/core/src/str_bytes.rs @@ -20,11 +20,15 @@ #![unstable(feature = "str_internals", issue = "none")] use crate::cmp; +use crate::marker::PhantomData; use crate::mem::take; use crate::ops; use crate::pattern; -use crate::pattern::{Haystack, MatchOnly, RejectOnly, SearchStep, Searcher}; -use crate::str::{try_next_code_point, try_next_code_point_reverse}; +use crate::pattern::{Haystack, MatchOnly, Pattern, RejectOnly, SearchStep}; +use crate::str::{ + next_code_point, next_code_point_reverse, try_next_code_point, try_next_code_point_reverse, + utf8_char_width, +}; type OptRange = Option<(usize, usize)>; type Range = ops::Range; @@ -40,10 +44,29 @@ type Range = ops::Range; /// sequences. This is in a sense a generalisation of a `&str` which allows /// portions of the buffer to be ill-formed while preserving correctness of /// existing well-formed parts. +/// +/// The `F` generic argument tags the slice with a [flavour][Flavour] which +/// specifies structure of the data. 
#[derive(Copy, Clone, Debug)] -pub struct Bytes<'a>(&'a [u8]); +pub struct Bytes<'a, F>(&'a [u8], PhantomData); + +impl<'a, F: Flavour> Bytes<'a, F> { + /// Creates a new `Bytes` wrapper around bytes slice. + /// + /// # Safety + /// + /// Caller must guarantee that the bytes adhere to the requirements for the + /// flavour `F`. E.g. for [`Wtf8`] flavour, the bytes must be well-formed + /// WTF-8 encoded string. + /// + /// It may be more convenient to use `Bytes::From` implementations which are + /// provided for `&str`, `&OsStr` and `&[u8]`. + pub unsafe fn new(bytes: &'a [u8]) -> Bytes<'a, F> { + Self(bytes, PhantomData) + } +} -impl<'a> Bytes<'a> { +impl<'a, F: Flavour> Bytes<'a, F> { pub fn as_bytes(self) -> &'a [u8] { self.0 } @@ -68,11 +91,7 @@ impl<'a> Bytes<'a> { /// advance position byte at a time. If you need to be able to advance /// position byte at a time use `advance_range_start` instead. fn adjust_position_fwd(self, range: Range) -> usize { - range.start - + self.as_bytes()[range.clone()] - .iter() - .take_while(|chr| !chr.is_utf8_char_boundary()) - .count() + F::adjust_position_fwd(self.as_bytes(), range) } /// Adjusts position backward so that it points at the closest potential @@ -87,12 +106,7 @@ impl<'a> Bytes<'a> { /// advance position byte at a time. If you need to be able to advance /// position character at a time use `advance_range_end` instead. fn adjust_position_bwd(self, range: Range) -> usize { - range.end - - self.as_bytes()[range.start..range.end + 1] - .iter() - .rev() - .take_while(|chr| !chr.is_utf8_char_boundary()) - .count() + F::adjust_position_bwd(self.as_bytes(), range) } /// Given a valid range update it’s start so it falls on the next character @@ -103,11 +117,7 @@ impl<'a> Bytes<'a> { /// `range.start + 1`. In other words, well-formed WTF-8 bytes sequence are /// skipped in one go while ill-formed sequences are skipped byte-by-byte. 
fn advance_range_start(self, range: Range) -> usize { - assert!(!range.is_empty()); - match try_next_code_point(&self.0[range.clone()]) { - Some((_, len)) => range.start + len, - None => range.end.min(range.start + 1), - } + range.start + F::advance_range_start(&self.as_bytes()[range]) } /// Given a valid range update it’s end so it falls on the previous @@ -119,11 +129,7 @@ impl<'a> Bytes<'a> { /// sequence are skipped in one go while ill-formed sequences are skipped /// byte-by-byte. fn advance_range_end(self, range: Range) -> usize { - assert!(!range.is_empty()); - match try_next_code_point_reverse(&self.0[range.clone()]) { - Some((_, len)) => range.end - len, - None => range.end - 1, - } + range.start + F::advance_range_end(&self.as_bytes()[range]) } /// Returns valid UTF-8 character at the front of the slice. @@ -132,7 +138,7 @@ impl<'a> Bytes<'a> { /// Otherwise returns decoded character and it’s UTF-8 encoding’s length. /// WTF-8 sequences which encode surrogates are considered invalid. fn get_first_code_point(self) -> Option<(char, usize)> { - try_next_code_point(&self.0) + F::get_first_code_point(self.as_bytes()) } /// Returns valid UTF-8 character at the end of the slice. @@ -140,8 +146,8 @@ impl<'a> Bytes<'a> { /// If slice doesn’t end with a valid UTF-8 sequence, returns `None`. /// Otherwise returns decoded character and it’s UTF-8 encoding’s length. /// WTF-8 sequences which encode surrogates are considered invalid. - fn get_last_code_point(&self) -> Option<(char, usize)> { - try_next_code_point_reverse(&self.0) + fn get_last_code_point(self) -> Option<(char, usize)> { + F::get_last_code_point(self.as_bytes()) } /// Looks for the next UTF-8-encoded character in the slice. @@ -151,13 +157,8 @@ impl<'a> Bytes<'a> { /// Returns position of the match, decoded character and UTF-8 length of /// that character. 
fn find_code_point_fwd(self, range: Range) -> Option<(usize, char, usize)> { - let bytes = &self.as_bytes()[range.clone()]; - (0..bytes.len()) - .filter_map(|pos| { - let (chr, len) = try_next_code_point(&bytes[pos..])?; - Some((range.start + pos, chr, len)) - }) - .next() + F::find_code_point_fwd(&self.as_bytes()[range.clone()]) + .map(|(pos, chr, len)| (range.start + pos, chr, len)) } /// Looks backwards for the next UTF-8 encoded character in the slice. @@ -167,28 +168,301 @@ impl<'a> Bytes<'a> { /// Returns position of the match, decoded character and UTF-8 length of /// that character. fn find_code_point_bwd(&self, range: Range) -> Option<(usize, char, usize)> { - let bytes = &self.as_bytes()[range.clone()]; - (0..bytes.len()) - .rev() - .filter_map(|pos| { - let (chr, len) = try_next_code_point(&bytes[pos..])?; - Some((range.start + pos, chr, len)) - }) - .next() + F::find_code_point_bwd(&self.as_bytes()[range.clone()]) + .map(|(pos, chr, len)| (range.start + pos, chr, len)) } } -impl<'a> From<&'a [u8]> for Bytes<'a> { +impl<'a> From<&'a [u8]> for Bytes<'a, Unstructured> { #[inline] fn from(val: &'a [u8]) -> Self { - Self(val) + Self(val, PhantomData) } } -impl<'a> From<&'a str> for Bytes<'a> { +impl<'a> From<&'a str> for Bytes<'a, Utf8> { #[inline] fn from(val: &'a str) -> Self { - Self(val.as_bytes()) + // SAFETY: `str`’s bytes are guaranteed to be UTF-8 so `Utf8` flavour + // is correct. + unsafe { Bytes::new(val.as_bytes()) } + } +} + +impl<'a> From> for &'a str { + #[inline] + fn from(bytes: Bytes<'a, Utf8>) -> &'a str { + if cfg!(debug_assertions) { + crate::str::from_utf8(bytes.as_bytes()).unwrap() + } else { + // SAFETY: Bytes has been created from &str and we’ve been + // maintaining UTF-8 format.
+ unsafe { crate::str::from_utf8_unchecked(bytes.as_bytes()) } + } + } +} + +#[derive(Clone, Copy, Debug)] +pub enum Unstructured {} +#[derive(Clone, Copy, Debug)] +pub enum Wtf8 {} +#[derive(Clone, Copy, Debug)] +pub enum Utf8 {} + +/// A marker trait indicating ‘flavour’ of data referred to by [`Bytes`] type. +/// +/// The trait abstracts away operations related to identifying and decoding +/// ‘characters’ from a bytes slice. A valid WTF-8 byte sequence is always +/// treated as indivisible ‘character’ but depending on the flavour, code can +/// make different assumptions about contents of the bytes slice: +/// - [`Unstructured`] flavoured bytes slice may contain ill-formed bytes +/// sequences and in those each byte is treated as separate ‘character’, +/// - [`Wtf8`] flavoured bytes slice is a well-formed WTF-8-encoded string (that +/// is some of the byte sequences may encode surrogate code points) and +/// - [`Utf8`] flavoured bytes slice is a well-formed UTF-8-encoded string (that +/// is all byte sequences encode valid Unicode code points). +pub trait Flavour: private::Flavour {} + +impl Flavour for Unstructured {} +impl Flavour for Wtf8 {} +impl Flavour for Utf8 {} + +mod private { + use super::*; + + /// Private methods of the [`super::Flavour`] trait.
+ pub trait Flavour: Copy + core::fmt::Debug { + const IS_WTF8: bool; + fn adjust_position_fwd(bytes: &[u8], range: Range) -> usize; + fn adjust_position_bwd(bytes: &[u8], range: Range) -> usize; + fn advance_range_start(bytes: &[u8]) -> usize; + fn advance_range_end(bytes: &[u8]) -> usize; + fn get_first_code_point(bytes: &[u8]) -> Option<(char, usize)>; + fn get_last_code_point(bytes: &[u8]) -> Option<(char, usize)>; + fn find_code_point_fwd(bytes: &[u8]) -> Option<(usize, char, usize)>; + fn find_code_point_bwd(bytes: &[u8]) -> Option<(usize, char, usize)>; + } + + impl Flavour for super::Unstructured { + const IS_WTF8: bool = false; + + fn adjust_position_fwd(bytes: &[u8], range: Range) -> usize { + range.start + + bytes[range.clone()].iter().take_while(|chr| !chr.is_utf8_char_boundary()).count() + } + + fn adjust_position_bwd(bytes: &[u8], range: Range) -> usize { + range.end + - bytes[range.start..range.end + 1] + .iter() + .rev() + .take_while(|chr| !chr.is_utf8_char_boundary()) + .count() + } + + fn advance_range_start(bytes: &[u8]) -> usize { + assert!(!bytes.is_empty()); + try_next_code_point(bytes).map_or(1, |(_, len)| len) + } + + fn advance_range_end(bytes: &[u8]) -> usize { + assert!(!bytes.is_empty()); + bytes.len() - try_next_code_point_reverse(bytes).map_or(1, |(_, len)| len) + } + + fn get_first_code_point(bytes: &[u8]) -> Option<(char, usize)> { + try_next_code_point(bytes) + } + + fn get_last_code_point(bytes: &[u8]) -> Option<(char, usize)> { + try_next_code_point_reverse(bytes) + } + + fn find_code_point_fwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + (0..bytes.len()) + .filter_map(|pos| { + let (chr, len) = try_next_code_point(&bytes[pos..])?; + Some((pos, chr, len)) + }) + .next() + } + + fn find_code_point_bwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + (0..bytes.len()) + .rev() + .filter_map(|pos| { + let (chr, len) = try_next_code_point(&bytes[pos..])?; + Some((pos, chr, len)) + }) + .next() + } + } + + impl Flavour for Wtf8 { 
+ const IS_WTF8: bool = true; + + fn adjust_position_fwd(bytes: &[u8], range: Range) -> usize { + let mut pos = range.start; + // Input is WTF-8 so we will never need to move more than three + // positions. This happens when we’re pointing at the first + // continuation byte of a four-byte sequence. Unroll the loop. + for _ in 0..3 { + // We’re not checking pos against _end because we know that _end + // == bytes.len() or falls on a character boundary. We can + // therefore compare against bytes.len() and eliminate that + // comparison. + if bytes.get(pos).map_or(true, |b: &u8| b.is_utf8_char_boundary()) { + break; + } + pos += 1; + } + pos + } + + fn adjust_position_bwd(bytes: &[u8], range: Range) -> usize { + let mut pos = range.end; + // Input is WTF-8 so we will never need to move more than three + // positions. This happens when we’re pointing at the last + // continuation byte of a four-byte sequence. Unroll the loop. + for _ in 0..3 { + // SAFETY: `bytes` is well-formed WTF-8 sequence and at function + // start `pos` is index within `bytes`. Therefore, `bytes[pos]` + // is valid and a) if it’s a character boundary we exit the + // function or b) otherwise we know that `pos > 0` (because + // otherwise `bytes` wouldn’t be well-formed WTF-8). + if unsafe { bytes.get_unchecked(pos) }.is_utf8_char_boundary() { + break; + } + pos -= 1; + } + pos + } + + fn advance_range_start(bytes: &[u8]) -> usize { + // Input is valid WTF-8 so we can just deduce length of next + // sequence to skip from the first byte. + utf8_char_width(*bytes.get(0).unwrap()) + } + + fn advance_range_end(bytes: &[u8]) -> usize { + let end = bytes.len().checked_sub(1).unwrap(); + Self::adjust_position_bwd(bytes, 0..end) + } + + fn get_first_code_point(bytes: &[u8]) -> Option<(char, usize)> { + // SAFETY: We’re Wtf8 flavour. Client promises that bytes are + // well-formed WTF-8. + let cp = unsafe { next_code_point(&mut bytes.iter())?
}; + // WTF-8 might produce surrogate code points so we still need to + // verify that we got a valid character. + char::from_u32(cp).map(|chr| (chr, len_utf8(cp))) + } + + fn get_last_code_point(bytes: &[u8]) -> Option<(char, usize)> { + // SAFETY: We’re Wtf8 flavour. Client promises that bytes are + // well-formed WTF-8. + let cp = unsafe { next_code_point_reverse(&mut bytes.iter().rev())? }; + // WTF-8 might produce surrogate code points so we still need to + // verify that we got a valid character. + char::from_u32(cp).map(|chr| (chr, len_utf8(cp))) + } + + fn find_code_point_fwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + let mut iter = bytes.iter(); + let mut pos = 0; + loop { + // SAFETY: We’re Wtf8 flavour. Client promises that bytes are + // well-formed WTF-8. + let cp = unsafe { next_code_point(&mut iter)? }; + let len = len_utf8(cp); + if let Some(chr) = char::from_u32(cp) { + return Some((pos, chr, len)); + } + pos += len; + } + } + + fn find_code_point_bwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + let mut iter = bytes.iter().rev(); + let mut pos = bytes.len(); + loop { + // SAFETY: We’re Wtf8 flavour. Client promises that bytes are + // well-formed WTF-8. + let cp = unsafe { next_code_point_reverse(&mut iter)? 
}; + let len = len_utf8(cp); + pos -= len; + if let Some(chr) = char::from_u32(cp) { + return Some((pos, chr, len)); + } + } + } + } + + impl Flavour for Utf8 { + const IS_WTF8: bool = true; + + fn adjust_position_fwd(bytes: &[u8], range: Range) -> usize { + Wtf8::adjust_position_fwd(bytes, range) + } + + fn adjust_position_bwd(bytes: &[u8], range: Range) -> usize { + Wtf8::adjust_position_bwd(bytes, range) + } + + fn advance_range_start(bytes: &[u8]) -> usize { + Wtf8::advance_range_start(bytes) + } + + fn advance_range_end(bytes: &[u8]) -> usize { + Wtf8::advance_range_end(bytes) + } + + fn get_first_code_point(bytes: &[u8]) -> Option<(char, usize)> { + let (_, chr, len) = Self::find_code_point_fwd(bytes)?; + Some((chr, len)) + } + + fn get_last_code_point(bytes: &[u8]) -> Option<(char, usize)> { + let (_, chr, len) = Self::find_code_point_bwd(bytes)?; + Some((chr, len)) + } + + fn find_code_point_fwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + // SAFETY: We’re Utf8 flavour. Client promises that bytes are + // well-formed UTF-8. We can not only assume well-formed byte + // sequence but also that produced code points are valid. + let chr = unsafe { char::from_u32_unchecked(next_code_point(&mut bytes.iter())?) }; + let len = chr.len_utf8(); + Some((0, chr, len)) + } + + fn find_code_point_bwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + // SAFETY: We’re Utf8 flavour. Client promises that bytes are + // well-formed UTF-8. We can not only assume well-formed byte + // sequence but also that produced code points are valid. + let chr = unsafe { + let code = next_code_point_reverse(&mut bytes.iter().rev())?; + char::from_u32_unchecked(code) + }; + let len = chr.len_utf8(); + Some((bytes.len() - len, chr, len)) + } + } + + // Copied from src/chars/methods.rs. We need it because it’s not public + // there and char::len_utf8 requires us to have a char and we need this to + // work on surrogate code points as well. 
+ #[inline] + const fn len_utf8(code: u32) -> usize { + if code < 0x80 { + 1 + } else if code < 0x800 { + 2 + } else if code < 0x10000 { + 3 + } else { + 4 + } } } @@ -198,25 +472,45 @@ trait SearchResult: crate::pattern::SearchResult { /// /// Doesn’t move the start position past `begin`. If position was adjusted, /// updates `*out` as well. - fn adjust_reject_start_bwd(self, bytes: Bytes<'_>, begin: usize, out: &mut usize) -> Self; + fn adjust_reject_start_bwd( + self, + bytes: Bytes<'_, F>, + begin: usize, + out: &mut usize, + ) -> Self; /// Adjusts reject’s end position forwards to make sure it doesn’t fall /// withing well-formed WTF-8 sequence. /// /// Doesn’t move the end position past `len`. If position was adjusted, /// updates `*out` as well. - fn adjust_reject_end_fwd(self, bytes: Bytes<'_>, len: usize, out: &mut usize) -> Self; + fn adjust_reject_end_fwd( + self, + bytes: Bytes<'_, F>, + len: usize, + out: &mut usize, + ) -> Self; } impl SearchResult for SearchStep { - fn adjust_reject_start_bwd(mut self, bytes: Bytes<'_>, begin: usize, out: &mut usize) -> Self { + fn adjust_reject_start_bwd( + mut self, + bytes: Bytes<'_, F>, + begin: usize, + out: &mut usize, + ) -> Self { if let SearchStep::Reject(ref mut start, _) = self { *start = bytes.adjust_position_bwd(begin..*start); *out = *start; } self } - fn adjust_reject_end_fwd(mut self, bytes: Bytes<'_>, len: usize, out: &mut usize) -> Self { + fn adjust_reject_end_fwd( + mut self, + bytes: Bytes<'_, F>, + len: usize, + out: &mut usize, + ) -> Self { if let SearchStep::Reject(_, ref mut end) = self { *end = bytes.adjust_position_fwd(*end..len); *out = *end; @@ -226,23 +520,43 @@ impl SearchResult for SearchStep { } impl SearchResult for MatchOnly { - fn adjust_reject_start_bwd(self, _bytes: Bytes<'_>, _begin: usize, _out: &mut usize) -> Self { + fn adjust_reject_start_bwd( + self, + _bytes: Bytes<'_, F>, + _begin: usize, + _out: &mut usize, + ) -> Self { self } - fn adjust_reject_end_fwd(self, _bytes: 
Bytes<'_>, _end: usize, _out: &mut usize) -> Self { + fn adjust_reject_end_fwd( + self, + _bytes: Bytes<'_, F>, + _end: usize, + _out: &mut usize, + ) -> Self { self } } impl SearchResult for RejectOnly { - fn adjust_reject_start_bwd(mut self, bytes: Bytes<'_>, begin: usize, out: &mut usize) -> Self { + fn adjust_reject_start_bwd( + mut self, + bytes: Bytes<'_, F>, + begin: usize, + out: &mut usize, + ) -> Self { if let RejectOnly(Some((ref mut start, _))) = self { *start = bytes.adjust_position_bwd(begin..*start); *out = *start; } self } - fn adjust_reject_end_fwd(mut self, bytes: Bytes<'_>, len: usize, out: &mut usize) -> Self { + fn adjust_reject_end_fwd( + mut self, + bytes: Bytes<'_, F>, + len: usize, + out: &mut usize, + ) -> Self { if let RejectOnly(Some((_, ref mut end))) = self { *end = bytes.adjust_position_fwd(*end..len); *out = *end; @@ -255,7 +569,7 @@ impl SearchResult for RejectOnly { // Impl for Haystack //////////////////////////////////////////////////////////////////////////////// -impl Haystack for Bytes<'_> { +impl<'hs, F: Flavour> Haystack for Bytes<'hs, F> { type Cursor = usize; fn cursor_at_front(self) -> Self::Cursor { @@ -267,13 +581,17 @@ impl Haystack for Bytes<'_> { fn is_empty(self) -> bool { self.0.is_empty() } + unsafe fn get_unchecked(self, range: Range) -> Self { - Self(if cfg!(debug_assertions) { - self.0.get(range).unwrap() - } else { - // SAFETY: Caller promises cursor is a valid split position. - unsafe { self.0.get_unchecked(range) } - }) + Self( + if cfg!(debug_assertions) { + self.0.get(range).unwrap() + } else { + // SAFETY: Caller promises cursor is a valid split position. 
+ unsafe { self.0.get_unchecked(range) } + }, + PhantomData, + ) } } @@ -281,32 +599,32 @@ impl Haystack for Bytes<'_> { // Impl Pattern for char //////////////////////////////////////////////////////////////////////////////// -impl<'hs> pattern::Pattern> for char { - type Searcher = CharSearcher<'hs>; +impl<'hs, F: Flavour> Pattern> for char { + type Searcher = CharSearcher<'hs, F>; - fn into_searcher(self, haystack: Bytes<'hs>) -> Self::Searcher { + fn into_searcher(self, haystack: Bytes<'hs, F>) -> Self::Searcher { Self::Searcher::new(haystack, self) } - fn is_contained_in(self, haystack: Bytes<'hs>) -> bool { + fn is_contained_in(self, haystack: Bytes<'hs, F>) -> bool { let mut buf = [0; 4]; encode_utf8(self, &mut buf).is_contained_in(haystack) } - fn is_prefix_of(self, haystack: Bytes<'hs>) -> bool { + fn is_prefix_of(self, haystack: Bytes<'hs, F>) -> bool { let mut buf = [0; 4]; encode_utf8(self, &mut buf).is_prefix_of(haystack) } - fn strip_prefix_of(self, haystack: Bytes<'hs>) -> Option> { + fn strip_prefix_of(self, haystack: Bytes<'hs, F>) -> Option> { let mut buf = [0; 4]; encode_utf8(self, &mut buf).strip_prefix_of(haystack) } - fn is_suffix_of(self, haystack: Bytes<'hs>) -> bool { + fn is_suffix_of(self, haystack: Bytes<'hs, F>) -> bool { let mut buf = [0; 4]; encode_utf8(self, &mut buf).is_suffix_of(haystack) } - fn strip_suffix_of(self, haystack: Bytes<'hs>) -> Option> { + fn strip_suffix_of(self, haystack: Bytes<'hs, F>) -> Option> { let mut buf = [0; 4]; encode_utf8(self, &mut buf).strip_suffix_of(haystack) } @@ -320,8 +638,8 @@ fn encode_utf8(chr: char, buf: &mut [u8; 4]) -> &str { } #[derive(Clone, Debug)] -pub struct CharSearcher<'hs> { - haystack: Bytes<'hs>, +pub struct CharSearcher<'hs, F> { + haystack: Bytes<'hs, F>, state: CharSearcherState, } @@ -339,15 +657,15 @@ struct CharSearcherState { is_match_bwd: bool, } -impl<'hs> CharSearcher<'hs> { +impl<'hs, F: Flavour> CharSearcher<'hs, F> { #[inline] - pub fn new(haystack: Bytes<'hs>, chr: 
char) -> Self { + pub fn new(haystack: Bytes<'hs, F>, chr: char) -> Self { Self { haystack, state: CharSearcherState::new(haystack.len(), chr) } } } -unsafe impl<'hs> pattern::Searcher> for CharSearcher<'hs> { - fn haystack(&self) -> Bytes<'hs> { +unsafe impl<'hs, F: Flavour> pattern::Searcher> for CharSearcher<'hs, F> { + fn haystack(&self) -> Bytes<'hs, F> { self.haystack } @@ -355,26 +673,26 @@ unsafe impl<'hs> pattern::Searcher> for CharSearcher<'hs> { self.state.next_fwd(self.haystack) } fn next_match(&mut self) -> OptRange { - self.state.next_fwd::(self.haystack).0 + self.state.next_fwd::(self.haystack).0 } fn next_reject(&mut self) -> OptRange { - self.state.next_fwd::(self.haystack).0 + self.state.next_fwd::(self.haystack).0 } } -unsafe impl<'hs> pattern::ReverseSearcher> for CharSearcher<'hs> { +unsafe impl<'hs, F: Flavour> pattern::ReverseSearcher> for CharSearcher<'hs, F> { fn next_back(&mut self) -> SearchStep { self.state.next_bwd(self.haystack) } fn next_match_back(&mut self) -> OptRange { - self.state.next_bwd::(self.haystack).0 + self.state.next_bwd::(self.haystack).0 } fn next_reject_back(&mut self) -> OptRange { - self.state.next_bwd::(self.haystack).0 + self.state.next_bwd::(self.haystack).0 } } -impl<'hs> pattern::DoubleEndedSearcher> for CharSearcher<'hs> {} +impl<'hs, F: Flavour> pattern::DoubleEndedSearcher> for CharSearcher<'hs, F> {} impl CharSearcherState { fn new(haystack_len: usize, chr: char) -> Self { @@ -386,7 +704,7 @@ impl CharSearcherState { } } - fn find_match_fwd(&mut self, haystack: Bytes<'_>) -> OptRange { + fn find_match_fwd(&mut self, haystack: Bytes<'_, F>) -> OptRange { let start = if take(&mut self.is_match_fwd) { (!self.range.is_empty()).then_some(self.range.start) } else { @@ -399,7 +717,7 @@ impl CharSearcherState { Some((start, start + self.needle.len())) } - fn next_reject_fwd(&mut self, haystack: Bytes<'_>) -> OptRange { + fn next_reject_fwd(&mut self, haystack: Bytes<'_, F>) -> OptRange { if take(&mut 
self.is_match_fwd) { if self.range.is_empty() { return None; @@ -419,7 +737,7 @@ impl CharSearcherState { } } - fn next_fwd(&mut self, haystack: Bytes<'_>) -> R { + fn next_fwd(&mut self, haystack: Bytes<'_, F>) -> R { if R::USE_EARLY_REJECT { match self.next_reject_fwd(haystack) { Some((start, end)) => R::rejecting(start, end).unwrap(), @@ -444,7 +762,7 @@ impl CharSearcherState { } } - fn find_match_bwd(&mut self, haystack: Bytes<'_>) -> OptRange { + fn find_match_bwd(&mut self, haystack: Bytes<'_, F>) -> OptRange { let start = if take(&mut self.is_match_bwd) { (!self.range.is_empty()).then(|| self.range.end - self.needle.len()) } else { @@ -457,7 +775,7 @@ impl CharSearcherState { Some((start, start + self.needle.len())) } - fn next_reject_bwd(&mut self, haystack: Bytes<'_>) -> OptRange { + fn next_reject_bwd(&mut self, haystack: Bytes<'_, F>) -> OptRange { if take(&mut self.is_match_bwd) { if self.range.is_empty() { return None; @@ -477,7 +795,7 @@ impl CharSearcherState { } } - fn next_bwd(&mut self, haystack: Bytes<'_>) -> R { + fn next_bwd(&mut self, haystack: Bytes<'_, F>) -> R { if R::USE_EARLY_REJECT { match self.next_reject_bwd(haystack) { Some((start, end)) => R::rejecting(start, end).unwrap(), @@ -614,27 +932,27 @@ mod naive { // Impl Pattern for FnMut(char) and FnMut(Result) //////////////////////////////////////////////////////////////////////////////// -impl<'hs, F: FnMut(char) -> bool> pattern::Pattern> for F { - type Searcher = PredicateSearcher<'hs, F>; +impl<'hs, F: Flavour, P: FnMut(char) -> bool> Pattern> for P { + type Searcher = PredicateSearcher<'hs, F, P>; - fn into_searcher(self, haystack: Bytes<'hs>) -> Self::Searcher { + fn into_searcher(self, haystack: Bytes<'hs, F>) -> Self::Searcher { Self::Searcher::new(haystack, self) } - fn is_prefix_of(mut self, haystack: Bytes<'hs>) -> bool { + fn is_prefix_of(mut self, haystack: Bytes<'hs, F>) -> bool { haystack.get_first_code_point().map_or(false, |(chr, _)| self(chr)) } - fn 
strip_prefix_of(mut self, haystack: Bytes<'hs>) -> Option> { + fn strip_prefix_of(mut self, haystack: Bytes<'hs, F>) -> Option> { let (chr, len) = haystack.get_first_code_point()?; // SAFETY: We’ve just checked slice starts with len-byte long // well-formed sequence. self(chr).then(|| unsafe { haystack.get_unchecked(len..haystack.len()) }) } - fn is_suffix_of(mut self, haystack: Bytes<'hs>) -> bool { + fn is_suffix_of(mut self, haystack: Bytes<'hs, F>) -> bool { haystack.get_last_code_point().map_or(false, |(chr, _)| self(chr)) } - fn strip_suffix_of(mut self, haystack: Bytes<'hs>) -> Option> { + fn strip_suffix_of(mut self, haystack: Bytes<'hs, F>) -> Option> { let (chr, len) = haystack.get_last_code_point()?; let len = haystack.len() - len; // SAFETY: We’ve just checked slice ends with len-byte long well-formed @@ -644,22 +962,23 @@ impl<'hs, F: FnMut(char) -> bool> pattern::Pattern> for F { } #[derive(Clone, Debug)] -pub struct PredicateSearcher<'hs, F> { - haystack: Bytes<'hs>, - pred: F, +pub struct PredicateSearcher<'hs, F, P> { + haystack: Bytes<'hs, F>, + pred: P, start: usize, end: usize, fwd_match_len: u8, bwd_match_len: u8, } -impl<'hs, F> PredicateSearcher<'hs, F> { - fn new(haystack: Bytes<'hs>, pred: F) -> Self { +impl<'hs, F: Flavour, P> PredicateSearcher<'hs, F, P> { + #[inline] + pub fn new(haystack: Bytes<'hs, F>, pred: P) -> Self { Self { haystack, pred, start: 0, end: haystack.len(), fwd_match_len: 0, bwd_match_len: 0 } } } -impl<'hs, F: FnMut(char) -> bool> PredicateSearcher<'hs, F> { +impl<'hs, F: Flavour, P: FnMut(char) -> bool> PredicateSearcher<'hs, F, P> { fn find_match_fwd(&mut self) -> Option<(usize, usize)> { let mut start = self.start; while start < self.end { @@ -736,8 +1055,12 @@ impl<'hs, F: FnMut(char) -> bool> PredicateSearcher<'hs, F> { } } -unsafe impl<'hs, F: FnMut(char) -> bool> Searcher> for PredicateSearcher<'hs, F> { - fn haystack(&self) -> Bytes<'hs> { +unsafe impl<'hs, F, P> pattern::Searcher> for PredicateSearcher<'hs, 
F, P> +where + F: Flavour, + P: FnMut(char) -> bool, +{ + fn haystack(&self) -> Bytes<'hs, F> { self.haystack } fn next(&mut self) -> SearchStep { @@ -751,8 +1074,10 @@ unsafe impl<'hs, F: FnMut(char) -> bool> Searcher> for PredicateSearc } } -unsafe impl<'hs, F: FnMut(char) -> bool> pattern::ReverseSearcher> - for PredicateSearcher<'hs, F> +unsafe impl<'hs, F, P> pattern::ReverseSearcher> for PredicateSearcher<'hs, F, P> +where + F: Flavour, + P: FnMut(char) -> bool, { fn next_back(&mut self) -> SearchStep { self.next_bwd() @@ -765,8 +1090,10 @@ unsafe impl<'hs, F: FnMut(char) -> bool> pattern::ReverseSearcher> } } -impl<'hs, F: FnMut(char) -> bool> pattern::DoubleEndedSearcher> - for PredicateSearcher<'hs, F> +impl<'hs, F, P> pattern::DoubleEndedSearcher> for PredicateSearcher<'hs, F, P> +where + F: Flavour, + P: FnMut(char) -> bool, { } @@ -774,65 +1101,67 @@ impl<'hs, F: FnMut(char) -> bool> pattern::DoubleEndedSearcher> // Impl Pattern for &str //////////////////////////////////////////////////////////////////////////////// -impl<'hs, 'p> pattern::Pattern> for &'p str { - type Searcher = StrSearcher<'hs, 'p>; +impl<'hs, 'p, F: Flavour> Pattern> for &'p str { + type Searcher = StrSearcher<'hs, 'p, F>; - fn into_searcher(self, haystack: Bytes<'hs>) -> Self::Searcher { + fn into_searcher(self, haystack: Bytes<'hs, F>) -> Self::Searcher { Self::Searcher::new(haystack, self) } - fn is_prefix_of(self, haystack: Bytes<'hs>) -> bool { + fn is_prefix_of(self, haystack: Bytes<'hs, F>) -> bool { haystack.as_bytes().starts_with(self.as_bytes()) } - fn strip_prefix_of(self, haystack: Bytes<'hs>) -> Option> { - haystack.as_bytes().strip_prefix(self.as_bytes()).map(Bytes) + fn strip_prefix_of(self, haystack: Bytes<'hs, F>) -> Option> { + haystack.as_bytes().strip_prefix(self.as_bytes()).map(|bytes| Bytes(bytes, PhantomData)) } - fn is_suffix_of(self, haystack: Bytes<'hs>) -> bool { + fn is_suffix_of(self, haystack: Bytes<'hs, F>) -> bool { 
haystack.as_bytes().ends_with(self.as_bytes()) } - fn strip_suffix_of(self, haystack: Bytes<'hs>) -> Option> { - haystack.as_bytes().strip_suffix(self.as_bytes()).map(Bytes) + fn strip_suffix_of(self, haystack: Bytes<'hs, F>) -> Option> { + haystack.as_bytes().strip_suffix(self.as_bytes()).map(|bytes| Bytes(bytes, PhantomData)) } } #[derive(Clone, Debug)] -pub struct StrSearcher<'hs, 'p> { - haystack: Bytes<'hs>, - state: StrSearcherInner<'p>, +pub struct StrSearcher<'hs, 'p, F> { + haystack: Bytes<'hs, F>, + inner: StrSearcherInner<'p>, } -impl<'hs, 'p> StrSearcher<'hs, 'p> { - pub fn new(haystack: Bytes<'hs>, needle: &'p str) -> Self { - let state = StrSearcherInner::new(haystack, needle); - Self { haystack, state } +impl<'hs, 'p, F: Flavour> StrSearcher<'hs, 'p, F> { + pub fn new(haystack: Bytes<'hs, F>, needle: &'p str) -> Self { + let inner = StrSearcherInner::new(haystack, needle); + Self { haystack, inner } } } -unsafe impl<'hs, 'p> Searcher> for StrSearcher<'hs, 'p> { - fn haystack(&self) -> Bytes<'hs> { +unsafe impl<'hs, 'p, F: Flavour> pattern::Searcher> for StrSearcher<'hs, 'p, F> { + fn haystack(&self) -> Bytes<'hs, F> { self.haystack } fn next(&mut self) -> SearchStep { - self.state.next_fwd(self.haystack) + self.inner.next_fwd(self.haystack) } fn next_match(&mut self) -> OptRange { - self.state.next_fwd::(self.haystack).0 + self.inner.next_fwd::(self.haystack).0 } fn next_reject(&mut self) -> OptRange { - self.state.next_fwd::(self.haystack).0 + self.inner.next_fwd::(self.haystack).0 } } -unsafe impl<'hs, 'p> pattern::ReverseSearcher> for StrSearcher<'hs, 'p> { +unsafe impl<'hs, 'p, F: Flavour> pattern::ReverseSearcher> + for StrSearcher<'hs, 'p, F> +{ fn next_back(&mut self) -> SearchStep { - self.state.next_bwd(self.haystack) + self.inner.next_bwd(self.haystack) } fn next_match_back(&mut self) -> OptRange { - self.state.next_bwd::(self.haystack).0 + self.inner.next_bwd::(self.haystack).0 } fn next_reject_back(&mut self) -> OptRange { - 
self.state.next_bwd::(self.haystack).0 + self.inner.next_bwd::(self.haystack).0 } } @@ -844,7 +1173,7 @@ enum StrSearcherInner<'p> { } impl<'p> StrSearcherInner<'p> { - fn new(haystack: Bytes<'_>, needle: &'p str) -> Self { + fn new(haystack: Bytes<'_, F>, needle: &'p str) -> Self { let mut chars = needle.chars(); let chr = match chars.next() { Some(chr) => chr, @@ -857,19 +1186,19 @@ impl<'p> StrSearcherInner<'p> { } } - fn next_fwd(&mut self, haystack: Bytes<'_>) -> R { + fn next_fwd(&mut self, haystack: Bytes<'_, F>) -> R { match self { - Self::Empty(state) => state.next_fwd::(haystack), - Self::Char(state) => state.next_fwd::(haystack), - Self::Str(state) => state.next_fwd::(haystack), + Self::Empty(state) => state.next_fwd::(haystack), + Self::Char(state) => state.next_fwd::(haystack), + Self::Str(state) => state.next_fwd::(haystack), } } - fn next_bwd(&mut self, haystack: Bytes<'_>) -> R { + fn next_bwd(&mut self, haystack: Bytes<'_, F>) -> R { match self { - Self::Empty(state) => state.next_bwd::(haystack), - Self::Char(state) => state.next_bwd::(haystack), - Self::Str(state) => state.next_bwd::(haystack), + Self::Empty(state) => state.next_bwd::(haystack), + Self::Char(state) => state.next_bwd::(haystack), + Self::Str(state) => state.next_bwd::(haystack), } } } @@ -886,15 +1215,15 @@ impl<'p> StrSearcherInner<'p> { struct EmptySearcherState(pattern::EmptyNeedleSearcher); impl EmptySearcherState { - fn new(haystack: Bytes<'_>) -> Self { + fn new(haystack: Bytes<'_, F>) -> Self { Self(pattern::EmptyNeedleSearcher::new(haystack)) } - fn next_fwd(&mut self, bytes: Bytes<'_>) -> R { + fn next_fwd(&mut self, bytes: Bytes<'_, F>) -> R { self.0.next_fwd(|range| bytes.advance_range_start(range)) } - fn next_bwd(&mut self, bytes: Bytes<'_>) -> R { + fn next_bwd(&mut self, bytes: Bytes<'_, F>) -> R { self.0.next_bwd(|range| bytes.advance_range_end(range)) } } @@ -911,12 +1240,12 @@ struct StrSearcherState<'p> { } impl<'p> StrSearcherState<'p> { - fn new(haystack: 
Bytes<'_>, needle: &'p str) -> Self { + fn new(haystack: Bytes<'_, F>, needle: &'p str) -> Self { let searcher = TwoWaySearcher::new(haystack.len(), needle.as_bytes()); Self { needle, searcher } } - fn next_fwd(&mut self, bytes: Bytes<'_>) -> R { + fn next_fwd(&mut self, bytes: Bytes<'_, F>) -> R { if self.searcher.position >= bytes.len() { return R::DONE; } @@ -928,7 +1257,7 @@ impl<'p> StrSearcherState<'p> { .adjust_reject_end_fwd(bytes, bytes.len(), &mut self.searcher.position) } - fn next_bwd(&mut self, bytes: Bytes<'_>) -> R { + fn next_bwd(&mut self, bytes: Bytes<'_, F>) -> R { if self.searcher.end == 0 { return R::DONE; } From 0d81d85942f00e76ff8800c2e77052176d5fb034 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 1 Mar 2023 02:12:42 +0100 Subject: [PATCH 09/12] sys: reduce visibility of some internal OsStr-related types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I’m honestly not entirely sure why this is needed, but if I try to edit the file in a subsequent commit without this change I’m getting ‘missing stability attribute’ errors: error: struct has missing stability attribute --> library/std/src/sys/unix/os_str.rs:22:1 | 22 | / pub struct Buf { 23 | | pub inner: Vec, 24 | | } | |_^ --- library/std/src/sys/unix/os_str.rs | 4 ++-- library/std/src/sys/windows/os_str.rs | 4 ++-- library/std/src/sys_common/wtf8.rs | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs index 488217f39413f..1710d42c7aa1e 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -19,12 +19,12 @@ mod tests; #[derive(Hash)] #[repr(transparent)] -pub struct Buf { +pub(crate) struct Buf { pub inner: Vec, } #[repr(transparent)] -pub struct Slice { +pub(crate) struct Slice { pub inner: [u8], } diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs index 
2f2b0e56e0889..84a7e56020ed3 100644 --- a/library/std/src/sys/windows/os_str.rs +++ b/library/std/src/sys/windows/os_str.rs @@ -10,7 +10,7 @@ use crate::sys_common::wtf8::{Wtf8, Wtf8Buf}; use crate::sys_common::{AsInner, FromInner, IntoInner}; #[derive(Clone, Hash)] -pub struct Buf { +pub(crate) struct Buf { pub inner: Wtf8Buf, } @@ -46,7 +46,7 @@ impl fmt::Display for Buf { } #[repr(transparent)] -pub struct Slice { +pub(crate) struct Slice { pub inner: Wtf8, } diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index ff96c35fb0ba6..2c878292c7fe6 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -42,7 +42,7 @@ const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}"; /// which represents a Unicode scalar value: /// a code point that is not a surrogate (U+D800 to U+DFFF). #[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)] -pub struct CodePoint { +pub(crate) struct CodePoint { value: u32, } @@ -133,7 +133,7 @@ impl CodePoint { /// Similar to `String`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] -pub struct Wtf8Buf { +pub(crate) struct Wtf8Buf { bytes: Vec, /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily @@ -496,7 +496,7 @@ impl Extend for Wtf8Buf { /// Similar to `&str`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. 
#[derive(Eq, Ord, PartialEq, PartialOrd)] -pub struct Wtf8 { +pub(crate) struct Wtf8 { bytes: [u8], } @@ -869,7 +869,7 @@ fn decode_surrogate_pair(lead: u16, trail: u16) -> char { /// Copied from core::str::StrPrelude::is_char_boundary #[inline] -pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { +pub(crate) fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { if index == slice.len() { return true; } @@ -881,14 +881,14 @@ pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { /// Copied from core::str::raw::slice_unchecked #[inline] -pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { +pub(crate) unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { // memory layout of a &[u8] and &Wtf8 are the same Wtf8::from_bytes_unchecked(slice::from_raw_parts(s.bytes.as_ptr().add(begin), end - begin)) } /// Copied from core::str::raw::slice_error_fail #[inline(never)] -pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { +pub(crate) fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { assert!(begin <= end); panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary"); } @@ -897,7 +897,7 @@ pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { /// /// Created with the method `.code_points()`. #[derive(Clone)] -pub struct Wtf8CodePoints<'a> { +pub(crate) struct Wtf8CodePoints<'a> { bytes: slice::Iter<'a, u8>, } From 06e6d69e2e1e086e667407d949bda3aafb1f052d Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Mon, 20 Feb 2023 07:32:02 +0100 Subject: [PATCH 10/12] std: add pattern matching to OsStr Implement Haystack for &OsStr and Pattern<&OsStr> for &str, char and Predicate. Furthermore, add prefix/suffix matching/stripping and splitting methods to the OsStr type to make use of those patterns. Using OsStr as a pattern is *not* implemented. Neither is indexing into OsStr. All matching and indexing has to be done via the provided functions. 
--- library/std/src/ffi/os_str.rs | 407 +++- library/std/src/lib.rs | 3 + library/std/src/sys/unix/os_str.rs | 22 + library/std/src/sys/windows/os_str.rs | 26 + library/std/src/sys_common/wtf8.rs | 26 + library/std/tests/os_str.rs | 2561 +++++++++++++++++++++++++ 6 files changed, 3044 insertions(+), 1 deletion(-) create mode 100644 library/std/tests/os_str.rs diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index 5c0541d3caf33..c5b60bdcb0963 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -7,11 +7,12 @@ use crate::collections::TryReserveError; use crate::fmt; use crate::hash::{Hash, Hasher}; use crate::ops; +use crate::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, SearchStep, Searcher}; use crate::rc::Rc; use crate::str::FromStr; use crate::sync::Arc; -use crate::sys::os_str::{Buf, Slice}; +use crate::sys::os_str::{Buf, BytesFlavour, Slice}; use crate::sys_common::{AsInner, FromInner, IntoInner}; /// A type that can represent owned, mutable platform-native strings, but is @@ -978,6 +979,167 @@ impl OsStr { pub fn eq_ignore_ascii_case>(&self, other: S) -> bool { self.inner.eq_ignore_ascii_case(&other.as_ref().inner) } + + /// Returns `true` if the given pattern matches a prefix of this `OsStr` + /// slice. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert!(OsStr::new("foo").starts_with('f')); + /// assert!(!OsStr::new("foo").starts_with('F')); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn starts_with<'a, P: Pattern<&'a OsStr>>(&'a self, pat: P) -> bool { + pat.is_prefix_of(self) + } + + /// Returns `true` if the given pattern matches a suffix of this `OsStr` + /// slice. 
+ /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert!(OsStr::new("foo").ends_with('o')); + /// assert!(!OsStr::new("foo").ends_with('O')); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn ends_with<'a, P>(&'a self, pat: P) -> bool + where + P: Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>, + { + pat.is_suffix_of(self) + } + + /// Returns a `OsStr` slice with the prefix removed. + /// + /// If the string starts with the pattern `prefix`, returns substring after + /// the prefix, wrapped in `Some`. If the string doesn’t start with + /// `prefix`, returns `None`. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert_eq!(Some(OsStr::new("foo")), OsStr::new("--foo").strip_prefix("--")); + /// assert_eq!(None, OsStr::new("-f").strip_prefix("--")); + /// ``` + #[must_use = "this returns the remaining substring as a new slice, \ + without modifying the original"] + #[unstable(feature = "pattern", issue = "27721")] + pub fn strip_prefix<'a, P>(&'a self, prefix: P) -> Option<&'a OsStr> + where + P: Pattern<&'a OsStr>, + { + prefix.strip_prefix_of(self) + } + + /// Returns a `OsStr` slice with the suffix removed. + /// + /// If the string ends with the pattern `suffix`, returns substring before + /// the suffix, wrapped in `Some`. If the string doesn’t end with `suffix`, + /// returns `None`. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert_eq!(Some(OsStr::new("fo")), OsStr::new("foo").strip_suffix('o')); + /// assert_eq!(None, OsStr::new("foo").strip_suffix('O')); + /// ``` + #[must_use = "this returns the remaining substring as a new slice, \ + without modifying the original"] + #[unstable(feature = "pattern", issue = "27721")] + pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a OsStr> + where + P: Pattern<&'a OsStr>, +

>::Searcher: ReverseSearcher<&'a OsStr>, + { + suffix.strip_suffix_of(self) + } + + /// Splits the string on the first occurrence of the specified delimiter and + /// returns prefix before delimiter and suffix after delimiter. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert_eq!(Some((OsStr::new("foo"), OsStr::new("bar=baz"))), + /// OsStr::new("foo=bar=baz").split_once('=')); + /// assert_eq!(None, OsStr::new("foobar").split_once(',')); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn split_once<'a, P: Pattern<&'a OsStr>>(&'a self, delimiter: P) -> Option<(&Self, &Self)> { + let (start, end) = delimiter.into_searcher(self).next_match()?; + eprintln!("{:?} {} {}", self, start, end); + Some(unsafe { + let head = self.bytes().get_unchecked(..start); + let tail = self.bytes().get_unchecked(end..); + let head = OsStr::from_inner(core::mem::transmute(head)); + let tail = OsStr::from_inner(core::mem::transmute(tail)); + (head, tail) + }) + } + + /// Splits the string on the last occurrence of the specified delimiter and + /// returns prefix before delimiter and suffix after delimiter. 
+ /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert_eq!(Some((OsStr::new("foo=bar"), OsStr::new("baz"))), + /// OsStr::new("foo=bar=baz").rsplit_once('=')); + /// assert_eq!(None, OsStr::new("foobar").rsplit_once(',')); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn rsplit_once<'a, P: Pattern<&'a OsStr>>(&'a self, delimiter: P) -> Option<(&Self, &Self)> + where + P: Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>, + { + let (start, end) = delimiter.into_searcher(self).next_match_back()?; + Some(unsafe { + let head = self.bytes().get_unchecked(..start); + let tail = self.bytes().get_unchecked(end..); + let head = OsStr::from_inner(core::mem::transmute(head)); + let tail = OsStr::from_inner(core::mem::transmute(tail)); + (head, tail) + }) + } + + /// An iterator over substrings of this string slice, separated by + /// characters matched by a pattern. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// let value = OsStr::new("foo,bar,baz"); + /// let got = value.split(',').collect::>(); + /// let want = [OsStr::new("foo"), OsStr::new("bar"), OsStr::new("baz")]; + /// assert_eq!(&want[..], &got[..]); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn split<'hs, P: Pattern<&'hs OsStr>>(&'hs self, delimiter: P) -> Split<'hs, P::Searcher> { + Split(core::pattern::Split::new(delimiter.into_searcher(self)).with_allow_trailing_empty()) + } } #[stable(feature = "box_from_os_str", since = "1.17.0")] @@ -1445,3 +1607,246 @@ impl<'a> FromIterator> for OsString { } } } + +#[unstable(feature = "str_internals", issue = "none")] +impl<'a> From<&'a OsStr> for core::str_bytes::Bytes<'a, BytesFlavour> { + fn from(val: &'a OsStr) -> Self { + val.inner.into() + } +} + +#[unstable(feature = "str_internals", issue = "none")] +impl<'a> From> for &'a OsStr { + fn from(val: core::str_bytes::Bytes<'a, BytesFlavour>) -> Self { 
+ OsStr::from_inner(<&Slice>::from(val)) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs> core::pattern::Haystack for &'hs OsStr { + type Cursor = usize; + + #[inline(always)] + fn cursor_at_front(self) -> usize { + 0 + } + #[inline(always)] + fn cursor_at_back(self) -> usize { + self.inner.inner.len() + } + + #[inline(always)] + fn is_empty(self) -> bool { + self.inner.inner.is_empty() + } + + #[inline(always)] + unsafe fn get_unchecked(self, range: core::ops::Range) -> Self { + // SAFETY: Caller promises that `range` is valid. + OsStr::from_inner(unsafe { self.inner.get_unchecked(range) }) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs> core::pattern::Pattern<&'hs OsStr> for char { + type Searcher = CharSearcher<'hs>; + + fn into_searcher(self, haystack: &'hs OsStr) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_contained_in(self, haystack: &'hs OsStr) -> bool { + self.is_contained_in(core::str_bytes::Bytes::from(haystack)) + } + + fn is_prefix_of(self, haystack: &'hs OsStr) -> bool { + self.is_prefix_of(core::str_bytes::Bytes::from(haystack)) + } + + fn is_suffix_of(self, haystack: &'hs OsStr) -> bool { + self.is_suffix_of(core::str_bytes::Bytes::from(haystack)) + } + + /// Removes the pattern from the front of haystack, if it matches. + fn strip_prefix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> { + self.strip_prefix_of(core::str_bytes::Bytes::from(haystack)).map(|bytes| bytes.into()) + } + + /// Removes the pattern from the back of haystack, if it matches. 
+ fn strip_suffix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> + where + Self::Searcher: ReverseSearcher<&'hs OsStr>, + { + self.strip_suffix_of(core::str_bytes::Bytes::from(haystack)).map(|bytes| bytes.into()) + } +} + +#[derive(Clone, Debug)] +#[unstable(feature = "pattern", issue = "27721")] +pub struct CharSearcher<'hs>(core::str_bytes::CharSearcher<'hs, BytesFlavour>); + +impl<'hs> CharSearcher<'hs> { + fn new(haystack: &'hs OsStr, needle: char) -> CharSearcher<'hs> { + Self(core::str_bytes::CharSearcher::new(haystack.into(), needle)) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs> Searcher<&'hs OsStr> for CharSearcher<'hs> { + #[inline(always)] + fn haystack(&self) -> &'hs OsStr { + self.0.haystack().into() + } + + #[inline(always)] + fn next(&mut self) -> SearchStep { + self.0.next() + } + #[inline(always)] + fn next_match(&mut self) -> Option<(usize, usize)> { + self.0.next_match() + } + #[inline(always)] + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs> ReverseSearcher<&'hs OsStr> for CharSearcher<'hs> { + #[inline(always)] + fn next_back(&mut self) -> SearchStep { + self.0.next_back() + } + #[inline(always)] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.0.next_match_back() + } + #[inline(always)] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs> DoubleEndedSearcher<&'hs OsStr> for CharSearcher<'hs> {} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, 'p> core::pattern::Pattern<&'hs OsStr> for &'p str { + type Searcher = StrSearcher<'hs, 'p>; + + fn into_searcher(self, haystack: &'hs OsStr) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_contained_in(self, haystack: &'hs OsStr) -> bool { + 
self.is_contained_in(core::str_bytes::Bytes::from(haystack)) + } + + fn is_prefix_of(self, haystack: &'hs OsStr) -> bool { + self.is_prefix_of(core::str_bytes::Bytes::from(haystack)) + } + + fn is_suffix_of(self, haystack: &'hs OsStr) -> bool { + self.is_suffix_of(core::str_bytes::Bytes::from(haystack)) + } + + /// Removes the pattern from the front of haystack, if it matches. + fn strip_prefix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> { + self.strip_prefix_of(core::str_bytes::Bytes::from(haystack)) + .map(|bytes| OsStr::from_inner(unsafe { core::mem::transmute(bytes.as_bytes()) })) + } + + /// Removes the pattern from the back of haystack, if it matches. + fn strip_suffix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> + where + Self::Searcher: ReverseSearcher<&'hs OsStr>, + { + self.strip_suffix_of(core::str_bytes::Bytes::from(haystack)) + .map(|bytes| OsStr::from_inner(unsafe { core::mem::transmute(bytes.as_bytes()) })) + } +} + +#[derive(Clone, Debug)] +#[unstable(feature = "pattern", issue = "27721")] +pub struct StrSearcher<'hs, 'p>(core::str_bytes::StrSearcher<'hs, 'p, BytesFlavour>); + +impl<'hs, 'p> StrSearcher<'hs, 'p> { + fn new(haystack: &'hs OsStr, needle: &'p str) -> StrSearcher<'hs, 'p> { + let haystack = core::str_bytes::Bytes::from(haystack); + Self(core::str_bytes::StrSearcher::new(haystack, needle)) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs, 'p> Searcher<&'hs OsStr> for StrSearcher<'hs, 'p> { + #[inline(always)] + fn haystack(&self) -> &'hs OsStr { + self.0.haystack().into() + } + + #[inline(always)] + fn next(&mut self) -> SearchStep { + self.0.next() + } + #[inline(always)] + fn next_match(&mut self) -> Option<(usize, usize)> { + self.0.next_match() + } + #[inline(always)] + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs, 'p> ReverseSearcher<&'hs OsStr> for StrSearcher<'hs, 'p> { + 
#[inline(always)] + fn next_back(&mut self) -> SearchStep { + self.0.next_back() + } + #[inline(always)] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.0.next_match_back() + } + #[inline(always)] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +pub struct Split<'hs, S: Searcher<&'hs OsStr>>(core::pattern::Split<&'hs OsStr, S>); + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: Searcher<&'hs OsStr> + Clone> Clone for Split<'hs, S> { + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: Searcher<&'hs OsStr> + fmt::Debug> fmt::Debug for Split<'hs, S> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(fmt) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: Searcher<&'hs OsStr>> Iterator for Split<'hs, S> { + type Item = &'hs OsStr; + + fn next(&mut self) -> Option { + self.0.next_fwd::() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: DoubleEndedSearcher<&'hs OsStr>> DoubleEndedIterator for Split<'hs, S> { + fn next_back(&mut self) -> Option { + self.0.next_bwd::() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: Searcher<&'hs OsStr>> core::iter::FusedIterator for Split<'hs, S> {} diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 5bbdc1e0d984c..db318a50193b4 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -240,6 +240,7 @@ #![feature(allocator_internals)] #![feature(allow_internal_unsafe)] #![feature(allow_internal_unstable)] +#![feature(associated_type_bounds)] #![feature(c_unwind)] #![feature(cfg_target_thread_local)] #![feature(concat_idents)] @@ -298,6 +299,8 @@ #![feature(panic_can_unwind)] #![feature(panic_info_message)] #![feature(panic_internals)] +#![feature(pattern)] +#![feature(pattern_internals)] 
#![feature(pointer_byte_offsets)] #![feature(pointer_is_aligned)] #![feature(portable_simd)] diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs index 1710d42c7aa1e..69184ce445a19 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -28,6 +28,22 @@ pub(crate) struct Slice { pub inner: [u8], } +pub(crate) type BytesFlavour = core::str_bytes::Unstructured; + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<&'a Slice> for core::str_bytes::Bytes<'a, BytesFlavour> { + fn from(slice: &'a Slice) -> Self { + (&slice.inner).into() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<core::str_bytes::Bytes<'a, BytesFlavour>> for &'a Slice { + fn from(bytes: core::str_bytes::Bytes<'a, BytesFlavour>) -> Self { + Slice::from_u8_slice(bytes.as_bytes()) + } +} + impl fmt::Debug for Slice { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f) @@ -270,4 +286,10 @@ impl Slice { pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { self.inner.eq_ignore_ascii_case(&other.inner) } + + #[inline] + pub(crate) unsafe fn get_unchecked(&self, range: core::ops::Range<usize>) -> &Self { + // SAFETY: Caller promises `range` is valid. 
+ Self::from_u8_slice(unsafe { self.inner.get_unchecked(range) }) + } } diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs index 84a7e56020ed3..55c41195b9ef5 100644 --- a/library/std/src/sys/windows/os_str.rs +++ b/library/std/src/sys/windows/os_str.rs @@ -50,6 +50,24 @@ pub(crate) struct Slice { pub inner: Wtf8, } +pub(crate) type BytesFlavour = core::str_bytes::Wtf8; + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<&'a Slice> for core::str_bytes::Bytes<'a, BytesFlavour> { + fn from(slice: &'a Slice) -> Self { + (&slice.inner).into() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<core::str_bytes::Bytes<'a, BytesFlavour>> for &'a Slice { + fn from(bytes: core::str_bytes::Bytes<'a, BytesFlavour>) -> &'a Slice { + let inner = <&Wtf8>::from(bytes); + // SAFETY: `Slice` is a transparent wrapper around `Wtf8`. + unsafe { mem::transmute(inner) } + } +} + impl fmt::Debug for Slice { fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&self.inner, formatter) @@ -222,4 +240,12 @@ impl Slice { pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { self.inner.eq_ignore_ascii_case(&other.inner) } + + #[inline] + pub(crate) unsafe fn get_unchecked(&self, range: core::ops::Range<usize>) -> &Self { + // SAFETY: Caller promises `range` is valid. + let inner = unsafe { self.inner.get_unchecked(range) }; + // SAFETY: We’re just a transparent wrapper around `Wtf8`. + unsafe { mem::transmute(inner) } + } } diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index 2c878292c7fe6..4649fa221286d 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -496,6 +496,7 @@ impl Extend for Wtf8Buf { /// Similar to `&str`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. 
#[derive(Eq, Ord, PartialEq, PartialOrd)] +#[repr(transparent)] pub(crate) struct Wtf8 { bytes: [u8], } @@ -781,6 +782,14 @@ impl Wtf8 { pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { self.bytes.eq_ignore_ascii_case(&other.bytes) } + + #[inline] + pub(crate) unsafe fn get_unchecked(&self, range: core::ops::Range<usize>) -> &Self { + // SAFETY: Caller promises `range` is valid. + let bytes = unsafe { self.bytes.get_unchecked(range) }; + // SAFETY: We’re just a transparent wrapper around [u8]. + unsafe { mem::transmute(bytes) } + } } /// Returns a slice of the given string for the byte range \[`begin`..`end`). @@ -984,3 +993,20 @@ impl Hash for Wtf8 { 0xfeu8.hash(state) } } + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<&'a Wtf8> for core::str_bytes::Bytes<'a, core::str_bytes::Wtf8> { + fn from(wtf8: &'a Wtf8) -> Self { + // SAFETY: As name implies, `Wtf8`’s bytes are guaranteed to be WTF-8 + // so `Wtf8` flavour is correct. + unsafe { core::str_bytes::Bytes::new(&wtf8.bytes) } + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<core::str_bytes::Bytes<'a, core::str_bytes::Wtf8>> for &'a Wtf8 { + fn from(bytes: core::str_bytes::Bytes<'a, core::str_bytes::Wtf8>) -> Self { + // SAFETY: Bytes<'_, Wtf8> are guaranteed to be well-formed WTF-8. + unsafe { Wtf8::from_bytes_unchecked(bytes.as_bytes()) } + } +} diff --git a/library/std/tests/os_str.rs b/library/std/tests/os_str.rs new file mode 100644 index 0000000000000..fea1c0e2275c8 --- /dev/null +++ b/library/std/tests/os_str.rs @@ -0,0 +1,2561 @@ +#![feature(associated_type_bounds, pattern)] + +use core::pattern::{Pattern, Searcher, ReverseSearcher}; +use std::borrow::Cow; +use std::ffi::{OsStr, OsString}; + +//////////////////////////////////////////////////////////////////////////////// +// Helper functions for creating OsStr and OsString + +/// Cast `str` into `OsStr`. This is a trivial convenience function. 
+fn os(value: &str) -> &OsStr { + OsStr::new(value) +} + +/// Constructs an OsString with potentially invalid UTF-8. +/// +/// If `valid` is `false`, some characters are replaced by invalid sequences +/// (see `map_invalid`) resulting in returned OsString not being a valid String. +fn make_os_string(value: &str, valid: bool) -> OsString { + if valid { + OsString::from(value) + } else { + make_invalid_os_string(value) + } +} + +fn map_invalid(chr: char) -> Result<char, u8> { + match chr { + 'ą' => Err(0xB1), + 'ä' => Err(0xE4), + 'ă' => Err(0xE3), + 'ó' => Err(0xF3), + chr => Ok(chr), + } +} + +#[cfg(unix)] +fn make_invalid_os_string(value: &str) -> OsString { + use std::os::unix::ffi::OsStringExt; + + let mut vec = Vec::with_capacity(value.len()); + let mut buf = [0; 4]; + for chr in value.chars() { + match map_invalid(chr) { + Ok(chr) => vec.extend_from_slice(chr.encode_utf8(&mut buf).as_bytes()), + Err(byte) => vec.push(byte) + } + } + OsString::from_vec(vec) +} + +#[cfg(windows)] +fn make_invalid_os_string(value: &str) -> OsString { + use std::os::windows::ffi::OsStringExt; + + let mut vec = Vec::with_capacity(value.len()); + let mut buf = [0; 2]; + for chr in value.chars() { + match map_invalid(chr) { + Ok(chr) => vec.extend_from_slice(chr.encode_utf16(&mut buf)), + Err(byte) => vec.push(0xD800 | byte as u16), + } + } + OsStringExt::from_wide(&vec) +} + +//////////////////////////////////////////////////////////////////////////////// +// Test of features demonstrating command-line argument parsing + +fn do_test_long_flag(valid: bool) { + let os = |value| { make_os_string(value, valid) }; + + // strip_prefix("--") and strip_prefix('-') can be used to check if it’s + // a flag argument or not. 
+ let arg = os("--flăg=fóó,bąr,bäz"); + assert_eq!(Some(&*os("-flăg=fóó,bąr,bäz")), arg.strip_prefix('-')); + assert_eq!(Some(&*os("-flăg=fóó,bąr,bäz")), arg.strip_prefix("-")); + assert_eq!(Some(&*os("flăg=fóó,bąr,bäz")), arg.strip_prefix("--")); + + // split_once('=') separates long flag name from its value. If + // split_once returns None, there’s no value with the flag. + let arg = os("flăg=fóó,bąr,bäz"); + assert_eq!( + Some((&*os("flăg"), &*os("fóó,bąr,bäz"))), + arg.split_once('=') + ); + assert_eq!(None, os("flăg").split_once('=')); + + // split(',') separates values in flag whose values are comma separated. + let arg = os("fóó,bąr,bäz"); + let values = arg.split(',').collect::<Vec<_>>(); + assert_eq!(&[os("fóó"), os("bąr"), os("bäz")][..], values.as_slice()) +} + +#[test] +fn test_long_flag() { + do_test_long_flag(true) +} + +#[test] +fn test_long_flag_non_utf8() { + do_test_long_flag(false) +} + +fn do_test_short_flag(valid: bool) { + let os = |value| { make_os_string(value, valid) }; + + // strip_prefix("--") and strip_prefix('-') can be used to check if it’s + // a flag argument or not. + let arg = os("-shórt"); + assert_eq!(Some(&*os("shórt")), arg.strip_prefix('-')); + assert_eq!(Some(&*os("shórt")), arg.strip_prefix("-")); + assert_eq!(None, arg.strip_prefix("--")); +} + +#[test] +fn test_short_flag() { + do_test_short_flag(true) +} + +#[test] +fn test_short_flag_non_utf8() { + do_test_short_flag(false) +} + +//////////////////////////////////////////////////////////////////////////////// +// Test adapted from library/alloc/tests/str.rs + +// We currently don’t offer the full set of pattern-matching methods on OsStr which +// are available on str. At least some of them can be implemented using Pattern +// API so do that for the sake of testing. 
+ +fn find<'a>(haystack: &'a str, pat: impl Pattern<&'a OsStr>) -> Option<usize> { + pat.into_searcher(os(haystack)).next_match().map(|(i, _)| i) +} + +fn rfind<'a, P>(haystack: &'a str, pat: P) -> Option<usize> +where + P: Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>, +{ + pat.into_searcher(os(haystack)).next_match_back().map(|(i, _)| i) +} + +pub fn contains<'a>(haystack: &'a str, pat: impl Pattern<&'a OsStr>) -> bool { + pat.is_contained_in(os(haystack)) +} + + +#[test] +fn test_le() { + assert!(os("") <= ""); + assert!(os("") <= "foo"); + assert!(os("foo") <= "foo"); + assert_ne!(os("foo"), "bar"); +} + +#[test] +fn test_find() { + assert_eq!(find("hello", 'l'), Some(2)); + assert!(find("hello", 'x').is_none()); + assert_eq!(find("ประเทศไทย中华Việt Nam", '华'), Some(30)); +} + +#[test] +fn test_rfind() { + assert_eq!(rfind("hello", 'l'), Some(3)); + assert!(rfind("hello", 'x').is_none()); + assert_eq!(rfind("ประเทศไทย中华Việt Nam", '华'), Some(30)); +} + +/* +#[test] +fn test_collect() { + let empty = ""; + let s: String = empty.chars().collect(); + assert_eq!(empty, s); + let data = "ประเทศไทย中"; + let s: String = data.chars().collect(); + assert_eq!(data, s); +} +*/ + +#[test] +fn test_find_str() { + // byte positions + assert_eq!(find("", ""), Some(0)); + assert!(find("banana", "apple pie").is_none()); + + assert_eq!(find("abcabc", "ab"), Some(0)); + assert_eq!(find("cabc", "ab"), Some(1)); + assert!(find("ca", "ab").is_none()); + + let string = "ประเทศไทย中华Việt Nam"; + let mut data = String::from(string); + data.push_str(string); + assert!(find(&data, "ไท华").is_none()); + assert_eq!(find(&data[0..43], ""), Some(0)); + assert_eq!(find(&data[6..43], ""), Some(6 - 6)); + + assert_eq!(find(&data[0..43], "ประ"), Some(0)); + assert_eq!(find(&data[0..43], "ทศไ"), Some(12)); + assert_eq!(find(&data[0..43], "ย中"), Some(24)); + assert_eq!(find(&data[0..43], "iệt"), Some(34)); + assert_eq!(find(&data[0..43], "Nam"), Some(40)); + + assert_eq!(find(&data[43..86], "ประ"), 
Some(43 - 43)); + assert_eq!(find(&data[43..86], "ทศไ"), Some(55 - 43)); + assert_eq!(find(&data[43..86], "ย中"), Some(67 - 43)); + assert_eq!(find(&data[43..86], "iệt"), Some(77 - 43)); + assert_eq!(find(&data[43..86], "Nam"), Some(83 - 43)); + + // find every substring -- assert that it finds it, or an earlier occurrence. + let string = "Việt Namacbaabcaabaaba"; + for (i, ci) in string.char_indices() { + let ip = i + ci.len_utf8(); + for j in string[ip..].char_indices().map(|(i, _)| i).chain(Some(string.len() - ip)) { + let pat = &string[i..ip + j]; + assert!(match find(&string, pat) { + None => false, + Some(x) => x <= i, + }); + assert!(match rfind(&string, pat) { + None => false, + Some(x) => x >= i, + }); + } + } +} + +/* +fn s(x: &str) -> String { + x.to_string() +} + +macro_rules! test_concat { + ($expected: expr, $string: expr) => {{ + let s: String = $string.concat(); + assert_eq!($expected, s); + }}; +} + +#[test] +fn test_concat_for_different_types() { + test_concat!("ab", vec![s("a"), s("b")]); + test_concat!("ab", vec!["a", "b"]); +} + +#[test] +fn test_concat_for_different_lengths() { + let empty: &[&str] = &[]; + test_concat!("", empty); + test_concat!("a", ["a"]); + test_concat!("ab", ["a", "b"]); + test_concat!("abc", ["", "a", "bc"]); +} + */ + +/* +macro_rules! 
test_join { + ($expected: expr, $string: expr, $delim: expr) => {{ + let s = $string.join($delim); + assert_eq!($expected, s); + }}; +} + +#[test] +fn test_join_for_different_types() { + test_join!("a-b", ["a", "b"], "-"); + let hyphen = "-".to_string(); + test_join!("a-b", [s("a"), s("b")], &*hyphen); + test_join!("a-b", vec!["a", "b"], &*hyphen); + test_join!("a-b", &*vec!["a", "b"], "-"); + test_join!("a-b", vec![s("a"), s("b")], "-"); +} + +#[test] +fn test_join_for_different_lengths() { + let empty: &[&str] = &[]; + test_join!("", empty, "-"); + test_join!("a", ["a"], "-"); + test_join!("a-b", ["a", "b"], "-"); + test_join!("-a-bc", ["", "a", "bc"], "-"); +} + +// join has fast paths for small separators up to 4 bytes +// this tests the slow paths. +#[test] +fn test_join_for_different_lengths_with_long_separator() { + assert_eq!("~~~~~".len(), 15); + + let empty: &[&str] = &[]; + test_join!("", empty, "~~~~~"); + test_join!("a", ["a"], "~~~~~"); + test_join!("a~~~~~b", ["a", "b"], "~~~~~"); + test_join!("~~~~~a~~~~~bc", ["", "a", "bc"], "~~~~~"); +} + +#[test] +fn test_join_issue_80335() { + use core::{borrow::Borrow, cell::Cell}; + + struct WeirdBorrow { + state: Cell, + } + + impl Default for WeirdBorrow { + fn default() -> Self { + WeirdBorrow { state: Cell::new(false) } + } + } + + impl Borrow for WeirdBorrow { + fn borrow(&self) -> &str { + let state = self.state.get(); + if state { + "0" + } else { + self.state.set(true); + "123456" + } + } + } + + let arr: [WeirdBorrow; 3] = Default::default(); + test_join!("0-0-0", arr, "-"); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn test_unsafe_slice() { + assert_eq!("ab", unsafe { "abc".get_unchecked(0..2) }); + assert_eq!("bc", unsafe { "abc".get_unchecked(1..3) }); + assert_eq!("", unsafe { "abc".get_unchecked(1..1) }); + fn a_million_letter_a() -> String { + let mut i = 0; + let mut rs = String::new(); + while i < 100000 { + rs.push_str("aaaaaaaaaa"); + i += 1; + } + rs + } + fn 
half_a_million_letter_a() -> String { + let mut i = 0; + let mut rs = String::new(); + while i < 100000 { + rs.push_str("aaaaa"); + i += 1; + } + rs + } + let letters = a_million_letter_a(); + assert_eq!(half_a_million_letter_a(), unsafe { letters.get_unchecked(0..500000) }); +} +*/ + +#[test] +fn test_starts_with() { + assert!(os("").starts_with("")); + assert!(os("abc").starts_with("")); + assert!(os("abc").starts_with("a")); + assert!(!os("a").starts_with("abc")); + assert!(!os("").starts_with("abc")); + assert!(!os("ödd").starts_with("-")); + assert!(os("ödd").starts_with("öd")); +} + +#[test] +fn test_ends_with() { + assert!(os("").ends_with("")); + assert!(os("abc").ends_with("")); + assert!(os("abc").ends_with("c")); + assert!(!os("a").ends_with("abc")); + assert!(!os("").ends_with("abc")); + assert!(!os("ddö").ends_with("-")); + assert!(os("ddö").ends_with("dö")); +} + +#[test] +fn test_is_empty() { + assert!(os("").is_empty()); + assert!(!os("a").is_empty()); +} + +/* +#[test] +fn test_replacen() { + assert_eq!("".replacen('a', "b", 5), ""); + assert_eq!("acaaa".replacen("a", "b", 3), "bcbba"); + assert_eq!("aaaa".replacen("a", "b", 0), "aaaa"); + + let test = "test"; + assert_eq!(" test test ".replacen(test, "toast", 3), " toast toast "); + assert_eq!(" test test ".replacen(test, "toast", 0), " test test "); + assert_eq!(" test test ".replacen(test, "", 5), " "); + + assert_eq!("qwer123zxc789".replacen(char::is_numeric, "", 3), "qwerzxc789"); +} + +#[test] +fn test_replace() { + let a = "a"; + assert_eq!("".replace(a, "b"), ""); + assert_eq!("a".replace(a, "b"), "b"); + assert_eq!("ab".replace(a, "b"), "bb"); + let test = "test"; + assert_eq!(" test test ".replace(test, "toast"), " toast toast "); + assert_eq!(" test test ".replace(test, ""), " "); +} + +#[test] +fn test_replace_2a() { + let data = "ประเทศไทย中华"; + let repl = "دولة الكويت"; + + let a = "ประเ"; + let a2 = "دولة الكويتทศไทย中华"; + assert_eq!(data.replace(a, repl), a2); +} + +#[test] +fn 
test_replace_2b() { + let data = "ประเทศไทย中华"; + let repl = "دولة الكويت"; + + let b = "ะเ"; + let b2 = "ปรدولة الكويتทศไทย中华"; + assert_eq!(data.replace(b, repl), b2); +} + +#[test] +fn test_replace_2c() { + let data = "ประเทศไทย中华"; + let repl = "دولة الكويت"; + + let c = "中华"; + let c2 = "ประเทศไทยدولة الكويت"; + assert_eq!(data.replace(c, repl), c2); +} + +#[test] +fn test_replace_2d() { + let data = "ประเทศไทย中华"; + let repl = "دولة الكويت"; + + let d = "ไท华"; + assert_eq!(data.replace(d, repl), data); +} + +#[test] +fn test_replace_pattern() { + let data = "abcdαβγδabcdαβγδ"; + assert_eq!(data.replace("dαβ", "😺😺😺"), "abc😺😺😺γδabc😺😺😺γδ"); + assert_eq!(data.replace('γ', "😺😺😺"), "abcdαβ😺😺😺δabcdαβ😺😺😺δ"); + assert_eq!(data.replace(&['a', 'γ'] as &[_], "😺😺😺"), "😺😺😺bcdαβ😺😺😺δ😺😺😺bcdαβ😺😺😺δ"); + assert_eq!(data.replace(|c| c == 'γ', "😺😺😺"), "abcdαβ😺😺😺δabcdαβ😺😺😺δ"); +} + +// The current implementation of SliceIndex fails to handle methods +// orthogonally from range types; therefore, it is worth testing +// all of the indexing operations on each input. +mod slice_index { + // Test a slicing operation **that should succeed,** + // testing it on all of the indexing methods. + // + // This is not suitable for testing failure on invalid inputs. + macro_rules! 
assert_range_eq { + ($s:expr, $range:expr, $expected:expr) => { + let mut s: String = $s.to_owned(); + let mut expected: String = $expected.to_owned(); + { + let s: &str = &s; + let expected: &str = &expected; + + assert_eq!(&s[$range], expected, "(in assertion for: index)"); + assert_eq!(s.get($range), Some(expected), "(in assertion for: get)"); + unsafe { + assert_eq!( + s.get_unchecked($range), + expected, + "(in assertion for: get_unchecked)", + ); + } + } + { + let s: &mut str = &mut s; + let expected: &mut str = &mut expected; + + assert_eq!(&mut s[$range], expected, "(in assertion for: index_mut)",); + assert_eq!( + s.get_mut($range), + Some(&mut expected[..]), + "(in assertion for: get_mut)", + ); + unsafe { + assert_eq!( + s.get_unchecked_mut($range), + expected, + "(in assertion for: get_unchecked_mut)", + ); + } + } + }; + } + + // Make sure the macro can actually detect bugs, + // because if it can't, then what are we even doing here? + // + // (Be aware this only demonstrates the ability to detect bugs + // in the FIRST method that panics, as the macro is not designed + // to be used in `should_panic`) + #[test] + #[should_panic(expected = "out of bounds")] + fn assert_range_eq_can_fail_by_panic() { + assert_range_eq!("abc", 0..5, "abc"); + } + + // (Be aware this only demonstrates the ability to detect bugs + // in the FIRST method it calls, as the macro is not designed + // to be used in `should_panic`) + #[test] + #[should_panic(expected = "==")] + fn assert_range_eq_can_fail_by_inequality() { + assert_range_eq!("abc", 0..2, "abc"); + } + + // Generates test cases for bad index operations. + // + // This generates `should_panic` test cases for Index/IndexMut + // and `None` test cases for get/get_mut. + macro_rules! panic_cases { + ($( + in mod $case_name:ident { + data: $data:expr; + + // optional: + // + // a similar input for which DATA[input] succeeds, and the corresponding + // output str. 
This helps validate "critical points" where an input range + // straddles the boundary between valid and invalid. + // (such as the input `len..len`, which is just barely valid) + $( + good: data[$good:expr] == $output:expr; + )* + + bad: data[$bad:expr]; + message: $expect_msg:expr; // must be a literal + } + )*) => {$( + mod $case_name { + #[test] + fn pass() { + let mut v: String = $data.into(); + + $( assert_range_eq!(v, $good, $output); )* + + { + let v: &str = &v; + assert_eq!(v.get($bad), None, "(in None assertion for get)"); + } + + { + let v: &mut str = &mut v; + assert_eq!(v.get_mut($bad), None, "(in None assertion for get_mut)"); + } + } + + #[test] + #[should_panic(expected = $expect_msg)] + fn index_fail() { + let v: String = $data.into(); + let v: &str = &v; + let _v = &v[$bad]; + } + + #[test] + #[should_panic(expected = $expect_msg)] + fn index_mut_fail() { + let mut v: String = $data.into(); + let v: &mut str = &mut v; + let _v = &mut v[$bad]; + } + } + )*}; + } + + #[test] + fn simple_ascii() { + assert_range_eq!("abc", .., "abc"); + + assert_range_eq!("abc", 0..2, "ab"); + assert_range_eq!("abc", 0..=1, "ab"); + assert_range_eq!("abc", ..2, "ab"); + assert_range_eq!("abc", ..=1, "ab"); + + assert_range_eq!("abc", 1..3, "bc"); + assert_range_eq!("abc", 1..=2, "bc"); + assert_range_eq!("abc", 1..1, ""); + assert_range_eq!("abc", 1..=0, ""); + } + + #[test] + fn simple_unicode() { + // 日本 + assert_range_eq!("\u{65e5}\u{672c}", .., "\u{65e5}\u{672c}"); + + assert_range_eq!("\u{65e5}\u{672c}", 0..3, "\u{65e5}"); + assert_range_eq!("\u{65e5}\u{672c}", 0..=2, "\u{65e5}"); + assert_range_eq!("\u{65e5}\u{672c}", ..3, "\u{65e5}"); + assert_range_eq!("\u{65e5}\u{672c}", ..=2, "\u{65e5}"); + + assert_range_eq!("\u{65e5}\u{672c}", 3..6, "\u{672c}"); + assert_range_eq!("\u{65e5}\u{672c}", 3..=5, "\u{672c}"); + assert_range_eq!("\u{65e5}\u{672c}", 3.., "\u{672c}"); + + let data = "ประเทศไทย中华"; + assert_range_eq!(data, 0..3, "ป"); + assert_range_eq!(data, 3..6, 
"ร"); + assert_range_eq!(data, 3..3, ""); + assert_range_eq!(data, 30..33, "华"); + + /*0: 中 + 3: 华 + 6: V + 7: i + 8: ệ + 11: t + 12: + 13: N + 14: a + 15: m */ + let ss = "中华Việt Nam"; + assert_range_eq!(ss, 3..6, "华"); + assert_range_eq!(ss, 6..16, "Việt Nam"); + assert_range_eq!(ss, 6..=15, "Việt Nam"); + assert_range_eq!(ss, 6.., "Việt Nam"); + + assert_range_eq!(ss, 0..3, "中"); + assert_range_eq!(ss, 3..7, "华V"); + assert_range_eq!(ss, 3..=6, "华V"); + assert_range_eq!(ss, 3..3, ""); + assert_range_eq!(ss, 3..=2, ""); + } + + #[test] + #[cfg_attr(target_os = "emscripten", ignore)] // hits an OOM + #[cfg_attr(miri, ignore)] // Miri is too slow + fn simple_big() { + fn a_million_letter_x() -> String { + let mut i = 0; + let mut rs = String::new(); + while i < 100000 { + rs.push_str("华华华华华华华华华华"); + i += 1; + } + rs + } + fn half_a_million_letter_x() -> String { + let mut i = 0; + let mut rs = String::new(); + while i < 100000 { + rs.push_str("华华华华华"); + i += 1; + } + rs + } + let letters = a_million_letter_x(); + assert_range_eq!(letters, 0..3 * 500000, half_a_million_letter_x()); + } + + #[test] + #[should_panic] + fn test_slice_fail() { + let _ = &"中华Việt Nam"[0..2]; + } + + panic_cases! { + in mod rangefrom_len { + data: "abcdef"; + good: data[6..] 
== ""; + bad: data[7..]; + message: "out of bounds"; + } + + in mod rangeto_len { + data: "abcdef"; + good: data[..6] == "abcdef"; + bad: data[..7]; + message: "out of bounds"; + } + + in mod rangetoinclusive_len { + data: "abcdef"; + good: data[..=5] == "abcdef"; + bad: data[..=6]; + message: "out of bounds"; + } + + in mod rangeinclusive_len { + data: "abcdef"; + good: data[0..=5] == "abcdef"; + bad: data[0..=6]; + message: "out of bounds"; + } + + in mod range_len_len { + data: "abcdef"; + good: data[6..6] == ""; + bad: data[7..7]; + message: "out of bounds"; + } + + in mod rangeinclusive_len_len { + data: "abcdef"; + good: data[6..=5] == ""; + bad: data[7..=6]; + message: "out of bounds"; + } + } + + panic_cases! { + in mod rangeinclusive_exhausted { + data: "abcdef"; + + good: data[0..=5] == "abcdef"; + good: data[{ + let mut iter = 0..=5; + iter.by_ref().count(); // exhaust it + iter + }] == ""; + + // 0..=6 is out of bounds before exhaustion, so it + // stands to reason that it still would be after. + bad: data[{ + let mut iter = 0..=6; + iter.by_ref().count(); // exhaust it + iter + }]; + message: "out of bounds"; + } + } + + panic_cases! { + in mod range_neg_width { + data: "abcdef"; + good: data[4..4] == ""; + bad: data[4..3]; + message: "begin <= end (4 <= 3)"; + } + + in mod rangeinclusive_neg_width { + data: "abcdef"; + good: data[4..=3] == ""; + bad: data[4..=2]; + message: "begin <= end (4 <= 3)"; + } + } + + mod overflow { + panic_cases! { + in mod rangeinclusive { + data: "hello"; + // note: using 0 specifically ensures that the result of overflowing is 0..0, + // so that `get` doesn't simply return None for the wrong reason. 
+ bad: data[0..=usize::MAX]; + message: "maximum usize"; + } + + in mod rangetoinclusive { + data: "hello"; + bad: data[..=usize::MAX]; + message: "maximum usize"; + } + } + } + + mod boundary { + const DATA: &str = "abcαβγ"; + + const BAD_START: usize = 4; + const GOOD_START: usize = 3; + const BAD_END: usize = 6; + const GOOD_END: usize = 7; + const BAD_END_INCL: usize = BAD_END - 1; + const GOOD_END_INCL: usize = GOOD_END - 1; + + // it is especially important to test all of the different range types here + // because some of the logic may be duplicated as part of micro-optimizations + // to dodge unicode boundary checks on half-ranges. + panic_cases! { + in mod range_1 { + data: super::DATA; + bad: data[super::BAD_START..super::GOOD_END]; + message: + "byte index 4 is not a char boundary; it is inside 'α' (bytes 3..5) of"; + } + + in mod range_2 { + data: super::DATA; + bad: data[super::GOOD_START..super::BAD_END]; + message: + "byte index 6 is not a char boundary; it is inside 'β' (bytes 5..7) of"; + } + + in mod rangefrom { + data: super::DATA; + bad: data[super::BAD_START..]; + message: + "byte index 4 is not a char boundary; it is inside 'α' (bytes 3..5) of"; + } + + in mod rangeto { + data: super::DATA; + bad: data[..super::BAD_END]; + message: + "byte index 6 is not a char boundary; it is inside 'β' (bytes 5..7) of"; + } + + in mod rangeinclusive_1 { + data: super::DATA; + bad: data[super::BAD_START..=super::GOOD_END_INCL]; + message: + "byte index 4 is not a char boundary; it is inside 'α' (bytes 3..5) of"; + } + + in mod rangeinclusive_2 { + data: super::DATA; + bad: data[super::GOOD_START..=super::BAD_END_INCL]; + message: + "byte index 6 is not a char boundary; it is inside 'β' (bytes 5..7) of"; + } + + in mod rangetoinclusive { + data: super::DATA; + bad: data[..=super::BAD_END_INCL]; + message: + "byte index 6 is not a char boundary; it is inside 'β' (bytes 5..7) of"; + } + } + } + + const LOREM_PARAGRAPH: &str = "\ + Lorem ipsum dolor sit amet, 
consectetur adipiscing elit. Suspendisse quis lorem \ + sit amet dolor ultricies condimentum. Praesent iaculis purus elit, ac malesuada \ + quam malesuada in. Duis sed orci eros. Suspendisse sit amet magna mollis, mollis \ + nunc luctus, imperdiet mi. Integer fringilla non sem ut lacinia. Fusce varius \ + tortor a risus porttitor hendrerit. Morbi mauris dui, ultricies nec tempus vel, \ + gravida nec quam."; + + // check the panic includes the prefix of the sliced string + #[test] + #[should_panic(expected = "byte index 1024 is out of bounds of `Lorem ipsum dolor sit amet")] + fn test_slice_fail_truncated_1() { + let _ = &LOREM_PARAGRAPH[..1024]; + } + // check the truncation in the panic message + #[test] + #[should_panic(expected = "luctus, im`[...]")] + fn test_slice_fail_truncated_2() { + let _ = &LOREM_PARAGRAPH[..1024]; + } +} + +#[test] +fn test_str_slice_rangetoinclusive_ok() { + let s = "abcαβγ"; + assert_eq!(&s[..=2], "abc"); + assert_eq!(&s[..=4], "abcα"); +} + +#[test] +#[should_panic] +fn test_str_slice_rangetoinclusive_notok() { + let s = "abcαβγ"; + let _ = &s[..=3]; +} + +#[test] +fn test_str_slicemut_rangetoinclusive_ok() { + let mut s = "abcαβγ".to_owned(); + let s: &mut str = &mut s; + assert_eq!(&mut s[..=2], "abc"); + assert_eq!(&mut s[..=4], "abcα"); +} + +#[test] +#[should_panic] +fn test_str_slicemut_rangetoinclusive_notok() { + let mut s = "abcαβγ".to_owned(); + let s: &mut str = &mut s; + let _ = &mut s[..=3]; +} + +#[test] +fn test_is_char_boundary() { + let s = "ศไทย中华Việt Nam β-release 🐱123"; + assert!(s.is_char_boundary(0)); + assert!(s.is_char_boundary(s.len())); + assert!(!s.is_char_boundary(s.len() + 1)); + for (i, ch) in s.char_indices() { + // ensure character locations are boundaries and continuation bytes are not + assert!(s.is_char_boundary(i), "{} is a char boundary in {:?}", i, s); + for j in 1..ch.len_utf8() { + assert!( + !s.is_char_boundary(i + j), + "{} should not be a char boundary in {:?}", + i + j, + s + ); + } + } +} + 
+#[test] +fn test_trim_start_matches() { + let v: &[char] = &[]; + assert_eq!(" *** foo *** ".trim_start_matches(v), " *** foo *** "); + let chars: &[char] = &['*', ' ']; + assert_eq!(" *** foo *** ".trim_start_matches(chars), "foo *** "); + assert_eq!(" *** *** ".trim_start_matches(chars), ""); + assert_eq!("foo *** ".trim_start_matches(chars), "foo *** "); + + assert_eq!("11foo1bar11".trim_start_matches('1'), "foo1bar11"); + let chars: &[char] = &['1', '2']; + assert_eq!("12foo1bar12".trim_start_matches(chars), "foo1bar12"); + assert_eq!("123foo1bar123".trim_start_matches(|c: char| c.is_numeric()), "foo1bar123"); +} + +#[test] +fn test_trim_end_matches() { + let v: &[char] = &[]; + assert_eq!(" *** foo *** ".trim_end_matches(v), " *** foo *** "); + let chars: &[char] = &['*', ' ']; + assert_eq!(" *** foo *** ".trim_end_matches(chars), " *** foo"); + assert_eq!(" *** *** ".trim_end_matches(chars), ""); + assert_eq!(" *** foo".trim_end_matches(chars), " *** foo"); + + assert_eq!("11foo1bar11".trim_end_matches('1'), "11foo1bar"); + let chars: &[char] = &['1', '2']; + assert_eq!("12foo1bar12".trim_end_matches(chars), "12foo1bar"); + assert_eq!("123foo1bar123".trim_end_matches(|c: char| c.is_numeric()), "123foo1bar"); +} + +#[test] +fn test_trim_matches() { + let v: &[char] = &[]; + assert_eq!(" *** foo *** ".trim_matches(v), " *** foo *** "); + let chars: &[char] = &['*', ' ']; + assert_eq!(" *** foo *** ".trim_matches(chars), "foo"); + assert_eq!(" *** *** ".trim_matches(chars), ""); + assert_eq!("foo".trim_matches(chars), "foo"); + + assert_eq!("11foo1bar11".trim_matches('1'), "foo1bar"); + let chars: &[char] = &['1', '2']; + assert_eq!("12foo1bar12".trim_matches(chars), "foo1bar"); + assert_eq!("123foo1bar123".trim_matches(|c: char| c.is_numeric()), "foo1bar"); +} + +#[test] +fn test_trim_start() { + assert_eq!("".trim_start(), ""); + assert_eq!("a".trim_start(), "a"); + assert_eq!(" ".trim_start(), ""); + assert_eq!(" blah".trim_start(), "blah"); + assert_eq!(" 
\u{3000} wut".trim_start(), "wut"); + assert_eq!("hey ".trim_start(), "hey "); +} + +#[test] +fn test_trim_end() { + assert_eq!("".trim_end(), ""); + assert_eq!("a".trim_end(), "a"); + assert_eq!(" ".trim_end(), ""); + assert_eq!("blah ".trim_end(), "blah"); + assert_eq!("wut \u{3000} ".trim_end(), "wut"); + assert_eq!(" hey".trim_end(), " hey"); +} + +#[test] +fn test_trim() { + assert_eq!("".trim(), ""); + assert_eq!("a".trim(), "a"); + assert_eq!(" ".trim(), ""); + assert_eq!(" blah ".trim(), "blah"); + assert_eq!("\nwut \u{3000} ".trim(), "wut"); + assert_eq!(" hey dude ".trim(), "hey dude"); +} + +#[test] +fn test_is_whitespace() { + assert!("".chars().all(|c| c.is_whitespace())); + assert!(" ".chars().all(|c| c.is_whitespace())); + assert!("\u{2009}".chars().all(|c| c.is_whitespace())); // Thin space + assert!(" \n\t ".chars().all(|c| c.is_whitespace())); + assert!(!" _ ".chars().all(|c| c.is_whitespace())); +} + +#[test] +fn test_is_utf8() { + // deny overlong encodings + assert!(from_utf8(&[0xc0, 0x80]).is_err()); + assert!(from_utf8(&[0xc0, 0xae]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0x80]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0xaf]).is_err()); + assert!(from_utf8(&[0xe0, 0x81, 0x81]).is_err()); + assert!(from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err()); + assert!(from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err()); + + // deny surrogates + assert!(from_utf8(&[0xED, 0xA0, 0x80]).is_err()); + assert!(from_utf8(&[0xED, 0xBF, 0xBF]).is_err()); + + assert!(from_utf8(&[0xC2, 0x80]).is_ok()); + assert!(from_utf8(&[0xDF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xE0, 0xA0, 0x80]).is_ok()); + assert!(from_utf8(&[0xED, 0x9F, 0xBF]).is_ok()); + assert!(from_utf8(&[0xEE, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok()); +} + +#[test] +fn test_const_is_utf8() { + const _: () = { + // deny overlong encodings + 
assert!(from_utf8(&[0xc0, 0x80]).is_err()); + assert!(from_utf8(&[0xc0, 0xae]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0x80]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0xaf]).is_err()); + assert!(from_utf8(&[0xe0, 0x81, 0x81]).is_err()); + assert!(from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err()); + assert!(from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err()); + + // deny surrogates + assert!(from_utf8(&[0xED, 0xA0, 0x80]).is_err()); + assert!(from_utf8(&[0xED, 0xBF, 0xBF]).is_err()); + + assert!(from_utf8(&[0xC2, 0x80]).is_ok()); + assert!(from_utf8(&[0xDF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xE0, 0xA0, 0x80]).is_ok()); + assert!(from_utf8(&[0xED, 0x9F, 0xBF]).is_ok()); + assert!(from_utf8(&[0xEE, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok()); + }; +} + +#[test] +fn from_utf8_mostly_ascii() { + // deny invalid bytes embedded in long stretches of ascii + for i in 32..64 { + let mut data = [0; 128]; + data[i] = 0xC0; + assert!(from_utf8(&data).is_err()); + data[i] = 0xC2; + assert!(from_utf8(&data).is_err()); + } +} + +#[test] +fn const_from_utf8_mostly_ascii() { + const _: () = { + // deny invalid bytes embedded in long stretches of ascii + let mut i = 32; + while i < 64 { + let mut data = [0; 128]; + data[i] = 0xC0; + assert!(from_utf8(&data).is_err()); + data[i] = 0xC2; + assert!(from_utf8(&data).is_err()); + + i = i + 1; + } + }; +} + +#[test] +fn from_utf8_error() { + macro_rules! 
test { + ($input: expr, $expected_valid_up_to:pat, $expected_error_len:pat) => { + let error = from_utf8($input).unwrap_err(); + assert_matches!(error.valid_up_to(), $expected_valid_up_to); + assert_matches!(error.error_len(), $expected_error_len); + + const _: () = { + match from_utf8($input) { + Err(error) => { + let valid_up_to = error.valid_up_to(); + let error_len = error.error_len(); + + assert!(matches!(valid_up_to, $expected_valid_up_to)); + assert!(matches!(error_len, $expected_error_len)); + } + Ok(_) => unreachable!(), + } + }; + }; + } + test!(b"A\xC3\xA9 \xFF ", 4, Some(1)); + test!(b"A\xC3\xA9 \x80 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC1 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC1", 4, Some(1)); + test!(b"A\xC3\xA9 \xC2", 4, None); + test!(b"A\xC3\xA9 \xC2 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(1)); + test!(b"A\xC3\xA9 \xE0", 4, None); + test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(1)); + test!(b"A\xC3\xA9 \xE0\xA0", 4, None); + test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(2)); + test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(2)); + test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xF1", 4, None); + test!(b"A\xC3\xA9 \xF1\x80", 4, None); + test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None); + test!(b"A\xC3\xA9 \xF1 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(2)); + test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3)); +} + +#[test] +fn test_as_bytes() { + // no null + let v = [ + 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228, 184, 173, 229, 141, 142, + 86, 105, 225, 187, 135, 116, 32, 78, 97, 109, + ]; + let b: &[u8] = &[]; + assert_eq!("".as_bytes(), b); + assert_eq!("abc".as_bytes(), b"abc"); + assert_eq!("ศไทย中华Việt Nam".as_bytes(), v); +} + +#[test] +#[should_panic] +fn test_as_bytes_fail() { + // Don't double free. (I'm not sure if this exercises the + // original problem code path anymore.) 
+ let s = String::from(""); + let _bytes = s.as_bytes(); + panic!(); +} + +#[test] +fn test_as_ptr() { + let buf = "hello".as_ptr(); + unsafe { + assert_eq!(*buf.add(0), b'h'); + assert_eq!(*buf.add(1), b'e'); + assert_eq!(*buf.add(2), b'l'); + assert_eq!(*buf.add(3), b'l'); + assert_eq!(*buf.add(4), b'o'); + } +} + +#[test] +fn vec_str_conversions() { + let s1: String = String::from("All mimsy were the borogoves"); + + let v: Vec = s1.as_bytes().to_vec(); + let s2: String = String::from(from_utf8(&v).unwrap()); + let mut i = 0; + let n1 = s1.len(); + let n2 = v.len(); + assert_eq!(n1, n2); + while i < n1 { + let a: u8 = s1.as_bytes()[i]; + let b: u8 = s2.as_bytes()[i]; + assert_eq!(a, b); + i += 1; + } +} + */ + +#[test] +fn test_contains() { + assert!(contains("abcde", "bcd")); + assert!(contains("abcde", "abcd")); + assert!(contains("abcde", "bcde")); + assert!(contains("abcde", "")); + assert!(contains("", "")); + assert!(!contains("abcde", "def")); + assert!(!contains("", "a")); + + let data = "ประเทศไทย中华Việt Nam"; + assert!(contains(data, "ประเ")); + assert!(contains(data, "ะเ")); + assert!(contains(data, "中华")); + assert!(!contains(data, "ไท华")); +} + +#[test] +fn test_contains_char() { + assert!(contains("abc", 'b')); + assert!(contains("a", 'a')); + assert!(!contains("abc", 'd')); + assert!(!contains("", 'a')); +} + +/* +#[test] +fn test_split_at() { + let s = "ศไทย中华Việt Nam"; + for (index, _) in s.char_indices() { + let (a, b) = s.split_at(index); + assert_eq!(&s[..a.len()], a); + assert_eq!(&s[a.len()..], b); + } + let (a, b) = s.split_at(s.len()); + assert_eq!(a, s); + assert_eq!(b, ""); +} + +#[test] +fn test_split_at_mut() { + let mut s = "Hello World".to_string(); + { + let (a, b) = s.split_at_mut(5); + a.make_ascii_uppercase(); + b.make_ascii_lowercase(); + } + assert_eq!(s, "HELLO world"); +} + +#[test] +#[should_panic] +fn test_split_at_boundscheck() { + let s = "ศไทย中华Việt Nam"; + let _ = s.split_at(1); +} + +#[test] +fn test_escape_unicode() { 
+ assert_eq!("abc".escape_unicode().to_string(), "\\u{61}\\u{62}\\u{63}"); + assert_eq!("a c".escape_unicode().to_string(), "\\u{61}\\u{20}\\u{63}"); + assert_eq!("\r\n\t".escape_unicode().to_string(), "\\u{d}\\u{a}\\u{9}"); + assert_eq!("'\"\\".escape_unicode().to_string(), "\\u{27}\\u{22}\\u{5c}"); + assert_eq!("\x00\x01\u{fe}\u{ff}".escape_unicode().to_string(), "\\u{0}\\u{1}\\u{fe}\\u{ff}"); + assert_eq!("\u{100}\u{ffff}".escape_unicode().to_string(), "\\u{100}\\u{ffff}"); + assert_eq!("\u{10000}\u{10ffff}".escape_unicode().to_string(), "\\u{10000}\\u{10ffff}"); + assert_eq!("ab\u{fb00}".escape_unicode().to_string(), "\\u{61}\\u{62}\\u{fb00}"); + assert_eq!("\u{1d4ea}\r".escape_unicode().to_string(), "\\u{1d4ea}\\u{d}"); +} + +#[test] +fn test_escape_debug() { + // Note that there are subtleties with the number of backslashes + // on the left- and right-hand sides. In particular, Unicode code points + // are usually escaped with two backslashes on the right-hand side, as + // they are escaped. However, when the character is unescaped (e.g., for + // printable characters), only a single backslash appears (as the character + // itself appears in the debug string). 
+ assert_eq!("abc".escape_debug().to_string(), "abc"); + assert_eq!("a c".escape_debug().to_string(), "a c"); + assert_eq!("éèê".escape_debug().to_string(), "éèê"); + assert_eq!("\0\r\n\t".escape_debug().to_string(), "\\0\\r\\n\\t"); + assert_eq!("'\"\\".escape_debug().to_string(), "\\'\\\"\\\\"); + assert_eq!("\u{7f}\u{ff}".escape_debug().to_string(), "\\u{7f}\u{ff}"); + assert_eq!("\u{100}\u{ffff}".escape_debug().to_string(), "\u{100}\\u{ffff}"); + assert_eq!("\u{10000}\u{10ffff}".escape_debug().to_string(), "\u{10000}\\u{10ffff}"); + assert_eq!("ab\u{200b}".escape_debug().to_string(), "ab\\u{200b}"); + assert_eq!("\u{10d4ea}\r".escape_debug().to_string(), "\\u{10d4ea}\\r"); + assert_eq!( + "\u{301}a\u{301}bé\u{e000}".escape_debug().to_string(), + "\\u{301}a\u{301}bé\\u{e000}" + ); +} + +#[test] +fn test_escape_default() { + assert_eq!("abc".escape_default().to_string(), "abc"); + assert_eq!("a c".escape_default().to_string(), "a c"); + assert_eq!("éèê".escape_default().to_string(), "\\u{e9}\\u{e8}\\u{ea}"); + assert_eq!("\r\n\t".escape_default().to_string(), "\\r\\n\\t"); + assert_eq!("'\"\\".escape_default().to_string(), "\\'\\\"\\\\"); + assert_eq!("\u{7f}\u{ff}".escape_default().to_string(), "\\u{7f}\\u{ff}"); + assert_eq!("\u{100}\u{ffff}".escape_default().to_string(), "\\u{100}\\u{ffff}"); + assert_eq!("\u{10000}\u{10ffff}".escape_default().to_string(), "\\u{10000}\\u{10ffff}"); + assert_eq!("ab\u{200b}".escape_default().to_string(), "ab\\u{200b}"); + assert_eq!("\u{10d4ea}\r".escape_default().to_string(), "\\u{10d4ea}\\r"); +} + */ + +#[test] +fn test_total_ord() { + fn test(lhs: &str, rhs: &str) { + assert_eq!(lhs.cmp(rhs), os(lhs).cmp(os(rhs)), "{lhs} <=> {rhs}"); + } + + test("1234", "123"); + test("123", "1234"); + test("1234", "1234"); + test("12345555", "123456"); + test("22", "1234"); +} + +/* +#[test] +fn test_iterator() { + let s = "ศไทย中华Việt Nam"; + let v = ['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm']; + + let mut pos = 
0; + let it = s.chars(); + + for c in it { + assert_eq!(c, v[pos]); + pos += 1; + } + assert_eq!(pos, v.len()); + assert_eq!(s.chars().count(), v.len()); +} + +#[test] +fn test_rev_iterator() { + let s = "ศไทย中华Việt Nam"; + let v = ['m', 'a', 'N', ' ', 't', 'ệ', 'i', 'V', '华', '中', 'ย', 'ท', 'ไ', 'ศ']; + + let mut pos = 0; + let it = s.chars().rev(); + + for c in it { + assert_eq!(c, v[pos]); + pos += 1; + } + assert_eq!(pos, v.len()); +} + +#[test] +fn test_to_lowercase_rev_iterator() { + let s = "AÖßÜ💩ΣΤΙΓΜΑΣDžfiİ"; + let v = ['\u{307}', 'i', 'fi', 'dž', 'σ', 'α', 'μ', 'γ', 'ι', 'τ', 'σ', '💩', 'ü', 'ß', 'ö', 'a']; + + let mut pos = 0; + let it = s.chars().flat_map(|c| c.to_lowercase()).rev(); + + for c in it { + assert_eq!(c, v[pos]); + pos += 1; + } + assert_eq!(pos, v.len()); +} + +#[test] +fn test_to_uppercase_rev_iterator() { + let s = "aößü💩στιγμαςDžfiᾀ"; + let v = + ['Ι', 'Ἀ', 'I', 'F', 'DŽ', 'Σ', 'Α', 'Μ', 'Γ', 'Ι', 'Τ', 'Σ', '💩', 'Ü', 'S', 'S', 'Ö', 'A']; + + let mut pos = 0; + let it = s.chars().flat_map(|c| c.to_uppercase()).rev(); + + for c in it { + assert_eq!(c, v[pos]); + pos += 1; + } + assert_eq!(pos, v.len()); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn test_chars_decoding() { + let mut bytes = [0; 4]; + for c in (0..0x110000).filter_map(std::char::from_u32) { + let s = c.encode_utf8(&mut bytes); + if Some(c) != s.chars().next() { + panic!("character {:x}={} does not decode correctly", c as u32, c); + } + } +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn test_chars_rev_decoding() { + let mut bytes = [0; 4]; + for c in (0..0x110000).filter_map(std::char::from_u32) { + let s = c.encode_utf8(&mut bytes); + if Some(c) != s.chars().rev().next() { + panic!("character {:x}={} does not decode correctly", c as u32, c); + } + } +} + +#[test] +fn test_iterator_clone() { + let s = "ศไทย中华Việt Nam"; + let mut it = s.chars(); + it.next(); + assert!(it.clone().zip(it).all(|(x, y)| x == y)); +} + +#[test] +fn 
test_iterator_last() { + let s = "ศไทย中华Việt Nam"; + let mut it = s.chars(); + it.next(); + assert_eq!(it.last(), Some('m')); +} + +#[test] +fn test_chars_debug() { + let s = "ศไทย中华Việt Nam"; + let c = s.chars(); + assert_eq!( + format!("{c:?}"), + r#"Chars(['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm'])"# + ); +} + +#[test] +fn test_bytesator() { + let s = "ศไทย中华Việt Nam"; + let v = [ + 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228, 184, 173, 229, 141, 142, + 86, 105, 225, 187, 135, 116, 32, 78, 97, 109, + ]; + let mut pos = 0; + + for b in s.bytes() { + assert_eq!(b, v[pos]); + pos += 1; + } +} + +#[test] +fn test_bytes_revator() { + let s = "ศไทย中华Việt Nam"; + let v = [ + 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228, 184, 173, 229, 141, 142, + 86, 105, 225, 187, 135, 116, 32, 78, 97, 109, + ]; + let mut pos = v.len(); + + for b in s.bytes().rev() { + pos -= 1; + assert_eq!(b, v[pos]); + } +} + +#[test] +fn test_bytesator_nth() { + let s = "ศไทย中华Việt Nam"; + let v = [ + 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228, 184, 173, 229, 141, 142, + 86, 105, 225, 187, 135, 116, 32, 78, 97, 109, + ]; + + let mut b = s.bytes(); + assert_eq!(b.nth(2).unwrap(), v[2]); + assert_eq!(b.nth(10).unwrap(), v[10]); + assert_eq!(b.nth(200), None); +} + +#[test] +fn test_bytesator_count() { + let s = "ศไทย中华Việt Nam"; + + let b = s.bytes(); + assert_eq!(b.count(), 28) +} + +#[test] +fn test_bytesator_last() { + let s = "ศไทย中华Việt Nam"; + + let b = s.bytes(); + assert_eq!(b.last().unwrap(), 109) +} + +#[test] +fn test_char_indicesator() { + let s = "ศไทย中华Việt Nam"; + let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27]; + let v = ['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm']; + + let mut pos = 0; + let it = s.char_indices(); + + for c in it { + assert_eq!(c, (p[pos], v[pos])); + pos += 1; + } + assert_eq!(pos, v.len()); + assert_eq!(pos, p.len()); +} + +#[test] +fn 
test_char_indices_revator() { + let s = "ศไทย中华Việt Nam"; + let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0]; + let v = ['m', 'a', 'N', ' ', 't', 'ệ', 'i', 'V', '华', '中', 'ย', 'ท', 'ไ', 'ศ']; + + let mut pos = 0; + let it = s.char_indices().rev(); + + for c in it { + assert_eq!(c, (p[pos], v[pos])); + pos += 1; + } + assert_eq!(pos, v.len()); + assert_eq!(pos, p.len()); +} + +#[test] +fn test_char_indices_last() { + let s = "ศไทย中华Việt Nam"; + let mut it = s.char_indices(); + it.next(); + assert_eq!(it.last(), Some((27, 'm'))); +} + +#[test] +fn test_splitn_char_iterator() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.splitn(4, ' ').collect(); + assert_eq!(split, ["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]); + + let split: Vec<&str> = data.splitn(4, |c: char| c == ' ').collect(); + assert_eq!(split, ["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]); + + // Unicode + let split: Vec<&str> = data.splitn(4, 'ä').collect(); + assert_eq!(split, ["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]); + + let split: Vec<&str> = data.splitn(4, |c: char| c == 'ä').collect(); + assert_eq!(split, ["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]); +} +*/ + +#[test] +fn test_split_char_iterator_no_trailing() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.split('\n').collect(); + assert_eq!(split, ["", "Märy häd ä little lämb", "Little lämb", ""]); + + /* + let split: Vec<&str> = data.split_terminator('\n').collect(); + assert_eq!(split, ["", "Märy häd ä little lämb", "Little lämb"]); + */ +} + +/* +#[test] +fn test_split_char_iterator_inclusive() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.split_inclusive('\n').collect(); + assert_eq!(split, ["\n", "Märy häd ä little lämb\n", "Little lämb\n"]); + + let uppercase_separated = "SheePSharKTurtlECaT"; + let mut first_char = true; + let split: Vec<&str> = uppercase_separated + 
.split_inclusive(|c: char| { + let split = !first_char && c.is_uppercase(); + first_char = split; + split + }) + .collect(); + assert_eq!(split, ["SheeP", "SharK", "TurtlE", "CaT"]); +} + +#[test] +fn test_split_char_iterator_inclusive_rev() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.split_inclusive('\n').rev().collect(); + assert_eq!(split, ["Little lämb\n", "Märy häd ä little lämb\n", "\n"]); + + // Note that the predicate is stateful and thus dependent + // on the iteration order. + // (A different predicate is needed for reverse iterator vs normal iterator.) + // Not sure if anything can be done though. + let uppercase_separated = "SheePSharKTurtlECaT"; + let mut term_char = true; + let split: Vec<&str> = uppercase_separated + .split_inclusive(|c: char| { + let split = term_char && c.is_uppercase(); + term_char = c.is_uppercase(); + split + }) + .rev() + .collect(); + assert_eq!(split, ["CaT", "TurtlE", "SharK", "SheeP"]); +} + +#[test] +fn test_rsplit() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.rsplit(' ').collect(); + assert_eq!(split, ["lämb\n", "lämb\nLittle", "little", "ä", "häd", "\nMäry"]); + + let split: Vec<&str> = data.rsplit("lämb").collect(); + assert_eq!(split, ["\n", "\nLittle ", "\nMäry häd ä little "]); + + let split: Vec<&str> = data.rsplit(|c: char| c == 'ä').collect(); + assert_eq!(split, ["mb\n", "mb\nLittle l", " little l", "d ", "ry h", "\nM"]); +} + +#[test] +fn test_rsplitn() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.rsplitn(2, ' ').collect(); + assert_eq!(split, ["lämb\n", "\nMäry häd ä little lämb\nLittle"]); + + let split: Vec<&str> = data.rsplitn(2, "lämb").collect(); + assert_eq!(split, ["\n", "\nMäry häd ä little lämb\nLittle "]); + + let split: Vec<&str> = data.rsplitn(2, |c: char| c == 'ä').collect(); + assert_eq!(split, ["mb\n", "\nMäry häd ä little lämb\nLittle l"]); +} + +#[test] +fn 
test_split_once() { + assert_eq!("".split_once("->"), None); + assert_eq!("-".split_once("->"), None); + assert_eq!("->".split_once("->"), Some(("", ""))); + assert_eq!("a->".split_once("->"), Some(("a", ""))); + assert_eq!("->b".split_once("->"), Some(("", "b"))); + assert_eq!("a->b".split_once("->"), Some(("a", "b"))); + assert_eq!("a->b->c".split_once("->"), Some(("a", "b->c"))); + assert_eq!("---".split_once("--"), Some(("", "-"))); +} + +#[test] +fn test_rsplit_once() { + assert_eq!("".rsplit_once("->"), None); + assert_eq!("-".rsplit_once("->"), None); + assert_eq!("->".rsplit_once("->"), Some(("", ""))); + assert_eq!("a->".rsplit_once("->"), Some(("a", ""))); + assert_eq!("->b".rsplit_once("->"), Some(("", "b"))); + assert_eq!("a->b".rsplit_once("->"), Some(("a", "b"))); + assert_eq!("a->b->c".rsplit_once("->"), Some(("a->b", "c"))); + assert_eq!("---".rsplit_once("--"), Some(("-", ""))); +} + +#[test] +fn test_split_whitespace() { + let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n"; + let words: Vec<&str> = data.split_whitespace().collect(); + assert_eq!(words, ["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"]) +} + +#[test] +fn test_lines() { + let data = "\nMäry häd ä little lämb\n\r\nLittle lämb\n"; + let lines: Vec<&str> = data.lines().collect(); + assert_eq!(lines, ["", "Märy häd ä little lämb", "", "Little lämb"]); + + let data = "\r\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n + let lines: Vec<&str> = data.lines().collect(); + assert_eq!(lines, ["", "Märy häd ä little lämb", "", "Little lämb"]); +} + */ + +#[test] +fn test_splitator() { + fn t(s: &str, sep: &str, u: &[&str]) { + let want: Vec<&OsStr> = u.into_iter().map(|&v| os(v)).collect(); + let got: Vec<&OsStr> = os(s).split(sep).collect(); + assert_eq!(want, got); + } + t("--1233345--", "12345", &["--1233345--"]); + t("abc::hello::there", "::", &["abc", "hello", "there"]); + t("::hello::there", "::", &["", "hello", "there"]); + t("hello::there::", "::", &["hello", 
"there", ""]); + t("::hello::there::", "::", &["", "hello", "there", ""]); + t("ประเทศไทย中华Việt Nam", "中华", &["ประเทศไทย", "Việt Nam"]); + t("zzXXXzzYYYzz", "zz", &["", "XXX", "YYY", ""]); + t("zzXXXzYYYz", "XXX", &["zz", "zYYYz"]); + t(".XXX.YYY.", ".", &["", "XXX", "YYY", ""]); + t("", ".", &[""]); + t("zz", "zz", &["", ""]); + t("ok", "z", &["ok"]); + t("zzz", "zz", &["", "z"]); + t("zzzzz", "zz", &["", "", "z"]); +} + +#[test] +fn test_str_default() { + use std::default::Default; + + fn t>() { + let s: S = Default::default(); + assert_eq!(s.as_ref(), os("")); + } + + t::<&str>(); + t::(); + t::<&mut str>(); +} + +#[test] +fn test_str_container() { + fn sum_len(v: &[&str]) -> usize { + v.iter().map(|x| os(x).len()).sum() + } + + assert_eq!(5, sum_len(&["012", "", "34"])); + assert_eq!(5, sum_len(&["01", "2", "34", ""])); + assert_eq!(5, sum_len(&["01234"])); +} + +/* +#[test] +fn test_str_from_utf8() { + let xs = b"hello"; + assert_eq!(from_utf8(xs), Ok("hello")); + + let xs = "ศไทย中华Việt Nam".as_bytes(); + assert_eq!(from_utf8(xs), Ok("ศไทย中华Việt Nam")); + + let xs = b"hello\xFF"; + assert!(from_utf8(xs).is_err()); +} + */ + +#[test] +fn test_pattern_deref_forward() { + let data = "aabcdaa"; + assert!(data.contains("bcd")); + assert!(data.contains(&"bcd")); + assert!(data.contains(&"bcd".to_string())); +} + +#[test] +fn test_empty_match_indices() { + let data = "aä中!"; + let mut searcher = "".into_searcher(os(data)); + let got: Vec = core::iter::from_fn(|| searcher.next_match()) + .map(|(start, _)| start) + .collect(); + assert_eq!(got, [0, 1, 3, 6, 7]); +} + +fn check_contains_all_substrings(haystack: &str) { + let mut modified_needle = String::new(); + + for i in 0..haystack.len() { + // check different haystack lengths since we special-case short haystacks. 
+ let haystack = &haystack[0..i]; + assert!(contains(haystack, "")); + for j in 0..haystack.len() { + for k in j + 1..=haystack.len() { + let needle = &haystack[j..k]; + assert!(contains(haystack, needle)); + modified_needle.clear(); + modified_needle.push_str(needle); + modified_needle.replace_range(0..1, "\0"); + assert!(!contains(haystack, &*modified_needle)); + + modified_needle.clear(); + modified_needle.push_str(needle); + modified_needle.replace_range(needle.len() - 1..needle.len(), "\0"); + assert!(!contains(haystack, &*modified_needle)); + } + } + } +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn strslice_issue_16589() { + assert!(contains("bananas", "nana")); + + // prior to the fix for #16589, x.contains("abcdabcd") returned false + // test all substrings for good measure + check_contains_all_substrings("012345678901234567890123456789bcdabcdabcd"); +} + +#[test] +fn strslice_issue_16878() { + assert!(!contains("1234567ah012345678901ah", "hah")); + assert!(!contains("00abc01234567890123456789abc", "bcabc")); +} + +#[test] +fn strslice_issue_104726() { + // Edge-case in the simd_contains impl. + // The first and last byte are the same so it backtracks by one byte + // which aligns with the end of the string. Previously incorrect offset calculations + // lead to out-of-bounds slicing. 
+ #[rustfmt::skip] + let needle = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaba"; + let haystack = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"; + assert!(!contains(haystack, needle)); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn test_strslice_contains() { + let x = "There are moments, Jeeves, when one asks oneself, 'Do trousers matter?'"; + check_contains_all_substrings(x); +} + +/* +#[test] +fn test_rsplitn_char_iterator() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let mut split: Vec<&str> = data.rsplitn(4, ' ').collect(); + split.reverse(); + assert_eq!(split, ["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut split: Vec<&str> = data.rsplitn(4, |c: char| c == ' ').collect(); + split.reverse(); + assert_eq!(split, ["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]); + + // Unicode + let mut split: Vec<&str> = data.rsplitn(4, 'ä').collect(); + split.reverse(); + assert_eq!(split, ["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]); + + let mut split: Vec<&str> = data.rsplitn(4, |c: char| c == 'ä').collect(); + split.reverse(); + assert_eq!(split, ["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]); +} +*/ + +#[test] +fn test_split_char_iterator() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&OsStr> = os(data).split(' ').collect(); + assert_eq!(split, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut rsplit: Vec<&OsStr> = os(data).split(' ').rev().collect(); + rsplit.reverse(); + assert_eq!(rsplit, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + // Unicode + let split: Vec<&OsStr> = os(data).split('ä').collect(); + assert_eq!(split, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + + let mut rsplit: Vec<&OsStr> = os(data).split('ä').rev().collect(); + rsplit.reverse(); + assert_eq!(rsplit, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); +} + +#[test] +fn test_rev_split_char_iterator_no_trailing() { + let data = 
"\nMäry häd ä little lämb\nLittle lämb\n"; + + let mut split: Vec<&OsStr> = os(data).split('\n').rev().collect(); + split.reverse(); + assert_eq!(split, ["", "Märy häd ä little lämb", "Little lämb", ""]); +/* + let mut split: Vec<&OsStr> = os(data).split_terminator('\n').rev().collect(); + split.reverse(); + assert_eq!(split, ["", "Märy häd ä little lämb", "Little lämb"]); +*/ +} + +/* +#[test] +fn test_utf16_code_units() { + assert_eq!("é\u{1F4A9}".encode_utf16().collect::>(), [0xE9, 0xD83D, 0xDCA9]) +} + */ + +#[test] +fn starts_with_in_unicode() { + assert!(!os("├── Cargo.toml").starts_with("# ")); +} + +#[test] +fn starts_short_long() { + assert!(!os("").starts_with("##")); + assert!(!os("##").starts_with("####")); + assert!(os("####").starts_with("##")); + assert!(!os("##ä").starts_with("####")); + assert!(os("####ä").starts_with("##")); + assert!(!os("##").starts_with("####ä")); + assert!(os("##ä##").starts_with("##ä")); + + assert!(os("").starts_with("")); + assert!(os("ä").starts_with("")); + assert!(os("#ä").starts_with("")); + assert!(os("##ä").starts_with("")); + assert!(os("ä###").starts_with("")); + assert!(os("#ä##").starts_with("")); + assert!(os("##ä#").starts_with("")); +} + +#[test] +fn contains_weird_cases() { + assert!(contains("* \t", ' ')); + assert!(!contains("* \t", '?')); + assert!(!contains("* \t", '\u{1F4A9}')); +} + +/* +#[test] +fn trim_ws() { + assert_eq!(" \t a \t ".trim_start_matches(|c: char| c.is_whitespace()), "a \t "); + assert_eq!(" \t a \t ".trim_end_matches(|c: char| c.is_whitespace()), " \t a"); + assert_eq!(" \t a \t ".trim_start_matches(|c: char| c.is_whitespace()), "a \t "); + assert_eq!(" \t a \t ".trim_end_matches(|c: char| c.is_whitespace()), " \t a"); + assert_eq!(" \t a \t ".trim_matches(|c: char| c.is_whitespace()), "a"); + assert_eq!(" \t \t ".trim_start_matches(|c: char| c.is_whitespace()), ""); + assert_eq!(" \t \t ".trim_end_matches(|c: char| c.is_whitespace()), ""); + assert_eq!(" \t \t ".trim_start_matches(|c: 
char| c.is_whitespace()), ""); + assert_eq!(" \t \t ".trim_end_matches(|c: char| c.is_whitespace()), ""); + assert_eq!(" \t \t ".trim_matches(|c: char| c.is_whitespace()), ""); +} + +#[test] +fn to_lowercase() { + assert_eq!("".to_lowercase(), ""); + assert_eq!("AÉDžaé ".to_lowercase(), "aédžaé "); + + // https://github.com/rust-lang/rust/issues/26035 + assert_eq!("ΑΣ".to_lowercase(), "ας"); + assert_eq!("Α'Σ".to_lowercase(), "α'ς"); + assert_eq!("Α''Σ".to_lowercase(), "α''ς"); + + assert_eq!("ΑΣ Α".to_lowercase(), "ας α"); + assert_eq!("Α'Σ Α".to_lowercase(), "α'ς α"); + assert_eq!("Α''Σ Α".to_lowercase(), "α''ς α"); + + assert_eq!("ΑΣ' Α".to_lowercase(), "ας' α"); + assert_eq!("ΑΣ'' Α".to_lowercase(), "ας'' α"); + + assert_eq!("Α'Σ' Α".to_lowercase(), "α'ς' α"); + assert_eq!("Α''Σ'' Α".to_lowercase(), "α''ς'' α"); + + assert_eq!("Α Σ".to_lowercase(), "α σ"); + assert_eq!("Α 'Σ".to_lowercase(), "α 'σ"); + assert_eq!("Α ''Σ".to_lowercase(), "α ''σ"); + + assert_eq!("Σ".to_lowercase(), "σ"); + assert_eq!("'Σ".to_lowercase(), "'σ"); + assert_eq!("''Σ".to_lowercase(), "''σ"); + + assert_eq!("ΑΣΑ".to_lowercase(), "ασα"); + assert_eq!("ΑΣ'Α".to_lowercase(), "ασ'α"); + assert_eq!("ΑΣ''Α".to_lowercase(), "ασ''α"); + + // a really long string that has it's lowercase form + // even longer. this tests that implementations don't assume + // an incorrect upper bound on allocations + let upper = str::repeat("İ", 512); + let lower = str::repeat("i̇", 512); + assert_eq!(upper.to_lowercase(), lower); + + // a really long ascii-only string. 
+ // This test that the ascii hot-path + // functions correctly + let upper = str::repeat("A", 511); + let lower = str::repeat("a", 511); + assert_eq!(upper.to_lowercase(), lower); +} + +#[test] +fn to_uppercase() { + assert_eq!("".to_uppercase(), ""); + assert_eq!("aéDžßfiᾀ".to_uppercase(), "AÉDŽSSFIἈΙ"); +} +*/ + +#[test] +fn test_into_string() { + // The only way to acquire a Box in the first place is through + // a OsString, so just test that we can round-trip between Box and + // OsString. + let string = OsString::from("Some text goes here"); + assert_eq!(string.clone().into_boxed_os_str().into_os_string(), string); +} + +#[test] +fn test_box_slice_clone() { + let data = OsString::from("hello HELLO hello HELLO yes YES 5 中ä华!!!"); + let data2 = data.clone().into_boxed_os_str().clone().into_os_string(); + + assert_eq!(data, data2); +} + +#[test] +fn test_cow_from() { + let borrowed = os("borrowed"); + let owned = OsString::from("owned"); + match (Cow::from(owned.clone()), Cow::from(borrowed)) { + (Cow::Owned(o), Cow::Borrowed(b)) => assert!(o == owned && b == borrowed), + _ => panic!("invalid `Cow::from`"), + } +} + +/* +#[test] +fn test_repeat() { + assert_eq!("".repeat(3), ""); + assert_eq!("abc".repeat(0), ""); + assert_eq!("α".repeat(3), "ααα"); +} +*/ + +mod pattern { + use core::pattern::SearchStep::{self, Done, Match, Reject}; + use core::pattern::{Pattern, ReverseSearcher, Searcher}; + use super::*; + + macro_rules! 
make_test { + ($name:ident, $p:expr, $h:expr, [$($e:expr,)*]) => { + #[allow(unused_imports)] + mod $name { + use core::pattern::SearchStep::{Match, Reject}; + use super::{cmp_search_to_vec}; + #[test] + fn fwd() { + cmp_search_to_vec(false, $p, $h, vec![$($e),*]); + } + #[test] + fn bwd() { + cmp_search_to_vec(true, $p, $h, vec![$($e),*]); + } + } + } + } + + fn cmp_search_to_vec<'a>( + rev: bool, + pat: impl Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>, + haystack: &'a str, + right: Vec, + ) { + let mut searcher = pat.into_searcher(os(haystack)); + let mut v = vec![]; + loop { + match if !rev { searcher.next() } else { searcher.next_back() } { + Match(a, b) => v.push(Match(a, b)), + Reject(a, b) => v.push(Reject(a, b)), + Done => break, + } + } + if rev { + v.reverse(); + } + + let mut first_index = 0; + let mut err = None; + + for (i, e) in right.iter().enumerate() { + match *e { + Match(a, b) | Reject(a, b) if a <= b && a == first_index => { + first_index = b; + } + _ => { + err = Some(i); + break; + } + } + } + + if let Some(err) = err { + panic!("Input skipped range at {err}"); + } + + if first_index != haystack.len() { + panic!("Did not cover whole input"); + } + + assert_eq!(v, right); + } + + make_test!( + str_searcher_ascii_haystack, + "bb", + "abbcbbd", + [Reject(0, 1), Match(1, 3), Reject(3, 4), Match(4, 6), Reject(6, 7),] + ); + make_test!( + str_searcher_ascii_haystack_seq, + "bb", + "abbcbbbbd", + [Reject(0, 1), Match(1, 3), Reject(3, 4), Match(4, 6), Match(6, 8), Reject(8, 9),] + ); + make_test!( + str_searcher_empty_needle_ascii_haystack, + "", + "abbcbbd", + [ + Match(0, 0), + Reject(0, 1), + Match(1, 1), + Reject(1, 2), + Match(2, 2), + Reject(2, 3), + Match(3, 3), + Reject(3, 4), + Match(4, 4), + Reject(4, 5), + Match(5, 5), + Reject(5, 6), + Match(6, 6), + Reject(6, 7), + Match(7, 7), + ] + ); + make_test!( + str_searcher_multibyte_haystack, + " ", + "├──", + [Reject(0, 9),] + ); + make_test!( + 
str_searcher_empty_needle_multibyte_haystack, + "", + "├──", + [ + Match(0, 0), + Reject(0, 3), + Match(3, 3), + Reject(3, 6), + Match(6, 6), + Reject(6, 9), + Match(9, 9), + ] + ); + make_test!(str_searcher_empty_needle_empty_haystack, "", "", [Match(0, 0),]); + make_test!(str_searcher_nonempty_needle_empty_haystack, "├", "", []); + make_test!( + char_searcher_ascii_haystack, + 'b', + "abbcbbd", + [ + Reject(0, 1), + Match(1, 2), + Match(2, 3), + Reject(3, 4), + Match(4, 5), + Match(5, 6), + Reject(6, 7), + ] + ); + make_test!( + char_searcher_multibyte_haystack, + ' ', + "├──", + [Reject(0, 9),] + ); + make_test!( + char_searcher_short_haystack, + '\u{1F4A9}', + "* \t", + [Reject(0, 3),] + ); + + // See #85462 + #[test] + fn str_searcher_empty_needle_after_done() { + // Empty needle and haystack + { + let mut searcher = "".into_searcher(os("")); + + assert_eq!(searcher.next(), SearchStep::Match(0, 0)); + assert_eq!(searcher.next(), SearchStep::Done); + assert_eq!(searcher.next(), SearchStep::Done); + assert_eq!(searcher.next(), SearchStep::Done); + + let mut searcher = "".into_searcher(os("")); + + assert_eq!(searcher.next_back(), SearchStep::Match(0, 0)); + assert_eq!(searcher.next_back(), SearchStep::Done); + assert_eq!(searcher.next_back(), SearchStep::Done); + assert_eq!(searcher.next_back(), SearchStep::Done); + } + // Empty needle and non-empty haystack + { + let mut searcher = "".into_searcher(os("a")); + + assert_eq!(searcher.next(), SearchStep::Match(0, 0)); + assert_eq!(searcher.next(), SearchStep::Reject(0, 1)); + assert_eq!(searcher.next(), SearchStep::Match(1, 1)); + assert_eq!(searcher.next(), SearchStep::Done); + assert_eq!(searcher.next(), SearchStep::Done); + assert_eq!(searcher.next(), SearchStep::Done); + + let mut searcher = "".into_searcher(os("a")); + + assert_eq!(searcher.next_back(), SearchStep::Match(1, 1)); + assert_eq!(searcher.next_back(), SearchStep::Reject(0, 1)); + assert_eq!(searcher.next_back(), SearchStep::Match(0, 0)); + 
assert_eq!(searcher.next_back(), SearchStep::Done); + assert_eq!(searcher.next_back(), SearchStep::Done); + assert_eq!(searcher.next_back(), SearchStep::Done); + } + } +} + +macro_rules! generate_iterator_test { + { + $name:ident { + $( + ($($arg:expr),*) -> [$($t:tt)*]; + )* + } + with $fwd:expr, $bwd:expr; + } => { + #[test] + fn $name() { + $( + { + let res = vec![$($t)*]; + + let fwd_vec: Vec<_> = ($fwd)($($arg),*).collect(); + assert_eq!(fwd_vec, res); + + let mut bwd_vec: Vec<_> = ($bwd)($($arg),*).collect(); + bwd_vec.reverse(); + assert_eq!(bwd_vec, res); + } + )* + } + }; + { + $name:ident { + $( + ($($arg:expr),*) -> [$($t:tt)*]; + )* + } + with $fwd:expr; + } => { + #[test] + fn $name() { + $( + { + let want: Vec<_> = [$($t)*].into_iter().map(os).collect(); + + let fwd_vec: Vec<_> = ($fwd)($($arg),*).collect(); + assert_eq!(fwd_vec, want); + } + )* + } + } +} + +generate_iterator_test! { + double_ended_split { + (os("foo.bar.baz"), '.') -> ["foo", "bar", "baz"]; + (os("foo::bar::baz"), "::") -> ["foo", "bar", "baz"]; + } + with OsStr::split /*, str::rsplit */; +} + +/* +generate_iterator_test! { + double_ended_split_terminator { + ("foo;bar;baz;", ';') -> ["foo", "bar", "baz"]; + } + with str::split_terminator, str::rsplit_terminator; +} + +generate_iterator_test! { + double_ended_matches { + ("a1b2c3", char::is_numeric) -> ["1", "2", "3"]; + } + with str::matches, str::rmatches; +} + +generate_iterator_test! { + double_ended_match_indices { + ("a1b2c3", char::is_numeric) -> [(1, "1"), (3, "2"), (5, "3")]; + } + with str::match_indices, str::rmatch_indices; +} + +generate_iterator_test! { + not_double_ended_splitn { + ("foo::bar::baz", 2, "::") -> ["foo", "bar::baz"]; + } + with str::splitn; +} + +generate_iterator_test! 
{ + not_double_ended_rsplitn { + ("foo::bar::baz", 2, "::") -> ["baz", "foo::bar"]; + } + with str::rsplitn; +} +*/ + +/* +#[test] +fn different_str_pattern_forwarding_lifetimes() { + use core::pattern::Pattern; + + fn foo<'a, P>(p: P) + where + for<'b> &'b P: Pattern<&'a OsStr>, + { + for _ in 0..3 { + os("asdf").find(&p); + } + } + + foo::<&str>("x"); +} +*/ + +/* +#[test] +fn test_str_multiline() { + let a: String = "this \ +is a test" + .to_string(); + let b: String = "this \ + is \ + another \ + test" + .to_string(); + assert_eq!(a, "this is a test".to_string()); + assert_eq!(b, "this is another test".to_string()); +} + +#[test] +fn test_str_escapes() { + let x = "\\\\\ + "; + assert_eq!(x, r"\\"); // extraneous whitespace stripped +} + +#[test] +fn const_str_ptr() { + const A: [u8; 2] = ['h' as u8, 'i' as u8]; + const B: &'static [u8; 2] = &A; + const C: *const u8 = B as *const u8; + + // Miri does not deduplicate consts (https://github.com/rust-lang/miri/issues/131) + #[cfg(not(miri))] + { + let foo = &A as *const u8; + assert_eq!(foo, C); + } + + unsafe { + assert_eq!(from_utf8_unchecked(&A), "hi"); + assert_eq!(*C, A[0]); + assert_eq!(*(&B[0] as *const u8), A[0]); + } +} + +#[test] +fn utf8() { + let yen: char = '¥'; // 0xa5 + let c_cedilla: char = 'ç'; // 0xe7 + let thorn: char = 'þ'; // 0xfe + let y_diaeresis: char = 'ÿ'; // 0xff + let pi: char = 'Π'; // 0x3a0 + + assert_eq!(yen as isize, 0xa5); + assert_eq!(c_cedilla as isize, 0xe7); + assert_eq!(thorn as isize, 0xfe); + assert_eq!(y_diaeresis as isize, 0xff); + assert_eq!(pi as isize, 0x3a0); + + assert_eq!(pi as isize, '\u{3a0}' as isize); + assert_eq!('\x0a' as isize, '\n' as isize); + + let bhutan: String = "འབྲུག་ཡུལ།".to_string(); + let japan: String = "日本".to_string(); + let uzbekistan: String = "Ўзбекистон".to_string(); + let austria: String = "Österreich".to_string(); + + let bhutan_e: String = + "\u{f60}\u{f56}\u{fb2}\u{f74}\u{f42}\u{f0b}\u{f61}\u{f74}\u{f63}\u{f0d}".to_string(); + let 
japan_e: String = "\u{65e5}\u{672c}".to_string(); + let uzbekistan_e: String = + "\u{40e}\u{437}\u{431}\u{435}\u{43a}\u{438}\u{441}\u{442}\u{43e}\u{43d}".to_string(); + let austria_e: String = "\u{d6}sterreich".to_string(); + + let oo: char = 'Ö'; + assert_eq!(oo as isize, 0xd6); + + fn check_str_eq(a: String, b: String) { + let mut i: isize = 0; + for ab in a.bytes() { + println!("{i}"); + println!("{ab}"); + let bb: u8 = b.as_bytes()[i as usize]; + println!("{bb}"); + assert_eq!(ab, bb); + i += 1; + } + } + + check_str_eq(bhutan, bhutan_e); + check_str_eq(japan, japan_e); + check_str_eq(uzbekistan, uzbekistan_e); + check_str_eq(austria, austria_e); +} + +#[test] +fn utf8_chars() { + // Chars of 1, 2, 3, and 4 bytes + let chs: Vec = vec!['e', 'é', '€', '\u{10000}']; + let s: String = chs.iter().cloned().collect(); + let schs: Vec = s.chars().collect(); + + assert_eq!(s.len(), 10); + assert_eq!(s.chars().count(), 4); + assert_eq!(schs.len(), 4); + assert_eq!(schs.iter().cloned().collect::(), s); + + assert!((from_utf8(s.as_bytes()).is_ok())); + // invalid prefix + assert!((!from_utf8(&[0x80]).is_ok())); + // invalid 2 byte prefix + assert!((!from_utf8(&[0xc0]).is_ok())); + assert!((!from_utf8(&[0xc0, 0x10]).is_ok())); + // invalid 3 byte prefix + assert!((!from_utf8(&[0xe0]).is_ok())); + assert!((!from_utf8(&[0xe0, 0x10]).is_ok())); + assert!((!from_utf8(&[0xe0, 0xff, 0x10]).is_ok())); + // invalid 4 byte prefix + assert!((!from_utf8(&[0xf0]).is_ok())); + assert!((!from_utf8(&[0xf0, 0x10]).is_ok())); + assert!((!from_utf8(&[0xf0, 0xff, 0x10]).is_ok())); + assert!((!from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_ok())); +} + +#[test] +fn utf8_char_counts() { + let strs = [("e", 1), ("é", 1), ("€", 1), ("\u{10000}", 1), ("eé€\u{10000}", 4)]; + let spread = if cfg!(miri) { 4 } else { 8 }; + let mut reps = [8, 64, 256, 512] + .iter() + .copied() + .flat_map(|n| n - spread..=n + spread) + .collect::>(); + if cfg!(not(miri)) { + reps.extend([1024, 1 << 
16].iter().copied().flat_map(|n| n - spread..=n + spread)); + } + let counts = if cfg!(miri) { 0..1 } else { 0..8 }; + let padding = counts.map(|len| " ".repeat(len)).collect::>(); + + for repeat in reps { + for (tmpl_str, tmpl_char_count) in strs { + for pad_start in &padding { + for pad_end in &padding { + // Create a string with padding... + let with_padding = + format!("{}{}{}", pad_start, tmpl_str.repeat(repeat), pad_end); + // ...and then skip past that padding. This should ensure + // that we test several different alignments for both head + // and tail. + let si = pad_start.len(); + let ei = with_padding.len() - pad_end.len(); + let target = &with_padding[si..ei]; + + assert!(!target.starts_with(" ") && !target.ends_with(" ")); + let expected_count = tmpl_char_count * repeat; + assert_eq!( + expected_count, + target.chars().count(), + "wrong count for `{:?}.repeat({})` (padding: `{:?}`)", + tmpl_str, + repeat, + (pad_start.len(), pad_end.len()), + ); + } + } + } + } +} + +#[test] +fn floor_char_boundary() { + fn check_many(s: &str, arg: impl IntoIterator, ret: usize) { + for idx in arg { + assert_eq!( + s.floor_char_boundary(idx), + ret, + "{:?}.floor_char_boundary({:?}) != {:?}", + s, + idx, + ret + ); + } + } + + // edge case + check_many("", [0, 1, isize::MAX as usize, usize::MAX], 0); + + // basic check + check_many("x", [0], 0); + check_many("x", [1, isize::MAX as usize, usize::MAX], 1); + + // 1-byte chars + check_many("jp", [0], 0); + check_many("jp", [1], 1); + check_many("jp", 2..4, 2); + + // 2-byte chars + check_many("ĵƥ", 0..2, 0); + check_many("ĵƥ", 2..4, 2); + check_many("ĵƥ", 4..6, 4); + + // 3-byte chars + check_many("日本", 0..3, 0); + check_many("日本", 3..6, 3); + check_many("日本", 6..8, 6); + + // 4-byte chars + check_many("🇯🇵", 0..4, 0); + check_many("🇯🇵", 4..8, 4); + check_many("🇯🇵", 8..10, 8); +} + +#[test] +fn ceil_char_boundary() { + fn check_many(s: &str, arg: impl IntoIterator, ret: usize) { + for idx in arg { + assert_eq!( + 
s.ceil_char_boundary(idx), + ret, + "{:?}.ceil_char_boundary({:?}) != {:?}", + s, + idx, + ret + ); + } + } + + // edge case + check_many("", [0], 0); + + // basic check + check_many("x", [0], 0); + check_many("x", [1], 1); + + // 1-byte chars + check_many("jp", [0], 0); + check_many("jp", [1], 1); + check_many("jp", [2], 2); + + // 2-byte chars + check_many("ĵƥ", 0..=0, 0); + check_many("ĵƥ", 1..=2, 2); + check_many("ĵƥ", 3..=4, 4); + + // 3-byte chars + check_many("日本", 0..=0, 0); + check_many("日本", 1..=3, 3); + check_many("日本", 4..=6, 6); + + // 4-byte chars + check_many("🇯🇵", 0..=0, 0); + check_many("🇯🇵", 1..=4, 4); + check_many("🇯🇵", 5..=8, 8); +} + +#[test] +#[should_panic] +fn ceil_char_boundary_above_len_panic() { + let _ = "x".ceil_char_boundary(2); +} +*/ From 237606c0c3cf3a1d49e0e9cf0006a7739cca1b21 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 22 Feb 2023 06:40:45 +0100 Subject: [PATCH 11/12] core: add core::pattern::Predicate wrapper type To work around orphan rules, introduce a wrapper type for predicate functions to be used as patterns. Specifically, if we want to add predicate pattern implementation for OsStr type, doing it with a naked `FnMut` results in compile-time errors: error[E0210]: type parameter `F` must be covered by another type when it appears before the first local type (`OsStr`) impl<'hs, F: FnMut(char) -> bool> core::pattern::Pattern<&'hs OsStr> for F { ^ type parameter `F` must be covered by another type when it appears before the first local type (`OsStr`) --- library/core/src/pattern.rs | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs index 8fe632426cb66..0a50ebb1b725a 100644 --- a/library/core/src/pattern.rs +++ b/library/core/src/pattern.rs @@ -457,6 +457,48 @@ pub unsafe trait ReverseSearcher: Searcher { /// from which side it is searched.
pub trait DoubleEndedSearcher: ReverseSearcher {} +/// A wrapper around single-argument function returning a boolean, +/// i.e. a predicate function. +/// +/// `Predicate` objects are created with [`predicate`] function. +#[derive(Clone, Debug)] +pub struct Predicate(F); + +/// Constructs a wrapper for a single-argument function returning a boolean, +/// i.e. a predicate function. +/// +/// This is intended to be used as a pattern when working with haystacks which +/// (for whatever reason) cannot support naked function traits as patterns. +pub fn predicate bool>(pred: F) -> Predicate { + Predicate(pred) +} + +impl Predicate { + /// Executes the predicate returning its result. + pub fn test(&mut self, element: T) -> bool + where + F: FnMut(T) -> bool, + { + self.0(element) + } + + /// Returns reference to the wrapped predicate function. + pub fn as_fn(&mut self) -> &mut F + where + F: FnMut(T) -> bool, + { + &mut self.0 + } + + /// Consumes this object and returns wrapped predicate function. + pub fn into_fn(self) -> F + where + F: FnMut(T) -> bool, + { + self.0 + } +} + ////////////////////////////////////////////////////////////////////////////// // Internal EmptyNeedleSearcher helper ////////////////////////////////////////////////////////////////////////////// From e7fe1a23ac81d9306ddbdb622fa61613281ffa8b Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 15 Mar 2023 14:59:15 +0100 Subject: [PATCH 12/12] std: add predicate pattern support to OsStr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to technical limitations adding support for predicates as patterns on OsStr slices must be done via core::pattern::Predicate wrapper type. This isn’t ideal but for the time being it’s the best option I’ve come up with. The core of the issue (as I understand it) is that FnMut is a foreign type in the std crate where OsStr is defined.
Using predicate as a pattern on OsStr is the final piece which now allows parsing command line arguments. --- library/std/src/ffi/os_str.rs | 98 +++++++++++++++++++++++++++++++++++ library/std/tests/os_str.rs | 39 +++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index c5b60bdcb0963..74b95997af8c9 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -1731,6 +1731,104 @@ unsafe impl<'hs> ReverseSearcher<&'hs OsStr> for CharSearcher<'hs> { #[unstable(feature = "pattern", issue = "27721")] impl<'hs> DoubleEndedSearcher<&'hs OsStr> for CharSearcher<'hs> {} +#[unstable(feature = "pattern", issue = "27721")] +// FIXME: Using Predicate because of: +// error[E0210]: type parameter `F` must be covered by another type when it +// appears before the first local type (`OsStr`) +// --> library/std/src/ffi/os_str.rs:1697:11 +// | +// 1697 | impl<'hs, F: FnMut(char) -> bool> core::pattern::Pattern<&'hs OsStr> for F { +// | ^ type parameter `F` must be covered by another type +// when it appears before the first local type (`OsStr`) +impl<'hs, F: FnMut(char) -> bool> core::pattern::Pattern<&'hs OsStr> + for core::pattern::Predicate +{ + type Searcher = PredicateSearcher<'hs, F>; + + fn into_searcher(self, haystack: &'hs OsStr) -> Self::Searcher { + Self::Searcher::new(haystack, self.into_fn()) + } + + fn is_contained_in(self, haystack: &'hs OsStr) -> bool { + self.into_fn().is_contained_in(core::str_bytes::Bytes::from(haystack)) + } + + fn is_prefix_of(self, haystack: &'hs OsStr) -> bool { + self.into_fn().is_prefix_of(core::str_bytes::Bytes::from(haystack)) + } + + fn is_suffix_of(self, haystack: &'hs OsStr) -> bool { + self.into_fn().is_suffix_of(core::str_bytes::Bytes::from(haystack)) + } + + /// Removes the pattern from the front of haystack, if it matches. 
+ fn strip_prefix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> { + self.into_fn() + .strip_prefix_of(core::str_bytes::Bytes::from(haystack)) + .map(|bytes| bytes.into()) + } + + /// Removes the pattern from the back of haystack, if it matches. + fn strip_suffix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> + where + Self::Searcher: ReverseSearcher<&'hs OsStr>, + { + self.into_fn() + .strip_suffix_of(core::str_bytes::Bytes::from(haystack)) + .map(|bytes| bytes.into()) + } +} + +#[derive(Clone, Debug)] +#[unstable(feature = "pattern", issue = "27721")] +pub struct PredicateSearcher<'hs, P>(core::str_bytes::PredicateSearcher<'hs, BytesFlavour, P>); + +impl<'hs, P> PredicateSearcher<'hs, P> { + fn new(haystack: &'hs OsStr, pred: P) -> PredicateSearcher<'hs, P> { + Self(core::str_bytes::PredicateSearcher::new(haystack.into(), pred)) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs, P: FnMut(char) -> bool> Searcher<&'hs OsStr> for PredicateSearcher<'hs, P> { + #[inline(always)] + fn haystack(&self) -> &'hs OsStr { + self.0.haystack().into() + } + + #[inline(always)] + fn next(&mut self) -> SearchStep { + self.0.next() + } + #[inline(always)] + fn next_match(&mut self) -> Option<(usize, usize)> { + self.0.next_match() + } + #[inline(always)] + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs, P: FnMut(char) -> bool> ReverseSearcher<&'hs OsStr> for PredicateSearcher<'hs, P> { + #[inline(always)] + fn next_back(&mut self) -> SearchStep { + self.0.next_back() + } + #[inline(always)] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.0.next_match_back() + } + #[inline(always)] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, P: FnMut(char) -> bool> DoubleEndedSearcher<&'hs OsStr> for PredicateSearcher<'hs, 
P> {} + #[unstable(feature = "pattern", issue = "27721")] impl<'hs, 'p> core::pattern::Pattern<&'hs OsStr> for &'p str { type Searcher = StrSearcher<'hs, 'p>; diff --git a/library/std/tests/os_str.rs b/library/std/tests/os_str.rs index fea1c0e2275c8..24dd4e1434567 100644 --- a/library/std/tests/os_str.rs +++ b/library/std/tests/os_str.rs @@ -1,6 +1,6 @@ #![feature(associated_type_bounds, pattern)] -use core::pattern::{Pattern, Searcher, ReverseSearcher}; +use core::pattern::{Pattern, Searcher, ReverseSearcher, predicate}; use std::borrow::Cow; use std::ffi::{OsStr, OsString}; @@ -111,6 +111,23 @@ fn do_test_short_flag(valid: bool) { assert_eq!(Some(&*os("shórt")), arg.strip_prefix('-')); assert_eq!(Some(&*os("shórt")), arg.strip_prefix("-")); assert_eq!(None, arg.strip_prefix("--")); + + // A bit awkward but closure can be used to test short options character + // by character. + let mut switch = '\0'; + let mut check_switch = |chr| { + switch = chr; + chr == 's' || chr == 'h' + }; + assert_eq!( + Some(&*os("hórt")), + os("shórt").strip_prefix(predicate(&mut check_switch)) + ); + assert_eq!( + Some(&*os("órt")), + os("hórt").strip_prefix(predicate(&mut check_switch)) + ); + assert_eq!(None, os("órt").strip_prefix(predicate(&mut check_switch))); } #[test] @@ -157,15 +174,21 @@ fn test_le() { #[test] fn test_find() { assert_eq!(find("hello", 'l'), Some(2)); + assert_eq!(find("hello", predicate(|c: char| c == 'o')), Some(4)); assert!(find("hello", 'x').is_none()); + assert!(find("hello", predicate(|c: char| c == 'x')).is_none()); assert_eq!(find("ประเทศไทย中华Việt Nam", '华'), Some(30)); + assert_eq!(find("ประเทศไทย中华Việt Nam", predicate(|c: char| c == '华')), Some(30)); } #[test] fn test_rfind() { assert_eq!(rfind("hello", 'l'), Some(3)); + assert_eq!(rfind("hello", predicate(|c: char| c == 'o')), Some(4)); assert!(rfind("hello", 'x').is_none()); + assert!(rfind("hello", predicate(|c: char| c == 'x')).is_none()); assert_eq!(rfind("ประเทศไทย中华Việt Nam", '华'), Some(30)); + 
assert_eq!(rfind("ประเทศไทย中华Việt Nam", predicate(|c: char| c == '华')), Some(30)); } /* @@ -1832,6 +1855,13 @@ fn test_split_char_iterator() { rsplit.reverse(); assert_eq!(rsplit, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + let split: Vec<&OsStr> = os(data).split(predicate(|c: char| c == ' ')).collect(); + assert_eq!(split, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut rsplit: Vec<&OsStr> = os(data).split(predicate(|c: char| c == ' ')).rev().collect(); + rsplit.reverse(); + assert_eq!(rsplit, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + // Unicode let split: Vec<&OsStr> = os(data).split('ä').collect(); assert_eq!(split, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); @@ -1839,6 +1869,13 @@ fn test_split_char_iterator() { let mut rsplit: Vec<&OsStr> = os(data).split('ä').rev().collect(); rsplit.reverse(); assert_eq!(rsplit, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + + let split: Vec<&OsStr> = os(data).split(predicate(|c: char| c == 'ä')).collect(); + assert_eq!(split, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + + let mut rsplit: Vec<&OsStr> = os(data).split(predicate(|c: char| c == 'ä')).rev().collect(); + rsplit.reverse(); + assert_eq!(rsplit, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); } #[test]