Skip to content
5 changes: 2 additions & 3 deletions library/alloc/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -418,9 +418,8 @@ impl str {
}

/// Skips every leading character of `iter` that is case-ignorable, then
/// reports whether the first remaining character is cased.
///
/// Returns `false` when the iterator is exhausted before a
/// non-case-ignorable character is found.
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
use core::unicode::{Case_Ignorable, Cased};
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
Some(c) => Cased(c),
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
Some(c) => c.is_cased(),
// Nothing left after the skipped prefix.
None => false,
}
}
Expand Down
50 changes: 47 additions & 3 deletions library/core/src/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::slice;
use crate::str::from_utf8_unchecked_mut;
use crate::ub_checks::assert_unsafe_precondition;
use crate::unicode::printable::is_printable;
use crate::unicode::{self, conversions};
use crate::unicode::{self, Case_Ignorable, conversions};

impl char {
/// The lowest valid code point a `char` can have, `'\0'`.
Expand Down Expand Up @@ -950,7 +950,11 @@ impl char {
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn is_control(self) -> bool {
unicode::Cc(self)
// According to
// https://www.unicode.org/policies/stability_policy.html#Property_Value,
// the set of codepoints in `Cc` will never change. So we can hard-code
// the patterns to match against instead of using a table.
matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}')
}

/// Returns `true` if this `char` has the `Grapheme_Extend` property.
Expand All @@ -965,7 +969,47 @@ impl char {
#[must_use]
#[inline]
pub(crate) fn is_grapheme_extended(self) -> bool {
unicode::Grapheme_Extend(self)
!self.is_ascii() && unicode::Grapheme_Extend(self)
}

/// Reports whether this `char` carries the `Cased` derived property.
///
/// ASCII characters are answered directly by [`char::is_ascii_alphabetic`];
/// every other character is looked up in the `Lowercase`, `Uppercase` and `Lt`
/// tables, in that order.
///
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
#[must_use]
#[inline]
#[doc(hidden)]
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
pub fn is_cased(self) -> bool {
    // Non-ASCII input goes to the property tables; `||` short-circuits, so a
    // later table is only consulted when the earlier ones said no.
    if !self.is_ascii() {
        return unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self);
    }

    // ASCII input is decided without any table lookup.
    self.is_ascii_alphabetic()
}

/// Reports whether this `char` carries the `Case_Ignorable` property.
///
/// Within ASCII only `'`, `.`, `:`, `^` and `` ` `` are treated as
/// case-ignorable; every non-ASCII character is looked up in the
/// `Case_Ignorable` table.
///
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
#[must_use]
#[inline]
#[doc(hidden)]
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
pub fn is_case_ignorable(self) -> bool {
    match self {
        // The only ASCII characters that count as case-ignorable here.
        '\'' | '.' | ':' | '^' | '`' => true,
        // Any other ASCII character is answered without a table lookup.
        ascii if ascii.is_ascii() => false,
        // Everything outside ASCII is decided by the generated table.
        other => Case_Ignorable(other),
    }
}

/// Returns `true` if this `char` has one of the general categories for numbers.
Expand Down
5 changes: 2 additions & 3 deletions library/core/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@

// for use in alloc, not re-exported in std.
#[rustfmt::skip]
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
pub use unicode_data::cased::lookup as Cased;
pub use unicode_data::conversions;

#[rustfmt::skip]
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
pub(crate) use unicode_data::cc::lookup as Cc;
pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
pub(crate) use unicode_data::lt::lookup as Lt;
pub(crate) use unicode_data::n::lookup as N;
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
pub(crate) use unicode_data::white_space::lookup as White_Space;
Expand Down
575 changes: 278 additions & 297 deletions library/core/src/unicode/unicode_data.rs

Large diffs are not rendered by default.

77 changes: 0 additions & 77 deletions src/tools/unicode-table-generator/src/cascading_map.rs

This file was deleted.

43 changes: 30 additions & 13 deletions src/tools/unicode-table-generator/src/case_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,26 @@ use crate::{UnicodeData, fmt_list};

const INDEX_MASK: u32 = 1 << 22;

/// Assembles the generated source text for the case-conversion tables.
///
/// The text consists of, in order: an `INDEX_MASK` constant definition, the
/// `HEADER` text with leading whitespace trimmed, the `LOWER` tables built from
/// `data.to_lower`, and the `UPPER` tables built from `data.to_upper`. Both
/// table sections are produced by `generate_tables`.
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
let mut file = String::new();

// Write the mask value as a hex literal so the generated code carries its
// own copy of `INDEX_MASK`.
write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
file.push_str("\n\n");
file.push_str(HEADER.trim_start());
file.push('\n');
file.push_str(&generate_tables("LOWER", &data.to_lower));
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
file.push_str(&lower_tables);
// Blank line between the lower-case and upper-case table sections.
file.push_str("\n\n");
file.push_str(&generate_tables("UPPER", &data.to_upper));
file
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
file.push_str(&upper_tables);
// Hand back the text together with the two sizes reported by
// `generate_tables`: lower first, then upper.
(file, [lower_size, upper_size])
}

fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String {
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
let mut mappings = Vec::with_capacity(data.len());
let mut multis = Vec::new();

for (&key, &(a, b, c)) in data.iter() {
for (&key, &[a, b, c]) in data.iter() {
let key = char::from_u32(key).unwrap();

if key.is_ascii() {
Expand All @@ -46,16 +48,31 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String
}

let mut tables = String::new();

write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings))
.unwrap();
let mut size = 0;

size += size_of_val(mappings.as_slice());
write!(
tables,
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
case,
mappings.len(),
fmt_list(mappings),
)
.unwrap();

tables.push_str("\n\n");

write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis))
.unwrap();

tables
size += size_of_val(multis.as_slice());
write!(
tables,
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
case,
multis.len(),
fmt_list(multis),
)
.unwrap();

(tables, size)
}

struct CharEscape(char);
Expand Down
Loading
Loading