diff --git a/Cargo.lock b/Cargo.lock index 1abf4d958..b94756048 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -613,12 +613,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "convert_case" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" - [[package]] name = "convert_case" version = "0.7.1" @@ -844,19 +838,6 @@ dependencies = [ "syn 2.0.90", ] -[[package]] -name = "derive_more" -version = "0.99.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3da29a38df43d6f156149c9b43ded5e018ddff2a855cf2cfd62e8cd7d079c69f" -dependencies = [ - "convert_case 0.4.0", - "proc-macro2", - "quote", - "rustc_version", - "syn 2.0.90", -] - [[package]] name = "digest" version = "0.10.7" @@ -1489,6 +1470,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -1791,6 +1781,12 @@ dependencies = [ "libc", ] +[[package]] +name = "nohash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0f889fb66f7acdf83442c35775764b51fed3c606ab9cee51500dbde2cf528ca" + [[package]] name = "nom" version = "7.1.3" @@ -2522,6 +2518,20 @@ dependencies = [ "regex-syntax 0.8.5", ] +[[package]] +name = "regex-filtered" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c11639076bf147be211b90e47790db89f4c22b6c8a9ca6e960833869da67166" +dependencies = [ + "aho-corasick", + "indexmap", + "itertools 0.13.0", + "nohash", + "regex", + "regex-syntax 0.8.5", +] + [[package]] name = "regex-syntax" version = "0.6.29" @@ -3255,17 +3265,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] -name = "uaparser" -version = "0.6.4" +name = "ua-parser" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c9e1c3f893758f154004195fc2d2c52fbda462df725220ceaef830ac29affa" +checksum = "d7176a413a0b7e94926d11a2054c6db5ac7fa42bf4ebe7e9571152e3f024ddfd" dependencies = [ - "derive_more", - "lazy_static", "regex", + "regex-filtered", "serde", - "serde_derive", - "serde_yaml", ] [[package]] @@ -3408,7 +3415,7 @@ dependencies = [ "clap", "codespan-reporting", "community-id", - "convert_case 0.7.1", + "convert_case", "crc", "criterion", "crypto_secretbox", @@ -3466,6 +3473,7 @@ dependencies = [ "seahash", "serde", "serde_json", + "serde_yaml", "sha-1", "sha2", "sha3", @@ -3480,7 +3488,7 @@ dependencies = [ "toml", "tracing", "tracing-test", - "uaparser", + "ua-parser", "unicode-segmentation", "url", "utf8-width", diff --git a/Cargo.toml b/Cargo.toml index 971ee688a..36e5f84a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,7 +110,7 @@ stdlib = [ "dep:strip-ansi-escapes", "dep:syslog_loose", "dep:tokio", - "dep:uaparser", + "dep:ua-parser", "dep:url", "dep:utf8-width", "dep:uuid", @@ -192,7 +192,7 @@ syslog_loose = { version = "0.21", optional = true } termcolor = { version = "1", optional = true } thiserror = { version = "2", optional = true } tracing = { version = "0.1", default-features = false } -uaparser = { version = "0.6", default-features = false, optional = true } +ua-parser = { version = "0.2", optional = true } utf8-width = { version = "0.1", optional = true } url = { version = "2", optional = true } snafu = { version = "0.8", optional = true } @@ -250,6 +250,8 @@ proptest-derive = { version = "0.5" } [build-dependencies] lalrpop = { version = "0.22", default-features = false } +serde_yaml = "0.9.34" +ua-parser = { version = "0.2" } [[bench]] name = "kind" diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index f6ddc01b7..a5e519082 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -53,7 +53,6 @@ colorchoice,https://github.com/rust-cli/anstyle,MIT OR Apache-2.0,The colorchoic combine,https://github.com/Marwes/combine,MIT,Markus Westerlind community-id,https://github.com/traceflight/rs-community-id,MIT OR Apache-2.0,Julian Wang concurrent-queue,https://github.com/smol-rs/concurrent-queue,Apache-2.0 OR MIT,"Stjepan Glavina , Taiki Endo , John Nunley " -convert_case,https://github.com/rutrum/convert-case,MIT,David Purdum convert_case,https://github.com/rutrum/convert-case,MIT,rutrum core-foundation,https://github.com/servo/core-foundation-rs,MIT OR Apache-2.0,The Servo Project Developers cpufeatures,https://github.com/RustCrypto/utils,MIT OR Apache-2.0,RustCrypto Developers @@ -71,7 +70,6 @@ ctr,https://github.com/RustCrypto/block-modes,MIT OR Apache-2.0,RustCrypto Devel data-encoding,https://github.com/ia0/data-encoding,MIT,Julien Cretin dbl,https://github.com/RustCrypto/utils,MIT OR Apache-2.0,RustCrypto Developers deranged,https://github.com/jhpratt/deranged,MIT OR Apache-2.0,Jacob Pratt -derive_more,https://github.com/JelteF/derive_more,MIT,Jelte Fennema digest,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers dirs-next,https://github.com/xdg-rs/dirs,MIT OR Apache-2.0,The @xdg-rs members dirs-sys-next,https://github.com/xdg-rs/dirs/tree/master/dirs-sys,MIT OR Apache-2.0,The @xdg-rs members @@ -149,6 +147,7 @@ mio,https://github.com/tokio-rs/mio,MIT,"Carl Lerche , Thomas moka,https://github.com/moka-rs/moka,MIT OR Apache-2.0,The moka Authors ndk-context,https://github.com/rust-windowing/android-ndk-rs,MIT OR Apache-2.0,The Rust Windowing contributors nix,https://github.com/nix-rust/nix,MIT,The nix-rust Project Developers +nohash,https://github.com/tetcoin/nohash,Apache-2.0 OR MIT,Parity Technologies nom,https://github.com/Geal/nom,MIT,contact@geoffroycouprie.com nu-ansi-term,https://github.com/nushell/nu-ansi-term,MIT,"ogham@bsago.me, Ryan Scheel (Havvy) , Josh Triplett , The Nushell Project Developers" num-bigint,https://github.com/rust-num/num-bigint,MIT OR Apache-2.0,The Rust Project Developers @@ -201,6 +200,7 @@ redox_users,https://gitlab.redox-os.org/redox-os/users,MIT,"Jose Narvaez " regex-automata,https://github.com/BurntSushi/regex-automata,Unlicense OR MIT,Andrew Gallant regex-automata,https://github.com/rust-lang/regex/tree/master/regex-automata,MIT OR Apache-2.0,"The Rust Project Developers, Andrew Gallant " +regex-filtered,https://github.com/ua-parser/uap-rust,BSD-3-Clause,The regex-filtered Authors regex-syntax,https://github.com/rust-lang/regex,MIT OR Apache-2.0,The Rust Project Developers regex-syntax,https://github.com/rust-lang/regex/tree/master/regex-syntax,MIT OR Apache-2.0,"The Rust Project Developers, Andrew Gallant " roxmltree,https://github.com/RazrFalcon/roxmltree,MIT OR Apache-2.0,Yevhenii Reizner @@ -215,7 +215,6 @@ scopeguard,https://github.com/bluss/scopeguard,MIT OR Apache-2.0,bluss seahash,https://gitlab.redox-os.org/redox-os/seahash,MIT,"ticki , Tom Almeida " serde,https://github.com/serde-rs/serde,MIT OR Apache-2.0,"Erick Tryzelaar , David Tolnay " serde_json,https://github.com/serde-rs/json,MIT OR Apache-2.0,"Erick Tryzelaar , David Tolnay " -serde_yaml,https://github.com/dtolnay/serde-yaml,MIT OR Apache-2.0,David Tolnay sha-1,https://github.com/RustCrypto/hashes,MIT OR Apache-2.0,RustCrypto Developers sha1,https://github.com/RustCrypto/hashes,MIT OR Apache-2.0,RustCrypto Developers sha2,https://github.com/RustCrypto/hashes,MIT OR Apache-2.0,RustCrypto Developers @@ -251,13 +250,12 @@ tracing-core,https://github.com/tokio-rs/tracing,MIT,Tokio Contributors tracing-subscriber,https://github.com/tokio-rs/tracing,MIT,"Eliza Weisman , David Barsky , Tokio Contributors " typenum,https://github.com/paholg/typenum,MIT OR Apache-2.0,"Paho Lurie-Gregg , Andre Bogus " -uaparser,https://github.com/davidarmstronglewis/uap-rs,MIT,Ocean Lewis +ua-parser,https://github.com/ua-parser/uap-rust,Apache-2.0,The ua-parser Authors ucd-trie,https://github.com/BurntSushi/ucd-generate,MIT OR Apache-2.0,Andrew Gallant unicode-ident,https://github.com/dtolnay/unicode-ident,(MIT OR Apache-2.0) AND Unicode-3.0,David Tolnay unicode-segmentation,https://github.com/unicode-rs/unicode-segmentation,MIT OR Apache-2.0,"kwantam , Manish Goregaokar " unicode-width,https://github.com/unicode-rs/unicode-width,MIT OR Apache-2.0,"kwantam , Manish Goregaokar " universal-hash,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers -unsafe-libyaml,https://github.com/dtolnay/unsafe-libyaml,MIT,David Tolnay url,https://github.com/servo/rust-url,MIT OR Apache-2.0,The rust-url developers utf16_iter,https://github.com/hsivonen/utf16_iter,Apache-2.0 OR MIT,Henri Sivonen utf8-width,https://github.com/magiclen/utf8-width,MIT,Magic Len diff --git a/build.rs b/build.rs index d77f0b291..1c9b43241 100644 --- a/build.rs +++ b/build.rs @@ -1,16 +1,21 @@ extern crate lalrpop; use std::{ + borrow::Cow, env, fmt::Write as fmt_write, fs::{self, File}, io::{BufRead, BufReader}, path::Path, }; +use ua_parser::device::Flag; fn main() { read_grok_patterns(); + #[cfg(feature = "stdlib")] + convert_user_agent_regexes(); + println!("cargo:rerun-if-changed=src/parser/parser.lalrpop"); lalrpop::Configuration::new() .always_use_colors() @@ -51,3 +56,74 @@ fn read_grok_patterns() { let dest_path = Path::new(&out_dir).join("patterns.rs"); fs::write(dest_path, output).expect("'patterns.rs' wasn't generated"); } + +#[cfg(feature = "stdlib")] +fn convert_user_agent_regexes() { + let regexes = fs::read("data/user_agent_regexes.yaml").expect("Could not read regexes"); + let regexes: ua_parser::Regexes = + serde_yaml::from_slice(®exes).expect("Regex file is not valid yaml"); + + fn write_item(output: &mut Vec, name: &'static str, value: Option>) { + if let Some(value) = value { + output.extend(format!(" {}: Some(r#\"{}\"#.into()),\n", name, value).bytes()); + } else { + output.extend(format!(" {}: None,\n", name).bytes()); + } + } + + let mut output = Vec::new(); + + output.extend(b"ua_parser::Regexes {\n"); + + output.extend(b"os_parsers: vec![\n"); + for os in regexes.os_parsers { + output.extend(b"#[allow(clippy::needless_raw_string_hashes)]\n"); + output.extend(b"ua_parser::os::Parser {\n"); + output.extend(format!(" regex: r#\"{}\"#.into(),\n", os.regex).bytes()); + write_item(&mut output, "os_replacement", os.os_replacement); + write_item(&mut output, "os_v1_replacement", os.os_v1_replacement); + write_item(&mut output, "os_v2_replacement", os.os_v2_replacement); + write_item(&mut output, "os_v3_replacement", os.os_v3_replacement); + write_item(&mut output, "os_v4_replacement", os.os_v4_replacement); + output.extend(b"},\n"); + } + output.extend(b"],\n"); + + output.extend(b"user_agent_parsers: vec![\n"); + for ua in regexes.user_agent_parsers { + output.extend(b"#[allow(clippy::needless_raw_string_hashes)]\n"); + output.extend(b"ua_parser::user_agent::Parser {\n"); + output.extend(format!(" regex: r#\"{}\"#.into(),\n", ua.regex).bytes()); + write_item(&mut output, "family_replacement", ua.family_replacement); + write_item(&mut output, "v1_replacement", ua.v1_replacement); + write_item(&mut output, "v2_replacement", ua.v2_replacement); + write_item(&mut output, "v3_replacement", ua.v3_replacement); + write_item(&mut output, "v4_replacement", ua.v4_replacement); + output.extend(b"},\n"); + } + output.extend(b"],\n"); + + output.extend(b"device_parsers: vec![\n"); + for device in regexes.device_parsers { + output.extend(b"#[allow(clippy::needless_raw_string_hashes)]\n"); + output.extend(b"ua_parser::device::Parser {\n"); + output.extend(format!(" regex: r#\"{}\"#.into(),\n", device.regex).bytes()); + match device.regex_flag { + Some(Flag::IgnoreCase) => { + output.extend(b" regex_flag: Some(ua_parser::device::Flag::IgnoreCase),\n"); + } + None => { + output.extend(b" regex_flag: None,\n"); + } + } + write_item(&mut output, "device_replacement", device.device_replacement); + write_item(&mut output, "brand_replacement", device.brand_replacement); + write_item(&mut output, "model_replacement", device.model_replacement); + output.extend(b"},\n"); + } + output.extend(b"],\n}\n"); + + let out_dir = env::var("OUT_DIR").expect("OUT_DIR isn't defined"); + let dest_path = Path::new(&out_dir).join("user_agent_regexes.rs"); + fs::write(dest_path, output).expect("'user_agent_regexes.rs' wasn't generated"); +} diff --git a/changelog.d/1317.enhancement.md b/changelog.d/1317.enhancement.md new file mode 100644 index 000000000..d552519e5 --- /dev/null +++ b/changelog.d/1317.enhancement.md @@ -0,0 +1,4 @@ +The `parse_user_agent` method now uses the [ua-parser](https://crates.io/crates/ua-parser) library +which is much faster than the previous library. The method's output remains unchanged. + +authors: JakubOnderka diff --git a/src/stdlib/parse_user_agent.rs b/src/stdlib/parse_user_agent.rs index 828e60f79..b7e6f7fc0 100644 --- a/src/stdlib/parse_user_agent.rs +++ b/src/stdlib/parse_user_agent.rs @@ -6,12 +6,11 @@ use std::{ str::FromStr, sync::{Arc, LazyLock}, }; -use uaparser::UserAgentParser as UAParser; use woothee::parser::Parser as WootheeParser; -static UA_PARSER: LazyLock = LazyLock::new(|| { - let regexes = include_bytes!("./../../data/user_agent_regexes.yaml"); - UAParser::from_bytes(regexes).expect("Regex file is not valid.") +static UA_EXTRACTOR: LazyLock = LazyLock::new(|| { + let regexes = include!(concat!(env!("OUT_DIR"), "/user_agent_regexes.rs")); + ua_parser::Extractor::try_from(regexes).expect("Regex file is not valid.") }); #[derive(Clone, Copy, Debug)] @@ -107,7 +106,7 @@ impl Function for ParseUserAgent { } Mode::Reliable => { let fast = WootheeParser::new(); - let slow = &UA_PARSER; + let slow = &UA_EXTRACTOR; Arc::new(move |s: &str| { let ua = fast.parse_user_agent(s); @@ -122,7 +121,7 @@ impl Function for ParseUserAgent { } Mode::Enriched => { let fast = WootheeParser::new(); - let slow = &UA_PARSER; + let slow = &UA_EXTRACTOR; Arc::new(move |s: &str| { slow.parse_user_agent(s) @@ -514,41 +513,48 @@ impl Parser for WootheeParser { } } -impl Parser for UAParser { +impl Parser for ua_parser::Extractor<'_> { fn parse_user_agent(&self, user_agent: &str) -> UserAgent { - #[inline] - fn unknown_to_none(s: Option>) -> Option { - let cow = s?; - match cow.as_ref() { - "" | "Other" => None, - _ => Some(cow.into_owned()), - } - } - - let ua = ::parse(self, user_agent); - - UserAgent { - browser: Browser { - family: unknown_to_none(Some(ua.user_agent.family)), - major: unknown_to_none(ua.user_agent.major), - minor: unknown_to_none(ua.user_agent.minor), - patch: unknown_to_none(ua.user_agent.patch), + let browser = self + .ua + .extract(user_agent) + .map(|ua| Browser { + family: Some(ua.family.into_owned()), + major: ua.major.map(Into::into), + minor: ua.minor.map(Into::into), + patch: ua.patch.map(Into::into), ..Default::default() - }, - os: Os { - family: unknown_to_none(Some(ua.os.family)), - major: unknown_to_none(ua.os.major), - minor: unknown_to_none(ua.os.minor), - patch: unknown_to_none(ua.os.patch), - patch_minor: unknown_to_none(ua.os.patch_minor), + }) + .unwrap_or_default(); + + let os = self + .os + .extract(user_agent) + .map(|os| Os { + family: Some(os.os.into_owned()), + major: os.major.map(Cow::into_owned), + minor: os.minor.map(Cow::into_owned), + patch: os.patch.map(Cow::into_owned), + patch_minor: os.patch_minor.map(Cow::into_owned), ..Default::default() - }, - device: Device { - family: unknown_to_none(Some(ua.device.family)), - brand: unknown_to_none(ua.device.brand), - model: unknown_to_none(ua.device.model), + }) + .unwrap_or_default(); + + let device = self + .dev + .extract(user_agent) + .map(|dev| Device { + family: Some(dev.device.into_owned()), + brand: dev.brand.map(Cow::into_owned), + model: dev.model.map(Cow::into_owned), ..Default::default() - }, + }) + .unwrap_or_default(); + + UserAgent { + browser, + os, + device, } } }