Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

performance(stdlib): Switch to much faster ua-parser #1317

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
performance(stdlib): Switch to much faster ua-parser
JakubOnderka committed Mar 27, 2025
commit 0277680dd0db36b9134331a1485dfaeaf0c3d6e7
64 changes: 36 additions & 28 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -110,7 +110,7 @@ stdlib = [
"dep:strip-ansi-escapes",
"dep:syslog_loose",
"dep:tokio",
"dep:uaparser",
"dep:ua-parser",
"dep:url",
"dep:utf8-width",
"dep:uuid",
@@ -192,7 +192,7 @@ syslog_loose = { version = "0.21", optional = true }
termcolor = { version = "1", optional = true }
thiserror = { version = "2", optional = true }
tracing = { version = "0.1", default-features = false }
uaparser = { version = "0.6", default-features = false, optional = true }
ua-parser = { version = "0.2", optional = true }
utf8-width = { version = "0.1", optional = true }
url = { version = "2", optional = true }
snafu = { version = "0.8", optional = true }
@@ -250,6 +250,8 @@ proptest-derive = { version = "0.5" }

[build-dependencies]
lalrpop = { version = "0.22", default-features = false }
serde_yaml = "0.9.34"
ua-parser = { version = "0.2" }

[[bench]]
name = "kind"
8 changes: 3 additions & 5 deletions LICENSE-3rdparty.csv
Original file line number Diff line number Diff line change
@@ -53,7 +53,6 @@ colorchoice,https://github.com/rust-cli/anstyle,MIT OR Apache-2.0,The colorchoic
combine,https://github.com/Marwes/combine,MIT,Markus Westerlind <[email protected]>
community-id,https://github.com/traceflight/rs-community-id,MIT OR Apache-2.0,Julian Wang <[email protected]>
concurrent-queue,https://github.com/smol-rs/concurrent-queue,Apache-2.0 OR MIT,"Stjepan Glavina <[email protected]>, Taiki Endo <[email protected]>, John Nunley <[email protected]>"
convert_case,https://github.com/rutrum/convert-case,MIT,David Purdum <[email protected]>
convert_case,https://github.com/rutrum/convert-case,MIT,rutrum <[email protected]>
core-foundation,https://github.com/servo/core-foundation-rs,MIT OR Apache-2.0,The Servo Project Developers
cpufeatures,https://github.com/RustCrypto/utils,MIT OR Apache-2.0,RustCrypto Developers
@@ -71,7 +70,6 @@ ctr,https://github.com/RustCrypto/block-modes,MIT OR Apache-2.0,RustCrypto Devel
data-encoding,https://github.com/ia0/data-encoding,MIT,Julien Cretin <[email protected]>
dbl,https://github.com/RustCrypto/utils,MIT OR Apache-2.0,RustCrypto Developers
deranged,https://github.com/jhpratt/deranged,MIT OR Apache-2.0,Jacob Pratt <[email protected]>
derive_more,https://github.com/JelteF/derive_more,MIT,Jelte Fennema <[email protected]>
digest,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers
dirs-next,https://github.com/xdg-rs/dirs,MIT OR Apache-2.0,The @xdg-rs members
dirs-sys-next,https://github.com/xdg-rs/dirs/tree/master/dirs-sys,MIT OR Apache-2.0,The @xdg-rs members
@@ -149,6 +147,7 @@ mio,https://github.com/tokio-rs/mio,MIT,"Carl Lerche <[email protected]>, Thomas
moka,https://github.com/moka-rs/moka,MIT OR Apache-2.0,The moka Authors
ndk-context,https://github.com/rust-windowing/android-ndk-rs,MIT OR Apache-2.0,The Rust Windowing contributors
nix,https://github.com/nix-rust/nix,MIT,The nix-rust Project Developers
nohash,https://github.com/tetcoin/nohash,Apache-2.0 OR MIT,Parity Technologies <[email protected]>
nom,https://github.com/Geal/nom,MIT,[email protected]
nu-ansi-term,https://github.com/nushell/nu-ansi-term,MIT,"[email protected], Ryan Scheel (Havvy) <[email protected]>, Josh Triplett <[email protected]>, The Nushell Project Developers"
num-bigint,https://github.com/rust-num/num-bigint,MIT OR Apache-2.0,The Rust Project Developers
@@ -201,6 +200,7 @@ redox_users,https://gitlab.redox-os.org/redox-os/users,MIT,"Jose Narvaez <goyox8
regex,https://github.com/rust-lang/regex,MIT OR Apache-2.0,"The Rust Project Developers, Andrew Gallant <[email protected]>"
regex-automata,https://github.com/BurntSushi/regex-automata,Unlicense OR MIT,Andrew Gallant <[email protected]>
regex-automata,https://github.com/rust-lang/regex/tree/master/regex-automata,MIT OR Apache-2.0,"The Rust Project Developers, Andrew Gallant <[email protected]>"
regex-filtered,https://github.com/ua-parser/uap-rust,BSD-3-Clause,The regex-filtered Authors
regex-syntax,https://github.com/rust-lang/regex,MIT OR Apache-2.0,The Rust Project Developers
regex-syntax,https://github.com/rust-lang/regex/tree/master/regex-syntax,MIT OR Apache-2.0,"The Rust Project Developers, Andrew Gallant <[email protected]>"
roxmltree,https://github.com/RazrFalcon/roxmltree,MIT OR Apache-2.0,Yevhenii Reizner <[email protected]>
@@ -215,7 +215,6 @@ scopeguard,https://github.com/bluss/scopeguard,MIT OR Apache-2.0,bluss
seahash,https://gitlab.redox-os.org/redox-os/seahash,MIT,"ticki <[email protected]>, Tom Almeida <[email protected]>"
serde,https://github.com/serde-rs/serde,MIT OR Apache-2.0,"Erick Tryzelaar <[email protected]>, David Tolnay <[email protected]>"
serde_json,https://github.com/serde-rs/json,MIT OR Apache-2.0,"Erick Tryzelaar <[email protected]>, David Tolnay <[email protected]>"
serde_yaml,https://github.com/dtolnay/serde-yaml,MIT OR Apache-2.0,David Tolnay <[email protected]>
sha-1,https://github.com/RustCrypto/hashes,MIT OR Apache-2.0,RustCrypto Developers
sha1,https://github.com/RustCrypto/hashes,MIT OR Apache-2.0,RustCrypto Developers
sha2,https://github.com/RustCrypto/hashes,MIT OR Apache-2.0,RustCrypto Developers
@@ -251,13 +250,12 @@ tracing-core,https://github.com/tokio-rs/tracing,MIT,Tokio Contributors <team@to
tracing-log,https://github.com/tokio-rs/tracing,MIT,Tokio Contributors <[email protected]>
tracing-subscriber,https://github.com/tokio-rs/tracing,MIT,"Eliza Weisman <[email protected]>, David Barsky <[email protected]>, Tokio Contributors <[email protected]>"
typenum,https://github.com/paholg/typenum,MIT OR Apache-2.0,"Paho Lurie-Gregg <[email protected]>, Andre Bogus <[email protected]>"
uaparser,https://github.com/davidarmstronglewis/uap-rs,MIT,Ocean Lewis
ua-parser,https://github.com/ua-parser/uap-rust,Apache-2.0,The ua-parser Authors
ucd-trie,https://github.com/BurntSushi/ucd-generate,MIT OR Apache-2.0,Andrew Gallant <[email protected]>
unicode-ident,https://github.com/dtolnay/unicode-ident,(MIT OR Apache-2.0) AND Unicode-3.0,David Tolnay <[email protected]>
unicode-segmentation,https://github.com/unicode-rs/unicode-segmentation,MIT OR Apache-2.0,"kwantam <[email protected]>, Manish Goregaokar <[email protected]>"
unicode-width,https://github.com/unicode-rs/unicode-width,MIT OR Apache-2.0,"kwantam <[email protected]>, Manish Goregaokar <[email protected]>"
universal-hash,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers
unsafe-libyaml,https://github.com/dtolnay/unsafe-libyaml,MIT,David Tolnay <[email protected]>
url,https://github.com/servo/rust-url,MIT OR Apache-2.0,The rust-url developers
utf16_iter,https://github.com/hsivonen/utf16_iter,Apache-2.0 OR MIT,Henri Sivonen <[email protected]>
utf8-width,https://github.com/magiclen/utf8-width,MIT,Magic Len <[email protected]>
76 changes: 76 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
extern crate lalrpop;

use std::{
borrow::Cow,
env,
fmt::Write as fmt_write,
fs::{self, File},
io::{BufRead, BufReader},
path::Path,
};
use ua_parser::device::Flag;

fn main() {
read_grok_patterns();

#[cfg(feature = "stdlib")]
convert_user_agent_regexes();

println!("cargo:rerun-if-changed=src/parser/parser.lalrpop");
lalrpop::Configuration::new()
.always_use_colors()
@@ -51,3 +56,74 @@ fn read_grok_patterns() {
let dest_path = Path::new(&out_dir).join("patterns.rs");
fs::write(dest_path, output).expect("'patterns.rs' wasn't generated");
}

/// Converts `data/user_agent_regexes.yaml` into Rust source at build time.
///
/// The YAML file is deserialized into `ua_parser::Regexes` and re-emitted as a
/// `ua_parser::Regexes { .. }` struct expression written to
/// `$OUT_DIR/user_agent_regexes.rs`.
///
/// Every string is emitted through its `Debug` representation, which is a
/// correctly escaped Rust string literal. A fixed `r#"…"#` wrapper would
/// produce uncompilable output as soon as a pattern contained `"#`.
///
/// # Panics
///
/// Panics when the YAML file cannot be read or deserialized, when `OUT_DIR`
/// is not set, or when the generated file cannot be written.
#[cfg(feature = "stdlib")]
fn convert_user_agent_regexes() {
    use std::fmt::Write as _;

    const REGEXES_PATH: &str = "data/user_agent_regexes.yaml";

    /// Appends the mandatory `regex` field initializer.
    fn write_regex(output: &mut String, regex: &str) {
        writeln!(output, " regex: {:?}.into(),", regex).expect("writing to a String cannot fail");
    }

    /// Appends one optional field initializer: `name: Some("…".into()),` or `name: None,`.
    fn write_item(output: &mut String, name: &'static str, value: Option<Cow<str>>) {
        match value {
            Some(value) => writeln!(output, " {}: Some({:?}.into()),", name, &*value),
            None => writeln!(output, " {}: None,", name),
        }
        .expect("writing to a String cannot fail");
    }

    // Once any `rerun-if-changed` line is emitted, Cargo stops rescanning the whole
    // package, so the regex data file has to be registered explicitly or edits to it
    // would leave a stale generated file behind.
    println!("cargo:rerun-if-changed={}", REGEXES_PATH);

    let yaml = fs::read(REGEXES_PATH).expect("Could not read regexes");
    let regexes: ua_parser::Regexes =
        serde_yaml::from_slice(&yaml).expect("Regex file is not valid yaml");

    let mut output = String::new();
    output.push_str("ua_parser::Regexes {\n");

    output.push_str("os_parsers: vec![\n");
    for os in regexes.os_parsers {
        output.push_str("ua_parser::os::Parser {\n");
        write_regex(&mut output, &os.regex);
        write_item(&mut output, "os_replacement", os.os_replacement);
        write_item(&mut output, "os_v1_replacement", os.os_v1_replacement);
        write_item(&mut output, "os_v2_replacement", os.os_v2_replacement);
        write_item(&mut output, "os_v3_replacement", os.os_v3_replacement);
        write_item(&mut output, "os_v4_replacement", os.os_v4_replacement);
        output.push_str("},\n");
    }
    output.push_str("],\n");

    output.push_str("user_agent_parsers: vec![\n");
    for ua in regexes.user_agent_parsers {
        output.push_str("ua_parser::user_agent::Parser {\n");
        write_regex(&mut output, &ua.regex);
        write_item(&mut output, "family_replacement", ua.family_replacement);
        write_item(&mut output, "v1_replacement", ua.v1_replacement);
        write_item(&mut output, "v2_replacement", ua.v2_replacement);
        write_item(&mut output, "v3_replacement", ua.v3_replacement);
        write_item(&mut output, "v4_replacement", ua.v4_replacement);
        output.push_str("},\n");
    }
    output.push_str("],\n");

    output.push_str("device_parsers: vec![\n");
    for device in regexes.device_parsers {
        output.push_str("ua_parser::device::Parser {\n");
        write_regex(&mut output, &device.regex);
        // Spelled out per variant (no catch-all) so that a new `Flag` variant
        // becomes a compile error here instead of being silently dropped.
        match device.regex_flag {
            Some(Flag::IgnoreCase) => {
                output.push_str(" regex_flag: Some(ua_parser::device::Flag::IgnoreCase),\n");
            }
            None => {
                output.push_str(" regex_flag: None,\n");
            }
        }
        write_item(&mut output, "device_replacement", device.device_replacement);
        write_item(&mut output, "brand_replacement", device.brand_replacement);
        write_item(&mut output, "model_replacement", device.model_replacement);
        output.push_str("},\n");
    }
    output.push_str("],\n}\n");

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR isn't defined");
    let dest_path = Path::new(&out_dir).join("user_agent_regexes.rs");
    fs::write(dest_path, output).expect("'user_agent_regexes.rs' wasn't generated");
}
4 changes: 4 additions & 0 deletions changelog.d/1317.enhancement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Improved the performance of the `parse_user_agent` function in the `enriched` and `reliable` modes by switching to a faster user agent parsing library.
The function's output remains unchanged because the new library uses the same regex data.

authors: JakubOnderka
80 changes: 43 additions & 37 deletions src/stdlib/parse_user_agent.rs
Original file line number Diff line number Diff line change
@@ -6,12 +6,11 @@ use std::{
str::FromStr,
sync::{Arc, LazyLock},
};
use uaparser::UserAgentParser as UAParser;
use woothee::parser::Parser as WootheeParser;

static UA_PARSER: LazyLock<UAParser> = LazyLock::new(|| {
let regexes = include_bytes!("./../../data/user_agent_regexes.yaml");
UAParser::from_bytes(regexes).expect("Regex file is not valid.")
static UA_EXTRACTOR: LazyLock<ua_parser::Extractor> = LazyLock::new(|| {
let regexes = include!(concat!(env!("OUT_DIR"), "/user_agent_regexes.rs"));
ua_parser::Extractor::try_from(regexes).expect("Regex file is not valid.")
});

#[derive(Clone, Copy, Debug)]
@@ -107,7 +106,7 @@ impl Function for ParseUserAgent {
}
Mode::Reliable => {
let fast = WootheeParser::new();
let slow = &UA_PARSER;
let slow = &UA_EXTRACTOR;

Arc::new(move |s: &str| {
let ua = fast.parse_user_agent(s);
@@ -122,7 +121,7 @@ impl Function for ParseUserAgent {
}
Mode::Enriched => {
let fast = WootheeParser::new();
let slow = &UA_PARSER;
let slow = &UA_EXTRACTOR;

Arc::new(move |s: &str| {
slow.parse_user_agent(s)
@@ -514,41 +513,48 @@ impl Parser for WootheeParser {
}
}

impl Parser for UAParser {
impl Parser for ua_parser::Extractor<'_> {
fn parse_user_agent(&self, user_agent: &str) -> UserAgent {
#[inline]
fn unknown_to_none(s: Option<Cow<'_, str>>) -> Option<String> {
let cow = s?;
match cow.as_ref() {
"" | "Other" => None,
_ => Some(cow.into_owned()),
}
}

let ua = <UAParser as uaparser::Parser>::parse(self, user_agent);

UserAgent {
browser: Browser {
family: unknown_to_none(Some(ua.user_agent.family)),
major: unknown_to_none(ua.user_agent.major),
minor: unknown_to_none(ua.user_agent.minor),
patch: unknown_to_none(ua.user_agent.patch),
let browser = self
.ua
.extract(user_agent)
.map(|ua| Browser {
family: Some(ua.family.into_owned()),
major: ua.major.map(Into::into),
minor: ua.minor.map(Into::into),
patch: ua.patch.map(Into::into),
..Default::default()
},
os: Os {
family: unknown_to_none(Some(ua.os.family)),
major: unknown_to_none(ua.os.major),
minor: unknown_to_none(ua.os.minor),
patch: unknown_to_none(ua.os.patch),
patch_minor: unknown_to_none(ua.os.patch_minor),
})
.unwrap_or_default();

let os = self
.os
.extract(user_agent)
.map(|os| Os {
family: Some(os.os.into_owned()),
major: os.major.map(Cow::into_owned),
minor: os.minor.map(Cow::into_owned),
patch: os.patch.map(Cow::into_owned),
patch_minor: os.patch_minor.map(Cow::into_owned),
..Default::default()
},
device: Device {
family: unknown_to_none(Some(ua.device.family)),
brand: unknown_to_none(ua.device.brand),
model: unknown_to_none(ua.device.model),
})
.unwrap_or_default();

let device = self
.dev
.extract(user_agent)
.map(|dev| Device {
family: Some(dev.device.into_owned()),
brand: dev.brand.map(Cow::into_owned),
model: dev.model.map(Cow::into_owned),
..Default::default()
},
})
.unwrap_or_default();

UserAgent {
browser,
os,
device,
}
}
}