diff --git a/.gitignore b/.gitignore index 45839f3..05f2e54 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ /target /smith_waterman_macro/target -/benches/match_list/data.txt +/benches/data diff --git a/Cargo.lock b/Cargo.lock index 6656824..8640350 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -34,9 +34,9 @@ checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anyhow" -version = "1.0.101" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "autocfg" @@ -46,15 +46,15 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "bitflags" -version = "2.10.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "bumpalo" -version = "3.19.1" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "cast" @@ -118,18 +118,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.58" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63be97961acde393029492ce0be7a1af7e323e6bae9511ebfac33751be5e6806" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.58" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7f13174bda5dfd69d7e947827e5af4b0f2f94a4a3ee92912fba07a66150f21e2" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ "anstyle", "clap_lex", @@ -253,11 +253,39 @@ dependencies = [ "serde", ] +[[package]] +name = "frizbee-node" +version = "0.8.2" +dependencies = [ + "frizbee", + "neon", + "serde", +] + +[[package]] +name = "frizbee-python" +version = "0.8.2" +dependencies = [ + "frizbee", + "pyo3", +] + [[package]] name = "getrandom" -version = "0.4.1" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", @@ -343,9 +371,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "js-sys" -version = "0.3.85" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", @@ -363,12 +391,42 @@ version = "0.2.182" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libm" version = "0.2.16" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "linkme" +version = "0.3.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e3283ed2d0e50c06dd8602e0ab319bb048b6325d0bba739db64ed8205179898" +dependencies = [ + "linkme-impl", +] + +[[package]] +name = "linkme-impl" +version = "0.3.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5cec0ec4228b4853bb129c84dbf093a27e6c7a20526da046defc334a1b017f7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "lock_api" version = "0.4.14" @@ -390,6 +448,36 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "neon" +version = "1.2.0-alpha.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b369318a3f63d9fbc7e663bb04be2fbf13ab4c8bbc8b185ea9dc7988529295e8" +dependencies = [ + "either", + "getrandom 0.2.17", + "libloading", + "linkme", + "neon-macros", + "once_cell", + "semver", + "send_wrapper", + "serde", + "serde_json", + "smallvec", +] + +[[package]] +name = "neon-macros" +version = "1.2.0-alpha.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89e910c030108900f00779b4326f02c022b28e765382d32487b6d091a6cad48" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "nucleo" version = "0.5.0" @@ -494,6 +582,12 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "prettyplease" version = "0.2.37" @@ -513,20 +607,88 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pyo3" +version = "0.28.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf85e27e86080aafd5a22eae58a162e133a589551542b3e5cee4beb27e54f8e1" +dependencies = [ + "libc", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", +] + +[[package]] +name = "pyo3-build-config" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf94ee265674bf76c09fa430b0e99c26e319c945d96ca0d5a8215f31bf81cf7" +dependencies = [ + "python3-dll-a", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "491aa5fc66d8059dd44a75f4580a2962c1862a1c2945359db36f6c2818b748dc" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5d671734e9d7a43449f8480f8b38115df67bef8d21f76837fa75ee7aaa5e52e" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22faaa1ce6c430a1f71658760497291065e6450d7b5dc2bcf254d49f66ee700a" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "python3-dll-a" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d381ef313ae70b4da5f95f8a4de773c6aa5cd28f73adec4b4a31df70b66780d8" +dependencies = [ + "cc", +] + [[package]] name = "quote" -version = "1.0.44" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] [[package]] name = "r-efi" -version = "5.3.0" +version = "6.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "rand" @@ -535,7 +697,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" dependencies = [ "chacha20", - "getrandom", + "getrandom 0.4.2", "rand_core", ] @@ -618,9 +780,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "rustversion" @@ -649,6 +811,12 @@ version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +[[package]] +name = "send_wrapper" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73" + [[package]] name = "serde" version = "1.0.228" @@ -706,15 +874,21 @@ checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] name = "syn" -version = "2.0.115" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e614ed320ac28113fa64972c4262d5dbc89deacdfd00c34a3e4cea073243c12" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "target-lexicon" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" + [[package]] name = "tinytemplate" version = "1.2.1" @@ -727,9 
+901,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.23" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" @@ -753,6 +927,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + [[package]] name = "wasip2" version = "1.0.2+wasi-0.2.9" @@ -773,9 +953,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.108" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", @@ -786,9 +966,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.108" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -796,9 +976,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.108" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ "bumpalo", "proc-macro2", @@ -809,9 +989,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.108" +version = "0.2.114" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] @@ -852,9 +1032,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.85" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" +checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" dependencies = [ "js-sys", "wasm-bindgen", @@ -996,18 +1176,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.39" +version = "0.8.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.39" +version = "0.8.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index b62a011..8cb17a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,31 +1,13 @@ -[package] -name = "frizbee" -description = "Fast typo-resistant fuzzy matching via SIMD smith waterman, similar algorithm to FZF/FZY" -license = "MIT" -version = "0.8.2" -edition = "2024" -repository = "https://github.com/saghen/frizbee" +[workspace] +resolver = "2" +members = [ + "frizbee", + "frizbee-node", + "frizbee-python", +] + +[workspace.dependencies] +frizbee = { path = "./frizbee" } [profile.release] lto = true - -[dev-dependencies] -criterion = "0.8" -nucleo = "0.5.0" -rand = "0.10" -rand_distr = "0.6" - -[[bench]] -name = 
"lib" -harness = false - -[lib] -bench = false - -[dependencies] -itertools = "0.14" # k-way merge -raw-cpuid = "11.6" # runtime feature detection -serde = { version = "1.0", features = ["derive"], optional = true } - -[features] -serde = ["dep:serde"] diff --git a/README.md b/README.md index a641a72..8179256 100644 --- a/README.md +++ b/README.md @@ -53,13 +53,13 @@ bitmask: 0b00001000 // movemask(mask) bitmask > 0 // needle found in haystack, check next needle char ``` -See the full implementation in [`src/prefilter/x86_64/avx2.rs`](src/prefilter/x86_64/avx2.rs). When 256-bit SIMD is not available (no AVX2 or ARM), we simply check the uppercase and lowercase separately. +See the full implementation in [`src/prefilter/x86_64/avx2.rs`](frizbee/src/prefilter/x86_64/avx2.rs). When 256-bit SIMD is not available (no AVX2 or ARM), we simply check the uppercase and lowercase separately. ### Smith Waterman The [Smith Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) performs local sequence alignment ([explanation](https://kaell.se/bibook/pairwise/waterman.html)), originally designed to find similar sequences between two DNA strings. The algorithm's time and space complexity of O(nm) led to plenty of research on parallelization. Each cell in the matrix has a data dependency on the cell to the left, up, and left-up diagonal. For biology, DNA sequences are typically quite large (m > 1000), so most of the parallelization approaches focused on large matrices ([see this paper for common parallelization techniques](https://pmc.ncbi.nlm.nih.gov/articles/PMC8419822)). -As a fuzzy matcher, the matrices in Frizbee are typically much smaller than those in DNA alignment (m < 128). Frizbee uses an approach similar to [sequential layout](https://pmc.ncbi.nlm.nih.gov/articles/PMC8419822/#Sec11), except the horizontal (vertical in the paper, but flipped in frizbee) data dependency [is applied immediately](src/smith_waterman/simd/gaps.rs). 
This approach supports [affine gaps](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm#Affine). +As a fuzzy matcher, the matrices in Frizbee are typically much smaller than those in DNA alignment (m < 128). Frizbee uses an approach similar to [sequential layout](https://pmc.ncbi.nlm.nih.gov/articles/PMC8419822/#Sec11), except the horizontal (vertical in the paper, but flipped in frizbee) data dependency [is applied immediately](frizbee/src/smith_waterman/simd/gaps.rs). This approach supports [affine gaps](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm#Affine). ``` needle: "foo" @@ -120,3 +120,7 @@ The parallel implementation uses work-stealing to distribute the work across thr - `CAPITALIZATION_BONUS`: Bonus for matching a capital letter after a lowercase letter (e.g. "b" on "fooBar" will receive a bonus on "B") - `MATCHING_CASE_BONUS`: Bonus for matching the case of the needle (e.g. "WorLd" on "WoRld" will receive a bonus on "W", "o", "d") - `EXACT_MATCH_BONUS`: Bonus for matching the exact needle (e.g. 
"foo" on "foo" will receive the bonus) + +## License + +MIT diff --git a/frizbee-node/.gitignore b/frizbee-node/.gitignore new file mode 100644 index 0000000..1c8437e --- /dev/null +++ b/frizbee-node/.gitignore @@ -0,0 +1,2 @@ +index.node +node_modules diff --git a/frizbee-node/Cargo.toml b/frizbee-node/Cargo.toml new file mode 100644 index 0000000..e395090 --- /dev/null +++ b/frizbee-node/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "frizbee-node" +version = "0.8.2" +description = "Fast typo-resistant fuzzy matching via SIMD smith waterman, similar algorithm to FZF/FZY" +authors = ["Liam Dyer "] +license = "MIT" +edition = "2024" +exclude = ["index.node"] + +[lib] +crate-type = ["cdylib"] + +[dependencies] +frizbee = { workspace = true, features = ["serde"] } +neon = { version = "1.2.0-alpha.0", features = ["napi-8", "serde"] } +serde = { version = "1.0.228", features = ["derive"] } diff --git a/frizbee-node/index.cjs b/frizbee-node/index.cjs new file mode 100644 index 0000000..4c1814f --- /dev/null +++ b/frizbee-node/index.cjs @@ -0,0 +1,5 @@ +const { Matcher, matchList, matchListIndices } = require("./index.node"); + +module.exports.Matcher = Matcher; +module.exports.matchList = matchList; +module.exports.matchListIndices = matchListIndices; diff --git a/frizbee-node/index.d.ts b/frizbee-node/index.d.ts new file mode 100644 index 0000000..b80e85a --- /dev/null +++ b/frizbee-node/index.d.ts @@ -0,0 +1,52 @@ +declare module "frizbee" { + function matchList( + needle: string, + haystacks: string[], + config?: Config, + ): Match[]; + + function matchListIndices( + needle: string, + haystacks: string[], + config?: Config, + ): MatchIndices[]; + + export class Matcher { + constructor(needle: string, config?: Config); + setNeedle(needle: string): void; + setConfig(config: Config): void; + matchList(haystacks: string[]): Match[]; + matchListIndices(haystacks: string[]): MatchIndices[]; + } + + export interface Config { + maxTypos?: number; + sort?: boolean; + 
scoring?: Scoring; + } + + export interface Scoring { + matchScore?: number; + mismatchPenalty?: number; + gapOpenPenalty?: number; + gapExtendPenalty?: number; + prefixBonus?: number; + capitalizationBonus?: number; + matchingCaseBonus?: number; + exactMatchBonus?: number; + delimiterBonus?: number; + } + + export interface Match { + index: number; + score: number; + exact: boolean; + } + + export interface MatchIndices { + score: number; + index: number; + exact: boolean; + indices: number[]; + } +} diff --git a/frizbee-node/package-lock.json b/frizbee-node/package-lock.json new file mode 100644 index 0000000..ff38530 --- /dev/null +++ b/frizbee-node/package-lock.json @@ -0,0 +1,26 @@ +{ + "name": "frizbee", + "version": "0.8.2", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "frizbee", + "version": "0.8.2", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "cargo-cp-artifact": "^0.1" + } + }, + "node_modules/cargo-cp-artifact": { + "version": "0.1.9", + "resolved": "https://registry.npmjs.org/cargo-cp-artifact/-/cargo-cp-artifact-0.1.9.tgz", + "integrity": "sha512-6F+UYzTaGB+awsTXg0uSJA1/b/B3DDJzpKVRu0UmyI7DmNeaAl2RFHuTGIN6fEgpadRxoXGb7gbC1xo4C3IdyA==", + "license": "MIT", + "bin": { + "cargo-cp-artifact": "bin/cargo-cp-artifact.js" + } + } + } +} diff --git a/frizbee-node/package.json b/frizbee-node/package.json new file mode 100644 index 0000000..bc8ea29 --- /dev/null +++ b/frizbee-node/package.json @@ -0,0 +1,33 @@ +{ + "name": "frizbee", + "version": "0.8.2", + "description": "Fast typo-resistant fuzzy matching via SIMD smith waterman, similar algorithm to FZF/FZY", + "author": "Liam Dyer ", + "license": "MIT", + "main": "index.cjs", + "types": "index.d.ts", + "scripts": { + "build": "cargo-cp-artifact -a cdylib frizbee_node index.node -- cargo build --release --message-format=json-render-diagnostics", + "install": "npm run build" + }, + "dependencies": { + "cargo-cp-artifact": "^0.1" + }, + "keywords": [ + 
"fuzzy", + "matching", + "fzf", + "fzy", + "rust", + "simd", + "smith-waterman" + ], + "homepage": "https://github.com/saghen/frizbee#readme", + "bugs": { + "url": "https://github.com/saghen/frizbee/issues" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/saghen/frizbee.git" + } +} diff --git a/frizbee-node/src/config.rs b/frizbee-node/src/config.rs new file mode 100644 index 0000000..04ff7d6 --- /dev/null +++ b/frizbee-node/src/config.rs @@ -0,0 +1,66 @@ +#[derive(Clone, serde::Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct PartialConfig { + pub max_typos: Option, + pub sort: Option, + pub scoring: Option, +} + +impl From for frizbee::Config { + fn from(partial: PartialConfig) -> Self { + let default_config = frizbee::Config::default(); + frizbee::Config { + max_typos: Some(partial.max_typos.unwrap_or(0)), + sort: partial.sort.unwrap_or(default_config.sort), + scoring: partial + .scoring + .map(Into::into) + .unwrap_or(default_config.scoring), + } + } +} + +#[derive(Clone, serde::Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct PartialScoring { + pub match_score: Option, + pub mismatch_penalty: Option, + pub gap_open_penalty: Option, + pub gap_extend_penalty: Option, + pub prefix_bonus: Option, + pub capitalization_bonus: Option, + pub matching_case_bonus: Option, + pub exact_match_bonus: Option, + pub delimiter_bonus: Option, +} + +impl From for frizbee::Scoring { + fn from(partial: PartialScoring) -> Self { + let default_scoring = frizbee::Scoring::default(); + frizbee::Scoring { + match_score: partial.match_score.unwrap_or(default_scoring.match_score), + mismatch_penalty: partial + .mismatch_penalty + .unwrap_or(default_scoring.mismatch_penalty), + gap_open_penalty: partial + .gap_open_penalty + .unwrap_or(default_scoring.gap_open_penalty), + gap_extend_penalty: partial + .gap_extend_penalty + .unwrap_or(default_scoring.gap_extend_penalty), + prefix_bonus: 
partial.prefix_bonus.unwrap_or(default_scoring.prefix_bonus), + capitalization_bonus: partial + .capitalization_bonus + .unwrap_or(default_scoring.capitalization_bonus), + matching_case_bonus: partial + .matching_case_bonus + .unwrap_or(default_scoring.matching_case_bonus), + exact_match_bonus: partial + .exact_match_bonus + .unwrap_or(default_scoring.exact_match_bonus), + delimiter_bonus: partial + .delimiter_bonus + .unwrap_or(default_scoring.delimiter_bonus), + } + } +} diff --git a/frizbee-node/src/lib.rs b/frizbee-node/src/lib.rs new file mode 100644 index 0000000..73a07a8 --- /dev/null +++ b/frizbee-node/src/lib.rs @@ -0,0 +1,65 @@ +use neon::types::extract::Json; + +mod config; +use config::PartialConfig as Config; + +#[derive(Clone)] +pub struct Matcher { + inner: frizbee::Matcher, +} + +#[neon::export] +fn match_list( + needle: String, + haystacks: Json>, + config: Option>, +) -> Json> { + Json(frizbee::match_list( + &needle, + &haystacks.0, + &config.map(|c| c.0.into()).unwrap_or_default(), + )) +} + +#[neon::export] +fn match_list_indices( + needle: String, + haystacks: Json>, + config: Option>, +) -> Json> { + Json(frizbee::match_list_indices( + &needle, + &haystacks.0, + &config.map(|c| c.0.into()).unwrap_or_default(), + )) +} + +#[neon::export(class)] +impl Matcher { + pub fn new(needle: String, config: Option>) -> Self { + Self { + inner: frizbee::Matcher::new( + needle.as_str(), + &config.map(|c| c.0.into()).unwrap_or_default(), + ), + } + } + + fn set_needle(&mut self, needle: String) { + self.inner.set_needle(needle.as_str()); + } + + fn set_config(&mut self, config: Json) { + self.inner.set_config(&config.0.into()); + } + + #[neon(json)] + fn match_list(&mut self, haystacks: Vec) -> Vec { + self.inner.match_list(&haystacks) + } + + #[neon(json)] + fn match_list_indices(&mut self, haystacks: Vec) -> Vec { + self.inner.match_list_indices(&haystacks) + } +} diff --git a/frizbee-python/.gitignore b/frizbee-python/.gitignore new file mode 100644 index 
0000000..8e980de --- /dev/null +++ b/frizbee-python/.gitignore @@ -0,0 +1,2 @@ +.venv +frizbee/frizbee.cpython* diff --git a/frizbee-python/Cargo.toml b/frizbee-python/Cargo.toml new file mode 100644 index 0000000..a55a4c0 --- /dev/null +++ b/frizbee-python/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "frizbee-python" +version = "0.8.2" +edition = "2024" + +description = "Fast typo-resistant fuzzy matching via SIMD smith waterman, similar algorithm to FZF/FZY" +repository = "https://github.com/saghen/frizbee" +license = "MIT" + +[lib] +crate-type = ["cdylib"] +name = "frizbee_python" + +[dependencies] +frizbee.workspace = true +pyo3 = { version = "0.28", features = ["extension-module", "generate-import-lib"] } diff --git a/frizbee-python/README.md b/frizbee-python/README.md new file mode 100644 index 0000000..8787f89 --- /dev/null +++ b/frizbee-python/README.md @@ -0,0 +1,120 @@ +# frizbee-rs + +Python bindings for [frizbee](https://github.com/saghen/frizbee), a fast SIMD fuzzy string matcher written in Rust. + +Frizbee uses Smith-Waterman with affine gaps for typo-resistant fuzzy matching, similar to FZF/FZY but faster. In benchmarks it outperforms [nucleo](https://github.com/helix-editor/nucleo) by ~1.7x and [fzf](https://github.com/junegunn/fzf) by ~2.1x. + +## Installation + +```bash +pip install frizbee +``` + +## Quick Start + +```python +import frizbee + +results = frizbee.match_list("fBr", ["fooBar", "foo_bar", "prelude", "println!"]) +for m in results: + print(f"index={m.index}, score={m.score}, exact={m.exact}") +# index=0, score=53, exact=False +# index=1, score=48, exact=False +``` + +## API + +### Functions + +#### `match_list(needle, haystacks, config=None) -> list[Match]` + +Fuzzy match a needle against a list of haystacks. Returns matches sorted by score (descending) by default. 
+ +```python +results = frizbee.match_list("foo", ["foobar", "baz", "foo"]) +# [Match(score=68, index=2, exact=True), Match(score=60, index=0, exact=False)] +``` + +#### `match_list_indices(needle, haystacks, config=None) -> list[MatchIndices]` + +Like `match_list` but also returns the indices of matched characters in each haystack (in reverse order). + +```python +results = frizbee.match_list_indices("fb", ["foobar"]) +# [MatchIndices(score=29, index=0, exact=False, indices=[3, 0])] +``` + +#### `match_list_parallel(needle, haystacks, config=None, threads=None) -> list[Match]` + +Multithreaded version of `match_list`. Defaults to all available cores. + +```python +results = frizbee.match_list_parallel("query", big_list, threads=8) +``` + +### Classes + +#### `Matcher(needle, config=None)` + +Stateful matcher for reusing a needle against multiple haystack lists. + +```python +m = frizbee.Matcher("foo") +results1 = m.match_list(["foobar", "baz"]) +results2 = m.match_list(["food", "drink"]) + +m.set_needle("bar") # change needle +m.set_config(frizbee.Config(max_typos=1)) # change config +``` + +#### `Config(max_typos=0, sort=True, scoring=None)` + +| Parameter | Default | Description | +|-------------|---------|-----------------------------------------------------------------| +| `max_typos` | `0` | Max missing chars before filtering out. `None` = unlimited. | +| `sort` | `True` | Sort results by score (descending). | +| `scoring` | default | `Scoring` instance for fine-grained control. | + +```python +# Allow typos +cfg = frizbee.Config(max_typos=2) + +# Disable sorting +cfg = frizbee.Config(sort=False) +``` + +#### `Scoring(...)` + +All parameters are optional and default to frizbee's built-in values. + +| Parameter | Default | Description | +|------------------------|---------|------------------------------------------------| +| `match_score` | 12 | Score for a matching character. | +| `mismatch_penalty` | 6 | Penalty for a substitution. 
| +| `gap_open_penalty` | 5 | Penalty for opening a gap. | +| `gap_extend_penalty` | 1 | Penalty for extending a gap. | +| `prefix_bonus` | 12 | Bonus for matching the first character. | +| `capitalization_bonus` | 4 | Bonus for matching camelCase boundaries. | +| `matching_case_bonus` | 4 | Bonus for matching the case of the needle. | +| `exact_match_bonus` | 8 | Bonus for an exact match. | +| `delimiter_bonus` | 4 | Bonus for matching after a delimiter (`_` etc).| + +#### `Match` + +| Attribute | Type | Description | +|-----------|--------|---------------------------------------------| +| `score` | `int` | Match score (higher is better). | +| `index` | `int` | Index in the original haystack list. | +| `exact` | `bool` | Whether the needle matched exactly. | + +#### `MatchIndices` + +Same as `Match` plus: + +| Attribute | Type | Description | +|-----------|-------------|--------------------------------------------------| +| `indices` | `list[int]` | Positions of matched characters in the haystack. 
| + +## License + +MIT diff --git a/frizbee-python/frizbee/__init__.py b/frizbee-python/frizbee/__init__.py new file mode 100644 index 0000000..37bf3de --- /dev/null +++ b/frizbee-python/frizbee/__init__.py @@ -0,0 +1,21 @@ +from .frizbee import ( + Config, + Match, + Matcher, + MatchIndices, + Scoring, + match_list, + match_list_indices, + match_list_parallel, +) + +__all__ = [ + "Config", + "Match", + "MatchIndices", + "Matcher", + "Scoring", + "match_list", + "match_list_indices", + "match_list_parallel", +] diff --git a/frizbee-python/frizbee/__init__.pyi b/frizbee-python/frizbee/__init__.pyi new file mode 100644 index 0000000..cbff44e --- /dev/null +++ b/frizbee-python/frizbee/__init__.pyi @@ -0,0 +1,155 @@ +class Scoring: + """Scoring parameters for the Smith-Waterman fuzzy matching algorithm.""" + + match_score: int + """Score for a matching character between needle and haystack.""" + mismatch_penalty: int + """Penalty for a mismatch (substitution).""" + gap_open_penalty: int + """Penalty for opening a gap (deletion/insertion).""" + gap_extend_penalty: int + """Penalty for extending a gap (deletion/insertion).""" + prefix_bonus: int + """Bonus for matching the first character of the haystack.""" + capitalization_bonus: int + """Bonus for matching a capital letter after a lowercase letter.""" + matching_case_bonus: int + """Bonus for matching the case of the needle.""" + exact_match_bonus: int + """Bonus for matching the exact needle.""" + delimiter_bonus: int + """Bonus for matching after a delimiter character.""" + + def __init__( + self, + match_score: int | None = None, + mismatch_penalty: int | None = None, + gap_open_penalty: int | None = None, + gap_extend_penalty: int | None = None, + prefix_bonus: int | None = None, + capitalization_bonus: int | None = None, + matching_case_bonus: int | None = None, + exact_match_bonus: int | None = None, + delimiter_bonus: int | None = None, + ) -> None: + """Create a Scoring config. 
All parameters default to frizbee's defaults.""" + ... + +class Config: + """Configuration for fuzzy matching.""" + + max_typos: int | None + """Maximum number of typos (missing chars) before filtering out. None = unlimited.""" + sort: bool + """Whether to sort results by score (descending).""" + scoring: Scoring + """Scoring parameters for the algorithm.""" + + def __init__( + self, + max_typos: int | None = 0, + sort: bool = True, + scoring: Scoring | None = None, + ) -> None: + """Create a Config. Defaults match frizbee's defaults (max_typos=0, sort=True).""" + ... + +class Match: + """A single fuzzy match result.""" + + score: int + """Match score (higher is better).""" + index: int + """Index of the matched item in the original haystack list.""" + exact: bool + """Whether the needle matched the haystack exactly.""" + +class MatchIndices: + """A fuzzy match result with character match indices.""" + + score: int + """Match score (higher is better).""" + index: int + """Index of the matched item in the original haystack list.""" + exact: bool + """Whether the needle matched the haystack exactly.""" + indices: list[int] + """Indices of matched characters in the haystack (in reverse order).""" + +class Matcher: + """Stateful fuzzy matcher. Reuse for matching one needle against many haystacks.""" + + def __init__(self, needle: str, config: Config | None = None) -> None: + """Create a new Matcher for the given needle.""" + ... + def set_needle(self, needle: str) -> None: + """Update the needle to match against.""" + ... + def set_config(self, config: Config) -> None: + """Update the matching configuration.""" + ... + def match_list(self, haystacks: list[str]) -> list[Match]: + """Match the needle against a list of haystacks. + + Returns a list of Match objects for items that matched. + Results are sorted by score (descending) if config.sort is True. + """ + ... 
+ def match_list_indices(self, haystacks: list[str]) -> list[MatchIndices]: + """Match the needle against a list of haystacks, returning match indices. + + Returns a list of MatchIndices objects with the positions of matched characters. + Results are sorted by score (descending) if config.sort is True. + """ + ... + +def match_list( + needle: str, + haystacks: list[str], + config: Config | None = None, +) -> list[Match]: + """Fuzzy match a needle against a list of haystacks. + + Returns a list of Match objects for items that matched, sorted by score + (descending) by default. + + Args: + needle: The search string. + haystacks: List of strings to match against. + config: Optional matching configuration. + """ + ... + +def match_list_indices( + needle: str, + haystacks: list[str], + config: Config | None = None, +) -> list[MatchIndices]: + """Fuzzy match a needle against a list of haystacks, returning match indices. + + Returns a list of MatchIndices objects with the positions of matched characters. + + Args: + needle: The search string. + haystacks: List of strings to match against. + config: Optional matching configuration. + """ + ... + +def match_list_parallel( + needle: str, + haystacks: list[str], + config: Config | None = None, + threads: int | None = None, +) -> list[Match]: + """Fuzzy match a needle against a list of haystacks using multiple threads. + + Like match_list but parallelized for large haystack lists. + + Args: + needle: The search string. + haystacks: List of strings to match against. + config: Optional matching configuration. + threads: Number of threads to use. Defaults to available parallelism. + """ + ... 
diff --git a/frizbee-python/frizbee/py.typed b/frizbee-python/frizbee/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/frizbee-python/pyproject.toml b/frizbee-python/pyproject.toml new file mode 100644 index 0000000..5b7766f --- /dev/null +++ b/frizbee-python/pyproject.toml @@ -0,0 +1,40 @@ +[project] +name = "frizbee" +description = "Python bindings for frizbee, a fast SIMD fuzzy matcher" +readme = "README.md" +requires-python = ">=3.12" +license = "MIT" +authors = [ + { name = "Philipp Temminghoff", email = "philipp.temminghoff@gmail.com" }, + { name = "Liam Dyer", email = "liamcdyer@gmail.com" }, +] +keywords = ["fuzzy", "fzf", "fzy", "matching", "rust", "simd", "smith-waterman"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Rust", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing", +] +dependencies = [] +dynamic = ["version"] + +[project.urls] +Homepage = "https://github.com/saghen/frizbee" +Issues = "https://github.com/saghen/frizbee/issues" +Repository = "https://github.com/saghen/frizbee" + +[project.optional-dependencies] +dev = ["pytest>=8.0"] + +[build-system] +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" + +[tool.maturin] +features = ["pyo3/extension-module"] diff --git a/frizbee-python/src/binding.rs b/frizbee-python/src/binding.rs new file mode 100644 index 0000000..846b024 --- /dev/null +++ b/frizbee-python/src/binding.rs @@ -0,0 +1,321 @@ +use pyo3::prelude::*; + +// ── Scoring ────────────────────────────────────────────────────────── + +#[pyclass(name = "Scoring", frozen, from_py_object)] +#[derive(Debug, Clone)] +pub struct PyScoring { + #[pyo3(get)] + pub match_score: u16, + #[pyo3(get)] 
+ pub mismatch_penalty: u16, + #[pyo3(get)] + pub gap_open_penalty: u16, + #[pyo3(get)] + pub gap_extend_penalty: u16, + #[pyo3(get)] + pub prefix_bonus: u16, + #[pyo3(get)] + pub capitalization_bonus: u16, + #[pyo3(get)] + pub matching_case_bonus: u16, + #[pyo3(get)] + pub exact_match_bonus: u16, + #[pyo3(get)] + pub delimiter_bonus: u16, +} + +#[pymethods] +impl PyScoring { + #[new] + #[pyo3(signature = ( + match_score = None, + mismatch_penalty = None, + gap_open_penalty = None, + gap_extend_penalty = None, + prefix_bonus = None, + capitalization_bonus = None, + matching_case_bonus = None, + exact_match_bonus = None, + delimiter_bonus = None, + ))] + fn new( + match_score: Option, + mismatch_penalty: Option, + gap_open_penalty: Option, + gap_extend_penalty: Option, + prefix_bonus: Option, + capitalization_bonus: Option, + matching_case_bonus: Option, + exact_match_bonus: Option, + delimiter_bonus: Option, + ) -> Self { + let defaults = frizbee::Scoring::default(); + PyScoring { + match_score: match_score.unwrap_or(defaults.match_score), + mismatch_penalty: mismatch_penalty.unwrap_or(defaults.mismatch_penalty), + gap_open_penalty: gap_open_penalty.unwrap_or(defaults.gap_open_penalty), + gap_extend_penalty: gap_extend_penalty.unwrap_or(defaults.gap_extend_penalty), + prefix_bonus: prefix_bonus.unwrap_or(defaults.prefix_bonus), + capitalization_bonus: capitalization_bonus.unwrap_or(defaults.capitalization_bonus), + matching_case_bonus: matching_case_bonus.unwrap_or(defaults.matching_case_bonus), + exact_match_bonus: exact_match_bonus.unwrap_or(defaults.exact_match_bonus), + delimiter_bonus: delimiter_bonus.unwrap_or(defaults.delimiter_bonus), + } + } + + fn __repr__(&self) -> String { + format!( + "Scoring(match_score={}, mismatch_penalty={}, gap_open_penalty={}, gap_extend_penalty={}, \ + prefix_bonus={}, capitalization_bonus={}, matching_case_bonus={}, exact_match_bonus={}, \ + delimiter_bonus={})", + self.match_score, + self.mismatch_penalty, + 
self.gap_open_penalty, + self.gap_extend_penalty, + self.prefix_bonus, + self.capitalization_bonus, + self.matching_case_bonus, + self.exact_match_bonus, + self.delimiter_bonus, + ) + } +} + +impl From<&PyScoring> for frizbee::Scoring { + fn from(s: &PyScoring) -> Self { + frizbee::Scoring { + match_score: s.match_score, + mismatch_penalty: s.mismatch_penalty, + gap_open_penalty: s.gap_open_penalty, + gap_extend_penalty: s.gap_extend_penalty, + prefix_bonus: s.prefix_bonus, + capitalization_bonus: s.capitalization_bonus, + matching_case_bonus: s.matching_case_bonus, + exact_match_bonus: s.exact_match_bonus, + delimiter_bonus: s.delimiter_bonus, + } + } +} + +// ── Config ─────────────────────────────────────────────────────────── + +#[pyclass(name = "Config", frozen, from_py_object)] +#[derive(Debug, Clone)] +pub struct PyConfig { + #[pyo3(get)] + pub max_typos: Option, + #[pyo3(get)] + pub sort: bool, + #[pyo3(get)] + pub scoring: PyScoring, +} + +#[pymethods] +impl PyConfig { + #[new] + #[pyo3(signature = (max_typos = Some(0), sort = true, scoring = None))] + fn new(max_typos: Option, sort: bool, scoring: Option) -> Self { + PyConfig { + max_typos, + sort, + scoring: scoring.unwrap_or_else(|| { + PyScoring::new(None, None, None, None, None, None, None, None, None) + }), + } + } + + fn __repr__(&self) -> String { + format!( + "Config(max_typos={}, sort={})", + self.max_typos + .map(|t| t.to_string()) + .unwrap_or("None".to_string()), + if self.sort { "True" } else { "False" }, + ) + } +} + +impl From<&PyConfig> for frizbee::Config { + fn from(c: &PyConfig) -> Self { + frizbee::Config { + max_typos: c.max_typos, + sort: c.sort, + scoring: (&c.scoring).into(), + } + } +} + +// ── Match ──────────────────────────────────────────────────────────── + +#[pyclass(name = "Match", frozen, from_py_object)] +#[derive(Debug, Clone)] +pub struct PyMatch { + #[pyo3(get)] + pub score: u16, + #[pyo3(get)] + pub index: u32, + #[pyo3(get)] + pub exact: bool, +} + +#[pymethods] 
+impl PyMatch { + fn __repr__(&self) -> String { + format!( + "Match(score={}, index={}, exact={})", + self.score, + self.index, + if self.exact { "True" } else { "False" }, + ) + } +} + +impl From for PyMatch { + fn from(m: frizbee::Match) -> Self { + PyMatch { + score: m.score, + index: m.index, + exact: m.exact, + } + } +} + +// ── MatchIndices ───────────────────────────────────────────────────── + +#[pyclass(name = "MatchIndices", frozen, from_py_object)] +#[derive(Debug, Clone)] +pub struct PyMatchIndices { + #[pyo3(get)] + pub score: u16, + #[pyo3(get)] + pub index: u32, + #[pyo3(get)] + pub exact: bool, + #[pyo3(get)] + pub indices: Vec, +} + +#[pymethods] +impl PyMatchIndices { + fn __repr__(&self) -> String { + format!( + "MatchIndices(score={}, index={}, exact={}, indices={:?})", + self.score, + self.index, + if self.exact { "True" } else { "False" }, + self.indices, + ) + } +} + +impl From for PyMatchIndices { + fn from(m: frizbee::MatchIndices) -> Self { + PyMatchIndices { + score: m.score, + index: m.index, + exact: m.exact, + indices: m.indices, + } + } +} + +// ── Matcher (stateful) ─────────────────────────────────────────────── + +#[pyclass(name = "Matcher")] +pub struct PyMatcher { + inner: frizbee::Matcher, +} + +#[pymethods] +impl PyMatcher { + #[new] + #[pyo3(signature = (needle, config = None))] + fn new(needle: &str, config: Option<&PyConfig>) -> Self { + let cfg = config.map(|c| c.into()).unwrap_or_default(); + PyMatcher { + inner: frizbee::Matcher::new(needle, &cfg), + } + } + + fn set_needle(&mut self, needle: &str) { + self.inner.set_needle(needle); + } + + fn set_config(&mut self, config: &PyConfig) { + let cfg: frizbee::Config = config.into(); + self.inner.set_config(&cfg); + } + + #[pyo3(signature = (haystacks))] + fn match_list(&mut self, haystacks: Vec) -> Vec { + self.inner + .match_list(&haystacks) + .into_iter() + .map(PyMatch::from) + .collect() + } + + #[pyo3(signature = (haystacks))] + fn match_list_indices(&mut self, 
haystacks: Vec) -> Vec { + self.inner + .match_list_indices(&haystacks) + .into_iter() + .map(PyMatchIndices::from) + .collect() + } + + fn __repr__(&self) -> String { + format!("Matcher(needle={:?})", self.inner.needle) + } +} + +// ── Free functions ─────────────────────────────────────────────────── + +#[pyfunction] +#[pyo3(name = "match_list", signature = (needle, haystacks, config = None))] +pub fn py_match_list( + needle: &str, + haystacks: Vec, + config: Option<&PyConfig>, +) -> Vec { + let cfg = config.map(|c| c.into()).unwrap_or_default(); + frizbee::match_list(needle, &haystacks, &cfg) + .into_iter() + .map(PyMatch::from) + .collect() +} + +#[pyfunction] +#[pyo3(name = "match_list_indices", signature = (needle, haystacks, config = None))] +pub fn py_match_list_indices( + needle: &str, + haystacks: Vec, + config: Option<&PyConfig>, +) -> Vec { + let cfg = config.map(|c| c.into()).unwrap_or_default(); + frizbee::match_list_indices(needle, &haystacks, &cfg) + .into_iter() + .map(PyMatchIndices::from) + .collect() +} + +#[pyfunction] +#[pyo3(name = "match_list_parallel", signature = (needle, haystacks, config = None, threads = None))] +pub fn py_match_list_parallel( + needle: &str, + haystacks: Vec, + config: Option<&PyConfig>, + threads: Option, +) -> Vec { + let cfg = config.map(|c| c.into()).unwrap_or_default(); + let num_threads = threads.unwrap_or_else(|| { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(4) + }); + frizbee::match_list_parallel(needle, &haystacks, &cfg, num_threads) + .into_iter() + .map(PyMatch::from) + .collect() +} diff --git a/frizbee-python/src/lib.rs b/frizbee-python/src/lib.rs new file mode 100644 index 0000000..2cbb1a1 --- /dev/null +++ b/frizbee-python/src/lib.rs @@ -0,0 +1,17 @@ +use pyo3::prelude::*; + +mod binding; +use binding::{PyConfig, PyMatch, PyMatchIndices, PyMatcher, PyScoring}; + +#[pymodule(gil_used = false)] +fn frizbee_rs(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + 
m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_function(wrap_pyfunction!(binding::py_match_list, m)?)?; + m.add_function(wrap_pyfunction!(binding::py_match_list_indices, m)?)?; + m.add_function(wrap_pyfunction!(binding::py_match_list_parallel, m)?)?; + Ok(()) +} diff --git a/frizbee-python/uv.lock b/frizbee-python/uv.lock new file mode 100644 index 0000000..dcdb679 --- /dev/null +++ b/frizbee-python/uv.lock @@ -0,0 +1,77 @@ +version = 1 +revision = 3 +requires-python = ">=3.12" + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "frizbee" +source = { editable = "." 
} + +[package.optional-dependencies] +dev = [ + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }] +provides-extras = ["dev"] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } 
+wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] diff --git a/frizbee/Cargo.toml b/frizbee/Cargo.toml new file mode 100644 index 0000000..f7faa3c --- /dev/null 
+++ b/frizbee/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "frizbee" +version = "0.8.2" +edition = "2024" + +description = "Fast typo-resistant fuzzy matching via SIMD smith waterman, similar algorithm to FZF/FZY" +repository = "https://github.com/saghen/frizbee" +authors = ["Liam Dyer "] +license = "MIT" + +[dev-dependencies] +criterion = "0.8" +nucleo = "0.5.0" +rand = "0.10" +rand_distr = "0.6" + +[[bench]] +name = "lib" +harness = false + +[lib] +bench = false + +[dependencies] +itertools = "0.14" # k-way merge +serde = { version = "1.0", features = ["derive"], optional = true } + +[target.'cfg(target_arch = "x86_64")'.dependencies] +raw-cpuid = "11.6" # runtime feature detection + +[features] +serde = ["dep:serde"] diff --git a/benches/lib.rs b/frizbee/benches/lib.rs similarity index 100% rename from benches/lib.rs rename to frizbee/benches/lib.rs diff --git a/benches/match_list/generate.rs b/frizbee/benches/match_list/generate.rs similarity index 100% rename from benches/match_list/generate.rs rename to frizbee/benches/match_list/generate.rs diff --git a/benches/match_list/mod.rs b/frizbee/benches/match_list/mod.rs similarity index 100% rename from benches/match_list/mod.rs rename to frizbee/benches/match_list/mod.rs diff --git a/src/const.rs b/frizbee/src/const.rs similarity index 100% rename from src/const.rs rename to frizbee/src/const.rs diff --git a/src/lib.rs b/frizbee/src/lib.rs similarity index 97% rename from src/lib.rs rename to frizbee/src/lib.rs index 1265728..bbc2241 100644 --- a/src/lib.rs +++ b/frizbee/src/lib.rs @@ -1,4 +1,4 @@ -//! Frizbee is a SIMD typo-resistant fuzzy string matcher written in Rust. The core of the algorithm uses Smith-Waterman with affine gaps, similar to FZF, but with many of the scoring bonuses from FZY. 
In the included benchmark, with typo resistance disabled, it outperforms [nucleo](https://github.com/helix-editor/nucleo) by ~1.7x and [fzf](https://github.com/junegunn/fzf) by ~7x and supports multithreading, see [benchmarks](./BENCHMARKS.md). It matches against bytes directly, ignoring unicode. +//! Frizbee is a SIMD typo-resistant fuzzy string matcher written in Rust. The core of the algorithm uses Smith-Waterman with affine gaps, similar to FZF, but with many of the scoring bonuses from FZY. In the included benchmark, with typo resistance disabled, it outperforms [nucleo](https://github.com/helix-editor/nucleo) by ~1.7x and [fzf](https://github.com/junegunn/fzf) by ~2.1x and supports multithreading, see [benchmarks](https://github.com/saghen/frizbee/blob/main/BENCHMARKS.md). It matches against bytes directly, ignoring unicode. //! //! Used by [blink.cmp](https://github.com/saghen/blink.cmp), [skim](https://github.com/skim-rs/skim), and [fff.nvim](https://github.com/dmtrKovalenko/fff.nvim). Special thank you to [stefanboca](https://github.com/stefanboca) and [ii14](https://github.com/ii14)! //! @@ -127,10 +127,11 @@ use r#const::*; #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[repr(C)] pub struct Match { - pub score: u16, /// Index of the match in the original list of haystacks pub index: u32, + pub score: u16, /// Matched the needle exactly (e.g. 
"foo" on "foo") pub exact: bool, } diff --git a/src/one_shot/matcher.rs b/frizbee/src/one_shot/matcher.rs similarity index 98% rename from src/one_shot/matcher.rs rename to frizbee/src/one_shot/matcher.rs index cf46e35..7cca4bb 100644 --- a/src/one_shot/matcher.rs +++ b/frizbee/src/one_shot/matcher.rs @@ -302,7 +302,7 @@ impl Matcher { } #[inline(always)] - pub fn guard_against_score_overflow(&self) { + fn guard_against_score_overflow(&self) { let scoring = &self.config.scoring; let max_per_char_score = scoring.match_score + scoring.capitalization_bonus / 2 @@ -319,7 +319,7 @@ impl Matcher { } #[inline(always)] - pub fn guard_against_haystack_overflow(haystack_len: usize, haystack_index_offset: u32) { + fn guard_against_haystack_overflow(haystack_len: usize, haystack_index_offset: u32) { assert!( (haystack_len.saturating_add(haystack_index_offset as usize)) <= (u32::MAX as usize), "too many haystack which will overflow the u32 index: {} > {} (index offset: {})", diff --git a/src/one_shot/mod.rs b/frizbee/src/one_shot/mod.rs similarity index 100% rename from src/one_shot/mod.rs rename to frizbee/src/one_shot/mod.rs diff --git a/src/one_shot/parallel.rs b/frizbee/src/one_shot/parallel.rs similarity index 100% rename from src/one_shot/parallel.rs rename to frizbee/src/one_shot/parallel.rs diff --git a/src/prefilter/aarch64/mod.rs b/frizbee/src/prefilter/aarch64/mod.rs similarity index 100% rename from src/prefilter/aarch64/mod.rs rename to frizbee/src/prefilter/aarch64/mod.rs diff --git a/src/prefilter/mod.rs b/frizbee/src/prefilter/mod.rs similarity index 100% rename from src/prefilter/mod.rs rename to frizbee/src/prefilter/mod.rs diff --git a/src/prefilter/scalar.rs b/frizbee/src/prefilter/scalar.rs similarity index 100% rename from src/prefilter/scalar.rs rename to frizbee/src/prefilter/scalar.rs diff --git a/src/prefilter/x86_64/avx2.rs b/frizbee/src/prefilter/x86_64/avx2.rs similarity index 100% rename from src/prefilter/x86_64/avx2.rs rename to 
frizbee/src/prefilter/x86_64/avx2.rs diff --git a/src/prefilter/x86_64/mod.rs b/frizbee/src/prefilter/x86_64/mod.rs similarity index 100% rename from src/prefilter/x86_64/mod.rs rename to frizbee/src/prefilter/x86_64/mod.rs diff --git a/src/prefilter/x86_64/sse.rs b/frizbee/src/prefilter/x86_64/sse.rs similarity index 100% rename from src/prefilter/x86_64/sse.rs rename to frizbee/src/prefilter/x86_64/sse.rs diff --git a/src/simd/avx.rs b/frizbee/src/simd/avx.rs similarity index 100% rename from src/simd/avx.rs rename to frizbee/src/simd/avx.rs diff --git a/src/simd/mod.rs b/frizbee/src/simd/mod.rs similarity index 100% rename from src/simd/mod.rs rename to frizbee/src/simd/mod.rs diff --git a/src/simd/neon.rs b/frizbee/src/simd/neon.rs similarity index 100% rename from src/simd/neon.rs rename to frizbee/src/simd/neon.rs diff --git a/src/simd/neon_256.rs b/frizbee/src/simd/neon_256.rs similarity index 100% rename from src/simd/neon_256.rs rename to frizbee/src/simd/neon_256.rs diff --git a/src/simd/sse.rs b/frizbee/src/simd/sse.rs similarity index 100% rename from src/simd/sse.rs rename to frizbee/src/simd/sse.rs diff --git a/src/simd/sse_256.rs b/frizbee/src/simd/sse_256.rs similarity index 100% rename from src/simd/sse_256.rs rename to frizbee/src/simd/sse_256.rs diff --git a/src/smith_waterman/greedy.rs b/frizbee/src/smith_waterman/greedy.rs similarity index 100% rename from src/smith_waterman/greedy.rs rename to frizbee/src/smith_waterman/greedy.rs diff --git a/src/smith_waterman/mod.rs b/frizbee/src/smith_waterman/mod.rs similarity index 100% rename from src/smith_waterman/mod.rs rename to frizbee/src/smith_waterman/mod.rs diff --git a/src/smith_waterman/simd/algo.rs b/frizbee/src/smith_waterman/simd/algo.rs similarity index 99% rename from src/smith_waterman/simd/algo.rs rename to frizbee/src/smith_waterman/simd/algo.rs index 31fa98c..9bf768e 100644 --- a/src/smith_waterman/simd/algo.rs +++ b/frizbee/src/smith_waterman/simd/algo.rs @@ -108,10 +108,10 @@ impl, 
Simd256: Vector256> let score_matrix = &mut self.score_matrix; score_matrix.set_haystack_chunks(haystack_chunks); - score_matrix.zero(); + // score_matrix.zero(); let match_masks = &mut self.match_masks; match_masks.set_haystack_chunks(haystack_chunks); - match_masks.zero(); + // match_masks.zero(); unsafe { // Constants diff --git a/src/smith_waterman/simd/alignment.rs b/frizbee/src/smith_waterman/simd/alignment.rs similarity index 100% rename from src/smith_waterman/simd/alignment.rs rename to frizbee/src/smith_waterman/simd/alignment.rs diff --git a/src/smith_waterman/simd/alignment_iter.rs b/frizbee/src/smith_waterman/simd/alignment_iter.rs similarity index 100% rename from src/smith_waterman/simd/alignment_iter.rs rename to frizbee/src/smith_waterman/simd/alignment_iter.rs diff --git a/src/smith_waterman/simd/gaps.rs b/frizbee/src/smith_waterman/simd/gaps.rs similarity index 100% rename from src/smith_waterman/simd/gaps.rs rename to frizbee/src/smith_waterman/simd/gaps.rs diff --git a/src/smith_waterman/simd/matrix.rs b/frizbee/src/smith_waterman/simd/matrix.rs similarity index 100% rename from src/smith_waterman/simd/matrix.rs rename to frizbee/src/smith_waterman/simd/matrix.rs diff --git a/src/smith_waterman/simd/mod.rs b/frizbee/src/smith_waterman/simd/mod.rs similarity index 98% rename from src/smith_waterman/simd/mod.rs rename to frizbee/src/smith_waterman/simd/mod.rs index dcbfb05..c0260e1 100644 --- a/src/smith_waterman/simd/mod.rs +++ b/frizbee/src/smith_waterman/simd/mod.rs @@ -281,6 +281,10 @@ mod tests { fn test_score_exact_match() { assert_eq!(get_score("a", "a"), CHAR_SCORE + PREFIX_BONUS); assert_eq!(get_score("abc", "abc"), 3 * CHAR_SCORE + PREFIX_BONUS); + assert_eq!( + get_score("vagrant-libvirt", "vagrant-libvirt"), + 15 * CHAR_SCORE + PREFIX_BONUS + ); } #[test] @@ -296,6 +300,7 @@ mod tests { #[test] fn test_score_no_delimiter_for_delimiter_chars() { assert_eq!(get_score("-", "a-bc"), CHAR_SCORE); + assert_eq!(get_score("-b", "a-bc"), 2 * 
CHAR_SCORE); assert_eq!(get_score("-", "a--bc"), CHAR_SCORE); assert!(get_score("a_b", "a_bb") > get_score("a_b", "a__b")); }