diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 7b215e1..bb8ac0b 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -1,9 +1,5 @@ { "permissions": { - "allow": [ - "Bash(cargo test:*)", - "Bash(cargo clippy:*)", - "Bash(git -C /Users/blopker/code/codebook log --oneline -20)" - ] + "allow": ["Bash(cargo test:*)", "Bash(cargo clippy:*)"] } } diff --git a/Cargo.lock b/Cargo.lock index b55c00e..14366b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -941,9 +941,9 @@ checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "h2" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" dependencies = [ "atomic-waker", "bytes", @@ -1127,7 +1127,6 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots", ] [[package]] @@ -1282,9 +1281,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", "hashbrown 0.16.1", @@ -1349,9 +1348,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a87d9b8105c23642f50cbbae03d1f75d8422c5cb98ce7ee9271f7ff7505be6b8" +checksum = "e67e8da4c49d6d9909fe03361f9b620f58898859f5c7aded68351e85e71ecf50" dependencies = [ "jiff-static", "log", @@ -1362,9 +1361,9 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.17" +version = "0.2.18" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b787bebb543f8969132630c51fd0afab173a86c6abae56ff3b9e5e3e3f9f6e58" +checksum = "e0c84ee7f197eca9a86c6fd6cb771e55eb991632f15f2bc3ca6ec838929e6e78" dependencies = [ "proc-macro2", "quote", @@ -1504,9 +1503,9 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lru" -version = "0.16.2" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96051b46fc183dc9cd4a223960ef37b9af631b55191852a8274bfef064cda20f" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" dependencies = [ "hashbrown 0.16.1", ] @@ -1800,9 +1799,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.104" +version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" +checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7" dependencies = [ "unicode-ident", ] @@ -1842,6 +1841,7 @@ version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ + "aws-lc-rs", "bytes", "getrandom 0.3.4", "lru-slab", @@ -1873,9 +1873,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.42" +version = "1.0.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a" dependencies = [ "proc-macro2", ] @@ -1966,9 +1966,9 @@ checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "reqwest" -version = "0.12.28" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +checksum 
= "04e9018c9d814e5f30cc16a0f03271aeab3571e609612d9fe78c1aa8d11c2f62" dependencies = [ "base64", "bytes", @@ -1988,9 +1988,9 @@ dependencies = [ "quinn", "rustls", "rustls-pki-types", + "rustls-platform-verifier", "serde", "serde_json", - "serde_urlencoded", "sync_wrapper", "tokio", "tokio-rustls", @@ -2001,7 +2001,6 @@ dependencies = [ "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "webpki-roots", ] [[package]] @@ -2054,14 +2053,13 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.35" +version = "0.23.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" +checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "aws-lc-rs", "log", "once_cell", - "ring", "rustls-pki-types", "rustls-webpki", "subtle", @@ -2135,12 +2133,6 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" -[[package]] -name = "ryu" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" - [[package]] name = "same-file" version = "1.0.6" @@ -2220,9 +2212,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.148" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3084b546a1dd6289475996f182a22aba973866ea8e8b02c51d9f46b1336a22da" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ "indexmap", "itoa", @@ -2262,18 +2254,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - 
[[package]] name = "sha1" version = "0.10.6" @@ -2435,9 +2415,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.113" +version = "2.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678faa00651c9eb72dd2020cbdf275d92eccb2400d568e419efdd64838145cb4" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" dependencies = [ "proc-macro2", "quote", @@ -3069,14 +3049,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.7" +version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" dependencies = [ "form_urlencoded", "idna", "percent-encoding", "serde", + "serde_derive", ] [[package]] @@ -3240,15 +3221,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "webpki-roots" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12bed680863276c63889429bfd6cab3b99943659923822de1c8a39c49e4d722c" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "winapi" version = "0.3.9" @@ -3604,18 +3576,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.31" +version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" +checksum = "668f5168d10b9ee831de31933dc111a459c97ec93225beb307aed970d1372dfd" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.31" +version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" +checksum = "2c7962b26b0a8685668b671ee4b54d007a67d4eaf05fda79ac0ecf41e32270f1" dependencies = [ "proc-macro2", "quote", @@ -3684,6 +3656,6 @@ 
dependencies = [ [[package]] name = "zmij" -version = "1.0.10" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e0d8dffbae3d840f64bda38e28391faef673a7b5a6017840f2a106c8145868" +checksum = "2fc5a66a20078bf1251bde995aa2fdcc4b800c70b5d92dd2c62abc5c60f679f8" diff --git a/Cargo.toml b/Cargo.toml index 78f5739..9192a2d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ lazy_static = "1.5.0" log = "0.4.22" lru = "0.16" regex = "1.11.1" -reqwest = { version = "<0.14", default-features = false, features = ["blocking", "rustls-tls", "json"] } +reqwest = { version = "^0.13.0", default-features = false, features = ["blocking", "rustls", "json"] } rustls = { version = "<0.24", features = ["aws-lc-rs"] } rustls-platform-verifier = "0.6.0" serde = { version = "1", features = ["derive", "serde_derive"] } diff --git a/README.md b/README.md index 845d613..d2521ed 100644 --- a/README.md +++ b/README.md @@ -263,8 +263,7 @@ flag_words = ["todo", "fixme"] ignore_paths = ["target/**/*", "**/*.json", ".git/**/*"] # List of regex patterns to ignore when spell checking -# Patterns are matched against each line of text, not individual words -# Useful for domain-specific strings or patterns +# For code files: patterns match against the full source, tokens within matches are skipped # Tip: Use single quotes for literal strings to avoid escaping backslashes # Default: [] ignore_patterns = [ @@ -314,7 +313,12 @@ The `ignore_patterns` configuration allows you to define custom regex patterns t - Git commit hashes: `\b[0-9a-fA-F]{7,40}\b` - Markdown links: `\[([^\]]+)\]\(([^)]+)\)` -**Line-by-Line Matching**: Regex patterns are applied to each line of text, not individual words. This means your patterns should account for the line context. 
+**How Patterns Are Matched**: +- Patterns are matched against the full source text +- Words that fall entirely within a matched range are skipped +- **Multiline mode is enabled**: `^` and `$` match line boundaries, not just start/end of file +- Example: `'^vim\..*'` skips all words on lines starting with `vim.` +- Example: `'vim\.opt\.[a-z]+'` matches `vim.opt.showmode`, so `showmode` is skipped **TOML Literal Strings**: Use single quotes for regex patterns to avoid escaping backslashes: - `'\b'` for word boundaries (no escaping needed) @@ -325,13 +329,13 @@ The `ignore_patterns` configuration allows you to define custom regex patterns t ```toml ignore_patterns = [ '\b[ATCG]+\b', # DNA sequences with word boundaries - '^\s*//.*$', # Comment lines starting with // - 'https?://[^\s]+', # URLs (no escaping needed) - '\$[a-zA-Z_][a-zA-Z0-9_]*', # Variables starting with $ + '^vim\..*', # Lines starting with vim. + '^\s*//.*', # Lines that are // comments + 'https?://[^\s]+', # URLs ] ``` -**Migration Note**: If you're upgrading from an older version, patterns that used `^` and `$` anchors may need adjustment since matching now occurs line-by-line rather than word-by-word. +**Tip**: Include the identifier in your pattern. `'vim\.opt\.[a-z]+'` skips `showmode` in `vim.opt.showmode`, but `'vim\.opt\.'` alone won't (it only matches up to the dot). 
### LSP Initialization Options diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs index 2fd2120..67ecab3 100644 --- a/crates/codebook/src/lib.rs +++ b/crates/codebook/src/lib.rs @@ -47,9 +47,10 @@ impl Codebook { // call spell check on each dictionary let language = self.resolve_language(language, file_path); let dictionaries = self.get_dictionaries(Some(language)); - let mut regex_patterns = get_default_skip_patterns().clone(); - if let Some(config_patterns) = self.config.get_ignore_patterns() { - regex_patterns.extend(config_patterns); + // Combine default and user patterns + let mut all_patterns = get_default_skip_patterns().clone(); + if let Some(user_patterns) = self.config.get_ignore_patterns() { + all_patterns.extend(user_patterns); } parser::find_locations( text, @@ -71,7 +72,7 @@ impl Codebook { } false }, - &regex_patterns, + &all_patterns, ) } diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs index b7180cf..edf2dfd 100644 --- a/crates/codebook/src/parser.rs +++ b/crates/codebook/src/parser.rs @@ -23,69 +23,73 @@ struct SkipRange { end_byte: usize, } -impl SkipRange { - fn contains(&self, pos: usize) -> bool { - pos >= self.start_byte && pos < self.end_byte - } -} -/// Helper struct to handle all text position tracking in one place -struct TextProcessor { - text: String, - skip_ranges: Vec<SkipRange>, +/// Check if a word at [start, end) is entirely within any skip range +fn is_within_skip_range(start: usize, end: usize, skip_ranges: &[SkipRange]) -> bool { + skip_ranges + .iter() + .any(|r| start >= r.start_byte && end <= r.end_byte) } -impl TextProcessor { - fn new(text: &str, skip_patterns: &[Regex]) -> Self { - let text = text.to_string(); - let skip_ranges = Self::find_skip_ranges(&text, skip_patterns); - - Self { text, skip_ranges } +/// Find skip ranges from pattern matches in text.
+fn find_skip_ranges(text: &str, patterns: &[Regex]) -> Vec<SkipRange> { + if patterns.is_empty() { + return Vec::new(); } - fn find_skip_ranges(text: &str, patterns: &[Regex]) -> Vec<SkipRange> { - let mut ranges = Vec::new(); + let mut ranges = Vec::new(); - for pattern in patterns { - for regex_match in pattern.find_iter(text) { - ranges.push(SkipRange { - start_byte: regex_match.start(), - end_byte: regex_match.end(), - }); - } + for pattern in patterns { + for regex_match in pattern.find_iter(text) { + ranges.push(SkipRange { + start_byte: regex_match.start(), + end_byte: regex_match.end(), + }); } + } + + ranges.sort_by_key(|r| r.start_byte); + merge_overlapping_ranges(ranges) +} - // Sort ranges by start position and merge overlapping ones - ranges.sort_by_key(|r| r.start_byte); - Self::merge_overlapping_ranges(ranges) +/// Merge overlapping or adjacent ranges +fn merge_overlapping_ranges(ranges: Vec<SkipRange>) -> Vec<SkipRange> { + if ranges.is_empty() { + return ranges; } - fn merge_overlapping_ranges(ranges: Vec<SkipRange>) -> Vec<SkipRange> { - if ranges.is_empty() { - return ranges; + let mut merged = Vec::new(); + let mut current = ranges[0]; + + for range in ranges.into_iter().skip(1) { + if range.start_byte <= current.end_byte { + current.end_byte = current.end_byte.max(range.end_byte); + } else { + merged.push(current); + current = range; } + } + merged.push(current); + merged +} - let mut merged = Vec::new(); - let mut current = ranges[0]; +/// Helper struct to handle text position tracking and word extraction +struct TextProcessor { + text: String, + skip_ranges: Vec<SkipRange>, +} - for range in ranges.into_iter().skip(1) { - if range.start_byte <= current.end_byte { - // Overlapping or adjacent ranges - merge them - current.end_byte = current.end_byte.max(range.end_byte); - } else { - merged.push(current); - current = range; - } +impl TextProcessor { + fn new(text: &str, skip_patterns: &[Regex]) -> Self { + let skip_ranges = find_skip_ranges(text, skip_patterns); + Self { + text: text.to_string(), + skip_ranges, } -
merged.push(current); - merged } fn should_skip(&self, start_byte: usize, word_len: usize) -> bool { - let word_end = start_byte + word_len; - self.skip_ranges - .iter() - .any(|range| range.contains(start_byte) || range.contains(word_end)) + is_within_skip_range(start_byte, start_byte + word_len, &self.skip_ranges) } fn process_words_with_check(&self, mut check_function: F) -> Vec @@ -199,22 +203,33 @@ fn find_locations_code( let provider = text.as_bytes(); let mut matches_query = cursor.matches(&query, root_node, provider); + // Find all skip ranges from patterns matched against the full source text + let all_skip_ranges = find_skip_ranges(text, skip_patterns); + while let Some(match_) = matches_query.next() { for capture in match_.captures { let node = capture.node; - let node_text = node.utf8_text(provider).unwrap(); let node_start_byte = node.start_byte(); - // Create processor on just this part of the document - let processor = TextProcessor::new(node_text, skip_patterns); + + let node_text = node.utf8_text(provider).unwrap(); + let processor = TextProcessor::new(node_text, &[]); let words = processor.extract_words(); - // check words and fix locations relative to whole document + // Check words against global skip ranges and dictionary for word_pos in words { if !check_function(&word_pos.word) { for range in word_pos.locations { + let global_start = range.start_byte + node_start_byte; + let global_end = range.end_byte + node_start_byte; + + // Skip if word is entirely within a skip range + if is_within_skip_range(global_start, global_end, &all_skip_ranges) { + continue; + } + let location = TextRange { - start_byte: range.start_byte + node_start_byte, - end_byte: range.end_byte + node_start_byte, + start_byte: global_start, + end_byte: global_end, }; if let Some(existing_result) = word_locations.get_mut(&word_pos.word) { #[cfg(debug_assertions)] diff --git a/crates/codebook/src/queries/typst.scm b/crates/codebook/src/queries/typst.scm index 710fa7a..4f34d06 
100644 --- a/crates/codebook/src/queries/typst.scm +++ b/crates/codebook/src/queries/typst.scm @@ -2,7 +2,17 @@ (text) @string -(string) @string +; Strings in math formulas +(formula (string) @string) + +; Strings with attachments (superscript/subscript) in math +(attach (string) @string) + +; Strings in content/dictionary values +(tagged (string) @string) + +; Strings in groups (parenthesized expressions) +(group (string) @string) (label) @identifier diff --git a/crates/codebook/src/regexes.rs b/crates/codebook/src/regexes.rs index 80538f3..570ed17 100644 --- a/crates/codebook/src/regexes.rs +++ b/crates/codebook/src/regexes.rs @@ -1,5 +1,5 @@ use lazy_static::lazy_static; -use regex::Regex; +use regex::{Regex, RegexBuilder}; lazy_static! { static ref DEFAULT_SKIP_PATTERNS: Vec<Regex> = vec![ @@ -32,9 +32,13 @@ pub fn get_default_skip_patterns() -> &'static Vec<Regex> { &DEFAULT_SKIP_PATTERNS } -/// Compile user-provided regex patterns from strings +/// Compile user-provided regex patterns from strings. +/// Patterns are compiled with multiline mode enabled, so `^` and `$` match line boundaries. pub fn compile_user_patterns(patterns: &[String]) -> Result<Vec<Regex>, regex::Error> { - patterns.iter().map(|pattern| Regex::new(pattern)).collect() + patterns + .iter() + .map(|pattern| RegexBuilder::new(pattern).multi_line(true).build()) + .collect() } #[cfg(test)] @@ -93,4 +97,19 @@ mod tests { assert!(compile_user_patterns(&invalid_patterns).is_err()); } + + #[test] + fn test_multiline_mode_enabled() { + let patterns = vec![r"^vim\..*".to_string()]; + let compiled = compile_user_patterns(&patterns).unwrap(); + + let text = "let x = 1\nvim.opt.showmode = false\nlet y = 2"; + + // Should match line starting with vim.
+ assert!(compiled[0].is_match(text)); + + // Find the match + let m = compiled[0].find(text).unwrap(); + assert_eq!(m.as_str(), "vim.opt.showmode = false"); + } } diff --git a/crates/codebook/tests/test_regex.rs b/crates/codebook/tests/test_regex.rs index 6703012..76f58ca 100644 --- a/crates/codebook/tests/test_regex.rs +++ b/crates/codebook/tests/test_regex.rs @@ -251,3 +251,61 @@ fn test_user_defined_regex_patterns() { "anotherbadword should be flagged as it doesn't match any pattern" ); } + +#[test] +fn test_pattern_matching_against_full_source() { + utils::init_logging(); + + // Create a temporary config with a pattern that matches vim.opt.* expressions + let temp_dir = tempfile::TempDir::new().unwrap(); + let config_path = temp_dir.path().join("codebook.toml"); + + // Pattern to match "vim.opt." - must include the identifier to skip it + let config_content = r#" + ignore_patterns = [ + "vim\\.opt\\.[a-z]+" + ] + "#; + + std::fs::write(&config_path, config_content).unwrap(); + + let config = std::sync::Arc::new( + codebook_config::CodebookConfigFile::load(Some(temp_dir.path())).unwrap(), + ); + + let processor = codebook::Codebook::new(config).unwrap(); + + // Lua code with vim.opt settings + let sample_text = r#" + vim.opt.showmode = false + vim.opt.relativenumber = true + local badword = "test" + "#; + + let binding = processor + .spell_check(sample_text, Some(LanguageType::Lua), None) + .to_vec(); + let mut misspelled = binding + .iter() + .map(|r| r.word.as_str()) + .collect::>(); + misspelled.sort(); + println!("Misspelled words: {misspelled:?}"); + + // "showmode" and "relativenumber" should be skipped because they fall within + // the matched range of "vim.opt.showmode" and "vim.opt.relativenumber" + assert!( + !misspelled.contains(&"showmode"), + "showmode should be skipped - it's within the vim.opt.showmode match" + ); + assert!( + !misspelled.contains(&"relativenumber"), + "relativenumber should be skipped - it's within the vim.opt.relativenumber 
match" + ); + + // "badword" should still be flagged - it's not within any skip range + assert!( + misspelled.contains(&"badword"), + "badword should be flagged - it doesn't match the pattern" + ); +} diff --git a/examples/example.lua b/examples/example.lua index d8feaf5..8b1bd43 100644 --- a/examples/example.lua +++ b/examples/example.lua @@ -137,5 +137,12 @@ local function safeOperation(input) end end +local vm +vim.opt.showmode = false +vim.g.loaded_netrw = nil +vim.g['netrw_winsize'] = 30 +vim.cmd [[noautocmd sil norm! "vy]] +vim.fn.jobstart(cmd, { term = true, pty = true }) + -- Return the module return module