diff --git a/.claudeignore b/.claudeignore new file mode 100644 index 0000000..8b8d6eb --- /dev/null +++ b/.claudeignore @@ -0,0 +1,188 @@ +# DevFlow .claudeignore - Protects against sensitive files and context pollution +# Generated by DevFlow - Edit as needed for your project + +# === SECURITY: Sensitive Files === +# Environment and secrets +.env +.env.* +.env.local +.env.*.local +*.env +.envrc + +# Credentials and keys +*.key +*.pem +*.p12 +*.pfx +*.cer +*.crt +*.der +id_rsa +id_dsa +id_ecdsa +id_ed25519 +*.ppk +*_rsa +*_dsa +*secret* +*password* +*credential* +credentials.json +secrets.json +secrets.yaml +secrets.yml + +# Cloud provider credentials +.aws/credentials +.aws/config +.gcp/credentials.json +.azure/credentials + +# Package manager credentials +.npmrc +.pypirc +.gem/credentials +pip.conf + +# Database +*.sql +*.db +*.sqlite +*.sqlite3 + +# === DEPENDENCIES & BUILD === +# Node.js +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +.pnpm-store/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +.Python +env/ +venv/ +ENV/ +.venv/ +pip-log.txt +pip-delete-this-directory.txt +.eggs/ +*.egg-info/ +dist/ +build/ +*.whl + +# Ruby +vendor/bundle/ +.bundle/ + +# Go +vendor/ +go.sum + +# Rust +target/ +Cargo.lock + +# Java +target/ +*.class +*.jar +*.war + +# PHP +vendor/ +composer.lock + +# === BUILD ARTIFACTS === +dist/ +build/ +out/ +.next/ +.nuxt/ +.output/ +.vite/ +.cache/ +.parcel-cache/ +.turbo/ +*.tsbuildinfo + +# === LOGS & TEMP FILES === +logs/ +*.log +*.tmp +*.temp +*.swp +*.swo +*~ +.DS_Store +Thumbs.db +*.bak +*.orig +*.rej +.cache + +# === VERSION CONTROL === +.git/ +.svn/ +.hg/ +.gitignore + +# === IDE & EDITORS === +.vscode/ +.idea/ +*.sublime-* +*.code-workspace +.project +.classpath +.settings/ + +# === TEST COVERAGE === +coverage/ +.nyc_output/ +htmlcov/ +.coverage +.pytest_cache/ +.tox/ + +# === OS-SPECIFIC === +.DS_Store +.AppleDouble +.LSOverride +Thumbs.db +ehthumbs.db +Desktop.ini + +# === MEDIA & LARGE FILES 
=== +*.mp4 +*.avi +*.mov +*.wmv +*.flv +*.mp3 +*.wav +*.zip +*.tar.gz +*.rar +*.7z +*.dmg +*.iso + +# === DOCUMENTATION BUILD === +site/ +_site/ +.docusaurus/ +.vuepress/dist/ + +# === LOCK FILES (usually not needed for AI context) === +package-lock.json +yarn.lock +pnpm-lock.yaml +Gemfile.lock +poetry.lock +Pipfile.lock diff --git a/Cargo.lock b/Cargo.lock index 05a09a3..4d7ed5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,18 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -252,6 +264,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + [[package]] name = "console" version = "0.15.11" @@ -421,6 +443,18 @@ dependencies = [ "windows-sys 0.61.1", ] +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fancy-regex" version = "0.13.0" @@ -509,12 +543,30 @@ dependencies = [ "crunchy", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", +] + [[package]] name = "hashbrown" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" +[[package]] +name = "hashlink" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "heck" version = "0.5.0" @@ -550,7 +602,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.16.0", ] [[package]] @@ -628,6 +680,17 @@ dependencies = [ "libc", ] +[[package]] +name = "libsqlite3-sys" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.11.0" @@ -685,6 +748,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "plotters" version = "0.3.7" @@ -835,6 +904,7 @@ dependencies = [ "assert_cmd", "clap", "clap_complete", + "colored", "dirs", "globset", "ignore", @@ -842,6 +912,7 @@ dependencies = [ "rayon", "regex", "rskim-core", + "rusqlite", "serde", "serde_json", "sha2", @@ -873,6 +944,20 @@ dependencies = [ "tree-sitter-typescript", ] +[[package]] 
+name = "rusqlite" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b838eba278d213a8beaf485bd313fd580ca4505a00d5871caeb1457c55322cae" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + [[package]] name = "rustc-hash" version = "1.1.0" @@ -1002,6 +1087,12 @@ version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + [[package]] name = "streaming-iterator" version = "0.1.9" @@ -1293,6 +1384,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -1596,3 +1693,23 @@ name = "wit-bindgen" version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + +[[package]] +name = "zerocopy" +version = "0.8.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml 
index 8f3b568..febcb55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,8 @@ tiktoken-rs = "0.7" clap_complete = "4.5" strip-ansi-escapes = "0.2" regex = "1" +rusqlite = { version = "0.31", features = ["bundled"] } +colored = "2" [workspace.metadata.dist] cargo-dist-version = "0.14.0" diff --git a/README.md b/README.md index 502af4e..edfeb42 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -# Skim 🔍 +# Skim: The Fastest, Most Comprehensive Context Optimization Tool for AI Coding Agents -> **Rust based smart code reader for AI agents** - Strip implementation, keep structure +> **Code skimming. Command rewriting. Test, build, and git output compression. Token budget cascading.** 12 languages. 14ms for 3,000 lines. Built in Rust. -Skim transforms source code by removing implementation details while preserving structure, signatures, and types. Built with tree-sitter for fast, accurate multi-language parsing. +Other tools skim code. Skim optimizes everything your AI agent touches: code, test output, build errors, git diffs, and raw commands. 14ms for 3,000 lines. 48x faster on cache hits. Nothing else comes close. [![Website](https://img.shields.io/badge/Website-skim-e87040)](https://dean0x.github.io/x/skim/) [![CI](https://github.com/dean0x/skim/actions/workflows/ci.yml/badge.svg)](https://github.com/dean0x/skim/actions/workflows/ci.yml) @@ -13,11 +13,9 @@ Skim transforms source code by removing implementation details while preserving ## Why Skim? -Take a typical 80-file TypeScript project: 63,000 tokens. Modern LLMs handle 200k+ context, so capacity isn't the issue. +**Context capacity is not the bottleneck. Attention is.** Every token you send to an LLM dilutes its focus. Research consistently shows attention dilution in long contexts -- models lose track of critical details even within their window. More tokens means higher latency, degraded recall, and weaker reasoning. Past a threshold, adding context makes outputs worse. 
While other tools stop at code skimming, Skim optimizes the full spectrum of AI agent context: code, test output, build errors, git diffs, and commands. Faster, broader, and smarter than anything else available. -But **context capacity isn't the bottleneck — attention is.** That 63k contains maybe 5k of actual signal. The rest? Implementation noise: loop bodies, error handlers, validation chains the model doesn't need to reason about architecture. - -**Large contexts degrade model performance.** Research consistently shows attention dilution in long contexts — models lose track of critical details even within their window. More tokens means higher latency, degraded recall, and weaker reasoning. The inverse scaling problem: past a threshold, *adding context makes outputs worse.* +Take a typical 80-file TypeScript project: 63,000 tokens. That contains maybe 5,000 tokens of actual signal. The rest is implementation noise the model doesn't need for architectural reasoning. **80% of the time, the model doesn't need implementation details.** It doesn't care *how* you loop through users or validate emails. It needs to understand *what* your code does and how pieces connect. @@ -51,15 +49,35 @@ That same 80-file project that wouldn't fit? 
Now you can ask: *"Explain the enti ## Features -- 🚀 **Fast** - 14.6ms for 3000-line files (powered by tree-sitter) -- ⚡ **Cached** - 40-50x faster on repeated processing (enabled by default) -- 🌐 **Multi-language** - TypeScript, JavaScript, Python, Rust, Go, Java, C, C++, Markdown, JSON, YAML, TOML -- 🎯 **Multiple modes** - Structure, signatures, types, or full code -- 📁 **Directory support** - Process entire directories recursively (`skim src/`) -- 📂 **Multi-file** - Glob patterns (`src/**/*.ts`) with parallel processing -- 🤖 **Auto-detection** - Automatically detects language from file extension -- 🔒 **DoS-resistant** - Built-in limits prevent stack overflow and memory exhaustion -- 💧 **Streaming** - Outputs to stdout for pipe workflows +### Code Skimming (the original, still unmatched) +- **12 languages** including TypeScript, JavaScript, Python, Rust, Go, Java, C, C++, Markdown, JSON, YAML, TOML +- **6 transformation modes** from full to minimal to pseudo to structure to signatures to types (15-95% reduction) +- **14.6ms** for 3,000-line files. 
**48x faster** on cache hits +- **Token budget cascading** that automatically selects the most aggressive mode fitting your budget +- **Parallel processing** with multi-file globs via rayon + +### Command Rewriting (`skim init`) +- PreToolUse hook rewrites `cat`, `head`, `tail`, `cargo test`, `npm test`, `git diff` into skim equivalents +- Two-layer rule system with declarative prefix-swap and custom argument handlers +- One command installs the hook for automatic, invisible context savings + +### Test Output Compression (`skim test`) +- Parses and compresses output from cargo, go, vitest, jest, pytest +- Extracts failures, assertions, pass/fail counts while stripping noise +- Three-tier degradation from structured parse to regex fallback to passthrough + +### Build Output Compression (`skim build`) +- Parses cargo, clippy, tsc build output +- Extracts errors, warnings, and summaries + +### Git Output Compression (`skim git`) +- Compresses `git status`, `git diff`, `git log` +- Flag-aware passthrough when user already specifies compact formats + +### Intelligence +- `skim discover` scans agent session history for optimization opportunities +- `skim learn` detects CLI error-retry patterns and generates correction rules +- Output guardrail ensures compressed output is never larger than the original ## Installation @@ -165,6 +183,7 @@ skim - --language typescript # Stdin (requires --language) - `-j, --jobs` - Parallel processing threads (default: CPU cores) - `--no-cache` - Disable caching - `--show-stats` - Show token reduction stats +- `--disable-analytics` - Disable analytics recording 📖 **[Full Usage Guide →](docs/usage.md)** @@ -440,6 +459,28 @@ skim 'src/**/*.ts' --show-stats Uses OpenAI's tiktoken (cl100k_base for GPT-3.5/GPT-4). Output to stderr for clean piping. +## Analytics + +Skim automatically tracks token savings from every invocation in a local SQLite database (`~/.cache/skim/analytics.db`). 
View your savings with the `stats` subcommand: + +```bash +skim stats # All-time dashboard +skim stats --since 7d # Last 7 days +skim stats --format json # Machine-readable output +skim stats --cost # Include cost savings estimates +skim stats --clear # Reset analytics data +``` + +**Environment variables:** + +| Variable | Description | +|----------|-------------| +| `SKIM_DISABLE_ANALYTICS` | Set to `1`, `true`, or `yes` to disable recording | +| `SKIM_INPUT_COST_PER_MTOK` | Override $/MTok for cost estimates (default: 3.0) | +| `SKIM_ANALYTICS_DB` | Override analytics database path | + +Analytics recording is fire-and-forget (non-blocking) and does not affect command performance. Data is automatically pruned after 90 days. + ## Security Skim includes built-in DoS protections: @@ -558,6 +599,16 @@ Comprehensive guides for all aspects of Skim: - ⏱️ **[Performance](docs/performance.md)** - Benchmarks and optimization guide - 🛠️ **[Development](docs/development.md)** - Contributing and adding languages +## Part of the AI Development Stack + +| Tool | Role | What It Does | +|------|------|-------------| +| **Skim** | Context Optimization | Compresses code, test output, build output, and git output for optimal LLM reasoning | +| **[DevFlow](https://github.com/dean0x/devflow)** | Quality Orchestration | 18 parallel reviewers, working memory, self-learning, production-grade lifecycle workflows | +| **[Backbeat](https://github.com/dean0x/backbeat)** | Agent Orchestration | Orchestration at scale. Karpathy optimization loops, multi-agent pipelines, DAG dependencies, autoscaling | + +Skim optimizes every byte of context. DevFlow enforces production-grade quality. Backbeat scales execution across agents. No other stack covers all three. + ## Contributing Contributions welcome! 
Please: diff --git a/crates/rskim-core/Cargo.toml b/crates/rskim-core/Cargo.toml index 6a1196c..baab45b 100644 --- a/crates/rskim-core/Cargo.toml +++ b/crates/rskim-core/Cargo.toml @@ -4,7 +4,7 @@ version = "1.0.0" edition = "2021" authors = ["Skim Contributors"] license = "MIT" -description = "Core library for smart code reading and transformation" +description = "Core library for the fastest, most comprehensive context optimization tool for AI coding agents" repository = "https://github.com/dean0x/skim" readme = "README.md" keywords = ["ast", "code-analysis", "tree-sitter", "llm"] diff --git a/crates/rskim/Cargo.toml b/crates/rskim/Cargo.toml index 5fe3bff..81bca3b 100644 --- a/crates/rskim/Cargo.toml +++ b/crates/rskim/Cargo.toml @@ -4,7 +4,7 @@ version = "1.0.0" edition = "2021" authors = ["Skim Contributors"] license = "MIT" -description = "Smart code reader - streaming code transformation for AI agents" +description = "The fastest, most comprehensive context optimization tool for AI coding agents. Code skimming, command rewriting, output compression." repository = "https://github.com/dean0x/skim" readme = "README.md" @@ -28,6 +28,8 @@ thiserror = { workspace = true } clap_complete = { workspace = true } strip-ansi-escapes = { workspace = true } regex = { workspace = true } +rusqlite = { workspace = true } +colored = { workspace = true } [dev-dependencies] assert_cmd = "2.0" diff --git a/crates/rskim/src/analytics/mod.rs b/crates/rskim/src/analytics/mod.rs new file mode 100644 index 0000000..5bc3127 --- /dev/null +++ b/crates/rskim/src/analytics/mod.rs @@ -0,0 +1,1187 @@ +//! Token analytics persistence layer. +//! +//! Records token savings from every skim invocation into a local SQLite +//! database (`~/.cache/skim/analytics.db`) and provides query functions +//! for the `skim stats` dashboard. +//! +//! ## Design +//! +//! - **SQLite + WAL mode** for concurrent read/write safety. +//! 
- **Fire-and-forget background threads** -- recording never blocks the +//! main processing pipeline. Token counting for analytics is deferred to +//! the background thread so the main thread pays zero BPE cost. +//! - **90-day auto-pruning** via [`AnalyticsDb::maybe_prune`], tracked in +//! the `analytics_meta` table (schema migration v2). +//! - **[`AnalyticsStore`] trait** abstracts query operations for testability; +//! test code can provide a mock without a real SQLite database. +//! - **Versioned schema migrations** in [`schema`] -- each migration is +//! idempotent and guarded by a `user_version` PRAGMA check. + +mod schema; + +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; + +use rusqlite::Connection; +use serde::Serialize; + +use crate::tokens; + +// ============================================================================ +// Types +// ============================================================================ + +/// Type of skim command that produced the savings. +#[derive(Debug, Clone, Copy)] +pub(crate) enum CommandType { + File, + Test, + Build, + Git, +} + +impl CommandType { + fn as_str(&self) -> &'static str { + match self { + CommandType::File => "file", + CommandType::Test => "test", + CommandType::Build => "build", + CommandType::Git => "git", + } + } +} + +/// A single token savings measurement. 
+pub(crate) struct TokenSavingsRecord { + pub(crate) timestamp: i64, + pub(crate) command_type: CommandType, + pub(crate) original_cmd: String, + pub(crate) raw_tokens: usize, + pub(crate) compressed_tokens: usize, + pub(crate) savings_pct: f32, + pub(crate) duration_ms: u64, + pub(crate) project_path: String, + pub(crate) mode: Option, + pub(crate) language: Option, + pub(crate) parse_tier: Option, +} + +// ============================================================================ +// Query result types +// ============================================================================ + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub(crate) struct AnalyticsSummary { + pub(crate) invocations: u64, + pub(crate) raw_tokens: u64, + pub(crate) compressed_tokens: u64, + pub(crate) tokens_saved: u64, + pub(crate) avg_savings_pct: f64, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub(crate) struct DailyStats { + pub(crate) date: String, + pub(crate) invocations: u64, + pub(crate) tokens_saved: u64, + pub(crate) avg_savings_pct: f64, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub(crate) struct CommandStats { + #[serde(rename = "type")] + pub(crate) command_type: String, + pub(crate) invocations: u64, + pub(crate) tokens_saved: u64, + pub(crate) avg_savings_pct: f64, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub(crate) struct LanguageStats { + pub(crate) language: String, + pub(crate) files: u64, + pub(crate) tokens_saved: u64, + pub(crate) avg_savings_pct: f64, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub(crate) struct ModeStats { + pub(crate) mode: String, + pub(crate) files: u64, + pub(crate) tokens_saved: u64, + pub(crate) avg_savings_pct: f64, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub(crate) struct TierDistribution { + pub(crate) full_pct: f64, + pub(crate) degraded_pct: f64, + pub(crate) passthrough_pct: f64, +} + +// ============================================================================ +// Pricing +// 
============================================================================ + +#[derive(Debug)] +pub(crate) struct PricingModel { + pub(crate) input_cost_per_mtok: f64, + pub(crate) model_name: &'static str, +} + +impl PricingModel { + pub(crate) fn default_pricing() -> Self { + Self { + input_cost_per_mtok: 3.0, + model_name: "claude-sonnet-4-6", + } + } + + pub(crate) fn from_env_or_default() -> Self { + if let Ok(val) = std::env::var("SKIM_INPUT_COST_PER_MTOK") { + if let Ok(cost) = val.parse::() { + if cost.is_finite() && cost >= 0.0 { + return Self { + input_cost_per_mtok: cost, + model_name: "custom", + }; + } + } + } + Self::default_pricing() + } + + pub(crate) fn estimate_savings(&self, tokens_saved: u64) -> f64 { + tokens_saved as f64 / 1_000_000.0 * self.input_cost_per_mtok + } +} + +// ============================================================================ +// Analytics enabled check +// ============================================================================ + +/// Process-wide flag to disable analytics without mutating environment +/// variables. Set via [`force_disable_analytics`] at startup, before any +/// background threads are spawned. Checked by [`is_analytics_enabled`]. +static ANALYTICS_FORCE_DISABLED: AtomicBool = AtomicBool::new(false); + +/// Disable analytics for the lifetime of this process. +/// +/// Thread-safe alternative to `std::env::set_var("SKIM_DISABLE_ANALYTICS", "1")`. +/// Call this early in `main()` when `--disable-analytics` is detected. +pub(crate) fn force_disable_analytics() { + ANALYTICS_FORCE_DISABLED.store(true, Ordering::Relaxed); +} + +/// Check if analytics recording is enabled. +/// +/// Returns `false` when: +/// - [`force_disable_analytics`] has been called (e.g., `--disable-analytics` flag), OR +/// - `SKIM_DISABLE_ANALYTICS` env var is set to a truthy value +/// (`1`, `true`, or `yes`, case-insensitive). +/// +/// Any other value (including `0`, `false`, `no`) keeps analytics enabled. 
+/// Unsetting the variable also keeps analytics enabled (the default). +pub(crate) fn is_analytics_enabled() -> bool { + if ANALYTICS_FORCE_DISABLED.load(Ordering::Relaxed) { + return false; + } + match std::env::var("SKIM_DISABLE_ANALYTICS") { + Ok(val) => !matches!(val.to_lowercase().as_str(), "1" | "true" | "yes"), + Err(_) => true, + } +} + +// ============================================================================ +// AnalyticsStore trait +// ============================================================================ + +/// Trait abstracting analytics query operations for testability. +/// +/// `AnalyticsDb` implements this trait directly. Test code can provide a +/// `MockStore` without requiring a real SQLite database. +pub(crate) trait AnalyticsStore { + fn query_summary(&self, since: Option) -> anyhow::Result; + fn query_daily(&self, since: Option) -> anyhow::Result>; + fn query_by_command(&self, since: Option) -> anyhow::Result>; + fn query_by_language(&self, since: Option) -> anyhow::Result>; + fn query_by_mode(&self, since: Option) -> anyhow::Result>; + fn query_tier_distribution(&self, since: Option) -> anyhow::Result; + fn clear(&self) -> anyhow::Result<()>; +} + +// ============================================================================ +// AnalyticsDb +// ============================================================================ + +pub(crate) struct AnalyticsDb { + conn: Connection, +} + +impl AnalyticsDb { + /// Open database at the given path, run migrations, enable WAL mode. + /// + /// On Unix, restricts file permissions to owner-only (0600) after + /// creation to prevent world-readable analytics data when the DB path + /// is outside the default 0700 cache directory. + pub(crate) fn open(path: &Path) -> anyhow::Result { + let conn = Connection::open(path)?; + + // Restrict DB file permissions to owner-only on Unix. 
+ #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + if let Ok(metadata) = std::fs::metadata(path) { + let mut perms = metadata.permissions(); + perms.set_mode(0o600); + let _ = std::fs::set_permissions(path, perms); + } + } + + conn.busy_timeout(Duration::from_millis(5000))?; + conn.execute_batch("PRAGMA journal_mode=WAL;")?; + schema::run_migrations(&conn)?; + Ok(Self { conn }) + } + + /// Open database at default location, or override with SKIM_ANALYTICS_DB env var. + pub(crate) fn open_default() -> anyhow::Result { + let path = if let Ok(override_path) = std::env::var("SKIM_ANALYTICS_DB") { + PathBuf::from(override_path) + } else { + crate::cache::get_cache_dir()?.join("analytics.db") + }; + Self::open(&path) + } + + /// Maximum length for the `original_cmd` column to prevent unbounded + /// DB growth from extremely long command strings. + const MAX_CMD_LEN: usize = 500; + + /// Record a token savings measurement. + /// + /// The `original_cmd` field is truncated to [`Self::MAX_CMD_LEN`] characters + /// before storage to bound database row size. + pub(crate) fn record(&self, r: &TokenSavingsRecord) -> anyhow::Result<()> { + let cmd = if r.original_cmd.len() > Self::MAX_CMD_LEN { + &r.original_cmd[..Self::MAX_CMD_LEN] + } else { + &r.original_cmd + }; + self.conn.execute( + "INSERT INTO token_savings (timestamp, command_type, original_cmd, raw_tokens, compressed_tokens, savings_pct, duration_ms, project_path, mode, language, parse_tier) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)", + rusqlite::params![ + r.timestamp, + r.command_type.as_str(), + cmd, + r.raw_tokens as i64, + r.compressed_tokens as i64, + r.savings_pct as f64, + r.duration_ms as i64, + r.project_path, + r.mode, + r.language, + r.parse_tier, + ], + )?; + Ok(()) + } + + /// Query aggregate summary. 
+ pub(crate) fn query_summary(&self, since: Option) -> anyhow::Result { + let (where_clause, params) = since_clause(since); + let sql = format!( + "SELECT COUNT(*), COALESCE(SUM(raw_tokens), 0), COALESCE(SUM(compressed_tokens), 0), COALESCE(AVG(savings_pct), 0) FROM token_savings {where_clause}" + ); + let mut stmt = self.conn.prepare(&sql)?; + let row = stmt.query_row(rusqlite::params_from_iter(params), |row| { + let invocations: u64 = row.get(0)?; + let raw_tokens: i64 = row.get(1)?; + let compressed_tokens: i64 = row.get(2)?; + let avg_savings_pct: f64 = row.get(3)?; + Ok(AnalyticsSummary { + invocations, + raw_tokens: raw_tokens as u64, + compressed_tokens: compressed_tokens as u64, + tokens_saved: (raw_tokens - compressed_tokens).max(0) as u64, + avg_savings_pct, + }) + })?; + Ok(row) + } + + /// Query daily breakdown. + pub(crate) fn query_daily(&self, since: Option) -> anyhow::Result> { + let (where_clause, params) = since_clause(since); + let sql = format!( + "SELECT date(timestamp, 'unixepoch') as day, COUNT(*), COALESCE(SUM(raw_tokens - compressed_tokens), 0), COALESCE(AVG(savings_pct), 0) FROM token_savings {where_clause} GROUP BY day ORDER BY day DESC" + ); + let mut stmt = self.conn.prepare(&sql)?; + let rows = stmt.query_map(rusqlite::params_from_iter(params), |row| { + Ok(DailyStats { + date: row.get(0)?, + invocations: row.get(1)?, + tokens_saved: row.get::<_, i64>(2)? as u64, + avg_savings_pct: row.get(3)?, + }) + })?; + rows.collect::, _>>().map_err(Into::into) + } + + /// Query breakdown by command type. 
+ pub(crate) fn query_by_command(&self, since: Option) -> anyhow::Result> { + let (where_clause, params) = since_clause(since); + let sql = format!( + "SELECT command_type, COUNT(*), COALESCE(SUM(raw_tokens - compressed_tokens), 0), COALESCE(AVG(savings_pct), 0) FROM token_savings {where_clause} GROUP BY command_type ORDER BY SUM(raw_tokens - compressed_tokens) DESC" + ); + let mut stmt = self.conn.prepare(&sql)?; + let rows = stmt.query_map(rusqlite::params_from_iter(params), |row| { + Ok(CommandStats { + command_type: row.get(0)?, + invocations: row.get(1)?, + tokens_saved: row.get::<_, i64>(2)? as u64, + avg_savings_pct: row.get(3)?, + }) + })?; + rows.collect::, _>>().map_err(Into::into) + } + + /// Query breakdown by language (file operations only). + pub(crate) fn query_by_language( + &self, + since: Option, + ) -> anyhow::Result> { + let (clause, params) = since_clause_with_extra(since, "language IS NOT NULL"); + let sql = format!( + "SELECT language, COUNT(*), COALESCE(SUM(raw_tokens - compressed_tokens), 0), COALESCE(AVG(savings_pct), 0) FROM token_savings {clause} GROUP BY language ORDER BY SUM(raw_tokens - compressed_tokens) DESC" + ); + let mut stmt = self.conn.prepare(&sql)?; + let rows = stmt.query_map(rusqlite::params_from_iter(params), |row| { + Ok(LanguageStats { + language: row.get(0)?, + files: row.get(1)?, + tokens_saved: row.get::<_, i64>(2)? as u64, + avg_savings_pct: row.get(3)?, + }) + })?; + rows.collect::, _>>().map_err(Into::into) + } + + /// Query breakdown by mode (file operations only). 
+ pub(crate) fn query_by_mode(&self, since: Option) -> anyhow::Result> { + let (clause, params) = since_clause_with_extra(since, "mode IS NOT NULL"); + let sql = format!( + "SELECT mode, COUNT(*), COALESCE(SUM(raw_tokens - compressed_tokens), 0), COALESCE(AVG(savings_pct), 0) FROM token_savings {clause} GROUP BY mode ORDER BY SUM(raw_tokens - compressed_tokens) DESC" + ); + let mut stmt = self.conn.prepare(&sql)?; + let rows = stmt.query_map(rusqlite::params_from_iter(params), |row| { + Ok(ModeStats { + mode: row.get(0)?, + files: row.get(1)?, + tokens_saved: row.get::<_, i64>(2)? as u64, + avg_savings_pct: row.get(3)?, + }) + })?; + rows.collect::, _>>().map_err(Into::into) + } + + /// Query parse tier distribution (command operations only). + pub(crate) fn query_tier_distribution( + &self, + since: Option, + ) -> anyhow::Result { + let (clause, params) = since_clause_with_extra(since, "parse_tier IS NOT NULL"); + let sql = format!( + "SELECT COALESCE(SUM(CASE WHEN parse_tier = 'full' THEN 1 ELSE 0 END), 0), \ + COALESCE(SUM(CASE WHEN parse_tier = 'degraded' THEN 1 ELSE 0 END), 0), \ + COALESCE(SUM(CASE WHEN parse_tier = 'passthrough' THEN 1 ELSE 0 END), 0), \ + COUNT(*) FROM token_savings {clause}" + ); + let mut stmt = self.conn.prepare(&sql)?; + let row = stmt.query_row(rusqlite::params_from_iter(params), |row| { + let full: i64 = row.get(0)?; + let degraded: i64 = row.get(1)?; + let passthrough: i64 = row.get(2)?; + let total: i64 = row.get(3)?; + let t = if total > 0 { total as f64 } else { 1.0 }; + Ok(TierDistribution { + full_pct: full as f64 / t * 100.0, + degraded_pct: degraded as f64 / t * 100.0, + passthrough_pct: passthrough as f64 / t * 100.0, + }) + })?; + Ok(row) + } + + /// Prune records older than N days. 
+ pub(crate) fn prune_older_than(&self, days: u64) -> anyhow::Result { + let cutoff = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() as i64 + - (days as i64 * 86400); + let count = self + .conn + .execute("DELETE FROM token_savings WHERE timestamp < ?1", [cutoff])?; + Ok(count) + } + + /// Prune if last prune was >24h ago. Uses the `analytics_meta` table + /// (created by schema migration v2) for tracking. + pub(crate) fn maybe_prune(&self) { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + let last_prune: i64 = self + .conn + .query_row( + "SELECT COALESCE((SELECT value FROM analytics_meta WHERE key = 'last_prune'), 0)", + [], + |row| row.get(0), + ) + .unwrap_or(0); + + if now as i64 - last_prune > 86400 && self.prune_older_than(90).is_ok() { + let _ = self.conn.execute( + "INSERT OR REPLACE INTO analytics_meta (key, value) VALUES ('last_prune', ?1)", + [now as i64], + ); + } + } + + /// Delete all analytics data. + fn clear_data(&self) -> anyhow::Result<()> { + self.conn.execute("DELETE FROM token_savings", [])?; + Ok(()) + } +} + +impl AnalyticsStore for AnalyticsDb { + fn query_summary(&self, since: Option) -> anyhow::Result { + self.query_summary(since) + } + fn query_daily(&self, since: Option) -> anyhow::Result> { + self.query_daily(since) + } + fn query_by_command(&self, since: Option) -> anyhow::Result> { + self.query_by_command(since) + } + fn query_by_language(&self, since: Option) -> anyhow::Result> { + self.query_by_language(since) + } + fn query_by_mode(&self, since: Option) -> anyhow::Result> { + self.query_by_mode(since) + } + fn query_tier_distribution(&self, since: Option) -> anyhow::Result { + self.query_tier_distribution(since) + } + fn clear(&self) -> anyhow::Result<()> { + self.clear_data() + } +} + +/// Build WHERE clause for optional since filter. 
+fn since_clause(since: Option<i64>) -> (String, Vec<i64>) {
+    match since {
+        Some(ts) => ("WHERE timestamp >= ?1".to_string(), vec![ts]),
+        None => (String::new(), vec![]),
+    }
+}
+
+/// Build WHERE clause with an optional extra condition appended.
+///
+/// Composes the `since` filter with an additional SQL predicate (e.g.
+/// `"language IS NOT NULL"`). The extra condition is AND-ed to the since
+/// clause when present, or becomes its own WHERE clause when since is None.
+fn since_clause_with_extra(since: Option<i64>, extra_condition: &str) -> (String, Vec<i64>) {
+    let (base, params) = since_clause(since);
+    let clause = if base.is_empty() {
+        format!("WHERE {extra_condition}")
+    } else {
+        format!("{base} AND {extra_condition}")
+    };
+    (clause, params)
+}
+
+// ============================================================================
+// Fire-and-forget recording functions
+// ============================================================================
+
+/// Compute token savings as a percentage (0.0 when raw_tokens is zero).
+pub(crate) fn savings_percentage(raw_tokens: usize, compressed_tokens: usize) -> f32 {
+    if raw_tokens == 0 {
+        0.0
+    } else {
+        (raw_tokens as f32 - compressed_tokens as f32) / raw_tokens as f32 * 100.0
+    }
+}
+
+/// Current Unix timestamp in seconds.
+pub(crate) fn now_unix_secs() -> i64 {
+    std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_default()
+        .as_secs() as i64
+}
+
+/// Persist a record to the default database, with auto-pruning.
+fn persist_record(record: &TokenSavingsRecord) {
+    if let Ok(db) = AnalyticsDb::open_default() {
+        let _ = db.record(record);
+        db.maybe_prune();
+    }
+}
+
+/// Record command output token savings. Defers token counting to background thread.
+/// Check is_analytics_enabled() BEFORE cloning strings.
+pub(crate) fn record_fire_and_forget(
+    raw_text: String,
+    compressed_text: String,
+    original_cmd: String,
+    command_type: CommandType,
+    duration: Duration,
+    project_path: String,
+    parse_tier: Option<String>,
+) {
+    if !is_analytics_enabled() {
+        return;
+    }
+    std::thread::spawn(move || {
+        let Ok(raw_tokens) = tokens::count_tokens(&raw_text) else {
+            return;
+        };
+        let Ok(comp_tokens) = tokens::count_tokens(&compressed_text) else {
+            return;
+        };
+        let record = TokenSavingsRecord {
+            timestamp: now_unix_secs(),
+            command_type,
+            original_cmd,
+            raw_tokens,
+            compressed_tokens: comp_tokens,
+            savings_pct: savings_percentage(raw_tokens, comp_tokens),
+            duration_ms: duration.as_millis() as u64,
+            project_path,
+            mode: None,
+            language: None,
+            parse_tier,
+        };
+        persist_record(&record);
+    });
+}
+
+/// Record file operation token savings where counts are already known.
+///
+/// Accepts a fully-constructed [`TokenSavingsRecord`] and persists it on
+/// a background thread. The `timestamp` and `savings_pct` fields should
+/// be populated by the caller (use [`now_unix_secs`] and
+/// [`savings_percentage`] helpers).
+pub(crate) fn record_with_counts(record: TokenSavingsRecord) {
+    if !is_analytics_enabled() {
+        return;
+    }
+    std::thread::spawn(move || {
+        persist_record(&record);
+    });
+}
+
+// ============================================================================
+// Convenience helpers for subcommand call sites
+// ============================================================================
+
+/// Record command output analytics with automatic enabled-check and cwd detection.
+///
+/// Reduces the 12-15 line inline pattern at each subcommand call site to a
+/// single function call. Token counting is deferred to a background thread.
+pub(crate) fn try_record_command( + raw_text: String, + compressed_text: String, + original_cmd: String, + command_type: CommandType, + duration: Duration, + parse_tier: Option<&str>, +) { + if !is_analytics_enabled() { + return; + } + let cwd = std::env::current_dir() + .unwrap_or_default() + .display() + .to_string(); + record_fire_and_forget( + raw_text, + compressed_text, + original_cmd, + command_type, + duration, + cwd, + parse_tier.map(|s| s.to_string()), + ); +} + +/// Record command output analytics when token counts are already known. +/// +/// Use this instead of [`try_record_command`] when the caller has already +/// computed token counts (e.g., via `--show-stats`), avoiding redundant +/// re-tokenization in the background thread. +/// +/// Delegates to [`record_with_counts`] after resolving cwd and building +/// the record. +#[allow(dead_code)] +pub(crate) fn try_record_command_with_counts( + raw_tokens: usize, + compressed_tokens: usize, + original_cmd: String, + command_type: CommandType, + duration: Duration, + parse_tier: Option<&str>, +) { + if !is_analytics_enabled() { + return; + } + let cwd = std::env::current_dir() + .unwrap_or_default() + .display() + .to_string(); + record_with_counts(TokenSavingsRecord { + timestamp: now_unix_secs(), + command_type, + original_cmd, + raw_tokens, + compressed_tokens, + savings_pct: savings_percentage(raw_tokens, compressed_tokens), + duration_ms: duration.as_millis() as u64, + project_path: cwd, + mode: None, + language: None, + parse_tier: parse_tier.map(|s| s.to_string()), + }); +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + /// Create a test database backed by a temporary file. + /// + /// Returns both the `AnalyticsDb` and the `NamedTempFile` handle. 
The + /// caller must keep the `NamedTempFile` alive for the duration of the + /// test -- dropping it deletes the underlying file, which would + /// invalidate the database connection. + fn test_db() -> (AnalyticsDb, NamedTempFile) { + let tmp = NamedTempFile::new().unwrap(); + let db = AnalyticsDb::open(tmp.path()).unwrap(); + (db, tmp) + } + + fn sample_record() -> TokenSavingsRecord { + TokenSavingsRecord { + timestamp: 1711300000, + command_type: CommandType::File, + original_cmd: "skim src/main.rs".to_string(), + raw_tokens: 1000, + compressed_tokens: 200, + savings_pct: 80.0, + duration_ms: 15, + project_path: "/tmp/test".to_string(), + mode: Some("structure".to_string()), + language: Some("rust".to_string()), + parse_tier: None, + } + } + + #[test] + fn test_open_creates_tables() { + let (db, _tmp) = test_db(); + let count: i64 = db + .conn + .query_row("SELECT COUNT(*) FROM token_savings", [], |row| row.get(0)) + .unwrap(); + assert_eq!(count, 0); + } + + #[test] + fn test_record_and_query_summary() { + let (db, _tmp) = test_db(); + db.record(&sample_record()).unwrap(); + + let summary = db.query_summary(None).unwrap(); + assert_eq!(summary.invocations, 1); + assert_eq!(summary.raw_tokens, 1000); + assert_eq!(summary.compressed_tokens, 200); + assert_eq!(summary.tokens_saved, 800); + } + + #[test] + fn test_daily_breakdown_groups_correctly() { + let (db, _tmp) = test_db(); + // Two records on same day + let mut r1 = sample_record(); + r1.timestamp = 1711300000; + db.record(&r1).unwrap(); + + let mut r2 = sample_record(); + r2.timestamp = 1711300100; + db.record(&r2).unwrap(); + + // One record on different day + let mut r3 = sample_record(); + r3.timestamp = 1711300000 + 86400; + db.record(&r3).unwrap(); + + let daily = db.query_daily(None).unwrap(); + assert_eq!(daily.len(), 2); + } + + #[test] + fn test_command_breakdown() { + let (db, _tmp) = test_db(); + let mut r1 = sample_record(); + r1.command_type = CommandType::File; + db.record(&r1).unwrap(); + + 
let mut r2 = sample_record(); + r2.command_type = CommandType::Test; + db.record(&r2).unwrap(); + + let by_cmd = db.query_by_command(None).unwrap(); + assert_eq!(by_cmd.len(), 2); + } + + #[test] + fn test_prune_removes_old_records() { + let (db, _tmp) = test_db(); + // Record from 100 days ago + let mut r = sample_record(); + r.timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() as i64 + - (100 * 86400); + db.record(&r).unwrap(); + + // Record from today + let mut r2 = sample_record(); + r2.timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() as i64; + db.record(&r2).unwrap(); + + let pruned = db.prune_older_than(90).unwrap(); + assert_eq!(pruned, 1); + + let summary = db.query_summary(None).unwrap(); + assert_eq!(summary.invocations, 1); + } + + #[test] + fn test_wal_mode_enabled() { + let (db, _tmp) = test_db(); + let mode: String = db + .conn + .query_row("PRAGMA journal_mode", [], |row| row.get(0)) + .unwrap(); + assert_eq!(mode, "wal"); + } + + #[test] + fn test_clear_deletes_all() { + let (db, _tmp) = test_db(); + db.record(&sample_record()).unwrap(); + db.record(&sample_record()).unwrap(); + db.clear().unwrap(); + let summary = db.query_summary(None).unwrap(); + assert_eq!(summary.invocations, 0); + } + + #[test] + fn test_language_breakdown() { + let (db, _tmp) = test_db(); + let mut r1 = sample_record(); + r1.language = Some("rust".to_string()); + db.record(&r1).unwrap(); + + let mut r2 = sample_record(); + r2.language = Some("typescript".to_string()); + db.record(&r2).unwrap(); + + let by_lang = db.query_by_language(None).unwrap(); + assert_eq!(by_lang.len(), 2); + } + + #[test] + fn test_mode_breakdown() { + let (db, _tmp) = test_db(); + let mut r1 = sample_record(); + r1.mode = Some("structure".to_string()); + db.record(&r1).unwrap(); + + let mut r2 = sample_record(); + r2.mode = Some("signatures".to_string()); + db.record(&r2).unwrap(); + + let 
by_mode = db.query_by_mode(None).unwrap(); + assert_eq!(by_mode.len(), 2); + } + + #[test] + fn test_tier_distribution() { + let (db, _tmp) = test_db(); + for tier in &["full", "full", "full", "degraded", "passthrough"] { + let mut r = sample_record(); + r.parse_tier = Some(tier.to_string()); + r.mode = None; + r.language = None; + db.record(&r).unwrap(); + } + let dist = db.query_tier_distribution(None).unwrap(); + assert!((dist.full_pct - 60.0).abs() < 0.1); + assert!((dist.degraded_pct - 20.0).abs() < 0.1); + assert!((dist.passthrough_pct - 20.0).abs() < 0.1); + } + + #[test] + fn test_pricing_default() { + let p = PricingModel::default_pricing(); + assert_eq!(p.input_cost_per_mtok, 3.0); + assert_eq!(p.model_name, "claude-sonnet-4-6"); + } + + #[test] + fn test_estimate_calculation() { + let p = PricingModel::default_pricing(); + let savings = p.estimate_savings(1_000_000); + assert!((savings - 3.0).abs() < 0.001); + } + + #[test] + fn test_since_filter() { + let (db, _tmp) = test_db(); + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() as i64; + + let mut old = sample_record(); + old.timestamp = now - 86400 * 10; // 10 days ago + db.record(&old).unwrap(); + + let mut recent = sample_record(); + recent.timestamp = now - 3600; // 1 hour ago + db.record(&recent).unwrap(); + + let summary = db.query_summary(Some(now - 86400)).unwrap(); + assert_eq!(summary.invocations, 1); + } + + // ======================================================================== + // is_analytics_enabled() tests + // ======================================================================== + + /// Run a closure with `SKIM_DISABLE_ANALYTICS` set to the given value, + /// then restore the original environment. Uses a mutex to prevent + /// concurrent env-var mutations from interfering between tests. 
+ fn with_env_var(value: Option<&str>, f: impl FnOnce()) { + use std::sync::Mutex; + static ENV_LOCK: Mutex<()> = Mutex::new(()); + let _guard = ENV_LOCK.lock().unwrap(); + + let prev = std::env::var("SKIM_DISABLE_ANALYTICS").ok(); + match value { + Some(v) => std::env::set_var("SKIM_DISABLE_ANALYTICS", v), + None => std::env::remove_var("SKIM_DISABLE_ANALYTICS"), + } + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(f)); + match prev { + Some(v) => std::env::set_var("SKIM_DISABLE_ANALYTICS", v), + None => std::env::remove_var("SKIM_DISABLE_ANALYTICS"), + } + if let Err(e) = result { + std::panic::resume_unwind(e); + } + } + + #[test] + fn test_analytics_enabled_when_env_unset() { + with_env_var(None, || { + assert!( + is_analytics_enabled(), + "analytics should be enabled when SKIM_DISABLE_ANALYTICS is unset" + ); + }); + } + + #[test] + fn test_analytics_disabled_with_value_1() { + with_env_var(Some("1"), || { + assert!( + !is_analytics_enabled(), + "analytics should be disabled when SKIM_DISABLE_ANALYTICS=1" + ); + }); + } + + #[test] + fn test_analytics_disabled_with_value_true() { + with_env_var(Some("true"), || { + assert!( + !is_analytics_enabled(), + "analytics should be disabled when SKIM_DISABLE_ANALYTICS=true" + ); + }); + } + + #[test] + fn test_analytics_disabled_with_value_yes() { + with_env_var(Some("yes"), || { + assert!( + !is_analytics_enabled(), + "analytics should be disabled when SKIM_DISABLE_ANALYTICS=yes" + ); + }); + } + + #[test] + fn test_analytics_disabled_case_insensitive() { + with_env_var(Some("TRUE"), || { + assert!( + !is_analytics_enabled(), + "analytics should be disabled when SKIM_DISABLE_ANALYTICS=TRUE (case-insensitive)" + ); + }); + } + + #[test] + fn test_analytics_enabled_with_value_0() { + with_env_var(Some("0"), || { + assert!( + is_analytics_enabled(), + "analytics should remain enabled when SKIM_DISABLE_ANALYTICS=0" + ); + }); + } + + #[test] + fn test_analytics_enabled_with_value_false() { + 
with_env_var(Some("false"), || { + assert!( + is_analytics_enabled(), + "analytics should remain enabled when SKIM_DISABLE_ANALYTICS=false" + ); + }); + } + + #[test] + fn test_analytics_enabled_with_value_no() { + with_env_var(Some("no"), || { + assert!( + is_analytics_enabled(), + "analytics should remain enabled when SKIM_DISABLE_ANALYTICS=no" + ); + }); + } + + #[test] + fn test_analytics_enabled_with_empty_string() { + with_env_var(Some(""), || { + assert!( + is_analytics_enabled(), + "analytics should remain enabled when SKIM_DISABLE_ANALYTICS is empty" + ); + }); + } + + // ======================================================================== + // Pricing validation tests + // ======================================================================== + + /// Run a closure with `SKIM_INPUT_COST_PER_MTOK` set to the given value, + /// then restore the original environment. Uses the same mutex as + /// `with_env_var` to prevent concurrent env-var mutations. + fn with_cost_env_var(value: Option<&str>, f: impl FnOnce()) { + use std::sync::Mutex; + static COST_ENV_LOCK: Mutex<()> = Mutex::new(()); + let _guard = COST_ENV_LOCK.lock().unwrap(); + + let prev = std::env::var("SKIM_INPUT_COST_PER_MTOK").ok(); + match value { + Some(v) => std::env::set_var("SKIM_INPUT_COST_PER_MTOK", v), + None => std::env::remove_var("SKIM_INPUT_COST_PER_MTOK"), + } + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(f)); + match prev { + Some(v) => std::env::set_var("SKIM_INPUT_COST_PER_MTOK", v), + None => std::env::remove_var("SKIM_INPUT_COST_PER_MTOK"), + } + if let Err(e) = result { + std::panic::resume_unwind(e); + } + } + + #[test] + fn test_pricing_negative_falls_back_to_default() { + with_cost_env_var(Some("-5"), || { + let p = PricingModel::from_env_or_default(); + assert_eq!( + p.input_cost_per_mtok, 3.0, + "negative cost should fall back to default" + ); + assert_eq!(p.model_name, "claude-sonnet-4-6"); + }); + } + + #[test] + fn 
test_pricing_zero_is_valid() { + with_cost_env_var(Some("0"), || { + let p = PricingModel::from_env_or_default(); + assert_eq!(p.input_cost_per_mtok, 0.0, "zero cost should be accepted"); + assert_eq!(p.model_name, "custom"); + }); + } + + #[test] + fn test_pricing_infinity_falls_back_to_default() { + with_cost_env_var(Some("inf"), || { + let p = PricingModel::from_env_or_default(); + assert_eq!( + p.input_cost_per_mtok, 3.0, + "infinite cost should fall back to default" + ); + assert_eq!(p.model_name, "claude-sonnet-4-6"); + }); + } + + #[test] + fn test_pricing_nan_falls_back_to_default() { + with_cost_env_var(Some("NaN"), || { + let p = PricingModel::from_env_or_default(); + assert_eq!( + p.input_cost_per_mtok, 3.0, + "NaN cost should fall back to default" + ); + assert_eq!(p.model_name, "claude-sonnet-4-6"); + }); + } + + // ======================================================================== + // Schema migration v2 test + // ======================================================================== + + #[test] + fn test_analytics_meta_table_created_by_migration() { + let (db, _tmp) = test_db(); + // analytics_meta should exist from the v2 migration + let count: i64 = db + .conn + .query_row( + "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='analytics_meta'", + [], + |row| row.get(0), + ) + .unwrap(); + assert_eq!( + count, 1, + "analytics_meta table should be created by migration" + ); + } + + // ======================================================================== + // force_disable_analytics / AtomicBool tests + // ======================================================================== + + #[test] + fn test_force_disable_analytics_disables() { + // Reset state (tests share the process-wide atomic) + ANALYTICS_FORCE_DISABLED.store(false, Ordering::Relaxed); + + // Should be enabled by default (assuming env var not set by another test) + with_env_var(None, || { + assert!( + is_analytics_enabled(), + "should be enabled before 
force_disable" + ); + force_disable_analytics(); + assert!( + !is_analytics_enabled(), + "should be disabled after force_disable" + ); + }); + + // Reset to not pollute other tests + ANALYTICS_FORCE_DISABLED.store(false, Ordering::Relaxed); + } + + // ======================================================================== + // original_cmd truncation test + // ======================================================================== + + #[test] + fn test_record_truncates_long_original_cmd() { + let (db, _tmp) = test_db(); + let mut r = sample_record(); + r.original_cmd = "x".repeat(1000); + db.record(&r).unwrap(); + + let stored: String = db + .conn + .query_row("SELECT original_cmd FROM token_savings", [], |row| { + row.get(0) + }) + .unwrap(); + assert_eq!( + stored.len(), + AnalyticsDb::MAX_CMD_LEN, + "original_cmd should be truncated to {} chars", + AnalyticsDb::MAX_CMD_LEN + ); + } + + // ======================================================================== + // DB file permissions test (Unix only) + // ======================================================================== + + #[cfg(unix)] + #[test] + fn test_db_file_permissions_are_owner_only() { + use std::os::unix::fs::PermissionsExt; + + let tmp = NamedTempFile::new().unwrap(); + let _db = AnalyticsDb::open(tmp.path()).unwrap(); + + let perms = std::fs::metadata(tmp.path()).unwrap().permissions(); + let mode = perms.mode() & 0o777; + assert_eq!( + mode, 0o600, + "DB file should have 0600 permissions, got {:o}", + mode + ); + } + + // ======================================================================== + // since_clause_with_extra helper test + // ======================================================================== + + #[test] + fn test_since_clause_with_extra_no_since() { + let (clause, params) = since_clause_with_extra(None, "language IS NOT NULL"); + assert_eq!(clause, "WHERE language IS NOT NULL"); + assert!(params.is_empty()); + } + + #[test] + fn test_since_clause_with_extra_with_since() 
{ + let (clause, params) = since_clause_with_extra(Some(12345), "mode IS NOT NULL"); + assert_eq!(clause, "WHERE timestamp >= ?1 AND mode IS NOT NULL"); + assert_eq!(params, vec![12345]); + } +} diff --git a/crates/rskim/src/analytics/schema.rs b/crates/rskim/src/analytics/schema.rs new file mode 100644 index 0000000..4a46661 --- /dev/null +++ b/crates/rskim/src/analytics/schema.rs @@ -0,0 +1,42 @@ +//! Database schema and migrations for analytics. + +use rusqlite::Connection; + +/// Run all database migrations. +pub(super) fn run_migrations(conn: &Connection) -> anyhow::Result<()> { + let version: i64 = conn.query_row("PRAGMA user_version", [], |row| row.get(0))?; + + if version < 1 { + conn.execute_batch( + "CREATE TABLE IF NOT EXISTS token_savings ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp INTEGER NOT NULL, + command_type TEXT NOT NULL, + original_cmd TEXT NOT NULL, + raw_tokens INTEGER NOT NULL, + compressed_tokens INTEGER NOT NULL, + savings_pct REAL NOT NULL, + duration_ms INTEGER NOT NULL, + project_path TEXT NOT NULL, + mode TEXT, + language TEXT, + parse_tier TEXT + ); + CREATE INDEX IF NOT EXISTS idx_ts_timestamp ON token_savings(timestamp); + CREATE INDEX IF NOT EXISTS idx_ts_command_type ON token_savings(command_type); + PRAGMA user_version = 1;", + )?; + } + + if version < 2 { + conn.execute_batch( + "CREATE TABLE IF NOT EXISTS analytics_meta ( + key TEXT PRIMARY KEY, + value INTEGER + ); + PRAGMA user_version = 2;", + )?; + } + + Ok(()) +} diff --git a/crates/rskim/src/cache.rs b/crates/rskim/src/cache.rs index 8fdf5b6..ef520ec 100644 --- a/crates/rskim/src/cache.rs +++ b/crates/rskim/src/cache.rs @@ -204,7 +204,8 @@ pub(crate) fn clear_cache() -> Result<()> { for entry in fs::read_dir(&cache_dir)? { let entry = entry?; let path = entry.path(); - if path.is_file() { + // Only remove JSON cache files; skip analytics.db and other non-cache files. 
+ if path.is_file() && path.extension().is_some_and(|ext| ext == "json") { // Best-effort removal; ignore errors from concurrent access. let _ = fs::remove_file(&path); } diff --git a/crates/rskim/src/cmd/build/cargo.rs b/crates/rskim/src/cmd/build/cargo.rs index cb181a6..070a3db 100644 --- a/crates/rskim/src/cmd/build/cargo.rs +++ b/crates/rskim/src/cmd/build/cargo.rs @@ -17,7 +17,8 @@ use std::sync::LazyLock; use regex::Regex; -use super::{inject_flag_before_separator, run_parsed_command, user_has_flag}; +use super::run_parsed_command; +use crate::cmd::{inject_flag_before_separator, user_has_flag}; use crate::output::canonical::BuildResult; use crate::output::ParseResult; use crate::runner::CommandOutput; @@ -43,11 +44,11 @@ static CARGO_ERROR_LINE_RE: LazyLock = /// /// Injects `--message-format=json` if not already set by the user, then /// parses the NDJSON output through the three-tier parser. -pub(crate) fn run(args: &[String]) -> anyhow::Result { +pub(crate) fn run(args: &[String], show_stats: bool) -> anyhow::Result { let mut full_args = vec!["build".to_string()]; full_args.extend_from_slice(args); - if !user_has_flag(&full_args, "--message-format") { + if !user_has_flag(&full_args, &["--message-format"]) { inject_flag_before_separator(&mut full_args, "--message-format=json"); } @@ -56,6 +57,7 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { &full_args, &[("CARGO_TERM_COLOR", "never")], "install Rust from https://rustup.rs", + show_stats, parse, ) } @@ -64,11 +66,11 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { /// /// Same JSON injection and parsing as cargo build, but with clippy-specific /// grouping of warnings by lint rule code. 
-pub(crate) fn run_clippy(args: &[String]) -> anyhow::Result { +pub(crate) fn run_clippy(args: &[String], show_stats: bool) -> anyhow::Result { let mut full_args = vec!["clippy".to_string()]; full_args.extend_from_slice(args); - if !user_has_flag(&full_args, "--message-format") { + if !user_has_flag(&full_args, &["--message-format"]) { inject_flag_before_separator(&mut full_args, "--message-format=json"); } @@ -77,6 +79,7 @@ pub(crate) fn run_clippy(args: &[String]) -> anyhow::Result { &full_args, &[("CARGO_TERM_COLOR", "never")], "install Rust from https://rustup.rs", + show_stats, parse, ) } @@ -370,7 +373,7 @@ mod tests { // If user already has --message-format=json2, we should not inject our own let args = vec!["--message-format=json2".to_string()]; assert!( - user_has_flag(&args, "--message-format"), + user_has_flag(&args, &["--message-format"]), "should detect existing --message-format flag" ); } @@ -388,7 +391,7 @@ mod tests { // Verify flag detection prevents injection let user_args = vec!["build".to_string(), "--message-format=short".to_string()]; assert!( - user_has_flag(&user_args, "--message-format"), + user_has_flag(&user_args, &["--message-format"]), "should detect user's --message-format flag" ); @@ -471,12 +474,12 @@ mod tests { #[test] fn test_user_has_flag_present() { let args = vec!["build".to_string(), "--message-format=json2".to_string()]; - assert!(user_has_flag(&args, "--message-format")); + assert!(user_has_flag(&args, &["--message-format"])); } #[test] fn test_user_has_flag_absent() { let args = vec!["build".to_string(), "--release".to_string()]; - assert!(!user_has_flag(&args, "--message-format")); + assert!(!user_has_flag(&args, &["--message-format"])); } } diff --git a/crates/rskim/src/cmd/build/mod.rs b/crates/rskim/src/cmd/build/mod.rs index 9c6ef34..9559686 100644 --- a/crates/rskim/src/cmd/build/mod.rs +++ b/crates/rskim/src/cmd/build/mod.rs @@ -28,13 +28,17 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { return 
Ok(ExitCode::SUCCESS); } - let sub = args.first().map(String::as_str); - let remaining = if args.len() > 1 { &args[1..] } else { &[] }; + let (filtered_args, show_stats) = crate::cmd::extract_show_stats(args); + + let (sub, remaining) = match filtered_args.split_first() { + Some((first, rest)) => (Some(first.as_str()), rest), + None => (None, [].as_slice()), + }; match sub { - Some("cargo") => cargo::run(remaining), - Some("clippy") => cargo::run_clippy(remaining), - Some("tsc") => tsc::run(remaining), + Some("cargo") => cargo::run(remaining, show_stats), + Some("clippy") => cargo::run_clippy(remaining, show_stats), + Some("tsc") => tsc::run(remaining, show_stats), Some(unknown) => { anyhow::bail!( "unknown build tool: '{unknown}'\n\n\ @@ -72,32 +76,7 @@ fn print_help() { println!(" skim build tsc --noEmit"); } -// ============================================================================ -// Shared helpers -// ============================================================================ - -/// Check whether user already passed a flag matching the given prefix. -/// -/// Returns `true` if any arg equals `prefix` exactly or starts with -/// `prefix=` (e.g., `--message-format=json`). This avoids false positives -/// from hypothetical flags that share a common prefix. -pub(super) fn user_has_flag(args: &[String], prefix: &str) -> bool { - args.iter() - .any(|a| a == prefix || a.starts_with(&format!("{prefix}="))) -} - -/// Inject a flag before the `--` separator, or at the end if no separator exists. -/// -/// This ensures injected flags (like `--message-format=json`) appear in the -/// correct position relative to any `--` separator that separates cargo flags -/// from rustc flags. 
-pub(super) fn inject_flag_before_separator(args: &mut Vec, flag: &str) { - if let Some(pos) = args.iter().position(|a| a == "--") { - args.insert(pos, flag.to_string()); - } else { - args.push(flag.to_string()); - } -} +// Shared helpers (user_has_flag, inject_flag_before_separator) are in crate::cmd /// Execute an external command, parse its output, and emit the result. /// @@ -118,6 +97,7 @@ pub(super) fn run_parsed_command( args: &[String], env_vars: &[(&str, &str)], install_hint: &str, + show_stats: bool, parser: fn(&CommandOutput) -> ParseResult, ) -> anyhow::Result { let runner = CommandRunner::new(Some(Duration::from_secs(600))); @@ -138,6 +118,14 @@ pub(super) fn run_parsed_command( } }; + // Strip ANSI escape codes before parsing. Some build tools emit color codes + // even with NO_COLOR=1, matching the shared run_parsed_command_with_mode pattern. + let output = CommandOutput { + stdout: crate::output::strip_ansi(&output.stdout), + stderr: crate::output::strip_ansi(&output.stderr), + ..output + }; + let result = parser(&output); // Emit markers to stderr (warnings, notices) @@ -151,6 +139,19 @@ pub(super) fn run_parsed_command( println!("{content}"); } + // Combine stdout+stderr for stats and analytics + let raw_text = if output.stderr.is_empty() { + output.stdout.clone() + } else { + format!("{}\n{}", output.stdout, output.stderr) + }; + + // Report token stats if requested + if show_stats { + let (orig, comp) = crate::process::count_token_pair(&raw_text, result.content()); + crate::process::report_token_stats(orig, comp, ""); + } + // Determine exit code from the parsed result let exit_code = match &result { ParseResult::Full(build_result) | ParseResult::Degraded(build_result, _) => { @@ -169,5 +170,18 @@ pub(super) fn run_parsed_command( } }; + // Record analytics (fire-and-forget, non-blocking). + // Guard to avoid .to_string() allocation when analytics are disabled. 
+ if crate::analytics::is_analytics_enabled() { + crate::analytics::try_record_command( + raw_text, + result.content().to_string(), + format!("skim build {program} {}", args.join(" ")), + crate::analytics::CommandType::Build, + output.duration, + Some(result.tier_name()), + ); + } + Ok(exit_code) } diff --git a/crates/rskim/src/cmd/build/tsc.rs b/crates/rskim/src/cmd/build/tsc.rs index 6140cec..e678096 100644 --- a/crates/rskim/src/cmd/build/tsc.rs +++ b/crates/rskim/src/cmd/build/tsc.rs @@ -28,15 +28,13 @@ use crate::runner::CommandOutput; /// /// tsc writes errors to stderr in its standard format. No flag injection /// is needed (unlike cargo). -pub(crate) fn run(args: &[String]) -> anyhow::Result { - let mut full_args = Vec::new(); - full_args.extend_from_slice(args); - +pub(crate) fn run(args: &[String], show_stats: bool) -> anyhow::Result { run_parsed_command( "tsc", - &full_args, + args, &[], "npm install -g typescript", + show_stats, parse_tsc, ) } diff --git a/crates/rskim/src/cmd/git.rs b/crates/rskim/src/cmd/git.rs index f3a7a9b..fc4ad77 100644 --- a/crates/rskim/src/cmd/git.rs +++ b/crates/rskim/src/cmd/git.rs @@ -10,6 +10,7 @@ use std::sync::LazyLock; use regex::Regex; +use crate::cmd::user_has_flag; use crate::output::canonical::GitResult; use crate::runner::CommandRunner; @@ -43,7 +44,9 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { return Ok(ExitCode::SUCCESS); } - let (global_flags, rest) = split_global_flags(args); + let (filtered_args, show_stats) = crate::cmd::extract_show_stats(args); + + let (global_flags, rest) = split_global_flags(&filtered_args); let Some(subcmd) = rest.first() else { print_help(); @@ -53,9 +56,9 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { let subcmd_args = &rest[1..]; match subcmd.as_str() { - "status" => run_status(&global_flags, subcmd_args), - "diff" => run_diff(&global_flags, subcmd_args), - "log" => run_log(&global_flags, subcmd_args), + "status" => run_status(&global_flags, subcmd_args, 
show_stats), + "diff" => run_diff(&global_flags, subcmd_args, show_stats), + "log" => run_log(&global_flags, subcmd_args, show_stats), other => { anyhow::bail!( "unknown git subcommand: '{other}'\n\n\ @@ -156,19 +159,6 @@ fn split_global_flags(args: &[String]) -> (Vec, Vec) { // Helpers // ============================================================================ -/// Check whether any of `flags` appears in `args`. -/// -/// Supports both exact matches (`--oneline`) and prefix matches with `=` -/// (`--format=%H` matches `--format`). This is consistent with the rewrite -/// engine's `skip_if_flag_prefix` behavior. -fn user_has_flag(args: &[String], flags: &[&str]) -> bool { - args.iter().any(|a| { - flags - .iter() - .any(|&f| a.as_str() == f || a.starts_with(&format!("{f}="))) - }) -} - /// Check whether the user has specified a limit flag (`-n`, `--max-count`). fn has_limit_flag(args: &[String]) -> bool { args.iter() @@ -176,7 +166,7 @@ fn has_limit_flag(args: &[String]) -> bool { } /// Convert an optional exit code to an ExitCode. -fn exit_code_to_process(code: Option) -> ExitCode { +fn map_exit_code(code: Option) -> ExitCode { match code { Some(0) => ExitCode::SUCCESS, _ => ExitCode::FAILURE, @@ -188,6 +178,7 @@ fn run_passthrough( global_flags: &[String], subcmd: &str, args: &[String], + show_stats: bool, ) -> anyhow::Result { let mut full_args: Vec = global_flags.to_vec(); full_args.push(subcmd.to_string()); @@ -202,14 +193,40 @@ fn run_passthrough( eprint!("{}", output.stderr); } - Ok(exit_code_to_process(output.exit_code)) + if show_stats { + // Passthrough: raw == compressed (no savings) + let raw = &output.stdout; + let (orig, comp) = crate::process::count_token_pair(raw, raw); + crate::process::report_token_stats(orig, comp, ""); + } + + // Record analytics (fire-and-forget, non-blocking). + // Passthrough: raw == compressed (no transformation applied). 
+ // Guard behind is_analytics_enabled() to avoid cloning large git output + // (100 KB+) when analytics are disabled. + if crate::analytics::is_analytics_enabled() { + crate::analytics::try_record_command( + output.stdout.clone(), + output.stdout, + format!("skim git {} {}", subcmd, args.join(" ")), + crate::analytics::CommandType::Git, + output.duration, + None, + ); + } + + Ok(map_exit_code(output.exit_code)) } /// Run a git command and parse its output with the given parser function. /// /// Callers are responsible for baking global flags into `subcmd_args` before /// calling this function. -fn run_parsed_command(subcmd_args: &[String], parser: F) -> anyhow::Result +fn run_parsed_command( + subcmd_args: &[String], + show_stats: bool, + parser: F, +) -> anyhow::Result where F: FnOnce(&str) -> GitResult, { @@ -225,11 +242,30 @@ where if !output.stdout.is_empty() { print!("{}", output.stdout); } - return Ok(exit_code_to_process(output.exit_code)); + return Ok(map_exit_code(output.exit_code)); } let result = parser(&output.stdout); - println!("{result}"); + let result_str = result.to_string(); + println!("{result_str}"); + + if show_stats { + let (orig, comp) = crate::process::count_token_pair(&output.stdout, &result_str); + crate::process::report_token_stats(orig, comp, ""); + } + + // Record analytics (fire-and-forget, non-blocking). + // Guard to avoid allocations when analytics are disabled. + if crate::analytics::is_analytics_enabled() { + crate::analytics::try_record_command( + output.stdout, + result_str, + format!("skim git {}", subcmd_args.join(" ")), + crate::analytics::CommandType::Git, + output.duration, + None, + ); + } Ok(ExitCode::SUCCESS) } @@ -242,9 +278,13 @@ where /// /// Flag-aware passthrough: if user has `--porcelain`, `--short`, or `-s`, /// output is already compact — pass through unmodified. 
-fn run_status(global_flags: &[String], args: &[String]) -> anyhow::Result { +fn run_status( + global_flags: &[String], + args: &[String], + show_stats: bool, +) -> anyhow::Result { if user_has_flag(args, &["--porcelain", "--short", "-s"]) { - return run_passthrough(global_flags, "status", args); + return run_passthrough(global_flags, "status", args, show_stats); } let mut full_args: Vec = global_flags.to_vec(); @@ -255,7 +295,7 @@ fn run_status(global_flags: &[String], args: &[String]) -> anyhow::Result &'static str { /// /// Flag-aware passthrough: if user has `--stat`, `--name-only`, or /// `--name-status`, output is already compact — pass through unmodified. -fn run_diff(global_flags: &[String], args: &[String]) -> anyhow::Result { +fn run_diff( + global_flags: &[String], + args: &[String], + show_stats: bool, +) -> anyhow::Result { if user_has_flag(args, &["--stat", "--name-only", "--name-status", "--check"]) { - return run_passthrough(global_flags, "diff", args); + return run_passthrough(global_flags, "diff", args, show_stats); } let mut full_args: Vec = global_flags.to_vec(); full_args.extend(["diff".to_string(), "--stat".to_string()]); full_args.extend_from_slice(args); - run_parsed_command(&full_args, parse_diff_stat) + run_parsed_command(&full_args, show_stats, parse_diff_stat) } /// Parse `git diff --stat` output into a compressed GitResult. @@ -499,9 +543,9 @@ fn parse_diff_stat(output: &str) -> GitResult { /// /// Flag-aware passthrough: if user has `--format`, `--pretty`, or `--oneline`, /// output is already compact — pass through unmodified. 
-fn run_log(global_flags: &[String], args: &[String]) -> anyhow::Result { +fn run_log(global_flags: &[String], args: &[String], show_stats: bool) -> anyhow::Result { if user_has_flag(args, &["--format", "--pretty", "--oneline"]) { - return run_passthrough(global_flags, "log", args); + return run_passthrough(global_flags, "log", args, show_stats); } let mut full_args: Vec = global_flags.to_vec(); @@ -513,7 +557,7 @@ fn run_log(global_flags: &[String], args: &[String]) -> anyhow::Result full_args.extend_from_slice(args); - run_parsed_command(&full_args, parse_log) + run_parsed_command(&full_args, show_stats, parse_log) } /// Parse formatted `git log` output into a compressed GitResult. @@ -763,7 +807,7 @@ mod tests { } // ======================================================================== - // user_has_flag / exit_code_to_process helpers + // user_has_flag / map_exit_code helpers // ======================================================================== #[test] @@ -772,21 +816,21 @@ mod tests { } #[test] - fn test_exit_code_to_process_success() { - let code = exit_code_to_process(Some(0)); + fn test_map_exit_code_success() { + let code = map_exit_code(Some(0)); // ExitCode doesn't impl PartialEq, so compare via Debug assert_eq!(format!("{code:?}"), format!("{:?}", ExitCode::SUCCESS)); } #[test] - fn test_exit_code_to_process_failure() { - let code = exit_code_to_process(Some(1)); + fn test_map_exit_code_failure() { + let code = map_exit_code(Some(1)); assert_eq!(format!("{code:?}"), format!("{:?}", ExitCode::FAILURE)); } #[test] - fn test_exit_code_to_process_none() { - let code = exit_code_to_process(None); + fn test_map_exit_code_none() { + let code = map_exit_code(None); assert_eq!(format!("{code:?}"), format!("{:?}", ExitCode::FAILURE)); } diff --git a/crates/rskim/src/cmd/mod.rs b/crates/rskim/src/cmd/mod.rs index ebd095f..8c4cef2 100644 --- a/crates/rskim/src/cmd/mod.rs +++ b/crates/rskim/src/cmd/mod.rs @@ -13,6 +13,7 @@ mod init; mod learn; mod rewrite; 
mod session; +mod stats; mod test; use std::io::{self, IsTerminal, Read, Write}; @@ -33,6 +34,7 @@ pub(crate) const KNOWN_SUBCOMMANDS: &[&str] = &[ "init", "learn", "rewrite", + "stats", "test", ]; @@ -57,6 +59,21 @@ pub(crate) fn user_has_flag(args: &[String], flags: &[&str]) -> bool { }) } +/// Extract the `--show-stats` flag from args, returning filtered args and whether +/// the flag was present. +/// +/// This centralises the pattern that was previously copy-pasted across build, +/// git, and test subcommand entry points. +pub(crate) fn extract_show_stats(args: &[String]) -> (Vec, bool) { + let show_stats = args.iter().any(|a| a == "--show-stats"); + let filtered: Vec = args + .iter() + .filter(|a| a.as_str() != "--show-stats") + .cloned() + .collect(); + (filtered, show_stats) +} + /// Inject a flag before the `--` separator, or at the end if no separator exists. /// /// This ensures injected flags (like `--message-format=json`) appear in the @@ -70,6 +87,20 @@ pub(crate) fn inject_flag_before_separator(args: &mut Vec, flag: &str) { } } +/// Configuration for running an external command with parsed output. +/// +/// Groups the cross-cutting parameters for [`run_parsed_command_with_mode`] +/// to reduce its positional parameter count. +pub(crate) struct ParsedCommandConfig<'a> { + pub program: &'a str, + pub args: &'a [String], + pub env_overrides: &'a [(&'a str, &'a str)], + pub install_hint: &'a str, + pub use_stdin: bool, + pub show_stats: bool, + pub command_type: crate::analytics::CommandType, +} + /// Execute an external command, parse its output, and emit the result. /// /// Convenience wrapper that auto-detects stdin piping via `is_terminal()`. 
@@ -81,13 +112,24 @@ pub(crate) fn run_parsed_command( args: &[String], env_overrides: &[(&str, &str)], install_hint: &str, + show_stats: bool, + command_type: crate::analytics::CommandType, parse: impl FnOnce(&CommandOutput, &[String]) -> ParseResult, ) -> anyhow::Result where T: AsRef, { let use_stdin = !io::stdin().is_terminal(); - run_parsed_command_with_mode(program, args, env_overrides, install_hint, use_stdin, parse) + let config = ParsedCommandConfig { + program, + args, + env_overrides, + install_hint, + use_stdin, + show_stats, + command_type, + }; + run_parsed_command_with_mode(config, parse) } /// Execute an external command, parse its output, and emit the result. @@ -100,15 +142,11 @@ where /// 4. Emitting the parsed result to stdout /// 5. Mapping the exit code /// -/// `use_stdin` — when `true`, reads stdin instead of spawning the command. +/// `config.use_stdin` — when `true`, reads stdin instead of spawning the command. /// Callers should set this based on their own heuristics (e.g., only read /// stdin when no user args are provided AND stdin is piped). pub(crate) fn run_parsed_command_with_mode( - program: &str, - args: &[String], - env_overrides: &[(&str, &str)], - install_hint: &str, - use_stdin: bool, + config: ParsedCommandConfig<'_>, parse: impl FnOnce(&CommandOutput, &[String]) -> ParseResult, ) -> anyhow::Result where @@ -118,6 +156,16 @@ where /// runner's `MAX_OUTPUT_BYTES` limit for command output pipes. const MAX_STDIN_BYTES: u64 = 64 * 1024 * 1024; + let ParsedCommandConfig { + program, + args, + env_overrides, + install_hint, + use_stdin, + show_stats, + command_type, + } = config; + let output = if use_stdin { // Piped stdin mode: read stdin instead of executing the command. // Size-limited to prevent unbounded memory growth from runaway pipes. 
@@ -180,10 +228,31 @@ where } stdout_handle.flush()?; + // Report token stats if requested + if show_stats { + let (orig, comp) = crate::process::count_token_pair(&output.stdout, result.content()); + crate::process::report_token_stats(orig, comp, ""); + } + + // Capture exit code before moving stdout into analytics + let code = output.exit_code.unwrap_or(1); + + // Record analytics (fire-and-forget, non-blocking). + // Guard to avoid .to_string() allocation when analytics are disabled. + if crate::analytics::is_analytics_enabled() { + crate::analytics::try_record_command( + output.stdout, + result.content().to_string(), + format!("skim {program} {}", args.join(" ")), + command_type, + output.duration, + Some(result.tier_name()), + ); + } + // Map exit code: preserve full 0-255 exit code granularity from the // underlying process. This maintains documented semantics (0=success, // 1=error, 2=parse error, 3=unsupported language) for downstream consumers. - let code = output.exit_code.unwrap_or(1); Ok(ExitCode::from(code.clamp(0, 255) as u8)) } @@ -202,34 +271,17 @@ pub(crate) fn dispatch(subcommand: &str, args: &[String]) -> anyhow::Result return build::run(args), - "completions" => return completions::run(args), - "discover" => return discover::run(args), - "git" => return git::run(args), - "init" => return init::run(args), - "learn" => return learn::run(args), - "rewrite" => return rewrite::run(args), - "test" => return test::run(args), - _ => {} - } - - // Check for --help / -h in remaining args - if args.iter().any(|a| matches!(a.as_str(), "--help" | "-h")) { - println!("skim {subcommand}"); - println!(); - println!(" Status: not yet implemented"); - println!(); - println!(" This subcommand is planned for a future release."); - println!(" See: https://github.com/dean0x/skim/issues/19"); - return Ok(ExitCode::SUCCESS); + "build" => build::run(args), + "completions" => completions::run(args), + "discover" => discover::run(args), + "git" => git::run(args), + "init" 
=> init::run(args), + "learn" => learn::run(args), + "rewrite" => rewrite::run(args), + "stats" => stats::run(args), + "test" => test::run(args), + // Unreachable: is_known_subcommand guard above rejects unknown names + _ => unreachable!("unknown subcommand '{subcommand}' passed is_known_subcommand guard"), } - - eprintln!("skim {subcommand}: not yet implemented"); - eprintln!(); - eprintln!("This subcommand is planned for a future release."); - eprintln!("See: https://github.com/dean0x/skim/issues/19"); - - Ok(ExitCode::FAILURE) } diff --git a/crates/rskim/src/cmd/stats.rs b/crates/rskim/src/cmd/stats.rs new file mode 100644 index 0000000..1f7280a --- /dev/null +++ b/crates/rskim/src/cmd/stats.rs @@ -0,0 +1,562 @@ +//! Stats subcommand — token analytics dashboard (#56) +//! +//! Queries the analytics SQLite database and displays a summary of token +//! savings across all skim invocations. Supports time filtering (`--since`), +//! JSON output (`--format json`), cost estimates (`--cost`), and data clearing +//! (`--clear`). + +use std::io::{self, Write}; +use std::process::ExitCode; +use std::time::UNIX_EPOCH; + +use colored::Colorize; + +use crate::analytics::{AnalyticsDb, AnalyticsStore, PricingModel}; +use crate::cmd::session::types::parse_duration_ago; +use crate::tokens; + +// ============================================================================ +// Public entry point +// ============================================================================ + +/// Run the `skim stats` subcommand. 
+pub(crate) fn run(args: &[String]) -> anyhow::Result { + if args.iter().any(|a| matches!(a.as_str(), "--help" | "-h")) { + print_help(); + return Ok(ExitCode::SUCCESS); + } + + // Parse flags + let clear = args.iter().any(|a| a == "--clear"); + let show_cost = args.iter().any(|a| a == "--cost"); + let format = parse_value_flag(args, "--format"); + let since_str = parse_value_flag(args, "--since"); + + let db = AnalyticsDb::open_default()?; + + if clear { + return run_clear(&db); + } + + let since_ts = if let Some(s) = &since_str { + let time = parse_duration_ago(s)?; + let ts = time.duration_since(UNIX_EPOCH)?.as_secs() as i64; + Some(ts) + } else { + None + }; + + let mut stdout = io::stdout().lock(); + + if format.as_deref() == Some("json") { + return run_json(&mut stdout, &db, since_ts, show_cost); + } + + run_dashboard(&mut stdout, &db, since_ts, show_cost, since_str.as_deref()) +} + +// ============================================================================ +// Flag parsing +// ============================================================================ + +/// Parse a `--flag value` or `--flag=value` pair from args. 
+fn parse_value_flag(args: &[String], flag: &str) -> Option { + let mut iter = args.iter(); + while let Some(arg) = iter.next() { + if arg == flag { + return iter.next().cloned(); + } + if let Some(val) = arg.strip_prefix(&format!("{flag}=")) { + return Some(val.to_string()); + } + } + None +} + +// ============================================================================ +// Help +// ============================================================================ + +fn print_help() { + println!("skim stats"); + println!(); + println!(" Show token analytics dashboard."); + println!(); + println!("FLAGS:"); + println!(" --since Filter to recent data (e.g., 7d, 24h, 4w)"); + println!(" --format json Output as JSON"); + println!(" --cost Show cost savings estimates"); + println!(" --clear Delete all analytics data"); + println!(); + println!("EXAMPLES:"); + println!(" skim stats Show all-time summary"); + println!(" skim stats --since 7d Last 7 days"); + println!(" skim stats --format json Machine-readable output"); + println!(" skim stats --cost Include cost estimates"); + println!(" skim stats --clear Reset analytics data"); + println!(); + println!("ENVIRONMENT:"); + println!(" SKIM_INPUT_COST_PER_MTOK Override $/MTok for cost estimates (default: 3.0)"); + println!(" SKIM_ANALYTICS_DB Override analytics database path"); + println!( + " SKIM_DISABLE_ANALYTICS Set to 1, true, or yes to disable analytics recording" + ); +} + +// ============================================================================ +// Clear +// ============================================================================ + +fn run_clear(db: &dyn AnalyticsStore) -> anyhow::Result { + db.clear()?; + println!("Analytics data cleared."); + Ok(ExitCode::SUCCESS) +} + +// ============================================================================ +// JSON output +// ============================================================================ + +fn run_json( + w: &mut dyn Write, + db: &dyn 
AnalyticsStore, + since: Option, + show_cost: bool, +) -> anyhow::Result { + let summary = db.query_summary(since)?; + let daily = db.query_daily(since)?; + let by_command = db.query_by_command(since)?; + let by_language = db.query_by_language(since)?; + let by_mode = db.query_by_mode(since)?; + let tier_dist = db.query_tier_distribution(since)?; + + let mut root = serde_json::json!({ + "summary": summary, + "daily": daily, + "by_command": by_command, + "by_language": by_language, + "by_mode": by_mode, + "tier_distribution": tier_dist, + }); + + if show_cost { + let pricing = PricingModel::from_env_or_default(); + let cost_savings = pricing.estimate_savings(summary.tokens_saved); + root.as_object_mut().unwrap().insert( + "cost_estimate".to_string(), + serde_json::json!({ + "model": pricing.model_name, + "input_cost_per_mtok": pricing.input_cost_per_mtok, + "estimated_savings_usd": (cost_savings * 100.0).round() / 100.0, + "tokens_saved": summary.tokens_saved, + }), + ); + } + + writeln!(w, "{}", serde_json::to_string_pretty(&root)?)?; + Ok(ExitCode::SUCCESS) +} + +// ============================================================================ +// Terminal dashboard +// ============================================================================ + +fn run_dashboard( + w: &mut dyn Write, + db: &dyn AnalyticsStore, + since: Option, + show_cost: bool, + since_str: Option<&str>, +) -> anyhow::Result { + let summary = db.query_summary(since)?; + + if summary.invocations == 0 { + writeln!(w, "{}", "No analytics data found.".dimmed())?; + writeln!(w)?; + writeln!( + w, + "Run skim commands to start collecting token savings data." 
+ )?; + writeln!(w, "Example: skim src/main.rs")?; + return Ok(ExitCode::SUCCESS); + } + + // Header + let period = since_str.map_or("all time".to_string(), |s| format!("last {s}")); + writeln!(w, "{}", format!("Token Analytics ({period})").bold())?; + writeln!(w)?; + + // Summary section + writeln!(w, "{}", "Summary".bold().underline())?; + writeln!( + w, + " Invocations: {}", + tokens::format_number(summary.invocations as usize) + )?; + writeln!( + w, + " Raw tokens: {}", + tokens::format_number(summary.raw_tokens as usize) + )?; + writeln!( + w, + " Compressed: {}", + tokens::format_number(summary.compressed_tokens as usize) + )?; + writeln!( + w, + " Tokens saved: {}", + tokens::format_number(summary.tokens_saved as usize).green() + )?; + writeln!(w, " Avg reduction: {:.1}%", summary.avg_savings_pct)?; + + // Efficiency meter + let pct = summary.avg_savings_pct.clamp(0.0, 100.0); + let filled = (pct / 5.0).round() as usize; + let empty = 20_usize.saturating_sub(filled); + let bar = format!( + " [{}{}] {:.1}%", + "\u{2588}".repeat(filled).green(), + "\u{2591}".repeat(empty), + pct + ); + writeln!(w, "{bar}")?; + writeln!(w)?; + + // By command type + let by_command = db.query_by_command(since)?; + if !by_command.is_empty() { + writeln!(w, "{}", "By Command".bold().underline())?; + for cmd in &by_command { + writeln!( + w, + " {:<8} {:>6} invocations, {} tokens saved ({:.1}%)", + cmd.command_type, + tokens::format_number(cmd.invocations as usize), + tokens::format_number(cmd.tokens_saved as usize), + cmd.avg_savings_pct, + )?; + } + writeln!(w)?; + } + + // By language + let by_language = db.query_by_language(since)?; + if !by_language.is_empty() { + writeln!(w, "{}", "By Language".bold().underline())?; + for lang in &by_language { + writeln!( + w, + " {:<12} {:>6} files, {} tokens saved ({:.1}%)", + lang.language, + tokens::format_number(lang.files as usize), + tokens::format_number(lang.tokens_saved as usize), + lang.avg_savings_pct, + )?; + } + writeln!(w)?; + 
} + + // By mode + let by_mode = db.query_by_mode(since)?; + if !by_mode.is_empty() { + writeln!(w, "{}", "By Mode".bold().underline())?; + for mode in &by_mode { + writeln!( + w, + " {:<12} {:>6} files, {} tokens saved ({:.1}%)", + mode.mode, + tokens::format_number(mode.files as usize), + tokens::format_number(mode.tokens_saved as usize), + mode.avg_savings_pct, + )?; + } + writeln!(w)?; + } + + // Parse tier distribution + let tier = db.query_tier_distribution(since)?; + if tier.full_pct > 0.0 || tier.degraded_pct > 0.0 || tier.passthrough_pct > 0.0 { + writeln!(w, "{}", "Parse Quality".bold().underline())?; + writeln!(w, " Full: {:.1}%", tier.full_pct)?; + writeln!(w, " Degraded: {:.1}%", tier.degraded_pct)?; + writeln!(w, " Passthrough: {:.1}%", tier.passthrough_pct)?; + writeln!(w)?; + } + + // Cost estimates + if show_cost { + let pricing = PricingModel::from_env_or_default(); + let cost_savings = pricing.estimate_savings(summary.tokens_saved); + writeln!(w, "{}", "Cost Estimates".bold().underline())?; + writeln!(w, " Model: {}", pricing.model_name)?; + writeln!( + w, + " Input cost: ${:.2}/MTok", + pricing.input_cost_per_mtok + )?; + writeln!( + w, + " Estimated savings: {}", + format!("${:.2}", cost_savings).green() + )?; + writeln!(w)?; + } + + Ok(ExitCode::SUCCESS) +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use crate::analytics::*; + + /// In-memory mock store for testing dashboard rendering without a real DB. 
+ struct MockStore { + summary: AnalyticsSummary, + daily: Vec, + by_command: Vec, + by_language: Vec, + by_mode: Vec, + tier_dist: TierDistribution, + } + + impl MockStore { + fn empty() -> Self { + Self { + summary: AnalyticsSummary { + invocations: 0, + raw_tokens: 0, + compressed_tokens: 0, + tokens_saved: 0, + avg_savings_pct: 0.0, + }, + daily: vec![], + by_command: vec![], + by_language: vec![], + by_mode: vec![], + tier_dist: TierDistribution { + full_pct: 0.0, + degraded_pct: 0.0, + passthrough_pct: 0.0, + }, + } + } + + fn with_data() -> Self { + Self { + summary: AnalyticsSummary { + invocations: 42, + raw_tokens: 100_000, + compressed_tokens: 30_000, + tokens_saved: 70_000, + avg_savings_pct: 70.0, + }, + daily: vec![DailyStats { + date: "2026-03-24".to_string(), + invocations: 42, + tokens_saved: 70_000, + avg_savings_pct: 70.0, + }], + by_command: vec![CommandStats { + command_type: "file".to_string(), + invocations: 30, + tokens_saved: 50_000, + avg_savings_pct: 72.0, + }], + by_language: vec![LanguageStats { + language: "rust".to_string(), + files: 25, + tokens_saved: 40_000, + avg_savings_pct: 75.0, + }], + by_mode: vec![ModeStats { + mode: "structure".to_string(), + files: 20, + tokens_saved: 35_000, + avg_savings_pct: 78.0, + }], + tier_dist: TierDistribution { + full_pct: 90.0, + degraded_pct: 8.0, + passthrough_pct: 2.0, + }, + } + } + } + + impl AnalyticsStore for MockStore { + fn query_summary(&self, _since: Option) -> anyhow::Result { + Ok(self.summary.clone()) + } + fn query_daily(&self, _since: Option) -> anyhow::Result> { + Ok(self.daily.clone()) + } + fn query_by_command(&self, _since: Option) -> anyhow::Result> { + Ok(self.by_command.clone()) + } + fn query_by_language(&self, _since: Option) -> anyhow::Result> { + Ok(self.by_language.clone()) + } + fn query_by_mode(&self, _since: Option) -> anyhow::Result> { + Ok(self.by_mode.clone()) + } + fn query_tier_distribution(&self, _since: Option) -> anyhow::Result { + 
Ok(self.tier_dist.clone()) + } + fn clear(&self) -> anyhow::Result<()> { + Ok(()) + } + } + + /// Helper: run a rendering function and return the captured output as a String. + fn capture(f: F) -> String + where + F: FnOnce(&mut Vec) -> anyhow::Result, + { + let mut buf = Vec::new(); + let code = f(&mut buf).expect("render function should succeed"); + assert_eq!(format!("{code:?}"), format!("{:?}", ExitCode::SUCCESS)); + String::from_utf8(buf).expect("output should be valid UTF-8") + } + + #[test] + fn test_run_json_empty_store() { + let store = MockStore::empty(); + let output = capture(|w| run_json(w, &store, None, false)); + let parsed: serde_json::Value = + serde_json::from_str(&output).expect("output should be valid JSON"); + let summary = &parsed["summary"]; + assert_eq!(summary["invocations"], 0); + assert_eq!(summary["tokens_saved"], 0); + } + + #[test] + fn test_run_json_with_data() { + let store = MockStore::with_data(); + let output = capture(|w| run_json(w, &store, None, false)); + let parsed: serde_json::Value = + serde_json::from_str(&output).expect("output should be valid JSON"); + let summary = &parsed["summary"]; + assert_eq!(summary["invocations"], 42); + assert_eq!(summary["tokens_saved"], 70_000); + assert_eq!(summary["avg_savings_pct"], 70.0); + // Verify breakdowns are present + assert!(parsed["by_command"].as_array().unwrap().len() == 1); + assert!(parsed["by_language"].as_array().unwrap().len() == 1); + assert!(parsed["by_mode"].as_array().unwrap().len() == 1); + // No cost_estimate when show_cost is false + assert!(parsed.get("cost_estimate").is_none()); + } + + #[test] + fn test_run_json_with_cost() { + let store = MockStore::with_data(); + let output = capture(|w| run_json(w, &store, None, true)); + let parsed: serde_json::Value = + serde_json::from_str(&output).expect("output should be valid JSON"); + let cost = &parsed["cost_estimate"]; + assert!( + cost.is_object(), + "cost_estimate should be present when show_cost=true" + ); + assert_eq!(cost["tokens_saved"], 
70_000); + assert!(cost["estimated_savings_usd"].as_f64().unwrap() > 0.0); + } + + #[test] + fn test_run_dashboard_empty_store() { + let store = MockStore::empty(); + let output = capture(|w| run_dashboard(w, &store, None, false, None)); + assert!( + output.contains("No analytics data found"), + "empty dashboard should show empty message" + ); + } + + #[test] + fn test_run_dashboard_with_data() { + let store = MockStore::with_data(); + let output = capture(|w| run_dashboard(w, &store, None, false, None)); + assert!( + output.contains("42"), + "dashboard should show invocation count" + ); + assert!( + output.contains("70,000"), + "dashboard should show tokens saved" + ); + assert!( + output.contains("70.0%"), + "dashboard should show avg reduction" + ); + assert!( + output.contains("all time"), + "dashboard should show period label" + ); + assert!( + output.contains("rust"), + "dashboard should show language breakdown" + ); + assert!( + output.contains("structure"), + "dashboard should show mode breakdown" + ); + } + + #[test] + fn test_run_dashboard_with_cost() { + let store = MockStore::with_data(); + let output = capture(|w| run_dashboard(w, &store, None, true, None)); + assert!( + output.contains("Cost Estimates"), + "dashboard should show cost section" + ); + assert!(output.contains("/MTok"), "dashboard should show cost rate"); + } + + #[test] + fn test_run_dashboard_with_since_label() { + let store = MockStore::with_data(); + let output = capture(|w| run_dashboard(w, &store, None, false, Some("7d"))); + assert!( + output.contains("last 7d"), + "dashboard should show since period" + ); + } + + #[test] + fn test_run_clear_mock() { + let store = MockStore::empty(); + let result = run_clear(&store); + assert!(result.is_ok()); + } + + #[test] + fn test_parse_value_flag_bare() { + let args: Vec = vec!["--format".into(), "json".into()]; + assert_eq!( + parse_value_flag(&args, "--format"), + Some("json".to_string()) + ); + } + + #[test] + fn 
test_parse_value_flag_equals() { + let args: Vec = vec!["--format=json".into()]; + assert_eq!( + parse_value_flag(&args, "--format"), + Some("json".to_string()) + ); + } + + #[test] + fn test_parse_value_flag_missing() { + let args: Vec = vec!["--cost".into()]; + assert_eq!(parse_value_flag(&args, "--format"), None); + } +} diff --git a/crates/rskim/src/cmd/test/cargo.rs b/crates/rskim/src/cmd/test/cargo.rs index bde6e60..0573df5 100644 --- a/crates/rskim/src/cmd/test/cargo.rs +++ b/crates/rskim/src/cmd/test/cargo.rs @@ -21,7 +21,9 @@ use std::sync::LazyLock; use regex::Regex; -use crate::cmd::{inject_flag_before_separator, run_parsed_command_with_mode, user_has_flag}; +use crate::cmd::{ + inject_flag_before_separator, run_parsed_command_with_mode, user_has_flag, ParsedCommandConfig, +}; use crate::output::canonical::{TestEntry, TestOutcome, TestResult, TestSummary}; use crate::output::ParseResult; use crate::runner::CommandOutput; @@ -40,7 +42,7 @@ static RE_CARGO_SUMMARY: LazyLock = LazyLock::new(|| { /// three-tier degradation. For nextest, skips JSON injection entirely. /// For standard cargo test, injects `--message-format=json` to get build /// artifact JSON on stdout (test results still come as plain text). 
-pub(crate) fn run(args: &[String]) -> anyhow::Result { +pub(crate) fn run(args: &[String], show_stats: bool) -> anyhow::Result { let is_nextest = args.iter().any(|a| a == "nextest"); // Build command args: start with "test", append all user args @@ -63,11 +65,15 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { let use_stdin = !std::io::stdin().is_terminal() && args.is_empty(); run_parsed_command_with_mode( - "cargo", - &cmd_args, - &[("CARGO_TERM_COLOR", "never")], - "Install Rust via https://rustup.rs", - use_stdin, + ParsedCommandConfig { + program: "cargo", + args: &cmd_args, + env_overrides: &[("CARGO_TERM_COLOR", "never")], + install_hint: "Install Rust via https://rustup.rs", + use_stdin, + show_stats, + command_type: crate::analytics::CommandType::Test, + }, move |output, _args| parse_impl(output, is_nextest), ) } diff --git a/crates/rskim/src/cmd/test/go.rs b/crates/rskim/src/cmd/test/go.rs index f6d2f14..f98def0 100644 --- a/crates/rskim/src/cmd/test/go.rs +++ b/crates/rskim/src/cmd/test/go.rs @@ -23,12 +23,15 @@ use crate::runner::CommandRunner; /// Injects `-json` if the user hasn't already set `-json` or `-v`, /// then runs the command through [`CommandRunner`] and parses output /// via three-tier degradation. -pub(crate) fn run(args: &[String]) -> anyhow::Result { +pub(crate) fn run(args: &[String], show_stats: bool) -> anyhow::Result { let mut go_args: Vec = vec!["test".to_string()]; // Inject -json before any `--` separator, unless the user already specified // -json or -v (verbose mode produces non-JSON output). - if !user_has_flag(args, "-json") && !user_has_flag(args, "-v") { + // + // Go flags use `-flag=false` to explicitly disable a flag, so we use + // go-specific detection that treats `-v=false` as NOT having `-v`. + if !go_has_flag(args, "-json") && !go_has_flag(args, "-v") { // Find the position of `--` if present, and inject -json before it. 
if let Some(sep_pos) = args.iter().position(|a| a == "--") { go_args.extend_from_slice(&args[..sep_pos]); @@ -64,44 +67,64 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { let parsed = parse(&combined); // Emit the result - match &parsed { + let exit_code = match &parsed { ParseResult::Full(result) | ParseResult::Degraded(result, _) => { println!("{result}"); // Emit degradation markers to stderr let mut stderr = std::io::stderr().lock(); let _ = parsed.emit_markers(&mut stderr); - let exit_code = if result.summary.fail > 0 { + if result.summary.fail > 0 { ExitCode::FAILURE } else { ExitCode::SUCCESS - }; - Ok(exit_code) + } } ParseResult::Passthrough(raw) => { println!("{raw}"); let mut stderr = std::io::stderr().lock(); let _ = parsed.emit_markers(&mut stderr); // Mirror the original process exit code - let exit_code = match output.exit_code { + match output.exit_code { Some(0) => ExitCode::SUCCESS, _ => ExitCode::FAILURE, - }; - Ok(exit_code) + } } + }; + + if show_stats { + let (orig, comp) = crate::process::count_token_pair(&combined, parsed.content()); + crate::process::report_token_stats(orig, comp, ""); + } + + // Record analytics (fire-and-forget, non-blocking). + // Guard to avoid .to_string() allocation when analytics are disabled. + if crate::analytics::is_analytics_enabled() { + crate::analytics::try_record_command( + combined, + parsed.content().to_string(), + format!("skim test go {}", args.join(" ")), + crate::analytics::CommandType::Test, + output.duration, + Some(parsed.tier_name()), + ); } + + Ok(exit_code) } // ============================================================================ // Flag detection // ============================================================================ -/// Check whether the user has specified a flag (exact match or prefix with `=`). +/// Go-specific flag detection that respects `-flag=false` semantics. /// -/// Handles both `-json` and `-json=true` as well as `-v` and `-v=true`. 
-/// Treats `-flag=false` as the flag NOT being set, since the user is -/// explicitly disabling it. -fn user_has_flag(args: &[String], flag: &str) -> bool { +/// Unlike the shared `crate::cmd::user_has_flag`, Go CLI flags use +/// `-flag=false` to explicitly disable a boolean flag. This function +/// treats `-v=false` as the flag NOT being set, which is required for +/// correct `-json` injection logic. The shared version does not handle +/// this because `=false` is not a convention outside Go tooling. +fn go_has_flag(args: &[String], flag: &str) -> bool { args.iter().any(|a| { if a == flag { return true; @@ -584,9 +607,9 @@ mod tests { #[test] fn test_flag_injection_skipped_with_v() { let args = vec!["-v".to_string(), "./...".to_string()]; - assert!(user_has_flag(&args, "-v"), "expected -v to be detected"); + assert!(go_has_flag(&args, "-v"), "expected -v to be detected"); assert!( - !user_has_flag(&args, "-json"), + !go_has_flag(&args, "-json"), "expected -json to NOT be detected" ); } @@ -594,21 +617,15 @@ mod tests { #[test] fn test_flag_injection_skipped_with_json() { let args = vec!["-json".to_string(), "./...".to_string()]; - assert!( - user_has_flag(&args, "-json"), - "expected -json to be detected" - ); - assert!( - !user_has_flag(&args, "-v"), - "expected -v to NOT be detected" - ); + assert!(go_has_flag(&args, "-json"), "expected -json to be detected"); + assert!(!go_has_flag(&args, "-v"), "expected -v to NOT be detected"); } #[test] fn test_user_has_flag_with_equals() { let args = vec!["-json=true".to_string()]; assert!( - user_has_flag(&args, "-json"), + go_has_flag(&args, "-json"), "expected -json=true to match -json" ); } @@ -621,13 +638,10 @@ mod tests { "TestFoo".to_string(), ]; assert!( - !user_has_flag(&args, "-json"), + !go_has_flag(&args, "-json"), "expected -json to NOT be detected" ); - assert!( - !user_has_flag(&args, "-v"), - "expected -v to NOT be detected" - ); + assert!(!go_has_flag(&args, "-v"), "expected -v to NOT be detected"); } // 
======================================================================== @@ -747,7 +761,7 @@ mod tests { ]; let mut go_args: Vec = vec!["test".to_string()]; - if !user_has_flag(&args, "-json") && !user_has_flag(&args, "-v") { + if !go_has_flag(&args, "-json") && !go_has_flag(&args, "-v") { if let Some(sep_pos) = args.iter().position(|a| a == "--") { go_args.extend_from_slice(&args[..sep_pos]); go_args.push("-json".to_string()); @@ -774,7 +788,7 @@ mod tests { // `-v=false` explicitly disables verbose mode, so -json should be injected. let args = vec!["-v=false".to_string(), "./...".to_string()]; assert!( - !user_has_flag(&args, "-v"), + !go_has_flag(&args, "-v"), "expected -v=false to NOT be detected as -v" ); } @@ -784,7 +798,7 @@ mod tests { // `-v=true` enables verbose mode, so -json should NOT be injected. let args = vec!["-v=true".to_string(), "./...".to_string()]; assert!( - user_has_flag(&args, "-v"), + go_has_flag(&args, "-v"), "expected -v=true to be detected as -v" ); } diff --git a/crates/rskim/src/cmd/test/mod.rs b/crates/rskim/src/cmd/test/mod.rs index 7e90d02..6dc7558 100644 --- a/crates/rskim/src/cmd/test/mod.rs +++ b/crates/rskim/src/cmd/test/mod.rs @@ -23,14 +23,20 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { return Ok(ExitCode::SUCCESS); } - let runner = args[0].as_str(); - let runner_args = &args[1..]; + let (filtered_args, show_stats) = crate::cmd::extract_show_stats(args); + + let Some((runner_name, runner_args)) = filtered_args.split_first() else { + print_help(); + return Ok(ExitCode::SUCCESS); + }; + + let runner = runner_name.as_str(); match runner { - "cargo" => cargo::run(runner_args), - "go" => go::run(runner_args), - "vitest" | "jest" => vitest::run(runner, runner_args), - "pytest" => pytest::run(runner_args), + "cargo" => cargo::run(runner_args, show_stats), + "go" => go::run(runner_args, show_stats), + "vitest" | "jest" => vitest::run(runner, runner_args, show_stats), + "pytest" => pytest::run(runner_args, show_stats), _ 
=> { eprintln!( "skim test: unknown runner '{runner}'\n\ diff --git a/crates/rskim/src/cmd/test/pytest.rs b/crates/rskim/src/cmd/test/pytest.rs index 1603ef7..4d208d2 100644 --- a/crates/rskim/src/cmd/test/pytest.rs +++ b/crates/rskim/src/cmd/test/pytest.rs @@ -25,6 +25,7 @@ use std::time::Duration; use regex::Regex; +use crate::cmd::user_has_flag; use crate::output::canonical::{TestEntry, TestOutcome, TestResult, TestSummary}; use crate::output::{strip_ansi, ParseResult}; use crate::runner::{CommandOutput, CommandRunner}; @@ -40,7 +41,7 @@ use crate::runner::{CommandOutput, CommandRunner}; /// - If stdin is not a terminal → attempt to read stdin; if empty, fall back /// to running pytest (handles test harness environments where stdin is a /// pipe with no data) -pub(crate) fn run(args: &[String]) -> anyhow::Result { +pub(crate) fn run(args: &[String], show_stats: bool) -> anyhow::Result { // Intercept --help/-h: show skim's pytest help, then forward to real pytest // so the user sees both skim's flags and pytest's own options. if args.iter().any(|a| matches!(a.as_str(), "--help" | "-h")) { @@ -73,6 +74,24 @@ pub(crate) fn run(args: &[String]) -> anyhow::Result { emit_result(&result, &output)?; + if show_stats { + let (orig, comp) = crate::process::count_token_pair(&cleaned, result.content()); + crate::process::report_token_stats(orig, comp, ""); + } + + // Record analytics (fire-and-forget, non-blocking). + // Guard to avoid .to_string() allocation when analytics are disabled. 
+ if crate::analytics::is_analytics_enabled() { + crate::analytics::try_record_command( + cleaned, + result.content().to_string(), + format!("skim test pytest {}", args.join(" ")), + crate::analytics::CommandType::Test, + output.duration, + Some(result.tier_name()), + ); + } + // Exit code: mirror pytest's exit code if we ran it, or infer from parse let code = match output.exit_code { Some(0) => ExitCode::SUCCESS, @@ -139,17 +158,6 @@ fn build_args(user_args: &[String]) -> Vec { args } -/// Check if any of the given flag prefixes appear in the user's args. -/// -/// Matches both `--flag` and `--flag=value` forms. -fn user_has_flag(args: &[String], prefixes: &[&str]) -> bool { - args.iter().any(|arg| { - prefixes - .iter() - .any(|prefix| arg == prefix || arg.starts_with(&format!("{prefix}="))) - }) -} - // ============================================================================ // Command execution // ============================================================================ @@ -373,7 +381,10 @@ fn tier1_parse(output: &str) -> Option { // Inside "short test summary info": parse FAILED/ERROR lines if in_summary_info { - if let Some(rest) = trimmed.strip_prefix("FAILED ") { + let rest = trimmed + .strip_prefix("FAILED ") + .or_else(|| trimmed.strip_prefix("ERROR ")); + if let Some(rest) = rest { // Format: "FAILED tests/test_b.py::test_two - assert 1 == 2" let (name, detail) = if let Some(dash_pos) = rest.find(" - ") { ( @@ -388,20 +399,6 @@ fn tier1_parse(output: &str) -> Option { outcome: TestOutcome::Fail, detail, }); - } else if let Some(rest) = trimmed.strip_prefix("ERROR ") { - let (name, detail) = if let Some(dash_pos) = rest.find(" - ") { - ( - rest[..dash_pos].to_string(), - Some(rest[dash_pos + 3..].to_string()), - ) - } else { - (rest.to_string(), None) - }; - entries.push(TestEntry { - name, - outcome: TestOutcome::Fail, - detail, - }); } continue; } diff --git a/crates/rskim/src/cmd/test/vitest.rs b/crates/rskim/src/cmd/test/vitest.rs index 
9652b3e..deabf2a 100644 --- a/crates/rskim/src/cmd/test/vitest.rs +++ b/crates/rskim/src/cmd/test/vitest.rs @@ -15,6 +15,7 @@ use std::sync::LazyLock; use regex::Regex; +use crate::cmd::user_has_flag; use crate::output::canonical::{TestEntry, TestOutcome, TestResult, TestSummary}; use crate::output::ParseResult; use crate::runner::CommandRunner; @@ -27,7 +28,8 @@ use crate::runner::CommandRunner; /// /// `program` is the runner binary name (e.g. `"vitest"` or `"jest"`), used when /// stdin is not piped and we need to spawn the process directly. -pub(crate) fn run(program: &str, args: &[String]) -> anyhow::Result { +pub(crate) fn run(program: &str, args: &[String], show_stats: bool) -> anyhow::Result { + let start = std::time::Instant::now(); let raw_output = if stdin_has_data() { read_stdin()? } else { @@ -37,7 +39,7 @@ pub(crate) fn run(program: &str, args: &[String]) -> anyhow::Result { let result = parse(&raw_output); // Emit the result to stdout - match &result { + let exit_code = match &result { ParseResult::Full(test_result) | ParseResult::Degraded(test_result, _) => { println!("{test_result}"); // Emit degradation markers to stderr @@ -45,21 +47,40 @@ pub(crate) fn run(program: &str, args: &[String]) -> anyhow::Result { let mut handle = stderr.lock(); let _ = result.emit_markers(&mut handle); - let exit_code = if test_result.summary.fail > 0 { + if test_result.summary.fail > 0 { ExitCode::FAILURE } else { ExitCode::SUCCESS - }; - Ok(exit_code) + } } ParseResult::Passthrough(raw) => { println!("{raw}"); let stderr = io::stderr(); let mut handle = stderr.lock(); let _ = result.emit_markers(&mut handle); - Ok(ExitCode::FAILURE) + ExitCode::FAILURE } + }; + + if show_stats { + let (orig, comp) = crate::process::count_token_pair(&raw_output, result.content()); + crate::process::report_token_stats(orig, comp, ""); + } + + // Record analytics (fire-and-forget, non-blocking). + // Guard to avoid .to_string() allocation when analytics are disabled. 
+ if crate::analytics::is_analytics_enabled() { + crate::analytics::try_record_command( + raw_output, + result.content().to_string(), + format!("skim test {program} {}", args.join(" ")), + crate::analytics::CommandType::Test, + start.elapsed(), + Some(result.tier_name()), + ); } + + Ok(exit_code) } // ============================================================================ @@ -107,10 +128,10 @@ fn run_vitest(program: &str, args: &[String]) -> anyhow::Result { let mut final_args: Vec = args.to_vec(); if program == "jest" { - if !user_has_flag(args, "--json") { + if !user_has_flag(args, &["--json"]) { final_args.push("--json".to_string()); } - } else if !user_has_flag(args, "--reporter") { + } else if !user_has_flag(args, &["--reporter"]) { final_args.push("--reporter=json".to_string()); } @@ -137,14 +158,6 @@ fn run_vitest(program: &str, args: &[String]) -> anyhow::Result { Ok(combined) } -/// Check if the user has already specified a flag (with or without `=` value). -/// -/// Matches both `--reporter=verbose` and `--reporter verbose` forms. 
-fn user_has_flag(args: &[String], flag: &str) -> bool { - args.iter() - .any(|a| a == flag || a.starts_with(&format!("{flag}="))) -} - // ============================================================================ // Three-tier parser // ============================================================================ @@ -722,7 +735,7 @@ Duration: 1.5s"; "math".to_string(), ]; assert!( - user_has_flag(&args, "--reporter"), + user_has_flag(&args, &["--reporter"]), "should detect --reporter=verbose" ); } @@ -731,7 +744,7 @@ Duration: 1.5s"; fn test_flag_injection_skipped_bare_flag() { let args = vec!["--reporter".to_string(), "json".to_string()]; assert!( - user_has_flag(&args, "--reporter"), + user_has_flag(&args, &["--reporter"]), "should detect bare --reporter" ); } @@ -740,7 +753,7 @@ Duration: 1.5s"; fn test_flag_injection_needed_when_no_reporter() { let args = vec!["--run".to_string(), "math".to_string()]; assert!( - !user_has_flag(&args, "--reporter"), + !user_has_flag(&args, &["--reporter"]), "should not detect --reporter when absent" ); } diff --git a/crates/rskim/src/main.rs b/crates/rskim/src/main.rs index 5c5adda..8cc263c 100644 --- a/crates/rskim/src/main.rs +++ b/crates/rskim/src/main.rs @@ -9,6 +9,7 @@ //! - Multi-file glob pattern matching //! - File-based caching with mtime invalidation +mod analytics; mod cache; mod cascade; mod cmd; @@ -58,6 +59,7 @@ fn is_flag_with_value(flag: &str) -> bool { | "--since" | "--session" | "--agent" + | "--format" ) } @@ -193,6 +195,7 @@ SUBCOMMANDS:\n \ init Initialize skim configuration\n \ learn Detect CLI error patterns and generate correction rules\n \ rewrite [--suggest] ... 
Rewrite commands into skim equivalents\n \ + stats [--since N] [--format json] Show token analytics dashboard\n \ test Run test with output parsing\n\n\ For more info: https://github.com/dean0x/skim")] struct Args { @@ -288,6 +291,10 @@ struct Args { help = "Cascade through modes until output fits within N tokens" )] tokens: Option, + + /// Disable analytics recording for this invocation + #[arg(long, help = "Disable analytics recording")] + disable_analytics: bool, } /// Build the clap `Command` from `Args` for use by shell completion generation. @@ -443,6 +450,13 @@ fn validate_args(args: &Args) -> anyhow::Result<()> { } fn main() -> ExitCode { + // Extract --disable-analytics from raw args before routing, so it + // applies to both file operations and subcommands. Uses an AtomicBool + // instead of env::set_var to avoid unsoundness in multi-threaded context. + if std::env::args().any(|a| a == "--disable-analytics") { + analytics::force_disable_analytics(); + } + let result: anyhow::Result = match resolve_invocation() { Invocation::FileOperation => run_file_operation().map(|()| ExitCode::SUCCESS), Invocation::Subcommand { name, args } => cmd::dispatch(&name, &args), @@ -465,6 +479,9 @@ fn run_file_operation() -> anyhow::Result<()> { let args = Args::parse(); validate_args(&args)?; + // --disable-analytics is handled in main() before routing via + // analytics::force_disable_analytics(). No env var mutation needed. 
+ if args.clear_cache { cache::clear_cache()?; println!("Cache cleared successfully"); @@ -473,6 +490,7 @@ fn run_file_operation() -> anyhow::Result<()> { let file = args .file + .clone() .ok_or_else(|| anyhow::anyhow!("FILE argument is required"))?; let process_options = process::ProcessOptions { @@ -487,9 +505,15 @@ fn run_file_operation() -> anyhow::Result<()> { }, }; + let disable_analytics = args.disable_analytics; + if file == "-" { let result = process::process_stdin(process_options, args.filename.as_deref())?; - return process::write_result_and_stats(&result, args.show_stats); + process::write_result_and_stats(&result, args.show_stats)?; + if !disable_analytics { + record_file_analytics(&result, "skim -", &args); + } + return Ok(()); } let path = PathBuf::from(&file); @@ -499,6 +523,7 @@ fn run_file_operation() -> anyhow::Result<()> { no_header: args.no_header, jobs: args.jobs, no_ignore: args.no_ignore, + disable_analytics, }; if path.is_dir() { @@ -510,7 +535,41 @@ fn run_file_operation() -> anyhow::Result<()> { } let result = process::process_file(&path, process_options)?; - process::write_result_and_stats(&result, args.show_stats) + process::write_result_and_stats(&result, args.show_stats)?; + if !disable_analytics { + record_file_analytics(&result, &format!("skim {file}"), &args); + } + Ok(()) +} + +/// Record token analytics for file operations (single file or stdin). 
+fn record_file_analytics(result: &process::ProcessResult, cmd: &str, args: &Args) { + if !analytics::is_analytics_enabled() { + return; + } + if let (Some(raw), Some(comp)) = (result.original_tokens, result.transformed_tokens) { + let cwd = std::env::current_dir() + .unwrap_or_default() + .display() + .to_string(); + let lang = args + .language + .map(|l| format!("{:?}", Language::from(l)).to_lowercase()); + let mode = format!("{:?}", Mode::from(args.mode)).to_lowercase(); + analytics::record_with_counts(analytics::TokenSavingsRecord { + timestamp: analytics::now_unix_secs(), + command_type: analytics::CommandType::File, + original_cmd: cmd.to_string(), + raw_tokens: raw, + compressed_tokens: comp, + savings_pct: analytics::savings_percentage(raw, comp), + duration_ms: 0, + project_path: cwd, + mode: Some(mode), + language: lang, + parse_tier: None, + }); + } } #[cfg(test)] @@ -607,6 +666,7 @@ mod tests { "--since", "--session", "--agent", + "--format", ]; /// Ensure every value-consuming flag (non-boolean, non-positional) in `Args` @@ -634,6 +694,7 @@ mod tests { "--no-cache", "--clear-cache", "--show-stats", + "--disable-analytics", ]; for flag in boolean_flags { diff --git a/crates/rskim/src/multi.rs b/crates/rskim/src/multi.rs index c32a02c..57880f2 100644 --- a/crates/rskim/src/multi.rs +++ b/crates/rskim/src/multi.rs @@ -21,6 +21,7 @@ pub(crate) struct MultiFileOptions { pub(crate) no_header: bool, pub(crate) jobs: Option, pub(crate) no_ignore: bool, + pub(crate) disable_analytics: bool, } /// Glob metacharacters recognised by skim. 
@@ -269,6 +270,34 @@ fn process_files(paths: Vec, options: MultiFileOptions) -> anyhow::Resu ); } + // Record analytics for multi-file operations + if !options.disable_analytics + && crate::analytics::is_analytics_enabled() + && total_original_tokens > 0 + { + let cwd = std::env::current_dir() + .unwrap_or_default() + .display() + .to_string(); + let mode = format!("{:?}", options.process.mode).to_lowercase(); + crate::analytics::record_with_counts(crate::analytics::TokenSavingsRecord { + timestamp: crate::analytics::now_unix_secs(), + command_type: crate::analytics::CommandType::File, + original_cmd: format!("skim [multi: {} files]", success_count), + raw_tokens: total_original_tokens, + compressed_tokens: total_transformed_tokens, + savings_pct: crate::analytics::savings_percentage( + total_original_tokens, + total_transformed_tokens, + ), + duration_ms: 0, + project_path: cwd, + mode: Some(mode), + language: None, // mixed languages + parse_tier: None, + }); + } + Ok(()) } diff --git a/crates/rskim/src/process.rs b/crates/rskim/src/process.rs index dd82586..d1c7188 100644 --- a/crates/rskim/src/process.rs +++ b/crates/rskim/src/process.rs @@ -49,7 +49,10 @@ pub(crate) struct ProcessResult { /// Count tokens for both original and transformed text, returning `(None, None)` on failure. /// /// Centralises the paired token-counting pattern used across the processing pipeline. -fn count_token_pair(original: &str, transformed: &str) -> (Option, Option) { +pub(crate) fn count_token_pair( + original: &str, + transformed: &str, +) -> (Option, Option) { match ( tokens::count_tokens(original), tokens::count_tokens(transformed), @@ -108,9 +111,11 @@ fn try_cached_result( return Ok(None); }; - // If stats are requested but the cache entry was written without them, - // read the original file and count tokens for both source and output. 
- let needs_recount = options.show_stats && hit.original_tokens.is_none(); + // If the cache entry was written without token counts, read the original + // file and count tokens for both source and output -- but only when + // --show-stats is active. Analytics background threads handle their own + // token counting, so we don't erode cache speedup for analytics alone. + let needs_recount = hit.original_tokens.is_none() && options.show_stats; let (orig_tokens, trans_tokens) = if needs_recount { let contents = read_and_validate(path)?; count_token_pair(&contents, &hit.content) @@ -262,6 +267,8 @@ pub(crate) fn process_stdin( (transformed, false) }; + // Only pay the tiktoken BPE cost on the main thread when --show-stats + // is set. Analytics background threads compute their own token counts. let (orig_tokens, trans_tokens) = if options.show_stats { count_token_pair(&buffer, &final_output) } else { @@ -296,6 +303,8 @@ pub(crate) fn process_file(path: &Path, options: ProcessOptions) -> anyhow::Resu (result, false) }; + // Only pay the tiktoken BPE cost on the main thread when --show-stats + // is set. Analytics background threads compute their own token counts. let (orig_tokens, trans_tokens) = if options.show_stats { count_token_pair(&contents, &final_output) } else { diff --git a/crates/rskim/tests/cli_stats.rs b/crates/rskim/tests/cli_stats.rs new file mode 100644 index 0000000..a53f866 --- /dev/null +++ b/crates/rskim/tests/cli_stats.rs @@ -0,0 +1,167 @@ +//! Integration tests for `skim stats` subcommand (#56). +//! +//! All tests use `tempfile::NamedTempFile` + `SKIM_ANALYTICS_DB` env override +//! for isolation. `SKIM_DISABLE_ANALYTICS=1` prevents test invocations from +//! recording to the database. `NO_COLOR=1` prevents colored output from +//! interfering with assertions. 
+ +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::NamedTempFile; + +// ============================================================================ +// Helper: build an isolated `skim stats` command +// ============================================================================ + +/// Create a `skim stats` command with an isolated analytics database. +/// +/// Sets `SKIM_ANALYTICS_DB` to a temporary file path, `SKIM_DISABLE_ANALYTICS=1` +/// to prevent test interference, and `NO_COLOR=1` to disable colored output. +fn skim_stats_cmd(db_file: &NamedTempFile) -> Command { + let mut cmd = Command::cargo_bin("skim").unwrap(); + cmd.arg("stats") + .env("SKIM_ANALYTICS_DB", db_file.path().as_os_str()) + .env("SKIM_DISABLE_ANALYTICS", "1") + .env("NO_COLOR", "1"); + cmd +} + +// ============================================================================ +// Help +// ============================================================================ + +#[test] +fn test_stats_help() { + let db = NamedTempFile::new().unwrap(); + skim_stats_cmd(&db) + .arg("--help") + .assert() + .success() + .stdout(predicate::str::contains("stats")) + .stdout(predicate::str::contains("--since")) + .stdout(predicate::str::contains("--format")) + .stdout(predicate::str::contains("--cost")) + .stdout(predicate::str::contains("--clear")); +} + +// ============================================================================ +// Empty database — graceful message +// ============================================================================ + +#[test] +fn test_stats_empty_db() { + let db = NamedTempFile::new().unwrap(); + skim_stats_cmd(&db) + .assert() + .success() + .stdout(predicate::str::contains("No analytics data found")); +} + +// ============================================================================ +// JSON format — empty database should produce valid JSON +// ============================================================================ + +#[test] +fn 
test_stats_json_format() { + let db = NamedTempFile::new().unwrap(); + let output = skim_stats_cmd(&db) + .args(["--format", "json"]) + .output() + .unwrap(); + + assert!( + output.status.success(), + "skim stats --format json should exit 0" + ); + + let stdout = String::from_utf8(output.stdout).unwrap(); + let json: serde_json::Value = serde_json::from_str(stdout.trim()) + .unwrap_or_else(|e| panic!("Expected valid JSON, got parse error: {e}\nstdout: {stdout}")); + + // Verify expected top-level keys exist + assert!( + json.get("summary").is_some(), + "JSON should contain 'summary' key" + ); + assert!( + json.get("daily").is_some(), + "JSON should contain 'daily' key" + ); + assert!( + json.get("by_command").is_some(), + "JSON should contain 'by_command' key" + ); + assert!( + json.get("by_language").is_some(), + "JSON should contain 'by_language' key" + ); + assert!( + json.get("by_mode").is_some(), + "JSON should contain 'by_mode' key" + ); + assert!( + json.get("tier_distribution").is_some(), + "JSON should contain 'tier_distribution' key" + ); +} + +// ============================================================================ +// Clear — should succeed on empty or populated database +// ============================================================================ + +#[test] +fn test_stats_clear() { + let db = NamedTempFile::new().unwrap(); + skim_stats_cmd(&db) + .arg("--clear") + .assert() + .success() + .stdout(predicate::str::contains("Analytics data cleared")); +} + +// ============================================================================ +// Cost flag — should include cost section in JSON output +// ============================================================================ + +#[test] +fn test_stats_cost_flag() { + let db = NamedTempFile::new().unwrap(); + let output = skim_stats_cmd(&db) + .args(["--format", "json", "--cost"]) + .output() + .unwrap(); + + assert!( + output.status.success(), + "skim stats --format json --cost should exit 0" + ); 
+ + let stdout = String::from_utf8(output.stdout).unwrap(); + let json: serde_json::Value = serde_json::from_str(stdout.trim()) + .unwrap_or_else(|e| panic!("Expected valid JSON, got parse error: {e}\nstdout: {stdout}")); + + // With --cost, the JSON should include cost_estimate section + let cost = json.get("cost_estimate"); + assert!( + cost.is_some(), + "JSON should contain 'cost_estimate' key when --cost is passed" + ); + + let cost = cost.unwrap(); + assert!( + cost.get("model").is_some(), + "cost_estimate should contain 'model' key" + ); + assert!( + cost.get("input_cost_per_mtok").is_some(), + "cost_estimate should contain 'input_cost_per_mtok' key" + ); + assert!( + cost.get("estimated_savings_usd").is_some(), + "cost_estimate should contain 'estimated_savings_usd' key" + ); + assert!( + cost.get("tokens_saved").is_some(), + "cost_estimate should contain 'tokens_saved' key" + ); +}