diff --git a/.github/actions/setup-coldvox/action.yml b/.github/actions/setup-coldvox/action.yml index 3b935839..a77e846c 100644 --- a/.github/actions/setup-coldvox/action.yml +++ b/.github/actions/setup-coldvox/action.yml @@ -1,5 +1,5 @@ name: Setup ColdVox Dependencies -description: Install system deps, libvosk, and Rust toolchain +description: Install system deps and Rust toolchain inputs: skip-toolchain: description: Skip Rust toolchain setup (for jobs with custom toolchain) diff --git a/.github/workflow-job-classifier.yml b/.github/workflow-job-classifier.yml index 495f9cfe..f10ea604 100644 --- a/.github/workflow-job-classifier.yml +++ b/.github/workflow-job-classifier.yml @@ -36,7 +36,6 @@ job_classifications: time_minutes: "2-5" concurrent_limit: 2 examples: - - setup-vosk-model - gui-groundwork - security diff --git a/CHANGELOG.md b/CHANGELOG.md index eee90071..63668199 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,7 +100,7 @@ Users can still enable detailed debugging via `RUST_LOG=debug` or `RUST_LOG=trac ## v2.0.2 — 2025-09-12 Highlights -- STT Plugin Manager: Full runtime integration, failover/GC, metrics/TUI, Vosk finalization +- STT Plugin Manager: Full runtime integration, failover/GC, metrics/TUI - Tests: Added failover, GC, hot-reload coverage - Docs: Plugin README section, migration notes @@ -113,7 +113,7 @@ Details - Updated README.md with STT plugins section and migration notes Upgrade Notes -- STT configuration now uses --stt-* flags instead of VOSK_MODEL_PATH +- STT configuration now uses --stt-* flags - Plugin settings are automatically persisted to config/plugins.json - TUI now available with --tui flag (requires tui feature) diff --git a/crates/coldvox-stt/src/plugins/coqui.rs b/crates/coldvox-stt/src/plugins/coqui.rs deleted file mode 100644 index 61553056..00000000 --- a/crates/coldvox-stt/src/plugins/coqui.rs +++ /dev/null @@ -1,223 +0,0 @@ -//! Coqui STT - Community fork of Mozilla DeepSpeech -//! -//! 
Coqui STT is an open-source speech recognition engine based on -//! TensorFlow, offering good accuracy with moderate resource usage. - -use async_trait::async_trait; -use parking_lot::RwLock; -use std::path::PathBuf; -use std::sync::Arc; - -use crate::plugin::*; -use crate::plugin_types::*; -use crate::types::{TranscriptionConfig, TranscriptionEvent}; -use coldvox_foundation::error::{ColdVoxError, SttError}; - -/// Coqui STT model configuration -#[derive(Debug, Clone)] -pub struct CoquiConfig { - /// Path to the model file (.tflite or .pbmm) - pub model_path: PathBuf, - /// Path to the scorer file (optional, for better accuracy) - pub scorer_path: Option, - /// Beam width for CTC decoding - pub beam_width: u32, - /// Enable external scorer - pub use_scorer: bool, - /// Alpha weight for language model - pub lm_alpha: f32, - /// Beta weight for word insertion - pub lm_beta: f32, -} - -impl Default for CoquiConfig { - fn default() -> Self { - Self { - model_path: PathBuf::from("models/coqui/model.tflite"), - scorer_path: Some(PathBuf::from("models/coqui/scorer.scorer")), - beam_width: 500, - use_scorer: true, - lm_alpha: 0.931, - lm_beta: 1.834, - } - } -} - -/// Coqui STT Plugin (formerly Mozilla DeepSpeech) -/// -/// This is a stub for the Coqui STT engine, which provides: -/// - TensorFlow-based acoustic models -/// - CTC decoding with language model scoring -/// - Good accuracy for English and other languages -#[derive(Debug)] -#[allow(dead_code)] -pub struct CoquiPlugin { - config: CoquiConfig, - state: Arc>, - metrics: Arc>, - // Future: Add actual Coqui STT model - // model: Option, - // stream: Option, -} - -impl Default for CoquiPlugin { - fn default() -> Self { - Self::new() - } -} - -impl CoquiPlugin { - pub fn new() -> Self { - Self::with_config(CoquiConfig::default()) - } - - pub fn with_config(config: CoquiConfig) -> Self { - Self { - config, - state: Arc::new(RwLock::new(PluginState::Uninitialized)), - metrics: 
Arc::new(RwLock::new(PluginMetrics::default())), - } - } - - pub fn enhanced_info() -> EnhancedPluginInfo { - EnhancedPluginInfo { - id: "coqui".to_string(), - name: "Coqui STT".to_string(), - description: "Open-source STT engine, community fork of Mozilla DeepSpeech".to_string(), - version: "1.4.0".to_string(), - author: "Coqui AI".to_string(), - license: "MPL-2.0".to_string(), - homepage: Some("https://github.com/coqui-ai/STT".to_string()), - - accuracy_level: AccuracyLevel::High, - latency_profile: LatencyProfile { - avg_ms: 200, - p95_ms: 400, - p99_ms: 800, - rtf: 0.4, - }, - resource_profile: ResourceProfile { - peak_memory_mb: 400, - avg_cpu_percent: 35.0, - uses_gpu: false, - disk_space_mb: 200, - }, - model_size: ModelSize::Medium, - - languages: vec![ - LanguageSupport { - code: "en".to_string(), - name: "English".to_string(), - quality: LanguageQuality::Stable, - variants: vec!["en-US".to_string()], - }, - // Additional languages available with different models - ], - - requires_internet: false, - requires_gpu: false, - requires_license_key: false, - - is_beta: false, - is_deprecated: false, - source: PluginSource::BuiltIn, - - metrics: None, - } - } -} - -#[async_trait] -impl SttPlugin for CoquiPlugin { - fn info(&self) -> PluginInfo { - PluginInfo { - id: "coqui".to_string(), - name: "Coqui STT".to_string(), - description: "TensorFlow-based STT engine (not yet available)".to_string(), - requires_network: false, - is_local: true, - is_available: false, - supported_languages: vec!["en".to_string()], - memory_usage_mb: Some(200), - } - } - - fn capabilities(&self) -> PluginCapabilities { - PluginCapabilities { - streaming: true, - batch: true, - word_timestamps: false, - confidence_scores: true, - speaker_diarization: false, - auto_punctuation: false, - custom_vocabulary: true, - } - } - - async fn is_available(&self) -> Result { - Ok(false) // Not yet implemented - } - - async fn initialize(&mut self, _config: TranscriptionConfig) -> Result<(), 
ColdVoxError> { - Err(SttError::NotAvailable { - plugin: "coqui".to_string(), - reason: "Coqui STT integration not yet implemented".to_string(), - } - .into()) - } - - async fn process_audio( - &mut self, - _samples: &[i16], - ) -> Result, ColdVoxError> { - Err(SttError::NotAvailable { - plugin: "coqui".to_string(), - reason: "Coqui STT plugin not yet implemented".to_string(), - } - .into()) - } - - async fn finalize(&mut self) -> Result, ColdVoxError> { - Ok(None) - } - - async fn reset(&mut self) -> Result<(), ColdVoxError> { - Ok(()) - } -} - -pub struct CoquiPluginFactory { - config: CoquiConfig, -} - -impl Default for CoquiPluginFactory { - fn default() -> Self { - Self::new() - } -} - -impl CoquiPluginFactory { - pub fn new() -> Self { - Self { - config: CoquiConfig::default(), - } - } -} - -impl SttPluginFactory for CoquiPluginFactory { - fn create(&self) -> Result, ColdVoxError> { - Ok(Box::new(CoquiPlugin::with_config(self.config.clone()))) - } - - fn plugin_info(&self) -> PluginInfo { - CoquiPlugin::new().info() - } - - fn check_requirements(&self) -> Result<(), ColdVoxError> { - Err(SttError::NotAvailable { - plugin: "coqui".to_string(), - reason: "Coqui STT not yet integrated".to_string(), - } - .into()) - } -} diff --git a/crates/coldvox-stt/src/plugins/leopard.rs b/crates/coldvox-stt/src/plugins/leopard.rs deleted file mode 100644 index 559c5626..00000000 --- a/crates/coldvox-stt/src/plugins/leopard.rs +++ /dev/null @@ -1,233 +0,0 @@ -//! Picovoice Leopard - Commercial ultra-lightweight STT -//! -//! Leopard is Picovoice's on-device speech-to-text engine optimized for -//! resource-constrained environments with excellent accuracy. 
- -use async_trait::async_trait; -use parking_lot::RwLock; -use std::path::PathBuf; -use std::sync::Arc; - -use crate::plugin::*; -use crate::plugin_types::*; -use crate::types::{TranscriptionConfig, TranscriptionEvent}; -use coldvox_foundation::error::{ColdVoxError, SttError}; - -/// Leopard configuration -#[derive(Debug, Clone)] -pub struct LeopardConfig { - /// Picovoice access key (required for commercial use) - pub access_key: String, - /// Path to Leopard model file (.pv) - pub model_path: PathBuf, - /// Enable automatic punctuation - pub enable_punctuation: bool, - /// Enable diarization (speaker identification) - pub enable_diarization: bool, -} - -impl Default for LeopardConfig { - fn default() -> Self { - Self { - access_key: std::env::var("PICOVOICE_ACCESS_KEY").unwrap_or_default(), - model_path: PathBuf::from("models/leopard/leopard-en.pv"), - enable_punctuation: true, - enable_diarization: false, - } - } -} - -/// Picovoice Leopard STT Plugin -/// -/// Commercial ultra-lightweight STT with: -/// - ~30MB model size -/// - Excellent accuracy for English -/// - Very low latency -/// - Minimal resource usage -#[derive(Debug)] -#[allow(dead_code)] -pub struct LeopardPlugin { - config: LeopardConfig, - state: Arc>, - metrics: Arc>, - // Future: Add Leopard SDK - // leopard: Option, -} - -impl Default for LeopardPlugin { - fn default() -> Self { - Self::new() - } -} - -impl LeopardPlugin { - pub fn new() -> Self { - Self::with_config(LeopardConfig::default()) - } - - pub fn with_config(config: LeopardConfig) -> Self { - Self { - config, - state: Arc::new(RwLock::new(PluginState::Uninitialized)), - metrics: Arc::new(RwLock::new(PluginMetrics::default())), - } - } - - pub fn enhanced_info() -> EnhancedPluginInfo { - EnhancedPluginInfo { - id: "leopard".to_string(), - name: "Picovoice Leopard".to_string(), - description: "Commercial ultra-lightweight on-device STT".to_string(), - version: "2.0.0".to_string(), - author: "Picovoice".to_string(), - license: 
"Commercial".to_string(), - homepage: Some("https://picovoice.ai/platform/leopard/".to_string()), - - accuracy_level: AccuracyLevel::High, - latency_profile: LatencyProfile { - avg_ms: 40, - p95_ms: 80, - p99_ms: 150, - rtf: 0.1, // Very fast - }, - resource_profile: ResourceProfile { - peak_memory_mb: 80, - avg_cpu_percent: 8.0, - uses_gpu: false, - disk_space_mb: 30, - }, - model_size: ModelSize::Tiny, - - languages: vec![LanguageSupport { - code: "en".to_string(), - name: "English".to_string(), - quality: LanguageQuality::Premium, - variants: vec!["en-US".to_string(), "en-GB".to_string()], - }], - - requires_internet: false, - requires_gpu: false, - requires_license_key: true, - - is_beta: false, - is_deprecated: false, - source: PluginSource::BuiltIn, - - metrics: None, - } - } -} - -#[async_trait] -impl SttPlugin for LeopardPlugin { - fn info(&self) -> PluginInfo { - PluginInfo { - id: "leopard".to_string(), - name: "Picovoice Leopard".to_string(), - description: "Commercial ultra-lightweight STT (requires license)".to_string(), - requires_network: false, - is_local: true, - is_available: false, - supported_languages: vec!["en".to_string()], - memory_usage_mb: Some(30), - } - } - - fn capabilities(&self) -> PluginCapabilities { - PluginCapabilities { - streaming: false, // Leopard is file-based - batch: true, - word_timestamps: true, - confidence_scores: true, - speaker_diarization: self.config.enable_diarization, - auto_punctuation: self.config.enable_punctuation, - custom_vocabulary: false, - } - } - - async fn is_available(&self) -> Result { - // Check for access key - if self.config.access_key.is_empty() { - return Ok(false); - } - Ok(false) // Not yet implemented - } - - async fn initialize(&mut self, _config: TranscriptionConfig) -> Result<(), ColdVoxError> { - if self.config.access_key.is_empty() { - return Err(SttError::InvalidConfig( - "PICOVOICE_ACCESS_KEY required for Leopard".to_string(), - ) - .into()); - } - - Err(SttError::NotAvailable { - 
plugin: "leopard".to_string(), - reason: "Leopard SDK integration not yet implemented".to_string(), - } - .into()) - } - - async fn process_audio( - &mut self, - _samples: &[i16], - ) -> Result, ColdVoxError> { - Err(SttError::NotAvailable { - plugin: "leopard".to_string(), - reason: "Leopard plugin not yet implemented".to_string(), - } - .into()) - } - - async fn finalize(&mut self) -> Result, ColdVoxError> { - Ok(None) - } - - async fn reset(&mut self) -> Result<(), ColdVoxError> { - Ok(()) - } -} - -pub struct LeopardPluginFactory { - config: LeopardConfig, -} - -impl Default for LeopardPluginFactory { - fn default() -> Self { - Self::new() - } -} - -impl LeopardPluginFactory { - pub fn new() -> Self { - Self { - config: LeopardConfig::default(), - } - } -} - -impl SttPluginFactory for LeopardPluginFactory { - fn create(&self) -> Result, ColdVoxError> { - Ok(Box::new(LeopardPlugin::with_config(self.config.clone()))) - } - - fn plugin_info(&self) -> PluginInfo { - LeopardPlugin::new().info() - } - - fn check_requirements(&self) -> Result<(), ColdVoxError> { - if self.config.access_key.is_empty() { - return Err(SttError::NotAvailable { - plugin: "leopard".to_string(), - reason: "Picovoice access key required".to_string(), - } - .into()); - } - - Err(SttError::NotAvailable { - plugin: "leopard".to_string(), - reason: "Leopard SDK not yet integrated".to_string(), - } - .into()) - } -} diff --git a/crates/coldvox-stt/src/plugins/mod.rs b/crates/coldvox-stt/src/plugins/mod.rs index 717e0c0b..f01c6e29 100644 --- a/crates/coldvox-stt/src/plugins/mod.rs +++ b/crates/coldvox-stt/src/plugins/mod.rs @@ -2,8 +2,6 @@ pub mod mock; pub mod noop; -// whisper backend temporarily removed; will be reintroduced as pure Rust implementation -// pub mod whisper_plugin; #[cfg(feature = "parakeet")] pub mod parakeet; @@ -11,22 +9,9 @@ pub mod parakeet; #[cfg(feature = "moonshine")] pub mod moonshine; -#[cfg(feature = "whisper")] -pub mod whisper_cpp; - -#[cfg(feature = "coqui")] -pub 
mod coqui; - -#[cfg(feature = "leopard")] -pub mod leopard; - -#[cfg(feature = "silero-stt")] -pub mod silero_stt; - // Re-export commonly used plugins pub use mock::MockPlugin; pub use noop::NoOpPlugin; -// pub use whisper_plugin::{WhisperPlugin, WhisperPluginFactory}; #[cfg(feature = "parakeet")] pub use parakeet::ParakeetPluginFactory; diff --git a/crates/coldvox-stt/src/plugins/silero_stt.rs b/crates/coldvox-stt/src/plugins/silero_stt.rs deleted file mode 100644 index aaed912e..00000000 --- a/crates/coldvox-stt/src/plugins/silero_stt.rs +++ /dev/null @@ -1,303 +0,0 @@ -//! Silero STT - ONNX-based lightweight speech recognition -//! -//! Silero provides lightweight ONNX models for speech recognition, -//! similar to their VAD models but for full transcription. - -use async_trait::async_trait; -use parking_lot::RwLock; -use std::path::PathBuf; -use std::sync::Arc; - -use crate::plugin::*; -use crate::plugin_types::*; -use crate::types::{TranscriptionConfig, TranscriptionEvent}; -use coldvox_foundation::error::{ColdVoxError, SttError}; - -/// Silero STT model variants -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum SileroSttModel { - /// Small model (~50MB) - Fast, lower accuracy - Small, - /// Medium model (~100MB) - Balanced - Medium, - /// Large model (~200MB) - Better accuracy - Large, -} - -impl SileroSttModel { - pub fn model_size_mb(&self) -> u32 { - match self { - Self::Small => 50, - Self::Medium => 100, - Self::Large => 200, - } - } - - pub fn expected_accuracy(&self) -> AccuracyLevel { - match self { - Self::Small => AccuracyLevel::Medium, - Self::Medium => AccuracyLevel::Medium, - Self::Large => AccuracyLevel::High, - } - } -} - -/// Silero STT configuration -#[derive(Debug, Clone)] -pub struct SileroSttConfig { - /// Model variant to use - pub model: SileroSttModel, - /// Path to ONNX model file - pub model_path: Option, - /// Language (supports multiple languages) - pub language: String, - /// Number of threads for ONNX runtime - pub 
num_threads: u32, - /// Use GPU acceleration if available - pub use_gpu: bool, -} - -impl Default for SileroSttConfig { - fn default() -> Self { - Self { - model: SileroSttModel::Small, - model_path: None, - language: "en".to_string(), - num_threads: 4, - use_gpu: false, - } - } -} - -/// Silero STT Plugin -/// -/// ONNX-based STT engine providing: -/// - Lightweight models -/// - Good accuracy for common languages -/// - CPU-optimized inference -/// - Easy deployment -#[derive(Debug)] -#[allow(dead_code)] -pub struct SileroSttPlugin { - config: SileroSttConfig, - state: Arc>, - metrics: Arc>, - // Future: Add ONNX runtime - // session: Option, - // tokenizer: Option, -} - -impl Default for SileroSttPlugin { - fn default() -> Self { - Self::new() - } -} - -impl SileroSttPlugin { - pub fn new() -> Self { - Self::with_config(SileroSttConfig::default()) - } - - pub fn with_config(config: SileroSttConfig) -> Self { - Self { - config, - state: Arc::new(RwLock::new(PluginState::Uninitialized)), - metrics: Arc::new(RwLock::new(PluginMetrics::default())), - } - } - - pub fn enhanced_info() -> EnhancedPluginInfo { - EnhancedPluginInfo { - id: "silero-stt".to_string(), - name: "Silero STT".to_string(), - description: "ONNX-based lightweight speech recognition".to_string(), - version: "0.2.0".to_string(), - author: "Silero Team".to_string(), - license: "MIT".to_string(), - homepage: Some("https://github.com/snakers4/silero-models".to_string()), - - accuracy_level: AccuracyLevel::Medium, - latency_profile: LatencyProfile { - avg_ms: 60, - p95_ms: 120, - p99_ms: 250, - rtf: 0.2, - }, - resource_profile: ResourceProfile { - peak_memory_mb: 150, - avg_cpu_percent: 15.0, - uses_gpu: false, - disk_space_mb: 50, - }, - model_size: ModelSize::Small, - - languages: vec![ - LanguageSupport { - code: "en".to_string(), - name: "English".to_string(), - quality: LanguageQuality::Stable, - variants: vec![], - }, - LanguageSupport { - code: "ru".to_string(), - name: "Russian".to_string(), - 
quality: LanguageQuality::Stable, - variants: vec![], - }, - LanguageSupport { - code: "de".to_string(), - name: "German".to_string(), - quality: LanguageQuality::Beta, - variants: vec![], - }, - LanguageSupport { - code: "es".to_string(), - name: "Spanish".to_string(), - quality: LanguageQuality::Beta, - variants: vec![], - }, - ], - - requires_internet: false, - requires_gpu: false, - requires_license_key: false, - - is_beta: true, - is_deprecated: false, - source: PluginSource::BuiltIn, - - metrics: None, - } - } -} - -#[async_trait] -impl SttPlugin for SileroSttPlugin { - fn info(&self) -> PluginInfo { - PluginInfo { - id: "silero-stt".to_string(), - name: "Silero STT".to_string(), - description: "ONNX-based lightweight STT (not yet available)".to_string(), - requires_network: false, - is_local: true, - is_available: false, - supported_languages: vec![ - "en".to_string(), - "ru".to_string(), - "de".to_string(), - "es".to_string(), - ], - memory_usage_mb: Some(self.config.model.model_size_mb()), - } - } - - fn capabilities(&self) -> PluginCapabilities { - PluginCapabilities { - streaming: true, - batch: true, - word_timestamps: false, - confidence_scores: true, - speaker_diarization: false, - auto_punctuation: false, - custom_vocabulary: false, - } - } - - async fn is_available(&self) -> Result { - // Check for ONNX runtime - // Check for model file - Ok(false) // Not yet implemented - } - - async fn initialize(&mut self, _config: TranscriptionConfig) -> Result<(), ColdVoxError> { - // Future: - // 1. Load ONNX model - // 2. Initialize tokenizer - // 3. 
Setup ONNX session - - Err(SttError::NotAvailable { - plugin: "silero-stt".to_string(), - reason: "Silero STT integration not yet implemented".to_string(), - } - .into()) - } - - async fn process_audio( - &mut self, - _samples: &[i16], - ) -> Result, ColdVoxError> { - Err(SttError::NotAvailable { - plugin: "silero-stt".to_string(), - reason: "Silero STT plugin not yet implemented".to_string(), - } - .into()) - } - - async fn finalize(&mut self) -> Result, ColdVoxError> { - Ok(None) - } - - async fn reset(&mut self) -> Result<(), ColdVoxError> { - Ok(()) - } -} - -pub struct SileroSttPluginFactory { - config: SileroSttConfig, -} - -impl Default for SileroSttPluginFactory { - fn default() -> Self { - Self::new() - } -} - -impl SileroSttPluginFactory { - pub fn new() -> Self { - Self { - config: SileroSttConfig::default(), - } - } -} - -impl SttPluginFactory for SileroSttPluginFactory { - fn create(&self) -> Result, ColdVoxError> { - Ok(Box::new(SileroSttPlugin::with_config(self.config.clone()))) - } - - fn plugin_info(&self) -> PluginInfo { - SileroSttPlugin::new().info() - } - - fn check_requirements(&self) -> Result<(), ColdVoxError> { - Err(SttError::NotAvailable { - plugin: "silero-stt".to_string(), - reason: "Silero STT not yet integrated".to_string(), - } - .into()) - } -} - -// Future implementation notes: -// -// Silero STT integration will require: -// -// 1. ONNX Runtime: -// - Use ort crate for ONNX inference -// - Support CPU and GPU backends -// - Optimize for mobile/edge devices -// -// 2. Tokenization: -// - Implement Silero's tokenizer -// - Handle multiple languages -// - Support subword tokenization -// -// 3. Model Management: -// - Download models from Silero's repository -// - Cache models locally -// - Support model updates -// -// 4. 
Performance: -// - Batch processing for efficiency -// - Streaming support with buffering -// - Model quantization options diff --git a/crates/coldvox-stt/src/plugins/whisper_cpp.rs b/crates/coldvox-stt/src/plugins/whisper_cpp.rs deleted file mode 100644 index 2aa3d399..00000000 --- a/crates/coldvox-stt/src/plugins/whisper_cpp.rs +++ /dev/null @@ -1,374 +0,0 @@ -//! Whisper.cpp - Lightweight C++ implementation of OpenAI Whisper -//! -//! This plugin wraps whisper.cpp, a lightweight C++ port of OpenAI's Whisper -//! that uses ggml quantization for efficient inference on CPU. - -use async_trait::async_trait; -use parking_lot::RwLock; -use std::path::PathBuf; -use std::sync::Arc; -use tracing::info; - -use crate::plugin::*; -use crate::plugin_types::*; -use crate::types::{TranscriptionConfig, TranscriptionEvent}; -use coldvox_foundation::error::{ColdVoxError, SttError}; - -/// Whisper model types (ggml quantized) -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum WhisperModelType { - /// Tiny model - 39MB, fastest, lower accuracy - Tiny, - /// Tiny English-only - 39MB, optimized for English - TinyEn, - /// Base model - 74MB, balanced - Base, - /// Base English-only - 74MB - BaseEn, - /// Small model - 244MB, good accuracy - Small, - /// Small English-only - 244MB - SmallEn, - /// Medium model - 769MB, better accuracy - Medium, - /// Medium English-only - 769MB - MediumEn, - /// Large model - 1550MB, best accuracy - Large, -} - -impl WhisperModelType { - pub fn model_size_mb(&self) -> u32 { - match self { - Self::Tiny | Self::TinyEn => 39, - Self::Base | Self::BaseEn => 74, - Self::Small | Self::SmallEn => 244, - Self::Medium | Self::MediumEn => 769, - Self::Large => 1550, - } - } - - pub fn expected_accuracy(&self) -> AccuracyLevel { - match self { - Self::Tiny | Self::TinyEn => AccuracyLevel::Low, - Self::Base | Self::BaseEn => AccuracyLevel::Medium, - Self::Small | Self::SmallEn => AccuracyLevel::High, - Self::Medium | Self::MediumEn => AccuracyLevel::High, - 
Self::Large => AccuracyLevel::VeryHigh, - } - } - - pub fn is_english_only(&self) -> bool { - matches!( - self, - Self::TinyEn | Self::BaseEn | Self::SmallEn | Self::MediumEn - ) - } - - pub fn filename(&self) -> &str { - match self { - Self::Tiny => "ggml-tiny.bin", - Self::TinyEn => "ggml-tiny.en.bin", - Self::Base => "ggml-base.bin", - Self::BaseEn => "ggml-base.en.bin", - Self::Small => "ggml-small.bin", - Self::SmallEn => "ggml-small.en.bin", - Self::Medium => "ggml-medium.bin", - Self::MediumEn => "ggml-medium.en.bin", - Self::Large => "ggml-large.bin", - } - } -} - -/// Whisper.cpp configuration -#[derive(Debug, Clone)] -pub struct WhisperCppConfig { - /// Model type to use - pub model_type: WhisperModelType, - /// Path to model file - pub model_path: Option, - /// Target language (ISO 639-1) - pub language: String, - /// Enable word-level timestamps - pub enable_timestamps: bool, - /// Number of threads for inference - pub num_threads: u32, - /// Use GPU if available (requires CUDA/Metal build) - pub use_gpu: bool, - /// Beam size for decoding - pub beam_size: u32, - /// Temperature for sampling - pub temperature: f32, -} - -impl Default for WhisperCppConfig { - fn default() -> Self { - Self { - model_type: WhisperModelType::TinyEn, - model_path: None, - language: "en".to_string(), - enable_timestamps: true, - num_threads: 4, - use_gpu: false, - beam_size: 5, - temperature: 0.0, - } - } -} - -/// Whisper.cpp STT Plugin -/// -/// This is a stub implementation for whisper.cpp integration. 
-/// Once implemented, it will provide: -/// - Quantized model support (ggml format) -/// - CPU-optimized inference -/// - Multiple model sizes for different accuracy/speed tradeoffs -#[derive(Debug)] -pub struct WhisperCppPlugin { - config: WhisperCppConfig, - state: Arc>, - // Future: Add actual whisper.cpp context - // context: Option<*mut WhisperContext>, -} - -impl WhisperCppPlugin { - pub fn new() -> Self { - Self::with_config(WhisperCppConfig::default()) - } - - pub fn with_config(config: WhisperCppConfig) -> Self { - Self { - config, - state: Arc::new(RwLock::new(PluginState::Uninitialized)), - } - } - - pub fn enhanced_info() -> EnhancedPluginInfo { - let config = WhisperCppConfig::default(); - - EnhancedPluginInfo { - id: "whisper-cpp".to_string(), - name: "Whisper.cpp".to_string(), - description: "Lightweight C++ implementation of OpenAI Whisper with quantized models" - .to_string(), - version: "1.5.0".to_string(), - author: "ggerganov".to_string(), - license: "MIT".to_string(), - homepage: Some("https://github.com/ggerganov/whisper.cpp".to_string()), - - accuracy_level: config.model_type.expected_accuracy(), - latency_profile: LatencyProfile { - avg_ms: 100, - p95_ms: 200, - p99_ms: 400, - rtf: 0.25, - }, - resource_profile: ResourceProfile { - peak_memory_mb: config.model_type.model_size_mb() + 100, - avg_cpu_percent: 30.0, - uses_gpu: config.use_gpu, - disk_space_mb: config.model_type.model_size_mb(), - }, - model_size: ModelSize::from_mb(config.model_type.model_size_mb()), - - languages: if config.model_type.is_english_only() { - vec![LanguageSupport { - code: "en".to_string(), - name: "English".to_string(), - quality: LanguageQuality::Premium, - variants: vec!["en-US".to_string(), "en-GB".to_string()], - }] - } else { - // Whisper supports 99+ languages - vec![LanguageSupport { - code: "multi".to_string(), - name: "Multilingual".to_string(), - quality: LanguageQuality::Stable, - variants: vec![], - }] - }, - - requires_internet: false, - 
requires_gpu: false, - requires_license_key: false, - - is_beta: false, - is_deprecated: false, - source: PluginSource::BuiltIn, - - metrics: None, - } - } -} - -impl Default for WhisperCppPlugin { - fn default() -> Self { - Self::new() - } -} - -#[async_trait] -impl SttPlugin for WhisperCppPlugin { - fn info(&self) -> PluginInfo { - PluginInfo { - id: "whisper-cpp".to_string(), - name: "Whisper.cpp".to_string(), - description: "Lightweight Whisper implementation (not yet available)".to_string(), - requires_network: false, - is_local: true, - is_available: false, // Not yet implemented - supported_languages: if self.config.model_type.is_english_only() { - vec!["en".to_string()] - } else { - vec!["multi".to_string()] - }, - memory_usage_mb: Some(self.config.model_type.model_size_mb()), - } - } - - fn capabilities(&self) -> PluginCapabilities { - PluginCapabilities { - streaming: true, - batch: true, - word_timestamps: self.config.enable_timestamps, - confidence_scores: true, - speaker_diarization: false, - auto_punctuation: true, // Whisper includes punctuation - custom_vocabulary: false, - } - } - - async fn is_available(&self) -> Result { - // Check if whisper.cpp library is available - // In the future, check for: - // 1. whisper.cpp shared library - // 2. Model file existence - // 3. CPU features (AVX, etc.) - - Ok(false) // Not yet implemented - } - - async fn initialize(&mut self, _config: TranscriptionConfig) -> Result<(), ColdVoxError> { - info!("Whisper.cpp plugin is a stub - not yet implemented"); - - // Future implementation: - // 1. Find or download model - // 2. Initialize whisper context - // 3. Configure parameters - // 4. 
Warm up with test audio - - Err(SttError::NotAvailable { - plugin: "whisper-cpp".to_string(), - reason: "Whisper.cpp integration not yet implemented".to_string(), - } - .into()) - } - - async fn process_audio( - &mut self, - _samples: &[i16], - ) -> Result, ColdVoxError> { - Err(SttError::NotAvailable { - plugin: "whisper-cpp".to_string(), - reason: "Whisper.cpp plugin not yet implemented".to_string(), - } - .into()) - } - - async fn finalize(&mut self) -> Result, ColdVoxError> { - Ok(None) - } - - async fn reset(&mut self) -> Result<(), ColdVoxError> { - let mut state = self.state.write(); - *state = PluginState::Ready; - Ok(()) - } -} - -/// Factory for creating Whisper.cpp plugin instances -pub struct WhisperCppPluginFactory { - config: WhisperCppConfig, -} - -impl WhisperCppPluginFactory { - pub fn new() -> Self { - Self { - config: WhisperCppConfig::default(), - } - } - - pub fn with_config(config: WhisperCppConfig) -> Self { - Self { config } - } - - pub fn with_model(model_type: WhisperModelType) -> Self { - let config = WhisperCppConfig { - model_type, - ..Default::default() - }; - Self { config } - } -} - -impl Default for WhisperCppPluginFactory { - fn default() -> Self { - Self::new() - } -} - -impl SttPluginFactory for WhisperCppPluginFactory { - fn create(&self) -> Result, ColdVoxError> { - Ok(Box::new(WhisperCppPlugin::with_config(self.config.clone()))) - } - - fn plugin_info(&self) -> PluginInfo { - WhisperCppPlugin::new().info() - } - - fn check_requirements(&self) -> Result<(), ColdVoxError> { - // Check for whisper.cpp library - // Check for model files - // Check CPU features - - Err(SttError::NotAvailable { - plugin: "whisper-cpp".to_string(), - reason: "Whisper.cpp not yet integrated".to_string(), - } - .into()) - } -} - -// Future implementation notes: -// -// Integration with whisper.cpp will require: -// -// 1. 
FFI Bindings: -// - Create Rust bindings for whisper.cpp C API -// - Handle memory management safely -// - Implement streaming interface -// -// 2. Model Management: -// - Download models from Hugging Face -// - Convert models to ggml format if needed -// - Cache models efficiently -// -// 3. Performance Optimizations: -// - Use CPU SIMD instructions (AVX, NEON) -// - Implement batch processing -// - Add model quantization options -// -// 4. Advanced Features: -// - Language detection -// - Translation mode -// - Diarization (future whisper.cpp feature) -// -// Example usage: -// ```rust -// let plugin = WhisperCppPlugin::with_config(WhisperCppConfig { -// model_type: WhisperModelType::Small, -// language: "en".to_string(), -// use_gpu: true, -// ..Default::default() -// }); -// ``` diff --git a/crates/coldvox-stt/src/plugins/whisper_plugin.rs b/crates/coldvox-stt/src/plugins/whisper_plugin.rs deleted file mode 100644 index a811dac5..00000000 --- a/crates/coldvox-stt/src/plugins/whisper_plugin.rs +++ /dev/null @@ -1,943 +0,0 @@ -//! Faster-Whisper speech-to-text plugin implementation. -//! -//! This plugin provides a local transcription backend powered by the -//! `faster-whisper` project. It relies on the `faster-whisper-rs` -//! bindings which bridge to the Python implementation. At this stage we -//! intentionally focus on providing a functional baseline capable of -//! loading a model, buffering audio produced by the VAD pipeline, and -//! performing batch transcription when the VAD signals the end of an -//! utterance. Follow-up work will iterate on streaming partials, -//! fine-grained error handling, and production hardening. -//! -//! # GPU Detection Caching -//! -//! The plugin implements GPU detection caching using `OnceLock` to avoid -//! repeated Python round-trips during `WhisperPluginFactory` construction. -//! This significantly improves performance when creating multiple factory -//! instances, which is common in testing scenarios. -//! -//! 
The caching mechanism: -//! - Uses a static `OnceLock` to cache the GPU detection result -//! - Performs GPU detection only once on the first call to `detect_device()` -//! - Returns the cached result for all subsequent calls -//! - Is thread-safe and handles concurrent access correctly -//! - Can still be overridden by setting the `WHISPER_DEVICE` environment variable -//! -//! This approach eliminates the overhead of shell-outs to Python/PyTorch -//! while maintaining the flexibility to override the detected device. - -use crate::plugin::*; -use crate::types::{TranscriptionConfig, TranscriptionEvent}; -#[cfg(feature = "whisper")] -use crate::WordInfo; -use async_trait::async_trait; -use coldvox_foundation::env::{detect_environment, Environment}; -use std::env; -use std::path::{Path, PathBuf}; -use std::sync::OnceLock; -#[allow(unused_imports)] -use tracing::{debug, info, warn}; - -#[cfg(feature = "whisper")] -use faster_whisper_rs::{ - config::{VadConfig, WhisperConfig}, - WhisperModel, -}; -#[cfg(feature = "whisper")] -use pyo3::Python; -#[cfg(feature = "whisper")] -use tempfile::Builder; - -use coldvox_foundation::error::{ColdVoxError, SttError}; - -/// Static cache for GPU detection result to avoid repeated Python round-trips -/// -/// This cache stores the result of GPU detection to avoid repeated shell-outs -/// to Python/PyTorch during `WhisperPluginFactory` construction. The cache is -/// initialized once using `OnceLock` and then reused for all subsequent calls. -/// -/// The cache is thread-safe and handles concurrent access correctly. The cached -/// value can still be overridden by setting the `WHISPER_DEVICE` environment -/// variable before creating a factory instance. -static GPU_DETECTION_CACHE: OnceLock = OnceLock::new(); - -/// Whisper-based STT plugin backed by faster-whisper. 
-#[derive(Debug)] -pub struct WhisperPlugin { - model_path: Option, - model_size: WhisperModelSize, - language: Option, - device: String, - compute_type: String, - #[allow(dead_code)] - initialized: bool, - #[cfg(feature = "whisper")] - model: Option, - #[cfg(feature = "whisper")] - audio_buffer: Vec, - #[cfg(feature = "whisper")] - active_config: Option, -} - -impl WhisperPlugin { - pub fn new() -> Self { - Self { - model_path: None, - model_size: WhisperModelSize::default(), - language: None, - device: "cpu".to_string(), - compute_type: "int8".to_string(), - initialized: false, - #[cfg(feature = "whisper")] - model: None, - #[cfg(feature = "whisper")] - audio_buffer: Vec::new(), - #[cfg(feature = "whisper")] - active_config: None, - } - } - - pub fn with_model_size(mut self, size: WhisperModelSize) -> Self { - self.model_size = size; - self - } - - pub fn with_language(mut self, language: String) -> Self { - self.language = Some(language); - self - } - - pub fn with_model_path(mut self, path: PathBuf) -> Self { - self.model_path = Some(path); - self - } - - pub fn with_device>(mut self, device: S) -> Self { - self.device = device.into(); - self - } - - pub fn with_compute_type>(mut self, compute_type: S) -> Self { - self.compute_type = compute_type.into(); - self - } - - #[cfg(feature = "whisper")] - fn resolve_model_identifier( - &self, - config: &TranscriptionConfig, - ) -> Result { - let path_candidate = if !config.model_path.is_empty() { - Some(PathBuf::from(&config.model_path)) - } else { - self.model_path.clone() - }; - - if let Some(path) = path_candidate { - if path.exists() { - return Ok(path.to_string_lossy().to_string()); - } - - warn!( - target: "coldvox::stt::whisper", - candidate = %path.display(), - "Configured Whisper model path does not exist; falling back to builtin model size" - ); - } - - Ok(self.model_size.model_identifier()) - } - - #[cfg(feature = "whisper")] - fn build_whisper_config(&self, config: &TranscriptionConfig) -> WhisperConfig { 
- WhisperConfig { - language: self.language.clone(), - beam_size: config.max_alternatives.max(1) as usize, - best_of: config.max_alternatives.max(1) as usize, - vad: VadConfig { - active: config.streaming, - ..Default::default() - }, - ..Default::default() - } - } -} - -impl Default for WhisperPlugin { - fn default() -> Self { - Self::new() - } -} - -/// Available Whisper model sizes. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] -pub enum WhisperModelSize { - Tiny, - #[default] - Base, - Small, - Medium, - Large, - LargeV2, - LargeV3, -} - -impl WhisperModelSize { - fn memory_usage_mb(&self) -> u32 { - match self { - Self::Tiny => 100, - Self::Base => 200, - Self::Small => 500, - Self::Medium => 1500, - Self::Large | Self::LargeV2 | Self::LargeV3 => 3000, - } - } - - #[allow(dead_code)] - fn model_identifier(&self) -> String { - match self { - Self::Tiny => "tiny".to_string(), - Self::Base => "base.en".to_string(), - Self::Small => "small.en".to_string(), - Self::Medium => "medium.en".to_string(), - Self::Large => "large".to_string(), - Self::LargeV2 => "large-v2".to_string(), - Self::LargeV3 => "large-v3".to_string(), - } - } -} - -/// Get the default model size for the given environment -fn default_model_size_for_environment(env: Environment) -> WhisperModelSize { - match env { - Environment::CI => { - // In CI, use the smallest model to conserve resources - WhisperModelSize::Tiny - } - Environment::Development => { - // In development, check available memory and choose accordingly - if let Some(available_mb) = WhisperPluginFactory::get_available_memory_mb() { - // On high-performance developer workstations, prefer the largest model for accuracy - // Use a conservative threshold (>= 12 GB available) to avoid impacting typical laptops - if available_mb >= 12_000 { - WhisperModelSize::LargeV3 - } else { - WhisperPluginFactory::get_model_size_for_memory(available_mb) - } - } else { - // If we can't determine memory, use a small model - 
WhisperModelSize::Base - } - } - Environment::Production => { - // In production, check available memory and choose accordingly - if let Some(available_mb) = WhisperPluginFactory::get_available_memory_mb() { - WhisperPluginFactory::get_model_size_for_memory(available_mb) - } else { - // If we can't determine memory, use a balanced model - WhisperModelSize::Small - } - } - } -} - -#[async_trait] -impl SttPlugin for WhisperPlugin { - fn info(&self) -> PluginInfo { - PluginInfo { - id: "whisper".to_string(), - name: "Faster Whisper".to_string(), - description: "Local transcription via faster-whisper".to_string(), - requires_network: false, - is_local: true, - is_available: check_whisper_available(), - supported_languages: vec!["auto".to_string(), "en".to_string()], - memory_usage_mb: Some(self.model_size.memory_usage_mb()), - } - } - - fn capabilities(&self) -> PluginCapabilities { - PluginCapabilities { - streaming: false, - batch: true, - word_timestamps: true, - confidence_scores: true, - speaker_diarization: false, - auto_punctuation: true, - custom_vocabulary: false, - } - } - - async fn is_available(&self) -> Result { - Ok(check_whisper_available()) - } - - async fn initialize(&mut self, config: TranscriptionConfig) -> Result<(), ColdVoxError> { - #[cfg(feature = "whisper")] - { - let model_id = self.resolve_model_identifier(&config)?; - let mut whisper_config = self.build_whisper_config(&config); - if whisper_config.language.is_none() { - whisper_config.language = self.language.clone(); - } - - // If the selected model is English-only (e.g., base.en/small.en/medium.en) - // and no language was set explicitly, default to "en" to avoid runtime warnings. 
- if whisper_config.language.is_none() && model_id.to_lowercase().contains(".en") { - whisper_config.language = Some("en".to_string()); - } - - debug!( - target: "coldvox::stt::whisper", - model = %model_id, - device = %self.device, - compute = %self.compute_type, - "Initializing Faster Whisper model" - ); - - let model = WhisperModel::new( - model_id, - self.device.clone(), - self.compute_type.clone(), - whisper_config, - ) - .map_err(|err| SttError::LoadFailed(err.to_string()))?; - - self.model = Some(model); - self.audio_buffer.clear(); - self.active_config = Some(config); - self.initialized = true; - info!( - target: "coldvox::stt::whisper", - "Faster Whisper plugin initialized" - ); - return Ok(()); - } - - #[cfg(not(feature = "whisper"))] - { - let _ = config; - Err(SttError::NotAvailable { - plugin: "whisper".to_string(), - reason: "Whisper feature not compiled".to_string(), - } - .into()) - } - } - - async fn process_audio( - &mut self, - samples: &[i16], - ) -> Result, ColdVoxError> { - #[cfg(feature = "whisper")] - { - if !self.initialized { - return Err(SttError::NotAvailable { - plugin: "whisper".to_string(), - reason: "Faster Whisper plugin not initialized".to_string(), - } - .into()); - } - - self.audio_buffer.extend_from_slice(samples); - Ok(None) - } - - #[cfg(not(feature = "whisper"))] - { - let _ = samples; - Err(SttError::NotAvailable { - plugin: "whisper".to_string(), - reason: "Whisper feature not compiled".to_string(), - } - .into()) - } - } - - async fn finalize(&mut self) -> Result, ColdVoxError> { - #[cfg(feature = "whisper")] - { - if !self.initialized { - return Ok(None); - } - - if self.audio_buffer.is_empty() { - return Ok(None); - } - - let temp = Builder::new() - .prefix("coldvox-whisper-") - .suffix(".wav") - .tempfile() - .map_err(|err| SttError::TranscriptionFailed(err.to_string()))?; - let temp_path = temp.path().to_path_buf(); - - { - let spec = hound::WavSpec { - channels: 1, - sample_rate: crate::constants::SAMPLE_RATE_HZ, - 
bits_per_sample: 16, - sample_format: hound::SampleFormat::Int, - }; - let mut writer = hound::WavWriter::create(&temp_path, spec) - .map_err(|err| SttError::TranscriptionFailed(err.to_string()))?; - for sample in &self.audio_buffer { - writer - .write_sample(*sample) - .map_err(|err| SttError::TranscriptionFailed(err.to_string()))?; - } - writer - .finalize() - .map_err(|err| SttError::TranscriptionFailed(err.to_string()))?; - } - - let transcription = self - .model - .as_ref() - .ok_or_else(|| { - SttError::TranscriptionFailed("Faster Whisper model not loaded".to_string()) - })? - .transcribe(temp_path.to_string_lossy().to_string()) - .map_err(|err| SttError::TranscriptionFailed(err.to_string()))?; - - let mut text = transcription.to_string(); - if text.ends_with('\n') { - text.pop(); - } - - let include_words = self - .active_config - .as_ref() - .map(|cfg| cfg.include_words) - .unwrap_or(false); - - let words = if include_words { - Some( - transcription - .1 - .iter() - .map(|segment| WordInfo { - start: segment.start, - end: segment.end, - conf: (1.0 - segment.no_speech_prob).clamp(0.0, 1.0), - text: segment.text.clone(), - }) - .collect(), - ) - } else { - None - }; - - self.audio_buffer.clear(); - // Ensure the temporary file is cleaned up. 
- if let Err(err) = temp.close() { - warn!( - target: "coldvox::stt::whisper", - error = %err, - "Failed to remove temporary whisper audio file" - ); - } - - Ok(Some(TranscriptionEvent::Final { - utterance_id: 0, - text, - words, - })) - } - - #[cfg(not(feature = "whisper"))] - { - Err(SttError::NotAvailable { - plugin: "whisper".to_string(), - reason: "Whisper feature not compiled".to_string(), - } - .into()) - } - } - - async fn reset(&mut self) -> Result<(), ColdVoxError> { - #[cfg(feature = "whisper")] - { - self.audio_buffer.clear(); - Ok(()) - } - - #[cfg(not(feature = "whisper"))] - { - Err(SttError::NotAvailable { - plugin: "whisper".to_string(), - reason: "Whisper feature not compiled".to_string(), - } - .into()) - } - } - - async fn load_model(&mut self, model_path: Option<&Path>) -> Result<(), ColdVoxError> { - if let Some(path) = model_path { - self.model_path = Some(path.to_path_buf()); - } - Ok(()) - } - - async fn unload(&mut self) -> Result<(), ColdVoxError> { - #[cfg(feature = "whisper")] - { - self.model = None; - self.audio_buffer.clear(); - self.initialized = false; - Ok(()) - } - - #[cfg(not(feature = "whisper"))] - { - Err(SttError::NotAvailable { - plugin: "whisper".to_string(), - reason: "Whisper feature not compiled".to_string(), - } - .into()) - } - } -} - -/// Factory for creating WhisperPlugin instances. 
-pub struct WhisperPluginFactory { - model_path: Option, - model_size: WhisperModelSize, - language: Option, - device: String, - compute_type: String, -} - -impl WhisperPluginFactory { - pub fn new() -> Self { - // Check for WHISPER_MODEL_SIZE environment variable first - let model_size = if let Ok(model_size_str) = env::var("WHISPER_MODEL_SIZE") { - Self::parse_model_size(&model_size_str).unwrap_or_else(|_| { - warn!( - target: "coldvox::stt::whisper", - "Invalid WHISPER_MODEL_SIZE value: {}, using default", model_size_str - ); - default_model_size_for_environment(detect_environment()) - }) - } else { - default_model_size_for_environment(detect_environment()) - }; - - let device = std::env::var("WHISPER_DEVICE").unwrap_or_else(|_| Self::detect_device()); - let compute_type = std::env::var("WHISPER_COMPUTE").unwrap_or_else(|_| { - if device == "cuda" { - "float16".to_string() - } else { - "int8".to_string() - } - }); - - Self { - model_path: std::env::var("WHISPER_MODEL_PATH").ok().map(PathBuf::from), - model_size, - language: std::env::var("WHISPER_LANGUAGE") - .ok() - .or(Some("en".to_string())), - device, - compute_type, - } - } - - /// Detect GPU availability and return appropriate device - /// - /// This function uses `OnceLock` to cache the GPU detection result and avoid - /// repeated Python round-trips. The first call performs the actual detection - /// by shell-ing out to Python/PyTorch to check CUDA availability. Subsequent - /// calls return the cached result. - /// - /// The detection process: - /// 1. Checks if CUDA is available using PyTorch's `torch.cuda.is_available()` - /// 2. Returns "cuda" if GPU is available, "cpu" otherwise - /// 3. Caches the result to avoid repeated shell-outs - /// - /// # Thread Safety - /// - /// This function is thread-safe and can be called concurrently from multiple - /// threads. The `OnceLock` ensures that only one thread performs the actual - /// detection, while others wait for and receive the cached result. 
- /// - /// # Environment Override - /// - /// The `WHISPER_DEVICE` environment variable can still override this detection - /// when creating a `WhisperPluginFactory`, as the factory checks this variable - /// before calling this function. - /// - /// # Returns - /// - /// Returns either "cuda" if a compatible GPU is detected, or "cpu" if no GPU - /// is available or detection fails. - pub fn detect_device() -> String { - GPU_DETECTION_CACHE.get_or_init(|| { - // Check for CUDA availability using PyTorch - if let Ok(output) = std::process::Command::new("python3") - .arg("-c") - .arg("import torch; print('cuda' if torch.cuda.is_available() else 'cpu')") - .output() - { - if output.status.success() { - let device = String::from_utf8_lossy(&output.stdout).trim().to_string(); - if device == "cuda" { - info!(target: "coldvox::stt::whisper", "CUDA GPU detected, using GPU acceleration"); - return device; - } - } - } - - warn!(target: "coldvox::stt::whisper", "No GPU detected, falling back to CPU"); - "cpu".to_string() - }).clone() - } - - /// Parse model size from string - fn parse_model_size(size_str: &str) -> Result { - match size_str.to_lowercase().as_str() { - "tiny" => Ok(WhisperModelSize::Tiny), - "base" => Ok(WhisperModelSize::Base), - "small" => Ok(WhisperModelSize::Small), - "medium" => Ok(WhisperModelSize::Medium), - "large" => Ok(WhisperModelSize::Large), - "large-v2" => Ok(WhisperModelSize::LargeV2), - "large-v3" => Ok(WhisperModelSize::LargeV3), - _ => Err(()), - } - } - - /// Get available memory in MB - fn get_available_memory_mb() -> Option { - // Test/override hook: allow forcing a specific available memory size via env var - // Useful for unit tests and local validation without relying on /proc/meminfo. 
- if let Ok(fake_mb) = env::var("WHISPER_AVAILABLE_MEM_MB") { - if let Ok(val) = fake_mb.parse::() { - return Some(val); - } - } - - #[cfg(unix)] - { - use std::fs; - match fs::read_to_string("/proc/meminfo") { - Ok(content) => { - for line in content.lines() { - if line.starts_with("MemAvailable:") { - let parts: Vec<&str> = line.split_whitespace().collect(); - if parts.len() >= 2 { - if let Ok(kb) = parts[1].parse::() { - return Some(kb / 1024); // Convert KB to MB - } - } - } - } - None - } - Err(_) => None, - } - } - - #[cfg(not(unix))] - { - // For non-Unix systems, return None - None - } - } - - /// Get appropriate model size based on available memory - fn get_model_size_for_memory(available_mb: u32) -> WhisperModelSize { - if available_mb < 500 { - WhisperModelSize::Tiny - } else if available_mb < 1000 { - WhisperModelSize::Base - } else if available_mb < 2000 { - WhisperModelSize::Small - } else if available_mb < 4000 { - WhisperModelSize::Medium - } else { - WhisperModelSize::Base // Default to Base even with lots of memory for stability - } - } - - pub fn with_model_size(mut self, size: WhisperModelSize) -> Self { - self.model_size = size; - self - } - - pub fn with_model_path(mut self, path: PathBuf) -> Self { - self.model_path = Some(path); - self - } - - pub fn with_language(mut self, language: String) -> Self { - self.language = Some(language); - self - } - - pub fn with_device>(mut self, device: S) -> Self { - self.device = device.into(); - self - } - - pub fn with_compute_type>(mut self, compute_type: S) -> Self { - self.compute_type = compute_type.into(); - self - } -} - -impl Default for WhisperPluginFactory { - fn default() -> Self { - Self::new() - } -} - -impl SttPluginFactory for WhisperPluginFactory { - fn create(&self) -> Result, ColdVoxError> { - let mut plugin = WhisperPlugin::new() - .with_model_size(self.model_size) - .with_device(self.device.clone()) - .with_compute_type(self.compute_type.clone()); - - if let Some(ref path) = 
self.model_path { - plugin = plugin.with_model_path(path.clone()); - } - - if let Some(ref lang) = self.language { - plugin = plugin.with_language(lang.clone()); - } - - Ok(Box::new(plugin)) - } - - fn plugin_info(&self) -> PluginInfo { - WhisperPlugin::new() - .with_model_size(self.model_size) - .with_device(self.device.clone()) - .with_compute_type(self.compute_type.clone()) - .info() - } - - fn check_requirements(&self) -> Result<(), ColdVoxError> { - if !check_whisper_available() { - return Err(SttError::NotAvailable { - plugin: "whisper".to_string(), - reason: "The faster-whisper Python module is not available. Install the `faster-whisper` package.".to_string(), - } - .into()); - } - - if let Some(ref path) = self.model_path { - if !path.exists() { - return Err(SttError::ModelNotFound { path: path.clone() }.into()); - } - } - - Ok(()) - } -} - -#[cfg(feature = "whisper")] -fn check_whisper_available() -> bool { - Python::with_gil(|py| py.import_bound("faster_whisper").is_ok()) -} - -#[cfg(not(feature = "whisper"))] -fn check_whisper_available() -> bool { - false -} - -#[cfg(test)] -mod tests { - use super::*; - use std::env; - - #[test] - fn model_size_identifier_mapping() { - assert_eq!(WhisperModelSize::Tiny.model_identifier(), "tiny"); - assert_eq!(WhisperModelSize::Base.model_identifier(), "base.en"); - assert_eq!(WhisperModelSize::LargeV3.model_identifier(), "large-v3"); - } - - #[test] - fn parse_model_size() { - assert_eq!( - WhisperPluginFactory::parse_model_size("tiny").unwrap(), - WhisperModelSize::Tiny - ); - assert_eq!( - WhisperPluginFactory::parse_model_size("large-v3").unwrap(), - WhisperModelSize::LargeV3 - ); - assert!(WhisperPluginFactory::parse_model_size("invalid").is_err()); - assert!(WhisperPluginFactory::parse_model_size("").is_err()); - } - - #[test] - fn environment_detection() { - // Test CI detection - env::set_var("CI", "true"); - assert_eq!(detect_environment(), Environment::CI); - env::remove_var("CI"); - - // Test development 
detection - env::set_var("DEBUG", "1"); - assert_eq!(detect_environment(), Environment::Development); - env::remove_var("DEBUG"); - - // Default to production when no indicators are present - assert_eq!(detect_environment(), Environment::Production); - } - - #[test] - fn model_size_for_memory() { - // Test memory-based model selection - assert_eq!( - WhisperPluginFactory::get_model_size_for_memory(300), - WhisperModelSize::Tiny - ); - assert_eq!( - WhisperPluginFactory::get_model_size_for_memory(750), - WhisperModelSize::Base - ); - assert_eq!( - WhisperPluginFactory::get_model_size_for_memory(1500), - WhisperModelSize::Small - ); - assert_eq!( - WhisperPluginFactory::get_model_size_for_memory(3000), - WhisperModelSize::Medium - ); - assert_eq!( - WhisperPluginFactory::get_model_size_for_memory(8000), - WhisperModelSize::Base - ); - } - - #[test] - fn environment_default_model_sizes() { - // Test default model sizes for each environment - assert_eq!( - default_model_size_for_environment(Environment::CI), - WhisperModelSize::Tiny - ); - - // Development and production depend on memory, so we can't test exact values - // without mocking memory detection - } - - #[test] - fn development_env_prefers_large_on_beefy_machine() { - // Simulate development environment - env::set_var("DEBUG", "1"); - // Simulate a beefy machine with lots of available memory - env::set_var("WHISPER_AVAILABLE_MEM_MB", "16384"); - - assert_eq!(detect_environment(), Environment::Development); - let chosen = default_model_size_for_environment(Environment::Development); - assert_eq!(chosen, WhisperModelSize::LargeV3); - - env::remove_var("WHISPER_AVAILABLE_MEM_MB"); - env::remove_var("DEBUG"); - } - - #[test] - fn production_env_does_not_escalate_to_large_by_default() { - // Ensure no CI or dev markers are present - for var in [ - "CI", - "CONTINUOUS_INTEGRATION", - "GITHUB_ACTIONS", - "GITLAB_CI", - "TRAVIS", - "CIRCLECI", - "JENKINS_URL", - "BUILDKITE", - "RUST_BACKTRACE", - "DEBUG", - "DEV", - 
] { - env::remove_var(var); - } - - // Simulate lots of memory - env::set_var("WHISPER_AVAILABLE_MEM_MB", "16384"); - assert_eq!(detect_environment(), Environment::Production); - let chosen = default_model_size_for_environment(Environment::Production); - assert_ne!(chosen, WhisperModelSize::LargeV3); - env::remove_var("WHISPER_AVAILABLE_MEM_MB"); - } - - #[test] - fn whisper_model_size_env_var() { - // Test that WHISPER_MODEL_SIZE environment variable is respected - env::set_var("WHISPER_MODEL_SIZE", "large-v2"); - let factory = WhisperPluginFactory::new(); - assert_eq!(factory.model_size, WhisperModelSize::LargeV2); - env::remove_var("WHISPER_MODEL_SIZE"); - - // Test with invalid value - should fall back to environment default - env::set_var("WHISPER_MODEL_SIZE", "invalid-size"); - let factory = WhisperPluginFactory::new(); - // Should not panic and should use a valid default based on environment - assert!(matches!( - factory.model_size, - WhisperModelSize::Tiny | WhisperModelSize::Base | WhisperModelSize::Small - )); - env::remove_var("WHISPER_MODEL_SIZE"); - } - - #[test] - fn gpu_detection_caching() { - // Ensure WHISPER_DEVICE is not set to test detection - env::remove_var("WHISPER_DEVICE"); - - // First call should trigger detection - let device1 = WhisperPluginFactory::detect_device(); - - // Second call should return cached result without re-running detection - let device2 = WhisperPluginFactory::detect_device(); - - // Both calls should return the same result - assert_eq!(device1, device2); - - // Verify the device is either "cuda" or "cpu" - assert!(device1 == "cuda" || device1 == "cpu"); - } - - #[test] - fn whisper_device_env_var_overrides_cache() { - // Set WHISPER_DEVICE to override detection - env::set_var("WHISPER_DEVICE", "cuda:1"); - - let factory = WhisperPluginFactory::new(); - assert_eq!(factory.device, "cuda:1"); - - env::remove_var("WHISPER_DEVICE"); - } - - #[test] - fn gpu_detection_thread_safety() { - use std::thread; - - // Ensure 
WHISPER_DEVICE is not set to test detection - env::remove_var("WHISPER_DEVICE"); - - let handles: Vec<_> = (0..10) - .map(|_| thread::spawn(WhisperPluginFactory::detect_device)) - .collect(); - - // All threads should get the same result - let results: Vec = handles - .into_iter() - .map(|handle| handle.join().unwrap()) - .collect(); - - // All results should be identical - let first_result = &results[0]; - assert!(results.iter().all(|r| r == first_result)); - - // Verify the device is either "cuda" or "cpu" - assert!(first_result == "cuda" || first_result == "cpu"); - } -} diff --git a/docs/MasterDocumentationPlaybook.md b/docs/MasterDocumentationPlaybook.md index ba91056a..a1bbaf4d 100644 --- a/docs/MasterDocumentationPlaybook.md +++ b/docs/MasterDocumentationPlaybook.md @@ -127,11 +127,9 @@ Notes: ## 4) Placement, Naming, and Linking Rules - All documentation lives in `/docs` (see approved exceptions). -- Use kebab-case filenames: `vosk-model-discovery-flow.md`. - Reference crate indexes: place at `docs/reference/crates/.md` and link to the crate’s README: - Example contents: “This is the index for `` — authoritative docs live in `../../../crates//README.md`.” - Do not duplicate README contents; add only navigation/context. -- Domain troubleshooting (e.g., Vosk model discovery) must live inside the relevant `docs/domains//troubleshooting/`. - Roadmap lives at `docs/architecture/roadmap.md` and is linked from `docs/architecture.md`. - ADRs live under `docs/architecture/adr-XXXX.md` with incrementing numeric IDs and MUST be linked from `docs/architecture.md`. - `AGENTS.md` (root) is the canonical source for AI agent orientation (overview, workspace map, key commands, feature flags). `CLAUDE.md` should reference or import from `AGENTS.md`. See §2.1 for the full list of agent configuration files and their hierarchy. 
diff --git a/docs/adr/0001-vosk-model-distribution.md b/docs/adr/0001-vosk-model-distribution.md deleted file mode 100644 index 52cc9368..00000000 --- a/docs/adr/0001-vosk-model-distribution.md +++ /dev/null @@ -1,45 +0,0 @@ -# ADR 0001: Vosk Model Distribution Strategy - -Date: 2025-09-09 -Status: Accepted -Decision Context: Provide an offline STT path with minimal friction while keeping CI deterministic. - -## Options Considered -1. Download model at build/test time (cache in CI). -2. Require manual developer download (document steps only). -3. Commit the small English model into the repository. -4. Use Git LFS for model binaries. - -## Decision -Option 3: Commit the small (≈40–50MB) English model directory `models/vosk-model-small-en-us-0.15/` directly. - -## Rationale -- Eliminates network flakiness in CI (faster, deterministic runs). -- Simplifies onboarding (clone → run with `--features vosk`). -- Keeps evaluation & e2e tests reproducible (same acoustic graph & LM). -- Size impact acceptable for now; single large directory, infrequent updates. - -## Trade-offs -- Repository clone size increases (initial penalty for contributors). -- Future model updates create larger history deltas (git object storage growth). -- Harder to swap languages/variants dynamically without adding more bulk. - -## Mitigations -- Provide `SHA256SUMS` for integrity checking. -- Document provenance & license in `THIRDPARTY.md`. -- Warn of impending deprecation if model count grows (trigger revisit threshold: >150MB cumulative models). -- Potential future migration path: move to Git LFS or per-language on-demand download. - -## Revisit Conditions -- Added second model variant or language. -- Clone complaints or CI bandwidth constraints surface. -- Need for reproducible benchmarks on alternative models. - -## Implementation Notes -- Model resolution logic (`model.rs`): env > config > `models/` dir > legacy root fallback (deprecated). 
-- CI job validates directory structure plus checksum (to be extended with full hash check if needed). - -## Related Documents -- `THIRDPARTY.md` -- `crates/coldvox-stt-vosk/src/model.rs` -- `README.md` (root) diff --git a/docs/domains/audio/aud-user-config-design.md b/docs/domains/audio/aud-user-config-design.md index 93dfd798..3f7ec973 100644 --- a/docs/domains/audio/aud-user-config-design.md +++ b/docs/domains/audio/aud-user-config-design.md @@ -65,16 +65,22 @@ All environment variables follow the `COLDVOX_*` prefix convention: ### Plugin Management ```bash # CLI ---stt-preferred vosk ---stt-fallbacks whisper,mock +--stt-preferred moonshine +--stt-fallbacks parakeet,mock --stt-require-local # Environment -COLDVOX_STT_PREFERRED=vosk -COLDVOX_STT_FALLBACKS=whisper,mock +COLDVOX_STT_PREFERRED=moonshine +COLDVOX_STT_FALLBACKS=parakeet,mock COLDVOX_STT_REQUIRE_LOCAL=true ``` +**Available STT Plugins:** +- **Moonshine**: CPU-efficient model, recommended for most users +- **Parakeet**: GPU-accelerated model, high-quality transcription +- **Mock**: Test/debug plugin, always returns mock transcript +- **NoOp**: Placeholder plugin that returns empty transcript + ### Resource Management ```bash # CLI @@ -157,7 +163,7 @@ COLDVOX_INJECTION_COOLDOWN_MS=100 ### Storage Options ```bash -# CLI (requires 'vosk' feature) +# CLI --save-transcriptions --save-audio --output-dir /path/to/transcriptions @@ -187,7 +193,8 @@ COLDVOX_INJECTION_COOLDOWN_MS=100 ## Configuration Validation ### Required Dependencies -- **Vosk STT**: Requires `libvosk` system library and model files +- **Moonshine STT**: Pure Rust implementation, CPU-efficient +- **Parakeet STT**: ONNX-based model, requires GPU for optimal performance - **Text Injection**: Platform-specific dependencies (ydotool, kdotool, etc.) 
- **Audio Hardware**: Input device availability checked at startup @@ -224,15 +231,15 @@ RUST_LOG=info,stt=debug,coldvox_audio=trace ### Basic Voice Dictation ```bash -cargo run --features vosk,text-injection -- \ +cargo run --features moonshine,text-injection -- \ --device "USB Microphone" \ --activation-mode vad \ --enable-text-injection ``` -### High-Quality Recording Setup +### High-Quality GPU-Accelerated Setup ```bash -cargo run --features vosk,text-injection -- \ +cargo run --features parakeet,text-injection -- \ --device "HyperX QuadCast" \ --resampler-quality quality \ --save-transcriptions \ @@ -250,11 +257,11 @@ cargo run -- --tui --activation-mode hotkey ### Production Environment ```bash -COLDVOX_STT_PREFERRED=vosk \ +COLDVOX_STT_PREFERRED=moonshine \ COLDVOX_STT_REQUIRE_LOCAL=true \ COLDVOX_ENABLE_TEXT_INJECTION=true \ COLDVOX_RESTORE_CLIPBOARD=true \ -cargo run --features vosk,text-injection --release +cargo run --features moonshine,text-injection --release ``` ## Migration and Compatibility @@ -274,7 +281,7 @@ cargo run --features vosk,text-injection --release ### Common Configuration Issues 1. **No Audio Device**: Check `--list-devices` output and permissions -2. **STT Not Working**: Verify Vosk model installation and `VOSK_MODEL_PATH` +2. **STT Not Working**: Verify STT plugin installation and correct plugin selected 3. **Text Injection Fails**: Review backend permissions and `--allow-*` flags 4. **Performance Issues**: Adjust `--resampler-quality` and memory limits diff --git a/docs/domains/foundation/fdn-testing-guide.md b/docs/domains/foundation/fdn-testing-guide.md index 19a00ae4..ceb7b5d7 100644 --- a/docs/domains/foundation/fdn-testing-guide.md +++ b/docs/domains/foundation/fdn-testing-guide.md @@ -12,24 +12,27 @@ domain_code: fdn ## Overview -ColdVox has a comprehensive test suite that tests real STT functionality using Vosk models and actual hardware. 
Tests are designed to work with actual speech recognition and real audio devices rather than mocks to ensure functional correctness. This guide explains how to run tests and set up the required dependencies. +ColdVox has a comprehensive test suite that tests real STT functionality using modern speech recognition models and actual hardware. Tests are designed to work with actual speech recognition and real audio devices rather than mocks to ensure functional correctness. This guide explains how to run tests and set up the required dependencies. ## Test Categories ### Core Tests -**All tests use real Vosk models and hardware for functional validation** +**All tests use real STT models and hardware for functional validation** -- ✅ **Test actual STT functionality** (use real Vosk models) +- ✅ **Test actual STT functionality** (use real Moonshine or Parakeet models) - ✅ **Validate end-to-end pipeline behavior** - ✅ **Test with real audio hardware** (microphones, speakers) -- ✅ **Require Vosk model setup** (see setup section below) +- ✅ **Require STT model setup** (see setup section below) ```bash -# Run tests with Vosk model (all environments) -VOSK_MODEL_PATH="$(pwd)/models/vosk-model-small-en-us-0.15" cargo test +# Run tests with Moonshine (CPU-efficient) +cargo test # Run tests for specific crate -VOSK_MODEL_PATH="$(pwd)/models/vosk-model-small-en-us-0.15" cargo test -p coldvox-app +cargo test -p coldvox-app + +# Run with Parakeet (GPU-accelerated) +cargo test --features parakeet ``` ### Integration Tests (Full Hardware & Models) @@ -56,26 +59,30 @@ cargo test test_candidate_order_default_first ### Hardware Requirements **All environments must have real hardware available** -All tests use real Vosk models and actual audio hardware to validate functionality. This includes development environments and self-hosted CI runners. +All tests use real STT models and actual audio hardware to validate functionality. 
This includes development environments and self-hosted CI runners. ### Required Setup -#### 1. Vosk Model Setup -All tests require a real Vosk model for STT functionality: +#### 1. STT Model Setup +Tests support multiple STT backends. Choose based on your environment: + +**Option A: Moonshine (CPU-efficient, recommended for most users)** +```bash +# Moonshine models are auto-downloaded on first use +# No manual setup required - the plugin handles model initialization +cargo test +``` + +**Option B: Parakeet (GPU-accelerated)** +```bash +# Requires CUDA/GPU support +cargo test --features parakeet +``` +**Option C: Mock (testing/development)** ```bash -# Option A: Use the automated setup script -./scripts/ci/setup-vosk-cache.sh - -# Option B: Manual setup -# 1. Download a Vosk model -wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -unzip vosk-model-small-en-us-0.15.zip -mkdir -p models/ -mv vosk-model-small-en-us-0.15 models/ - -# 2. Set environment variable -export VOSK_MODEL_PATH="$(pwd)/models/vosk-model-small-en-us-0.15" +# For testing without actual models +cargo test --features mock ``` #### 2. 
Audio Hardware Setup @@ -108,7 +115,7 @@ sudo usermod -a -G input $USER | Crate | Unit Tests | Integration Tests | Notes | |-------|------------|-------------------|-------| | `coldvox-audio` | Device enumeration, resampling | Real hardware detection | All tests run with real devices | -| `coldvox-app` | Plugin management, STT logic | End-to-end WAV processing | Vosk model required for all tests | +| `coldvox-app` | Plugin management, STT logic | End-to-end WAV processing | STT model required for all tests | | `coldvox-vad` | VAD algorithms | Real audio processing | Silero ONNX models tested | | `coldvox-stt` | Plugin interfaces | Model loading/inference | Real hardware and models used | @@ -118,7 +125,7 @@ sudo usermod -a -G input $USER # Audio tests (with real hardware) cargo test -p coldvox-audio -# STT tests (with real Vosk models) +# STT tests (with available models) cargo test -p coldvox-app stt --lib # Text injection tests (with real injection) @@ -127,32 +134,38 @@ cargo test -p coldvox-app --features text-injection injection # VAD tests (with real audio processing) cargo test -p coldvox-vad -# Full pipeline with real models and hardware -cargo test -p coldvox-app test_end_to_end_wav --features vosk +# Full pipeline with Moonshine (CPU) +cargo test -p coldvox-app test_end_to_end_wav + +# Full pipeline with Parakeet (GPU) +cargo test -p coldvox-app test_end_to_end_wav --features parakeet ``` ## Key Testing Principles ### Real Hardware Testing -- **Use real hardware**: All tests run against actual audio devices and Vosk models +- **Use real hardware**: All tests run against actual audio devices and STT models - **No mock-only paths**: If mocks are used for unit testing, full real tests must be included in the same test run - **Comprehensive**: Test actual functionality end-to-end with real hardware and models - **Reliable**: Target hardware is consistently available across environments ### Test Design - **No ignored tests**: All tests run by default in 
standard test execution -- **Real dependencies**: Use actual Vosk models and audio hardware for validation +- **Real dependencies**: Use actual STT models and audio hardware for validation - **Full validation**: Test complete pipeline from audio capture to text injection - **Mock + Real requirement**: Any test suite using mocks must also include corresponding real tests ## Common Issues & Solutions -### "Failed to locate Vosk model" Errors +### "Failed to initialize STT plugin" Errors ```bash -# Fix: Set up Vosk model -export VOSK_MODEL_PATH="/path/to/vosk-model-small-en-us-0.15" -# OR -./scripts/ci/setup-vosk-cache.sh +# Moonshine: Models auto-download on first use (may take a moment) +# Parakeet: Requires GPU/CUDA support +# Mock: Use --features mock for testing without models + +# Verify your setup +cargo run --bin mic_probe +cargo test -p coldvox-audio # Test audio independently first ``` ### Audio Device Tests @@ -167,7 +180,7 @@ cargo test ### Test Execution All tests are designed to run with real hardware and models: -1. Ensure Vosk model is available at `VOSK_MODEL_PATH` +1. Ensure STT models are available (Moonshine auto-downloads, or select the appropriate feature) 2. Verify audio hardware is accessible via `mic_probe` 3. 
All tests run by default - no tests should be ignored @@ -186,13 +199,13 @@ cargo test # All tests including integration cargo check --all-targets # Quick compile check cargo test --workspace # All crates with real hardware -# Environment setup -./scripts/ci/setup-vosk-cache.sh # Setup models for all tests -export VOSK_MODEL_PATH="$(pwd)/models/vosk-model-small-en-us-0.15" +# STT-specific tests +cargo test # Default: Moonshine +cargo test --features parakeet # GPU: Parakeet +cargo test --features mock # Testing: Mock plugin # Specific test patterns cargo test plugin_manager # Plugin management tests -cargo test --features vosk test_vosk # Vosk-specific tests cargo test audio_device # Audio hardware tests (real devices) # Debug failing tests @@ -211,4 +224,13 @@ RUST_LOG=debug cargo test test_name # Enable debug logging **Local development:** - Run `cargo test` for complete validation (includes all tests) - All tests use real hardware and models -- Use `scripts/ci/setup-vosk-cache.sh` for model setup +- Models are automatically downloaded when needed (Moonshine) + +## STT Plugin Selection + +| Plugin | Use Case | Requirements | +|--------|----------|--------------| +| **Moonshine** | Production, CPU | Pure Rust, auto-downloads models | +| **Parakeet** | High-quality, GPU | CUDA/GPU support required | +| **Mock** | Testing, CI | No external dependencies | +| **NoOp** | Debug, validation | Returns empty transcripts | diff --git a/docs/domains/foundation/fdn-voice-pipeline-core-design.md b/docs/domains/foundation/fdn-voice-pipeline-core-design.md index f32213d6..9a033235 100644 --- a/docs/domains/foundation/fdn-voice-pipeline-core-design.md +++ b/docs/domains/foundation/fdn-voice-pipeline-core-design.md @@ -70,7 +70,7 @@ Please update any bookmarks or links. ### 3. Speech-to-Text Processing -**Approach**: Plugin architecture, Vosk primary offline engine. +**Approach**: Plugin architecture, Parakeet (GPU) and Moonshine (CPU) as primary engines. 
**Code paths**: - `SttPluginManager` handles plugin lifecycle, failover, and garbage collection @@ -85,7 +85,9 @@ Please update any bookmarks or links. - Automatic model unloading after idle period (garbage collection) **Why this way**: -- Plugins: Future engines (Whisper, cloud) +- Plugins: Support multiple engines and future additions +- Parakeet: High-quality transcription with GPU support +- Moonshine: Efficient pure-Rust CPU alternative - Failover: Production reliability - GC: Prevent model memory bloat - Hotkey default: Reduce false activations @@ -148,9 +150,9 @@ AudioChunker (resample to 16kHz mono, 512-sample frames) **Memory**: - Audio buffers: ~256KB -- Vosk model: 40-100MB +- STT model: 40-500MB (model dependent: Moonshine ~40-100MB, Parakeet ~300-500MB) - Silero VAD: ~8MB -- **Baseline**: 60-120MB +- **Baseline**: 60-520MB **CPU**: - Idle: <5% diff --git a/docs/domains/stt/troubleshooting/vosk-model-discovery.md b/docs/domains/stt/troubleshooting/vosk-model-discovery.md deleted file mode 100644 index 568cf4d1..00000000 --- a/docs/domains/stt/troubleshooting/vosk-model-discovery.md +++ /dev/null @@ -1,306 +0,0 @@ ---- -doc_type: troubleshooting -subsystem: stt -version: 1.0.0 -status: draft -owners: Documentation Working Group -last_reviewed: 2025-10-19 ---- - -# Vosk Model Discovery Flow - Diagnostic Guide - -This document explains exactly how you reach each logging point during Vosk model initialization, making it easier to diagnose CI failures. - -## Complete Flow Diagram - -``` -App Startup - ↓ -Plugin Manager Initialize - ↓ -Vosk Plugin.initialize() → "Initializing Vosk STT plugin - START" - ↓ -model::ensure_model_available() - ↓ -model::locate_model() ────────────────────────┐ - │ │ - ├─→ 1. Check VOSK_MODEL_PATH env var │ - │ ├─→ Set? → Debug: "Trying VOSK_MODEL_PATH environment variable" - │ │ ├─→ Valid directory? → Info: "Vosk model found via VOSK_MODEL_PATH - SUCCESS" - │ │ └─→ Invalid? 
→ Warn: "VOSK_MODEL_PATH points to invalid location" → FAIL (return error) - │ └─→ Not set? → Continue to step 2 - │ - ├─→ 2. Check config.model_path - │ ├─→ Provided & non-empty? → Debug: "Trying config-provided model path" - │ │ ├─→ Valid directory? → Info: "Vosk model found via config path - SUCCESS" - │ │ └─→ Invalid? → Warn: "Config-provided model path is invalid" → FAIL (return error) - │ └─→ Not provided? → Continue to step 3 - │ - ├─→ 3. Auto-discovery (scan models/ in 4 locations) - │ │ → Debug: "Starting auto-discovery for Vosk model" - │ │ → find_model_candidates() - │ │ → Scan CWD/models/, ../models/, ../../models/, ../../../models/ - │ │ → For each location: - │ │ → Trace: "Checking models directory - SCANNING" - │ │ → If exists & readable: - │ │ → For each vosk-model-* subdirectory: - │ │ → Debug: "Found potential Vosk model directory" - │ │ → Debug: "Completed scanning models directory - RESULT" - │ │ → If unreadable: - │ │ → Warn: "Failed to read models directory" - │ │ → Debug: "Model discovery completed" - │ │ - │ ├─→ Candidates found? - │ │ ├─→ Yes → Debug: "Vosk model discovery candidates found" - │ │ │ → pick_best_candidate() (prefers: small > en-us > highest version) - │ │ │ └─→ Info: "Vosk model found via auto-discovery - SUCCESS" - │ │ └─→ No → Debug: "No model candidates found during auto-discovery" - │ │ - │ └─→ No valid candidate → Continue to step 4 - │ - └─→ 4. 
COMPLETE FAILURE - Build error message - → Error: "Vosk model not found after exhaustive search - COMPLETE FAILURE" - → Shows all paths checked (env, config, 4 auto-discovery locations) - → Returns ModelError::NotFound - ↓ - Plugin.initialize() catches error - → Error: "Failed to locate or prepare Vosk model - CRITICAL" - → Returns SttPluginError::InitializationFailed - ↓ - Plugin Manager catches error - → Error: "STT plugin initialization failed" - → App fails to start OR STT is disabled -``` - -## Decision Tree: What Most Likely Happened - -### Scenario 1: "Trying VOSK_MODEL_PATH environment variable" -**What happened:** User/CI explicitly set `VOSK_MODEL_PATH` environment variable - -**Next steps:** -- ✅ **"Vosk model found via VOSK_MODEL_PATH - SUCCESS"** - - **Cause:** Path exists and is a valid directory with model files - - **Common in:** CI runners with cached models, production deployments - -- ⚠️ **"VOSK_MODEL_PATH points to invalid location"** - - **Cause:** Environment variable set but path is wrong - - **Common reasons:** - - Typo in CI configuration - - Model extraction step failed silently - - Cache was cleared but variable still set - - Wrong directory specified (file instead of directory) - - **Check:** `exists=false` (doesn't exist) or `is_dir=false` (is a file) - ---- - -### Scenario 2: "Trying config-provided model path" -**What happened:** No env var, but `model_path` was provided in `TranscriptionConfig` - -**Next steps:** -- ✅ **"Vosk model found via config path - SUCCESS"** - - **Cause:** Config path is valid - - **Common in:** Users with custom installations, --model-path CLI flag - -- ⚠️ **"Config-provided model path is invalid"** - - **Cause:** Config value is wrong - - **Common reasons:** - - Wrong path in config.toml or CLI argument - - Model was moved/deleted after config was written - - Relative path interpreted from wrong working directory - ---- - -### Scenario 3: "Starting auto-discovery for Vosk model" -**What happened:** No env 
var AND no config path provided (normal case) - -**Sub-scenarios:** - -#### 3a. "Vosk model discovery candidates found" -- **What happened:** Found one or more `vosk-model-*` directories -- **Where found:** In `models/` directory at one of these locations: - - `$CWD/models/` (current working directory) - - `$CWD/../models/` (parent) - - `$CWD/../../models/` (grandparent) - - `$CWD/../../../models/` (great-grandparent) -- **Common in:** Development environment, properly installed releases -- **Next:** Picks best candidate (prefers: small, en-us, highest version) - -#### 3b. "No model candidates found during auto-discovery" -- **What happened:** Scanned all 4 locations, no `vosk-model-*` directories exist -- **Common reasons:** - - Fresh clone/install, model never extracted - - Running from unexpected directory (too deep in subdirectories) - - Model directory named incorrectly (doesn't start with `vosk-model-`) - - CI: Model download/extraction step didn't run -- **Next:** Proceeds to final error - -#### 3c. "Found potential Vosk model directory" -- **What happened:** Found a specific candidate directory -- **Shows:** Full path and ancestor level (0=cwd, 1=parent, etc.) -- **Common in:** Normal operation - -#### 3d. "Checking models directory - SCANNING" -- **Trace-level:** Shows each location being checked -- **Fields:** - - `exists=true`: `models/` directory exists at this level - - `is_dir=true`: It's actually a directory (not a file) - - `ancestor_level=N`: How many levels up (0-3) - -#### 3e. 
"Failed to read models directory" -- **What happened:** `models/` exists but can't be read -- **Common reasons:** - - Permission denied (chmod issues) - - Network filesystem timeout - - Very rare: filesystem corruption - ---- - -### Scenario 4: "Vosk model not found after exhaustive search - COMPLETE FAILURE" -**What happened:** All attempts failed - this is the END of the line - -**Common causes by environment:** - -**CI/Runners:** -- Model download script didn't run (workflow step missing/failed) -- Download succeeded but extraction failed (disk full, corrupt zip) -- Cache key mismatch (expected cache not restored) -- Wrong working directory (changed directory before running app) - -**Local Development:** -- User forgot to download/extract model -- Running from wrong directory (e.g., `crates/app/` instead of project root) -- Model directory renamed or deleted - -**Production:** -- Deployment didn't include models directory -- Filesystem permissions changed -- Disk full prevented extraction - -**Check the error fields:** -- `checked_paths`: Lists every path that was tried -- `env_var_set=true`: Shows if VOSK_MODEL_PATH was set (even if invalid) -- `config_path_provided=true`: Shows if config had a path (even if invalid) -- `cwd`: Current working directory (may be unexpected) - ---- - -## After Model Found: Transcriber Creation - -### "Creating Vosk transcriber - NEXT" -**What happened:** Model directory was found, now attempting to load it - -**Next steps:** - -#### ✅ "VoskTranscriber created successfully" -- **Cause:** Model loaded, recognizer created -- **Means:** Everything is working correctly - -#### ❌ "Failed to create Vosk transcriber - REASON: Model corrupted or incompatible" -**What happened:** Directory exists but Vosk library couldn't load files - -**Common causes:** -- **Corrupted model:** Incomplete download, extraction failed partway -- **Missing files:** Required files like `am/`, `graph/`, `conf/` missing or empty -- **Version mismatch:** Model 
format incompatible with Vosk library version -- **Wrong model type:** Tried to load incompatible model (wrong language engine) - -**The error shows:** -- `model_path`: Path that failed to load -- `directory_contents`: First 10 files in the directory (helps diagnose what's missing) -- `exists/is_dir`: Confirms the path is actually a directory - ---- - -## Quick Diagnostic Checklist - -When you see a Vosk model error in CI, check logs for: - -1. **Was VOSK_MODEL_PATH set?** - - Look for: `"Trying VOSK_MODEL_PATH environment variable"` - - If yes, check `exists=` and `is_dir=` fields - -2. **What was the working directory?** - - Look for: `cwd=` field in error message - - Compare to where you expect models to be - -3. **Were any candidates found?** - - Look for: `"Vosk model discovery candidates found"` - - If yes, but still failed, model files are probably corrupted - -4. **What paths were checked?** - - Look at: `checked_paths=` in final error - - Verify these are the paths you expected - -5. **Can the models directory be read?** - - Look for: `"Failed to read models directory"` - - If present, it's a permission issue - -6. **Did extraction run?** - - Look for: `auto_extract_enabled=true` in logs - - Look for: `"Attempting to extract model from zip"` - - If no zip found, download step probably failed - ---- - -## Example Log Patterns - -### Pattern 1: Fresh Install (no model) -``` -DEBUG Starting auto-discovery for Vosk model -DEBUG No model candidates found during auto-discovery -ERROR Vosk model not found after exhaustive search checked_paths="auto_discovery=/path/to/models" -``` -**Diagnosis:** Model was never installed. User needs to download/extract. 
- ---- - -### Pattern 2: Wrong Working Directory -``` -DEBUG Starting auto-discovery cwd="/path/to/crates/app" -DEBUG Checking models directory search_path="/path/to/crates/app/models" exists=false -DEBUG Checking models directory search_path="/path/to/crates/models" exists=false -DEBUG Checking models directory search_path="/path/to/models" exists=false -DEBUG Checking models directory search_path="/path/models" exists=true -DEBUG Found potential Vosk model directory candidate="/path/models/vosk-model-small-en-us-0.15" -INFO Vosk model found via auto-discovery - SUCCESS -``` -**Diagnosis:** Running from deep subdirectory, but auto-discovery still found it (good design!). - ---- - -### Pattern 3: Env Var Set But Invalid -``` -DEBUG Trying VOSK_MODEL_PATH environment variable env_path="/nonexistent" -WARN VOSK_MODEL_PATH points to invalid location env_path="/nonexistent" exists=false is_dir=false -ERROR Failed to locate or prepare Vosk model env_var_set=true -``` -**Diagnosis:** CI set wrong path. Check CI configuration for typos. - ---- - -### Pattern 4: Model Directory Empty -``` -INFO Vosk model found via auto-discovery path="/path/to/models/vosk-model-small-en-us-0.15" -DEBUG Creating Vosk transcriber -ERROR Vosk library failed to load model directory_contents="" path_exists=true is_directory=true -``` -**Diagnosis:** Directory exists but is empty. Extraction failed or files were deleted. - ---- - -### Pattern 5: Corrupt Model -``` -INFO Vosk model found via auto-discovery -ERROR Failed to load Vosk model from: /path/models/vosk-model-small-en-us-0.15 - exists=true, is_dir=true, contents=README, conf/, incomplete_file -``` -**Diagnosis:** Partial extraction or corrupted download. Re-extract model. 
- ---- - -## Related Files - -- `crates/coldvox-stt-vosk/src/model.rs` - Model discovery logic -- `crates/coldvox-stt-vosk/src/plugin.rs` - Plugin initialization -- `crates/coldvox-stt-vosk/src/vosk_transcriber.rs` - Model loading -- `crates/app/src/stt/plugin_manager.rs` - Plugin selection and error handling diff --git a/docs/history/2025-11-06_04-16Z-branch-status-and-work-in-progress-overview.md b/docs/history/2025-11-06_04-16Z-branch-status-and-work-in-progress-overview.md index ecbe1c6e..27364e36 100644 --- a/docs/history/2025-11-06_04-16Z-branch-status-and-work-in-progress-overview.md +++ b/docs/history/2025-11-06_04-16Z-branch-status-and-work-in-progress-overview.md @@ -1,3 +1,12 @@ +--- +doc_type: history +subsystem: general +version: 1.0.0 +status: archived +owners: Coldaine +last_reviewed: 2025-11-06 +--- + @@ -14,18 +23,14 @@ _**Agent (model copilot/gpt-5-codex)**_ **Checking git status and branch****Addressing Git safe.directory setting****Checking git status again****Checking recent commits on branch****Reading architecture proposal doc****Reviewing branch state and changes****Summarizing current repo status and changes** -I’m noting that there’s an untracked new documentation file with a detailed plan on golden master tests, plus suspicious binary size changes in vendor/vosk files that could indicate corruption or accidental truncation. I’m gathering branch info, recent commits, and local setting adjustments to form a clear snapshot with potential risks and next steps for the user.**Current Snapshot** - On branch `retro-plan-vs-actual`; last commit `df86c08` (“retro: add docs_metadata CSV snapshot (92 files)”) is at HEAD; `origin/main` is still at `7bb7f26`. -- Working tree has two modified LFS binaries: `libvosk.so` now 172 B (was 82 B) and `vosk-model-en-us-0.22` now 140 B (was 66 B). That size drop suggests they were overwritten by placeholder text—worth confirming before committing. 
- Untracked file `golden-master-test-architecture-proposal.md` (~700 lines) describes a full Golden Master testing plan, detailing seven pipeline anchor points, artifact formats, CLI workflows, and nondeterminism mitigation. - I marked `` as a safe Git directory so status commands work under your user account. **Likely Work In Progress** - Drafting the Golden Master test architecture document for the retro initiative. -- Investigating or rehydrating the Vosk vendor binaries—current working copy looks incomplete, so check whether LFS/download steps need to be rerun. **Next Steps** -1. Restore the Vosk binaries from source (e.g., rerun the vendor sync or check out those files) if the tiny placeholders weren’t intentional. 2. Stage and commit the new Golden Master proposal once you’re happy with the draft. --- diff --git a/docs/plans/comprehensive-main-branch-review-plan-v2.md b/docs/plans/comprehensive-main-branch-review-plan-v2.md new file mode 100644 index 00000000..71bd03d0 --- /dev/null +++ b/docs/plans/comprehensive-main-branch-review-plan-v2.md @@ -0,0 +1,39 @@ +--- +doc_type: plan +subsystem: general +version: 2.0.0 +status: draft +owners: Coldaine +last_reviewed: 2025-12-09 +--- + +# ColdVox Main Branch Review Plan v2.0 + +## Current State + +**Implemented:** +- Parakeet (GPU), Moonshine (CPU), Mock, NoOp + +**Remove:** +- Stub plugins: coqui, leopard, silero_stt, whisper_cpp, whisper_plugin + +## 8 Review Agents + +1. Concurrency Safety - async hazards, locks, race conditions +2. Audio Pipeline - real-time constraints, buffer safety +3. STT Plugins - Parakeet & Moonshine review +4. Plugin Manager - lifecycle, GC, failover +5. Memory Safety - leaks, unsafe code, PyO3 +8. 
Documentation - match implementation + +## Execution + +Week 1: Agent 7 cleanup +Week 1-2: Agents 1-6 parallel +Week 3: Agent 8 + synthesis + +## Goals + +- Remove all stubs +- 80%+ coverage on real features +- Docs match code diff --git a/docs/plans/comprehensive-main-branch-review-plan.md b/docs/plans/comprehensive-main-branch-review-plan.md new file mode 100644 index 00000000..499a23e8 --- /dev/null +++ b/docs/plans/comprehensive-main-branch-review-plan.md @@ -0,0 +1,85 @@ +--- +title: Comprehensive Main Branch Review Plan - v2.0 Focused on Actual Implementation +doc_type: plan +subsystem: general +status: proposed +created: 2025-12-09 +version: 2.0.0 +owners: Coldaine +last_reviewed: 2025-12-09 +--- + +# ColdVox Main Branch Comprehensive Review Plan (v2.0) + +## Current State (What Actually Exists) + +### Implemented STT Plugins +- ✅ Parakeet (NVIDIA GPU, production) +- ✅ Moonshine (CPU via PyO3, recently added PR #259) +- ✅ Mock (testing) +- ✅ NoOp (testing) + +### Stub Plugins to Remove +- ❌ coqui, leopard, silero_stt, whisper_cpp, whisper_plugin + +### Obsolete to Remove + +## Review Agents (8 Total) + +### Agent 1: Concurrency Safety Auditor +- Lock hierarchy mapping +- Async/await race conditions +- Background task safety +- Text injection async issues from ti-async-safety-analysis.md + +### Agent 2: Audio Pipeline Specialist +- Ring buffer safety +- Real-time constraints (no allocations in callback) +- Resampling correctness +- Latency validation (150-500ms) + +### Agent 3: STT Plugin Reviewer +- Parakeet production readiness +- Moonshine integration review (post-PR #259) +- Plugin error handling + +### Agent 4: Plugin Manager Analyst +- Lifecycle state machine +- GC race conditions +- Failover with real plugins +- Config persistence + +### Agent 5: Memory Safety Analyst +- Unsafe code audit +- PyO3 safety (Moonshine) +- Model memory cleanup +- Resource leaks + +### Agent 6: Test Quality Assessor +- Coverage for Parakeet/Moonshine +- Concurrency tests +- 
Edge cases + +### Agent 7: Cleanup Agent (PRIORITY 1 - Run First) +**Tasks:** +1. Delete stub plugins (coqui, leopard, silero_stt, whisper_cpp, whisper_plugin) +2. Clean feature flags in Cargo.toml +3. Remove commented-out code + +### Agent 8: Documentation Reviewer +- Verify docs match implementation +- Document Parakeet/Moonshine +- Remove speculative content + +## Execution Order + +1. **Week 1:** Agent 7 (cleanup) - unblocks others +2. **Week 1-2:** Agents 1-6 (parallel reviews) +3. **Week 3:** Agent 8 (docs) + synthesis + final report + +## Success Criteria + +- [ ] All stubs removed +- [ ] Parakeet/Moonshine production-ready or issues documented +- [ ] >80% test coverage for implemented features +- [ ] Docs match implementation diff --git a/docs/plans/documentation-migration-mapping.md b/docs/plans/documentation-migration-mapping.md index eec913bd..6eb0b041 100644 --- a/docs/plans/documentation-migration-mapping.md +++ b/docs/plans/documentation-migration-mapping.md @@ -34,7 +34,6 @@ Phase 1 and the majority of Phase 2–3 migrations have been executed. Remaining | crates/coldvox-gui/README.md | docs/reference/crates/coldvox-gui.md | move | Replace with thin index linking to crate README. | | crates/app/docs/updated_architecture_diagram.md | docs/domains/gui/troubleshooting/updated-architecture-diagram.md | move | Confirm appropriate domain; include retention guidance if exploratory. | | crates/app/test_data/README.md | docs/reference/crates/app-test-data.md | move | Determine if this should stay with crate README or become troubleshooting note. | -| crates/coldvox-stt-vosk/README.md | docs/reference/crates/coldvox-stt-vosk.md | move | Thin index linking back. | | crates/voice-activity-detector/MODIFICATIONS.md | docs/domains/vad/modifications.md | move | Normalize filename + frontmatter. | | crates/coldvox-telemetry/README.md | docs/reference/crates/coldvox-telemetry.md | move | Thin index. 
| | crates/coldvox-text-injection/TESTING.md | docs/domains/text-injection/ti-testing.md | move | Add frontmatter and align with standards. | @@ -108,9 +107,6 @@ Phase 1 and the majority of Phase 2–3 migrations have been executed. Remaining | docs/domains/stt/whisper/README.md | docs/domains/stt/whisper/index.md | move | rename. | | docs/domains/stt/whisper/implementation-checklist.md | docs/domains/stt/whisper/implementation-checklist.md | retain | Add frontmatter. | | docs/domains/stt/whisper/windows-testing.md | docs/domains/stt/whisper/windows-testing.md | retain | Add frontmatter. | -| docs/domains/stt/vosk.md | docs/domains/stt/vosk.md | retain | Add frontmatter and move troubleshooting under subfolder. | -| docs/domains/stt/vosk-architecture.md | docs/domains/stt/vosk-architecture.md | retain | Add frontmatter. | -| docs/domains/stt/vosk-testing.md | docs/domains/stt/troubleshooting/vosk-testing.md | move | restructure. | | docs/domains/foundation.md | docs/domains/foundation/index.md | move | rename. | | docs/domains/foundation/README.md | docs/domains/foundation/index.md | consolidate | unify. | | docs/domains/foundation/runtime_vision.md | docs/domains/foundation/runtime-vision.md | retain | rename + frontmatter. | @@ -120,7 +116,6 @@ Phase 1 and the majority of Phase 2–3 migrations have been executed. Remaining | docs/domains/text-injection/tracing.md | docs/domains/text-injection/tracing.md | retain | add frontmatter. | | docs/domains/text-injection/injection_states.md | docs/domains/text-injection/injection-states.md | retain | rename + frontmatter. | | docs/domains/text-injection/voice_selection.md | docs/domains/text-injection/voice-selection.md | retain | rename + frontmatter. | -| docs/domains/text-injection/vosk.md | docs/domains/text-injection/vosk.md | retain | frontmatter; maybe troubleshooting. 
| | docs/domains/text-injection/silero_audio_stream_injection.md | docs/domains/text-injection/silero-audio-stream-injection.md | retain | rename + frontmatter. | | docs/domains/text-injection/docs-review-roadmap.md | docs/domains/text-injection/docs-review-roadmap.md | retain | ensure alignment with architecture roadmap. | | docs/review/README.md | docs/research/pr-reports/index.md | move | convert to index for review history or archive. | diff --git a/docs/plans/documentation/proposal-documentation-restructure.md b/docs/plans/documentation/proposal-documentation-restructure.md index 8193ba4d..633c12c6 100644 --- a/docs/plans/documentation/proposal-documentation-restructure.md +++ b/docs/plans/documentation/proposal-documentation-restructure.md @@ -77,8 +77,6 @@ docs/ | `crates/coldvox-gui/README.md` | Keep (crate stub)| `crates/coldvox-gui/README.md` | Keep minimal README in crate that links to `docs/domains/gui/`. | `crates/coldvox-gui/docs/*.md` | Move | `docs/domains/gui/` | Move GUI docs into the GUI domain folder. | | `crates/coldvox-stt/README.md` | Keep (crate stub)| `crates/coldvox-stt/README.md` | Keep minimal README in crate that links to `docs/domains/stt/`. -| `crates/coldvox-stt-vosk/README.md` | Keep (crate stub)| `crates/coldvox-stt-vosk/README.md` | Keep crate README; merge implementation notes into `docs/domains/stt/vosk.md`. -| `crates/coldvox-stt-vosk/docs/*.md` | Move | `docs/domains/stt/` | Move STT implementation docs into STT domain folder. | | `crates/coldvox-telemetry/README.md` | Keep (crate stub)| `crates/coldvox-telemetry/README.md` | Keep minimal README in crate that links to `docs/domains/telemetry/`. | `crates/coldvox-telemetry/docs/*.md` | Move | `docs/domains/telemetry/` | Move crate-specific docs into telemetry domain. | | `crates/coldvox-text-injection/README.md` | Keep (crate stub)| `crates/coldvox-text-injection/README.md` | Keep minimal README in crate that links to `docs/domains/text-injection/`. 
@@ -87,7 +85,6 @@ docs/ | `crates/voice-activity-detector/MODIFICATIONS.md` | Move | `docs/domains/vad/vendor_modifications.md` | | | **Docs (Old Structure)** | | | | | `docs/TextInjectionArchitecture.md` | Merge | `docs/architecture.md` | | -| `docs/adr/0001-vosk-model-distribution.md` | Move | `docs/architecture/adr-0001.md` | Create a new `architecture` sub-folder for ADRs. | | `docs/dev/logging.md` | Merge & Move | `docs/playbooks/organizational/logging_playbook.md` | | | `.github/*` workflow docs (if any) | Document Only | `docs/playbooks/organizational/github_governance.md` | Centralize repo settings and governance policies. | | `docs/plans/*.md`, `docs/research/*.md`, `docs/review/*.md` | Move | `docs/research/` | Consolidate historical plans and research. | diff --git a/docs/plans/foundation/logging-audit.md b/docs/plans/foundation/logging-audit.md index f8699642..f790ee52 100644 --- a/docs/plans/foundation/logging-audit.md +++ b/docs/plans/foundation/logging-audit.md @@ -87,7 +87,6 @@ Rationale: single global subscriber owned by `main()` ensures the `WorkerGuard` 2) Potential missing logs in STT plugin selection/failover -- Observation: `main.rs` warns when `VOSK_MODEL_PATH` is used, and `runtime.rs` and `stt::plugin_manager` have logging, but plugin selection/failover lacks structured spans and per-plugin context which would be helpful to diagnose missing model issues. - Recommendation: add a `tracing::span!(Level::INFO, "stt.selection", preferred = ?pref, fallbacks = ?fallbacks)` when building plugin selection in `main.rs`, and `instrument` plugin manager init and failover operations. @@ -300,7 +299,6 @@ Notes: prefer `trace` or `debug` for very frequent events; guard logs behind `tr 3. Add structured logs in `stt::plugin_manager` around plugin load/failover and in `stt::processor` for `TranscriptionEvent` tagging. -4. 
Consider adding an integration test that runs `main` with `--tui` in CI but with `--no-default-features` if heavy components (VOSK) are optional; confirm file logs are written and flush on shutdown. --- @@ -491,7 +489,6 @@ To address missing transcribed text, add targeted tracing logs at critical point - **Expected Output**: "VAD: SpeechStart - triggering STT" when speaking; confirms audio flow. ### 2. STT Plugin Selection - `crates/app/src/stt/plugin_manager.rs` -- **Rationale**: Log plugin load/fallback to confirm Vosk vs NoOp. Addresses cause #2. - **New Logs** (Diff for lines 523-535, 554-559, 668-670): - Add after initialize (line 634): `info!(target: "coldvox::stt", selected_plugin = %plugin_id, "STT initialized with plugin");` - In create_fallback_plugin (line 646): `warn!(target: "coldvox::stt", fallback_id = %fallback_id, "Fallback plugin unavailable, trying next");` @@ -534,7 +531,6 @@ To address missing transcribed text, add targeted tracing logs at critical point ``` ### Validation -- **Missing Text**: New logs will show "VAD: SpeechStart" if audio detected, "Final transcript empty" if NoOp, "All STT plugins failed" if Vosk missing. - **Logging Visibility**: Fix ensures file output; run with `RUST_LOG=debug` to see in TUI Logs tab and file. - **Performance**: Logs are event-driven (not per-frame), low overhead. diff --git a/docs/playbooks/organizational/runner_setup.md b/docs/playbooks/organizational/runner_setup.md index b81b052f..6e41a8bd 100644 --- a/docs/playbooks/organizational/runner_setup.md +++ b/docs/playbooks/organizational/runner_setup.md @@ -50,12 +50,8 @@ last_reviewed: 2025-10-19 The CI system now uses the application's built-in model autodetection and auto-extraction capabilities. Runners no longer require pre-provisioned, cached models. **Requirements:** -- A `vosk-model-*.zip` file must be present in the project's `vendor/` directory. 
-- The `setup_vosk.rs` script (run during CI) will copy this zip file to the project root, where the application will find and extract it on first use. **Workflow:** -1. The `setup-vosk-model` job in `ci.yml` copies the model zip from `vendor/` to the workspace root. -2. When tests are run, the `coldvox-stt-vosk` crate automatically finds the zip, extracts it to the `models/` directory, and uses the extracted model. 3. Subsequent runs will find the extracted model and skip the extraction step. This approach removes the dependency on a fixed-path runner cache and makes the CI setup more portable. @@ -82,10 +78,6 @@ LANG=en_US.utf8 ### Cache Directory Structure ``` /home/coldaine/ActionRunnerCache/ -├── vosk-models/ -│ └── vosk-model-small-en-us-0.15/ -├── libvosk-setup/ -│ └── vosk-linux-x86_64-0.3.45/ └── (planned: rust-toolchains/, system-packages/) ``` @@ -93,20 +85,14 @@ LANG=en_US.utf8 ## System Library Configuration -### libvosk Installation -**Location**: `/usr/local/lib/libvosk.so` (25,986,496 bytes) -**Header**: `/usr/local/include/vosk_api.h` **Dynamic Linker Configuration**: -**File**: `/etc/ld.so.conf.d/vosk.conf` ``` /usr/local/lib ``` **Verification**: ```bash -$ ldconfig -p | grep vosk -libvosk.so (libc6,x86-64) => /usr/local/lib/libvosk.so ``` ### System Dependencies Installed @@ -229,25 +215,20 @@ jobs: fi echo "All workflows render via gh." 
- setup-vosk-model: - name: Setup Vosk Model runs-on: [self-hosted, Linux, X64, fedora, nobara] outputs: download-outcome: success steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Setup Vosk Model run: | set -euo pipefail # This script will copy the model zip from vendor/ to the root - ./scripts/setup_vosk.rs # Static checks, formatting, linting, type-check, build, and docs build_and_check: name: Format, Lint, Typecheck, Build & Docs runs-on: [self-hosted, Linux, X64, fedora, nobara] - needs: [setup-vosk-model] steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -279,7 +260,6 @@ jobs: run: cargo doc --workspace --no-deps --locked - name: Run all tests (unit, integration, and E2E) - if: needs.setup-vosk-model.outputs.download-outcome == 'success' run: | cargo test --workspace --locked @@ -364,7 +344,6 @@ jobs: text_injection_tests: name: Text Injection Tests runs-on: [self-hosted, Linux, X64, fedora, nobara] - needs: [setup-vosk-model] timeout-minutes: 30 env: DISPLAY: :99 @@ -487,9 +466,7 @@ jobs: run: cargo build --locked -p coldvox-app - name: Run E2E pipeline test - if: needs.setup-vosk-model.outputs.download-outcome == 'success' env: - VOSK_MODEL_PATH: ${{ needs.setup-vosk-model.outputs.model-path }} run: | cargo test -p coldvox-app --locked test_end_to_end_wav_pipeline -- --nocapture @@ -542,7 +519,6 @@ jobs: ci-success: name: CI Success if: always() - needs: [validate-workflows, setup-vosk-model, build_and_check, msrv-check, gui-groundwork, text_injection_tests] runs-on: [self-hosted, Linux, X64, fedora, nobara] steps: - name: Check if all jobs succeeded @@ -551,7 +527,6 @@ jobs: failed=0 for res in \ "${{ needs.validate-workflows.result }}" \ - "${{ needs.setup-vosk-model.result }}" \ "${{ needs.build_and_check.result }}" \ "${{ needs.msrv-check.result }}" \ "${{ needs.gui-groundwork.result }}" \ @@ -567,24 +542,18 @@ jobs: echo "All CI jobs succeeded (ignoring skipped)" ``` -### 2. 
Vosk Integration Tests Workflow -**File**: `.github/workflows/vosk-integration.yml` ```yaml -name: Vosk STT Integration Tests on: pull_request: branches: [main] paths: - - 'crates/coldvox-stt-vosk/**' - 'crates/app/src/stt/**' - - '.github/workflows/vosk-integration.yml' workflow_dispatch: inputs: model_type: - description: 'Vosk model to use for testing' required: false default: 'small' type: choice @@ -598,8 +567,6 @@ env: CARGO_INCREMENTAL: 0 jobs: - vosk-integration: - name: Vosk STT Integration runs-on: [self-hosted, Linux, X64, fedora, nobara] timeout-minutes: 15 steps: @@ -613,31 +580,20 @@ jobs: - name: Setup ColdVox uses: ./.github/actions/setup-coldvox - - name: Setup Vosk Model from Cache run: | set -euo pipefail # Use pre-cached models from permanent cache location - CACHE_DIR="/home/coldaine/ActionRunnerCache/vosk-models" MODEL_DIR="models" mkdir -p $MODEL_DIR # Link the small model for tests (remove existing if present) - if [ -d "$CACHE_DIR/vosk-model-small-en-us-0.15" ]; then - rm -rf "$MODEL_DIR/vosk-model-small-en-us-0.15" - ln -sf "$CACHE_DIR/vosk-model-small-en-us-0.15" "$MODEL_DIR/" - echo "✅ Linked cached vosk-model-small-en-us-0.15" else - echo "❌ Error: Vosk model not found in cache at $CACHE_DIR" exit 1 fi # Link the production model if available - if [ -d "$CACHE_DIR/vosk-model-en-us-0.22" ]; then - rm -rf "$MODEL_DIR/vosk-model-en-us-0.22" - ln -sf "$CACHE_DIR/vosk-model-en-us-0.22" "$MODEL_DIR/" - echo "✅ Linked cached vosk-model-en-us-0.22" fi ls -la $MODEL_DIR/ @@ -645,35 +601,22 @@ jobs: - name: Install cargo-nextest run: cargo install cargo-nextest --locked - - name: Build with Vosk run: | - # Build both crates that use Vosk feature - cargo build --locked -p coldvox-stt-vosk --features vosk - cargo build --locked -p coldvox-app --features vosk - - name: Run Vosk tests env: - VOSK_MODEL_PATH: models/vosk-model-small-en-us-0.15 run: | - cargo test --locked -p coldvox-stt-vosk --features vosk -- --nocapture - name: Run end-to-end WAV 
pipeline test env: - VOSK_MODEL_PATH: models/vosk-model-small-en-us-0.15 run: | - cargo test --locked -p coldvox-app --features vosk test_end_to_end_wav_pipeline --nocapture - - name: Test Vosk examples env: - VOSK_MODEL_PATH: models/vosk-model-small-en-us-0.15 run: | - cargo run --locked --example vosk_test --features vosk,examples -- --test-duration 5 - name: Upload test artifacts on failure if: failure() uses: actions/upload-artifact@v4 with: - name: vosk-integration-artifacts path: | target/debug/deps/ target/debug/build/ @@ -682,7 +625,6 @@ jobs: - name: Performance summary run: | - echo "=== Vosk Integration Test Summary ===" echo "Model setup: ✅ Using cached models" echo "Build time: Fast (using Rust cache)" echo "Test execution: Complete" @@ -753,7 +695,6 @@ jobs: ```yaml name: Setup ColdVox Dependencies -description: Install system deps, libvosk, and Rust toolchain inputs: skip-toolchain: description: Skip Rust toolchain setup (for jobs with custom toolchain) @@ -790,25 +731,14 @@ runs: exit 1 fi - - name: Validate libvosk installation shell: bash run: | - echo "Validating pre-installed libvosk..." - if [ ! -f "/usr/local/lib/libvosk.so" ]; then - echo "ERROR: libvosk.so not found, run setup-permanent-libvosk.sh on runner" exit 1 fi - if [ ! -f "/usr/local/include/vosk_api.h" ]; then - echo "ERROR: vosk_api.h not found, run setup-permanent-libvosk.sh on runner" exit 1 fi - # Ensure libvosk is in dynamic linker cache - if ! ldconfig -p | grep -q vosk; then - echo "WARNING: libvosk not in linker cache, refreshing..." 
sudo ldconfig fi - echo "✅ libvosk available at /usr/local/lib/libvosk.so" - echo "✅ libvosk cached in dynamic linker" - name: Setup Rust toolchain if: inputs.skip-toolchain != 'true' @@ -824,43 +754,27 @@ runs: ## Configuration Scripts -### Permanent libvosk Installation Script -**File**: `scripts/setup-permanent-libvosk.sh` ```bash #!/bin/bash -# Permanent libvosk installation for self-hosted runner # This should be run ONCE on the runner to eliminate per-job extraction set -euo pipefail -echo "=== Setting up permanent libvosk installation ===" -VOSK_VER="0.3.45" -VENDOR_DIR="/home/coldaine/Projects/ColdVox/vendor/vosk" CACHE_DIR="/home/coldaine/ActionRunnerCache" # Ensure we have the vendor file -if [ ! -f "$VENDOR_DIR/vosk-linux-x86_64-${VOSK_VER}.zip" ]; then - echo "ERROR: Vendor file not found: $VENDOR_DIR/vosk-linux-x86_64-${VOSK_VER}.zip" exit 1 fi # Create working directory -mkdir -p "$CACHE_DIR/libvosk-setup" -cd "$CACHE_DIR/libvosk-setup" # Extract if not already done -if [ ! -d "vosk-linux-x86_64-${VOSK_VER}" ]; then - echo "Extracting libvosk..." - unzip -q "$VENDOR_DIR/vosk-linux-x86_64-${VOSK_VER}.zip" fi # Install permanently -echo "Installing libvosk system-wide..." -sudo cp -v "vosk-linux-x86_64-${VOSK_VER}/libvosk.so" /usr/local/lib/ -sudo cp -v "vosk-linux-x86_64-${VOSK_VER}/vosk_api.h" /usr/local/include/ # Update dynamic linker cache echo "Updating dynamic linker cache..." @@ -868,30 +782,20 @@ sudo ldconfig # Verify installation echo "Verifying installation..." -if ldconfig -p | grep -q vosk; then - echo "✅ libvosk successfully installed and cached" - ldconfig -p | grep vosk else - echo "❌ libvosk not found in linker cache" exit 1 fi # Test linking echo "Testing library linking..." 
-if ldd /usr/local/lib/libvosk.so >/dev/null 2>&1; then - echo "✅ libvosk dependencies resolved" else - echo "❌ libvosk dependency issues" - ldd /usr/local/lib/libvosk.so exit 1 fi # Create permanent ldconfig configuration echo "Creating permanent ldconfig entry..." -echo "/usr/local/lib" | sudo tee /etc/ld.so.conf.d/vosk.conf sudo ldconfig -echo "✅ Permanent libvosk installation complete!" echo "" echo "🚀 Now workflows should use validation instead of extraction:" echo " - Remove zip extraction from setup-coldvox action" @@ -1004,18 +908,14 @@ esac ## Current Issues and Status ### Known Working Components -- ✅ Vosk model caching (8-11 seconds setup vs 3+ hour timeouts) - ✅ System package installation with `--skip-unavailable` flags -- ✅ libvosk permanent installation and ldconfig configuration - ✅ Runner labels and basic workflow execution ### Current Failure Points -- ❌ Vosk integration tests failing with `libvosk.so: cannot open shared object file` - ❌ Some CI jobs queuing for extended periods (18+ minutes) - ❌ Intermittent network timeouts during GitHub Action downloads ### Recent Changes (Commits) -1. **959a04a**: Implemented permanent libvosk installation 2. **4062a15**: Fixed package name from 'app' to 'coldvox-app' in workflows 3. **1f1af7f**: Added `--skip-unavailable` flags to dnf commands diff --git a/docs/reference/crates/coldvox-stt-vosk.md b/docs/reference/crates/coldvox-stt-vosk.md deleted file mode 100644 index 249eab0a..00000000 --- a/docs/reference/crates/coldvox-stt-vosk.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -doc_type: index -subsystem: stt -version: 1.0.0 -status: draft -owners: Documentation Working Group -last_reviewed: 2025-10-19 ---- - -# crate: coldvox-stt-vosk (Index) - -Authoritative documentation lives in [`crates/coldvox-stt-vosk/README.md`](../../../crates/coldvox-stt-vosk/README.md). 
- -## Key Entry Points - -- [`src/lib.rs`](../../../crates/coldvox-stt-vosk/src/lib.rs) diff --git a/docs/repo/copilot-instructions.md b/docs/repo/copilot-instructions.md index 035de149..b297739f 100644 --- a/docs/repo/copilot-instructions.md +++ b/docs/repo/copilot-instructions.md @@ -16,7 +16,6 @@ Use these notes to help AI agents work effectively in this Rust workspace. Main **Prompt Response Format**: When asked to create a prompt for another agent, return ONLY the prompt content without any additional commentary, explanation, or wrapper text. Simply output the prompt as requested. Key defaults right now: -- STT (Vosk) is NOT built by default. Enable with the `vosk` feature. - Default audio/VAD windowing is 512 samples at 16 kHz (32 ms). **Platform Detection**: Build system automatically detects platform/desktop at compile time (`crates/app/build.rs`) and enables appropriate text injection backends. @@ -52,10 +51,7 @@ Key defaults right now: - `crates/coldvox-stt/` — STT core abstractions and event system - `types.rs`: Core STT types and events (`TranscriptionEvent`, `WordInfo`) - `processor.rs`: STT processing building blocks (used by app-level processor) - - `plugin.rs` + `plugins/`: Plugin architecture (e.g., `vosk_plugin.rs`, `whisper_plugin.rs`, `mock.rs`, `noop.rs`) -- `crates/coldvox-stt-vosk/` — Vosk STT integration (feature `vosk`, default enabled) - - `vosk_transcriber.rs`: `VoskTranscriber` implementing offline speech recognition - `crates/coldvox-telemetry/` — Pipeline metrics and performance tracking - `pipeline_metrics.rs`: `PipelineMetrics`, `metrics.rs`: `FpsTracker` @@ -69,7 +65,6 @@ Key defaults right now: - `crates/app/` — Main application crate with glue code, UI, and re-exports - **Audio glue**: `src/audio/vad_adapter.rs`, `src/audio/vad_processor.rs` - - **STT integration**: `src/stt/processor.rs`, `src/stt/vosk.rs`, `src/stt/persistence.rs` - **Text injection**: `src/text_injection/` - integration with text injection backends - **Hotkey 
system**: `src/hotkey/` - global hotkey support with KDE KGlobalAccel integration - **Probes**: `src/probes/` - diagnostic and testing utilities @@ -82,7 +77,6 @@ Key defaults right now: ### Main Binaries - App (default build, no STT): `cargo run` -- App with STT (Vosk): `cargo run --features vosk` - TUI Dashboard: `cargo run --bin tui_dashboard` (add `-- --device ""` and/or `--log-level `) - Mic Probe: `cargo run --bin mic_probe -- --duration 30 --device "" --silence_threshold 120` - Minimal (disable text injection too): `cargo run --no-default-features --features silero` @@ -90,7 +84,6 @@ Key defaults right now: ### Examples (at repo root `/examples/`, wired via Cargo metadata) - Foundation: `cargo run --example foundation_probe -- --duration 30` - Recording: `cargo run --example record_10s` -- STT Test (Vosk): `cargo run --features vosk,examples --example vosk_test` - Text Injection demo: `cargo run --features text-injection --example inject_demo` - Hotkeys: `cargo run --example test_hotkey_backend` - KDE KGlobalAccel: `cargo run --example test_kglobalaccel_hotkey` @@ -113,7 +106,6 @@ Key defaults right now: - CPAL callback → i16 samples → `AudioRingBuffer` (SPSC) → `FrameReader` → `AudioChunker` → broadcast channel - Chunker output: 512-sample frames (32 ms) at 16 kHz to VAD/STT subscribers - VAD: Silero V5 (default) generates `VadEvent`s -- STT (when compiled with `vosk`): - Activation gating: by default the app uses a hotkey workflow; enable `--activation-mode vad` to auto-activate on speech. - Transcribes segments during active speech (SpeechStart → SpeechEnd) and emits `TranscriptionEvent`s. 
- TUI: when STT is enabled and a model is present, partial/final transcripts are logged; Status shows last final transcript @@ -148,11 +140,9 @@ UnifiedVadConfig { } ``` -### STT Configuration (when `vosk` feature is enabled) ```rust TranscriptionConfig { enabled: true, // app sets this true only if the model path exists - model_path: "models/vosk-model-small-en-us-0.15", partial_results: true, max_alternatives: 1, include_words: false, @@ -160,7 +150,6 @@ TranscriptionConfig { } ``` Notes: -- If `VOSK_MODEL_PATH` is set, it overrides `model_path`. - If the model path does not exist at runtime, the app disables STT and logs a warning. ### Text Injection (Platform-aware) @@ -186,7 +175,6 @@ Notes: - `--device `: Select preferred input device (exact or substring) - `--resampler-quality `: Controls chunker resampler - `--activation-mode `: Choose activation workflow (default: `hotkey`) -- STT (only with `vosk` feature): `--save-transcriptions`, `--save-audio`, `--output-dir`, `--transcript-format`, `--retention-days` ### Metrics & Telemetry - **Pipeline metrics**: `Arc` shared across components @@ -212,7 +200,6 @@ FrameReader (from consumer) → AudioChunker → broadcast::Sender // VAD processing VadProcessor::spawn(vad_cfg, audio_rx, event_tx, Some(metrics))? -// STT processor (when vosk feature enabled) // See crates/app/src/stt/processor.rs ``` @@ -238,12 +225,8 @@ let devices = device_manager.enumerate_devices(); - **Configuration**: Thresholds, durations, and windowing via `UnifiedVadConfig` ## STT System (Default Enabled) -STT is optional and compiled in with the `vosk` feature. 
-- Integration: Via `crates/coldvox-stt-vosk/` (re-exported in `crates/app/src/stt/vosk.rs`) - Gating: Transcribes only during detected speech segments - Events: `TranscriptionEvent::{Partial, Final, Error}` via mpsc channels -- Model path: `VOSK_MODEL_PATH` env var or default `models/vosk-model-small-en-us-0.15` -- Requirements: System libvosk library for compilation ## Hotkey System - **Global hotkeys**: System-wide hotkey capture and processing (`src/hotkey/`) diff --git a/docs/research/logs/2025-10-10-commit-history-rewrite.md b/docs/research/logs/2025-10-10-commit-history-rewrite.md index e92d647b..545b981c 100644 --- a/docs/research/logs/2025-10-10-commit-history-rewrite.md +++ b/docs/research/logs/2025-10-10-commit-history-rewrite.md @@ -23,8 +23,6 @@ Retention: Ephemeral. Delete after 2025-11-09 unless promoted to playbooks/organ ### Existing Commits (Oldest to Newest): 1. `fadb82a` - begin documentation refactoring and injection changes -2. `f84fa98` - feat(stt): Enhance Vosk model discovery logging for CI debugging -3. `e892ec4` - docs: add comprehensive text injection architecture and Vosk diagnostic guide 4. `00c25de` - docs: add Parakeet STT research and remove outdated architecture doc 5. `90a6019` - chore(text-injection): snapshot old implementation before orchestrator rewrite 6. `ffe3ae6` - feat(text-injection): implement orchestrator-based architecture (WIP) @@ -48,7 +46,6 @@ Retention: Ephemeral. Delete after 2025-11-09 unless promoted to playbooks/organ 3. **Formatting commits** - Should be squashed into relevant features (commits 13, 14, 15) 4. **Scattered documentation** - Docs spread across multiple commits (1, 3, 4) 5. **Dependency updates isolated** - Should be with relevant features (commit 9) -6. **STT logging mixed in** - Vosk changes unrelated to injection work (commit 2) 7. **Non-atomic changes** - Some commits mix concerns (audio + injection) 8. 
**WIP markers** - Commit 6 has "(WIP)" but is in main history @@ -73,22 +70,18 @@ Retention: Ephemeral. Delete after 2025-11-09 unless promoted to playbooks/organ Add comprehensive documentation for the text injection refactor: - Text injection architecture and strategy overview - Parakeet STT research and evaluation - - Vosk diagnostic guide for CI debugging - Remove outdated architecture documentation This establishes the foundation and rationale for the orchestrator refactor that follows. ``` -#### **Commit 2: feat(stt): Enhance Vosk model discovery logging for CI** - **Keeps:** commit 2 (mostly unchanged) - **Purpose:** Standalone improvement, helps with CI/debugging - **Note:** Could be moved to separate PR if we want pure injection focus - **Message:** ``` - feat(stt): Enhance Vosk model discovery logging for CI debugging - Improve Vosk model path detection and logging to help diagnose CI failures and model availability issues. Adds detailed logging throughout the model discovery process. ``` @@ -263,8 +256,6 @@ Retention: Ephemeral. Delete after 2025-11-09 unless promoted to playbooks/organ 2. 
**Initial rebase plan** (in the editor): ``` pick fadb82a begin documentation refactoring and injection changes - pick f84fa98 feat(stt): Enhance Vosk model discovery logging for CI debugging - squash e892ec4 docs: add comprehensive text injection architecture and Vosk diagnostic guide squash 00c25de docs: add Parakeet STT research and remove outdated architecture doc pick 90a6019 chore(text-injection): snapshot old implementation before orchestrator rewrite fixup ffe3ae6 feat(text-injection): implement orchestrator-based architecture (WIP) diff --git a/docs/research/logs/2025-10-13-self-hosted-runner-status.md b/docs/research/logs/2025-10-13-self-hosted-runner-status.md index 6b3727d6..79131e78 100644 --- a/docs/research/logs/2025-10-13-self-hosted-runner-status.md +++ b/docs/research/logs/2025-10-13-self-hosted-runner-status.md @@ -67,14 +67,12 @@ runs-on: [self-hosted, Linux, X64, fedora, nobara] - `.github/workflows/ci.yml` (8 jobs) - `.github/workflows/release.yml` (2 jobs) - `.github/workflows/runner-test.yml` (1 job) -- `.github/workflows/vosk-integration.yml` (1 job) - `.github/workflows/runner-diagnostic.yml` (1 job) ### Current Performance Characteristics **Performance Testing Results (Phase 2.3):** ``` -Baseline Test: Vosk Integration Tests - Runtime: 3h 21m (failed after 38min build phase) - Peak Load: 10.03 (excellent CPU utilization) - Memory Usage: 10.3GB / 30GB available @@ -93,7 +91,6 @@ Hardware vs GitHub-hosted Comparison: ### Critical Problems (Blocking Phase 3) 1. 
**Build Failures** - - Vosk integration tests failing during compilation - 38-minute build process before failure - Dependency or system library issues @@ -223,16 +220,11 @@ timeout-minutes: 360 # Go wild, we have time **Persistent Storage & Local Caching:** ```bash -# Vosk Model Local Cache Strategy -Model Cache Location: /home/coldaine/actions-runner/_cache/vosk-models/ Current Waste: Re-downloading 1.8GB model per workflow run Optimization: Pre-cache models locally, symlink in workflows # Implementation Plan: -mkdir -p /home/coldaine/actions-runner/_cache/vosk-models/ # Pre-download models: -# - vosk-model-small-en-us-0.15 (40MB - fast testing) -# - vosk-model-en-us-0.22 (1.8GB - production quality) # - Future: Multi-language support (es, fr, de) ``` @@ -261,21 +253,16 @@ Networking: Rate-limited vs Direct control Root Access: No (GitHub) vs Yes (Self-hosted) ``` -### 🎯 Vosk-Specific Optimization Strategy **Model Management System:** ```bash # Proposed Structure /home/coldaine/actions-runner/_cache/ -├── vosk-models/ │ ├── small-en-us-0.15/ # Fast testing (40MB) │ ├── en-us-0.22/ # Production quality (1.8GB) │ ├── checksums.txt # Integrity verification │ └── version-manifest.json # Version tracking -├── vosk-binaries/ │ ├── 0.3.45/ # Current version -│ │ ├── libvosk.so # Pre-installed in /usr/local/lib -│ │ └── vosk_api.h # Pre-installed in /usr/local/include │ └── version-registry.json # Binary version tracking └── rust-artifacts/ ├── cargo-registry/ # Shared dependency cache @@ -284,27 +271,14 @@ Root Access: No (GitHub) vs Yes (Self-hosted) **Workflow Optimization:** ```yaml -# Enhanced Vosk Setup (Self-hosted optimized) -- name: Setup Vosk (Self-hosted optimized) run: | # Check local cache first - VOSK_CACHE="/home/coldaine/actions-runner/_cache/vosk-models" - MODEL_NAME="vosk-model-small-en-us-0.15" - if [ -d "$VOSK_CACHE/$MODEL_NAME" ]; then - echo "Using cached Vosk model: $MODEL_NAME" - ln -sf "$VOSK_CACHE/$MODEL_NAME" . 
else - echo "Downloading and caching Vosk model..." # Download, extract, and cache for future runs - mkdir -p "$VOSK_CACHE" # ... download and extract logic - mv "$MODEL_NAME" "$VOSK_CACHE/" - ln -sf "$VOSK_CACHE/$MODEL_NAME" . fi - # Vosk binaries already installed system-wide - echo "Vosk setup complete (cached)" ``` ### 💡 Advanced Self-Hosted Optimizations @@ -331,7 +305,6 @@ RUSTC_OPTS="--codegen opt-level=3" /home/coldaine/actions-runner/_persistent/ ├── cargo-cache/ # Never cleared ├── rust-analyzer-cache/ # IDE support -├── vosk-models/ # Downloaded once └── build-artifacts/ # Incremental builds # Workspace management @@ -352,12 +325,8 @@ cargo config set registry.local-mirror.index "file:///opt/cargo-registry" ## Immediate Action Plan (Phase 3.1) -### Priority 1: Fix Build Reliability + Vosk Optimization ```bash # Enhanced Steps: -1. Investigate Vosk compilation errors -2. Implement local Vosk model caching system -3. Pre-install Vosk models in runner cache 4. Update workflows to use cached models 5. Add model integrity verification 6. 
Test manual build with cached models @@ -378,10 +347,8 @@ cargo config set registry.local-mirror.index "file:///opt/cargo-registry" target/ # Additional self-hosted caching: -- name: Cache Vosk Models (Self-hosted) run: | # Use persistent local cache (no GitHub Actions cache needed) - echo "VOSK_MODEL_PATH=/home/coldaine/actions-runner/_cache/vosk-models/vosk-model-small-en-us-0.15" >> $GITHUB_ENV ``` ### Priority 3: Resource Optimization @@ -392,8 +359,6 @@ jobs: strategy: max-parallel: 4 # Increased based on 10-core + 30GB capacity matrix: - features: [default, vosk, text-injection] - vosk-model: [small, standard] # Test multiple models # Resource limits per job env: CARGO_BUILD_JOBS: 6 # Leverage more cores per job @@ -410,7 +375,6 @@ continue-on-error: true env: # Use local paths for better performance CARGO_HOME: /home/coldaine/.cargo - VOSK_MODEL_PATH: /home/coldaine/actions-runner/_cache/vosk-models # Custom build optimizations RUSTFLAGS: "-C target-cpu=native -C opt-level=3" CARGO_NET_GIT_FETCH_WITH_CLI: "true" # Better git performance @@ -463,7 +427,6 @@ Compliance logging and audit trails - **Successful build time**: < 30 minutes (down from 3h 21m failure) - **Cache hit rate**: > 80% for dependency builds - **Queue throughput**: 3-4 concurrent jobs -- **Failure rate**: < 10% (currently 100% for Vosk tests) ### Reliability Targets - **Job completion rate**: > 95% @@ -491,7 +454,6 @@ Usage: On-demand testing and analysis System Dependencies: ✅ Installed and verified - alsa-lib-devel, xdotool, libXtst-devel - wget, unzip, @development-tools -- Vosk libraries: libvosk.so, vosk_api.h Rust Toolchain: ✅ Configured - Version: 1.89.0 (stable) @@ -528,7 +490,6 @@ Current Status: ## Next Steps Summary **Immediate (This Week)**: -1. Diagnose and fix Vosk build failures 2. Implement Rust caching in workflows 3. Add job timeouts and concurrency limits 4. 
Test hybrid fallback strategy @@ -559,7 +520,6 @@ Current Status: **Self-Hosted Exclusive Capabilities:** ```bash -✅ Persistent Vosk model cache (save 1.8GB downloads per run) ✅ CPU-native optimizations (AVX2, FMA instruction sets) ✅ Unlimited concurrent jobs (based on hardware capacity) ✅ Custom system dependencies pre-installed @@ -615,21 +575,14 @@ runs-on: [self-hosted, Linux, X64, fedora, nobara] # No fallback needed - it's your personal project ``` -#### 2. **CRITICAL: Vosk Model Caching** **This is the #1 priority** - downloading 1.8GB model every run is insane: ```bash # Pre-cache models permanently -mkdir -p /home/coldaine/actions-runner/_cache/vosk-models/ -cd /home/coldaine/actions-runner/_cache/vosk-models/ # Download once, use forever -wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -wget https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip unzip *.zip && rm *.zip # Workflows should check cache first: -if [ -d "/home/coldaine/actions-runner/_cache/vosk-models/vosk-model-small-en-us-0.15" ]; then - ln -sf /home/coldaine/actions-runner/_cache/vosk-models/vosk-model-small-en-us-0.15 . else # Only download if missing fi @@ -669,8 +622,6 @@ GitHub-hosted runners: ### What to Actually Focus On **Priority 1 - Must Fix:** -1. **Vosk model caching** - Stop downloading 1.8GB every run -2. **Fix current Vosk build failures** - They're blocking everything 3. **Reasonable parallelization** - 3-4 jobs max, not stress testing **Priority 2 - Nice to Have:** @@ -692,4 +643,3 @@ This is the PERFECT use case for self-hosted runners: - **Learning opportunity** (but keep it reasonable) - **Infinite CI/CD minutes** (vs 33 hours free tier) -**The Real Focus:** Fix Vosk model caching FIRST (saves 1.8GB per run), then get builds working consistently. Everything else is optional optimization. This spare laptop setup gives you unlimited CI/CD for free - just need to make it work reasonably well, not perfectly. 
diff --git a/docs/research/logs/2025-10-19-file-inventory-prompt.md b/docs/research/logs/2025-10-19-file-inventory-prompt.md index cda1b77a..747bb34d 100644 --- a/docs/research/logs/2025-10-19-file-inventory-prompt.md +++ b/docs/research/logs/2025-10-19-file-inventory-prompt.md @@ -139,7 +139,6 @@ path/to/file.ext - **Interface/GUI:** `ui/`, `gui/`, `view/`, `electron`, `qt`, `react`, `flutter`, `wpf`, `swiftui`. - **Audio Capture:** `audio`, `mic`, `microphone`, `portaudio`, `pyaudio`, `sounddevice`, `coreaudio`, `avfoundation`, WebRTC audio. - **VAD:** `vad`, `webrtcvad`, `silero`, `rnnoise`, “silence/energy threshold”. -- **STT Engine:** `stt`, `asr`, `whisper`, `whisper.cpp`, `vosk`, `deepspeech`, `kaldi`, `nemo`, cloud STT SDKs. - **Text Injection:** `keyboard`, `keystroke`, `SendInput`, `xdotool`, `AutoHotkey`, `robotjs`, accessibility APIs. ### Assignment pass diff --git a/docs/research/pr-reports/PR-temp-comprehensive-testing-report.md b/docs/research/pr-reports/PR-temp-comprehensive-testing-report.md index 7cb8b75b..5fdaa0d8 100644 --- a/docs/research/pr-reports/PR-temp-comprehensive-testing-report.md +++ b/docs/research/pr-reports/PR-temp-comprehensive-testing-report.md @@ -25,7 +25,6 @@ Successfully identified and fixed critical hanging issues in clipboard injection ### Key Metrics - **🎯 Clipboard Tests**: 7/7 passing (0.26s) - Previously hanging indefinitely - **📦 Text Injection Library**: 55/55 tests passing (0.47s-1.34s) -- **🏗️ App Library Tests**: 29/31 tests passing (7.54s) - 2 unrelated Vosk model failures - **⏱️ Integration Tests**: 17/17 timing tests passing (0.05s) - **🚀 Performance**: >95% improvement (from timeout to <1s completion) @@ -191,8 +190,6 @@ test result: FAILED. 
29 passed; 2 failed; 0 ignored; 0 measured; 0 filtered out; ``` **Failures (Unrelated to Clipboard Fixes):** -- ❌ `stt::plugin_manager::tests::test_unload_metrics` - Vosk model path issue -- ❌ `stt::tests::vosk_tests::vosk_integration_tests::test_vosk_transcriber_empty_model_path` - Vosk model assertion ### ✅ App Integration Tests (17/17 passing) @@ -302,11 +299,9 @@ Untracked files: ### For This PR 1. ✅ **Commit the changes** - All tests passing, ready to merge 2. ✅ **Update PR description** - Include clipboard fix details -3. ⚠️ **Note unrelated failures** - Vosk model path issues (separate PR) 4. ✅ **Verify in CI** - Should no longer hang ### For Future Work -1. Fix Vosk model path tests (separate issue) 2. Fix settings test default value mismatch 3. Consider applying similar timeout patterns to other external command executions 4. Add monitoring for clipboard operation performance diff --git a/docs/tasks/ci-runner-readiness-proposal.md b/docs/tasks/ci-runner-readiness-proposal.md index 0bdeed56..3093353b 100644 --- a/docs/tasks/ci-runner-readiness-proposal.md +++ b/docs/tasks/ci-runner-readiness-proposal.md @@ -19,7 +19,6 @@ The repository is synced to latest `main` and GitHub Actions workflows are valid ## Current State - Repo: up-to-date on `main` (fast-forwarded to 07c21dc). -- Workflows present: `ci.yml`, `vosk-integration.yml`, `release.yml`, `runner-test.yml`, `runner-diagnostic.yml`. - `actionlint`: clean (exit 0) for all workflows. - Runner status: online, labels match workflows (`self-hosted, Linux, X64, fedora, nobara`). - Missing deps on runner: @@ -82,7 +81,6 @@ sudo dnf install -y gtk3-devel libXtst-devel alsa-lib-devel ## Notes - The runner currently runs via `run.sh` (no systemd service). This is acceptable, but converting to a user systemd service can improve reliability: - `.service` marker indicates `actions.runner.Coldaine-ColdVox.laptop-extra.service`. If desired, enable a systemd user service and configure auto-start. 
-- Vosk dependencies are set up per job by `setup-vosk-cache.sh`. Ensure adequate disk space (env `MIN_FREE_DISK_GB=10`). - Nobara/Fedora typically ship PipeWire by default. Installing `pipewire-pulseaudio` ensures the PulseAudio compatibility layer exposes the expected CLI/bus without replacing the stock audio stack. Our script calls `pulseaudio --daemonize`; this remains compatible when the shim is present. The validation step includes `pactl info` to confirm the active server. @@ -90,4 +88,3 @@ sudo dnf install -y gtk3-devel libXtst-devel alsa-lib-devel - All remediation packages installed on the runner. - `scripts/start-headless.sh` completes without errors. - `ci.yml` jobs succeed on `main` for stable toolchain. -- `vosk-integration.yml` completes on PRs touching STT/Vosk.