10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

18 changes: 8 additions & 10 deletions crates/app/src/runtime.rs
@@ -512,7 +512,7 @@ pub async fn start(

// Text injection channel
#[cfg(feature = "text-injection")]
let (text_injection_tx, text_injection_rx) = mpsc::channel::<TranscriptionEvent>(100);
let (_text_injection_tx, text_injection_rx) = mpsc::channel::<TranscriptionEvent>(100);
#[cfg(not(feature = "text-injection"))]
let (_text_injection_tx, _text_injection_rx) = mpsc::channel::<TranscriptionEvent>(100);

@@ -521,10 +521,10 @@ pub async fn start(
let mut stt_forward_handle: Option<JoinHandle<()>> = None;
#[allow(unused_variables)]
let (stt_handle, vad_fanout_handle) = if let Some(pm) = plugin_manager.clone() {
// This is the single, unified path for STT processing.
#[cfg(feature = "whisper")]
let (session_tx, session_rx) = mpsc::channel::<SessionEvent>(100);
let stt_audio_rx = audio_tx.subscribe();
// This is the single, unified path for STT processing.
#[cfg(feature = "whisper")]
let (session_tx, session_rx) = mpsc::channel::<SessionEvent>(100);
let stt_audio_rx = audio_tx.subscribe();

#[cfg(feature = "whisper")]
let (stt_pipeline_tx, stt_pipeline_rx) = mpsc::channel::<TranscriptionEvent>(100);
@@ -550,8 +550,8 @@ pub async fn start(
Settings::default(), // Use default settings for now
);

let vad_bcast_tx_clone = vad_bcast_tx.clone();
let activation_mode = opts.activation_mode;
let vad_bcast_tx_clone = vad_bcast_tx.clone();
let activation_mode = opts.activation_mode;

// This task is the new "translator" from VAD/Hotkey events to generic SessionEvents.
let vad_fanout_handle = tokio::spawn(async move {
@@ -770,11 +770,9 @@ pub async fn start(
#[cfg(test)]
mod tests {
use super::*;



use coldvox_stt::plugin::{FailoverConfig, GcPolicy, PluginSelectionConfig};
use coldvox_stt::TranscriptionEvent;


/// Helper to create default runtime options for testing.
fn test_opts(activation_mode: ActivationMode) -> AppRuntimeOptions {
28 changes: 22 additions & 6 deletions crates/app/tests/golden_master.rs
@@ -117,20 +117,36 @@ pub mod harness {
(Value::Object(ao), Value::Object(bo)) => {
let kind_a = ao.get("kind").and_then(|v| v.as_str()).unwrap_or("");
let kind_b = bo.get("kind").and_then(|v| v.as_str()).unwrap_or("");
if kind_a != kind_b { all_ok = false; break; }
if kind_a != kind_b {
all_ok = false;
break;
}
if kind_a == "SpeechEnd" {
let da = ao.get("duration_ms").and_then(|v| v.as_u64()).unwrap_or(0);
let db = bo.get("duration_ms").and_then(|v| v.as_u64()).unwrap_or(0);
let da =
ao.get("duration_ms").and_then(|v| v.as_u64()).unwrap_or(0);
let db =
bo.get("duration_ms").and_then(|v| v.as_u64()).unwrap_or(0);
let diff = da.abs_diff(db);
if diff > 128 { all_ok = false; break; }
if diff > 128 {
all_ok = false;
break;
}
} else if kind_a == "SpeechStart" {
// SpeechStart has no duration, ignore
} else {
// Unknown kind fallback to strict equality
if av != bv { all_ok = false; break; }
if av != bv {
all_ok = false;
break;
}
}
}
_ => {
if av != bv {
all_ok = false;
break;
}
}
_ => { if av != bv { all_ok = false; break; } }
}
}
all_ok
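Reviewer note (not part of the diff): the SpeechEnd branch above treats two durations as equal when they are within 128 ms of each other. A minimal standalone sketch of that tolerance check, in plain Rust:

fn durations_match(a_ms: u64, b_ms: u64) -> bool {
    // Same rule as the comparison above: a difference of up to 128 ms is accepted.
    a_ms.abs_diff(b_ms) <= 128
}

fn main() {
    assert!(durations_match(1_000, 1_100)); // 100 ms apart: within tolerance
    assert!(!durations_match(1_000, 1_200)); // 200 ms apart: rejected
}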
1 change: 1 addition & 0 deletions crates/coldvox-stt/Cargo.toml
@@ -13,6 +13,7 @@ thiserror = "2.0"
dirs = "5.0"
serde = { version = "1.0", features = ["derive"] }
coldvox-foundation = { path = "../coldvox-foundation" }
dtw = "0.1.0"
## Removed Python-dependent faster-whisper backend; will replace with pure Rust implementation


87 changes: 87 additions & 0 deletions crates/coldvox-stt/src/candle/audio.rs
@@ -0,0 +1,87 @@
use candle::{Device, IndexOp, Result, Tensor};

const N_FFT: usize = 400;
const N_MELS: usize = 80;
const HOP_LENGTH: usize = 160;
const CHUNK_LENGTH: usize = 30;
const SAMPLING_RATE: usize = 16000;

pub fn log_mel_spectrogram(pcm: &[f32], device: &Device) -> Result<Tensor> {
let pcm_len = pcm.len();
let n_samples = CHUNK_LENGTH * SAMPLING_RATE;
let pcm = if pcm_len < n_samples {
let mut padded = vec![0.0; n_samples];
padded[..pcm_len].copy_from_slice(pcm);
padded
} else {
pcm.to_vec()
};

let stft = stft(&pcm, N_FFT, HOP_LENGTH)?;
let magnitudes = stft.abs()?.powf(2.0)?;
let mel_filters = mel_filters(device, N_MELS, N_FFT)?;
let mel_spec = magnitudes.matmul(&mel_filters.t()?)?;
let log_spec = mel_spec.maximum(1e-10)?.log10()?;
// Clamp the dynamic range to 8.0 below the peak, then shift/scale as in the Whisper reference mel front end.
let log_max = log_spec.max_all()?.to_scalar::<f32>()? as f64;
let log_spec = log_spec.maximum(log_max - 8.0)?;
let log_spec = ((log_spec + 4.0)? / 4.0)?;
Ok(log_spec)
}

fn stft(pcm: &[f32], n_fft: usize, hop_length: usize) -> Result<Tensor> {
let window = hamming_window(n_fft, &Device::Cpu)?;
let n_frames = (pcm.len() - n_fft) / hop_length + 1;
let mut frames = Vec::with_capacity(n_frames);
for i in 0..n_frames {
let start = i * hop_length;
let end = start + n_fft;
frames.extend_from_slice(&pcm[start..end]);
}
let frames = Tensor::new(frames, &Device::Cpu)?.reshape((n_frames, n_fft))?;
let frames = frames.broadcast_mul(&window)?;
let stft = frames.fft(n_fft)?;
Ok(stft.i((.., ..n_fft / 2 + 1))?)
}

fn hamming_window(n: usize, device: &Device) -> Result<Tensor> {
let ts = Tensor::arange(0, n as u32, device)?.to_dtype(candle::DType::F32)?;
let cos = (ts * (2.0 * std::f64::consts::PI / (n - 1) as f64))?.cos()?;
// Hamming window: 0.54 - 0.46 * cos(2*pi*n / (N - 1)).
(cos * -0.46)? + 0.54
}

fn hz_to_mel(hz: f64) -> f64 {
2595.0 * (1.0 + hz / 700.0).log10()
}

fn mel_to_hz(mel: f64) -> f64 {
700.0 * (10.0f64.powf(mel / 2595.0) - 1.0)
}

fn mel_filters(device: &Device, n_mels: usize, n_fft: usize) -> Result<Tensor> {
let f_min = 0.0;
let f_max = SAMPLING_RATE as f64 / 2.0;
let mel_min = hz_to_mel(f_min);
let mel_max = hz_to_mel(f_max);
let mel_points = (0..=n_mels + 1)
.map(|i| mel_min + (mel_max - mel_min) * i as f64 / (n_mels + 1) as f64)
.collect::<Vec<_>>();
let fft_freqs = (0..=n_fft / 2)
.map(|i| i as f64 * SAMPLING_RATE as f64 / n_fft as f64)
.collect::<Vec<_>>();
let mel_edges = mel_points.windows(3).map(|w| (w[0], w[1], w[2])).collect::<Vec<_>>();

let mut filters = vec![0.0; n_mels * (n_fft / 2 + 1)];
for (i, (mel_start, mel_center, mel_end)) in mel_edges.iter().enumerate() {
for (j, &freq) in fft_freqs.iter().enumerate() {
let mel_freq = hz_to_mel(freq);
let slope = if mel_freq >= *mel_start && mel_freq <= *mel_center {
(mel_freq - mel_start) / (mel_center - mel_start)
} else if mel_freq >= *mel_center && mel_freq <= *mel_end {
(mel_end - mel_freq) / (mel_end - mel_center)
} else {
0.0
};
filters[i * (n_fft / 2 + 1) + j] = slope as f32;
}
}
Tensor::new(filters, device)?.reshape((n_mels, n_fft / 2 + 1))
}
72 changes: 72 additions & 0 deletions crates/coldvox-stt/src/candle/decode.rs
@@ -0,0 +1,72 @@
use crate::candle::timestamps::{perform_timestamp_probs_alignment, perform_word_alignment, TranscriptionResult};
use crate::candle::WordTimestampHeuristic;
use candle::{IndexOp, Result, Tensor, D};
use candle_transformers::models::whisper::{self as whisper, Whisper};

pub struct Decoder {
model: Whisper,
tokenizer: whisper::tokenizer::Tokenizer,
heuristic: WordTimestampHeuristic,
}

impl Decoder {
pub fn new(model: Whisper, tokenizer: whisper::tokenizer::Tokenizer, heuristic: &WordTimestampHeuristic) -> Self {
Self { model, tokenizer, heuristic: heuristic.clone() }
}

pub fn run(&mut self, mel: &Tensor) -> Result<Vec<TranscriptionResult>> {
let audio_features = self.model.encoder.forward(mel, true)?;
let mut tokens = vec![self.tokenizer.sot_token() as i32];
let mut words = vec![];

for _ in 0..self.model.config.max_target_positions {
let tokens_tensor = Tensor::new(tokens.as_slice(), mel.device())?.unsqueeze(0)?;
let (logits, cross_attentions) = self.model.decoder.forward(&tokens_tensor, &audio_features, false)?;

let next_token = self.argmax(&logits)?;

tokens.push(next_token);

if self.is_segment_end(next_token) {
let segment_tokens = &tokens;
let segment_words = match self.heuristic {
WordTimestampHeuristic::AttentionDtw => {
if let Some(cross_attentions) = cross_attentions {
perform_word_alignment(
segment_tokens,
&cross_attentions,
&self.tokenizer,
true, // Assuming space-based splitting
)?
} else {
vec![]
}
}
WordTimestampHeuristic::TimestampProbs => {
perform_timestamp_probs_alignment(segment_tokens, &logits, &self.tokenizer)?
}
};
words.extend(segment_words);

if next_token == self.tokenizer.eot_token() as i32 {
break;
}
tokens = vec![self.tokenizer.sot_token() as i32];
}
}

Ok(words)
}

fn argmax(&self, logits: &Tensor) -> Result<i32> {
// Pick the highest-scoring token at the last decoded position (logits: batch x seq x vocab).
let seq_len = logits.dim(1)?;
let logits = logits.i((0, seq_len - 1, ..))?;
let next_token = logits.argmax(D::Minus1)?.to_scalar::<u32>()? as i32;
Ok(next_token)
}

fn is_segment_end(&self, token: i32) -> bool {
token >= self.tokenizer.timestamp_begin() as i32 || token == self.tokenizer.eot_token() as i32
}
}
22 changes: 22 additions & 0 deletions crates/coldvox-stt/src/candle/loader.rs
@@ -0,0 +1,22 @@
use candle::{Device, Result};
use candle_transformers::models::whisper::{self as whisper, Config, Whisper};
use std::fs::File;

pub fn load_model(
model_path: &str,
tokenizer_path: &str,
config_path: &str,
_quantized: bool, // not yet used by this loader
) -> Result<(Whisper, whisper::tokenizer::Tokenizer)> {
let device = Device::Cpu;

let config: Config = serde_json::from_reader(
File::open(config_path).map_err(|e| candle::Error::Msg(e.to_string()))?,
)
.map_err(|e| candle::Error::Msg(e.to_string()))?;
let tokenizer = whisper::tokenizer::Tokenizer::from_file(tokenizer_path)
.map_err(|e| candle::Error::Msg(e.to_string()))?;

let vb = candle_nn::VarBuilder::from_safetensors(
vec![model_path.to_string()],
candle::DType::F32,
&device,
)?;
let model = Whisper::load(&vb, config)?;
Ok((model, tokenizer))
}
48 changes: 48 additions & 0 deletions crates/coldvox-stt/src/candle/mod.rs
@@ -0,0 +1,48 @@
pub mod audio;
pub mod decode;
pub mod loader;
pub mod timestamps;

use crate::candle::audio::log_mel_spectrogram;
use crate::candle::decode::Decoder;
use crate::candle::loader::load_model;
use candle::{Device, Result, Tensor};
use timestamps::TranscriptionResult;

#[derive(Clone, Debug)]
pub enum WordTimestampHeuristic {
AttentionDtw,
TimestampProbs,
}

pub struct WhisperEngine {
decoder: Decoder,
config: WhisperEngineConfig,
}

pub struct WhisperEngineConfig {
pub model_path: String,
pub tokenizer_path: String,
pub config_path: String,
pub quantized: bool,
pub enable_timestamps: bool,
pub heuristic: WordTimestampHeuristic,
}

impl WhisperEngine {
pub fn new(config: WhisperEngineConfig) -> Result<Self> {
let (model, tokenizer) = load_model(&config.model_path, &config.tokenizer_path, &config.config_path, config.quantized)?;
let decoder = Decoder::new(model, tokenizer, &config.heuristic);
Ok(Self { decoder, config })
}

pub fn transcribe(&mut self, pcm_audio: &[f32]) -> Result<Vec<TranscriptionResult>> {
let mel = self.preprocess_audio(pcm_audio)?;
let words = self.decoder.run(&mel)?;
Ok(words)
}

fn preprocess_audio(&self, pcm_audio: &[f32]) -> Result<Tensor> {
log_mel_spectrogram(pcm_audio, &Device::Cpu)
}
}
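Reviewer note (not part of the diff): an illustrative sketch of how the new engine might be driven end to end, assuming the module is exported as coldvox_stt::candle and that TranscriptionResult is defined in candle/timestamps.rs (not included in this diff). The model, tokenizer, and config paths are placeholders.

use coldvox_stt::candle::{WhisperEngine, WhisperEngineConfig, WordTimestampHeuristic};

fn main() -> candle::Result<()> {
    let cfg = WhisperEngineConfig {
        model_path: "models/whisper-tiny/model.safetensors".to_string(), // placeholder
        tokenizer_path: "models/whisper-tiny/tokenizer.json".to_string(), // placeholder
        config_path: "models/whisper-tiny/config.json".to_string(), // placeholder
        quantized: false,
        enable_timestamps: true,
        heuristic: WordTimestampHeuristic::AttentionDtw,
    };
    let mut engine = WhisperEngine::new(cfg)?;

    // 1 s of 16 kHz silence stands in for real capture output.
    let pcm = vec![0.0f32; 16_000];
    let words = engine.transcribe(&pcm)?;
    println!("{} timestamped words", words.len());
    Ok(())
}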