Skip to content

Commit 2bc70b8

Browse files
Alex-Weng and claude
committed
Add word-level timestamps support to CLI transcribe command
Implements GitHub issue #189 by adding --word-timestamps flag to the transcribe command. This allows users to view timing information for each word in the transcription output. Features: - New --word-timestamps CLI flag for batch and streaming modes - WordTimingMerger helper to merge subword tokens into complete words - Word timestamps include start time, end time, and confidence score - Works by detecting word boundaries (whitespace) in token sequences - Averaging confidence scores across tokens that form each word Example usage: fluidaudio transcribe audio.wav --word-timestamps Output format: [0] 0.160s - 0.800s: "Hello" (conf: 0.999) [1] 0.800s - 1.440s: "world!" (conf: 0.771) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <[email protected]>
1 parent a6a0e2c commit 2bc70b8

File tree

1 file changed

+122
-4
lines changed

1 file changed

+122
-4
lines changed

Sources/FluidAudioCLI/Commands/ASR/TranscribeCommand.swift

Lines changed: 122 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,79 @@ actor TranscriptionTracker {
9191
}
9292
}
9393

94+
/// Word-level timing information
95+
struct WordTiming {
96+
let word: String
97+
let startTime: TimeInterval
98+
let endTime: TimeInterval
99+
let confidence: Float
100+
}
101+
/// Helper to merge subword token timings into word-level timings.
enum WordTimingMerger {
    /// Merges token timings into word-level timings.
    ///
    /// A token that begins with whitespace (space, newline, or tab) marks a word
    /// boundary: the word accumulated so far is emitted, and a new word starts
    /// from the token's trimmed text. A word's confidence is the mean of the
    /// confidences of the tokens that form it.
    ///
    /// - Parameter tokenTimings: Per-token timing info, in transcription order.
    /// - Returns: One `WordTiming` per merged word; empty for empty input.
    static func mergeTokensIntoWords(_ tokenTimings: [TokenTiming]) -> [WordTiming] {
        guard !tokenTimings.isEmpty else { return [] }

        var wordTimings: [WordTiming] = []
        var currentWord = ""
        var currentStartTime: TimeInterval?
        var currentEndTime: TimeInterval = 0
        var currentConfidences: [Float] = []

        // Emits the accumulated word (if any) and resets the accumulator.
        func flushCurrentWord() {
            guard !currentWord.isEmpty, let startTime = currentStartTime else { return }
            let avgConfidence =
                currentConfidences.isEmpty
                ? 0.0 : currentConfidences.reduce(0, +) / Float(currentConfidences.count)
            wordTimings.append(
                WordTiming(
                    word: currentWord,
                    startTime: startTime,
                    endTime: currentEndTime,
                    confidence: avgConfidence
                ))
            currentWord = ""
            currentStartTime = nil
            currentEndTime = 0
            currentConfidences = []
        }

        for timing in tokenTimings {
            let token = timing.token

            // A leading space/newline/tab indicates the start of a new word.
            if token.hasPrefix(" ") || token.hasPrefix("\n") || token.hasPrefix("\t") {
                flushCurrentWord()

                // Start the new word from the token's text minus boundary whitespace.
                currentWord = token.trimmingCharacters(in: .whitespacesAndNewlines)
                if currentWord.isEmpty {
                    // Whitespace-only separator token: don't let its timing or
                    // confidence leak into the next word. (Fixes the next word's
                    // start time and average confidence being skewed by bare
                    // separator tokens.)
                    currentStartTime = nil
                    currentConfidences = []
                } else {
                    currentStartTime = timing.startTime
                    currentEndTime = timing.endTime
                    currentConfidences = [timing.confidence]
                }
            } else {
                // Continuation of the current word — or its first token, when the
                // tokenizer emits no leading space (e.g. at utterance start).
                if currentStartTime == nil {
                    currentStartTime = timing.startTime
                }
                currentWord += token
                currentEndTime = timing.endTime
                currentConfidences.append(timing.confidence)
            }
        }

        // Emit the trailing word, if any.
        flushCurrentWord()

        return wordTimings
    }
}
166+
94167
/// Command to transcribe audio files using batch or streaming mode
95168
enum TranscribeCommand {
96169
private static let logger = AppLogger(category: "Transcribe")
@@ -106,6 +179,7 @@ enum TranscribeCommand {
106179
let audioFile = arguments[0]
107180
var streamingMode = false
108181
var showMetadata = false
182+
var wordTimestamps = false
109183
var modelVersion: AsrModelVersion = .v3 // Default to v3
110184

111185
// Parse options
@@ -119,6 +193,8 @@ enum TranscribeCommand {
119193
streamingMode = true
120194
case "--metadata":
121195
showMetadata = true
196+
case "--word-timestamps":
197+
wordTimestamps = true
122198
case "--model-version":
123199
if i + 1 < arguments.count {
124200
switch arguments[i + 1].lowercased() {
@@ -143,16 +219,19 @@ enum TranscribeCommand {
143219
"Streaming mode enabled: simulating real-time audio with 1-second chunks.\n"
144220
)
145221
await testStreamingTranscription(
146-
audioFile: audioFile, showMetadata: showMetadata, modelVersion: modelVersion)
222+
audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
223+
modelVersion: modelVersion)
147224
} else {
148225
logger.info("Using batch mode with direct processing\n")
149-
await testBatchTranscription(audioFile: audioFile, showMetadata: showMetadata, modelVersion: modelVersion)
226+
await testBatchTranscription(
227+
audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
228+
modelVersion: modelVersion)
150229
}
151230
}
152231

153232
/// Test batch transcription using AsrManager directly
154233
private static func testBatchTranscription(
155-
audioFile: String, showMetadata: Bool, modelVersion: AsrModelVersion
234+
audioFile: String, showMetadata: Bool, wordTimestamps: Bool, modelVersion: AsrModelVersion
156235
) async {
157236
do {
158237
// Initialize ASR models
@@ -194,6 +273,21 @@ enum TranscribeCommand {
194273
logger.info("Final transcription:")
195274
print(result.text)
196275

276+
// Print word-level timestamps if requested
277+
if wordTimestamps {
278+
if let tokenTimings = result.tokenTimings, !tokenTimings.isEmpty {
279+
let wordTimings = WordTimingMerger.mergeTokensIntoWords(tokenTimings)
280+
logger.info("\nWord-level timestamps:")
281+
for (index, word) in wordTimings.enumerated() {
282+
logger.info(
283+
" [\(index)] \(String(format: "%.3f", word.startTime))s - \(String(format: "%.3f", word.endTime))s: \"\(word.word)\" (conf: \(String(format: "%.3f", word.confidence)))"
284+
)
285+
}
286+
} else {
287+
logger.info("\nWord-level timestamps: Not available (no token timings)")
288+
}
289+
}
290+
197291
if showMetadata {
198292
logger.info("Metadata:")
199293
logger.info(" Confidence: \(String(format: "%.3f", result.confidence))")
@@ -247,7 +341,7 @@ enum TranscribeCommand {
247341

248342
/// Test streaming transcription
249343
private static func testStreamingTranscription(
250-
audioFile: String, showMetadata: Bool, modelVersion: AsrModelVersion
344+
audioFile: String, showMetadata: Bool, wordTimestamps: Bool, modelVersion: AsrModelVersion
251345
) async {
252346
// Use optimized streaming configuration
253347
let config = StreamingAsrConfig.streaming
@@ -390,6 +484,22 @@ enum TranscribeCommand {
390484
logger.info(String(repeating: "=", count: 50))
391485
logger.info("Final transcription:")
392486
print(finalText)
487+
488+
// Print word-level timestamps if requested
489+
if wordTimestamps {
490+
if let snapshot = await tracker.metadataSnapshot() {
491+
let wordTimings = WordTimingMerger.mergeTokensIntoWords(snapshot.timings)
492+
logger.info("\nWord-level timestamps:")
493+
for (index, word) in wordTimings.enumerated() {
494+
logger.info(
495+
" [\(index)] \(String(format: "%.3f", word.startTime))s - \(String(format: "%.3f", word.endTime))s: \"\(word.word)\" (conf: \(String(format: "%.3f", word.confidence)))"
496+
)
497+
}
498+
} else {
499+
logger.info("\nWord-level timestamps: Not available (no token timings)")
500+
}
501+
}
502+
393503
logger.info("Performance:")
394504
logger.info(" Audio duration: \(String(format: "%.2f", totalDuration))s")
395505
logger.info(" Processing time: \(String(format: "%.2f", processingTime))s")
@@ -455,12 +565,14 @@ enum TranscribeCommand {
455565
--help, -h Show this help message
456566
--streaming Use streaming mode with chunk simulation
457567
--metadata Show confidence, start time, and end time in results
568+
--word-timestamps Show word-level timestamps for each word in the transcription
458569
--model-version <version> ASR model version to use: v2 or v3 (default: v3)
459570
460571
Examples:
461572
fluidaudio transcribe audio.wav # Batch mode (default)
462573
fluidaudio transcribe audio.wav --streaming # Streaming mode
463574
fluidaudio transcribe audio.wav --metadata # Batch mode with metadata
575+
fluidaudio transcribe audio.wav --word-timestamps # Batch mode with word timestamps
464576
fluidaudio transcribe audio.wav --streaming --metadata # Streaming mode with metadata
465577
466578
Batch mode (default):
@@ -477,6 +589,12 @@ enum TranscribeCommand {
477589
- Batch mode: Shows duration and token-based start/end times (if available)
478590
- Streaming mode: Shows timestamps for each transcription update
479591
- Works with both batch and streaming modes
592+
593+
Word timestamps option:
594+
- Shows start and end times for each word in the transcription
595+
- Merges subword tokens into complete words with timing information
596+
- Displays confidence scores for each word
597+
- Works with both batch and streaming modes
480598
"""
481599
)
482600
}

0 commit comments

Comments
 (0)