Skip to content

Commit 2bc70b8

Browse files
Alex-Weng and claude
committed
Add word-level timestamps support to CLI transcribe command
Implements GitHub issue #189 by adding --word-timestamps flag to the transcribe command. This allows users to view timing information for each word in the transcription output. Features: - New --word-timestamps CLI flag for batch and streaming modes - WordTimingMerger helper to merge subword tokens into complete words - Word timestamps include start time, end time, and confidence score - Works by detecting word boundaries (whitespace) in token sequences - Averaging confidence scores across tokens that form each word Example usage: fluidaudio transcribe audio.wav --word-timestamps Output format: [0] 0.160s - 0.800s: "Hello" (conf: 0.999) [1] 0.800s - 1.440s: "world!" (conf: 0.771) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <[email protected]>
1 parent a6a0e2c commit 2bc70b8

File tree

1 file changed

+122
-4
lines changed

1 file changed

+122
-4
lines changed

Sources/FluidAudioCLI/Commands/ASR/TranscribeCommand.swift

Lines changed: 122 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,79 @@ actor TranscriptionTracker {
9191
}
9292
}
9393

94+
/// Word-level timing information
95+
struct WordTiming {
96+
let word: String
97+
let startTime: TimeInterval
98+
let endTime: TimeInterval
99+
let confidence: Float
100+
}
101+
/// Helper to merge subword token timings into word-level timings.
enum WordTimingMerger {
    /// Merges token timings into word-level timings.
    ///
    /// A token that begins with whitespace (space, newline, or tab) marks a word
    /// boundary: the word accumulated so far is emitted, and a new word starts
    /// from the token's trimmed text. A word's confidence is the mean of the
    /// confidences of the tokens that form it.
    ///
    /// - Parameter tokenTimings: Per-token timing info, in transcription order.
    /// - Returns: One `WordTiming` per merged word; empty for empty input.
    static func mergeTokensIntoWords(_ tokenTimings: [TokenTiming]) -> [WordTiming] {
        guard !tokenTimings.isEmpty else { return [] }

        var wordTimings: [WordTiming] = []
        var currentWord = ""
        var currentStartTime: TimeInterval?
        var currentEndTime: TimeInterval = 0
        var currentConfidences: [Float] = []

        // Emits the accumulated word (if any) and resets the accumulator.
        func flushCurrentWord() {
            guard !currentWord.isEmpty, let startTime = currentStartTime else { return }
            let avgConfidence =
                currentConfidences.isEmpty
                ? 0.0 : currentConfidences.reduce(0, +) / Float(currentConfidences.count)
            wordTimings.append(
                WordTiming(
                    word: currentWord,
                    startTime: startTime,
                    endTime: currentEndTime,
                    confidence: avgConfidence
                ))
            currentWord = ""
            currentStartTime = nil
            currentEndTime = 0
            currentConfidences = []
        }

        for timing in tokenTimings {
            let token = timing.token

            // A leading space/newline/tab indicates the start of a new word.
            if token.hasPrefix(" ") || token.hasPrefix("\n") || token.hasPrefix("\t") {
                flushCurrentWord()

                // Start the new word from the token's text minus boundary whitespace.
                currentWord = token.trimmingCharacters(in: .whitespacesAndNewlines)
                if currentWord.isEmpty {
                    // Whitespace-only separator token: don't let its timing or
                    // confidence leak into the next word. (Fixes the next word's
                    // start time and average confidence being skewed by bare
                    // separator tokens.)
                    currentStartTime = nil
                    currentConfidences = []
                } else {
                    currentStartTime = timing.startTime
                    currentEndTime = timing.endTime
                    currentConfidences = [timing.confidence]
                }
            } else {
                // Continuation of the current word — or its first token, when the
                // tokenizer emits no leading space (e.g. at utterance start).
                if currentStartTime == nil {
                    currentStartTime = timing.startTime
                }
                currentWord += token
                currentEndTime = timing.endTime
                currentConfidences.append(timing.confidence)
            }
        }

        // Emit the trailing word, if any.
        flushCurrentWord()

        return wordTimings
    }
}
166+
94167
/// Command to transcribe audio files using batch or streaming mode
95168
enum TranscribeCommand {
96169
private static let logger = AppLogger(category: "Transcribe")
@@ -106,6 +179,7 @@ enum TranscribeCommand {
106179
let audioFile = arguments[0]
107180
var streamingMode = false
108181
var showMetadata = false
182+
var wordTimestamps = false
109183
var modelVersion: AsrModelVersion = .v3 // Default to v3
110184

111185
// Parse options
@@ -119,6 +193,8 @@ enum TranscribeCommand {
119193
streamingMode = true
120194
case "--metadata":
121195
showMetadata = true
196+
case "--word-timestamps":
197+
wordTimestamps = true
122198
case "--model-version":
123199
if i + 1 < arguments.count {
124200
switch arguments[i + 1].lowercased() {
@@ -143,16 +219,19 @@ enum TranscribeCommand {
143219
"Streaming mode enabled: simulating real-time audio with 1-second chunks.\n"
144220
)
145221
await testStreamingTranscription(
146-
audioFile: audioFile, showMetadata: showMetadata, modelVersion: modelVersion)
222+
audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
223+
modelVersion: modelVersion)
147224
} else {
148225
logger.info("Using batch mode with direct processing\n")
149-
await testBatchTranscription(audioFile: audioFile, showMetadata: showMetadata, modelVersion: modelVersion)
226+
await testBatchTranscription(
227+
audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
228+
modelVersion: modelVersion)
150229
}
151230
}
152231

153232
/// Test batch transcription using AsrManager directly
154233
private static func testBatchTranscription(
155-
audioFile: String, showMetadata: Bool, modelVersion: AsrModelVersion
234+
audioFile: String, showMetadata: Bool, wordTimestamps: Bool, modelVersion: AsrModelVersion
156235
) async {
157236
do {
158237
// Initialize ASR models
@@ -194,6 +273,21 @@ enum TranscribeCommand {
194273
logger.info("Final transcription:")
195274
print(result.text)
196275

276+
// Print word-level timestamps if requested
277+
if wordTimestamps {
278+
if let tokenTimings = result.tokenTimings, !tokenTimings.isEmpty {
279+
let wordTimings = WordTimingMerger.mergeTokensIntoWords(tokenTimings)
280+
logger.info("\nWord-level timestamps:")
281+
for (index, word) in wordTimings.enumerated() {
282+
logger.info(
283+
" [\(index)] \(String(format: "%.3f", word.startTime))s - \(String(format: "%.3f", word.endTime))s: \"\(word.word)\" (conf: \(String(format: "%.3f", word.confidence)))"
284+
)
285+
}
286+
} else {
287+
logger.info("\nWord-level timestamps: Not available (no token timings)")
288+
}
289+
}
290+
197291
if showMetadata {
198292
logger.info("Metadata:")
199293
logger.info(" Confidence: \(String(format: "%.3f", result.confidence))")
@@ -247,7 +341,7 @@ enum TranscribeCommand {
247341

248342
/// Test streaming transcription
249343
private static func testStreamingTranscription(
250-
audioFile: String, showMetadata: Bool, modelVersion: AsrModelVersion
344+
audioFile: String, showMetadata: Bool, wordTimestamps: Bool, modelVersion: AsrModelVersion
251345
) async {
252346
// Use optimized streaming configuration
253347
let config = StreamingAsrConfig.streaming
@@ -390,6 +484,22 @@ enum TranscribeCommand {
390484
logger.info(String(repeating: "=", count: 50))
391485
logger.info("Final transcription:")
392486
print(finalText)
487+
488+
// Print word-level timestamps if requested
489+
if wordTimestamps {
490+
if let snapshot = await tracker.metadataSnapshot() {
491+
let wordTimings = WordTimingMerger.mergeTokensIntoWords(snapshot.timings)
492+
logger.info("\nWord-level timestamps:")
493+
for (index, word) in wordTimings.enumerated() {
494+
logger.info(
495+
" [\(index)] \(String(format: "%.3f", word.startTime))s - \(String(format: "%.3f", word.endTime))s: \"\(word.word)\" (conf: \(String(format: "%.3f", word.confidence)))"
496+
)
497+
}
498+
} else {
499+
logger.info("\nWord-level timestamps: Not available (no token timings)")
500+
}
501+
}
502+
393503
logger.info("Performance:")
394504
logger.info(" Audio duration: \(String(format: "%.2f", totalDuration))s")
395505
logger.info(" Processing time: \(String(format: "%.2f", processingTime))s")
@@ -455,12 +565,14 @@ enum TranscribeCommand {
455565
--help, -h Show this help message
456566
--streaming Use streaming mode with chunk simulation
457567
--metadata Show confidence, start time, and end time in results
568+
--word-timestamps Show word-level timestamps for each word in the transcription
458569
--model-version <version> ASR model version to use: v2 or v3 (default: v3)
459570
460571
Examples:
461572
fluidaudio transcribe audio.wav # Batch mode (default)
462573
fluidaudio transcribe audio.wav --streaming # Streaming mode
463574
fluidaudio transcribe audio.wav --metadata # Batch mode with metadata
575+
fluidaudio transcribe audio.wav --word-timestamps # Batch mode with word timestamps
464576
fluidaudio transcribe audio.wav --streaming --metadata # Streaming mode with metadata
465577
466578
Batch mode (default):
@@ -477,6 +589,12 @@ enum TranscribeCommand {
477589
- Batch mode: Shows duration and token-based start/end times (if available)
478590
- Streaming mode: Shows timestamps for each transcription update
479591
- Works with both batch and streaming modes
592+
593+
Word timestamps option:
594+
- Shows start and end times for each word in the transcription
595+
- Merges subword tokens into complete words with timing information
596+
- Displays confidence scores for each word
597+
- Works with both batch and streaming modes
480598
"""
481599
)
482600
}

0 commit comments

Comments
 (0)