@@ -91,6 +91,79 @@ actor TranscriptionTracker {
     }
 }
 
+/// Word-level timing information
+struct WordTiming {
+    let word: String
+    let startTime: TimeInterval
+    let endTime: TimeInterval
+    let confidence: Float
+}
+
+/// Helper to merge tokens into word-level timings
+enum WordTimingMerger {
+    /// Merge token timings into word-level timings.
+    /// Tokens are merged by detecting word boundaries (whitespace).
+    static func mergeTokensIntoWords(_ tokenTimings: [TokenTiming]) -> [WordTiming] {
+        guard !tokenTimings.isEmpty else { return [] }
+
+        var wordTimings: [WordTiming] = []
+        var currentWord = ""
+        var currentStartTime: TimeInterval?
+        var currentEndTime: TimeInterval = 0
+        var currentConfidences: [Float] = []
+
+        for timing in tokenTimings {
+            let token = timing.token
+
+            // Check if the token starts with whitespace (indicates a new word boundary)
+            if token.hasPrefix(" ") || token.hasPrefix("\n") || token.hasPrefix("\t") {
+                // Finish the previous word if one exists
+                if !currentWord.isEmpty, let startTime = currentStartTime {
+                    let avgConfidence =
+                        currentConfidences.isEmpty
+                        ? 0.0 : currentConfidences.reduce(0, +) / Float(currentConfidences.count)
+                    wordTimings.append(
+                        WordTiming(
+                            word: currentWord,
+                            startTime: startTime,
+                            endTime: currentEndTime,
+                            confidence: avgConfidence
+                        ))
+                }
+
+                // Start a new word (trim the leading whitespace)
+                currentWord = token.trimmingCharacters(in: .whitespacesAndNewlines)
+                currentStartTime = timing.startTime
+                currentEndTime = timing.endTime
+                currentConfidences = [timing.confidence]
+            } else {
+                // Continue the current word
+                if currentStartTime == nil {
+                    currentStartTime = timing.startTime
+                }
+                currentWord += token
+                currentEndTime = timing.endTime
+                currentConfidences.append(timing.confidence)
+            }
+        }
+
+        // Add the final word
+        if !currentWord.isEmpty, let startTime = currentStartTime {
+            let avgConfidence =
+                currentConfidences.isEmpty ? 0.0 : currentConfidences.reduce(0, +) / Float(currentConfidences.count)
+            wordTimings.append(
+                WordTiming(
+                    word: currentWord,
+                    startTime: startTime,
+                    endTime: currentEndTime,
+                    confidence: avgConfidence
+                ))
+        }
+
+        return wordTimings
+    }
+}
+
 /// Command to transcribe audio files using batch or streaming mode
 enum TranscribeCommand {
     private static let logger = AppLogger(category: "Transcribe")
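As a quick illustration of what this merger produces, here is a minimal sketch (not part of the diff). It assumes `TokenTiming` exposes a memberwise initializer over the four fields the merger reads (`token`, `startTime`, `endTime`, `confidence`); that initializer is an assumption, not something this commit confirms.

// Hypothetical subword tokens for the phrase "hello world".
// A leading space marks a word boundary, mirroring the hasPrefix check above.
let tokens = [
    TokenTiming(token: "he", startTime: 0.00, endTime: 0.12, confidence: 0.95),
    TokenTiming(token: "llo", startTime: 0.12, endTime: 0.30, confidence: 0.90),
    TokenTiming(token: " wor", startTime: 0.35, endTime: 0.55, confidence: 0.92),
    TokenTiming(token: "ld", startTime: 0.55, endTime: 0.70, confidence: 0.88),
]

let words = WordTimingMerger.mergeTokensIntoWords(tokens)
// Expected: "hello" spanning 0.00-0.30 s and "world" spanning 0.35-0.70 s,
// each carrying the mean confidence of its constituent tokens.
for word in words {
    print("\(word.word): \(word.startTime)s - \(word.endTime)s (conf: \(word.confidence))")
}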
@@ -106,6 +179,7 @@ enum TranscribeCommand {
         let audioFile = arguments[0]
         var streamingMode = false
         var showMetadata = false
+        var wordTimestamps = false
         var modelVersion: AsrModelVersion = .v3  // Default to v3
 
         // Parse options
@@ -119,6 +193,8 @@ enum TranscribeCommand {
                 streamingMode = true
             case "--metadata":
                 showMetadata = true
+            case "--word-timestamps":
+                wordTimestamps = true
             case "--model-version":
                 if i + 1 < arguments.count {
                     switch arguments[i + 1].lowercased() {
@@ -143,16 +219,19 @@ enum TranscribeCommand {
                 "Streaming mode enabled: simulating real-time audio with 1-second chunks.\n"
             )
             await testStreamingTranscription(
-                audioFile: audioFile, showMetadata: showMetadata, modelVersion: modelVersion)
+                audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
+                modelVersion: modelVersion)
         } else {
             logger.info("Using batch mode with direct processing\n")
-            await testBatchTranscription(audioFile: audioFile, showMetadata: showMetadata, modelVersion: modelVersion)
+            await testBatchTranscription(
+                audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
+                modelVersion: modelVersion)
         }
     }
 
     /// Test batch transcription using AsrManager directly
     private static func testBatchTranscription(
-        audioFile: String, showMetadata: Bool, modelVersion: AsrModelVersion
+        audioFile: String, showMetadata: Bool, wordTimestamps: Bool, modelVersion: AsrModelVersion
     ) async {
         do {
             // Initialize ASR models
@@ -194,6 +273,21 @@ enum TranscribeCommand {
             logger.info("Final transcription:")
             print(result.text)
 
+            // Print word-level timestamps if requested
+            if wordTimestamps {
+                if let tokenTimings = result.tokenTimings, !tokenTimings.isEmpty {
+                    let wordTimings = WordTimingMerger.mergeTokensIntoWords(tokenTimings)
+                    logger.info("\nWord-level timestamps:")
+                    for (index, word) in wordTimings.enumerated() {
+                        logger.info(
+                            "[\(index)] \(String(format: "%.3f", word.startTime))s - \(String(format: "%.3f", word.endTime))s: \"\(word.word)\" (conf: \(String(format: "%.3f", word.confidence)))"
+                        )
+                    }
+                } else {
+                    logger.info("\nWord-level timestamps: Not available (no token timings)")
+                }
+            }
+
             if showMetadata {
                 logger.info("Metadata:")
                 logger.info("Confidence: \(String(format: "%.3f", result.confidence))")
@@ -247,7 +341,7 @@ enum TranscribeCommand {
 
     /// Test streaming transcription
     private static func testStreamingTranscription(
-        audioFile: String, showMetadata: Bool, modelVersion: AsrModelVersion
+        audioFile: String, showMetadata: Bool, wordTimestamps: Bool, modelVersion: AsrModelVersion
     ) async {
         // Use optimized streaming configuration
         let config = StreamingAsrConfig.streaming
@@ -390,6 +484,22 @@ enum TranscribeCommand {
         logger.info(String(repeating: "=", count: 50))
         logger.info("Final transcription:")
         print(finalText)
+
+        // Print word-level timestamps if requested
+        if wordTimestamps {
+            if let snapshot = await tracker.metadataSnapshot() {
+                let wordTimings = WordTimingMerger.mergeTokensIntoWords(snapshot.timings)
+                logger.info("\nWord-level timestamps:")
+                for (index, word) in wordTimings.enumerated() {
+                    logger.info(
+                        "[\(index)] \(String(format: "%.3f", word.startTime))s - \(String(format: "%.3f", word.endTime))s: \"\(word.word)\" (conf: \(String(format: "%.3f", word.confidence)))"
+                    )
+                }
+            } else {
+                logger.info("\nWord-level timestamps: Not available (no token timings)")
+            }
+        }
+
         logger.info("Performance:")
         logger.info("Audio duration: \(String(format: "%.2f", totalDuration))s")
         logger.info("Processing time: \(String(format: "%.2f", processingTime))s")
@@ -455,12 +565,14 @@ enum TranscribeCommand {
                 --help, -h                 Show this help message
                 --streaming                Use streaming mode with chunk simulation
                 --metadata                 Show confidence, start time, and end time in results
+                --word-timestamps          Show word-level timestamps for each word in the transcription
                 --model-version <version>  ASR model version to use: v2 or v3 (default: v3)
 
             Examples:
                 fluidaudio transcribe audio.wav                          # Batch mode (default)
                 fluidaudio transcribe audio.wav --streaming              # Streaming mode
                 fluidaudio transcribe audio.wav --metadata               # Batch mode with metadata
+                fluidaudio transcribe audio.wav --word-timestamps        # Batch mode with word timestamps
                 fluidaudio transcribe audio.wav --streaming --metadata   # Streaming mode with metadata
 
             Batch mode (default):
@@ -477,6 +589,12 @@ enum TranscribeCommand {
                 - Batch mode: Shows duration and token-based start/end times (if available)
                 - Streaming mode: Shows timestamps for each transcription update
                 - Works with both batch and streaming modes
+
+            Word timestamps option:
+                - Shows start and end times for each word in the transcription
+                - Merges subword tokens into complete words with timing information
+                - Displays confidence scores for each word
+                - Works with both batch and streaming modes
             """
         )
     }