@@ -1105,41 +1105,50 @@ func (provider *AzureProvider) SpeechStream(ctx *schemas.BifrostContext, postHoo
11051105 audioData = event
11061106 }
11071107
1108- // First, try to parse as JSON error response (these would be valid JSON text)
1109- var bifrostErr schemas.BifrostError
1110- if err := sonic .Unmarshal (audioData , & bifrostErr ); err == nil {
1111- if bifrostErr .Error != nil && bifrostErr .Error .Message != "" {
1112- bifrostErr .ExtraFields = schemas.BifrostErrorExtraFields {
1113- Provider : provider .GetProviderKey (),
1114- ModelRequested : request .Model ,
1115- RequestType : schemas .SpeechStreamRequest ,
1108+ // Skip empty data
1109+ if len (audioData ) == 0 {
1110+ continue
1111+ }
1112+
1113+ // Azure sends JSON-wrapped responses for speech streaming
1114+ // Parse the JSON to extract the response type and audio data
1115+ var response schemas.BifrostSpeechStreamResponse
1116+ if err := sonic .Unmarshal (audioData , & response ); err != nil {
1117+ // If JSON parsing fails, check if this might be an error response
1118+ var bifrostErr schemas.BifrostError
1119+ if errParseErr := sonic .Unmarshal (audioData , & bifrostErr ); errParseErr == nil {
1120+ if bifrostErr .Error != nil && bifrostErr .Error .Message != "" {
1121+ bifrostErr .ExtraFields = schemas.BifrostErrorExtraFields {
1122+ Provider : provider .GetProviderKey (),
1123+ ModelRequested : request .Model ,
1124+ RequestType : schemas .SpeechStreamRequest ,
1125+ }
1126+ ctx .SetValue (schemas .BifrostContextKeyStreamEndIndicator , true )
1127+ providerUtils .ProcessAndSendBifrostError (ctx , postHookRunner , & bifrostErr , responseChan , provider .logger )
1128+ return
11161129 }
1117- ctx .SetValue (schemas .BifrostContextKeyStreamEndIndicator , true )
1118- providerUtils .ProcessAndSendBifrostError (ctx , postHookRunner , & bifrostErr , responseChan , provider .logger )
1119- return
11201130 }
1131+ // If it's not valid JSON, log and skip
1132+ provider .logger .Warn ("failed to parse speech stream response: %v" , err )
1133+ continue
11211134 }
11221135
1123- // Skip empty audio data
1124- if len (audioData ) == 0 {
1136+ // Check for completion event - skip if no audio data
1137+ if response .Type == schemas .SpeechStreamResponseTypeDone || len (response .Audio ) == 0 {
1138+ // This is a control event or empty response - skip
11251139 continue
11261140 }
11271141
11281142 chunkIndex ++
11291143
1130- // Create response with raw audio data
1131- // Azure sends raw binary MP3 frames (starting with 0xff 0xf3 or 0xff 0xfb)
1132- response := schemas.BifrostSpeechStreamResponse {
1133- Type : schemas .SpeechStreamResponseTypeDelta ,
1134- Audio : audioData ,
1135- ExtraFields : schemas.BifrostResponseExtraFields {
1136- RequestType : schemas .SpeechStreamRequest ,
1137- Provider : provider .GetProviderKey (),
1138- ModelRequested : request .Model ,
1139- ModelDeployment : deployment ,
1140- ChunkIndex : chunkIndex ,
1141- Latency : time .Since (lastChunkTime ).Milliseconds (),
1142- },
1144+ // Set extra fields for the response
1145+ response .ExtraFields = schemas.BifrostResponseExtraFields {
1146+ RequestType : schemas .SpeechStreamRequest ,
1147+ Provider : provider .GetProviderKey (),
1148+ ModelRequested : request .Model ,
1149+ ModelDeployment : deployment ,
1150+ ChunkIndex : chunkIndex ,
1151+ Latency : time .Since (lastChunkTime ).Milliseconds (),
11431152 }
11441153 lastChunkTime = time .Now ()
11451154
0 commit comments