-
Notifications
You must be signed in to change notification settings - Fork 0
Audio
Arian Amiramjadi edited this page Dec 24, 2025
·
1 revision
Text-to-speech and speech-to-text support for voice applications.
Convert text to spoken audio:
// Quick: generate and save
err := ai.Speak("Hello, world!").Save("hello.mp3")
// Get raw audio bytes
audio, err := ai.Speak("Hello, world!").Do()

ai.Speak("Hello").Voice(ai.VoiceAlloy) // Default, neutral
ai.Speak("Hello").Voice(ai.VoiceNova) // Female
ai.Speak("Hello").Voice(ai.VoiceOnyx) // Male, deep
ai.Speak("Hello").Voice(ai.VoiceEcho) // Male
ai.Speak("Hello").Voice(ai.VoiceFable) // Expressive
ai.Speak("Hello").Voice(ai.VoiceShimmer) // Female, warm

ai.Speak("Hello").Format(ai.AudioFormatMP3) // Default
ai.Speak("Hello").Format(ai.AudioFormatOpus) // Smaller, web
ai.Speak("Hello").Format(ai.AudioFormatAAC) // Apple devices
ai.Speak("Hello").Format(ai.AudioFormatFLAC) // Lossless
ai.Speak("Hello").Format(ai.AudioFormatWAV) // Uncompressed

ai.Speak("Hello").Speed(0.5) // Slow (0.25x-4.0x range)
ai.Speak("Hello").Speed(1.0) // Normal (default)
ai.Speak("Hello").Speed(1.5) // Fast
ai.Speak("Hello").Speed(2.0) // Very fast

// High-definition model (slower but better quality)
ai.Speak("Hello").HD().Save("hello_hd.mp3")

ai.Speak("Hello").Model(ai.TTSTTS1) // Standard (default)
ai.Speak("Hello").Model(ai.TTSTTS1HD) // High definition

err := ai.Speak("Welcome to our application!").
Voice(ai.VoiceNova).
Format(ai.AudioFormatMP3).
Speed(1.1).
HD().
Save("welcome.mp3")

Transcribe audio to text:
// From file
text, err := ai.Transcribe("audio.mp3").Do()
// From bytes
text, err := ai.TranscribeBytes(audioData, "audio.mp3").Do()
// From URL
text, err := ai.TranscribeURL("https://example.com/audio.mp3").Do()

text, err := ai.Transcribe("audio.mp3").
Language("en"). // ISO 639-1 code
Do()
// Other languages
ai.Transcribe("audio.mp3").Language("es") // Spanish
ai.Transcribe("audio.mp3").Language("fr") // French
ai.Transcribe("audio.mp3").Language("de") // German
ai.Transcribe("audio.mp3").Language("ja") // Japanese

Provide context to improve accuracy:
text, err := ai.Transcribe("meeting.mp3").
Prompt("This is a technical meeting about Kubernetes").
Do()

resp, err := ai.Transcribe("audio.mp3").
WithTimestamps().
DoWithMeta()
for _, word := range resp.Words {
fmt.Printf("%.2fs - %.2fs: %s\n", word.Start, word.End, word.Word)
}

resp, err := ai.Transcribe("audio.mp3").DoWithMeta()
if err != nil {
panic(err)
}
fmt.Printf("Text: %s\n", resp.Text)
fmt.Printf("Language: %s\n", resp.Language)
fmt.Printf("Duration: %.1f seconds\n", resp.Duration)

ai.Transcribe("audio.mp3").Model(ai.STTWhisper1) // Default, Whisper

// Use OpenAI directly
audio, _ := ai.OpenAI().Speak("Hello").Do()
text, _ := ai.OpenAI().Transcribe("audio.mp3").Do()

// Record user audio -> transcribe -> respond -> speak
userText, _ := ai.Transcribe("user_input.mp3").Do()
response, _ := ai.Claude().Ask(userText)
ai.Speak(response).Save("response.mp3")

resp, err := ai.Transcribe("podcast.mp3").
Language("en").
WithTimestamps().
DoWithMeta()
// Generate transcript with timestamps
for _, word := range resp.Words {
fmt.Printf("[%02d:%02d] %s ",
int(word.Start)/60, int(word.Start)%60, word.Word)
}

chapters := []string{
"Chapter 1: The Beginning...",
"Chapter 2: The Journey...",
}
for i, chapter := range chapters {
ai.Speak(chapter).
Voice(ai.VoiceFable).
HD().
Save(fmt.Sprintf("chapter_%d.mp3", i+1))
}

// Change defaults
ai.DefaultTTSModel = ai.TTSTTS1HD
ai.DefaultSTTModel = ai.STTWhisper1
ai.DefaultVoice = ai.VoiceNova
ai.DefaultAudioFormat = ai.AudioFormatOpus