Skip to content
Arian Amiramjadi edited this page Dec 24, 2025 · 1 revision

Audio (TTS & STT)

Text-to-speech and speech-to-text support for voice applications.

Text-to-Speech (TTS)

Convert text to spoken audio:

// Quick: generate and save
err := ai.Speak("Hello, world!").Save("hello.mp3")

// Get raw audio bytes
audio, err := ai.Speak("Hello, world!").Do()

Voices

ai.Speak("Hello").Voice(ai.VoiceAlloy)   // Default, neutral
ai.Speak("Hello").Voice(ai.VoiceNova)    // Female
ai.Speak("Hello").Voice(ai.VoiceOnyx)    // Male, deep
ai.Speak("Hello").Voice(ai.VoiceEcho)    // Male
ai.Speak("Hello").Voice(ai.VoiceFable)   // Expressive
ai.Speak("Hello").Voice(ai.VoiceShimmer) // Female, warm

Audio Formats

ai.Speak("Hello").Format(ai.AudioFormatMP3)  // Default
ai.Speak("Hello").Format(ai.AudioFormatOpus) // Smaller files, suited to web use
ai.Speak("Hello").Format(ai.AudioFormatAAC)  // Apple devices
ai.Speak("Hello").Format(ai.AudioFormatFLAC) // Lossless
ai.Speak("Hello").Format(ai.AudioFormatWAV)  // Uncompressed

Speed Control

ai.Speak("Hello").Speed(0.5)  // Slow (valid range: 0.25x-4.0x)
ai.Speak("Hello").Speed(1.0)  // Normal (default)
ai.Speak("Hello").Speed(1.5)  // Fast
ai.Speak("Hello").Speed(2.0)  // Very fast

HD Quality

// High-definition model (slower but better quality)
ai.Speak("Hello").HD().Save("hello_hd.mp3")

TTS Models

ai.Speak("Hello").Model(ai.TTSTTS1)     // Standard (default)
ai.Speak("Hello").Model(ai.TTSTTS1HD)   // High definition

Full Example

err := ai.Speak("Welcome to our application!").
    Voice(ai.VoiceNova).
    Format(ai.AudioFormatMP3).
    Speed(1.1).
    HD().
    Save("welcome.mp3")

Speech-to-Text (STT)

Transcribe audio to text:

// From file
text, err := ai.Transcribe("audio.mp3").Do()

// From bytes
text, err := ai.TranscribeBytes(audioData, "audio.mp3").Do()

// From URL
text, err := ai.TranscribeURL("https://example.com/audio.mp3").Do()

Language Hints

text, err := ai.Transcribe("audio.mp3").
    Language("en").  // ISO 639-1 code
    Do()

// Other languages
ai.Transcribe("audio.mp3").Language("es")  // Spanish
ai.Transcribe("audio.mp3").Language("fr")  // French
ai.Transcribe("audio.mp3").Language("de")  // German
ai.Transcribe("audio.mp3").Language("ja")  // Japanese

Context/Prompt

Provide context to improve accuracy:

text, err := ai.Transcribe("meeting.mp3").
    Prompt("This is a technical meeting about Kubernetes").
    Do()

Word-Level Timestamps

resp, err := ai.Transcribe("audio.mp3").
    WithTimestamps().
    DoWithMeta()
if err != nil {
    panic(err)
}

for _, word := range resp.Words {
    fmt.Printf("%.2fs - %.2fs: %s\n", word.Start, word.End, word.Word)
}

Response Metadata

resp, err := ai.Transcribe("audio.mp3").DoWithMeta()
if err != nil {
    panic(err)
}

fmt.Printf("Text: %s\n", resp.Text)
fmt.Printf("Language: %s\n", resp.Language)
fmt.Printf("Duration: %.1f seconds\n", resp.Duration)

STT Models

ai.Transcribe("audio.mp3").Model(ai.STTWhisper1)  // Default, Whisper

Provider-Specific

// Use OpenAI directly
audio, _ := ai.OpenAI().Speak("Hello").Do()
text, _ := ai.OpenAI().Transcribe("audio.mp3").Do()

Use Cases

Voice Assistant

// Record user audio -> transcribe -> respond -> speak
userText, _ := ai.Transcribe("user_input.mp3").Do()
response, _ := ai.Claude().Ask(userText)
ai.Speak(response).Save("response.mp3")

Podcast Transcription

resp, err := ai.Transcribe("podcast.mp3").
    Language("en").
    WithTimestamps().
    DoWithMeta()
if err != nil {
    panic(err)
}

// Generate transcript with timestamps
for _, word := range resp.Words {
    fmt.Printf("[%02d:%02d] %s ", 
        int(word.Start)/60, int(word.Start)%60, word.Word)
}

Audiobook Generation

chapters := []string{
    "Chapter 1: The Beginning...",
    "Chapter 2: The Journey...",
}

for i, chapter := range chapters {
    err := ai.Speak(chapter).
        Voice(ai.VoiceFable).
        HD().
        Save(fmt.Sprintf("chapter_%d.mp3", i+1))
    if err != nil {
        panic(err)
    }
}

Defaults

// Change defaults
ai.DefaultTTSModel = ai.TTSTTS1HD
ai.DefaultSTTModel = ai.STTWhisper1
ai.DefaultVoice = ai.VoiceNova
ai.DefaultAudioFormat = ai.AudioFormatOpus

Clone this wiki locally