Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {

public static final String CONFIG_PREFIX = "spring.ai.openai.audio.speech";

public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.TTS_1.getValue();
public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue();

private static final Float SPEED = 1.0f;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public class OpenAiAudioTranscriptionProperties extends OpenAiParentProperties {

public static final String CONFIG_PREFIX = "spring.ai.openai.audio.transcription";

public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.WhisperModel.WHISPER_1.getValue();
public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue();

private static final Double DEFAULT_TEMPERATURE = 0.7;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel
public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) {
this(audioApi,
OpenAiAudioSpeechOptions.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue())
.responseFormat(AudioResponseFormat.MP3)
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ public class OpenAiAudioTranscriptionModel implements TranscriptionModel {
public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi) {
this(audioApi,
OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.model(OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue())
.responseFormat(OpenAiAudioApi.TranscriptResponseFormat.JSON)
.temperature(0.7f)
.build());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import reactor.core.publisher.Mono;

import org.springframework.ai.model.ApiKey;
import org.springframework.ai.model.ChatModelDescription;
import org.springframework.ai.model.NoopApiKey;
import org.springframework.ai.model.SimpleApiKey;
import org.springframework.ai.openai.api.common.OpenAiApiConstants;
Expand All @@ -50,6 +51,7 @@
* @author Ilayaperumal Gopinathan
* @author Jonghoon Park
* @author Filip Hrisafov
* @author Alexandros Pappas
* @since 0.8.1
*/
public class OpenAiAudioApi {
Expand Down Expand Up @@ -224,18 +226,18 @@ public String getFilename() {
* different model variants: tts-1 is optimized for real-time text-to-speech use cases
* and tts-1-hd is optimized for quality. These models can be used with the Speech
* endpoint in the Audio API. Reference:
* <a href="https://platform.openai.com/docs/models/tts">TTS</a>
* <a href="https://platform.openai.com/docs/models#tts">TTS</a>
*/
public enum TtsModel {

// @formatter:off
/**
* The latest text to speech model, optimized for speed.
* Text-to-speech model optimized for speed
*/
@JsonProperty("tts-1")
TTS_1("tts-1"),
/**
* The latest text to speech model, optimized for quality.
* Text-to-speech model optimized for quality.
*/
@JsonProperty("tts-1-hd")
TTS_1_HD("tts-1-hd"),
Expand Down Expand Up @@ -266,6 +268,7 @@ public String getValue() {
* v2-large model is currently available through our API with the whisper-1 model
* name.
*/
@Deprecated
public enum WhisperModel {

// @formatter:off
Expand All @@ -285,6 +288,45 @@ public String getValue() {

}

/**
 * Models that can be selected for the transcriptions API. Reference:
 * <a href="https://platform.openai.com/docs/models#transcription">Transcription models</a>
 */
public enum TranscriptionModels implements ChatModelDescription {

	/** Speech-to-text model powered by GPT-4o. */
	@JsonProperty("gpt-4o-transcribe")
	GPT_4O_TRANSCRIBE("gpt-4o-transcribe"),

	/** Speech-to-text model powered by GPT-4o mini. */
	@JsonProperty("gpt-4o-mini-transcribe")
	GPT_4O_MINI_TRANSCRIBE("gpt-4o-mini-transcribe"),

	/** General-purpose speech recognition model. */
	@JsonProperty("whisper-1")
	WHISPER_1("whisper-1");

	/** Model identifier string; identical to the constant's {@code @JsonProperty} name. */
	public final String value;

	TranscriptionModels(String modelId) {
		this.value = modelId;
	}

	/**
	 * Returns the model identifier string of this constant.
	 * @return the same string stored in {@link #value}
	 */
	public String getValue() {
		return this.value;
	}

	/**
	 * Returns the model name, which for this enum is the model identifier string.
	 * @return the same string as {@link #getValue()}
	 */
	@Override
	public String getName() {
		return getValue();
	}

}

/**
* The format of the transcript and translation outputs, in one of these options:
* json, text, srt, verbose_json, or vtt. Defaults to json.
Expand Down Expand Up @@ -437,7 +479,7 @@ public String getValue() {
*/
public static class Builder {

private String model = TtsModel.TTS_1.getValue();
private String model = TtsModel.GPT_4_O_MINI_TTS.getValue();

private String input;

Expand Down Expand Up @@ -556,7 +598,7 @@ public static class Builder {

private String fileName;

private String model = WhisperModel.WHISPER_1.getValue();
private String model = TranscriptionModels.WHISPER_1.getValue();

private String language;

Expand Down Expand Up @@ -659,7 +701,7 @@ public static class Builder {

private String fileName;

private String model = WhisperModel.WHISPER_1.getValue();
private String model = TranscriptionModels.WHISPER_1.getValue();

private String prompt;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest;
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.Voice;
import org.springframework.ai.openai.api.OpenAiAudioApi.StructuredResponse;
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionModels;
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionRequest;
import org.springframework.ai.openai.api.OpenAiAudioApi.TranslationRequest;
import org.springframework.ai.openai.api.OpenAiAudioApi.TtsModel;
import org.springframework.ai.openai.api.OpenAiAudioApi.WhisperModel;
import org.springframework.util.FileCopyUtils;

import static org.assertj.core.api.Assertions.assertThat;
Expand All @@ -52,7 +52,7 @@ void speechTranscriptionAndTranslation() throws IOException {

byte[] speech = this.audioApi
.createSpeech(SpeechRequest.builder()
.model(TtsModel.TTS_1_HD.getValue())
.model(TtsModel.GPT_4_O_MINI_TTS.getValue())
.input("Hello, my name is Chris and I love Spring A.I.")
.voice(Voice.ONYX.getValue())
.build())
Expand All @@ -64,7 +64,7 @@ void speechTranscriptionAndTranslation() throws IOException {

StructuredResponse translation = this.audioApi
.createTranslation(TranslationRequest.builder()
.model(WhisperModel.WHISPER_1.getValue())
.model(TranscriptionModels.WHISPER_1.getValue())
.file(speech)
.fileName("speech.mp3")
.build(), StructuredResponse.class)
Expand All @@ -74,7 +74,7 @@ void speechTranscriptionAndTranslation() throws IOException {

StructuredResponse transcriptionEnglish = this.audioApi
.createTranscription(TranscriptionRequest.builder()
.model(WhisperModel.WHISPER_1.getValue())
.model(TranscriptionModels.WHISPER_1.getValue())
.file(speech)
.fileName("speech.mp3")
.build(), StructuredResponse.class)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public class OpenAiAudioModelNoOpApiKeysIT {
void checkNoOpKey() {
assertThatThrownBy(() -> this.audioApi
.createSpeech(OpenAiAudioApi.SpeechRequest.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue())
.input("Hello, my name is Chris and I love Spring A.I.")
.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX.getValue())
.build())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() {
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
speechOptions);
Expand Down Expand Up @@ -100,7 +100,7 @@ void speechRateLimitTest() {
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
speechOptions);
Expand All @@ -120,7 +120,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();

SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
Expand All @@ -142,7 +142,7 @@ void speechVoicesTest(String voice) {
.voice(voice)
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
speechOptions);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ void aiResponseContainsImageResponseMetadata() {
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
.speed(SPEED)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();

SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ The prefix `spring.ai.openai.audio.speech` is used as the property prefix that l
| spring.ai.openai.audio.speech.api-key | The API Key | -
| spring.ai.openai.audio.speech.organization-id | Optionally, you can specify which organization is used for an API request. | -
| spring.ai.openai.audio.speech.project-id | Optionally, you can specify which project is used for an API request. | -
| spring.ai.openai.audio.speech.options.model | ID of the model to use for generating the audio. For OpenAI's TTS API, use one of the available models: tts-1 or tts-1-hd. | tts-1
| spring.ai.openai.audio.speech.options.model | ID of the model to use for generating the audio. Available models: `gpt-4o-mini-tts` (default, optimized for speed and cost), `gpt-4o-tts` (higher quality), `tts-1` (legacy, optimized for speed), or `tts-1-hd` (legacy, optimized for quality). | gpt-4o-mini-tts
| spring.ai.openai.audio.speech.options.voice | The voice to use for synthesis. For OpenAI's TTS API, use one of the available voices for the chosen model: alloy, echo, fable, onyx, nova, and shimmer. | alloy
| spring.ai.openai.audio.speech.options.response-format | The format of the audio output. Supported formats are mp3, opus, aac, flac, wav, and pcm. | mp3
| spring.ai.openai.audio.speech.options.speed | The speed of the voice synthesis. The acceptable range is from 0.25 (slowest) to 4.0 (fastest). | 1.0
Expand All @@ -107,7 +107,7 @@ For example:
[source,java]
----
OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
.model("tts-1")
.model("gpt-4o-mini-tts")
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.speed(1.0f)
Expand Down Expand Up @@ -153,7 +153,7 @@ var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(openAiAudioApi);
var speechOptions = OpenAiAudioSpeechOptions.builder()
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.speed(1.0f)
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();

var speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", speechOptions);
Expand Down Expand Up @@ -181,7 +181,7 @@ OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
.speed(1.0f)
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
.model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value)
.build();

SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ The prefix `spring.ai.openai.audio.transcription` is used as the property prefix
| spring.ai.openai.audio.transcription.api-key | The API Key | -
| spring.ai.openai.audio.transcription.organization-id | Optionally, you can specify which organization is used for an API request. | -
| spring.ai.openai.audio.transcription.project-id | Optionally, you can specify which project is used for an API request. | -
| spring.ai.openai.audio.transcription.options.model | ID of the model to use. Only whisper-1 (which is powered by our open source Whisper V2 model) is currently available. | whisper-1
| spring.ai.openai.audio.transcription.options.model | ID of the model to use for transcription. Available models: `gpt-4o-transcribe` (speech-to-text powered by GPT-4o), `gpt-4o-mini-transcribe` (speech-to-text powered by GPT-4o mini), or `whisper-1` (general-purpose speech recognition model, default). | whisper-1
| spring.ai.openai.audio.transcription.options.response-format | The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt. | json
| spring.ai.openai.audio.transcription.options.prompt | An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language. |
| spring.ai.openai.audio.transcription.options.language | The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency. |
Expand Down