38 changes: 38 additions & 0 deletions README.md
@@ -131,6 +131,44 @@ To generate TTS benchmarks, there are various scripts for the individual providers
python elevenlabs_stream_benchmark.py "Haikus I find tricky, With a 5-7-5 count, But I'll give it a go"
```

#### To run the TTS benchmark suite (ElevenLabs, Cartesia, PlayHT)

1. Ensure you have Poetry installed.
2. Set up the following environment variables or provide them as command-line arguments:
- ELEVEN_API_KEY
- CARTESIA_API_KEY
- PLAYHT_API_KEY
- PLAYHT_USER_ID

3. Run the benchmark using:

```
poetry run python tts_benchmark_suite.py "Your text here" [--eleven-api-key KEY] [--cartesia-api-key KEY] [--playht-api-key KEY] [--playht-user-id ID]
```

Example:

```
poetry run python tts_benchmark_suite.py "It's simple: Overspecialize, and you breed in weakness. It's slow death." --eleven-api-key YOUR_ELEVEN_KEY --cartesia-api-key YOUR_CARTESIA_KEY --playht-api-key YOUR_PLAYHT_KEY --playht-user-id YOUR_PLAYHT_USER_ID
```

or

```
poetry run python tts_benchmark_suite.py "It's simple: Overspecialize, and you breed in weakness. It's slow death."
```

Note: If you provide the API keys and user ID as command-line arguments, they will override any existing environment variables for that run.
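
Conceptually, the precedence is simple: use the command-line flag if given, otherwise fall back to the environment variable. A minimal sketch of that resolution for one key, using `argparse` (illustrative, not the suite's exact code):

```
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("text")
parser.add_argument("--eleven-api-key")
args = parser.parse_args()

# A flag passed on the command line wins; otherwise fall back to the environment.
eleven_api_key = args.eleven_api_key or os.environ.get("ELEVEN_API_KEY")
if not eleven_api_key:
    parser.error("set ELEVEN_API_KEY or pass --eleven-api-key")
```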

#### Output

```
(TTFU) time to first utterance: 316.48ms <---- Time from the start of the outbound request to the first audio chunk received
Average chunk latency: 234.13ms <---- Average time between receiving consecutive audio chunks
Total chunks received: 12 <---- Number of audio chunks received for the entire request
Total processing time: 2809.58ms <---- Total time from the start of the request to the final audio packet being received
```
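
All four numbers can be derived from a single list of chunk-arrival timestamps taken against the request start. A minimal sketch of that computation (`stream` is assumed to yield audio chunks as they arrive; this mirrors the metrics above rather than the suite's exact code):

```
import time

def report(stream):
    start = time.perf_counter()
    arrivals = [time.perf_counter() - start for _chunk in stream]

    ttfu = arrivals[0] * 1000  # request start -> first chunk, in ms
    gaps = [b - a for a, b in zip(arrivals, arrivals[1:])]
    avg_gap = (sum(gaps) / len(gaps) * 1000) if gaps else 0.0
    total = arrivals[-1] * 1000  # request start -> final chunk, in ms

    print(f"(TTFU) time to first utterance: {ttfu:.2f}ms")
    print(f"Average chunk latency: {avg_gap:.2f}ms")
    print(f"Total chunks received: {len(arrivals)}")
    print(f"Total processing time: {total:.2f}ms")
```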

#### Playing audio

By default, only timing information for TTS is emitted. Follow the steps below to actually play back the received audio.
173 changes: 121 additions & 52 deletions elevenlabs_api_benchmark.py
@@ -1,14 +1,15 @@
import requests
import json
import time
import os
import argparse
import asyncio
import websockets
import base64
import json
import logging
import os
import time
from typing import Iterator

import requests
import websockets

logging.basicConfig(level=logging.INFO)

# Defaults for both scripts
@@ -20,7 +21,7 @@
DEFAULT_OUTPUT_FORMAT = "mp3_44100"
DEFAULT_STABILITY = 0.5
DEFAULT_SIMILARITY_BOOST = False
DEFAULT_XI_API_KEY = os.environ["ELEVEN_API_KEY"],
DEFAULT_XI_API_KEY = os.environ["ELEVEN_API_KEY"]  # a trailing comma here would turn the key into a 1-tuple

# Configuration for HTTP API
DEFAULT_CHUNK_SIZE = 7868
@@ -35,42 +36,70 @@
# Argument parsing
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description='''\
description="""\
This script benchmarks the 11Labs text-to-speech API, for any combination of parameters, to find the lowest achievable latency.
''')

API_group = parser.add_argument_group('API Type')
API_group.add_argument("--API", choices=["http", "websocket"], required=True,
help="API type: 'http' or 'websocket'")

input_group = parser.add_argument_group('Input Parameters')
input_group.add_argument("--text", default=DEFAULT_TEXT,
help="Input text for speech synthesis")
input_group.add_argument("--model", default=DEFAULT_MODEL_ID,
help="Model ID for speech synthesis. Options: 'eleven_monolingual_v1', 'eleven_english_v2', 'eleven_multilingual_v1', 'eleven_multilingual_v2'")

output_group = parser.add_argument_group('Output Parameters')
output_group.add_argument("--num_samples", type=int, default=DEFAULT_SAMPLES,
help="Number of speech samples to generate")
output_group.add_argument("--output_format", default=DEFAULT_OUTPUT_FORMAT,
help="Speech output format. Options: 'mp3_44100', 'pcm_16000', 'pcm_22050', 'pcm_24000', 'pcm_44100'")

http_group = parser.add_argument_group('HTTP API Parameters')
http_group.add_argument("--chunk_size", type=int, default=DEFAULT_CHUNK_SIZE,
help="Size of the first playable chunk in bytes, default is 7868")

websocket_group = parser.add_argument_group('WebSocket API Parameters')
websocket_group.add_argument("--latency_optimizer", type=int, default=DEFAULT_LATENCY_OPTIMIZER,
help="Latency optimization level. Default is 4. Lower to 3 or less to improve pronunciation of numbers and dates.")
websocket_group.add_argument("--text_chunker", action="store_true", default=False,
help="Enable text chunker for input streaming. This chunks text blocks and sets last char to space, simulating the default behavior of the 11labs Library.")

general_group = parser.add_argument_group('General Parameters')
general_group.add_argument("--voice_id", default=DEFAULT_VOICE_ID,
help="ID of the voice for speech synthesis")
""",
)

args = parser.parse_args()
API_group = parser.add_argument_group("API Type")
API_group.add_argument(
"--API",
choices=["http", "websocket"],
required=True,
help="API type: 'http' or 'websocket'",
)

input_group = parser.add_argument_group("Input Parameters")
input_group.add_argument(
"--text", default=DEFAULT_TEXT, help="Input text for speech synthesis"
)
input_group.add_argument(
"--model",
default=DEFAULT_MODEL_ID,
help="Model ID for speech synthesis. Options: 'eleven_monolingual_v1', 'eleven_english_v2', 'eleven_multilingual_v1', 'eleven_multilingual_v2'",
)

output_group = parser.add_argument_group("Output Parameters")
output_group.add_argument(
"--num_samples",
type=int,
default=DEFAULT_SAMPLES,
help="Number of speech samples to generate",
)
output_group.add_argument(
"--output_format",
default=DEFAULT_OUTPUT_FORMAT,
help="Speech output format. Options: 'mp3_44100', 'pcm_16000', 'pcm_22050', 'pcm_24000', 'pcm_44100'",
)

http_group = parser.add_argument_group("HTTP API Parameters")
http_group.add_argument(
"--chunk_size",
type=int,
default=DEFAULT_CHUNK_SIZE,
help="Size of the first playable chunk in bytes, default is 7868",
)

websocket_group = parser.add_argument_group("WebSocket API Parameters")
websocket_group.add_argument(
"--latency_optimizer",
type=int,
default=DEFAULT_LATENCY_OPTIMIZER,
help="Latency optimization level. Default is 4. Lower to 3 or less to improve pronunciation of numbers and dates.",
)
websocket_group.add_argument(
"--text_chunker",
action="store_true",
default=False,
help="Enable text chunker for input streaming. This chunks text blocks and sets last char to space, simulating the default behavior of the 11labs Library.",
)

general_group = parser.add_argument_group("General Parameters")
general_group.add_argument(
"--voice_id", default=DEFAULT_VOICE_ID, help="ID of the voice for speech synthesis"
)

args = parser.parse_args()


# Text chunker function
@@ -91,6 +120,7 @@ def text_chunker(text: str) -> Iterator[str]:
if buffer != "":
yield buffer + " "
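
# For reference, a self-contained sketch of a splitter-based chunker of this
# kind (the full body is collapsed in the diff above; the exact splitter set
# below is an assumption modeled on the 11labs client library):
def text_chunker_sketch(text: str) -> Iterator[str]:
    splitters = (".", ",", "?", "!", ";", ":", " ")
    buffer = ""
    for char in text:
        buffer += char
        if char in splitters:
            # Emit as soon as the buffer ends on a natural break.
            yield buffer
            buffer = ""
    if buffer != "":
        yield buffer + " "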


# Simulate text stream function
def simulate_text_stream():
"""
@@ -111,6 +141,7 @@ def simulate_text_stream():
time.sleep(delay_time)
yield text_chunk


# Truncate audio string function
def truncate_audio_string(audio_string):
"""
@@ -120,6 +151,7 @@ def truncate_audio_string(audio_string):
return audio_string[:max_length] + "..."
return audio_string


# HTTP API request function
def http_api_request():
url = f"https://api.elevenlabs.io/v1/text-to-speech/{args.voice_id}/stream?optimize_streaming_latency={args.latency_optimizer}"
@@ -131,14 +163,19 @@ def http_api_request():
data = {
"text": args.text,
"model_id": args.model,
"voice_settings": {"stability": DEFAULT_STABILITY, "similarity_boost": DEFAULT_SIMILARITY_BOOST},
"voice_settings": {
"stability": DEFAULT_STABILITY,
"similarity_boost": DEFAULT_SIMILARITY_BOOST,
},
}
response_latencies = []
chunk_latencies = []
for i in range(args.num_samples):
print(f"\nAPI Call {i+1}:")
start_time = time.perf_counter()
response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)
response = requests.post(
url, headers=headers, data=json.dumps(data), stream=True
)
if not response.ok:
print("Error: " + response.json()["detail"]["message"])
exit(1)
@@ -150,19 +187,25 @@ def http_api_request():
for chunk in response.iter_content(chunk_size=DEFAULT_CHUNK_SIZE):
if chunk:
audio_data += chunk
if len(audio_data) >= args.chunk_size:
chunk_received_time = time.perf_counter()
chunk_latency = (chunk_received_time - start_time) * 1000
chunk_latencies.append(chunk_latency)

print(f" First Playable Chunk (Body) Time: {chunk_latency:.2f} ms")
break

average_response_latency = sum(response_latencies) / len(response_latencies)
median_response_latency = sorted(response_latencies)[len(response_latencies) // 2]
average_chunk_latency = sum(chunk_latencies) / len(chunk_latencies)
median_chunk_latency = sorted(chunk_latencies)[len(chunk_latencies) // 2]
return average_response_latency, median_response_latency, average_chunk_latency, median_chunk_latency
return (
average_response_latency,
median_response_latency,
average_chunk_latency,
median_chunk_latency,
)


async def websocket_api_request():
logging.basicConfig(level=logging.INFO) # Configure logging inside the function
@@ -210,7 +253,9 @@ async def websocket_api_request():
chunk_received_time = time.time()
if not first_chunk_received:
first_chunk_received = True
first_chunk_time = chunk_received_time - start_time # Calculate the time from the request to the first chunk
first_chunk_time = (
chunk_received_time - start_time
) # Calculate the time from the request to the first chunk
chunk_times.append(chunk_received_time - connection_open_time)
except asyncio.TimeoutError:
pass
@@ -234,20 +279,44 @@
break
connection_close_time = time.time()
total_time_websocket_was_open = connection_close_time - connection_open_time
return time_to_open_connection, first_chunk_time, chunk_times, total_time_websocket_was_open
return (
time_to_open_connection,
first_chunk_time,
chunk_times,
total_time_websocket_was_open,
)


# Main function
if args.API == "http":
average_response_latency, median_response_latency, average_chunk_latency, median_chunk_latency = http_api_request()
print(f"\nAverage Initial Response (Header) Time: {average_response_latency:.2f} ms")
(
average_response_latency,
median_response_latency,
average_chunk_latency,
median_chunk_latency,
) = http_api_request()
print(
f"\nAverage Initial Response (Header) Time: {average_response_latency:.2f} ms"
)
print(f"Median Initial Response (Header) Time: {median_response_latency:.2f} ms")
print(f"Average First Playable Chunk (Body) Time: {average_chunk_latency:.2f} ms")
print(f"Median First Playable Chunk (Body) Time: {median_chunk_latency:.2f} ms")
elif args.API == "websocket":
time_to_open_connection, first_chunk_time, chunk_times, total_time_websocket_was_open = asyncio.run(websocket_api_request())
(
time_to_open_connection,
first_chunk_time,
chunk_times,
total_time_websocket_was_open,
) = asyncio.run(websocket_api_request())
print(f"\nTime to open connection: {time_to_open_connection:.4f} seconds")
if first_chunk_time is not None:
print(f"Time from request to first chunk: {first_chunk_time:.4f} seconds") # Updated print statement
print(
f"Time from request to first chunk: {first_chunk_time:.4f} seconds"
) # Updated print statement
for i, chunk_time in enumerate(chunk_times, start=1):
print(f"Time to receive chunk {i} after request: {chunk_time:.4f} seconds") # Updated print statement
print(f"Total time WebSocket connection was open: {total_time_websocket_was_open:.4f} seconds")
print(
f"Time to receive chunk {i} after request: {chunk_time:.4f} seconds"
) # Updated print statement
print(
f"Total time WebSocket connection was open: {total_time_websocket_was_open:.4f} seconds"
)
20 changes: 11 additions & 9 deletions elevenlabs_stream_benchmark.py
@@ -1,15 +1,18 @@
import requests
import argparse
import json
import time
import os
import argparse
import time

import requests

DEFAULT_SAMPLES = 10
DEFAULT_TEXT = "I'm calling for Jim."
DEFAULT_MODEL_ID = "eleven_monolingual_v1"
DEFAULT_CHUNK_SIZE = 7868 #This defines the size of the first playable chunk in bytes, which is 7868, roughly equivalent to half a second of audio
DEFAULT_LATENCY_OPTIMIZER = 4 # This can be set to values 1 through 4, with 4 disabling the text normalizer
DEFAULT_VOICE_ID = "flq6f7yk4E4fJM5XTYuZ"
DEFAULT_CHUNK_SIZE = 7868 # This defines the size of the first playable chunk in bytes, which is 7868, roughly equivalent to half a second of audio
DEFAULT_LATENCY_OPTIMIZER = (
4 # This can be set to values 1 through 4, with 4 disabling the text normalizer
)
DEFAULT_VOICE_ID = "flq6f7yk4E4fJM5XTYuZ"
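# Back-of-envelope check for DEFAULT_CHUNK_SIZE (assuming ~128 kbps MP3, i.e.
# 16,000 bytes per second): 0.5 s of audio is about 16,000 * 0.5 = 8,000 bytes,
# in line with the 7868-byte default above.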

parser = argparse.ArgumentParser()
parser.add_argument("text", nargs="?", default=DEFAULT_TEXT)
@@ -77,14 +80,14 @@
for chunk in response.iter_content(chunk_size=1024):
if chunk:
audio_data += chunk
if len(audio_data) >= args.chunk_size:
chunk_received_time = time.perf_counter()
chunk_latency = (chunk_received_time - start_time) * 1000
chunk_latencies.append(chunk_latency)
print(f" First Playable Chunk (Body) Time: {chunk_latency:.2f} ms")
break

with open(f'audio_sample_{i+1}.mp3', 'wb') as f:
with open(f"audio_sample_{i+1}.mp3", "wb") as f:
f.write(audio_data)

average_response_latency = sum(response_latencies) / len(response_latencies)
@@ -96,4 +99,3 @@
median_chunk_latency = sorted(chunk_latencies)[len(chunk_latencies) // 2]
print(f"\nAverage First Playable Chunk (Body) Time: {average_chunk_latency:.2f} ms")
print(f"Median First Playable Chunk (Body) Time: {median_chunk_latency:.2f} ms")

11 changes: 6 additions & 5 deletions elevenlabs_ws_benchmark.py
@@ -1,12 +1,13 @@
import argparse
import asyncio
import websockets
import json
import base64
import time
import json
import logging
from typing import Iterator
import os
import argparse
import time
from typing import Iterator

import websockets

# Read some settings from command line
parser = argparse.ArgumentParser()