38 changes: 38 additions & 0 deletions README.md
@@ -131,6 +131,44 @@ To generate TTS benchmarks, there are various scripts for the individual providers
python elevenlabs_stream_benchmark.py "Haikus I find tricky, With a 5-7-5 count, But I'll give it a go"
```

#### To run the TTS benchmark suite (ElevenLabs, Cartesia, PlayHT)

1. Ensure you have Poetry installed.
2. Set up the following environment variables or provide them as command-line arguments:
- ELEVEN_API_KEY
- CARTESIA_API_KEY
- PLAYHT_API_KEY
- PLAYHT_USER_ID

3. Run the benchmark using:

```
poetry run python tts_benchmark_suite.py "Your text here" [--eleven-api-key KEY] [--cartesia-api-key KEY] [--playht-api-key KEY] [--playht-user-id ID]
```

Example:

```
poetry run python tts_benchmark_suite.py "It's simple: Overspecialize, and you breed in weakness. It's slow death." --eleven-api-key YOUR_ELEVEN_KEY --cartesia-api-key YOUR_CARTESIA_KEY --playht-api-key YOUR_PLAYHT_KEY --playht-user-id YOUR_PLAYHT_USER_ID
```

or

```
poetry run python tts_benchmark_suite.py "It's simple: Overspecialize, and you breed in weakness. It's slow death."
```

Note: If you provide the API keys and user ID as command-line arguments, they will override any existing environment variables for that run.
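
Conceptually, the precedence is simple: use the command-line flag if given, otherwise fall back to the environment variable. A minimal sketch of that resolution for one key, using `argparse` (illustrative, not the suite's exact code):

```
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("text")
parser.add_argument("--eleven-api-key")
args = parser.parse_args()

# A flag passed on the command line wins; otherwise fall back to the environment.
eleven_api_key = args.eleven_api_key or os.environ.get("ELEVEN_API_KEY")
if not eleven_api_key:
    parser.error("set ELEVEN_API_KEY or pass --eleven-api-key")
```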

#### Output

```
(TTFU) time to first utterance: 316.48ms <---- Time from the start of the outbound request to the first audio chunk received
Average chunk latency: 234.13ms <---- Average time between receiving consecutive audio chunks
Total chunks received: 12 <---- Number of audio chunks received for the entire request
Total processing time: 2809.58ms <---- Total time from the start of the request to the final audio packet being received
```
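
All four numbers can be derived from a single list of chunk-arrival timestamps taken against the request start. A minimal sketch of that computation (`stream` is assumed to yield audio chunks as they arrive; this mirrors the metrics above rather than the suite's exact code):

```
import time

def report(stream):
    start = time.perf_counter()
    arrivals = [time.perf_counter() - start for _chunk in stream]

    ttfu = arrivals[0] * 1000  # request start -> first chunk, in ms
    gaps = [b - a for a, b in zip(arrivals, arrivals[1:])]
    avg_gap = (sum(gaps) / len(gaps) * 1000) if gaps else 0.0
    total = arrivals[-1] * 1000  # request start -> final chunk, in ms

    print(f"(TTFU) time to first utterance: {ttfu:.2f}ms")
    print(f"Average chunk latency: {avg_gap:.2f}ms")
    print(f"Total chunks received: {len(arrivals)}")
    print(f"Total processing time: {total:.2f}ms")
```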

#### Playing audio

By default, only timing information for TTS is emitted. Follow the steps below to actually play back the received audio.
173 changes: 121 additions & 52 deletions elevenlabs_api_benchmark.py
@@ -1,14 +1,15 @@
import requests
import json
import time
import os
import argparse
import asyncio
import websockets
import base64
import json
import logging
import os
import time
from typing import Iterator

import requests
import websockets

logging.basicConfig(level=logging.INFO)

# Defaults for both scripts
@@ -20,7 +21,7 @@
DEFAULT_OUTPUT_FORMAT = "mp3_44100"
DEFAULT_STABILITY = 0.5
DEFAULT_SIMILARITY_BOOST = False
DEFAULT_XI_API_KEY = os.environ["ELEVEN_API_KEY"],
DEFAULT_XI_API_KEY = os.environ["ELEVEN_API_KEY"]  # a trailing comma here would turn the key into a 1-tuple

# Configuration for HTTP API
DEFAULT_CHUNK_SIZE = 7868
@@ -35,42 +36,70 @@
# Argument parsing
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description='''\
description="""\
This script benchmarks the 11Labs text-to-speech API, for any combination of parameters, to find the lowest achievable latency.
''')

API_group = parser.add_argument_group('API Type')
API_group.add_argument("--API", choices=["http", "websocket"], required=True,
help="API type: 'http' or 'websocket'")

input_group = parser.add_argument_group('Input Parameters')
input_group.add_argument("--text", default=DEFAULT_TEXT,
help="Input text for speech synthesis")
input_group.add_argument("--model", default=DEFAULT_MODEL_ID,
help="Model ID for speech synthesis. Options: 'eleven_monolingual_v1', 'eleven_english_v2', 'eleven_multilingual_v1', 'eleven_multilingual_v2'")

output_group = parser.add_argument_group('Output Parameters')
output_group.add_argument("--num_samples", type=int, default=DEFAULT_SAMPLES,
help="Number of speech samples to generate")
output_group.add_argument("--output_format", default=DEFAULT_OUTPUT_FORMAT,
help="Speech output format. Options: 'mp3_44100', 'pcm_16000', 'pcm_22050', 'pcm_24000', 'pcm_44100'")

http_group = parser.add_argument_group('HTTP API Parameters')
http_group.add_argument("--chunk_size", type=int, default=DEFAULT_CHUNK_SIZE,
help="Size of the first playable chunk in bytes, default is 7868")

websocket_group = parser.add_argument_group('WebSocket API Parameters')
websocket_group.add_argument("--latency_optimizer", type=int, default=DEFAULT_LATENCY_OPTIMIZER,
help="Latency optimization level. Default is 4. Lower to 3 or less to improve pronunciation of numbers and dates.")
websocket_group.add_argument("--text_chunker", action="store_true", default=False,
help="Enable text chunker for input streaming. This chunks text blocks and sets last char to space, simulating the default behavior of the 11labs Library.")

general_group = parser.add_argument_group('General Parameters')
general_group.add_argument("--voice_id", default=DEFAULT_VOICE_ID,
help="ID of the voice for speech synthesis")
""",
)

args = parser.parse_args()
API_group = parser.add_argument_group("API Type")
API_group.add_argument(
"--API",
choices=["http", "websocket"],
required=True,
help="API type: 'http' or 'websocket'",
)

input_group = parser.add_argument_group("Input Parameters")
input_group.add_argument(
"--text", default=DEFAULT_TEXT, help="Input text for speech synthesis"
)
input_group.add_argument(
"--model",
default=DEFAULT_MODEL_ID,
help="Model ID for speech synthesis. Options: 'eleven_monolingual_v1', 'eleven_english_v2', 'eleven_multilingual_v1', 'eleven_multilingual_v2'",
)

output_group = parser.add_argument_group("Output Parameters")
output_group.add_argument(
"--num_samples",
type=int,
default=DEFAULT_SAMPLES,
help="Number of speech samples to generate",
)
output_group.add_argument(
"--output_format",
default=DEFAULT_OUTPUT_FORMAT,
help="Speech output format. Options: 'mp3_44100', 'pcm_16000', 'pcm_22050', 'pcm_24000', 'pcm_44100'",
)

http_group = parser.add_argument_group("HTTP API Parameters")
http_group.add_argument(
"--chunk_size",
type=int,
default=DEFAULT_CHUNK_SIZE,
help="Size of the first playable chunk in bytes, default is 7868",
)

websocket_group = parser.add_argument_group("WebSocket API Parameters")
websocket_group.add_argument(
"--latency_optimizer",
type=int,
default=DEFAULT_LATENCY_OPTIMIZER,
help="Latency optimization level. Default is 4. Lower to 3 or less to improve pronunciation of numbers and dates.",
)
websocket_group.add_argument(
"--text_chunker",
action="store_true",
default=False,
help="Enable text chunker for input streaming. This chunks text blocks and sets last char to space, simulating the default behavior of the 11labs Library.",
)

general_group = parser.add_argument_group("General Parameters")
general_group.add_argument(
"--voice_id", default=DEFAULT_VOICE_ID, help="ID of the voice for speech synthesis"
)

args = parser.parse_args()


# Text chunker function
@@ -91,6 +120,7 @@ def text_chunker(text: str) -> Iterator[str]:
if buffer != "":
yield buffer + " "
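
# For reference, a self-contained sketch of a splitter-based chunker of this
# kind (the full body is collapsed in the diff above; the exact splitter set
# below is an assumption modeled on the 11labs client library):
def text_chunker_sketch(text: str) -> Iterator[str]:
    splitters = (".", ",", "?", "!", ";", ":", " ")
    buffer = ""
    for char in text:
        buffer += char
        if char in splitters:
            # Emit as soon as the buffer ends on a natural break.
            yield buffer
            buffer = ""
    if buffer != "":
        yield buffer + " "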


# Simulate text stream function
def simulate_text_stream():
"""
@@ -111,6 +141,7 @@ def simulate_text_stream():
time.sleep(delay_time)
yield text_chunk


# Truncate audio string function
def truncate_audio_string(audio_string):
"""
@@ -120,6 +151,7 @@ def truncate_audio_string(audio_string):
return audio_string[:max_length] + "..."
return audio_string


# HTTP API request function
def http_api_request():
url = f"https://api.elevenlabs.io/v1/text-to-speech/{args.voice_id}/stream?optimize_streaming_latency={args.latency_optimizer}"
@@ -131,14 +163,19 @@ def http_api_request():
data = {
"text": args.text,
"model_id": args.model,
"voice_settings": {"stability": DEFAULT_STABILITY, "similarity_boost": DEFAULT_SIMILARITY_BOOST},
"voice_settings": {
"stability": DEFAULT_STABILITY,
"similarity_boost": DEFAULT_SIMILARITY_BOOST,
},
}
response_latencies = []
chunk_latencies = []
for i in range(args.num_samples):
print(f"\nAPI Call {i+1}:")
start_time = time.perf_counter()
response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)
response = requests.post(
url, headers=headers, data=json.dumps(data), stream=True
)
if not response.ok:
print("Error: " + response.json()["detail"]["message"])
exit(1)
@@ -150,19 +187,25 @@ def http_api_request():
for chunk in response.iter_content(chunk_size=DEFAULT_CHUNK_SIZE):
if chunk:
audio_data += chunk
if len(audio_data) >= args.chunk_size:
chunk_received_time = time.perf_counter()
chunk_latency = (chunk_received_time - start_time) * 1000
chunk_latencies.append(chunk_latency)

print(f" First Playable Chunk (Body) Time: {chunk_latency:.2f} ms")
break

average_response_latency = sum(response_latencies) / len(response_latencies)
median_response_latency = sorted(response_latencies)[len(response_latencies) // 2]
average_chunk_latency = sum(chunk_latencies) / len(chunk_latencies)
median_chunk_latency = sorted(chunk_latencies)[len(chunk_latencies) // 2]
return average_response_latency, median_response_latency, average_chunk_latency, median_chunk_latency
return (
average_response_latency,
median_response_latency,
average_chunk_latency,
median_chunk_latency,
)


async def websocket_api_request():
logging.basicConfig(level=logging.INFO) # Configure logging inside the function
@@ -210,7 +253,9 @@ async def websocket_api_request():
chunk_received_time = time.time()
if not first_chunk_received:
first_chunk_received = True
first_chunk_time = chunk_received_time - start_time # Calculate the time from the request to the first chunk
first_chunk_time = (
chunk_received_time - start_time
) # Calculate the time from the request to the first chunk
chunk_times.append(chunk_received_time - connection_open_time)
except asyncio.TimeoutError:
pass
@@ -234,20 +279,44 @@
break
connection_close_time = time.time()
total_time_websocket_was_open = connection_close_time - connection_open_time
return time_to_open_connection, first_chunk_time, chunk_times, total_time_websocket_was_open
return (
time_to_open_connection,
first_chunk_time,
chunk_times,
total_time_websocket_was_open,
)


# Main function
if args.API == "http":
average_response_latency, median_response_latency, average_chunk_latency, median_chunk_latency = http_api_request()
print(f"\nAverage Initial Response (Header) Time: {average_response_latency:.2f} ms")
(
average_response_latency,
median_response_latency,
average_chunk_latency,
median_chunk_latency,
) = http_api_request()
print(
f"\nAverage Initial Response (Header) Time: {average_response_latency:.2f} ms"
)
print(f"Median Initial Response (Header) Time: {median_response_latency:.2f} ms")
print(f"Average First Playable Chunk (Body) Time: {average_chunk_latency:.2f} ms")
print(f"Median First Playable Chunk (Body) Time: {median_chunk_latency:.2f} ms")
elif args.API == "websocket":
time_to_open_connection, first_chunk_time, chunk_times, total_time_websocket_was_open = asyncio.run(websocket_api_request())
(
time_to_open_connection,
first_chunk_time,
chunk_times,
total_time_websocket_was_open,
) = asyncio.run(websocket_api_request())
print(f"\nTime to open connection: {time_to_open_connection:.4f} seconds")
if first_chunk_time is not None:
print(f"Time from request to first chunk: {first_chunk_time:.4f} seconds") # Updated print statement
print(
f"Time from request to first chunk: {first_chunk_time:.4f} seconds"
) # Updated print statement
for i, chunk_time in enumerate(chunk_times, start=1):
print(f"Time to receive chunk {i} after request: {chunk_time:.4f} seconds") # Updated print statement
print(f"Total time WebSocket connection was open: {total_time_websocket_was_open:.4f} seconds")
print(
f"Time to receive chunk {i} after request: {chunk_time:.4f} seconds"
) # Updated print statement
print(
f"Total time WebSocket connection was open: {total_time_websocket_was_open:.4f} seconds"
)
20 changes: 11 additions & 9 deletions elevenlabs_stream_benchmark.py
@@ -1,15 +1,18 @@
import requests
import argparse
import json
import time
import os
import argparse
import time

import requests

DEFAULT_SAMPLES = 10
DEFAULT_TEXT = "I'm calling for Jim."
DEFAULT_MODEL_ID = "eleven_monolingual_v1"
DEFAULT_CHUNK_SIZE = 7868 #This defines the size of the first playable chunk in bytes, which is 7868, roughly equivalent to half a second of audio
DEFAULT_LATENCY_OPTIMIZER = 4 # This can be set to values 1 through 4, with 4 disabling the text normalizer
DEFAULT_VOICE_ID = "flq6f7yk4E4fJM5XTYuZ"
DEFAULT_CHUNK_SIZE = 7868 # This defines the size of the first playable chunk in bytes, which is 7868, roughly equivalent to half a second of audio
DEFAULT_LATENCY_OPTIMIZER = (
4 # This can be set to values 1 through 4, with 4 disabling the text normalizer
)
DEFAULT_VOICE_ID = "flq6f7yk4E4fJM5XTYuZ"
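# Back-of-envelope check for DEFAULT_CHUNK_SIZE (assuming ~128 kbps MP3, i.e.
# 16,000 bytes per second): 0.5 s of audio is about 16,000 * 0.5 = 8,000 bytes,
# in line with the 7868-byte default above.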

parser = argparse.ArgumentParser()
parser.add_argument("text", nargs="?", default=DEFAULT_TEXT)
@@ -77,14 +80,14 @@
for chunk in response.iter_content(chunk_size=1024):
if chunk:
audio_data += chunk
if len(audio_data) >= args.chunk_size:
chunk_received_time = time.perf_counter()
chunk_latency = (chunk_received_time - start_time) * 1000
chunk_latencies.append(chunk_latency)
print(f" First Playable Chunk (Body) Time: {chunk_latency:.2f} ms")
break

with open(f'audio_sample_{i+1}.mp3', 'wb') as f:
with open(f"audio_sample_{i+1}.mp3", "wb") as f:
f.write(audio_data)

average_response_latency = sum(response_latencies) / len(response_latencies)
@@ -96,4 +99,3 @@
median_chunk_latency = sorted(chunk_latencies)[len(chunk_latencies) // 2]
print(f"\nAverage First Playable Chunk (Body) Time: {average_chunk_latency:.2f} ms")
print(f"Median First Playable Chunk (Body) Time: {median_chunk_latency:.2f} ms")

11 changes: 6 additions & 5 deletions elevenlabs_ws_benchmark.py
@@ -1,12 +1,13 @@
import argparse
import asyncio
import websockets
import json
import base64
import time
import json
import logging
from typing import Iterator
import os
import argparse
import time
from typing import Iterator

import websockets

# Read some settings from command line
parser = argparse.ArgumentParser()