Fixes and improvements for recognize_assemblyai() method:

patrickloeber · patrickloeber · commit 4e9ad6d60870 · 2023-10-20T12:20:42.000-04:00
- Adds support for `AudioData` instance.
  Before, it only worked with a path to a file
- Add more error handling
- Removes inner `read_file` function since the requests module
  automatically handles chunking
- Removes "content-type" from header since this is not needed
- Add docstring
- Add example code snippet in `examples/audio_transcribe.py`
- List AssemblyAI in README
diff --git a/README.rst b/README.rst
@@ -30,6 +30,7 @@ Speech recognition engine/API support:
 * `CMU Sphinx <http://cmusphinx.sourceforge.net/wiki/>`__ (works offline)
 * Google Speech Recognition
 * `Google Cloud Speech API <https://cloud.google.com/speech/>`__
+* `AssemblyAI API <https://www.assemblyai.com/>`__
 * `Wit.ai <https://wit.ai/>`__
 * `Microsoft Azure Speech <https://azure.microsoft.com/en-us/services/cognitive-services/speech/>`__
 * `Microsoft Bing Voice Recognition (Deprecated) <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__
@@ -202,7 +203,7 @@ The solution is to decrease this threshold, or call ``recognizer_instance.adjust
 The recognizer doesn't understand my particular language/dialect.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Try setting the recognition language to your language/dialect. To do this, see the documentation for ``recognizer_instance.recognize_sphinx``, ``recognizer_instance.recognize_google``, ``recognizer_instance.recognize_wit``, ``recognizer_instance.recognize_bing``, ``recognizer_instance.recognize_api``, ``recognizer_instance.recognize_houndify``, and ``recognizer_instance.recognize_ibm``.
+Try setting the recognition language to your language/dialect. To do this, see the documentation for ``recognizer_instance.recognize_sphinx``, ``recognizer_instance.recognize_google``, ``recognizer_instance.recognize_wit``, ``recognizer_instance.recognize_bing``, ``recognizer_instance.recognize_api``, ``recognizer_instance.recognize_houndify``, ``recognizer_instance.recognize_ibm``, and ``recognizer_instance.recognize_assemblyai``.
 
 For example, if your language/dialect is British English, it is better to use ``"en-GB"`` as the language rather than ``"en-US"``.
 
diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py
@@ -87,3 +87,22 @@
     print("IBM Speech to Text could not understand audio")
 except sr.RequestError as e:
     print("Could not request results from IBM Speech to Text service; {0}".format(e))
+
+# recognize speech using the AssemblyAI API
+ASSEMBLYAI_API_TOKEN = "INSERT ASSEMBLYAI API TOKEN HERE"  # Get a free token at https://www.assemblyai.com/
+try:
+    r.recognize_assemblyai(audio, api_token=ASSEMBLYAI_API_TOKEN)
+except sr.TranscriptionNotReady as e:
+    job_name = e.job_name
+except sr.TranscriptionFailed as e:
+    print(e)
+except sr.RequestError as e:
+    print("Could not request results from AssemblyAI service; {0}".format(e))
+
+# wait a little bit, then query the transcript with the job_name...
+try:
+    print("AssemblyAI thinks you said " + r.recognize_assemblyai(audio_data=None, api_token=ASSEMBLYAI_API_TOKEN, job_name=job_name)[0])
+except sr.TranscriptionFailed as e:
+    print(e)
+except sr.RequestError as e:
+    print("Could not request results from AssemblyAI service; {0}".format(e))
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -1289,33 +1289,51 @@ def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, sec
 
     def recognize_assemblyai(self, audio_data, api_token, job_name=None, **kwargs):
         """
-        Wraps the AssemblyAI STT service.
+        Performs speech recognition using the AssemblyAI API.
+
         https://www.assemblyai.com/
+
+        Args:
+            audio_data: Can be an ``AudioData`` instance or a str with a path to a file.
+            api_token: An AssemblyAI API token.
+            job_name: The name of the job which corresponds to the transcription id. If no job_name is given, it submits the file for transcription
+                      and raises a ``speech_recognition.TranscriptionNotReady`` exception. The final transcript can then be queried at a later time
+                      by passing the job_name.
+
+        Raises a ``speech_recognition.TranscriptionFailed`` exception if the speech recognition operation failed or if the API token isn't valid.
+        Raises a ``speech_recognition.RequestError`` exception if API requests failed, e.g. if there is no internet connection.
+
+        Example:
+        ```
+        try:
+            r.recognize_assemblyai(audio_data=audio, api_token=your_token)
+        except sr.TranscriptionNotReady as e:
+            job_name = e.job_name
+
+        # wait a little bit...
+        result = r.recognize_assemblyai(audio_data=None, api_token=your_token, job_name=job_name)
+        ```
         """
 
-        def read_file(filename, chunk_size=5242880):
-            with open(filename, 'rb') as _file:
-                while True:
-                    data = _file.read(chunk_size)
-                    if not data:
-                        break
-                    yield data
+        headers = {"authorization": api_token}
 
         check_existing = audio_data is None and job_name
         if check_existing:
             # Query status.
             transciption_id = job_name
             endpoint = f"https://api.assemblyai.com/v2/transcript/{transciption_id}"
-            headers = {
-                "authorization": api_token,
-            }
-            response = requests.get(endpoint, headers=headers)
+
+            try:
+                response = requests.get(endpoint, headers=headers)
+            except requests.exceptions.RequestException as e:
+                raise RequestError("recognition request failed: {}".format(e.reason))
+
             data = response.json()
             status = data['status']
 
             if status == 'error':
                 # Handle error.
-                exc = TranscriptionFailed()
+                exc = TranscriptionFailed("Transcription failed: {}".format(data["error"]))
                 exc.job_name = None
                 exc.file_key = None
                 raise exc
@@ -1332,24 +1350,52 @@ def read_file(filename, chunk_size=5242880):
             exc.file_key = None
             raise exc
         else:
-            # Upload file.
-            headers = {'authorization': api_token}
-            response = requests.post('https://api.assemblyai.com/v2/upload',
-                                     headers=headers,
-                                     data=read_file(audio_data))
-            upload_url = response.json()['upload_url']
+            # Upload file and queue for transcription.
+            # This path raises a TranscriptionNotReady error whose job_name attribute holds the transcription id.
+            # The job_name can then be passed in a later call to query the transcript.
+            if isinstance(audio_data, AudioData):
+                # convert to flac first
+                upload_data = audio_data.get_flac_data(
+                    convert_rate=None if audio_data.sample_rate >= 8000 else 8000,  # audio samples should be at least 8 kHz
+                    convert_width=None if audio_data.sample_width >= 2 else 2  # audio samples should be at least 16-bit
+                )
+            else:
+                # assume audio_data is a path to a file that can be uploaded directly
+                upload_data = audio_data
+
+            try:
+                response = requests.post('https://api.assemblyai.com/v2/upload',
+                                         headers=headers,
+                                         data=upload_data)
+            except requests.exceptions.RequestException as e:
+                raise RequestError("recognition request failed: {}".format(e.reason))
+
+            data = response.json()
+            if "error" in data:
+                exc = TranscriptionFailed("Transcription failed: {}".format(data["error"]))
+                exc.job_name = None
+                exc.file_key = None
+                raise exc
+
+            upload_url = data['upload_url']
 
             # Queue file for transcription.
             endpoint = "https://api.assemblyai.com/v2/transcript"
-            json = {
-              "audio_url": upload_url
-            }
-            headers = {
-                "authorization": api_token,
-                "content-type": "application/json"
-            }
-            response = requests.post(endpoint, json=json, headers=headers)
+            json = { "audio_url": upload_url }
+
+            try:
+                response = requests.post(endpoint, json=json, headers=headers)
+            except requests.exceptions.RequestException as e:
+                raise RequestError("recognition request failed: {}".format(e.reason))
+
             data = response.json()
+
+            if "error" in data:
+                exc = TranscriptionFailed("Transcription failed: {}".format(data["error"]))
+                exc.job_name = None
+                exc.file_key = None
+                raise exc
+
             transciption_id = data['id']
             exc = TranscriptionNotReady()
             exc.job_name = transciption_id