NVIDIA
diff --git a/‎.github/workflows/tests.yml‎
Lines changed: 6 additions & 0 deletions b/‎.github/workflows/tests.yml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎dataset_configs/english/coraal/config.yaml‎
Lines changed: 1 addition & 1 deletion b/‎dataset_configs/english/coraal/config.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dataset_configs/english/hifitts2/config_22khz.yaml‎
Lines changed: 63 additions & 0 deletions b/‎dataset_configs/english/hifitts2/config_22khz.yaml‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎dataset_configs/english/hifitts2/config_44khz.yaml‎
Lines changed: 64 additions & 0 deletions b/‎dataset_configs/english/hifitts2/config_44khz.yaml‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎dataset_configs/english/hifitts2/config_bandwidth.yaml‎
Lines changed: 44 additions & 0 deletions b/‎dataset_configs/english/hifitts2/config_bandwidth.yaml‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎docker/Dockerfile‎
Lines changed: 4 additions & 1 deletion b/‎docker/Dockerfile‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎docs/src/conf.py‎
Lines changed: 5 additions & 0 deletions b/‎docs/src/conf.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/src/sdp/api.rst‎
Lines changed: 15 additions & 1 deletion b/‎docs/src/sdp/api.rst‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎docs/src/sdp/existing_configs.rst‎
Lines changed: 29 additions & 4 deletions b/‎docs/src/sdp/existing_configs.rst‎
Lines changed: 29 additions & 4 deletions
diff --git a/‎requirements/main.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/main.txt‎
Lines changed: 1 addition & 1 deletion
@@ -75,14 +75,20 @@ jobs:
         pip install nemo-toolkit[asr,nlp]==1.23.0
         pip install nemo_text_processing
         pip install -r requirements/huggingface.txt
+        pip install certifi #this is needed to avoid certificate problems when downloading CORAAL
+        export SSL_CERT_FILE=$(python -m certifi)  # NOTE(review): export only lasts for this step's shell; write it to $GITHUB_ENV if the test step is meant to see it - confirm intent
         python -m pip cache purge
+        
 
     - name: Run all tests
       env:
         AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
         AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
         CLEAN_UP_TMP_PATH: 1
       run: |
+        wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #download the CA certificate manually [for CORAAL]
+        sudo cp incommon-rsa-ca2.pem     /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # copy it into the system CA directory [cert for CORAAL]
+        sudo update-ca-certificates # rebuild the system CA bundle so the new cert is picked up [cert for CORAAL]
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
 
 
@@ -18,7 +18,7 @@ documentation: |
   This config performs the following data processing.
 
   1. Downloads CORAAL data based on the
-     `official file list <http://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_.
+     `official file list <https://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_ (official mirror link).
      There are a couple of errors in the links there, which are fixed in our code.
   2. Drops all utterances which contain only pauses. Set ``drop_pauses=False`` to undo.
   3. Groups all consecutive segments from the same speaker until 20 seconds duration
 
@@ -0,0 +1,63 @@
+documentation: |
+  HiFiTTS-2 22kHz
+  ###############
+  
+  This config can be used to download the audio data for
+  `HiFiTTS-2 22kHz <https://huggingface.co/datasets/nvidia/hifitts-2>`_
+  
+  1. Downloads HiFiTTS-2 audio from LibriVox.
+  2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they
+     were removed from the website) are removed.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored.
+
+  Note that you can customize any part of this config either directly or from command-line.
+ 
+  **Output format**.
+
+  This config outputs 2 manifest files:
+
+  * ``${workspace_dir}/errors_22khz.json`` - entries from the input chapters file which failed to download from LibriVox.
+  * ``${workspace_dir}/manifest_filtered_22khz.json`` - input manifest file without utterances from failed chapters.
+
+processors_to_run: all
+workspace_dir: ???
+manifest_filename: manifest_22khz.json
+output_filename: manifest_filtered_22khz.json
+chapter_filename: chapters_22khz.json
+error_filename: errors_22khz.json
+audio_dir_name: audio_22khz
+chapter_audio_dir_name: chapters
+sample_rate: 22050
+delete_chapter_files: true
+exit_on_error: false
+use_dask: false
+max_workers: 8
+chunksize: 50
+
+input_manifest_file: ${workspace_dir}/${manifest_filename}
+chapter_file: ${workspace_dir}/${chapter_filename}
+error_file: ${workspace_dir}/${error_filename}
+audio_dir: ${workspace_dir}/${audio_dir_name}
+chapter_dir: ${workspace_dir}/${chapter_audio_dir_name}
+final_manifest: ${workspace_dir}/${output_filename}
+
+processors:
+  - _target_: sdp.processors.DownloadHiFiTTS2
+    audio_dir: ${audio_dir}
+    chapter_dir: ${chapter_dir}
+    sample_rate: ${sample_rate}
+    delete_chapter_files: ${delete_chapter_files}
+    exit_on_error: ${exit_on_error}
+    input_manifest_file: ${chapter_file}
+    output_manifest_file: ${error_file}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
+
+  - _target_: sdp.processors.RemovedFailedChapters
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${final_manifest}
+    error_file: ${error_file}
@@ -0,0 +1,64 @@
+documentation: |
+  HiFiTTS-2 44kHz
+  ##################
+  
+  This config can be used to download the audio data for
+  `HiFiTTS-2 44kHz <https://huggingface.co/datasets/nvidia/hifitts-2>`_
+  
+  
+  1. Downloads HiFiTTS-2 audio from LibriVox.
+  2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they
+     were removed from the website) are removed.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored.
+
+  Note that you can customize any part of this config either directly or from command-line.
+ 
+  **Output format**.
+
+  This config outputs 2 manifest files:
+
+  * ``${workspace_dir}/errors_44khz.json`` - entries from the input chapters file which failed to download from LibriVox.
+  * ``${workspace_dir}/manifest_filtered_44khz.json`` - input manifest file without utterances from failed chapters.
+
+processors_to_run: all
+workspace_dir: ???
+manifest_filename: manifest_44khz.json
+output_filename: manifest_filtered_44khz.json
+chapter_filename: chapters_44khz.json
+error_filename: errors_44khz.json
+audio_dir_name: audio_44khz
+chapter_audio_dir_name: chapters
+sample_rate: 44100
+delete_chapter_files: true
+exit_on_error: false
+use_dask: false
+max_workers: 8
+chunksize: 50
+
+input_manifest_file: ${workspace_dir}/${manifest_filename}
+chapter_file: ${workspace_dir}/${chapter_filename}
+error_file: ${workspace_dir}/${error_filename}
+audio_dir: ${workspace_dir}/${audio_dir_name}
+chapter_dir: ${workspace_dir}/${chapter_audio_dir_name}
+final_manifest: ${workspace_dir}/${output_filename}
+
+processors:
+  - _target_: sdp.processors.DownloadHiFiTTS2
+    audio_dir: ${audio_dir}
+    chapter_dir: ${chapter_dir}
+    sample_rate: ${sample_rate}
+    delete_chapter_files: ${delete_chapter_files}
+    exit_on_error: ${exit_on_error}
+    input_manifest_file: ${chapter_file}
+    output_manifest_file: ${error_file}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
+
+  - _target_: sdp.processors.RemovedFailedChapters
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${final_manifest}
+    error_file: ${error_file}
@@ -0,0 +1,44 @@
+documentation: |
+  HiFiTTS-2 Bandwidth Estimation
+  ##############################
+  
+  This config contains the bandwidth estimation code used for HiFiTTS and HiFiTTS-2.
+  This config can be used to estimate bandwidth for any dataset. For HiFiTTS-2 bandwidth
+  was estimated using the first 30 seconds of every audiobook chapter, but the estimate is still
+  reasonably accurate if run over a shorter duration or with individual utterances.
+
+  **Required arguments**.
+
+  * **workspace_dir**: The workspace folder where all audio files and manifests are stored.
+  * **audio_dir_name**: Name of the folder in the workspace containing the audio files to estimate the bandwidth of.
+  * **input_manifest_filename**: Manifest file in workspace containing relative paths to audio.
+ 
+  **Output format**.
+  
+  This config outputs a single manifest with the following field(s):
+
+  * **bandwidth (int)**: Estimated bandwidth of the audio file.
+
+processors_to_run: all
+workspace_dir: ???
+audio_dir_name: ???
+input_manifest_filename: ???
+output_manifest_filename: manifest_bandwidth.json
+audio_key: audio_filepath
+use_dask: false
+max_workers: 1
+chunksize: 1
+
+input_manifest_file: ${workspace_dir}/${input_manifest_filename}
+final_manifest: ${workspace_dir}/${output_manifest_filename}
+audio_dir: ${workspace_dir}/${audio_dir_name}
+
+processors:
+  - _target_: sdp.processors.EstimateBandwidth
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${final_manifest}
+    audio_dir: ${audio_dir}
+    input_audio_key: ${audio_key}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
@@ -21,6 +21,9 @@ RUN apt-get update \
 # Update pip
 RUN pip install --upgrade pip
 
+# Install typing-extensions manually
+RUN pip install typing-extensions
+
 # Clone the NeMo SDP repository
 COPY . /src/NeMo-speech-data-processor
 RUN rm -rf /src/NeMo-speech-data-processor/.git
@@ -34,4 +37,4 @@ RUN find requirements/ -name "*.txt" -exec pip install -r {} \;
 WORKDIR /src/NeMo-speech-data-processor
 
 # Set up entrypoint
-CMD ["bash"]
+CMD ["bash"]
@@ -189,3 +189,8 @@ def setup(app):
 ]
 # nitpick_ignore_regex = [('py:class', '*')]
 
+# Temporary: skip link checking for the CORAAL download list.
+linkcheck_ignore = [
+    r'https://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt',
+]
+# https://lingtools.uoregon.edu/coraal/coraal_download_list.txt
@@ -137,12 +137,24 @@ HuggingFace Datasets
 .. autodata:: sdp.processors.CreateInitialManifestHuggingFace
    :annotation:
 
+
 YTC Datasets
 ''''''''''''
 
 .. autodata:: sdp.processors.datasets.ytc.create_initial_manifest.CreateInitialManifestYTC
    :annotation:
 
+
+HiFiTTS-2
+''''''''''''''''''''
+
+.. autodata:: sdp.processors.DownloadHiFiTTS2
+   :annotation:
+
+.. autodata:: sdp.processors.RemovedFailedChapters
+   :annotation:
+
+
 Lhotse processors
 #################
 
@@ -172,6 +184,9 @@ used in the downstream processing for additional enhancement or filtering.
 .. autodata:: sdp.processors.ASRTransformers
    :annotation:
 
+.. autodata:: sdp.processors.EstimateBandwidth
+   :annotation:
+
 .. autodata:: sdp.processors.tts.pyannote.PyAnnoteDiarizationAndOverlapDetection
    :annotation:
 
@@ -187,7 +202,6 @@ used in the downstream processing for additional enhancement or filtering.
 .. autodata:: sdp.processors.tts.metrics.BandwidthEstimationProcessor
    :annotation:
 
-
 Text-only processors
 ####################
 
 
@@ -366,6 +366,13 @@ Armenian Toloka
    `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/armenian/toloka/pipeline_get_final_res.yaml>`__ |
    :doc:`documentation <config-docs/armenian/toloka/pipeline_get_final_res>`
 
+.. toctree::
+   :hidden:
+
+   config-docs/armenian/toloka/pipeline_start
+   config-docs/armenian/toloka/pipeline_validate_answers
+   config-docs/armenian/toloka/pipeline_get_final_res
+
 YouTube Commons (YTC)
 ~~~~~~~~~~~~~~~~~~~~~~
 
@@ -377,8 +384,26 @@ YouTube Commons (YTC)
 .. toctree::
    :hidden:
 
-   config-docs/armenian/toloka/pipeline_start
-   config-docs/armenian/toloka/pipeline_validate_answers
-   config-docs/armenian/toloka/pipeline_get_final_res
-
    config-docs/tts/ytc/config
+
+HiFiTTS-2
+~~~~~~~~~~~~~~~~~~~~~~~
+
+**Dataset link:** https://huggingface.co/datasets/nvidia/hifitts-2
+
+* **22kHz**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_22khz.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_22khz>`
+* **44kHz**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_44khz.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_44khz>`
+* **Bandwidth Estimation**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_bandwidth.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_bandwidth>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/english/hifitts2/config_22khz
+   config-docs/english/hifitts2/config_44khz
+   config-docs/english/hifitts2/config_bandwidth
@@ -18,7 +18,7 @@ python-docx
 pydub
 dask
 distributed
-
+jiwer>=3.1.0,<4.0.0
 # toloka-kit  # Temporarily disabled due to Toloka's technical pause; keep as reference for past and future API support
 # for some processers, additionally https://github.com/NVIDIA/NeMo is required
 # for some processers, additionally nemo_text_processing is required
Original file line number	Diff line number	Diff line change
`@@ -189,3 +189,8 @@ def setup(app):`
`189`	`189`	`]`
`190`	`190`	`# nitpick_ignore_regex = [('py:class', '*')]`
`191`	`191`
	`192`	`+#adding this especially for coraal, temporary`
	`193`	`+linkcheck_ignore = [`
	`194`	`+ r'https://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt',`
	`195`	`+]`
	`196`	`+# https://lingtools.uoregon.edu/coraal/coraal_download_list.txt`