257 changes: 257 additions & 0 deletions dataset_configs/english/earnings/config.yaml
@@ -0,0 +1,257 @@
documentation: |
Earnings21/22 Dataset Processing Pipeline
#########################################

This configuration implements an 8-step pipeline that converts the Earnings21 and Earnings22
datasets to NeMo manifest format, using forced alignment to produce word- and sentence-level
segments. The pipeline supports both full-dataset processing and evaluation subsets, with
optional speaker segmentation.

**Dataset Overview**

The Earnings21 dataset is a 39-hour corpus of earnings calls containing entity-dense speech
from nine different financial sectors. The Earnings22 dataset provides similar financial
domain content. Both datasets include token-level transcripts with metadata, normalization
candidates, and entity tags.

**Processing Pipeline**

The configuration performs the following eight processing steps (a short sketch of the shared manifest format follows the list):

1. **CreateInitialAudioAndManifest**: Initial audio manifest creation from the dataset files
2. **FfmpegConvert**: Audio format conversion (MP3 → WAV, multi-channel → mono, any sample rate → 16kHz)
3. **CreateFullAudioManifestEarnings21**: Ground-truth text reconstruction from the NLP token
   files, preserving punctuation and capitalization
4. **SubRegex**: Text cleanup that removes unwanted characters and markup patterns
5. **NeMoForcedAligner**: Word-level forced alignment using NeMo ASR models with CTC heads
6. **CreateSentenceSegmentedManifest**: Sentence-level segmentation based on the CTM files,
   with punctuation-aware splitting
7. **SpeakerSegmentedManifest** (optional): Speaker-change detection and segmentation, with
   optional mapping of speaker IDs to names from metadata
8. **KeepOnlySpecifiedFields**: Filtering of the manifest down to the required fields
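
Each processor consumes a manifest written by an earlier step and produces a new one. These
manifests follow the NeMo JSON-lines convention (one JSON object per line). The sketch below is
not part of the pipeline itself; it only illustrates how such a manifest can be read and written,
using the manifest names from this configuration:

.. code-block:: python

    import json

    def read_manifest(path):
        """Yield one dict per line of a NeMo-style JSON-lines manifest."""
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

    def write_manifest(path, entries):
        """Write an iterable of dicts back out as a JSON-lines manifest."""
        with open(path, "w", encoding="utf-8") as f:
            for entry in entries:
                f.write(json.dumps(entry) + "\n")

    # Example: total audio duration recorded in an intermediate manifest.
    total_sec = sum(e["duration"] for e in read_manifest("01_initial_audio_manifest.json"))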

**Required Arguments**

* **output_directory**: Path to the main output directory where all processed files will be stored.
* **dataset_root**: Path to the root directory of the Earnings21 or Earnings22 dataset; derived
paths such as raw_audio_input_dir are built from it by interpolation (see the sketch after this list).
* **dataset_type**: Dataset type, should be "earnings21" or "earnings22".
* **subset**: Dataset subset, should be "full" or "eval10" (earnings21 only). Defaults to "full".
* **forced_alignment_model**: NeMo ASR model for forced alignment with CTC head.
Defaults to "nvidia/parakeet-tdt_ctc-1.1b".
* **preserve_punctuation**: Whether to preserve punctuation in text. Defaults to true.
* **preserve_capitalization**: Whether to preserve capitalization in text. Defaults to true.
* **include_speaker_info**: Whether to include speaker information in segments. Defaults to true.
* **include_tags**: Whether to include entity tags (earnings21 only). Defaults to false.
* **use_speaker_metadata_csv**: Whether to map speaker IDs to names from speaker-metadata.csv
(earnings21 only). Defaults to false.
* **device**: Device for forced alignment ("cuda" or "cpu"). Defaults to "cuda".
* **test_mode**: Set to true to process only 2 files for testing. Defaults to false.
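
The ${...} references used throughout the configuration below (for example,
raw_audio_input_dir: ${dataset_root}/media) are OmegaConf/Hydra interpolations resolved at run
time. A minimal sketch of the mechanism, assuming only the omegaconf package:

.. code-block:: python

    from omegaconf import OmegaConf

    # Hypothetical values; in practice they come from the required arguments above.
    cfg = OmegaConf.create(
        {
            "dataset_root": "/data/earnings21",
            "raw_audio_input_dir": "${dataset_root}/media",
        }
    )
    print(cfg.raw_audio_input_dir)  # -> /data/earnings21/media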

**Output Format**

The pipeline generates multiple intermediate manifests and a final filtered manifest:

**Step 1 Output** (Full audio manifest):

.. code-block:: json

    {
      "audio_filepath": "/path/to/dataset/media/file_id.wav",
      "duration": 1800.0,
      "text": "",
      "file_id": "original_file_id"
    }

**Step 2 Output** (Converted audio):

.. code-block:: json

    {
      "audio_filepath": "/path/to/output/converted_audio/file_id.wav",
      "duration": 1800.0,
      "text": "",
      "file_id": "original_file_id"
    }

**Step 3 Output** (Full audio with text):

.. code-block:: json

    {
      "audio_filepath": "/path/to/output/converted_audio/file_id.wav",
      "duration": 1800.0,
      "text": "Complete transcribed text with punctuation and capitalization.",
      "file_id": "original_file_id"
    }

**Step 6 Output** (Sentence-level segments - Primary Output):

.. code-block:: json

    {
      "audio_filepath": "/path/to/output/converted_audio/file_id.wav",
      "duration": 15.2,
      "text": "This is a complete sentence with proper punctuation.",
      "file_id": "original_file_id",
      "segment_id": 0,
      "offset": 45.3,
      "end_time": 60.5,
      "alignment": [
        {"word": "This", "start": 45.3, "end": 45.6},
        {"word": "is", "start": 45.6, "end": 45.8}
      ]
    }
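
In the example above the word-level alignment times appear to lie on the same timeline as the
segment's offset (the first word starts at the segment offset). Under that assumption, which the
sample output suggests but this documentation does not guarantee, segment-relative word timings
can be derived with a small sketch like this:

.. code-block:: python

    def segment_relative_alignment(entry):
        """Shift word timings so that 0.0 corresponds to the start of the segment.

        Assumes entry["alignment"] shares the full-recording timeline with entry["offset"],
        as the sample output above suggests.
        """
        offset = entry["offset"]
        return [
            {"word": w["word"], "start": w["start"] - offset, "end": w["end"] - offset}
            for w in entry["alignment"]
        ]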

**Step 7 Output** (Speaker-level segments - Optional):

.. code-block:: json

    {
      "audio_filepath": "/path/to/output/converted_audio/file_id.wav",
      "duration": 0,
      "text": "Speaker segment text...",
      "file_id": "original_file_id",
      "segment_id": 0,
      "start_time": null,
      "end_time": null,
      "speaker": "speaker_1"
    }

**Final Output** (Filtered manifest):

.. code-block:: json

    {
      "audio_filepath": "/path/to/output/converted_audio/file_id.wav",
      "duration": 15.2,
      "offset": 45.3,
      "text": "This is a complete sentence with proper punctuation."
    }
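
The final manifest locates each segment inside the full converted recording via offset and
duration, so downstream code has to cut the audio itself. A minimal sketch of doing that with the
third-party soundfile package (an assumption for illustration; the pipeline itself does not use it):

.. code-block:: python

    import json

    import soundfile as sf  # assumed third-party dependency, not used by the pipeline

    def load_segment(entry):
        """Return the waveform for one final-manifest entry, cut by offset/duration."""
        info = sf.info(entry["audio_filepath"])
        start = int(entry["offset"] * info.samplerate)
        stop = start + int(entry["duration"] * info.samplerate)
        return sf.read(entry["audio_filepath"], start=start, stop=stop)

    with open("08_final_filtered_manifest.json", encoding="utf-8") as f:
        first_entry = json.loads(f.readline())

    audio, sample_rate = load_segment(first_entry)
    print(first_entry["text"], len(audio) / sample_rate)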

**Usage Examples**

Process Earnings21 full dataset:

.. code-block:: bash

    python main.py --config-path=dataset_configs/english/earnings --config-name=config \
        dataset_type=earnings21 \
        dataset_root=/path/to/earnings21 \
        output_directory=/path/to/output

Process Earnings22 with custom model:

.. code-block:: bash

    python main.py --config-path=dataset_configs/english/earnings --config-name=config \
        dataset_type=earnings22 \
        forced_alignment_model=nvidia/parakeet-tdt_ctc-1.1b \
        dataset_root=/path/to/earnings22 \
        output_directory=/path/to/output

Process Earnings21 Eval-10 subset:

.. code-block:: bash

    python main.py --config-path=dataset_configs/english/earnings --config-name=config \
        dataset_type=earnings21 \
        subset=eval10 \
        dataset_root=/path/to/earnings21 \
        output_directory=/path/to/output

**Key Features**

* Supports both Earnings21 and Earnings22 datasets
* Automatic audio format conversion (MP3/WAV → 16kHz mono WAV)
* Word-level forced alignment using NeMo ASR models
* Sentence-level segmentation based on punctuation patterns
* Optional speaker-level segmentation with metadata mapping
* Optional entity-tag annotations for Earnings21 (include_tags)
* Configurable text processing (punctuation/capitalization preservation)
* Test mode for development and debugging

processors_to_run: "0:"

output_directory: ??
dataset_root: ??
raw_audio_input_dir: ${dataset_root}/media

dataset_type: "earnings21"
subset: "full"
test_mode: false

use_dask: false

preserve_punctuation: true
preserve_capitalization: true

include_speaker_info: true
include_tags: false
use_speaker_metadata_csv: false

forced_alignment_model: nvidia/parakeet-tdt_ctc-1.1b
device: "cuda"

processors:
- _target_: sdp.processors.datasets.earnings.CreateInitialAudioAndManifest
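  # Step 1: scan the dataset and build the initial audio manifest (paths and durations, empty text)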
  dataset_root: ${dataset_root}
  raw_audio_source_dir: ${raw_audio_input_dir}
  output_manifest_file: ${output_directory}/01_initial_audio_manifest.json
  dataset_type: ${dataset_type}
  subset: ${subset}
  test_mode: ${test_mode}

- _target_: sdp.processors.FfmpegConvert
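  # Step 2: convert source audio to 16kHz mono WAV files under converted_audio/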
  input_manifest_file: ${output_directory}/01_initial_audio_manifest.json
  output_manifest_file: ${output_directory}/02_converted_audio_manifest.json
  converted_audio_dir: ${output_directory}/converted_audio
  input_file_key: audio_filepath
  output_file_key: audio_filepath
  output_format: wav
  target_samplerate: 16000
  target_nchannels: 1

- _target_: sdp.processors.datasets.earnings.CreateFullAudioManifestEarnings21
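  # Step 3: reconstruct the full ground-truth text from the dataset's NLP token files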
  input_manifest_file: ${output_directory}/02_converted_audio_manifest.json
  dataset_root: ${dataset_root}
  output_manifest_file: ${output_directory}/03_full_audio_with_text_manifest.json
  dataset_type: ${dataset_type}
  preserve_punctuation: ${preserve_punctuation}
  preserve_capitalization: ${preserve_capitalization}

- _target_: sdp.processors.SubRegex
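  # Step 4: clean the transcripts (strip stray symbols and angle-/square-bracket markup)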
  input_manifest_file: ${output_directory}/03_full_audio_with_text_manifest.json
  output_manifest_file: ${output_directory}/04_full_audio_with_text_manifest_cleaned.json
  regex_params_list:
    - {"pattern": "[…+×]", "repl": ""}
    - {"pattern": "<.*?>", "repl": ""}
    - {"pattern": "\\[.*?\\]", "repl": ""}

- _target_: sdp.processors.datasets.earnings.NeMoForcedAligner
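  # Step 5: word-level forced alignment; CTM files are written under forced_alignment_output/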
  input_manifest_file: ${output_directory}/04_full_audio_with_text_manifest_cleaned.json
  output_manifest_file: ${output_directory}/05_aligned_manifest.json
  output_dir: ${output_directory}/forced_alignment_output
  pretrained_name: ${forced_alignment_model}
  device: ${device}
  batch_size: 1

- _target_: sdp.processors.datasets.earnings.CreateSentenceSegmentedManifest
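  # Step 6: sentence-level segmentation driven by the word CTMs produced in step 5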
  input_manifest_file: ${output_directory}/05_aligned_manifest.json
  ctm_dir: ${output_directory}/forced_alignment_output/ctm/words
  output_manifest_file: ${output_directory}/06_sentence_segmented_manifest.json

- _target_: sdp.processors.datasets.earnings.SpeakerSegmentedManifest
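  # Step 7 (optional output): speaker-level segmentation, built from the cleaned manifest of step 4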
  input_manifest_file: ${output_directory}/04_full_audio_with_text_manifest_cleaned.json
  dataset_root: ${dataset_root}
  output_manifest_file: ${output_directory}/07_speaker_segmented_manifest.json
  dataset_type: ${dataset_type}
  preserve_punctuation: ${preserve_punctuation}
  preserve_capitalization: ${preserve_capitalization}
  include_speaker_info: ${include_speaker_info}
  include_tags: ${include_tags}
  use_speaker_metadata_csv: ${use_speaker_metadata_csv}

- _target_: sdp.processors.KeepOnlySpecifiedFields
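  # Step 8: keep only the required fields, taken from the sentence-level manifest of step 6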
  input_manifest_file: ${output_directory}/06_sentence_segmented_manifest.json
  output_manifest_file: ${output_directory}/08_final_filtered_manifest.json
  fields_to_keep: ["audio_filepath", "duration", "offset", "text"]
21 changes: 21 additions & 0 deletions docs/src/sdp/api.rst
@@ -96,6 +96,27 @@ UzbekVoice
.. autodata:: sdp.processors.CreateInitialManifestUzbekvoice
:annotation:

Earnings21/22
'''''''''''''

.. autodata:: sdp.processors.datasets.earnings.CreateInitialAudioAndManifest
:annotation:

.. autodata:: sdp.processors.datasets.earnings.CreateFullAudioManifestEarnings21
:annotation:

.. autodata:: sdp.processors.datasets.earnings.SpeakerSegmentedManifest
:annotation:

.. autodata:: sdp.processors.datasets.earnings.CreateSentenceSegmentedManifest
:annotation:

.. autodata:: sdp.processors.datasets.earnings.NeMoForcedAligner
:annotation:

.. autodata:: sdp.processors.datasets.earnings.ApplyEarnings21Normalizations
:annotation:


MASC
''''''
7 changes: 7 additions & 0 deletions sdp/processors/__init__.py
@@ -21,6 +21,13 @@
CreateInitialManifestCORAAL,
TrainDevTestSplitCORAAL,
)
from sdp.processors.datasets.earnings import (
CreateInitialAudioAndManifest,
CreateFullAudioManifestEarnings21,
SpeakerSegmentedManifest,
CreateSentenceSegmentedManifest,
ApplyEarnings21Normalizations,
)
from sdp.processors.datasets.fleurs.create_initial_manifest import (
CreateInitialManifestFleurs,
)
24 changes: 24 additions & 0 deletions sdp/processors/datasets/earnings/__init__.py
@@ -0,0 +1,24 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sdp.processors.datasets.earnings.create_initial_manifest import (
CreateInitialAudioAndManifest,
CreateFullAudioManifestEarnings21,
SpeakerSegmentedManifest,
CreateSentenceSegmentedManifest,
NeMoForcedAligner,
)
from sdp.processors.datasets.earnings.apply_normalizations import (
ApplyEarnings21Normalizations,
)