UMass-Rescue · nmj7 · Apr 18, 2025 · May 5, 2025 · May 8, 2025 · May 9, 2025
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,11 +22,14 @@ onnxruntime = "1.21.0"
 opencv-python = ">=4.11.0.86,<5.0.0.0"
 ollama = ">=0.4.7,<0.5.0"
 pypdf2 = ">=3.0.1,<4.0.0"
+ffmpeg-python = "^0.2.0"
+pyannote-audio = "3.3.2"
 
 rb-lib = { path = "src/rb-lib", develop = true }
 
 file-utils = { path = "src/file-utils", develop = true }
 doc-parser = { path = "src/doc-parser", develop = true }
+Audio-Diarization = { path = "src/Audio-Diarization-Transcription", develop = true }
 audio-transcription = { path = "src/audio-transcription", develop = true }
 age-and-gender-detection = { path = "src/age_and_gender_detection", develop = true }
 text-summary = {path = "src/text-summary", develop = true}

diff --git a/rescuebox/plugins/__init__.py b/rescuebox/plugins/__init__.py
@@ -6,7 +6,10 @@
     APP_NAME as AUDIO_APP_NAME,
 )  # type: ignore
 from text_summary.main import app as text_summary_app, APP_NAME as TEXT_SUM_APP_NAME  # type: ignore
-
+from Audio_Diarization.model_3endpoints import (
+    app as diarize_transcribe_app,
+    APP_NAME as diarize_transcribe_app_name,
+)
 from age_and_gender_detection.main import app as age_gender_app, APP_NAME as AGE_GENDER_APP_NAME  # type: ignore
 
 # Import plugin modules
@@ -28,6 +31,11 @@ class RescueBoxPlugin:
     RescueBoxPlugin(
         audio_transcription_app, AUDIO_APP_NAME, "Audio transcription library"
     ),
+    RescueBoxPlugin(
+        diarize_transcribe_app,
+        diarize_transcribe_app_name,
+        "Speaker Diarization + Transcription service",
+    ),
     RescueBoxPlugin(age_gender_app, AGE_GENDER_APP_NAME, "Age and Gender Classifier"),
     RescueBoxPlugin(text_summary_app, TEXT_SUM_APP_NAME, "Text summarization library"),
 ]

diff --git a/src/Audio-Diarization-Transcription/Audio_Diarization/README.md b/src/Audio-Diarization-Transcription/Audio_Diarization/README.md
@@ -0,0 +1,164 @@
+# Audio-Diarization
+
+
+
+Speaker Diarization – Identifying and separating speakers in an audio file, transcribing the speech with timestamps and speaker labels.
+
+
+
+This process aids child rescue efforts by distinguishing victim and abuser voices, providing crucial evidence for court proceedings.
+
+
+
+## Installation
+
+
+
+1.  **Clone the Repository**:
+
+
+
+```bash
+
+git clone https://github.com/UMass-Rescue/Audio-Diarization.git
+
+cd Audio-Diarization
+
+```
+
+
+
+2.  **Install Dependencies:**
+
+
+
+For the best results, create a virtual environment. You can use any method to create a virtual environment!
+
+
+
+One of the ways to create a virtual environment is listed below
+
+
+
+```bash
+
+python -m venv <virtual_env_name>
+
+```
+
+
+
+Activate the virtual environment
+
+```bash
+
+source <virtual_env_name>/bin/activate
+
+```
+
+
+
+Install the required Python packages using the following command:
+
+
+
+```bash
+
+pip install -r requirements.txt
+
+```
+
+3.  **Access the model**
+
+```bash
+
+huggingface-cli login
+
+```
+
+You will be prompted to enter your access token, which you can find at https://huggingface.co/settings/tokens
+
+<img  width="937"  alt="diarization_accesstoken"  src="https://github.com/user-attachments/assets/5e766cd7-45ef-4b2b-8d80-cc608d86e77c"  />
+
+
+(In case there are issues with the token, you can contact us and one of us will provide it to you!)
+
+
+4.  **Running the Flask-ML Server**
+
+Start the Flask-ML server to work with RescueBox for predictions:
+
+
+
+```bash
+
+python transcribe_diarize_app.py
+
+```
+
+The server will start running on 127.0.0.1:5000
+
+
+
+5.  **Download and run RescueBox Desktop from the following link: [Rescue Box Desktop](https://github.com/UMass-Rescue/RescueBox-Desktop/releases)**
+
+
+
+Open the RescueBox Desktop application and register the model
+
+<img  width="495"  alt="diarization_register"  src="https://github.com/user-attachments/assets/b223ff7b-e941-44d1-a6e8-7c95a46487a3"  />
+
+
+
+Run the model
+
+Set the Input and Output directory.
+
+<img  width="749"  alt="diarization_directory"  src="https://github.com/user-attachments/assets/5cbb8304-59de-49b7-9fc6-78eb7a5e7e16"  />
+
+
+
+
+The input directory should contain the audio file(s) to process; the output directory is where the JSON file with the predictions will be written.
+
+Results will be displayed
+
+![image](https://github.com/user-attachments/assets/da0dc54d-b929-4ef0-b808-bfa10e9a87c4)
+
+Before running the model, make sure ffmpeg is installed on your system. If you don't already have it, follow the instructions for your operating system below.
+
+### For MacOS  
+
+If you already have homebrew you can use the command listed below to directly install ffmpeg. If not you can follow the [documentation](https://docs.brew.sh/Installation) to install homebrew and then use the command listed below.
+
+```bash
+
+brew  install  ffmpeg
+
+```
+
+
+### For Windows
+
+Use this [link to install the ffmpeg executable](https://www.ffmpeg.org/download.html#build-windows). Click on the windows icon and use the windows build from gyan.dev
+
+Follow the installation instructions mentioned in the installer
+
+Add ffmpeg to the PATH environment variable to make it accessible globally
+
+
+
+### Running the Diarization and Transcription Model
+
+Once you have ffmpeg installed on your system, make sure you open a new terminal for the changes to be reflected!
+
+Now you can simply run the model.py file using the following command
+
+```bash
+
+python  model.py
+
+```
+
+The output will look something like this
+<img width="813" alt="image" src="https://github.com/user-attachments/assets/57cb1b36-2174-4208-a50d-dce3099d7e5a" />
diff --git a/src/Audio-Diarization-Transcription/Audio_Diarization/app-info.md b/src/Audio-Diarization-Transcription/Audio_Diarization/app-info.md
@@ -0,0 +1,7 @@
+# Speaker Diarization Application
+
+This application performs speaker diarization on audio files using the `pyannote.audio` library.
+
+## Usage
+1. Upload an audio file or directory containing audio files.
+2. The application will return a JSON file with the start and end times of each speaker's turn.
diff --git a/src/Audio-Diarization-Transcription/Audio_Diarization/app.py b/src/Audio-Diarization-Transcription/Audio_Diarization/app.py
@@ -0,0 +1,131 @@
+from typing import TypedDict, Dict, List
+from pathlib import Path
+from flask_ml.flask_ml_server import MLServer
+from flask_ml.flask_ml_server.models import (
+    DirectoryInput,
+    FileResponse,
+    InputSchema,
+    InputType,
+    ResponseBody,
+    TaskSchema,
+)
+from pyannote.audio import Pipeline
+import json
+from collections import defaultdict
+
+# Load the pre-trained speaker diarization pipeline once, at import time; the
+# /diarize handler below calls this same module-level instance for every file.
+# NOTE(review): the README asks users to run `huggingface-cli login` first,
+# presumably because this checkpoint needs an access token — confirm.
+pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0")
+
+
+class DiarizationInputs(TypedDict):
+    """Inputs of the diarization task, keyed like the task schema."""
+
+    # Location of the audio to process (read via its `.path` attribute).
+    input_dir: DirectoryInput
+    # Directory that receives the generated output.json.
+    output_dir: DirectoryInput
+
+
+class DiarizationParameters(TypedDict):
+    """Parameters of the diarization task; intentionally empty."""
+
+    pass
+
+
+# Specifies the input and output directory
+def create_diarization_task_schema() -> TaskSchema:
+    """Describe the diarization task: two directory inputs and no parameters."""
+    directory_fields = (
+        ("input_dir", "Path to the directory containing audio files"),
+        ("output_dir", "Path to the output directory"),
+    )
+    return TaskSchema(
+        inputs=[
+            InputSchema(
+                key=field_key,
+                label=field_label,
+                input_type=InputType.DIRECTORY,
+            )
+            for field_key, field_label in directory_fields
+        ],
+        parameters=[],
+    )
+
+
+# Formats the output in the JSON
+def format_segments(segments: List[Dict[str, object]]) -> Dict[str, str]:
+    """Group diarization turns by speaker.
+
+    Args:
+        segments: Mappings with a "speaker" label plus numeric "start" and
+            "end" values, in the order the turns occurred.
+
+    Returns:
+        A dict mapping each speaker label to a single string of
+        "start - end" ranges (two decimals each), separated by two spaces
+        and kept in input order. Empty input yields an empty dict.
+    """
+    speaker_segments = defaultdict(list)
+
+    for segment in segments:
+        start = f'{segment["start"]:.2f}'
+        end = f'{segment["end"]:.2f}'
+        speaker_segments[segment["speaker"]].append(f"{start} - {end}")
+
+    # Return a plain dict of strings so the result serializes cleanly to JSON.
+    return {
+        speaker: "  ".join(times) for speaker, times in speaker_segments.items()
+    }
+
+
+# Create a server instance
+server = MLServer(__name__)
+
+# Flask-ML presents `info` to the user as Markdown text, so pass the contents
+# of app-info.md rather than its file name. The path is resolved next to this
+# module so it does not depend on the current working directory.
+server.add_app_metadata(
+    name="Speaker Diarization",
+    author="Christina. Swetha, Nikita",
+    version="1.0",
+    info=(Path(__file__).parent / "app-info.md").read_text(encoding="utf-8"),
+)
+
+
+# Checks audio file types
+def is_audio_file(file_path: Path) -> bool:
+    """Return True when the path's extension, ignoring case, is a known audio type."""
+    extension = file_path.suffix.lower()
+    return extension in (".wav", ".mp3", ".flac", ".ogg")
+
+
+def _collect_segments(audio_path: Path) -> List[Dict[str, object]]:
+    """Run the diarization pipeline on one file and list its speaker turns."""
+    diarization = pipeline(str(audio_path))
+    return [
+        {"speaker": speaker, "start": turn.start, "end": turn.end}
+        for turn, _, speaker in diarization.itertracks(yield_label=True)
+    ]
+
+
+@server.route(
+    "/diarize",
+    task_schema_func=create_diarization_task_schema,
+    short_title="Speaker separation and transcription",
+)
+def diarize(
+    inputs: DiarizationInputs, parameters: DiarizationParameters
+) -> ResponseBody:
+    """Diarize the given audio and write the speaker turns to output.json.
+
+    Args:
+        inputs: Holds "input_dir" (a single audio file or a directory of
+            audio files) and "output_dir" (created if it does not exist).
+        parameters: Unused; the task defines no parameters.
+
+    Returns:
+        A ResponseBody wrapping a FileResponse that points at the
+        output.json written inside the output directory. The JSON maps each
+        processed file name to its formatted per-speaker time ranges, or to
+        an error string when a single non-audio file was supplied.
+    """
+    input_path = Path(inputs["input_dir"].path)
+    output_path = Path(inputs["output_dir"].path)
+    # Create output directory if it doesn't exist
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    results = {}
+
+    if input_path.is_file():
+        # Process a single file
+        if is_audio_file(input_path):
+            results[input_path.name] = format_segments(
+                _collect_segments(input_path)
+            )
+        else:
+            results[input_path.name] = "Error: Not a valid audio file"
+    else:
+        # Process all audio files directly inside the input directory; sort
+        # the listing so output.json has the same key order on every run
+        # instead of depending on filesystem enumeration order.
+        for input_file in sorted(input_path.glob("*")):
+            if input_file.is_file() and is_audio_file(input_file):
+                results[input_file.name] = format_segments(
+                    _collect_segments(input_file)
+                )
+
+    # Save results to a JSON file in the specified output directory
+    output_file = output_path / "output.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=4)
+
+    return ResponseBody(FileResponse(path=str(output_file), file_type="json"))
+
+
+# Start the server only when this file is executed directly as a script, not
+# when the module is imported.
+if __name__ == "__main__":
+    server.run()