Skip to content
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
95fd4af
added audio diarization
nmj7 Apr 18, 2025
5ebfa5d
removed flask-ml dependencies and added 3 endpoints
nmj7 May 5, 2025
4e8a67a
shifted from flask-ml to rescuebox
nmj7 May 8, 2025
c42214a
Adding the source code files
LadyTuna May 9, 2025
0e0e1ac
Removing extra dependencies
LadyTuna May 9, 2025
e33855e
Removing unused imports
LadyTuna May 9, 2025
b7532be
Code Reformatting Using Black
LadyTuna May 9, 2025
ea3a522
Code fomatting using Black
LadyTuna May 9, 2025
4a88f8a
Renaming file
LadyTuna May 9, 2025
8b4358d
Renaming the folder
LadyTuna May 9, 2025
c168737
poetry lock
LadyTuna May 9, 2025
6150b03
outputs as now in csv and sqliteDB has been added
nmj7 May 12, 2025
082afb3
removed union
nmj7 May 12, 2025
89372e8
removed unmwated imports
nmj7 May 12, 2025
f11af2b
Removing Submodule
LadyTuna May 13, 2025
1bc807c
Formatting Code using Black
LadyTuna May 13, 2025
6901d25
Adding Trial Test case
LadyTuna May 13, 2025
02f50bf
Removing unused imports
LadyTuna May 13, 2025
38832f4
Reformatting test file
LadyTuna May 13, 2025
17f95c8
Adding test cases
LadyTuna May 13, 2025
37ee8ce
Updating import to the test file
LadyTuna May 13, 2025
3a60a7a
Fixing Test Cases
LadyTuna May 13, 2025
6798de6
Adding additional Test Cases
LadyTuna May 13, 2025
e0f378f
Updating filepath for local model
LadyTuna May 14, 2025
5e325d7
Removing unused import
LadyTuna May 14, 2025
ea07066
Evaluation
swethamo May 15, 2025
448be34
Updating readme
LadyTuna May 15, 2025
a0c0765
Merge branch 'Audio_Diarization' of https://github.com/nmj7/RescueBox…
swethamo May 15, 2025
a758c47
evaluations folder
swethamo May 15, 2025
e630866
Adding a few eval formatting scripts
LadyTuna May 15, 2025
fa79796
Create ReadMe.md
swethamo May 15, 2025
aeffa5e
Rename ReadMe.md to README.md
swethamo May 15, 2025
8ebfe5f
Rename READMe.md to README.md
swethamo May 15, 2025
d7a27a9
Add files via upload
swethamo May 15, 2025
0272b3d
Add files via upload
swethamo May 15, 2025
df2d4cb
Delete src/Audio-Diarization-Transcription/Audio_Diarization/input/TO…
swethamo May 15, 2025
b5cbcd3
Delete src/Audio-Diarization-Transcription/Audio_Diarization/model.onnx
swethamo May 15, 2025
87b0395
Adding Licenses
LadyTuna May 15, 2025
dac637e
Black file formatting
LadyTuna May 15, 2025
5a7d62a
Updating the README.md files
LadyTuna May 15, 2025
1324fde
Updating README.md
LadyTuna May 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,340 changes: 2,937 additions & 403 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@ onnxruntime = "1.21.0"
opencv-python = ">=4.11.0.86,<5.0.0.0"
ollama = ">=0.4.7,<0.5.0"
pypdf2 = ">=3.0.1,<4.0.0"
ffmpeg-python = "^0.2.0"
pyannote-audio = "3.3.2"

rb-lib = { path = "src/rb-lib", develop = true }

file-utils = { path = "src/file-utils", develop = true }
doc-parser = { path = "src/doc-parser", develop = true }
Audio-Diarization = { path = "src/Audio-Diarization-Transcription", develop = true }
audio-transcription = { path = "src/audio-transcription", develop = true }
age-and-gender-detection = { path = "src/age_and_gender_detection", develop = true }
text-summary = {path = "src/text-summary", develop = true}
Expand Down
10 changes: 9 additions & 1 deletion rescuebox/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
APP_NAME as AUDIO_APP_NAME,
) # type: ignore
from text_summary.main import app as text_summary_app, APP_NAME as TEXT_SUM_APP_NAME # type: ignore

from Audio_Diarization.model_3endpoints import (
app as diarize_transcribe_app,
APP_NAME as diarize_transcribe_app_name,
)
from age_and_gender_detection.main import app as age_gender_app, APP_NAME as AGE_GENDER_APP_NAME # type: ignore

# Import plugin modules
Expand All @@ -28,6 +31,11 @@ class RescueBoxPlugin:
RescueBoxPlugin(
audio_transcription_app, AUDIO_APP_NAME, "Audio transcription library"
),
RescueBoxPlugin(
diarize_transcribe_app,
diarize_transcribe_app_name,
"Speaker Diarization + Transcription service",
),
RescueBoxPlugin(age_gender_app, AGE_GENDER_APP_NAME, "Age and Gender Classifier"),
RescueBoxPlugin(text_summary_app, TEXT_SUM_APP_NAME, "Text summarization library"),
]
Expand Down
164 changes: 164 additions & 0 deletions src/Audio-Diarization-Transcription/Audio_Diarization/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Audio-Diarization



Speaker Diarization – Identifying and separating speakers in an audio file, transcribing the speech with timestamps and speaker labels.



This process aids child rescue efforts by distinguishing victim and abuser voices, providing crucial evidence for court proceedings.



## Installation



1. **Clone the Repository**:



```bash

git clone https://github.com/UMass-Rescue/Audio-Diarization.git

cd Audio-Diarization

```



2. **Install Dependencies:**



For the best results, create a virtual environment. You can use any method to create a virtual environment!



One of the ways to create a virtual environment is listed below



```bash

python -m venv <virtual_env_name>

```



Activate the virtual environment

```bash

source <virtual_env_name>/bin/activate

```



Install the required Python packages using the following command:



```bash

pip install -r requirements.txt

```

3. **Access the model**

```bash

huggingface-cli login

```

You will be prompted to enter an access token, which you can find at https://huggingface.co/settings/tokens

<img width="937" alt="diarization_accesstoken" src="https://github.com/user-attachments/assets/5e766cd7-45ef-4b2b-8d80-cc608d86e77c" />


(In case there are issues with the token, you can contact us and one of us will provide it to you!)


4. **Running the Flask-ML Server**

Start the Flask-ML server to work with RescueBox for predictions:



```bash

python transcribe_diarize_app.py

```

The server will start running at 127.0.0.1:5000



5. **Download and run RescueBox Desktop from the following link: [Rescue Box Desktop](https://github.com/UMass-Rescue/RescueBox-Desktop/releases)**



Open the RescueBox Desktop application and register the model

<img width="495" alt="diarization_register" src="https://github.com/user-attachments/assets/b223ff7b-e941-44d1-a6e8-7c95a46487a3" />



Run the model

Set the Input and Output directory.

<img width="749" alt="diarization_directory" src="https://github.com/user-attachments/assets/5cbb8304-59de-49b7-9fc6-78eb7a5e7e16" />




The input directory should contain an audio file; the output directory is where the JSON file with the predictions will be written.

Results will be displayed

![image](https://github.com/user-attachments/assets/da0dc54d-b929-4ef0-b808-bfa10e9a87c4)

First, make sure ffmpeg is installed on your system. If you don't already have it, install it using the instructions below.

### For MacOS

If you already have homebrew you can use the command listed below to directly install ffmpeg. If not you can follow the [documentation](https://docs.brew.sh/Installation) to install homebrew and then use the command listed below.

```bash

brew install ffmpeg

```


### For Windows

Use this [link to install the ffmpeg executable](https://www.ffmpeg.org/download.html#build-windows). Click on the windows icon and use the windows build from gyan.dev

Follow the installation instructions mentioned in the installer

Add ffmpeg to the environment variables to make it accessible globally



### Running the Diarization and Transcription Model

Once you have ffmpeg installed on your system, make sure you open a new terminal for the changes to be reflected!

Now you can simply run the model.py file using the following command

```bash

python model.py

```

The output will look something like this
<img width="813" alt="image" src="https://github.com/user-attachments/assets/57cb1b36-2174-4208-a50d-dce3099d7e5a" />
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Speaker Diarization Application

This application performs speaker diarization on audio files using the `pyannote.audio` library.

## Usage
1. Upload an audio file or directory containing audio files.
2. The application will return a JSON file with the start and end times of each speaker's turn.
131 changes: 131 additions & 0 deletions src/Audio-Diarization-Transcription/Audio_Diarization/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import json
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, TypedDict

from flask_ml.flask_ml_server import MLServer
from flask_ml.flask_ml_server.models import (
    DirectoryInput,
    FileResponse,
    InputSchema,
    InputType,
    ResponseBody,
    TaskSchema,
)
from pyannote.audio import Pipeline

# Load the pre-trained speaker diarization pipeline once, at import time, so
# the request handler in this module reuses a single shared instance.
# NOTE(review): the README has users run `huggingface-cli login` beforehand, so
# this presumably needs Hugging Face credentials to fetch the model — confirm.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0")


class DiarizationInputs(TypedDict):
    """Typed mapping of the inputs received by the ``/diarize`` task."""

    # Location of the audio to process; the handler reads its ``.path``.
    input_dir: DirectoryInput
    # Directory that receives the generated ``output.json``.
    output_dir: DirectoryInput


class DiarizationParameters(TypedDict):
    """Typed mapping of task parameters; the diarization task defines none."""


# Describes the input and output directories the task asks for
def create_diarization_task_schema() -> TaskSchema:
    """Build the task schema for the diarization route.

    Returns:
        A ``TaskSchema`` with two directory inputs (``input_dir`` and
        ``output_dir``) and no parameters.
    """
    directory_inputs = [
        InputSchema(
            key="input_dir",
            label="Path to the directory containing audio files",
            input_type=InputType.DIRECTORY,
        ),
        InputSchema(
            key="output_dir",
            label="Path to the output directory",
            input_type=InputType.DIRECTORY,
        ),
    ]
    return TaskSchema(inputs=directory_inputs, parameters=[])


# Formats the per-speaker output that is written to the JSON file
def format_segments(segments: List[Dict[str, Any]]) -> Dict[str, str]:
    """Group diarization segments by speaker and render their time ranges.

    Args:
        segments: Speaker turns, each a mapping with a ``"speaker"`` label and
            numeric ``"start"`` and ``"end"`` values.

    Returns:
        A dict mapping each speaker label to ONE string that lists all of the
        speaker's turns as ``"<start> - <end>"`` (two decimal places),
        separated by single spaces and kept in input order. An empty input
        yields an empty dict.
    """
    speaker_segments = defaultdict(list)

    for segment in segments:
        speaker_segments[segment["speaker"]].append(
            f'{segment["start"]:.2f} - {segment["end"]:.2f}'
        )

    # Collapse each speaker's list of ranges into a single space-joined string
    # and return a plain dict so the result serializes cleanly.
    return {speaker: " ".join(times) for speaker, times in speaker_segments.items()}


# Create a server instance
server = MLServer(__name__)

server.add_app_metadata(
    name="Speaker Diarization",
    author="Christina, Swetha, Nikita",
    version="1.0",
    # Pass the markdown *content* of app-info.md rather than its file name,
    # so the description shown for the app is the actual text. The path is
    # resolved next to this module so it does not depend on the working
    # directory.
    # NOTE(review): assumes app-info.md lives beside this file — confirm.
    info=(Path(__file__).parent / "app-info.md").read_text(encoding="utf-8"),
)


# Recognizes audio files by extension
def is_audio_file(file_path: Path) -> bool:
    """Return True when the path's suffix (case-insensitive) is a known audio type."""
    extension = file_path.suffix.lower()
    return extension in (".wav", ".mp3", ".flac", ".ogg")


def _diarize_file(audio_path: Path) -> Dict[str, str]:
    """Run the diarization pipeline on one audio file and format its speaker turns."""
    diarization = pipeline(str(audio_path))
    segments = [
        {"speaker": speaker, "start": turn.start, "end": turn.end}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]
    return format_segments(segments)


@server.route(
    "/diarize",
    task_schema_func=create_diarization_task_schema,
    short_title="Speaker separation and transcription",
)
def diarize(
    inputs: DiarizationInputs, parameters: DiarizationParameters
) -> ResponseBody:
    """Diarize the given audio and write the per-speaker turns to a JSON file.

    Args:
        inputs: Mapping with ``input_dir`` (an audio file, or a directory whose
            direct children are scanned for audio files) and ``output_dir``
            (created if missing).
        parameters: Unused; the task declares no parameters.

    Returns:
        A ``ResponseBody`` wrapping a ``FileResponse`` that points at
        ``output.json`` inside the output directory. The JSON maps each
        processed file name to its formatted speaker segments, or to an error
        string when a single non-audio file was supplied.
    """
    input_path = Path(inputs["input_dir"].path)
    output_path = Path(inputs["output_dir"].path)
    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    results = {}

    if input_path.is_file():
        # Process a single file
        if is_audio_file(input_path):
            results[input_path.name] = _diarize_file(input_path)
        else:
            results[input_path.name] = "Error: Not a valid audio file"
    else:
        # Process all audio files in the input directory. Sorting makes the
        # order of entries in the output independent of filesystem listing order.
        for input_file in sorted(input_path.glob("*")):
            if input_file.is_file() and is_audio_file(input_file):
                results[input_file.name] = _diarize_file(input_file)

    # Save results to a JSON file in the specified output directory
    output_file = output_path / "output.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    return ResponseBody(FileResponse(path=str(output_file), file_type="json"))


# Start the server only when this module is executed as a script, not on import.
if __name__ == "__main__":
    server.run()
Loading