Skip to content

Commit 4e74b5c

Browse files
committed Dec 18, 2024 ·
fix for stt-server (got broken by webserver update)
1 parent c1a67e6 commit 4e74b5c

File tree

5 files changed

+170
-31
lines changed

5 files changed

+170
-31
lines changed
 

‎RealtimeSTT/audio_recorder.py

+27-11
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,13 @@ def run(self):
139139
device_index=self.gpu_device_index,
140140
download_root=self.download_root,
141141
)
142+
# Create a short dummy audio array, for example 1 second of silence at 16 kHz
142143
if self.batch_size > 0:
143144
model = BatchedInferencePipeline(model=model)
145+
146+
# Run a warm-up transcription
147+
dummy_audio = np.zeros(16000, dtype=np.float32)
148+
model.transcribe(dummy_audio, language="en", beam_size=1)
144149
except Exception as e:
145150
logging.exception(f"Error initializing main faster_whisper transcription model: {e}")
146151
raise
@@ -281,6 +286,7 @@ def __init__(self,
281286
buffer_size: int = BUFFER_SIZE,
282287
sample_rate: int = SAMPLE_RATE,
283288
initial_prompt: Optional[Union[str, Iterable[int]]] = None,
289+
initial_prompt_realtime: Optional[Union[str, Iterable[int]]] = None,
284290
suppress_tokens: Optional[List[int]] = [-1],
285291
print_transcription_time: bool = False,
286292
early_transcription_on_silence: int = 0,
@@ -294,12 +300,12 @@ def __init__(self,
294300
295301
Args:
296302
- model (str, default="tiny"): Specifies the size of the transcription
297-
model to use or the path to a converted model directory.
298-
Valid options are 'tiny', 'tiny.en', 'base', 'base.en',
299-
'small', 'small.en', 'medium', 'medium.en', 'large-v1',
300-
'large-v2'.
301-
If a specific size is provided, the model is downloaded
302-
from the Hugging Face Hub.
303+
model to use or the path to a converted model directory.
304+
Valid options are 'tiny', 'tiny.en', 'base', 'base.en',
305+
'small', 'small.en', 'medium', 'medium.en', 'large-v1',
306+
'large-v2'.
307+
If a specific size is provided, the model is downloaded
308+
from the Hugging Face Hub.
303309
- download_root (str, default=None): Specifies the root path where the Whisper models
304310
are downloaded to. When empty, the default is used.
305311
- language (str, default=""): Language code for speech-to-text engine.
@@ -472,7 +478,9 @@ def __init__(self,
472478
recording. Changing this will very probably break functionality (as the
473479
WebRTC VAD model is very sensitive towards the sample rate).
474480
- initial_prompt (str or iterable of int, default=None): Initial
475-
prompt to be fed to the transcription models.
481+
prompt to be fed to the main transcription model.
482+
- initial_prompt_realtime (str or iterable of int, default=None):
483+
Initial prompt to be fed to the real-time transcription model.
476484
- suppress_tokens (list of int, default=[-1]): Tokens to be suppressed
477485
from the transcription output.
478486
- print_transcription_time (bool, default=False): Logs processing time
@@ -533,6 +541,8 @@ def __init__(self,
533541
self.enable_realtime_transcription = enable_realtime_transcription
534542
self.use_main_model_for_realtime = use_main_model_for_realtime
535543
self.main_model_type = model
544+
if not download_root:
545+
download_root = None
536546
self.download_root = download_root
537547
self.realtime_model_type = realtime_model_type
538548
self.realtime_processing_pause = realtime_processing_pause
@@ -583,6 +593,7 @@ def __init__(self,
583593
self.last_transcription_bytes = None
584594
self.last_transcription_bytes_b64 = None
585595
self.initial_prompt = initial_prompt
596+
self.initial_prompt_realtime = initial_prompt_realtime
586597
self.suppress_tokens = suppress_tokens
587598
self.use_wake_words = wake_words or wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}
588599
self.detected_language = None
@@ -697,7 +708,11 @@ def __init__(self,
697708
if self.enable_realtime_transcription and not self.use_main_model_for_realtime:
698709
try:
699710
logging.info("Initializing faster_whisper realtime "
700-
f"transcription model {self.realtime_model_type}"
711+
f"transcription model {self.realtime_model_type}, "
712+
f"default device: {self.device}, "
713+
f"compute type: {self.compute_type}, "
714+
f"device index: {self.gpu_device_index}, "
715+
f"download root: {self.download_root}"
701716
)
702717
self.realtime_model_type = faster_whisper.WhisperModel(
703718
model_size_or_path=self.realtime_model_type,
@@ -708,7 +723,8 @@ def __init__(self,
708723
)
709724
if self.realtime_batch_size > 0:
710725
self.realtime_model_type = BatchedInferencePipeline(model=self.realtime_model_type)
711-
726+
dummy_audio = np.zeros(16000, dtype=np.float32)
727+
self.realtime_model_type.transcribe(dummy_audio, language="en", beam_size=1)
712728
except Exception as e:
713729
logging.exception("Error initializing faster_whisper "
714730
f"realtime transcription model: {e}"
@@ -2104,7 +2120,7 @@ def _realtime_worker(self):
21042120
audio_array,
21052121
language=self.language if self.language else None,
21062122
beam_size=self.beam_size_realtime,
2107-
initial_prompt=self.initial_prompt,
2123+
initial_prompt=self.initial_prompt_realtime,
21082124
suppress_tokens=self.suppress_tokens,
21092125
batch_size=self.realtime_batch_size
21102126
)
@@ -2113,7 +2129,7 @@ def _realtime_worker(self):
21132129
audio_array,
21142130
language=self.language if self.language else None,
21152131
beam_size=self.beam_size_realtime,
2116-
initial_prompt=self.initial_prompt,
2132+
initial_prompt=self.initial_prompt_realtime,
21172133
suppress_tokens=self.suppress_tokens
21182134
)
21192135

‎RealtimeSTT/audio_recorder_client.py

+48
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
INIT_MODEL_TRANSCRIPTION = "tiny"
2929
INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny"
3030
INIT_REALTIME_PROCESSING_PAUSE = 0.2
31+
INIT_REALTIME_INITIAL_PAUSE = 0.2
3132
INIT_SILERO_SENSITIVITY = 0.4
3233
INIT_WEBRTC_SENSITIVITY = 3
3334
INIT_POST_SPEECH_SILENCE_DURATION = 0.6
@@ -68,6 +69,7 @@ class AudioToTextRecorderClient:
6869

6970
def __init__(self,
7071
model: str = INIT_MODEL_TRANSCRIPTION,
72+
download_root: str = None,
7173
language: str = "",
7274
compute_type: str = "default",
7375
input_device_index: int = None,
@@ -81,14 +83,17 @@ def __init__(self,
8183
use_microphone=True,
8284
spinner=True,
8385
level=logging.WARNING,
86+
batch_size: int = 16,
8487

8588
# Realtime transcription parameters
8689
enable_realtime_transcription=False,
8790
use_main_model_for_realtime=False,
8891
realtime_model_type=INIT_MODEL_TRANSCRIPTION_REALTIME,
8992
realtime_processing_pause=INIT_REALTIME_PROCESSING_PAUSE,
93+
init_realtime_after_seconds=INIT_REALTIME_INITIAL_PAUSE,
9094
on_realtime_transcription_update=None,
9195
on_realtime_transcription_stabilized=None,
96+
realtime_batch_size: int = 16,
9297

9398
# Voice activation parameters
9499
silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
@@ -133,6 +138,7 @@ def __init__(self,
133138
buffer_size: int = BUFFER_SIZE,
134139
sample_rate: int = SAMPLE_RATE,
135140
initial_prompt: Optional[Union[str, Iterable[int]]] = None,
141+
initial_prompt_realtime: Optional[Union[str, Iterable[int]]] = None,
136142
suppress_tokens: Optional[List[int]] = [-1],
137143
print_transcription_time: bool = False,
138144
early_transcription_on_silence: int = 0,
@@ -162,10 +168,14 @@ def __init__(self,
162168
self.use_microphone = use_microphone
163169
self.spinner = spinner
164170
self.level = level
171+
self.batch_size = batch_size
172+
self.init_realtime_after_seconds = init_realtime_after_seconds
173+
self.realtime_batch_size = realtime_batch_size
165174

166175
# Real-time transcription parameters
167176
self.enable_realtime_transcription = enable_realtime_transcription
168177
self.use_main_model_for_realtime = use_main_model_for_realtime
178+
self.download_root = download_root
169179
self.realtime_model_type = realtime_model_type
170180
self.realtime_processing_pause = realtime_processing_pause
171181
self.on_realtime_transcription_update = on_realtime_transcription_update
@@ -204,6 +214,7 @@ def __init__(self,
204214
self.buffer_size = buffer_size
205215
self.sample_rate = sample_rate
206216
self.initial_prompt = initial_prompt
217+
self.initial_prompt_realtime = initial_prompt_realtime
207218
self.suppress_tokens = suppress_tokens
208219
self.print_transcription_time = print_transcription_time
209220
self.early_transcription_on_silence = early_transcription_on_silence
@@ -376,6 +387,43 @@ def start_server(self):
376387
args += ['--model', self.model]
377388
if self.realtime_model_type:
378389
args += ['--realtime_model_type', self.realtime_model_type]
390+
if self.download_root:
391+
args += ['--root', self.download_root]
392+
if self.batch_size is not None:
393+
args += ['--batch', str(self.batch_size)]
394+
if self.realtime_batch_size is not None:
395+
args += ['--realtime_batch_size', str(self.realtime_batch_size)]
396+
if self.init_realtime_after_seconds is not None:
397+
args += ['--init_realtime_after_seconds', str(self.init_realtime_after_seconds)]
398+
if self.initial_prompt_realtime:
399+
sanitized_prompt = self.initial_prompt_realtime.replace("\n", "\\n")
400+
args += ['--initial_prompt_realtime', sanitized_prompt]
401+
402+
# if self.compute_type:
403+
# args += ['--compute_type', self.compute_type]
404+
# if self.input_device_index is not None:
405+
# args += ['--input_device_index', str(self.input_device_index)]
406+
# if self.gpu_device_index is not None:
407+
# args += ['--gpu_device_index', str(self.gpu_device_index)]
408+
# if self.device:
409+
# args += ['--device', self.device]
410+
# if self.spinner:
411+
# args.append('--spinner') # flag, no need for True/False
412+
# if self.enable_realtime_transcription:
413+
# args.append('--enable_realtime_transcription') # flag, no need for True/False
414+
# if self.handle_buffer_overflow:
415+
# args.append('--handle_buffer_overflow') # flag, no need for True/False
416+
# if self.suppress_tokens:
417+
# args += ['--suppress_tokens', str(self.suppress_tokens)]
418+
# if self.print_transcription_time:
419+
# args.append('--print_transcription_time') # flag, no need for True/False
420+
# if self.allowed_latency_limit is not None:
421+
# args += ['--allowed_latency_limit', str(self.allowed_latency_limit)]
422+
# if self.no_log_file:
423+
# args.append('--no_log_file') # flag, no need for True
424+
# if self.debug_mode:
425+
# args.append('--debug') # flag, no need for True/False
426+
379427
if self.language:
380428
args += ['--language', self.language]
381429
if self.silero_sensitivity is not None:

‎RealtimeSTT_server/stt_server.py

+61-6
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
- `-D, --debug`: Enable debug logging.
2828
- `-W, --write`: Save audio to WAV file.
2929
- `-s, --silence_timing`: Enable dynamic silence duration for sentence detection; default True.
30+
- `-b, --batch, --batch_size`: Batch size for inference; default 16.
31+
- `--root, --download_root`: Specifies the root path where the Whisper models are downloaded to.
3032
- `--silero_sensitivity`: Silero VAD sensitivity (0-1); default 0.05.
3133
- `--silero_use_onnx`: Use Silero ONNX model; default False.
3234
- `--webrtc_sensitivity`: WebRTC VAD sensitivity (0-3); default 3.
@@ -38,7 +40,10 @@
3840
- `--early_transcription_on_silence`: Start transcription after silence in seconds; default 0.2.
3941
- `--beam_size`: Beam size for main model; default 5.
4042
- `--beam_size_realtime`: Beam size for real-time model; default 3.
41-
- `--initial_prompt`: Initial transcription guidance prompt.
43+
- `--init_realtime_after_seconds`: Initial waiting time for realtime transcription; default 0.2.
44+
- `--realtime_batch_size`: Batch size for the real-time transcription model; default 16.
45+
- `--initial_prompt`: Initial main transcription guidance prompt.
46+
- `--initial_prompt_realtime`: Initial realtime transcription guidance prompt.
4247
- `--end_of_sentence_detection_pause`: Silence duration for sentence end detection; default 0.45.
4348
- `--unknown_sentence_detection_pause`: Pause duration for incomplete sentence detection; default 0.7.
4449
- `--mid_sentence_detection_pause`: Pause for mid-sentence break; default 2.0.
@@ -52,6 +57,14 @@
5257
- `--use_main_model_for_realtime`: Use main model for real-time transcription.
5358
- `--use_extended_logging`: Enable extensive log messages.
5459
- `--logchunks`: Log incoming audio chunks.
60+
- `--compute_type`: Type of computation to use.
61+
- `--input_device_index`: Index of the audio input device.
62+
- `--gpu_device_index`: Index of the GPU device.
63+
- `--device`: Device to use for computation.
64+
- `--handle_buffer_overflow`: Handle buffer overflow during transcription.
65+
- `--suppress_tokens`: Suppress tokens during transcription.
66+
- `--allowed_latency_limit`: Allowed latency limit for real-time transcription.
67+
5568
5669
### WebSocket Interface:
5770
The server supports two WebSocket connections:
@@ -364,7 +377,7 @@ def parse_arguments():
364377
parser.add_argument('-l', '--lang', '--language', type=str, default='en',
365378
help='Language code for the STT model to transcribe in a specific language. Leave this empty for auto-detection based on input audio. Default is en. List of supported language codes: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L11-L110')
366379

367-
parser.add_argument('-i', '--input-device', '--input_device_index', type=int, default=1,
380+
parser.add_argument('-i', '--input-device', '--input-device-index', type=int, default=1,
368381
help='Index of the audio input device to use. Use this option to specify a particular microphone or audio input device based on your system. Default is 1.')
369382

370383
parser.add_argument('-c', '--control', '--control_port', type=int, default=8011,
@@ -378,12 +391,23 @@ def parse_arguments():
378391

379392
parser.add_argument('-D', '--debug', action='store_true', help='Enable debug logging for detailed server operations')
380393

381-
parser.add_argument("-W", "--write", metavar="FILE",
382-
help="Save received audio to a WAV file")
394+
parser.add_argument('-W', '--write', metavar='FILE', help='Save received audio to a WAV file')
395+
396+
parser.add_argument('-b', '--batch', '--batch_size', type=int, default=16, help='Batch size for inference. This parameter controls the number of audio chunks processed in parallel during transcription. Default is 16.')
397+
398+
parser.add_argument('--root', '--download_root', type=str,default=None, help='Specifies the root path where the Whisper models are downloaded to. Default is None.')
383399

384400
parser.add_argument('-s', '--silence_timing', action='store_true', default=True,
385401
help='Enable dynamic adjustment of silence duration for sentence detection. Adjusts post-speech silence duration based on detected sentence structure and punctuation. Default is False.')
386402

403+
parser.add_argument('--init_realtime_after_seconds', type=float, default=0.2,
404+
help='The initial waiting time in seconds before real-time transcription starts. This delay helps prevent false positives at the beginning of a session. Default is 0.2 seconds.')
405+
406+
parser.add_argument('--realtime_batch_size', type=int, default=16,
407+
help='Batch size for the real-time transcription model. This parameter controls the number of audio chunks processed in parallel during real-time transcription. Default is 16.')
408+
409+
parser.add_argument('--initial_prompt_realtime', type=str, default="", help='Initial prompt that guides the real-time transcription model to produce transcriptions in a particular style or format.')
410+
387411
parser.add_argument('--silero_sensitivity', type=float, default=0.05,
388412
help='Sensitivity level for Silero Voice Activity Detection (VAD), with a range from 0 to 1. Lower values make the model less sensitive, useful for noisy environments. Default is 0.05.')
389413

@@ -457,6 +481,23 @@ def parse_arguments():
457481
parser.add_argument('--use_extended_logging', action='store_true',
458482
help='Writes extensive log messages for the recording worker, that processes the audio chunks.')
459483

484+
parser.add_argument('--compute_type', type=str, default='default',
485+
help='Type of computation to use. See https://opennmt.net/CTranslate2/quantization.html')
486+
487+
parser.add_argument('--gpu_device_index', type=int, default=0,
488+
help='Index of the GPU device to use. Default is None.')
489+
490+
parser.add_argument('--device', type=str, default='cuda',
491+
help='Device for model to use. Can either be "cuda" or "cpu". Default is cuda.')
492+
493+
parser.add_argument('--handle_buffer_overflow', action='store_true',
494+
help='Handle buffer overflow during transcription. Default is False.')
495+
496+
parser.add_argument('--suppress_tokens', type=int, default=[-1], nargs='*', help='Suppress tokens during transcription. Default is [-1].')
497+
498+
parser.add_argument('--allowed_latency_limit', type=int, default=100,
499+
help='Maximal amount of chunks that can be unprocessed in queue before discarding chunks.. Default is 100.')
500+
460501
parser.add_argument('--logchunks', action='store_true', help='Enable logging of incoming audio chunks (periods)')
461502

462503
# Parse arguments
@@ -479,6 +520,9 @@ def parse_arguments():
479520
if args.initial_prompt:
480521
args.initial_prompt = args.initial_prompt.replace("\\n", "\n")
481522

523+
if args.initial_prompt_realtime:
524+
args.initial_prompt_realtime = args.initial_prompt_realtime.replace("\\n", "\n")
525+
482526
return args
483527

484528
def _recorder_thread(loop):
@@ -534,7 +578,7 @@ def decode_and_resample(
534578

535579
return resampled_audio.astype(np.int16).tobytes()
536580

537-
async def control_handler(websocket, path):
581+
async def control_handler(websocket):
538582
debug_print(f"New control connection from {websocket.remote_address}")
539583
print(f"{bcolors.OKGREEN}Control client connected{bcolors.ENDC}")
540584
global recorder
@@ -629,7 +673,7 @@ async def control_handler(websocket, path):
629673
finally:
630674
control_connections.remove(websocket)
631675

632-
async def data_handler(websocket, path):
676+
async def data_handler(websocket):
633677
global writechunks, wav_file
634678
print(f"{bcolors.OKGREEN}Data client connected{bcolors.ENDC}")
635679
data_connections.add(websocket)
@@ -700,8 +744,13 @@ async def main_async():
700744

701745
recorder_config = {
702746
'model': args.model,
747+
'download_root': args.root,
703748
'realtime_model_type': args.rt_model,
704749
'language': args.lang,
750+
'batch_size': args.batch,
751+
'init_realtime_after_seconds': args.init_realtime_after_seconds,
752+
'realtime_batch_size': args.realtime_batch_size,
753+
'initial_prompt_realtime': args.initial_prompt_realtime,
705754
'input_device_index': args.input_device,
706755
'silero_sensitivity': args.silero_sensitivity,
707756
'silero_use_onnx': args.silero_use_onnx,
@@ -740,6 +789,12 @@ async def main_async():
740789
'no_log_file': True, # Disable logging to file
741790
'use_extended_logging': args.use_extended_logging,
742791
'level': loglevel,
792+
'compute_type': args.compute_type,
793+
'gpu_device_index': args.gpu_device_index,
794+
'device': args.device,
795+
'handle_buffer_overflow': args.handle_buffer_overflow,
796+
'suppress_tokens': args.suppress_tokens,
797+
'allowed_latency_limit': args.allowed_latency_limit,
743798
}
744799

745800
try:

‎setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
setuptools.setup(
1111
name="RealtimeSTT",
12-
version="0.3.92",
12+
version="0.3.93",
1313
author="Kolja Beigel",
1414
author_email="kolja.beigel@web.de",
1515
description="A fast Voice Activity Detection and Transcription System",

‎tests/realtimestt_speechendpoint_binary_classified.py

+33-13
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@
5050

5151
tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)
5252
classification_model = DistilBertForSequenceClassification.from_pretrained(model_dir)
53+
# tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir, force_download=True)
54+
# classification_model = DistilBertForSequenceClassification.from_pretrained(model_dir, force_download=True)
5355
classification_model.to(device)
5456
classification_model.eval()
5557

@@ -85,7 +87,7 @@ def get_completion_probability(sentence, model, tokenizer, device, max_length):
8587
anchor_points = [
8688
(0.0, 1.0),
8789
(1.0, 0)
88-
]
90+
]
8991
# anchor_points = [
9092
# (0.0, 0.4),
9193
# (0.5, 0.3),
@@ -144,10 +146,9 @@ def is_speech_finished(text):
144146
text_time_deque = deque()
145147

146148
# Default values
147-
#rapid_sentence_end_detection = 0.2
148149
end_of_sentence_detection_pause = 0.3
149150
unknown_sentence_detection_pause = 0.8
150-
mid_sentence_detection_pause = 2.0
151+
mid_sentence_detection_pause = 1.7
151152
hard_break_even_on_background_noise = 3.0
152153
hard_break_even_on_background_noise_min_texts = 3
153154
hard_break_even_on_background_noise_min_chars = 15
@@ -172,26 +173,45 @@ def text_detected(text):
172173
def additional_pause_based_on_words(text):
173174
word_count = len(text.split())
174175
pauses = {
175-
1: 0.6,
176-
2: 0.5,
177-
3: 0.4,
178-
4: 0.3,
179-
5: 0.2,
180-
6: 0.1,
176+
0: 0.35,
177+
1: 0.3,
178+
2: 0.25,
179+
3: 0.2,
180+
4: 0.15,
181+
5: 0.1,
182+
6: 0.05,
181183
}
182184
return pauses.get(word_count, 0.0)
183185

184186
def process_queue():
185187
global recorder, full_sentences, prev_text, displayed_text, rich_text_stored, text_time_deque, abrupt_stop, rapid_sentence_end_detection
186188

187189
while True:
190+
text = None # Initialize text to ensure it's defined
191+
188192
try:
193+
# Attempt to retrieve the first item, blocking with timeout
189194
text = text_queue.get(timeout=1)
190195
except queue.Empty:
191-
continue
196+
continue # No item retrieved, continue the loop
197+
198+
if text is None:
199+
# Exit signal received
200+
break
201+
202+
# Drain the queue to get the latest text
203+
try:
204+
while True:
205+
latest_text = text_queue.get_nowait()
206+
if latest_text is None:
207+
text = None
208+
break
209+
text = latest_text
210+
except queue.Empty:
211+
pass # No more items to retrieve
192212

193213
if text is None:
194-
# Exit
214+
# Exit signal received after draining
195215
break
196216

197217
text = preprocess_text(text)
@@ -274,7 +294,7 @@ def process_queue():
274294

275295
def process_text(text):
276296
global recorder, full_sentences, prev_text, abrupt_stop
277-
#if IS_DEBUG: print(f"SENTENCE: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
297+
if IS_DEBUG: print(f"SENTENCE: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
278298
recorder.post_speech_silence_duration = unknown_sentence_detection_pause
279299
text = preprocess_text(text)
280300
text = text.rstrip()
@@ -312,7 +332,7 @@ def process_text(text):
312332
'beam_size': 5,
313333
'beam_size_realtime': 3,
314334
'no_log_file': True,
315-
'initial_prompt': (
335+
'initial_prompt_realtime': (
316336
"End incomplete sentences with ellipses.\n"
317337
"Examples:\n"
318338
"Complete: The sky is blue.\n"

0 commit comments

Comments
 (0)
Please sign in to comment.