27
27
- `-D, --debug`: Enable debug logging.
28
28
- `-W, --write`: Save audio to WAV file.
29
29
- `-s, --silence_timing`: Enable dynamic silence duration for sentence detection; default True.
30
+ - `-b, --batch, --batch_size`: Batch size for inference; default 16.
31
+ - `--root, --download_root`: Specifies the root path where the Whisper models are downloaded to.
30
32
- `--silero_sensitivity`: Silero VAD sensitivity (0-1); default 0.05.
31
33
- `--silero_use_onnx`: Use Silero ONNX model; default False.
32
34
- `--webrtc_sensitivity`: WebRTC VAD sensitivity (0-3); default 3.
38
40
- `--early_transcription_on_silence`: Start transcription after silence in seconds; default 0.2.
39
41
- `--beam_size`: Beam size for main model; default 5.
40
42
- `--beam_size_realtime`: Beam size for real-time model; default 3.
41
- - `--initial_prompt`: Initial transcription guidance prompt.
43
+ - `--init_realtime_after_seconds`: Initial waiting time for realtime transcription; default 0.2.
44
+ - `--realtime_batch_size`: Batch size for the real-time transcription model; default 16.
45
+ - `--initial_prompt`: Initial main transcription guidance prompt.
46
+ - `--initial_prompt_realtime`: Initial realtime transcription guidance prompt.
42
47
- `--end_of_sentence_detection_pause`: Silence duration for sentence end detection; default 0.45.
43
48
- `--unknown_sentence_detection_pause`: Pause duration for incomplete sentence detection; default 0.7.
44
49
- `--mid_sentence_detection_pause`: Pause for mid-sentence break; default 2.0.
52
57
- `--use_main_model_for_realtime`: Use main model for real-time transcription.
53
58
- `--use_extended_logging`: Enable extensive log messages.
54
59
- `--logchunks`: Log incoming audio chunks.
60
+ - `--compute_type`: Type of computation to use.
61
+ - `--input_device_index`: Index of the audio input device.
62
+ - `--gpu_device_index`: Index of the GPU device; default 0.
63
+ - `--device`: Device to use for computation ("cuda" or "cpu"); default cuda.
64
+ - `--handle_buffer_overflow`: Handle buffer overflow during transcription.
65
+ - `--suppress_tokens`: Suppress tokens during transcription.
66
+ - `--allowed_latency_limit`: Maximum number of unprocessed chunks allowed in the queue before chunks are discarded; default 100.
67
+
55
68
56
69
### WebSocket Interface:
57
70
The server supports two WebSocket connections:
@@ -364,7 +377,7 @@ def parse_arguments():
364
377
parser .add_argument ('-l' , '--lang' , '--language' , type = str , default = 'en' ,
365
378
help = 'Language code for the STT model to transcribe in a specific language. Leave this empty for auto-detection based on input audio. Default is en. List of supported language codes: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L11-L110' )
366
379
367
- parser .add_argument ('-i' , '--input-device' , '--input_device_index ' , type = int , default = 1 ,
380
+ parser .add_argument ('-i' , '--input-device' , '--input-device-index ' , type = int , default = 1 ,
368
381
help = 'Index of the audio input device to use. Use this option to specify a particular microphone or audio input device based on your system. Default is 1.' )
369
382
370
383
parser .add_argument ('-c' , '--control' , '--control_port' , type = int , default = 8011 ,
@@ -378,12 +391,23 @@ def parse_arguments():
378
391
379
392
parser .add_argument ('-D' , '--debug' , action = 'store_true' , help = 'Enable debug logging for detailed server operations' )
380
393
381
- parser .add_argument ("-W" , "--write" , metavar = "FILE" ,
382
- help = "Save received audio to a WAV file" )
394
+ parser .add_argument ('-W' , '--write' , metavar = 'FILE' , help = 'Save received audio to a WAV file' )
395
+
396
+ parser .add_argument ('-b' , '--batch' , '--batch_size' , type = int , default = 16 , help = 'Batch size for inference. This parameter controls the number of audio chunks processed in parallel during transcription. Default is 16.' )
397
+
398
+ parser .add_argument ('--root' , '--download_root' , type = str ,default = None , help = 'Specifies the root path where the Whisper models are downloaded to. Default is None.' )
383
399
384
400
parser .add_argument ('-s' , '--silence_timing' , action = 'store_true' , default = True ,
385
401
help = 'Enable dynamic adjustment of silence duration for sentence detection. Adjusts post-speech silence duration based on detected sentence structure and punctuation. Default is False.' )
386
402
403
+ parser .add_argument ('--init_realtime_after_seconds' , type = float , default = 0.2 ,
404
+ help = 'The initial waiting time in seconds before real-time transcription starts. This delay helps prevent false positives at the beginning of a session. Default is 0.2 seconds.' )
405
+
406
+ parser .add_argument ('--realtime_batch_size' , type = int , default = 16 ,
407
+ help = 'Batch size for the real-time transcription model. This parameter controls the number of audio chunks processed in parallel during real-time transcription. Default is 16.' )
408
+
409
+ parser .add_argument ('--initial_prompt_realtime' , type = str , default = "" , help = 'Initial prompt that guides the real-time transcription model to produce transcriptions in a particular style or format.' )
410
+
387
411
parser .add_argument ('--silero_sensitivity' , type = float , default = 0.05 ,
388
412
help = 'Sensitivity level for Silero Voice Activity Detection (VAD), with a range from 0 to 1. Lower values make the model less sensitive, useful for noisy environments. Default is 0.05.' )
389
413
@@ -457,6 +481,23 @@ def parse_arguments():
457
481
parser .add_argument ('--use_extended_logging' , action = 'store_true' ,
458
482
help = 'Writes extensive log messages for the recording worker, that processes the audio chunks.' )
459
483
484
+ parser .add_argument ('--compute_type' , type = str , default = 'default' ,
485
+ help = 'Type of computation to use. See https://opennmt.net/CTranslate2/quantization.html' )
486
+
487
+ parser .add_argument ('--gpu_device_index' , type = int , default = 0 ,
488
+ help = 'Index of the GPU device to use. Default is None.' )
489
+
490
+ parser .add_argument ('--device' , type = str , default = 'cuda' ,
491
+ help = 'Device for model to use. Can either be "cuda" or "cpu". Default is cuda.' )
492
+
493
+ parser .add_argument ('--handle_buffer_overflow' , action = 'store_true' ,
494
+ help = 'Handle buffer overflow during transcription. Default is False.' )
495
+
496
+ parser .add_argument ('--suppress_tokens' , type = int , default = [- 1 ], nargs = '*' , help = 'Suppress tokens during transcription. Default is [-1].' )
497
+
498
+ parser .add_argument ('--allowed_latency_limit' , type = int , default = 100 ,
499
+ help = 'Maximal amount of chunks that can be unprocessed in queue before discarding chunks.. Default is 100.' )
500
+
460
501
parser .add_argument ('--logchunks' , action = 'store_true' , help = 'Enable logging of incoming audio chunks (periods)' )
461
502
462
503
# Parse arguments
@@ -479,6 +520,9 @@ def parse_arguments():
479
520
if args .initial_prompt :
480
521
args .initial_prompt = args .initial_prompt .replace ("\\ n" , "\n " )
481
522
523
+ if args .initial_prompt_realtime :
524
+ args .initial_prompt_realtime = args .initial_prompt_realtime .replace ("\\ n" , "\n " )
525
+
482
526
return args
483
527
484
528
def _recorder_thread (loop ):
@@ -534,7 +578,7 @@ def decode_and_resample(
534
578
535
579
return resampled_audio .astype (np .int16 ).tobytes ()
536
580
537
- async def control_handler (websocket , path ):
581
+ async def control_handler (websocket ):
538
582
debug_print (f"New control connection from { websocket .remote_address } " )
539
583
print (f"{ bcolors .OKGREEN } Control client connected{ bcolors .ENDC } " )
540
584
global recorder
@@ -629,7 +673,7 @@ async def control_handler(websocket, path):
629
673
finally :
630
674
control_connections .remove (websocket )
631
675
632
- async def data_handler (websocket , path ):
676
+ async def data_handler (websocket ):
633
677
global writechunks , wav_file
634
678
print (f"{ bcolors .OKGREEN } Data client connected{ bcolors .ENDC } " )
635
679
data_connections .add (websocket )
@@ -700,8 +744,13 @@ async def main_async():
700
744
701
745
recorder_config = {
702
746
'model' : args .model ,
747
+ 'download_root' : args .root ,
703
748
'realtime_model_type' : args .rt_model ,
704
749
'language' : args .lang ,
750
+ 'batch_size' : args .batch ,
751
+ 'init_realtime_after_seconds' : args .init_realtime_after_seconds ,
752
+ 'realtime_batch_size' : args .realtime_batch_size ,
753
+ 'initial_prompt_realtime' : args .initial_prompt_realtime ,
705
754
'input_device_index' : args .input_device ,
706
755
'silero_sensitivity' : args .silero_sensitivity ,
707
756
'silero_use_onnx' : args .silero_use_onnx ,
@@ -740,6 +789,12 @@ async def main_async():
740
789
'no_log_file' : True , # Disable logging to file
741
790
'use_extended_logging' : args .use_extended_logging ,
742
791
'level' : loglevel ,
792
+ 'compute_type' : args .compute_type ,
793
+ 'gpu_device_index' : args .gpu_device_index ,
794
+ 'device' : args .device ,
795
+ 'handle_buffer_overflow' : args .handle_buffer_overflow ,
796
+ 'suppress_tokens' : args .suppress_tokens ,
797
+ 'allowed_latency_limit' : args .allowed_latency_limit ,
743
798
}
744
799
745
800
try :
0 commit comments