Skip to content

Commit a694d92

Browse files

Commit message: sensevoice

1 parent: 43c47a7 · commit a694d92

File tree

4 files changed: +119 additions, −57 deletions

README.md

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -95,24 +95,6 @@ pip install -r requirements.txt
9595

9696
## Inference
9797

98-
### Method 1
99-
100-
```python
101-
from model import SenseVoiceSmall
102-
103-
model_dir = "iic/SenseVoiceSmall"
104-
m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
105-
106-
107-
res = m.inference(
108-
data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
109-
language="zh", # "zn", "en", "yue", "ja", "ko", "nospeech"
110-
use_itn=False,
111-
**kwargs,
112-
)
113-
114-
print(res)
115-
```
11698

11799
### Method 2
118100

@@ -159,7 +141,24 @@ res = model.generate(
159141

160142
For more usage, please refer to [docs](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
161143

144+
### Method 1
162145

146+
```python
147+
from model import SenseVoiceSmall
148+
149+
model_dir = "iic/SenseVoiceSmall"
150+
m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
151+
152+
153+
res = m.inference(
154+
data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
155+
language="zh", # "zn", "en", "yue", "ja", "ko", "nospeech"
156+
use_itn=False,
157+
**kwargs,
158+
)
159+
160+
print(res)
161+
```
163162

164163
### Export and Test
165164

README_zh.md

Lines changed: 38 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -95,56 +95,42 @@ pip install -r requirements.txt
9595

9696
## 推理
9797

98-
### 直接推理
99-
100-
```python
101-
from model import SenseVoiceSmall
102-
103-
model_dir = "iic/SenseVoiceSmall"
104-
m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
10598

10699

107-
res = m.inference(
108-
data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
109-
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
110-
use_itn=False,
111-
**kwargs,
112-
)
113-
114-
print(res)
115-
```
116-
117100
### 使用funasr推理
118101

102+
支持任意格式音频输入,支持任意时长输入
103+
119104
```python
120105
from funasr import AutoModel
121106
from funasr.utils.postprocess_utils import rich_transcription_postprocess
122107

123108
model_dir = "iic/SenseVoiceSmall"
124-
input_file = (
125-
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
126-
)
127109

128-
model = AutoModel(model=model_dir,
129-
vad_model="fsmn-vad",
130-
vad_kwargs={"max_single_segment_time": 30000},
131-
trust_remote_code=True, device="cuda:0")
132110

111+
model = AutoModel(
112+
model=model_dir,
113+
vad_model="fsmn-vad",
114+
vad_kwargs={"max_single_segment_time": 30000},
115+
device="cpu",
116+
)
117+
118+
# en
133119
res = model.generate(
134-
input=input_file,
120+
input=f"{model.model_path}/example/en.mp3",
135121
cache={},
136-
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
137-
use_itn=False,
138-
batch_size_s=0,
122+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
123+
use_itn=True,
124+
batch_size_s=60,
125+
merge_vad=True, #
126+
merge_length_s=15,
139127
)
140-
141128
text = rich_transcription_postprocess(res[0]["text"])
142-
143129
print(text)
144130
```
145131

146132
funasr版本已经集成了vad模型,支持任意时长音频输入,`batch_size_s`单位为秒。
147-
如果输入均为短音频,并且需要批量化推理,为了加快推理效率,可以移除vad模型,并设置`batch_size`
133+
如果输入均为短音频(小于30s),并且需要批量化推理,为了加快推理效率,可以移除vad模型,并设置`batch_size`
148134

149135
```python
150136
model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
@@ -160,6 +146,27 @@ res = model.generate(
160146

161147
更多详细用法,请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
162148

149+
### 直接推理
150+
151+
支持任意格式音频输入,输入音频时长限制在30s以下
152+
153+
```python
154+
from model import SenseVoiceSmall
155+
156+
model_dir = "iic/SenseVoiceSmall"
157+
m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
158+
159+
160+
res = m.inference(
161+
data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
162+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
163+
use_itn=False,
164+
**kwargs,
165+
)
166+
167+
print(res)
168+
```
169+
163170
## 服务部署
164171

165172
Undo

demo_funasr.py

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,82 @@
33
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
44
# MIT License (https://opensource.org/licenses/MIT)
55

6+
67
from funasr import AutoModel
78
from funasr.utils.postprocess_utils import rich_transcription_postprocess
89

910
model_dir = "iic/SenseVoiceSmall"
10-
input_file = (
11-
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
12-
)
11+
1312

1413
model = AutoModel(
1514
model=model_dir,
16-
trust_remote_code=True,
15+
vad_model="fsmn-vad",
16+
vad_kwargs={"max_single_segment_time": 30000},
17+
device="cpu",
18+
)
19+
20+
# en
21+
res = model.generate(
22+
input=f"{model.model_path}/example/en.mp3",
23+
cache={},
24+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
25+
use_itn=True,
26+
batch_size_s=60,
27+
merge_vad=True, #
28+
merge_length_s=15,
29+
)
30+
text = rich_transcription_postprocess(res[0]["text"])
31+
print(text)
32+
33+
# zh
34+
res = model.generate(
35+
input=f"{model.model_path}/example/zh.mp3",
36+
cache={},
37+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
38+
use_itn=True,
39+
batch_size_s=60,
40+
merge_vad=True, #
41+
merge_length_s=15,
1742
)
43+
text = rich_transcription_postprocess(res[0]["text"])
44+
print(text)
1845

46+
# yue
1947
res = model.generate(
20-
input=input_file,
48+
input=f"{model.model_path}/example/yue.mp3",
2149
cache={},
22-
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
23-
use_itn=False,
50+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
51+
use_itn=True,
52+
batch_size_s=60,
53+
merge_vad=True, #
54+
merge_length_s=15,
2455
)
56+
text = rich_transcription_postprocess(res[0]["text"])
57+
print(text)
2558

59+
# ja
60+
res = model.generate(
61+
input=f"{model.model_path}/example/ja.mp3",
62+
cache={},
63+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
64+
use_itn=True,
65+
batch_size_s=60,
66+
merge_vad=True, #
67+
merge_length_s=15,
68+
)
2669
text = rich_transcription_postprocess(res[0]["text"])
70+
print(text)
71+
2772

73+
# ko
74+
res = model.generate(
75+
input=f"{model.model_path}/example/ko.mp3",
76+
cache={},
77+
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
78+
use_itn=True,
79+
batch_size_s=60,
80+
merge_vad=True, #
81+
merge_length_s=15,
82+
)
83+
text = rich_transcription_postprocess(res[0]["text"])
2884
print(text)

finetune.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
1010
# model_name from model_hub, or model_dir in local path
1111

1212
## option 1, download model automatically
13-
model_name_or_model_dir="iic/SenseVoiceCTC"
13+
model_name_or_model_dir="iic/SenseVoiceSmall"
1414

1515
## option 2, download model by git
1616
#local_path_root=${workspace}/modelscope_models

0 commit comments

Comments (0)