Update models, simplify inputs
Browse files- README.md +36 -8
- SenseVoiceAx.py +48 -129
- gradio_demo.py +21 -13
- main.py +18 -17
- pe_nonstream.npy +0 -3
- pe_streaming.npy +0 -3
- print_utils.py +0 -131
- requirements.txt +1 -4
- am.mvn → sensevoice_ax630c/am.mvn +0 -0
- chn_jpn_yue_eng_ko_spectok.bpe.model → sensevoice_ax630c/chn_jpn_yue_eng_ko_spectok.bpe.model +0 -0
- sensevoice_ax630c/sensevoice.axmodel +2 -2
- sensevoice_ax630c/streaming_sensevoice.axmodel +2 -2
- sensevoice_ax630c/tokens.txt +0 -0
- sensevoice_ax650/am.mvn +8 -0
- embeddings.npy → sensevoice_ax650/chn_jpn_yue_eng_ko_spectok.bpe.model +2 -2
- sensevoice_ax650/sensevoice.axmodel +2 -2
- sensevoice_ax650/streaming_sensevoice.axmodel +2 -2
- sensevoice_ax650/tokens.txt +0 -0
- server.py +19 -9
- test_wer.py +22 -18
- tokenizer.py +0 -135
README.md
CHANGED
|
@@ -25,12 +25,39 @@ FunASR SenseVoice on Axera, official repo: https://github.com/FunAudioLLM/SenseV
|
|
| 25 |
- [x] AX650N
|
| 26 |
- [x] AX630C
|
| 27 |
|
|
|
|
| 28 |
## 环境安装
|
|
|
|
|
|
|
| 29 |
```
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
```
|
| 32 |
如果空间不足可以使用 --prefix 指定别的安装路径
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
## 使用
|
| 36 |
```
|
|
@@ -48,20 +75,21 @@ python3 main.py -i 输入音频文件
|
|
| 48 |
### 示例:
|
| 49 |
example下有测试音频
|
| 50 |
|
| 51 |
-
如
|
| 52 |
```
|
| 53 |
-
|
| 54 |
```
|
| 55 |
输出
|
| 56 |
```
|
| 57 |
-
RTF: 0.
|
| 58 |
-
|
|
|
|
| 59 |
```
|
| 60 |
|
| 61 |
流式识别
|
| 62 |
|
| 63 |
```
|
| 64 |
-
|
| 65 |
```
|
| 66 |
输出
|
| 67 |
```
|
|
@@ -81,13 +109,13 @@ RTF: 0.03678379235444246
|
|
| 81 |
|
| 82 |
使用WER(Word-Error-Rate)作为评价标准
|
| 83 |
|
| 84 |
-
**WER =
|
| 85 |
|
| 86 |
### 复现测试结果
|
| 87 |
|
| 88 |
```
|
| 89 |
./download_datasets.sh
|
| 90 |
-
python test_wer.py -d datasets -
|
| 91 |
```
|
| 92 |
|
| 93 |
## 技术讨论
|
|
|
|
| 25 |
- [x] AX650N
|
| 26 |
- [x] AX630C
|
| 27 |
|
| 28 |
+
|
| 29 |
## 环境安装
|
| 30 |
+
|
| 31 |
+
推荐在板上安装Miniconda管理虚拟环境,安装方法如下:
|
| 32 |
```
|
| 33 |
+
mkdir -p ~/miniconda3
|
| 34 |
+
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh -O ~/miniconda3/miniconda.sh
|
| 35 |
+
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
|
| 36 |
+
rm ~/miniconda3/miniconda.sh
|
| 37 |
+
|
| 38 |
+
source ~/miniconda3/bin/activate
|
| 39 |
+
|
| 40 |
+
conda init --all
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
```
|
| 44 |
+
sudo apt-get install libsndfile-dev
|
| 45 |
+
|
| 46 |
+
conda create -n sensevoice python=3.12
|
| 47 |
+
conda activate sensevoice
|
| 48 |
+
pip install -r requirements.txt
|
| 49 |
```
|
| 50 |
如果空间不足可以使用 --prefix 指定别的安装路径
|
| 51 |
|
| 52 |
+
#### 安装pyaxengine
|
| 53 |
+
|
| 54 |
+
参考 https://github.com/AXERA-TECH/pyaxengine 安装 NPU Python API
|
| 55 |
+
|
| 56 |
+
在0.1.3rc2上测试通过,可通过
|
| 57 |
+
```
|
| 58 |
+
pip install https://github.com/AXERA-TECH/pyaxengine/releases/download/0.1.3.rc2/axengine-0.1.3-py3-none-any.whl
|
| 59 |
+
```
|
| 60 |
+
安装,或把版本号更改为你想使用的版本
|
| 61 |
|
| 62 |
## 使用
|
| 63 |
```
|
|
|
|
| 75 |
### 示例:
|
| 76 |
example下有测试音频
|
| 77 |
|
| 78 |
+
如 中文测试
|
| 79 |
```
|
| 80 |
+
python main.py -i example/zh.mp3
|
| 81 |
```
|
| 82 |
输出
|
| 83 |
```
|
| 84 |
+
RTF: 0.04386647134764582 Latency: 0.2463541030883789s Total length: 5.616s
|
| 85 |
+
ASR result: 开饭时间早上九点至下午五点
|
| 86 |
+
|
| 87 |
```
|
| 88 |
|
| 89 |
流式识别
|
| 90 |
|
| 91 |
```
|
| 92 |
+
python main.py -i example/zh.mp3 --streaming
|
| 93 |
```
|
| 94 |
输出
|
| 95 |
```
|
|
|
|
| 109 |
|
| 110 |
使用WER(Word-Error-Rate)作为评价标准
|
| 111 |
|
| 112 |
+
**WER = 2.0%**
|
| 113 |
|
| 114 |
### 复现测试结果
|
| 115 |
|
| 116 |
```
|
| 117 |
./download_datasets.sh
|
| 118 |
+
python test_wer.py -d aishell -g datasets/ground_truth.txt --language zh
|
| 119 |
```
|
| 120 |
|
| 121 |
## 技术讨论
|
SenseVoiceAx.py
CHANGED
|
@@ -2,44 +2,14 @@ import axengine as axe
|
|
| 2 |
import numpy as np
|
| 3 |
import librosa
|
| 4 |
from frontend import WavFrontend
|
| 5 |
-
import os
|
| 6 |
import time
|
| 7 |
from typing import List, Union, Optional
|
| 8 |
from asr_decoder import CTCDecoder
|
| 9 |
-
from tokenizer import SentencepiecesTokenizer
|
| 10 |
from online_fbank import OnlineFbank
|
| 11 |
import torch
|
| 12 |
|
| 13 |
|
| 14 |
-
def
|
| 15 |
-
# 如果 maxlen 未指定,则取 lengths 中的最大值
|
| 16 |
-
if maxlen is None:
|
| 17 |
-
maxlen = np.max(lengths)
|
| 18 |
-
|
| 19 |
-
# 创建一个从 0 到 maxlen-1 的行向量
|
| 20 |
-
row_vector = np.arange(0, maxlen, 1)
|
| 21 |
-
|
| 22 |
-
# 将 lengths 转换为列向量
|
| 23 |
-
matrix = np.expand_dims(lengths, axis=-1)
|
| 24 |
-
|
| 25 |
-
# 比较生成掩码
|
| 26 |
-
mask = row_vector < matrix
|
| 27 |
-
if mask.shape[-1] < lengths[0]:
|
| 28 |
-
mask = np.concatenate(
|
| 29 |
-
[
|
| 30 |
-
mask,
|
| 31 |
-
np.zeros(
|
| 32 |
-
(mask.shape[0], lengths[0] - mask.shape[-1]), dtype=np.float32
|
| 33 |
-
),
|
| 34 |
-
],
|
| 35 |
-
axis=-1,
|
| 36 |
-
)
|
| 37 |
-
|
| 38 |
-
# 返回指定数据类型的掩码
|
| 39 |
-
return mask.astype(dtype)[None, ...]
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
def unique_consecutive_np(arr):
|
| 43 |
"""
|
| 44 |
找出数组中连续的唯一值,模拟 torch.unique_consecutive(yseq, dim=-1)
|
| 45 |
|
|
@@ -74,13 +44,14 @@ class SenseVoiceAx:
|
|
| 74 |
def __init__(
|
| 75 |
self,
|
| 76 |
model_path: str,
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
| 78 |
beam_size: int = 3,
|
| 79 |
-
language: str = "auto",
|
| 80 |
hot_words: Optional[List[str]] = None,
|
| 81 |
-
use_itn: bool = True,
|
| 82 |
streaming: bool = False,
|
| 83 |
-
providers=[
|
| 84 |
):
|
| 85 |
"""
|
| 86 |
Initialize SenseVoiceAx
|
|
@@ -99,23 +70,8 @@ class SenseVoiceAx:
|
|
| 99 |
Use stream_infer method if streaming is true otherwise infer.
|
| 100 |
|
| 101 |
"""
|
| 102 |
-
model_path_root = os.path.dirname(model_path)
|
| 103 |
-
emb_path = os.path.join(model_path_root, "../embeddings.npy")
|
| 104 |
-
cmvn_file = os.path.join(model_path_root, "../am.mvn")
|
| 105 |
-
bpe_model = os.path.join(
|
| 106 |
-
model_path_root, "../chn_jpn_yue_eng_ko_spectok.bpe.model"
|
| 107 |
-
)
|
| 108 |
-
if streaming:
|
| 109 |
-
self.position_encoding = np.load(
|
| 110 |
-
os.path.join(model_path_root, "../pe_streaming.npy")
|
| 111 |
-
)
|
| 112 |
-
else:
|
| 113 |
-
self.position_encoding = np.load(
|
| 114 |
-
os.path.join(model_path_root, "../pe_nonstream.npy")
|
| 115 |
-
)
|
| 116 |
|
| 117 |
self.streaming = streaming
|
| 118 |
-
self.tokenizer = SentencepiecesTokenizer(bpemodel=bpe_model)
|
| 119 |
|
| 120 |
self.frontend = WavFrontend(
|
| 121 |
cmvn_file=cmvn_file,
|
|
@@ -127,12 +83,15 @@ class SenseVoiceAx:
|
|
| 127 |
lfr_m=7,
|
| 128 |
lfr_n=6,
|
| 129 |
)
|
|
|
|
| 130 |
self.model = axe.InferenceSession(model_path, providers=providers)
|
| 131 |
self.sample_rate = 16000
|
| 132 |
self.blank_id = 0
|
| 133 |
-
self.
|
| 134 |
self.padding = 16
|
| 135 |
self.input_size = 560
|
|
|
|
|
|
|
| 136 |
|
| 137 |
self.lid_dict = {
|
| 138 |
"auto": 0,
|
|
@@ -143,33 +102,13 @@ class SenseVoiceAx:
|
|
| 143 |
"ko": 12,
|
| 144 |
"nospeech": 13,
|
| 145 |
}
|
| 146 |
-
self.lid_int_dict = {
|
| 147 |
-
24884: 3,
|
| 148 |
-
24885: 4,
|
| 149 |
-
24888: 7,
|
| 150 |
-
24892: 11,
|
| 151 |
-
24896: 12,
|
| 152 |
-
24992: 13,
|
| 153 |
-
}
|
| 154 |
-
self.textnorm_dict = {"withitn": 14, "woitn": 15}
|
| 155 |
-
self.textnorm_int_dict = {25016: 14, 25017: 15}
|
| 156 |
-
self.emo_dict = {
|
| 157 |
-
"unk": 25009,
|
| 158 |
-
"happy": 25001,
|
| 159 |
-
"sad": 25002,
|
| 160 |
-
"angry": 25003,
|
| 161 |
-
"neutral": 25004,
|
| 162 |
-
}
|
| 163 |
-
|
| 164 |
-
self.load_embeddings(emb_path, language, use_itn)
|
| 165 |
-
self.language = language
|
| 166 |
|
| 167 |
# decoder
|
| 168 |
if beam_size > 1 and hot_words is not None:
|
| 169 |
self.beam_size = beam_size
|
| 170 |
symbol_table = {}
|
| 171 |
-
for i in range(self.
|
| 172 |
-
symbol_table[self.
|
| 173 |
self.decoder = CTCDecoder(hot_words, symbol_table, bpe_model)
|
| 174 |
else:
|
| 175 |
self.beam_size = 1
|
|
@@ -177,8 +116,8 @@ class SenseVoiceAx:
|
|
| 177 |
|
| 178 |
if streaming:
|
| 179 |
self.cur_idx = -1
|
| 180 |
-
self.chunk_size =
|
| 181 |
-
self.caches_shape = (
|
| 182 |
self.caches = np.zeros(self.caches_shape, dtype=np.float32)
|
| 183 |
self.zeros = np.zeros((1, self.input_size), dtype=np.float32)
|
| 184 |
self.neg_mean, self.inv_stddev = (
|
|
@@ -187,38 +126,25 @@ class SenseVoiceAx:
|
|
| 187 |
)
|
| 188 |
|
| 189 |
self.fbank = OnlineFbank(window_type="hamming")
|
| 190 |
-
self.
|
| 191 |
-
|
| 192 |
-
maxlen=self.max_len,
|
| 193 |
-
dtype=np.float32,
|
| 194 |
)
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
@property
|
| 197 |
def language_options(self):
|
| 198 |
return list(self.lid_dict.keys())
|
| 199 |
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
def load_embeddings(self, emb_path, language, use_itn):
|
| 205 |
-
self.embeddings = np.load(emb_path, allow_pickle=True).item()
|
| 206 |
-
self.language_query = self.embeddings[language]
|
| 207 |
-
self.textnorm_query = (
|
| 208 |
-
self.embeddings["withitn"] if use_itn else self.embeddings["woitn"]
|
| 209 |
-
)
|
| 210 |
-
self.event_emo_query = self.embeddings["event_emo"]
|
| 211 |
-
self.input_query = np.concatenate(
|
| 212 |
-
(self.textnorm_query, self.language_query, self.event_emo_query), axis=1
|
| 213 |
-
)
|
| 214 |
-
self.query_num = self.input_query.shape[1]
|
| 215 |
-
|
| 216 |
-
def choose_language(self, language):
|
| 217 |
-
self.language_query = self.embeddings[language]
|
| 218 |
-
self.input_query = np.concatenate(
|
| 219 |
-
(self.textnorm_query, self.language_query, self.event_emo_query), axis=1
|
| 220 |
-
)
|
| 221 |
-
self.language = language
|
| 222 |
|
| 223 |
def load_data(self, filepath: str) -> np.ndarray:
|
| 224 |
waveform, _ = librosa.load(filepath, sr=self.sample_rate)
|
|
@@ -254,7 +180,7 @@ class SenseVoiceAx:
|
|
| 254 |
yseq = np.argmax(x, axis=-1)
|
| 255 |
|
| 256 |
# 去除连续重复元素
|
| 257 |
-
yseq =
|
| 258 |
|
| 259 |
# 创建掩码并过滤 blank_id
|
| 260 |
mask = yseq != self.blank_id
|
|
@@ -263,16 +189,16 @@ class SenseVoiceAx:
|
|
| 263 |
return token_int
|
| 264 |
|
| 265 |
def infer_waveform(self, waveform: np.ndarray, language="auto"):
|
| 266 |
-
if language != self.language:
|
| 267 |
-
self.choose_language(language)
|
| 268 |
-
|
| 269 |
# start = time.time()
|
| 270 |
feat, feat_len = self.preprocess(waveform)
|
| 271 |
# print(f"Preprocess take {time.time() - start}s")
|
| 272 |
|
| 273 |
-
slice_len = self.
|
| 274 |
slice_num = int(np.ceil(feat.shape[1] / slice_len))
|
| 275 |
|
|
|
|
|
|
|
|
|
|
| 276 |
asr_res = []
|
| 277 |
for i in range(slice_num):
|
| 278 |
if i == 0:
|
|
@@ -283,46 +209,39 @@ class SenseVoiceAx:
|
|
| 283 |
i * slice_len - self.padding : (i + 1) * slice_len - self.padding,
|
| 284 |
:,
|
| 285 |
]
|
| 286 |
-
|
| 287 |
-
sub_feat = np.concatenate([self.input_query, sub_feat], axis=1)
|
| 288 |
real_len = sub_feat.shape[1]
|
| 289 |
-
if real_len < self.
|
| 290 |
sub_feat = np.concatenate(
|
| 291 |
[
|
| 292 |
sub_feat,
|
| 293 |
np.zeros(
|
| 294 |
-
(1, self.
|
| 295 |
dtype=np.float32,
|
| 296 |
),
|
| 297 |
],
|
| 298 |
axis=1,
|
| 299 |
)
|
| 300 |
|
| 301 |
-
|
| 302 |
-
np.array([self.max_len], dtype=np.int32),
|
| 303 |
-
maxlen=real_len,
|
| 304 |
-
dtype=np.float32,
|
| 305 |
-
)
|
| 306 |
|
| 307 |
# start = time.time()
|
| 308 |
outputs = self.model.run(
|
| 309 |
None,
|
| 310 |
{
|
| 311 |
"speech": sub_feat,
|
| 312 |
-
"
|
| 313 |
-
"
|
| 314 |
},
|
| 315 |
)
|
| 316 |
ctc_logits, encoder_out_lens = outputs
|
| 317 |
|
| 318 |
token_int = self.postprocess(ctc_logits, encoder_out_lens)
|
| 319 |
|
| 320 |
-
|
| 321 |
-
asr_res.append(self.tokenizer.tokens2text(token_int))
|
| 322 |
-
else:
|
| 323 |
-
asr_res.append(token_int)
|
| 324 |
|
| 325 |
-
|
|
|
|
| 326 |
|
| 327 |
def infer(
|
| 328 |
self, filepath_or_data: Union[np.ndarray, str], language="auto", print_rtf=False
|
|
@@ -343,15 +262,15 @@ class SenseVoiceAx:
|
|
| 343 |
if print_rtf:
|
| 344 |
rtf = latency / total_time
|
| 345 |
print(f"RTF: {rtf} Latency: {latency}s Total length: {total_time}s")
|
| 346 |
-
return
|
| 347 |
|
| 348 |
def decode(self, times, tokens):
|
| 349 |
times_ms = []
|
| 350 |
for step, token in zip(times, tokens):
|
| 351 |
-
if len(self.
|
| 352 |
continue
|
| 353 |
times_ms.append(step * 60)
|
| 354 |
-
return times_ms,
|
| 355 |
|
| 356 |
def reset(self):
|
| 357 |
self.cur_idx = -1
|
|
@@ -368,8 +287,8 @@ class SenseVoiceAx:
|
|
| 368 |
def stream_infer(self, audio, is_last, language="auto"):
|
| 369 |
assert self.streaming, "This method is for streaming model"
|
| 370 |
|
| 371 |
-
|
| 372 |
-
|
| 373 |
|
| 374 |
self.fbank.accept_waveform(audio, is_last)
|
| 375 |
features = self.fbank.get_lfr_frames(
|
|
@@ -393,8 +312,8 @@ class SenseVoiceAx:
|
|
| 393 |
None,
|
| 394 |
{
|
| 395 |
"speech": speech,
|
| 396 |
-
"
|
| 397 |
-
"
|
| 398 |
},
|
| 399 |
)
|
| 400 |
ctc_logits, encoder_out_lens = outputs
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import librosa
|
| 4 |
from frontend import WavFrontend
|
|
|
|
| 5 |
import time
|
| 6 |
from typing import List, Union, Optional
|
| 7 |
from asr_decoder import CTCDecoder
|
|
|
|
| 8 |
from online_fbank import OnlineFbank
|
| 9 |
import torch
|
| 10 |
|
| 11 |
|
| 12 |
+
def unique_consecutive(arr):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"""
|
| 14 |
找出数组中连续的唯一值,模拟 torch.unique_consecutive(yseq, dim=-1)
|
| 15 |
|
|
|
|
| 44 |
def __init__(
|
| 45 |
self,
|
| 46 |
model_path: str,
|
| 47 |
+
cmvn_file: str,
|
| 48 |
+
token_file: str,
|
| 49 |
+
bpe_model: str = None,
|
| 50 |
+
max_seq_len: int = 256,
|
| 51 |
beam_size: int = 3,
|
|
|
|
| 52 |
hot_words: Optional[List[str]] = None,
|
|
|
|
| 53 |
streaming: bool = False,
|
| 54 |
+
providers=["AxEngineExecutionProvider"],
|
| 55 |
):
|
| 56 |
"""
|
| 57 |
Initialize SenseVoiceAx
|
|
|
|
| 70 |
Use stream_infer method if streaming is true otherwise infer.
|
| 71 |
|
| 72 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
self.streaming = streaming
|
|
|
|
| 75 |
|
| 76 |
self.frontend = WavFrontend(
|
| 77 |
cmvn_file=cmvn_file,
|
|
|
|
| 83 |
lfr_m=7,
|
| 84 |
lfr_n=6,
|
| 85 |
)
|
| 86 |
+
|
| 87 |
self.model = axe.InferenceSession(model_path, providers=providers)
|
| 88 |
self.sample_rate = 16000
|
| 89 |
self.blank_id = 0
|
| 90 |
+
self.max_seq_len = max_seq_len
|
| 91 |
self.padding = 16
|
| 92 |
self.input_size = 560
|
| 93 |
+
self.query_num = 4
|
| 94 |
+
self.tokens = self.load_tokens(token_file)
|
| 95 |
|
| 96 |
self.lid_dict = {
|
| 97 |
"auto": 0,
|
|
|
|
| 102 |
"ko": 12,
|
| 103 |
"nospeech": 13,
|
| 104 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
# decoder
|
| 107 |
if beam_size > 1 and hot_words is not None:
|
| 108 |
self.beam_size = beam_size
|
| 109 |
symbol_table = {}
|
| 110 |
+
for i in range(len(self.tokens)):
|
| 111 |
+
symbol_table[self.tokens[i]] = i
|
| 112 |
self.decoder = CTCDecoder(hot_words, symbol_table, bpe_model)
|
| 113 |
else:
|
| 114 |
self.beam_size = 1
|
|
|
|
| 116 |
|
| 117 |
if streaming:
|
| 118 |
self.cur_idx = -1
|
| 119 |
+
self.chunk_size = max_seq_len - self.padding
|
| 120 |
+
self.caches_shape = (max_seq_len, self.input_size)
|
| 121 |
self.caches = np.zeros(self.caches_shape, dtype=np.float32)
|
| 122 |
self.zeros = np.zeros((1, self.input_size), dtype=np.float32)
|
| 123 |
self.neg_mean, self.inv_stddev = (
|
|
|
|
| 126 |
)
|
| 127 |
|
| 128 |
self.fbank = OnlineFbank(window_type="hamming")
|
| 129 |
+
self.stream_mask = self.sequence_mask(
|
| 130 |
+
max_seq_len + self.query_num, max_seq_len + self.query_num
|
|
|
|
|
|
|
| 131 |
)
|
| 132 |
|
| 133 |
+
def load_tokens(self, token_file):
|
| 134 |
+
tokens = []
|
| 135 |
+
with open(token_file, "r") as f:
|
| 136 |
+
for line in f:
|
| 137 |
+
tokens.append(line[:-1])
|
| 138 |
+
return tokens
|
| 139 |
+
|
| 140 |
@property
|
| 141 |
def language_options(self):
|
| 142 |
return list(self.lid_dict.keys())
|
| 143 |
|
| 144 |
+
def sequence_mask(self, max_seq_len, actual_seq_len):
|
| 145 |
+
mask = np.zeros((1, 1, max_seq_len), dtype=np.int32)
|
| 146 |
+
mask[:, :, :actual_seq_len] = 1
|
| 147 |
+
return mask
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
def load_data(self, filepath: str) -> np.ndarray:
|
| 150 |
waveform, _ = librosa.load(filepath, sr=self.sample_rate)
|
|
|
|
| 180 |
yseq = np.argmax(x, axis=-1)
|
| 181 |
|
| 182 |
# 去除连续重复元素
|
| 183 |
+
yseq = unique_consecutive(yseq)
|
| 184 |
|
| 185 |
# 创建掩码并过滤 blank_id
|
| 186 |
mask = yseq != self.blank_id
|
|
|
|
| 189 |
return token_int
|
| 190 |
|
| 191 |
def infer_waveform(self, waveform: np.ndarray, language="auto"):
|
|
|
|
|
|
|
|
|
|
| 192 |
# start = time.time()
|
| 193 |
feat, feat_len = self.preprocess(waveform)
|
| 194 |
# print(f"Preprocess take {time.time() - start}s")
|
| 195 |
|
| 196 |
+
slice_len = self.max_seq_len - self.query_num
|
| 197 |
slice_num = int(np.ceil(feat.shape[1] / slice_len))
|
| 198 |
|
| 199 |
+
language_token = self.lid_dict[language]
|
| 200 |
+
language_token = np.array([language_token], dtype=np.int32)
|
| 201 |
+
|
| 202 |
asr_res = []
|
| 203 |
for i in range(slice_num):
|
| 204 |
if i == 0:
|
|
|
|
| 209 |
i * slice_len - self.padding : (i + 1) * slice_len - self.padding,
|
| 210 |
:,
|
| 211 |
]
|
| 212 |
+
|
|
|
|
| 213 |
real_len = sub_feat.shape[1]
|
| 214 |
+
if real_len < self.max_seq_len:
|
| 215 |
sub_feat = np.concatenate(
|
| 216 |
[
|
| 217 |
sub_feat,
|
| 218 |
np.zeros(
|
| 219 |
+
(1, self.max_seq_len - real_len, sub_feat.shape[-1]),
|
| 220 |
dtype=np.float32,
|
| 221 |
),
|
| 222 |
],
|
| 223 |
axis=1,
|
| 224 |
)
|
| 225 |
|
| 226 |
+
mask = self.sequence_mask(self.max_seq_len + self.query_num, real_len)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
# start = time.time()
|
| 229 |
outputs = self.model.run(
|
| 230 |
None,
|
| 231 |
{
|
| 232 |
"speech": sub_feat,
|
| 233 |
+
"mask": mask,
|
| 234 |
+
"language": language_token,
|
| 235 |
},
|
| 236 |
)
|
| 237 |
ctc_logits, encoder_out_lens = outputs
|
| 238 |
|
| 239 |
token_int = self.postprocess(ctc_logits, encoder_out_lens)
|
| 240 |
|
| 241 |
+
asr_res.extend(token_int)
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
+
text = "".join([self.tokens[i] for i in asr_res])
|
| 244 |
+
return text
|
| 245 |
|
| 246 |
def infer(
|
| 247 |
self, filepath_or_data: Union[np.ndarray, str], language="auto", print_rtf=False
|
|
|
|
| 262 |
if print_rtf:
|
| 263 |
rtf = latency / total_time
|
| 264 |
print(f"RTF: {rtf} Latency: {latency}s Total length: {total_time}s")
|
| 265 |
+
return asr_res
|
| 266 |
|
| 267 |
def decode(self, times, tokens):
|
| 268 |
times_ms = []
|
| 269 |
for step, token in zip(times, tokens):
|
| 270 |
+
if len(self.tokens[token].strip()) == 0:
|
| 271 |
continue
|
| 272 |
times_ms.append(step * 60)
|
| 273 |
+
return times_ms, "".join([self.tokens[i] for i in tokens])
|
| 274 |
|
| 275 |
def reset(self):
|
| 276 |
self.cur_idx = -1
|
|
|
|
| 287 |
def stream_infer(self, audio, is_last, language="auto"):
|
| 288 |
assert self.streaming, "This method is for streaming model"
|
| 289 |
|
| 290 |
+
language_token = self.lid_dict[language]
|
| 291 |
+
language_token = np.array([language_token], dtype=np.int32)
|
| 292 |
|
| 293 |
self.fbank.accept_waveform(audio, is_last)
|
| 294 |
features = self.fbank.get_lfr_frames(
|
|
|
|
| 312 |
None,
|
| 313 |
{
|
| 314 |
"speech": speech,
|
| 315 |
+
"mask": self.stream_mask,
|
| 316 |
+
"language": language_token,
|
| 317 |
},
|
| 318 |
)
|
| 319 |
ctc_logits, encoder_out_lens = outputs
|
gradio_demo.py
CHANGED
|
@@ -1,25 +1,31 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
from SenseVoiceAx import SenseVoiceAx
|
| 4 |
-
from
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
assert os.path.exists(model_path), f"model {model_path} not exist"
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
model_path,
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
beam_size=3,
|
| 16 |
-
language="auto",
|
| 17 |
hot_words=None,
|
| 18 |
-
use_itn=True,
|
| 19 |
streaming=False,
|
| 20 |
)
|
| 21 |
|
| 22 |
-
|
| 23 |
def speech_to_text(audio_path, lang):
|
| 24 |
"""
|
| 25 |
audio_path: 音频文件路径
|
|
@@ -28,9 +34,7 @@ def speech_to_text(audio_path, lang):
|
|
| 28 |
if not audio_path:
|
| 29 |
return "无音频"
|
| 30 |
|
| 31 |
-
|
| 32 |
-
asr_res = pipeline.infer(audio_path, print_rtf=False)
|
| 33 |
-
|
| 34 |
return asr_res
|
| 35 |
|
| 36 |
|
|
@@ -41,7 +45,7 @@ def main():
|
|
| 41 |
|
| 42 |
with gr.Row():
|
| 43 |
audio_input = gr.Audio(
|
| 44 |
-
sources=["
|
| 45 |
)
|
| 46 |
lang_dropdown = gr.Dropdown(
|
| 47 |
choices=["auto", "zh", "en", "yue", "ja", "ko"],
|
|
@@ -55,6 +59,10 @@ def main():
|
|
| 55 |
|
| 56 |
demo.launch(
|
| 57 |
server_name="0.0.0.0",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
)
|
| 59 |
|
| 60 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
from SenseVoiceAx import SenseVoiceAx
|
| 4 |
+
from download_utils import download_model
|
| 5 |
|
| 6 |
+
model_root = download_model("SenseVoice")
|
| 7 |
+
model_root = os.path.join(model_root, "sensevoice_ax650")
|
| 8 |
+
max_seq_len = 256
|
| 9 |
+
model_path = os.path.join(model_root, "sensevoice.axmodel")
|
| 10 |
|
| 11 |
assert os.path.exists(model_path), f"model {model_path} not exist"
|
| 12 |
|
| 13 |
+
cmvn_file = os.path.join(model_root, "am.mvn")
|
| 14 |
+
bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
|
| 15 |
+
token_file = os.path.join(model_root, "tokens.txt")
|
| 16 |
+
|
| 17 |
+
model = SenseVoiceAx(
|
| 18 |
model_path,
|
| 19 |
+
cmvn_file,
|
| 20 |
+
token_file,
|
| 21 |
+
bpe_model,
|
| 22 |
+
max_seq_len=max_seq_len,
|
| 23 |
beam_size=3,
|
|
|
|
| 24 |
hot_words=None,
|
|
|
|
| 25 |
streaming=False,
|
| 26 |
)
|
| 27 |
|
| 28 |
+
# 你实现的语音转文本函数
|
| 29 |
def speech_to_text(audio_path, lang):
|
| 30 |
"""
|
| 31 |
audio_path: 音频文件路径
|
|
|
|
| 34 |
if not audio_path:
|
| 35 |
return "无音频"
|
| 36 |
|
| 37 |
+
asr_res = model.infer(audio_path, lang, print_rtf=False)
|
|
|
|
|
|
|
| 38 |
return asr_res
|
| 39 |
|
| 40 |
|
|
|
|
| 45 |
|
| 46 |
with gr.Row():
|
| 47 |
audio_input = gr.Audio(
|
| 48 |
+
sources=["microphone"], type="filepath", label="录制或上传音频", format="mp3"
|
| 49 |
)
|
| 50 |
lang_dropdown = gr.Dropdown(
|
| 51 |
choices=["auto", "zh", "en", "yue", "ja", "ko"],
|
|
|
|
| 59 |
|
| 60 |
demo.launch(
|
| 61 |
server_name="0.0.0.0",
|
| 62 |
+
server_port=7860,
|
| 63 |
+
ssl_certfile="./cert.pem",
|
| 64 |
+
ssl_keyfile="./key.pem",
|
| 65 |
+
ssl_verify=False,
|
| 66 |
)
|
| 67 |
|
| 68 |
|
main.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
| 2 |
import argparse
|
| 3 |
from SenseVoiceAx import SenseVoiceAx
|
| 4 |
import librosa
|
| 5 |
-
|
| 6 |
import time
|
| 7 |
|
| 8 |
|
|
@@ -25,37 +25,38 @@ def get_args():
|
|
| 25 |
|
| 26 |
def main():
|
| 27 |
args = get_args()
|
|
|
|
| 28 |
|
| 29 |
input_audio = args.input
|
| 30 |
language = args.language
|
| 31 |
-
|
|
|
|
| 32 |
if not args.streaming:
|
| 33 |
-
|
| 34 |
-
model_path = os.path.join(
|
| 35 |
else:
|
| 36 |
-
|
| 37 |
-
model_path = os.path.join(
|
| 38 |
|
| 39 |
assert os.path.exists(model_path), f"model {model_path} not exist"
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
print(f"model_path: {model_path}")
|
| 45 |
-
print(f"streaming: {args.streaming}")
|
| 46 |
|
| 47 |
-
|
| 48 |
model_path,
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
| 50 |
beam_size=3,
|
| 51 |
-
language="auto",
|
| 52 |
hot_words=None,
|
| 53 |
-
use_itn=True,
|
| 54 |
streaming=args.streaming,
|
| 55 |
)
|
| 56 |
|
| 57 |
if not args.streaming:
|
| 58 |
-
asr_res =
|
| 59 |
print("ASR result: " + asr_res)
|
| 60 |
else:
|
| 61 |
samples, sr = librosa.load(input_audio, sr=16000)
|
|
@@ -66,7 +67,7 @@ def main():
|
|
| 66 |
step = int(0.1 * sr)
|
| 67 |
for i in range(0, len(samples), step):
|
| 68 |
is_last = i + step >= len(samples)
|
| 69 |
-
for res in
|
| 70 |
print(res)
|
| 71 |
|
| 72 |
end = time.time()
|
|
|
|
| 2 |
import argparse
|
| 3 |
from SenseVoiceAx import SenseVoiceAx
|
| 4 |
import librosa
|
| 5 |
+
from download_utils import download_model
|
| 6 |
import time
|
| 7 |
|
| 8 |
|
|
|
|
| 25 |
|
| 26 |
def main():
|
| 27 |
args = get_args()
|
| 28 |
+
print(vars(args))
|
| 29 |
|
| 30 |
input_audio = args.input
|
| 31 |
language = args.language
|
| 32 |
+
model_root = download_model("SenseVoice")
|
| 33 |
+
model_root = os.path.join(model_root, "sensevoice_ax650")
|
| 34 |
if not args.streaming:
|
| 35 |
+
max_seq_len = 256
|
| 36 |
+
model_path = os.path.join(model_root, "sensevoice.axmodel")
|
| 37 |
else:
|
| 38 |
+
max_seq_len = 26
|
| 39 |
+
model_path = os.path.join(model_root, "streaming_sensevoice.axmodel")
|
| 40 |
|
| 41 |
assert os.path.exists(model_path), f"model {model_path} not exist"
|
| 42 |
|
| 43 |
+
cmvn_file = os.path.join(model_root, "am.mvn")
|
| 44 |
+
bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
|
| 45 |
+
token_file = os.path.join(model_root, "tokens.txt")
|
|
|
|
|
|
|
| 46 |
|
| 47 |
+
model = SenseVoiceAx(
|
| 48 |
model_path,
|
| 49 |
+
cmvn_file,
|
| 50 |
+
token_file,
|
| 51 |
+
bpe_model,
|
| 52 |
+
max_seq_len=max_seq_len,
|
| 53 |
beam_size=3,
|
|
|
|
| 54 |
hot_words=None,
|
|
|
|
| 55 |
streaming=args.streaming,
|
| 56 |
)
|
| 57 |
|
| 58 |
if not args.streaming:
|
| 59 |
+
asr_res = model.infer(input_audio, language, print_rtf=True)
|
| 60 |
print("ASR result: " + asr_res)
|
| 61 |
else:
|
| 62 |
samples, sr = librosa.load(input_audio, sr=16000)
|
|
|
|
| 67 |
step = int(0.1 * sr)
|
| 68 |
for i in range(0, len(samples), step):
|
| 69 |
is_last = i + step >= len(samples)
|
| 70 |
+
for res in model.stream_infer(samples[i : i + step], is_last, language):
|
| 71 |
print(res)
|
| 72 |
|
| 73 |
end = time.time()
|
pe_nonstream.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0f1c9c550bd62fa164a959517f52d46a28591812fafdf002df0df2bd998f44b5
|
| 3 |
-
size 573568
|
|
|
|
|
|
|
|
|
|
|
|
pe_streaming.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:54fec2fe2670168d36678c5857e65c459c634e6b6d6df928b7d415399ce2c291
|
| 3 |
-
size 58368
|
|
|
|
|
|
|
|
|
|
|
|
print_utils.py
DELETED
|
@@ -1,131 +0,0 @@
|
|
| 1 |
-
emo_dict = {
|
| 2 |
-
"<|HAPPY|>": "😊",
|
| 3 |
-
"<|SAD|>": "😔",
|
| 4 |
-
"<|ANGRY|>": "😡",
|
| 5 |
-
"<|NEUTRAL|>": "",
|
| 6 |
-
"<|FEARFUL|>": "😰",
|
| 7 |
-
"<|DISGUSTED|>": "🤢",
|
| 8 |
-
"<|SURPRISED|>": "😮",
|
| 9 |
-
}
|
| 10 |
-
|
| 11 |
-
event_dict = {
|
| 12 |
-
"<|BGM|>": "🎼",
|
| 13 |
-
"<|Speech|>": "",
|
| 14 |
-
"<|Applause|>": "👏",
|
| 15 |
-
"<|Laughter|>": "😀",
|
| 16 |
-
"<|Cry|>": "😭",
|
| 17 |
-
"<|Sneeze|>": "🤧",
|
| 18 |
-
"<|Breath|>": "",
|
| 19 |
-
"<|Cough|>": "🤧",
|
| 20 |
-
}
|
| 21 |
-
|
| 22 |
-
lang_dict = {
|
| 23 |
-
"<|zh|>": "<|lang|>",
|
| 24 |
-
"<|en|>": "<|lang|>",
|
| 25 |
-
"<|yue|>": "<|lang|>",
|
| 26 |
-
"<|ja|>": "<|lang|>",
|
| 27 |
-
"<|ko|>": "<|lang|>",
|
| 28 |
-
"<|nospeech|>": "<|lang|>",
|
| 29 |
-
}
|
| 30 |
-
|
| 31 |
-
emoji_dict = {
|
| 32 |
-
"<|nospeech|><|Event_UNK|>": "❓",
|
| 33 |
-
"<|zh|>": "",
|
| 34 |
-
"<|en|>": "",
|
| 35 |
-
"<|yue|>": "",
|
| 36 |
-
"<|ja|>": "",
|
| 37 |
-
"<|ko|>": "",
|
| 38 |
-
"<|nospeech|>": "",
|
| 39 |
-
"<|HAPPY|>": "😊",
|
| 40 |
-
"<|SAD|>": "😔",
|
| 41 |
-
"<|ANGRY|>": "😡",
|
| 42 |
-
"<|NEUTRAL|>": "",
|
| 43 |
-
"<|BGM|>": "🎼",
|
| 44 |
-
"<|Speech|>": "",
|
| 45 |
-
"<|Applause|>": "👏",
|
| 46 |
-
"<|Laughter|>": "😀",
|
| 47 |
-
"<|FEARFUL|>": "😰",
|
| 48 |
-
"<|DISGUSTED|>": "🤢",
|
| 49 |
-
"<|SURPRISED|>": "😮",
|
| 50 |
-
"<|Cry|>": "😭",
|
| 51 |
-
"<|EMO_UNKNOWN|>": "",
|
| 52 |
-
"<|Sneeze|>": "🤧",
|
| 53 |
-
"<|Breath|>": "",
|
| 54 |
-
"<|Cough|>": "😷",
|
| 55 |
-
"<|Sing|>": "",
|
| 56 |
-
"<|Speech_Noise|>": "",
|
| 57 |
-
"<|withitn|>": "",
|
| 58 |
-
"<|woitn|>": "",
|
| 59 |
-
"<|GBG|>": "",
|
| 60 |
-
"<|Event_UNK|>": "",
|
| 61 |
-
}
|
| 62 |
-
|
| 63 |
-
emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
|
| 64 |
-
event_set = {
|
| 65 |
-
"🎼",
|
| 66 |
-
"👏",
|
| 67 |
-
"😀",
|
| 68 |
-
"😭",
|
| 69 |
-
"🤧",
|
| 70 |
-
"😷",
|
| 71 |
-
}
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
def format_str_v2(s):
|
| 75 |
-
sptk_dict = {}
|
| 76 |
-
for sptk in emoji_dict:
|
| 77 |
-
sptk_dict[sptk] = s.count(sptk)
|
| 78 |
-
s = s.replace(sptk, "")
|
| 79 |
-
emo = "<|NEUTRAL|>"
|
| 80 |
-
for e in emo_dict:
|
| 81 |
-
if sptk_dict[e] > sptk_dict[emo]:
|
| 82 |
-
emo = e
|
| 83 |
-
for e in event_dict:
|
| 84 |
-
if sptk_dict[e] > 0:
|
| 85 |
-
s = event_dict[e] + s
|
| 86 |
-
s = s + emo_dict[emo]
|
| 87 |
-
|
| 88 |
-
for emoji in emo_set.union(event_set):
|
| 89 |
-
s = s.replace(" " + emoji, emoji)
|
| 90 |
-
s = s.replace(emoji + " ", emoji)
|
| 91 |
-
return s.strip()
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
def rich_transcription_postprocess(s):
|
| 95 |
-
def get_emo(s):
|
| 96 |
-
return s[-1] if s[-1] in emo_set else None
|
| 97 |
-
|
| 98 |
-
def get_event(s):
|
| 99 |
-
return s[0] if s[0] in event_set else None
|
| 100 |
-
|
| 101 |
-
s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
|
| 102 |
-
for lang in lang_dict:
|
| 103 |
-
s = s.replace(lang, "<|lang|>")
|
| 104 |
-
s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
|
| 105 |
-
new_s = " " + s_list[0]
|
| 106 |
-
cur_ent_event = get_event(new_s)
|
| 107 |
-
for i in range(1, len(s_list)):
|
| 108 |
-
if len(s_list[i]) == 0:
|
| 109 |
-
continue
|
| 110 |
-
if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
|
| 111 |
-
s_list[i] = s_list[i][1:]
|
| 112 |
-
# else:
|
| 113 |
-
cur_ent_event = get_event(s_list[i])
|
| 114 |
-
if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
|
| 115 |
-
new_s = new_s[:-1]
|
| 116 |
-
new_s += s_list[i].strip().lstrip()
|
| 117 |
-
new_s = new_s.replace("The.", " ")
|
| 118 |
-
return new_s.strip()
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
def rich_print_asr_res(asr_res, will_print=True, remove_punc=False):
|
| 122 |
-
res = "".join([rich_transcription_postprocess(i) for i in asr_res])
|
| 123 |
-
|
| 124 |
-
if remove_punc:
|
| 125 |
-
res = res.replace(",", "")
|
| 126 |
-
res = res.replace("。", "")
|
| 127 |
-
|
| 128 |
-
if will_print:
|
| 129 |
-
print(res)
|
| 130 |
-
|
| 131 |
-
return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -2,10 +2,7 @@ huggingface_hub
|
|
| 2 |
numpy<2
|
| 3 |
kaldi-native-fbank
|
| 4 |
librosa==0.9.1
|
| 5 |
-
sentencepiece
|
| 6 |
fastapi
|
| 7 |
gradio
|
| 8 |
-
emoji
|
| 9 |
-
asr-decoder
|
| 10 |
online-fbank
|
| 11 |
-
|
|
|
|
| 2 |
numpy<2
|
| 3 |
kaldi-native-fbank
|
| 4 |
librosa==0.9.1
|
|
|
|
| 5 |
fastapi
|
| 6 |
gradio
|
|
|
|
|
|
|
| 7 |
online-fbank
|
| 8 |
+
asr_decoder
|
am.mvn → sensevoice_ax630c/am.mvn
RENAMED
|
File without changes
|
chn_jpn_yue_eng_ko_spectok.bpe.model → sensevoice_ax630c/chn_jpn_yue_eng_ko_spectok.bpe.model
RENAMED
|
File without changes
|
sensevoice_ax630c/sensevoice.axmodel
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdcac5038b7062719a19bed49f39e448e9d741ec389fb1c9b0c62d9efb5a1a8e
|
| 3 |
+
size 259948631
|
sensevoice_ax630c/streaming_sensevoice.axmodel
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:607af1407270dfdff95421286d29286aaab4d93885332d5a6f84810b1042fb2b
|
| 3 |
+
size 249359616
|
sensevoice_ax630c/tokens.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sensevoice_ax650/am.mvn
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<Nnet>
|
| 2 |
+
<Splice> 560 560
|
| 3 |
+
[ 0 ]
|
| 4 |
+
<AddShift> 560 560
|
| 5 |
+
<LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
|
| 6 |
+
<Rescale> 560 560
|
| 7 |
+
<LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
|
| 8 |
+
</Nnet>
|
embeddings.npy → sensevoice_ax650/chn_jpn_yue_eng_ko_spectok.bpe.model
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
|
| 3 |
+
size 377341
|
sensevoice_ax650/sensevoice.axmodel
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91b0dcf88b85af852c4ca16e3879b317272bc748c9815cd91007cf71a0c59714
|
| 3 |
+
size 263172727
|
sensevoice_ax650/streaming_sensevoice.axmodel
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c883c2988e9bc7a103ac332611523f20bd474c3b10cc35e3cc2e621d35097756
|
| 3 |
+
size 261538678
|
sensevoice_ax650/tokens.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
server.py
CHANGED
|
@@ -3,7 +3,9 @@ from fastapi import FastAPI, HTTPException, Body
|
|
| 3 |
from fastapi.responses import JSONResponse
|
| 4 |
from typing import List, Optional
|
| 5 |
import logging
|
|
|
|
| 6 |
from SenseVoiceAx import SenseVoiceAx
|
|
|
|
| 7 |
import os
|
| 8 |
import librosa
|
| 9 |
|
|
@@ -28,27 +30,35 @@ async def load_model():
|
|
| 28 |
try:
|
| 29 |
# 模型加载
|
| 30 |
language = "auto"
|
| 31 |
-
use_itn = True #
|
| 32 |
-
max_len =
|
| 33 |
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
assert os.path.exists(model_path), f"model {model_path} not exist"
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
|
| 42 |
asr_model = SenseVoiceAx(
|
| 43 |
model_path,
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
| 45 |
beam_size=3,
|
| 46 |
-
language="auto",
|
| 47 |
hot_words=None,
|
| 48 |
-
use_itn=use_itn,
|
| 49 |
streaming=False,
|
| 50 |
)
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
logger.info("ASR model loaded successfully")
|
| 53 |
except Exception as e:
|
| 54 |
logger.error(f"Failed to load ASR model: {str(e)}")
|
|
|
|
| 3 |
from fastapi.responses import JSONResponse
|
| 4 |
from typing import List, Optional
|
| 5 |
import logging
|
| 6 |
+
import json
|
| 7 |
from SenseVoiceAx import SenseVoiceAx
|
| 8 |
+
from download_utils import download_model
|
| 9 |
import os
|
| 10 |
import librosa
|
| 11 |
|
|
|
|
| 30 |
try:
|
| 31 |
# 模型加载
|
| 32 |
language = "auto"
|
| 33 |
+
use_itn = True # 标点符号预测
|
| 34 |
+
max_len = 68
|
| 35 |
|
| 36 |
+
model_root = download_model("SenseVoice")
|
| 37 |
+
model_root = os.path.join(model_root, "sensevoice_ax650")
|
| 38 |
+
max_seq_len = 256
|
| 39 |
+
model_path = os.path.join(model_root, "sensevoice.axmodel")
|
| 40 |
|
| 41 |
assert os.path.exists(model_path), f"model {model_path} not exist"
|
| 42 |
|
| 43 |
+
cmvn_file = os.path.join(model_root, "am.mvn")
|
| 44 |
+
bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
|
| 45 |
+
token_file = os.path.join(model_root, "tokens.txt")
|
| 46 |
|
| 47 |
asr_model = SenseVoiceAx(
|
| 48 |
model_path,
|
| 49 |
+
cmvn_file,
|
| 50 |
+
token_file,
|
| 51 |
+
bpe_model,
|
| 52 |
+
max_seq_len=max_seq_len,
|
| 53 |
beam_size=3,
|
|
|
|
| 54 |
hot_words=None,
|
|
|
|
| 55 |
streaming=False,
|
| 56 |
)
|
| 57 |
|
| 58 |
+
print(f"language: {language}")
|
| 59 |
+
print(f"use_itn: {use_itn}")
|
| 60 |
+
print(f"model_path: {model_path}")
|
| 61 |
+
|
| 62 |
logger.info("ASR model loaded successfully")
|
| 63 |
except Exception as e:
|
| 64 |
logger.error(f"Failed to load ASR model: {str(e)}")
|
test_wer.py
CHANGED
|
@@ -1,12 +1,9 @@
|
|
| 1 |
-
import os
|
| 2 |
import argparse
|
| 3 |
from SenseVoiceAx import SenseVoiceAx
|
| 4 |
-
from tokenizer import SentencepiecesTokenizer
|
| 5 |
-
from print_utils import rich_transcription_postprocess, rich_print_asr_res
|
| 6 |
from download_utils import download_model
|
| 7 |
import logging
|
| 8 |
import re
|
| 9 |
-
import emoji
|
| 10 |
|
| 11 |
|
| 12 |
def setup_logging():
|
|
@@ -229,7 +226,6 @@ def main():
|
|
| 229 |
args = get_args()
|
| 230 |
|
| 231 |
language = args.language
|
| 232 |
-
use_itn = False # 标点符号预测
|
| 233 |
max_num = args.max_num
|
| 234 |
|
| 235 |
dataset_type = args.dataset.lower()
|
|
@@ -240,21 +236,32 @@ def main():
|
|
| 240 |
else:
|
| 241 |
raise ValueError(f"Unknown dataset type {dataset_type}")
|
| 242 |
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
|
|
|
| 246 |
|
| 247 |
assert os.path.exists(model_path), f"model {model_path} not exist"
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
logger.info(f"dataset: {args.dataset}")
|
| 250 |
logger.info(f"language: {language}")
|
| 251 |
-
logger.info(f"use_itn: {use_itn}")
|
| 252 |
logger.info(f"model_path: {model_path}")
|
| 253 |
|
| 254 |
-
pipeline = SenseVoiceAx(
|
| 255 |
-
model_path, language=language
|
| 256 |
-
)
|
| 257 |
-
|
| 258 |
# Iterate over dataset
|
| 259 |
hyp = []
|
| 260 |
references = []
|
|
@@ -264,11 +271,8 @@ def main():
|
|
| 264 |
for n, (audio_path, reference) in enumerate(dataset):
|
| 265 |
reference = remove_punctuation(reference).lower()
|
| 266 |
|
| 267 |
-
asr_res =
|
| 268 |
-
hypothesis =
|
| 269 |
-
asr_res, will_print=False, remove_punc=True
|
| 270 |
-
).lower()
|
| 271 |
-
hypothesis = emoji.replace_emoji(hypothesis, replace="")
|
| 272 |
|
| 273 |
character_error_num = min_distance(reference, hypothesis)
|
| 274 |
character_num = len(reference)
|
|
|
|
| 1 |
+
import os
|
| 2 |
import argparse
|
| 3 |
from SenseVoiceAx import SenseVoiceAx
|
|
|
|
|
|
|
| 4 |
from download_utils import download_model
|
| 5 |
import logging
|
| 6 |
import re
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def setup_logging():
|
|
|
|
| 226 |
args = get_args()
|
| 227 |
|
| 228 |
language = args.language
|
|
|
|
| 229 |
max_num = args.max_num
|
| 230 |
|
| 231 |
dataset_type = args.dataset.lower()
|
|
|
|
| 236 |
else:
|
| 237 |
raise ValueError(f"Unknown dataset type {dataset_type}")
|
| 238 |
|
| 239 |
+
model_root = download_model("SenseVoice")
|
| 240 |
+
model_root = os.path.join(model_root, "sensevoice_ax650")
|
| 241 |
+
max_seq_len = 256
|
| 242 |
+
model_path = os.path.join(model_root, "sensevoice.axmodel")
|
| 243 |
|
| 244 |
assert os.path.exists(model_path), f"model {model_path} not exist"
|
| 245 |
|
| 246 |
+
cmvn_file = os.path.join(model_root, "am.mvn")
|
| 247 |
+
bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
|
| 248 |
+
token_file = os.path.join(model_root, "tokens.txt")
|
| 249 |
+
|
| 250 |
+
model = SenseVoiceAx(
|
| 251 |
+
model_path,
|
| 252 |
+
cmvn_file,
|
| 253 |
+
token_file,
|
| 254 |
+
bpe_model,
|
| 255 |
+
max_seq_len=max_seq_len,
|
| 256 |
+
beam_size=3,
|
| 257 |
+
hot_words=None,
|
| 258 |
+
streaming=False,
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
logger.info(f"dataset: {args.dataset}")
|
| 262 |
logger.info(f"language: {language}")
|
|
|
|
| 263 |
logger.info(f"model_path: {model_path}")
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
# Iterate over dataset
|
| 266 |
hyp = []
|
| 267 |
references = []
|
|
|
|
| 271 |
for n, (audio_path, reference) in enumerate(dataset):
|
| 272 |
reference = remove_punctuation(reference).lower()
|
| 273 |
|
| 274 |
+
asr_res = model.infer(audio_path, language, print_rtf=False)
|
| 275 |
+
hypothesis = remove_punctuation(asr_res).lower()
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
character_error_num = min_distance(reference, hypothesis)
|
| 278 |
character_num = len(reference)
|
tokenizer.py
DELETED
|
@@ -1,135 +0,0 @@
|
|
| 1 |
-
import sentencepiece as spm
|
| 2 |
-
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
|
| 5 |
-
|
| 6 |
-
import json
|
| 7 |
-
from abc import abstractmethod
|
| 8 |
-
from abc import ABC
|
| 9 |
-
import numpy as np
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class BaseTokenizer(ABC):
|
| 13 |
-
def __init__(
|
| 14 |
-
self,
|
| 15 |
-
token_list: Union[Path, str, Iterable[str]] = None,
|
| 16 |
-
unk_symbol: str = "<unk>",
|
| 17 |
-
**kwargs,
|
| 18 |
-
):
|
| 19 |
-
|
| 20 |
-
if token_list is not None:
|
| 21 |
-
if isinstance(token_list, (Path, str)) and token_list.endswith(".txt"):
|
| 22 |
-
token_list = Path(token_list)
|
| 23 |
-
self.token_list_repr = str(token_list)
|
| 24 |
-
self.token_list: List[str] = []
|
| 25 |
-
|
| 26 |
-
with token_list.open("r", encoding="utf-8") as f:
|
| 27 |
-
for idx, line in enumerate(f):
|
| 28 |
-
line = line.rstrip()
|
| 29 |
-
self.token_list.append(line)
|
| 30 |
-
elif isinstance(token_list, (Path, str)) and token_list.endswith(".json"):
|
| 31 |
-
token_list = Path(token_list)
|
| 32 |
-
self.token_list_repr = str(token_list)
|
| 33 |
-
self.token_list: List[str] = []
|
| 34 |
-
|
| 35 |
-
with open(token_list, "r", encoding="utf-8") as f:
|
| 36 |
-
self.token_list = json.load(f)
|
| 37 |
-
|
| 38 |
-
else:
|
| 39 |
-
self.token_list: List[str] = list(token_list)
|
| 40 |
-
self.token_list_repr = ""
|
| 41 |
-
for i, t in enumerate(self.token_list):
|
| 42 |
-
if i == 3:
|
| 43 |
-
break
|
| 44 |
-
self.token_list_repr += f"{t}, "
|
| 45 |
-
self.token_list_repr += f"... (NVocab={(len(self.token_list))})"
|
| 46 |
-
|
| 47 |
-
self.token2id: Dict[str, int] = {}
|
| 48 |
-
for i, t in enumerate(self.token_list):
|
| 49 |
-
if t in self.token2id:
|
| 50 |
-
raise RuntimeError(f'Symbol "{t}" is duplicated')
|
| 51 |
-
self.token2id[t] = i
|
| 52 |
-
|
| 53 |
-
self.unk_symbol = unk_symbol
|
| 54 |
-
if self.unk_symbol not in self.token2id:
|
| 55 |
-
raise RuntimeError(
|
| 56 |
-
f"Unknown symbol '{unk_symbol}' doesn't exist in the token_list"
|
| 57 |
-
)
|
| 58 |
-
self.unk_id = self.token2id[self.unk_symbol]
|
| 59 |
-
|
| 60 |
-
def encode(self, text, **kwargs):
|
| 61 |
-
tokens = self.text2tokens(text)
|
| 62 |
-
text_ints = self.tokens2ids(tokens)
|
| 63 |
-
|
| 64 |
-
return text_ints
|
| 65 |
-
|
| 66 |
-
def decode(self, text_ints):
|
| 67 |
-
token = self.ids2tokens(text_ints)
|
| 68 |
-
text = self.tokens2text(token)
|
| 69 |
-
return text
|
| 70 |
-
|
| 71 |
-
def get_num_vocabulary_size(self) -> int:
|
| 72 |
-
return len(self.token_list)
|
| 73 |
-
|
| 74 |
-
def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
|
| 75 |
-
if isinstance(integers, np.ndarray) and integers.ndim != 1:
|
| 76 |
-
raise ValueError(f"Must be 1 dim ndarray, but got {integers.ndim}")
|
| 77 |
-
return [self.token_list[i] for i in integers]
|
| 78 |
-
|
| 79 |
-
def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
|
| 80 |
-
return [self.token2id.get(i, self.unk_id) for i in tokens]
|
| 81 |
-
|
| 82 |
-
@abstractmethod
|
| 83 |
-
def text2tokens(self, line: str) -> List[str]:
|
| 84 |
-
raise NotImplementedError
|
| 85 |
-
|
| 86 |
-
@abstractmethod
|
| 87 |
-
def tokens2text(self, tokens: Iterable[str]) -> str:
|
| 88 |
-
raise NotImplementedError
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
class SentencepiecesTokenizer(BaseTokenizer):
|
| 92 |
-
def __init__(self, bpemodel: Union[Path, str], **kwargs):
|
| 93 |
-
super().__init__(**kwargs)
|
| 94 |
-
self.bpemodel = str(bpemodel)
|
| 95 |
-
# NOTE(kamo):
|
| 96 |
-
# Don't build SentencePieceProcessor in __init__()
|
| 97 |
-
# because it's not picklable and it may cause following error,
|
| 98 |
-
# "TypeError: can't pickle SwigPyObject objects",
|
| 99 |
-
# when giving it as argument of "multiprocessing.Process()".
|
| 100 |
-
self.sp = None
|
| 101 |
-
self._build_sentence_piece_processor()
|
| 102 |
-
|
| 103 |
-
def __repr__(self):
|
| 104 |
-
return f'{self.__class__.__name__}(model="{self.bpemodel}")'
|
| 105 |
-
|
| 106 |
-
def _build_sentence_piece_processor(self):
|
| 107 |
-
# Build SentencePieceProcessor lazily.
|
| 108 |
-
if self.sp is None:
|
| 109 |
-
self.sp = spm.SentencePieceProcessor()
|
| 110 |
-
self.sp.load(self.bpemodel)
|
| 111 |
-
|
| 112 |
-
def text2tokens(self, line: str) -> List[str]:
|
| 113 |
-
self._build_sentence_piece_processor()
|
| 114 |
-
return self.sp.EncodeAsPieces(line)
|
| 115 |
-
|
| 116 |
-
def tokens2text(self, tokens: Iterable[str]) -> str:
|
| 117 |
-
self._build_sentence_piece_processor()
|
| 118 |
-
return self.sp.DecodePieces(list(tokens))
|
| 119 |
-
|
| 120 |
-
def encode(self, line: str, **kwargs) -> List[int]:
|
| 121 |
-
self._build_sentence_piece_processor()
|
| 122 |
-
return self.sp.EncodeAsIds(line)
|
| 123 |
-
|
| 124 |
-
def decode(self, line: List[int], **kwargs):
|
| 125 |
-
self._build_sentence_piece_processor()
|
| 126 |
-
return self.sp.DecodeIds(line)
|
| 127 |
-
|
| 128 |
-
def get_vocab_size(self):
|
| 129 |
-
return self.sp.GetPieceSize()
|
| 130 |
-
|
| 131 |
-
def ids2tokens(self, *args, **kwargs):
|
| 132 |
-
return self.decode(*args, **kwargs)
|
| 133 |
-
|
| 134 |
-
def tokens2ids(self, *args, **kwargs):
|
| 135 |
-
return self.encode(*args, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|