inoryQwQ committed on
Commit
e4824c6
·
1 Parent(s): 22b36ed

Update models, simplify inputs

Browse files
README.md CHANGED
@@ -25,12 +25,39 @@ FunASR SenseVoice on Axera, official repo: https://github.com/FunAudioLLM/SenseV
25
  - [x] AX650N
26
  - [x] AX630C
27
 
 
28
  ## 环境安装
 
 
29
  ```
30
- pip3 install -r requirements.txt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ```
32
  如果空间不足可以使用 --prefix 指定别的安装路径
33
 
 
 
 
 
 
 
 
 
 
34
 
35
  ## 使用
36
  ```
@@ -48,20 +75,21 @@ python3 main.py -i 输入音频文件
48
  ### 示例:
49
  example下有测试音频
50
 
51
- 粤语测试
52
  ```
53
- python3 main.py -i example/yue.mp3
54
  ```
55
  输出
56
  ```
57
- RTF: 0.03026517820946964 Latency: 0.15689468383789062s Total length: 5.184s
58
- ['呢几个字。', '都表达唔到,我想讲嘅意。', '思。']
 
59
  ```
60
 
61
  流式识别
62
 
63
  ```
64
- python3 main.py -i example/zh.mp3 --streaming
65
  ```
66
  输出
67
  ```
@@ -81,13 +109,13 @@ RTF: 0.03678379235444246
81
 
82
  使用WER(Word-Error-Rate)作为评价标准
83
 
84
- **WER = 0.0389**
85
 
86
  ### 复现测试结果
87
 
88
  ```
89
  ./download_datasets.sh
90
- python test_wer.py -d datasets -l zh
91
  ```
92
 
93
  ## 技术讨论
 
25
  - [x] AX650N
26
  - [x] AX630C
27
 
28
+
29
  ## 环境安装
30
+
31
+ 推荐在板上安装Miniconda管理虚拟环境,安装方法如下:
32
  ```
33
+ mkdir -p ~/miniconda3
34
+ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh -O ~/miniconda3/miniconda.sh
35
+ bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
36
+ rm ~/miniconda3/miniconda.sh
37
+
38
+ source ~/miniconda3/bin/activate
39
+
40
+ conda init --all
41
+ ```
42
+
43
+ ```
44
+ sudo apt-get install libsndfile-dev
45
+
46
+ conda create -n sensevoice python=3.12
47
+ conda activate sensevoice
48
+ pip install -r requirements.txt
49
  ```
50
  如果空间不足可以使用 --prefix 指定别的安装路径
51
 
52
+ #### 安装pyaxengine
53
+
54
+ 参考 https://github.com/AXERA-TECH/pyaxengine 安装 NPU Python API
55
+
56
+ 在0.1.3rc2上测试通过,可通过
57
+ ```
58
+ pip install https://github.com/AXERA-TECH/pyaxengine/releases/download/0.1.3.rc2/axengine-0.1.3-py3-none-any.whl
59
+ ```
60
+ 安装,或把版本号更改为你想使用的版本
61
 
62
  ## 使用
63
  ```
 
75
  ### 示例:
76
  example下有测试音频
77
 
78
+ 中文测试
79
  ```
80
+ python main.py -i example/zh.mp3
81
  ```
82
  输出
83
  ```
84
+ RTF: 0.04386647134764582 Latency: 0.2463541030883789s Total length: 5.616s
85
+ ASR result: 开饭时间早上九点至下午五点
86
+
87
  ```
88
 
89
  流式识别
90
 
91
  ```
92
+ python main.py -i example/zh.mp3 --streaming
93
  ```
94
  输出
95
  ```
 
109
 
110
  使用WER(Word-Error-Rate)作为评价标准
111
 
112
+ **WER = 2.0%**
113
 
114
  ### 复现测试结果
115
 
116
  ```
117
  ./download_datasets.sh
118
+ python test_wer.py -d aishell -g datasets/ground_truth.txt --language zh
119
  ```
120
 
121
  ## 技术讨论
SenseVoiceAx.py CHANGED
@@ -2,44 +2,14 @@ import axengine as axe
2
  import numpy as np
3
  import librosa
4
  from frontend import WavFrontend
5
- import os
6
  import time
7
  from typing import List, Union, Optional
8
  from asr_decoder import CTCDecoder
9
- from tokenizer import SentencepiecesTokenizer
10
  from online_fbank import OnlineFbank
11
  import torch
12
 
13
 
14
- def sequence_mask(lengths, maxlen=None, dtype=np.float32):
15
- # 如果 maxlen 未指定,则取 lengths 中的最大值
16
- if maxlen is None:
17
- maxlen = np.max(lengths)
18
-
19
- # 创建一个从 0 到 maxlen-1 的行向量
20
- row_vector = np.arange(0, maxlen, 1)
21
-
22
- # 将 lengths 转换为列向量
23
- matrix = np.expand_dims(lengths, axis=-1)
24
-
25
- # 比较生成掩码
26
- mask = row_vector < matrix
27
- if mask.shape[-1] < lengths[0]:
28
- mask = np.concatenate(
29
- [
30
- mask,
31
- np.zeros(
32
- (mask.shape[0], lengths[0] - mask.shape[-1]), dtype=np.float32
33
- ),
34
- ],
35
- axis=-1,
36
- )
37
-
38
- # 返回指定数据类型的掩码
39
- return mask.astype(dtype)[None, ...]
40
-
41
-
42
- def unique_consecutive_np(arr):
43
  """
44
  找出数组中连续的唯一值,模拟 torch.unique_consecutive(yseq, dim=-1)
45
 
@@ -74,13 +44,14 @@ class SenseVoiceAx:
74
  def __init__(
75
  self,
76
  model_path: str,
77
- max_len: int = 256,
 
 
 
78
  beam_size: int = 3,
79
- language: str = "auto",
80
  hot_words: Optional[List[str]] = None,
81
- use_itn: bool = True,
82
  streaming: bool = False,
83
- providers=['AxEngineExecutionProvider']
84
  ):
85
  """
86
  Initialize SenseVoiceAx
@@ -99,23 +70,8 @@ class SenseVoiceAx:
99
  Use stream_infer method if streaming is true otherwise infer.
100
 
101
  """
102
- model_path_root = os.path.dirname(model_path)
103
- emb_path = os.path.join(model_path_root, "../embeddings.npy")
104
- cmvn_file = os.path.join(model_path_root, "../am.mvn")
105
- bpe_model = os.path.join(
106
- model_path_root, "../chn_jpn_yue_eng_ko_spectok.bpe.model"
107
- )
108
- if streaming:
109
- self.position_encoding = np.load(
110
- os.path.join(model_path_root, "../pe_streaming.npy")
111
- )
112
- else:
113
- self.position_encoding = np.load(
114
- os.path.join(model_path_root, "../pe_nonstream.npy")
115
- )
116
 
117
  self.streaming = streaming
118
- self.tokenizer = SentencepiecesTokenizer(bpemodel=bpe_model)
119
 
120
  self.frontend = WavFrontend(
121
  cmvn_file=cmvn_file,
@@ -127,12 +83,15 @@ class SenseVoiceAx:
127
  lfr_m=7,
128
  lfr_n=6,
129
  )
 
130
  self.model = axe.InferenceSession(model_path, providers=providers)
131
  self.sample_rate = 16000
132
  self.blank_id = 0
133
- self.max_len = max_len
134
  self.padding = 16
135
  self.input_size = 560
 
 
136
 
137
  self.lid_dict = {
138
  "auto": 0,
@@ -143,33 +102,13 @@ class SenseVoiceAx:
143
  "ko": 12,
144
  "nospeech": 13,
145
  }
146
- self.lid_int_dict = {
147
- 24884: 3,
148
- 24885: 4,
149
- 24888: 7,
150
- 24892: 11,
151
- 24896: 12,
152
- 24992: 13,
153
- }
154
- self.textnorm_dict = {"withitn": 14, "woitn": 15}
155
- self.textnorm_int_dict = {25016: 14, 25017: 15}
156
- self.emo_dict = {
157
- "unk": 25009,
158
- "happy": 25001,
159
- "sad": 25002,
160
- "angry": 25003,
161
- "neutral": 25004,
162
- }
163
-
164
- self.load_embeddings(emb_path, language, use_itn)
165
- self.language = language
166
 
167
  # decoder
168
  if beam_size > 1 and hot_words is not None:
169
  self.beam_size = beam_size
170
  symbol_table = {}
171
- for i in range(self.tokenizer.get_vocab_size()):
172
- symbol_table[self.tokenizer.decode(i)] = i
173
  self.decoder = CTCDecoder(hot_words, symbol_table, bpe_model)
174
  else:
175
  self.beam_size = 1
@@ -177,8 +116,8 @@ class SenseVoiceAx:
177
 
178
  if streaming:
179
  self.cur_idx = -1
180
- self.chunk_size = max_len - self.padding
181
- self.caches_shape = (max_len, self.input_size)
182
  self.caches = np.zeros(self.caches_shape, dtype=np.float32)
183
  self.zeros = np.zeros((1, self.input_size), dtype=np.float32)
184
  self.neg_mean, self.inv_stddev = (
@@ -187,38 +126,25 @@ class SenseVoiceAx:
187
  )
188
 
189
  self.fbank = OnlineFbank(window_type="hamming")
190
- self.masks = sequence_mask(
191
- np.array([self.max_len], dtype=np.int32),
192
- maxlen=self.max_len,
193
- dtype=np.float32,
194
  )
195
 
 
 
 
 
 
 
 
196
  @property
197
  def language_options(self):
198
  return list(self.lid_dict.keys())
199
 
200
- @property
201
- def textnorm_options(self):
202
- return list(self.textnorm_dict.keys())
203
-
204
- def load_embeddings(self, emb_path, language, use_itn):
205
- self.embeddings = np.load(emb_path, allow_pickle=True).item()
206
- self.language_query = self.embeddings[language]
207
- self.textnorm_query = (
208
- self.embeddings["withitn"] if use_itn else self.embeddings["woitn"]
209
- )
210
- self.event_emo_query = self.embeddings["event_emo"]
211
- self.input_query = np.concatenate(
212
- (self.textnorm_query, self.language_query, self.event_emo_query), axis=1
213
- )
214
- self.query_num = self.input_query.shape[1]
215
-
216
- def choose_language(self, language):
217
- self.language_query = self.embeddings[language]
218
- self.input_query = np.concatenate(
219
- (self.textnorm_query, self.language_query, self.event_emo_query), axis=1
220
- )
221
- self.language = language
222
 
223
  def load_data(self, filepath: str) -> np.ndarray:
224
  waveform, _ = librosa.load(filepath, sr=self.sample_rate)
@@ -254,7 +180,7 @@ class SenseVoiceAx:
254
  yseq = np.argmax(x, axis=-1)
255
 
256
  # 去除连续重复元素
257
- yseq = unique_consecutive_np(yseq)
258
 
259
  # 创建掩码并过滤 blank_id
260
  mask = yseq != self.blank_id
@@ -263,16 +189,16 @@ class SenseVoiceAx:
263
  return token_int
264
 
265
  def infer_waveform(self, waveform: np.ndarray, language="auto"):
266
- if language != self.language:
267
- self.choose_language(language)
268
-
269
  # start = time.time()
270
  feat, feat_len = self.preprocess(waveform)
271
  # print(f"Preprocess take {time.time() - start}s")
272
 
273
- slice_len = self.max_len - self.query_num
274
  slice_num = int(np.ceil(feat.shape[1] / slice_len))
275
 
 
 
 
276
  asr_res = []
277
  for i in range(slice_num):
278
  if i == 0:
@@ -283,46 +209,39 @@ class SenseVoiceAx:
283
  i * slice_len - self.padding : (i + 1) * slice_len - self.padding,
284
  :,
285
  ]
286
- # concat query
287
- sub_feat = np.concatenate([self.input_query, sub_feat], axis=1)
288
  real_len = sub_feat.shape[1]
289
- if real_len < self.max_len:
290
  sub_feat = np.concatenate(
291
  [
292
  sub_feat,
293
  np.zeros(
294
- (1, self.max_len - real_len, sub_feat.shape[-1]),
295
  dtype=np.float32,
296
  ),
297
  ],
298
  axis=1,
299
  )
300
 
301
- masks = sequence_mask(
302
- np.array([self.max_len], dtype=np.int32),
303
- maxlen=real_len,
304
- dtype=np.float32,
305
- )
306
 
307
  # start = time.time()
308
  outputs = self.model.run(
309
  None,
310
  {
311
  "speech": sub_feat,
312
- "masks": masks,
313
- "position_encoding": self.position_encoding,
314
  },
315
  )
316
  ctc_logits, encoder_out_lens = outputs
317
 
318
  token_int = self.postprocess(ctc_logits, encoder_out_lens)
319
 
320
- if self.tokenizer is not None:
321
- asr_res.append(self.tokenizer.tokens2text(token_int))
322
- else:
323
- asr_res.append(token_int)
324
 
325
- return asr_res
 
326
 
327
  def infer(
328
  self, filepath_or_data: Union[np.ndarray, str], language="auto", print_rtf=False
@@ -343,15 +262,15 @@ class SenseVoiceAx:
343
  if print_rtf:
344
  rtf = latency / total_time
345
  print(f"RTF: {rtf} Latency: {latency}s Total length: {total_time}s")
346
- return "".join(asr_res)
347
 
348
  def decode(self, times, tokens):
349
  times_ms = []
350
  for step, token in zip(times, tokens):
351
- if len(self.tokenizer.decode(token).strip()) == 0:
352
  continue
353
  times_ms.append(step * 60)
354
- return times_ms, self.tokenizer.decode(tokens)
355
 
356
  def reset(self):
357
  self.cur_idx = -1
@@ -368,8 +287,8 @@ class SenseVoiceAx:
368
  def stream_infer(self, audio, is_last, language="auto"):
369
  assert self.streaming, "This method is for streaming model"
370
 
371
- if language != self.language:
372
- self.choose_language(language)
373
 
374
  self.fbank.accept_waveform(audio, is_last)
375
  features = self.fbank.get_lfr_frames(
@@ -393,8 +312,8 @@ class SenseVoiceAx:
393
  None,
394
  {
395
  "speech": speech,
396
- "masks": self.masks,
397
- "position_encoding": self.position_encoding,
398
  },
399
  )
400
  ctc_logits, encoder_out_lens = outputs
 
2
  import numpy as np
3
  import librosa
4
  from frontend import WavFrontend
 
5
  import time
6
  from typing import List, Union, Optional
7
  from asr_decoder import CTCDecoder
 
8
  from online_fbank import OnlineFbank
9
  import torch
10
 
11
 
12
+ def unique_consecutive(arr):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  """
14
  找出数组中连续的唯一值,模拟 torch.unique_consecutive(yseq, dim=-1)
15
 
 
44
  def __init__(
45
  self,
46
  model_path: str,
47
+ cmvn_file: str,
48
+ token_file: str,
49
+ bpe_model: str = None,
50
+ max_seq_len: int = 256,
51
  beam_size: int = 3,
 
52
  hot_words: Optional[List[str]] = None,
 
53
  streaming: bool = False,
54
+ providers=["AxEngineExecutionProvider"],
55
  ):
56
  """
57
  Initialize SenseVoiceAx
 
70
  Use stream_infer method if streaming is true otherwise infer.
71
 
72
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  self.streaming = streaming
 
75
 
76
  self.frontend = WavFrontend(
77
  cmvn_file=cmvn_file,
 
83
  lfr_m=7,
84
  lfr_n=6,
85
  )
86
+
87
  self.model = axe.InferenceSession(model_path, providers=providers)
88
  self.sample_rate = 16000
89
  self.blank_id = 0
90
+ self.max_seq_len = max_seq_len
91
  self.padding = 16
92
  self.input_size = 560
93
+ self.query_num = 4
94
+ self.tokens = self.load_tokens(token_file)
95
 
96
  self.lid_dict = {
97
  "auto": 0,
 
102
  "ko": 12,
103
  "nospeech": 13,
104
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # decoder
107
  if beam_size > 1 and hot_words is not None:
108
  self.beam_size = beam_size
109
  symbol_table = {}
110
+ for i in range(len(self.tokens)):
111
+ symbol_table[self.tokens[i]] = i
112
  self.decoder = CTCDecoder(hot_words, symbol_table, bpe_model)
113
  else:
114
  self.beam_size = 1
 
116
 
117
  if streaming:
118
  self.cur_idx = -1
119
+ self.chunk_size = max_seq_len - self.padding
120
+ self.caches_shape = (max_seq_len, self.input_size)
121
  self.caches = np.zeros(self.caches_shape, dtype=np.float32)
122
  self.zeros = np.zeros((1, self.input_size), dtype=np.float32)
123
  self.neg_mean, self.inv_stddev = (
 
126
  )
127
 
128
  self.fbank = OnlineFbank(window_type="hamming")
129
+ self.stream_mask = self.sequence_mask(
130
+ max_seq_len + self.query_num, max_seq_len + self.query_num
 
 
131
  )
132
 
133
+ def load_tokens(self, token_file):
134
+ tokens = []
135
+ with open(token_file, "r") as f:
136
+ for line in f:
137
+ tokens.append(line[:-1])
138
+ return tokens
139
+
140
  @property
141
  def language_options(self):
142
  return list(self.lid_dict.keys())
143
 
144
+ def sequence_mask(self, max_seq_len, actual_seq_len):
145
+ mask = np.zeros((1, 1, max_seq_len), dtype=np.int32)
146
+ mask[:, :, :actual_seq_len] = 1
147
+ return mask
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  def load_data(self, filepath: str) -> np.ndarray:
150
  waveform, _ = librosa.load(filepath, sr=self.sample_rate)
 
180
  yseq = np.argmax(x, axis=-1)
181
 
182
  # 去除连续重复元素
183
+ yseq = unique_consecutive(yseq)
184
 
185
  # 创建掩码并过滤 blank_id
186
  mask = yseq != self.blank_id
 
189
  return token_int
190
 
191
  def infer_waveform(self, waveform: np.ndarray, language="auto"):
 
 
 
192
  # start = time.time()
193
  feat, feat_len = self.preprocess(waveform)
194
  # print(f"Preprocess take {time.time() - start}s")
195
 
196
+ slice_len = self.max_seq_len - self.query_num
197
  slice_num = int(np.ceil(feat.shape[1] / slice_len))
198
 
199
+ language_token = self.lid_dict[language]
200
+ language_token = np.array([language_token], dtype=np.int32)
201
+
202
  asr_res = []
203
  for i in range(slice_num):
204
  if i == 0:
 
209
  i * slice_len - self.padding : (i + 1) * slice_len - self.padding,
210
  :,
211
  ]
212
+
 
213
  real_len = sub_feat.shape[1]
214
+ if real_len < self.max_seq_len:
215
  sub_feat = np.concatenate(
216
  [
217
  sub_feat,
218
  np.zeros(
219
+ (1, self.max_seq_len - real_len, sub_feat.shape[-1]),
220
  dtype=np.float32,
221
  ),
222
  ],
223
  axis=1,
224
  )
225
 
226
+ mask = self.sequence_mask(self.max_seq_len + self.query_num, real_len)
 
 
 
 
227
 
228
  # start = time.time()
229
  outputs = self.model.run(
230
  None,
231
  {
232
  "speech": sub_feat,
233
+ "mask": mask,
234
+ "language": language_token,
235
  },
236
  )
237
  ctc_logits, encoder_out_lens = outputs
238
 
239
  token_int = self.postprocess(ctc_logits, encoder_out_lens)
240
 
241
+ asr_res.extend(token_int)
 
 
 
242
 
243
+ text = "".join([self.tokens[i] for i in asr_res])
244
+ return text
245
 
246
  def infer(
247
  self, filepath_or_data: Union[np.ndarray, str], language="auto", print_rtf=False
 
262
  if print_rtf:
263
  rtf = latency / total_time
264
  print(f"RTF: {rtf} Latency: {latency}s Total length: {total_time}s")
265
+ return asr_res
266
 
267
  def decode(self, times, tokens):
268
  times_ms = []
269
  for step, token in zip(times, tokens):
270
+ if len(self.tokens[token].strip()) == 0:
271
  continue
272
  times_ms.append(step * 60)
273
+ return times_ms, "".join([self.tokens[i] for i in tokens])
274
 
275
  def reset(self):
276
  self.cur_idx = -1
 
287
  def stream_infer(self, audio, is_last, language="auto"):
288
  assert self.streaming, "This method is for streaming model"
289
 
290
+ language_token = self.lid_dict[language]
291
+ language_token = np.array([language_token], dtype=np.int32)
292
 
293
  self.fbank.accept_waveform(audio, is_last)
294
  features = self.fbank.get_lfr_frames(
 
312
  None,
313
  {
314
  "speech": speech,
315
+ "mask": self.stream_mask,
316
+ "language": language_token,
317
  },
318
  )
319
  ctc_logits, encoder_out_lens = outputs
gradio_demo.py CHANGED
@@ -1,25 +1,31 @@
1
  import gradio as gr
2
  import os
3
  from SenseVoiceAx import SenseVoiceAx
4
- from print_utils import rich_transcription_postprocess
5
 
6
- max_len = 256
7
-
8
- model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
 
9
 
10
  assert os.path.exists(model_path), f"model {model_path} not exist"
11
 
12
- pipeline = SenseVoiceAx(
 
 
 
 
13
  model_path,
14
- max_len=max_len,
 
 
 
15
  beam_size=3,
16
- language="auto",
17
  hot_words=None,
18
- use_itn=True,
19
  streaming=False,
20
  )
21
 
22
-
23
  def speech_to_text(audio_path, lang):
24
  """
25
  audio_path: 音频文件路径
@@ -28,9 +34,7 @@ def speech_to_text(audio_path, lang):
28
  if not audio_path:
29
  return "无音频"
30
 
31
- pipeline.choose_language(language=lang)
32
- asr_res = pipeline.infer(audio_path, print_rtf=False)
33
-
34
  return asr_res
35
 
36
 
@@ -41,7 +45,7 @@ def main():
41
 
42
  with gr.Row():
43
  audio_input = gr.Audio(
44
- sources=["upload"], type="filepath", label="录制或上传音频", format="mp3"
45
  )
46
  lang_dropdown = gr.Dropdown(
47
  choices=["auto", "zh", "en", "yue", "ja", "ko"],
@@ -55,6 +59,10 @@ def main():
55
 
56
  demo.launch(
57
  server_name="0.0.0.0",
 
 
 
 
58
  )
59
 
60
 
 
1
  import gradio as gr
2
  import os
3
  from SenseVoiceAx import SenseVoiceAx
4
+ from download_utils import download_model
5
 
6
+ model_root = download_model("SenseVoice")
7
+ model_root = os.path.join(model_root, "sensevoice_ax650")
8
+ max_seq_len = 256
9
+ model_path = os.path.join(model_root, "sensevoice.axmodel")
10
 
11
  assert os.path.exists(model_path), f"model {model_path} not exist"
12
 
13
+ cmvn_file = os.path.join(model_root, "am.mvn")
14
+ bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
15
+ token_file = os.path.join(model_root, "tokens.txt")
16
+
17
+ model = SenseVoiceAx(
18
  model_path,
19
+ cmvn_file,
20
+ token_file,
21
+ bpe_model,
22
+ max_seq_len=max_seq_len,
23
  beam_size=3,
 
24
  hot_words=None,
 
25
  streaming=False,
26
  )
27
 
28
+ # 你实现的语音转文本函数
29
  def speech_to_text(audio_path, lang):
30
  """
31
  audio_path: 音频文件路径
 
34
  if not audio_path:
35
  return "无音频"
36
 
37
+ asr_res = model.infer(audio_path, lang, print_rtf=False)
 
 
38
  return asr_res
39
 
40
 
 
45
 
46
  with gr.Row():
47
  audio_input = gr.Audio(
48
+ sources=["microphone"], type="filepath", label="录制或上传音频", format="mp3"
49
  )
50
  lang_dropdown = gr.Dropdown(
51
  choices=["auto", "zh", "en", "yue", "ja", "ko"],
 
59
 
60
  demo.launch(
61
  server_name="0.0.0.0",
62
+ server_port=7860,
63
+ ssl_certfile="./cert.pem",
64
+ ssl_keyfile="./key.pem",
65
+ ssl_verify=False,
66
  )
67
 
68
 
main.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import argparse
3
  from SenseVoiceAx import SenseVoiceAx
4
  import librosa
5
- import numpy as np
6
  import time
7
 
8
 
@@ -25,37 +25,38 @@ def get_args():
25
 
26
  def main():
27
  args = get_args()
 
28
 
29
  input_audio = args.input
30
  language = args.language
31
- use_itn = True # 标点符号预测
 
32
  if not args.streaming:
33
- max_len = 256
34
- model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
35
  else:
36
- max_len = 26
37
- model_path = os.path.join("sensevoice_ax650", "streaming_sensevoice.axmodel")
38
 
39
  assert os.path.exists(model_path), f"model {model_path} not exist"
40
 
41
- print(f"input_audio: {input_audio}")
42
- print(f"language: {language}")
43
- print(f"use_itn: {use_itn}")
44
- print(f"model_path: {model_path}")
45
- print(f"streaming: {args.streaming}")
46
 
47
- pipeline = SenseVoiceAx(
48
  model_path,
49
- max_len=max_len,
 
 
 
50
  beam_size=3,
51
- language="auto",
52
  hot_words=None,
53
- use_itn=True,
54
  streaming=args.streaming,
55
  )
56
 
57
  if not args.streaming:
58
- asr_res = pipeline.infer(input_audio, print_rtf=True)
59
  print("ASR result: " + asr_res)
60
  else:
61
  samples, sr = librosa.load(input_audio, sr=16000)
@@ -66,7 +67,7 @@ def main():
66
  step = int(0.1 * sr)
67
  for i in range(0, len(samples), step):
68
  is_last = i + step >= len(samples)
69
- for res in pipeline.stream_infer(samples[i : i + step], is_last):
70
  print(res)
71
 
72
  end = time.time()
 
2
  import argparse
3
  from SenseVoiceAx import SenseVoiceAx
4
  import librosa
5
+ from download_utils import download_model
6
  import time
7
 
8
 
 
25
 
26
  def main():
27
  args = get_args()
28
+ print(vars(args))
29
 
30
  input_audio = args.input
31
  language = args.language
32
+ model_root = download_model("SenseVoice")
33
+ model_root = os.path.join(model_root, "sensevoice_ax650")
34
  if not args.streaming:
35
+ max_seq_len = 256
36
+ model_path = os.path.join(model_root, "sensevoice.axmodel")
37
  else:
38
+ max_seq_len = 26
39
+ model_path = os.path.join(model_root, "streaming_sensevoice.axmodel")
40
 
41
  assert os.path.exists(model_path), f"model {model_path} not exist"
42
 
43
+ cmvn_file = os.path.join(model_root, "am.mvn")
44
+ bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
45
+ token_file = os.path.join(model_root, "tokens.txt")
 
 
46
 
47
+ model = SenseVoiceAx(
48
  model_path,
49
+ cmvn_file,
50
+ token_file,
51
+ bpe_model,
52
+ max_seq_len=max_seq_len,
53
  beam_size=3,
 
54
  hot_words=None,
 
55
  streaming=args.streaming,
56
  )
57
 
58
  if not args.streaming:
59
+ asr_res = model.infer(input_audio, language, print_rtf=True)
60
  print("ASR result: " + asr_res)
61
  else:
62
  samples, sr = librosa.load(input_audio, sr=16000)
 
67
  step = int(0.1 * sr)
68
  for i in range(0, len(samples), step):
69
  is_last = i + step >= len(samples)
70
+ for res in model.stream_infer(samples[i : i + step], is_last, language):
71
  print(res)
72
 
73
  end = time.time()
pe_nonstream.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f1c9c550bd62fa164a959517f52d46a28591812fafdf002df0df2bd998f44b5
3
- size 573568
 
 
 
 
pe_streaming.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:54fec2fe2670168d36678c5857e65c459c634e6b6d6df928b7d415399ce2c291
3
- size 58368
 
 
 
 
print_utils.py DELETED
@@ -1,131 +0,0 @@
1
- emo_dict = {
2
- "<|HAPPY|>": "😊",
3
- "<|SAD|>": "😔",
4
- "<|ANGRY|>": "😡",
5
- "<|NEUTRAL|>": "",
6
- "<|FEARFUL|>": "😰",
7
- "<|DISGUSTED|>": "🤢",
8
- "<|SURPRISED|>": "😮",
9
- }
10
-
11
- event_dict = {
12
- "<|BGM|>": "🎼",
13
- "<|Speech|>": "",
14
- "<|Applause|>": "👏",
15
- "<|Laughter|>": "😀",
16
- "<|Cry|>": "😭",
17
- "<|Sneeze|>": "🤧",
18
- "<|Breath|>": "",
19
- "<|Cough|>": "🤧",
20
- }
21
-
22
- lang_dict = {
23
- "<|zh|>": "<|lang|>",
24
- "<|en|>": "<|lang|>",
25
- "<|yue|>": "<|lang|>",
26
- "<|ja|>": "<|lang|>",
27
- "<|ko|>": "<|lang|>",
28
- "<|nospeech|>": "<|lang|>",
29
- }
30
-
31
- emoji_dict = {
32
- "<|nospeech|><|Event_UNK|>": "❓",
33
- "<|zh|>": "",
34
- "<|en|>": "",
35
- "<|yue|>": "",
36
- "<|ja|>": "",
37
- "<|ko|>": "",
38
- "<|nospeech|>": "",
39
- "<|HAPPY|>": "😊",
40
- "<|SAD|>": "😔",
41
- "<|ANGRY|>": "😡",
42
- "<|NEUTRAL|>": "",
43
- "<|BGM|>": "🎼",
44
- "<|Speech|>": "",
45
- "<|Applause|>": "👏",
46
- "<|Laughter|>": "😀",
47
- "<|FEARFUL|>": "😰",
48
- "<|DISGUSTED|>": "🤢",
49
- "<|SURPRISED|>": "😮",
50
- "<|Cry|>": "😭",
51
- "<|EMO_UNKNOWN|>": "",
52
- "<|Sneeze|>": "🤧",
53
- "<|Breath|>": "",
54
- "<|Cough|>": "😷",
55
- "<|Sing|>": "",
56
- "<|Speech_Noise|>": "",
57
- "<|withitn|>": "",
58
- "<|woitn|>": "",
59
- "<|GBG|>": "",
60
- "<|Event_UNK|>": "",
61
- }
62
-
63
- emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
64
- event_set = {
65
- "🎼",
66
- "👏",
67
- "😀",
68
- "😭",
69
- "🤧",
70
- "😷",
71
- }
72
-
73
-
74
- def format_str_v2(s):
75
- sptk_dict = {}
76
- for sptk in emoji_dict:
77
- sptk_dict[sptk] = s.count(sptk)
78
- s = s.replace(sptk, "")
79
- emo = "<|NEUTRAL|>"
80
- for e in emo_dict:
81
- if sptk_dict[e] > sptk_dict[emo]:
82
- emo = e
83
- for e in event_dict:
84
- if sptk_dict[e] > 0:
85
- s = event_dict[e] + s
86
- s = s + emo_dict[emo]
87
-
88
- for emoji in emo_set.union(event_set):
89
- s = s.replace(" " + emoji, emoji)
90
- s = s.replace(emoji + " ", emoji)
91
- return s.strip()
92
-
93
-
94
- def rich_transcription_postprocess(s):
95
- def get_emo(s):
96
- return s[-1] if s[-1] in emo_set else None
97
-
98
- def get_event(s):
99
- return s[0] if s[0] in event_set else None
100
-
101
- s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
102
- for lang in lang_dict:
103
- s = s.replace(lang, "<|lang|>")
104
- s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
105
- new_s = " " + s_list[0]
106
- cur_ent_event = get_event(new_s)
107
- for i in range(1, len(s_list)):
108
- if len(s_list[i]) == 0:
109
- continue
110
- if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
111
- s_list[i] = s_list[i][1:]
112
- # else:
113
- cur_ent_event = get_event(s_list[i])
114
- if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
115
- new_s = new_s[:-1]
116
- new_s += s_list[i].strip().lstrip()
117
- new_s = new_s.replace("The.", " ")
118
- return new_s.strip()
119
-
120
-
121
- def rich_print_asr_res(asr_res, will_print=True, remove_punc=False):
122
- res = "".join([rich_transcription_postprocess(i) for i in asr_res])
123
-
124
- if remove_punc:
125
- res = res.replace(",", "")
126
- res = res.replace("。", "")
127
-
128
- if will_print:
129
- print(res)
130
-
131
- return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -2,10 +2,7 @@ huggingface_hub
2
  numpy<2
3
  kaldi-native-fbank
4
  librosa==0.9.1
5
- sentencepiece
6
  fastapi
7
  gradio
8
- emoji
9
- asr-decoder
10
  online-fbank
11
- torch
 
2
  numpy<2
3
  kaldi-native-fbank
4
  librosa==0.9.1
 
5
  fastapi
6
  gradio
 
 
7
  online-fbank
8
+ asr_decoder
am.mvn → sensevoice_ax630c/am.mvn RENAMED
File without changes
chn_jpn_yue_eng_ko_spectok.bpe.model → sensevoice_ax630c/chn_jpn_yue_eng_ko_spectok.bpe.model RENAMED
File without changes
sensevoice_ax630c/sensevoice.axmodel CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67d290cf7cebf45db5f37b2e93b8bdfff44dc35110bb29d84204a5f9eae9fd4d
3
- size 256550253
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdcac5038b7062719a19bed49f39e448e9d741ec389fb1c9b0c62d9efb5a1a8e
3
+ size 259948631
sensevoice_ax630c/streaming_sensevoice.axmodel CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba1ddd60841297903bfdae059ad88092d0fd1c543e1d80d7f64199d4e27b8263
3
- size 249023211
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:607af1407270dfdff95421286d29286aaab4d93885332d5a6f84810b1042fb2b
3
+ size 249359616
sensevoice_ax630c/tokens.txt ADDED
The diff for this file is too large to render. See raw diff
 
sensevoice_ax650/am.mvn ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <Nnet>
2
+ <Splice> 560 560
3
+ [ 0 ]
4
+ <AddShift> 560 560
5
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
6
+ <Rescale> 560 560
7
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
8
+ </Nnet>
embeddings.npy → sensevoice_ax650/chn_jpn_yue_eng_ko_spectok.bpe.model RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a453244ab037744531b97bcb8574c8442301dac11f6406fdab208dddb83b93e
3
- size 25523
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
3
+ size 377341
sensevoice_ax650/sensevoice.axmodel CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fad2f710930c23c91ea62d6951c0c6161194e3cf356fc31611798419c6638dd9
3
- size 262381979
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91b0dcf88b85af852c4ca16e3879b317272bc748c9815cd91007cf71a0c59714
3
+ size 263172727
sensevoice_ax650/streaming_sensevoice.axmodel CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a6902048563d4f3b0442e380187f1b9d840bdcec476fceb158fe45d0cc12067
3
- size 261450261
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c883c2988e9bc7a103ac332611523f20bd474c3b10cc35e3cc2e621d35097756
3
+ size 261538678
sensevoice_ax650/tokens.txt ADDED
The diff for this file is too large to render. See raw diff
 
server.py CHANGED
@@ -3,7 +3,9 @@ from fastapi import FastAPI, HTTPException, Body
3
  from fastapi.responses import JSONResponse
4
  from typing import List, Optional
5
  import logging
 
6
  from SenseVoiceAx import SenseVoiceAx
 
7
  import os
8
  import librosa
9
 
@@ -28,27 +30,35 @@ async def load_model():
28
  try:
29
  # 模型加载
30
  language = "auto"
31
- use_itn = True # 逆文本规范
32
- max_len = 256
33
 
34
- model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
 
 
 
35
 
36
  assert os.path.exists(model_path), f"model {model_path} not exist"
37
 
38
- print(f"language: {language}")
39
- print(f"use_itn: {use_itn}")
40
- print(f"model_path: {model_path}")
41
 
42
  asr_model = SenseVoiceAx(
43
  model_path,
44
- max_len=max_len,
 
 
 
45
  beam_size=3,
46
- language="auto",
47
  hot_words=None,
48
- use_itn=use_itn,
49
  streaming=False,
50
  )
51
 
 
 
 
 
52
  logger.info("ASR model loaded successfully")
53
  except Exception as e:
54
  logger.error(f"Failed to load ASR model: {str(e)}")
 
3
  from fastapi.responses import JSONResponse
4
  from typing import List, Optional
5
  import logging
6
+ import json
7
  from SenseVoiceAx import SenseVoiceAx
8
+ from download_utils import download_model
9
  import os
10
  import librosa
11
 
 
30
  try:
31
  # 模型加载
32
  language = "auto"
33
+ use_itn = True # 标点符号预测
34
+ max_len = 68
35
 
36
+ model_root = download_model("SenseVoice")
37
+ model_root = os.path.join(model_root, "sensevoice_ax650")
38
+ max_seq_len = 256
39
+ model_path = os.path.join(model_root, "sensevoice.axmodel")
40
 
41
  assert os.path.exists(model_path), f"model {model_path} not exist"
42
 
43
+ cmvn_file = os.path.join(model_root, "am.mvn")
44
+ bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
45
+ token_file = os.path.join(model_root, "tokens.txt")
46
 
47
  asr_model = SenseVoiceAx(
48
  model_path,
49
+ cmvn_file,
50
+ token_file,
51
+ bpe_model,
52
+ max_seq_len=max_seq_len,
53
  beam_size=3,
 
54
  hot_words=None,
 
55
  streaming=False,
56
  )
57
 
58
+ print(f"language: {language}")
59
+ print(f"use_itn: {use_itn}")
60
+ print(f"model_path: {model_path}")
61
+
62
  logger.info("ASR model loaded successfully")
63
  except Exception as e:
64
  logger.error(f"Failed to load ASR model: {str(e)}")
test_wer.py CHANGED
@@ -1,12 +1,9 @@
1
- import os, sys
2
  import argparse
3
  from SenseVoiceAx import SenseVoiceAx
4
- from tokenizer import SentencepiecesTokenizer
5
- from print_utils import rich_transcription_postprocess, rich_print_asr_res
6
  from download_utils import download_model
7
  import logging
8
  import re
9
- import emoji
10
 
11
 
12
  def setup_logging():
@@ -229,7 +226,6 @@ def main():
229
  args = get_args()
230
 
231
  language = args.language
232
- use_itn = False # 标点符号预测
233
  max_num = args.max_num
234
 
235
  dataset_type = args.dataset.lower()
@@ -240,21 +236,32 @@ def main():
240
  else:
241
  raise ValueError(f"Unknown dataset type {dataset_type}")
242
 
243
- # model_path_root = download_model("SenseVoice")
244
- model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
245
- bpemodel = "chn_jpn_yue_eng_ko_spectok.bpe.model"
 
246
 
247
  assert os.path.exists(model_path), f"model {model_path} not exist"
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  logger.info(f"dataset: {args.dataset}")
250
  logger.info(f"language: {language}")
251
- logger.info(f"use_itn: {use_itn}")
252
  logger.info(f"model_path: {model_path}")
253
 
254
- pipeline = SenseVoiceAx(
255
- model_path, language=language
256
- )
257
-
258
  # Iterate over dataset
259
  hyp = []
260
  references = []
@@ -264,11 +271,8 @@ def main():
264
  for n, (audio_path, reference) in enumerate(dataset):
265
  reference = remove_punctuation(reference).lower()
266
 
267
- asr_res = pipeline.infer(audio_path, print_rtf=False)
268
- hypothesis = rich_print_asr_res(
269
- asr_res, will_print=False, remove_punc=True
270
- ).lower()
271
- hypothesis = emoji.replace_emoji(hypothesis, replace="")
272
 
273
  character_error_num = min_distance(reference, hypothesis)
274
  character_num = len(reference)
 
1
+ import os
2
  import argparse
3
  from SenseVoiceAx import SenseVoiceAx
 
 
4
  from download_utils import download_model
5
  import logging
6
  import re
 
7
 
8
 
9
  def setup_logging():
 
226
  args = get_args()
227
 
228
  language = args.language
 
229
  max_num = args.max_num
230
 
231
  dataset_type = args.dataset.lower()
 
236
  else:
237
  raise ValueError(f"Unknown dataset type {dataset_type}")
238
 
239
+ model_root = download_model("SenseVoice")
240
+ model_root = os.path.join(model_root, "sensevoice_ax650")
241
+ max_seq_len = 256
242
+ model_path = os.path.join(model_root, "sensevoice.axmodel")
243
 
244
  assert os.path.exists(model_path), f"model {model_path} not exist"
245
 
246
+ cmvn_file = os.path.join(model_root, "am.mvn")
247
+ bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
248
+ token_file = os.path.join(model_root, "tokens.txt")
249
+
250
+ model = SenseVoiceAx(
251
+ model_path,
252
+ cmvn_file,
253
+ token_file,
254
+ bpe_model,
255
+ max_seq_len=max_seq_len,
256
+ beam_size=3,
257
+ hot_words=None,
258
+ streaming=False,
259
+ )
260
+
261
  logger.info(f"dataset: {args.dataset}")
262
  logger.info(f"language: {language}")
 
263
  logger.info(f"model_path: {model_path}")
264
 
 
 
 
 
265
  # Iterate over dataset
266
  hyp = []
267
  references = []
 
271
  for n, (audio_path, reference) in enumerate(dataset):
272
  reference = remove_punctuation(reference).lower()
273
 
274
+ asr_res = model.infer(audio_path, language, print_rtf=False)
275
+ hypothesis = remove_punctuation(asr_res).lower()
 
 
 
276
 
277
  character_error_num = min_distance(reference, hypothesis)
278
  character_num = len(reference)
tokenizer.py DELETED
@@ -1,135 +0,0 @@
1
- import sentencepiece as spm
2
-
3
- from pathlib import Path
4
- from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
5
-
6
- import json
7
- from abc import abstractmethod
8
- from abc import ABC
9
- import numpy as np
10
-
11
-
12
- class BaseTokenizer(ABC):
13
- def __init__(
14
- self,
15
- token_list: Union[Path, str, Iterable[str]] = None,
16
- unk_symbol: str = "<unk>",
17
- **kwargs,
18
- ):
19
-
20
- if token_list is not None:
21
- if isinstance(token_list, (Path, str)) and token_list.endswith(".txt"):
22
- token_list = Path(token_list)
23
- self.token_list_repr = str(token_list)
24
- self.token_list: List[str] = []
25
-
26
- with token_list.open("r", encoding="utf-8") as f:
27
- for idx, line in enumerate(f):
28
- line = line.rstrip()
29
- self.token_list.append(line)
30
- elif isinstance(token_list, (Path, str)) and token_list.endswith(".json"):
31
- token_list = Path(token_list)
32
- self.token_list_repr = str(token_list)
33
- self.token_list: List[str] = []
34
-
35
- with open(token_list, "r", encoding="utf-8") as f:
36
- self.token_list = json.load(f)
37
-
38
- else:
39
- self.token_list: List[str] = list(token_list)
40
- self.token_list_repr = ""
41
- for i, t in enumerate(self.token_list):
42
- if i == 3:
43
- break
44
- self.token_list_repr += f"{t}, "
45
- self.token_list_repr += f"... (NVocab={(len(self.token_list))})"
46
-
47
- self.token2id: Dict[str, int] = {}
48
- for i, t in enumerate(self.token_list):
49
- if t in self.token2id:
50
- raise RuntimeError(f'Symbol "{t}" is duplicated')
51
- self.token2id[t] = i
52
-
53
- self.unk_symbol = unk_symbol
54
- if self.unk_symbol not in self.token2id:
55
- raise RuntimeError(
56
- f"Unknown symbol '{unk_symbol}' doesn't exist in the token_list"
57
- )
58
- self.unk_id = self.token2id[self.unk_symbol]
59
-
60
- def encode(self, text, **kwargs):
61
- tokens = self.text2tokens(text)
62
- text_ints = self.tokens2ids(tokens)
63
-
64
- return text_ints
65
-
66
- def decode(self, text_ints):
67
- token = self.ids2tokens(text_ints)
68
- text = self.tokens2text(token)
69
- return text
70
-
71
- def get_num_vocabulary_size(self) -> int:
72
- return len(self.token_list)
73
-
74
- def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
75
- if isinstance(integers, np.ndarray) and integers.ndim != 1:
76
- raise ValueError(f"Must be 1 dim ndarray, but got {integers.ndim}")
77
- return [self.token_list[i] for i in integers]
78
-
79
- def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
80
- return [self.token2id.get(i, self.unk_id) for i in tokens]
81
-
82
- @abstractmethod
83
- def text2tokens(self, line: str) -> List[str]:
84
- raise NotImplementedError
85
-
86
- @abstractmethod
87
- def tokens2text(self, tokens: Iterable[str]) -> str:
88
- raise NotImplementedError
89
-
90
-
91
- class SentencepiecesTokenizer(BaseTokenizer):
92
- def __init__(self, bpemodel: Union[Path, str], **kwargs):
93
- super().__init__(**kwargs)
94
- self.bpemodel = str(bpemodel)
95
- # NOTE(kamo):
96
- # Don't build SentencePieceProcessor in __init__()
97
- # because it's not picklable and it may cause following error,
98
- # "TypeError: can't pickle SwigPyObject objects",
99
- # when giving it as argument of "multiprocessing.Process()".
100
- self.sp = None
101
- self._build_sentence_piece_processor()
102
-
103
- def __repr__(self):
104
- return f'{self.__class__.__name__}(model="{self.bpemodel}")'
105
-
106
- def _build_sentence_piece_processor(self):
107
- # Build SentencePieceProcessor lazily.
108
- if self.sp is None:
109
- self.sp = spm.SentencePieceProcessor()
110
- self.sp.load(self.bpemodel)
111
-
112
- def text2tokens(self, line: str) -> List[str]:
113
- self._build_sentence_piece_processor()
114
- return self.sp.EncodeAsPieces(line)
115
-
116
- def tokens2text(self, tokens: Iterable[str]) -> str:
117
- self._build_sentence_piece_processor()
118
- return self.sp.DecodePieces(list(tokens))
119
-
120
- def encode(self, line: str, **kwargs) -> List[int]:
121
- self._build_sentence_piece_processor()
122
- return self.sp.EncodeAsIds(line)
123
-
124
- def decode(self, line: List[int], **kwargs):
125
- self._build_sentence_piece_processor()
126
- return self.sp.DecodeIds(line)
127
-
128
- def get_vocab_size(self):
129
- return self.sp.GetPieceSize()
130
-
131
- def ids2tokens(self, *args, **kwargs):
132
- return self.decode(*args, **kwargs)
133
-
134
- def tokens2ids(self, *args, **kwargs):
135
- return self.encode(*args, **kwargs)