Update app.py
app.py
CHANGED
@@ -14,13 +14,13 @@ logger.info("onnx_asr version: %s", version("onnx_asr"))
 
 vad = onnx_asr.load_vad("silero")
 
-models_multilang = {name: onnx_asr.load_model(name) for name in ["whisper-base"
+models_multilang = {name: onnx_asr.load_model(name) for name in ["whisper-base"]} | {
+    name: onnx_asr.load_model(name, quantization="int8") for name in ["nemo-parakeet-tdt-0.6b-v3", "nemo-canary-1b-v2"]
+}
 
 models_ru = {
     name: onnx_asr.load_model(name)
     for name in [
-        "gigaam-v2-ctc",
-        "gigaam-v2-rnnt",
         "gigaam-v3-ctc",
         "gigaam-v3-rnnt",
         "gigaam-v3-e2e-ctc",
@@ -29,11 +29,12 @@ models_ru = {
         "nemo-fastconformer-ru-rnnt",
         "alphacep/vosk-model-ru",
         "alphacep/vosk-model-small-ru",
+        "t-tech/t-one",
     ]
 }
 
 models_en = {
-    name: onnx_asr.load_model(name)
+    name: onnx_asr.load_model(name, quantization="int8")
     for name in [
         "nemo-parakeet-tdt-0.6b-v2",
     ]
@@ -42,10 +43,14 @@ models_en = {
 models_vad = models_multilang | models_ru | models_en
 
 
-def recognize(audio: tuple[int, np.ndarray], models, language):
+def recognize(audio: tuple[int, np.ndarray], models, language: str):
     if audio is None:
         return None
 
+    valid_res = gr.validators.is_audio_correct_length(audio, min_length=1, max_length=10)
+    if not valid_res["is_valid"]:
+        raise gr.Error(valid_res["message"])
+
     sample_rate, waveform = audio
     length = waveform.shape[0] / sample_rate
     logger.debug("recognize: length %.3f, sample_rate %s, waveform.shape %s.", length, sample_rate, waveform.shape)
@@ -59,6 +64,7 @@ def recognize(audio: tuple[int, np.ndarray], models, language):
         if length > 20 and name == "alphacep/vosk-model-small-ru":
             gr.Warning(f"Model {name} only supports audio no longer than 20 s.")
             continue
+
         start = timer()
         result = model.recognize(waveform, sample_rate=sample_rate, language=language)
         time = timer() - start
@@ -83,6 +89,10 @@ def recognize_with_vad(audio: tuple[int, np.ndarray], name: str):
     if audio is None:
         return None
 
+    valid_res = gr.validators.is_audio_correct_length(audio, min_length=1, max_length=300)
+    if not valid_res["is_valid"]:
+        raise gr.Error(valid_res["message"])
+
     sample_rate, waveform = audio
     length = waveform.shape[0] / sample_rate
     logger.debug("recognize: length %.3f, sample_rate %s, waveform.shape %s.", length, sample_rate, waveform.shape)
@@ -103,9 +113,8 @@ def recognize_with_vad(audio: tuple[int, np.ndarray], name: str):
 
 
 with gr.Blocks() as recognize_short:
-    audio = gr.Audio(
+    audio = gr.Audio()
     with gr.Row():
-        gr.ClearButton(audio)
         btn_ru = gr.Button("Recognize (ru)", variant="primary")
         btn_en = gr.Button("Recognize (en)", variant="primary")
     output = gr.Dataframe(headers=["model", "result"], wrap=True)
@@ -116,9 +125,8 @@ with gr.Blocks() as recognize_short:
 with gr.Blocks() as recognize_long:
     gr.Markdown("The default VAD parameters are used. For best results, you should adjust the VAD parameters in your app.")
     name = gr.Dropdown(models_vad.keys(), label="Model")
-    audio = gr.Audio(
+    audio = gr.Audio()
     with gr.Row():
-        gr.ClearButton(audio)
         btn = gr.Button("Recognize", variant="primary")
     output = gr.TextArea(label="result")
     btn.click(fn=recognize_with_vad, inputs=[audio, name], outputs=output)
@@ -130,7 +138,7 @@ with gr.Blocks() as recognize_long:
             label = f"Model {name} support only English language"
         else:
             label = None
-        return gr.Audio(
+        return gr.Audio(label=label)
 
     name.change(on_model_change, inputs=name, outputs=audio)
 
@@ -138,9 +146,10 @@ with gr.Blocks(title="onnx-asr demo") as demo:
     gr.Markdown("""
    # ASR demo using onnx-asr
    **[onnx-asr](https://github.com/istupakov/onnx-asr)** is a Python package for Automatic Speech Recognition using ONNX models.
-
-
-    (
+    It's written in pure Python with minimal dependencies (no PyTorch, Transformers, or FFmpeg required).
+
+    Supports **Parakeet v2 (En) / v3 (Multilingual)**, **Canary v2 (Multilingual)** and **GigaAM v2/v3 (Ru)** models
+    (and many other modern [models](https://github.com/istupakov/onnx-asr?tab=readme-ov-file#supported-model-names)).
    You can also use it with your own model if it has a supported architecture.
    """)
    gr.TabbedInterface(
@@ -153,21 +162,22 @@ with gr.Blocks(title="onnx-asr demo") as demo:
    with gr.Accordion("Models used in this demo:", open=False):
        gr.Markdown("""
        ## Russian ASR models
-        * `gigaam-v2-ctc` - Sber GigaAM v2 CTC ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
-        * `gigaam-v2-rnnt` - Sber GigaAM v2 RNN-T ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
        * `gigaam-v3-ctc` - Sber GigaAM v3 CTC ([origin](https://huggingface.co/ai-sage/GigaAM-v3), [onnx](https://huggingface.co/istupakov/gigaam-v3-onnx))
        * `gigaam-v3-rnnt` - Sber GigaAM v3 RNN-T ([origin](https://huggingface.co/ai-sage/GigaAM-v3), [onnx](https://huggingface.co/istupakov/gigaam-v3-onnx))
        * `gigaam-v3-e2e-ctc` - Sber GigaAM v3 E2E CTC ([origin](https://huggingface.co/ai-sage/GigaAM-v3), [onnx](https://huggingface.co/istupakov/gigaam-v3-onnx))
        * `gigaam-v3-e2e-rnnt` - Sber GigaAM v3 E2E RNN-T ([origin](https://huggingface.co/ai-sage/GigaAM-v3), [onnx](https://huggingface.co/istupakov/gigaam-v3-onnx))
        * `nemo-fastconformer-ru-ctc` - Nvidia FastConformer-Hybrid Large (ru) with CTC decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
        * `nemo-fastconformer-ru-rnnt` - Nvidia FastConformer-Hybrid Large (ru) with RNN-T decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
-        * `nemo-parakeet-tdt-0.6b-v3` - Nvidia Parakeet TDT 0.6B
+        * `nemo-parakeet-tdt-0.6b-v3` - Nvidia Parakeet TDT 0.6B v3 (multilingual) ([origin](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3), [onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v3-onnx))
+        * `nemo-canary-1b-v2` - Nvidia Canary 1B v2 (multilingual) ([origin](https://huggingface.co/nvidia/canary-1b-v2), [onnx](https://huggingface.co/istupakov/canary-1b-v2-onnx))
        * `whisper-base` - OpenAI Whisper Base exported with onnxruntime ([origin](https://huggingface.co/openai/whisper-base), [onnx](https://huggingface.co/istupakov/whisper-base-onnx))
        * `alphacep/vosk-model-ru` - Alpha Cephei Vosk 0.54-ru ([origin](https://huggingface.co/alphacep/vosk-model-ru))
        * `alphacep/vosk-model-small-ru` - Alpha Cephei Vosk 0.52-small-ru ([origin](https://huggingface.co/alphacep/vosk-model-small-ru))
+        * `t-tech/t-one` - T-Tech T-one ([origin](https://huggingface.co/t-tech/T-one))
        ## English ASR models
-        * `nemo-parakeet-tdt-0.6b-v2` - Nvidia Parakeet TDT 0.6B
-        * `nemo-parakeet-tdt-0.6b-v3` - Nvidia Parakeet TDT 0.6B
+        * `nemo-parakeet-tdt-0.6b-v2` - Nvidia Parakeet TDT 0.6B v2 (en) ([origin](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2), [onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v2-onnx))
+        * `nemo-parakeet-tdt-0.6b-v3` - Nvidia Parakeet TDT 0.6B v3 (multilingual) ([origin](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3), [onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v3-onnx))
+        * `nemo-canary-1b-v2` - Nvidia Canary 1B v2 (multilingual) ([origin](https://huggingface.co/nvidia/canary-1b-v2), [onnx](https://huggingface.co/istupakov/canary-1b-v2-onnx))
        * `whisper-base` - OpenAI Whisper Base exported with onnxruntime ([origin](https://huggingface.co/openai/whisper-base), [onnx](https://huggingface.co/istupakov/whisper-base-onnx))
        ## VAD models
        * `silero` - Silero VAD ([origin](https://github.com/snakers4/silero-vad), [onnx](https://huggingface.co/onnx-community/silero-vad))
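
For reference, the loading pattern this commit moves to (`load_model` with `quantization="int8"`) and the recognition call used in `recognize()` can be exercised standalone. Below is a minimal sketch using only calls that appear in the diff; the one-second silent waveform is a placeholder input, not real test data.

```python
import numpy as np
import onnx_asr

# Load an int8-quantized multilingual model, as in models_multilang above.
model = onnx_asr.load_model("nemo-parakeet-tdt-0.6b-v3", quantization="int8")

# recognize() accepts a waveform plus its sample rate, mirroring the call in app.py.
sample_rate = 16000
waveform = np.zeros(sample_rate, dtype=np.float32)  # placeholder: one second of silence
print(model.recognize(waveform, sample_rate=sample_rate, language="en"))
```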
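
The commit keeps `vad = onnx_asr.load_vad("silero")` for the long-form tab, but the VAD wiring inside `recognize_with_vad` falls outside the hunks shown. As a rough sketch of how a VAD pairs with a model in onnx-asr (assuming the `with_vad` chaining described in the onnx-asr README; verify against the current release, and note the file name is hypothetical):

```python
import onnx_asr

vad = onnx_asr.load_vad("silero")
model = onnx_asr.load_model("gigaam-v3-rnnt")

# Assumption: with_vad() wraps the model so recognize() yields per-segment
# results with start/end timestamps instead of a single string.
for segment in model.with_vad(vad).recognize("long_audio.wav"):
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")
```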