keveman commited on
Commit
bc36801
·
verified ·
1 Parent(s): 60777ba

Upload 7 files

Browse files
Files changed (7) hide show
  1. audio_processor.js +36 -0
  2. encoder_sleeker_fp32.onnx +3 -0
  3. encoder_worker.js +214 -0
  4. index.html +426 -18
  5. streaming_asr.js +801 -0
  6. ten_vad.js +30 -0
  7. ten_vad.wasm +3 -0
audio_processor.js ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * AudioWorklet Processor for low-latency audio capture.
 * Runs on the dedicated audio rendering thread; accumulates incoming mono
 * samples and posts fixed-size Float32Array chunks to the main thread.
 */

class AudioProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    // Emit a chunk every 512 samples (~32ms at 16kHz).
    this.bufferSize = 512;
    this.buffer = [];
  }

  process(inputs, outputs, parameters) {
    const firstInput = inputs[0];
    if (firstInput.length > 0) {
      const samples = firstInput[0];

      // Accumulate the first channel's samples.
      for (const sample of samples) {
        this.buffer.push(sample);
      }

      // Drain complete chunks to the main thread.
      while (this.buffer.length >= this.bufferSize) {
        const chunk = this.buffer.splice(0, this.bufferSize);
        this.port.postMessage({ audio: new Float32Array(chunk) });
      }
    }

    return true; // Keep processor alive
  }
}

registerProcessor('audio-processor', AudioProcessor);
encoder_sleeker_fp32.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3639b95297009937838ae8fa9193602c7407e60a687b1591ffd2cc4b616d87d0
3
+ size 31139657
encoder_worker.js ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Encoder Worker - Runs preprocessor + encoder in a separate thread.
 *
 * Message protocol (see self.onmessage below): receives 'init',
 * 'segment_start', 'segment_end' and 'audio'; replies with 'status',
 * 'ready', 'error', 'segment_start', 'segment_end' and 'features'.
 */

// NOTE(review): the package spec below looks mangled by scraping
// ("[email protected]"); it should name a concrete onnxruntime-web version — confirm.
importScripts('https://cdn.jsdelivr.net/npm/[email protected]/dist/ort.min.js');

// Configure ONNX Runtime to find WASM files from CDN
ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/[email protected]/dist/';

// Model config (populated by the 'init' message handler)
let cfg = null;
let preprocessor = null; // NOTE(review): never assigned or read in this file
let encoder = null;      // NOTE(review): never assigned or read in this file
let tailLatency = 0;     // n_future * encoder_depth; output frames withheld until flush

// Preprocessor state
let prepSession = null;  // ort.InferenceSession for the streaming preprocessor
let prepDim = 0;         // = cfg.dim
let prepC1 = 0;          // = 2 * cfg.dim
let prepStateC1 = null;  // Float32Array recurrent state, fed as [1, 4, prepDim]
let prepStateC2 = null;  // Float32Array recurrent state, fed as [1, 4, prepC1]

// Encoder state
let encSession = null;        // ort.InferenceSession for the encoder
let encDim = 0;               // per-frame feature dimension (= cfg.dim)
let encNPast = 0;             // cfg.n_past (assigned in 'init' but unused here)
let encNFuture = 0;           // cfg.n_future (assigned in 'init' but unused here)
let encEncoderDepth = 0;      // cfg.encoder_depth (assigned in 'init' but unused here)
let encContextSize = 0;       // max frames kept in the sliding window for re-encoding
let encInputBuffer = [];      // sliding window: one Float32Array(encDim) per frame
let encTotalInputFrames = 0;  // frames received since segment start
let encTotalOutputFrames = 0; // frames emitted since segment start
33
+
34
/**
 * Zero the preprocessor's recurrent state tensors (if allocated) so the
 * next segment starts from a clean state.
 */
function resetPreprocessor() {
  prepStateC1?.fill(0);
  prepStateC2?.fill(0);
}
38
+
39
/**
 * Clear all encoder streaming state (sliding window and frame counters)
 * ahead of a new speech segment.
 */
function resetEncoder() {
  encTotalInputFrames = 0;
  encTotalOutputFrames = 0;
  encInputBuffer = [];
}
44
+
45
/**
 * Run one audio chunk through the streaming preprocessor model.
 * Feeds the current recurrent state alongside the audio, persists the
 * updated state for the next chunk, and returns the produced features.
 *
 * @param {Float32Array} audioChunk - mono PCM samples
 * @returns {Promise<{data: Float32Array, dims: number[]}>} feature tensor
 */
async function processPreprocessor(audioChunk) {
  const audioTensor = new ort.Tensor('float32', audioChunk, [1, audioChunk.length]);
  const stateC1Tensor = new ort.Tensor('float32', prepStateC1, [1, 4, prepDim]);
  const stateC2Tensor = new ort.Tensor('float32', prepStateC2, [1, 4, prepC1]);

  const results = await prepSession.run({
    'audio_chunk': audioTensor,
    'state_c1': stateC1Tensor,
    'state_c2': stateC2Tensor,
  });

  // Carry the recurrent state forward for the next chunk.
  prepStateC1.set(results.new_state_c1.data);
  prepStateC2.set(results.new_state_c2.data);

  const { data, dims } = results.features;
  return { data, dims };
}
63
+
64
/**
 * Incrementally run buffered mel frames through the encoder.
 *
 * The encoder session takes only an 'input' feed (no state tensors), so a
 * sliding window of up to `encContextSize` recent frames is re-encoded on
 * every call and only the newly-finalized slice of its output is returned.
 *
 * @param {Float32Array} melData - feature values; the indexing below assumes
 *   row-major [frames, encDim] layout — TODO confirm against the model.
 * @param {number[]} melDims - dims of melData; melDims[1] is the frame count.
 * @param {boolean} flush - when true, also emit frames inside the lookahead
 *   tail (re-emitting up to `tailLatency` previously provisional frames).
 * @returns {Promise<{data: Float32Array, dims: number[]}>} the new output
 *   frames as [1, newOutputCount, encDim].
 */
async function processEncoder(melData, melDims, flush = true) {
  const newFrames = melDims[1];

  // Append new frames to buffer (one Float32Array per frame).
  for (let f = 0; f < newFrames; f++) {
    const frame = new Float32Array(encDim);
    for (let d = 0; d < encDim; d++) {
      frame[d] = melData[f * encDim + d];
    }
    encInputBuffer.push(frame);
  }

  encTotalInputFrames += newFrames;

  // How many frames (counted from segment start) are final enough to emit:
  // everything when flushing, otherwise hold back `tailLatency` frames
  // (tailLatency = n_future * encoder_depth, set in the 'init' handler).
  const canOutput = flush
    ? encTotalInputFrames
    : Math.max(0, encTotalInputFrames - tailLatency);

  // First frame to emit this call. When flushing, back up by the tail so
  // frames previously emitted provisionally are re-emitted in final form.
  const outputFrom = flush
    ? Math.max(0, encTotalOutputFrames - tailLatency)
    : encTotalOutputFrames;

  const newOutputCount = canOutput - outputFrom;

  if (newOutputCount <= 0) {
    return { data: new Float32Array(0), dims: [1, 0, encDim] };
  }

  // Flatten the sliding window into one [1, bufferFrames, encDim] tensor.
  const bufferFrames = encInputBuffer.length;
  const bufferData = new Float32Array(bufferFrames * encDim);
  for (let f = 0; f < bufferFrames; f++) {
    bufferData.set(encInputBuffer[f], f * encDim);
  }

  const feeds = {
    'input': new ort.Tensor('float32', bufferData, [1, bufferFrames, encDim])
  };

  const results = await encSession.run(feeds);
  const fullOutput = results.output;

  // Translate absolute stream positions into offsets within this buffer.
  const bufStartFrame = encTotalInputFrames - bufferFrames;
  const outputStart = outputFrom - bufStartFrame;
  // NOTE(review): assumes outputStart >= 0, i.e. the trimmed window still
  // contains frame `outputFrom`; appears to hold while
  // encContextSize >= tailLatency + per-call batch — verify for all configs.

  // Copy out the newly-finalized slice of the encoder output.
  const resultData = new Float32Array(newOutputCount * encDim);
  for (let f = 0; f < newOutputCount; f++) {
    for (let d = 0; d < encDim; d++) {
      resultData[f * encDim + d] = fullOutput.data[(outputStart + f) * encDim + d];
    }
  }

  // Trim input buffer so at most `encContextSize` frames are re-encoded
  // on the next call.
  if (encInputBuffer.length > encContextSize) {
    encInputBuffer = encInputBuffer.slice(-encContextSize);
  }

  encTotalOutputFrames = canOutput;
  return { data: resultData, dims: [1, newOutputCount, encDim] };
}
127
+
128
/**
 * Message dispatcher for the encoder worker.
 *
 * Handles:
 *  - 'init':          load preprocessor + encoder sessions, post 'ready' or 'error'.
 *  - 'segment_start': reset streaming state, echo the message back.
 *  - 'segment_end':   echo the message back.
 *  - 'audio':         preprocessor -> encoder, then post 'features'
 *                     (Float32Array ownership transferred).
 */
self.onmessage = async function(e) {
  const { type, data } = e.data;

  switch (type) {
    case 'init': {
      try {
        cfg = data.cfg;
        const onnxUrl = data.onnxUrl;
        const modelName = data.modelName;
        // Generalized: caller may select a dtype suffix; defaults to the
        // previously hard-coded 'fp32' (backward compatible).
        const dtype = data.dtype ?? 'fp32';

        // Encoder lookahead (in frames) withheld until flush.
        tailLatency = cfg.n_future * cfg.encoder_depth;

        // Initialize preprocessor
        self.postMessage({ type: 'status', message: 'Loading preprocessor...' });
        prepSession = await ort.InferenceSession.create(
          `${onnxUrl}/preprocessor_streaming_${modelName}_${dtype}.onnx`
        );
        prepDim = cfg.dim;
        prepC1 = 2 * cfg.dim;
        prepStateC1 = new Float32Array(4 * cfg.dim);
        prepStateC2 = new Float32Array(4 * prepC1);

        // Initialize encoder
        self.postMessage({ type: 'status', message: 'Loading encoder...' });
        encSession = await ort.InferenceSession.create(
          `${onnxUrl}/encoder_${modelName}_${dtype}.onnx`
        );
        encDim = cfg.dim;
        encNPast = cfg.n_past;
        encNFuture = cfg.n_future;
        encEncoderDepth = cfg.encoder_depth;
        encContextSize = cfg.encoder_depth * (cfg.n_past + cfg.n_future);

        self.postMessage({ type: 'ready' });
      } catch (err) {
        self.postMessage({ type: 'error', message: err.message });
      }
      break;
    }

    case 'segment_start': {
      // New utterance: clear recurrent and sliding-window state.
      resetPreprocessor();
      resetEncoder();
      self.postMessage({
        type: 'segment_start',
        segmentId: data.segmentId
      });
      break;
    }

    case 'segment_end': {
      self.postMessage({
        type: 'segment_end',
        segmentId: data.segmentId
      });
      break;
    }

    case 'audio': {
      try {
        // Process through preprocessor
        const mel = await processPreprocessor(new Float32Array(data.audio));

        const audioMs = (data.audio.length / 16000 * 1000).toFixed(0);
        console.log(`Audio ${data.audio.length} samples (${audioMs}ms) → Mel ${mel.dims[1]} frames`);

        // Process through encoder with flush=true
        const enc = await processEncoder(mel.data, mel.dims, true);

        console.log(`Mel ${mel.dims[1]} frames → Encoder ${enc.dims[1]} frames (accumulated: ${encTotalOutputFrames})`);

        if (enc.dims[1] > 0) {
          self.postMessage({
            type: 'features',
            segmentId: data.segmentId,
            features: enc.data,
            dims: enc.dims
          }, [enc.data.buffer]); // Transfer ownership
        }
      } catch (err) {
        // Bug fix: previously this error was only logged inside the worker
        // and silently swallowed; also surface it to the main thread.
        console.error('Encoder error:', err);
        self.postMessage({ type: 'error', message: err.message });
      }
      break;
    }
  }
};
index.html CHANGED
@@ -1,19 +1,427 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Streaming ASR Demo - Moonshine</title>
7
+ <style>
8
+ * {
9
+ box-sizing: border-box;
10
+ margin: 0;
11
+ padding: 0;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
16
+ background: #1a1a2e;
17
+ color: #eee;
18
+ min-height: 100vh;
19
+ padding: 20px;
20
+ }
21
+
22
+ .container {
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+
27
+ h1 {
28
+ text-align: center;
29
+ margin-bottom: 20px;
30
+ color: #00d4ff;
31
+ }
32
+
33
+ .status-bar {
34
+ display: flex;
35
+ justify-content: space-between;
36
+ align-items: center;
37
+ background: #16213e;
38
+ padding: 15px 20px;
39
+ border-radius: 10px;
40
+ margin-bottom: 20px;
41
+ }
42
+
43
+ .status-indicator {
44
+ display: flex;
45
+ align-items: center;
46
+ gap: 10px;
47
+ }
48
+
49
+ .status-dot {
50
+ width: 12px;
51
+ height: 12px;
52
+ border-radius: 50%;
53
+ background: #666;
54
+ }
55
+
56
+ .status-dot.idle { background: #666; }
57
+ .status-dot.listening { background: #00ff88; animation: pulse 1s infinite; }
58
+ .status-dot.recording { background: #ff4444; animation: pulse 0.5s infinite; }
59
+
60
+ @keyframes pulse {
61
+ 0%, 100% { opacity: 1; }
62
+ 50% { opacity: 0.5; }
63
+ }
64
+
65
+ .controls {
66
+ display: flex;
67
+ gap: 10px;
68
+ }
69
+
70
+ button {
71
+ padding: 10px 20px;
72
+ border: none;
73
+ border-radius: 5px;
74
+ cursor: pointer;
75
+ font-size: 14px;
76
+ transition: all 0.2s;
77
+ }
78
+
79
+ button:disabled {
80
+ opacity: 0.5;
81
+ cursor: not-allowed;
82
+ }
83
+
84
+ .btn-primary {
85
+ background: #00d4ff;
86
+ color: #1a1a2e;
87
+ }
88
+
89
+ .btn-primary:hover:not(:disabled) {
90
+ background: #00a8cc;
91
+ }
92
+
93
+ .btn-danger {
94
+ background: #ff4444;
95
+ color: white;
96
+ }
97
+
98
+ .btn-danger:hover:not(:disabled) {
99
+ background: #cc3333;
100
+ }
101
+
102
+ .vad-section {
103
+ background: #16213e;
104
+ padding: 20px;
105
+ border-radius: 10px;
106
+ margin-bottom: 20px;
107
+ }
108
+
109
+ .vad-section h3 {
110
+ margin-bottom: 15px;
111
+ color: #00d4ff;
112
+ }
113
+
114
+ .vad-graph {
115
+ background: #0f0f23;
116
+ border-radius: 5px;
117
+ padding: 10px;
118
+ height: 120px;
119
+ position: relative;
120
+ overflow: hidden;
121
+ }
122
+
123
+ .vad-canvas {
124
+ width: 100%;
125
+ height: 100%;
126
+ }
127
+
128
+ .vad-bar {
129
+ display: flex;
130
+ align-items: center;
131
+ gap: 10px;
132
+ margin-top: 15px;
133
+ }
134
+
135
+ .vad-bar-container {
136
+ flex: 1;
137
+ height: 20px;
138
+ background: #0f0f23;
139
+ border-radius: 10px;
140
+ overflow: hidden;
141
+ }
142
+
143
+ .vad-bar-fill {
144
+ height: 100%;
145
+ background: linear-gradient(90deg, #00ff88, #00d4ff);
146
+ width: 0%;
147
+ transition: width 0.1s;
148
+ }
149
+
150
+ .vad-value {
151
+ min-width: 50px;
152
+ text-align: right;
153
+ font-family: monospace;
154
+ }
155
+
156
+ .pipeline-status {
157
+ display: flex;
158
+ gap: 15px;
159
+ margin-top: 15px;
160
+ font-family: monospace;
161
+ font-size: 12px;
162
+ color: #888;
163
+ }
164
+
165
+ .transcripts-section {
166
+ background: #16213e;
167
+ padding: 20px;
168
+ border-radius: 10px;
169
+ min-height: 300px;
170
+ }
171
+
172
+ .transcripts-section h3 {
173
+ margin-bottom: 15px;
174
+ color: #00d4ff;
175
+ }
176
+
177
+ .transcripts-list {
178
+ max-height: 200px;
179
+ overflow-y: auto;
180
+ }
181
+
182
+ .transcript-item {
183
+ padding: 10px 15px;
184
+ background: #0f0f23;
185
+ border-radius: 5px;
186
+ margin-bottom: 10px;
187
+ display: flex;
188
+ gap: 10px;
189
+ }
190
+
191
+ .transcript-duration {
192
+ color: #00d4ff;
193
+ font-family: monospace;
194
+ min-width: 50px;
195
+ }
196
+
197
+ .transcript-text {
198
+ flex: 1;
199
+ }
200
+
201
+ .live-caption {
202
+ padding: 30px 40px;
203
+ background: rgba(0, 0, 0, 0.75);
204
+ border-radius: 12px;
205
+ margin-top: 25px;
206
+ min-height: 100px;
207
+ display: flex;
208
+ flex-direction: column;
209
+ justify-content: center;
210
+ align-items: center;
211
+ text-align: center;
212
+ backdrop-filter: blur(10px);
213
+ border: 1px solid rgba(255, 255, 255, 0.1);
214
+ box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
215
+ transition: all 0.3s ease;
216
+ }
217
+
218
+ .live-caption.active {
219
+ background: rgba(0, 0, 0, 0.85);
220
+ border-color: rgba(255, 68, 68, 0.3);
221
+ box-shadow: 0 8px 32px rgba(255, 68, 68, 0.15);
222
+ }
223
+
224
+ .live-caption-label {
225
+ font-size: 11px;
226
+ color: rgba(255, 255, 255, 0.5);
227
+ text-transform: uppercase;
228
+ letter-spacing: 2px;
229
+ margin-bottom: 12px;
230
+ }
231
+
232
+ .live-caption.active .live-caption-label {
233
+ color: #ff6b6b;
234
+ }
235
+
236
+ .live-caption-text {
237
+ font-size: 28px;
238
+ font-weight: 400;
239
+ line-height: 1.5;
240
+ min-height: 40px;
241
+ color: #ffffff;
242
+ text-shadow: 0 2px 4px rgba(0, 0, 0, 0.5);
243
+ max-width: 100%;
244
+ word-wrap: break-word;
245
+ }
246
+
247
+ .live-caption-text.placeholder {
248
+ color: rgba(255, 255, 255, 0.35);
249
+ font-style: italic;
250
+ font-size: 20px;
251
+ font-weight: 300;
252
+ }
253
+
254
+ .config-section {
255
+ background: #16213e;
256
+ padding: 20px;
257
+ border-radius: 10px;
258
+ margin-bottom: 20px;
259
+ }
260
+
261
+ .config-section h3 {
262
+ margin-bottom: 15px;
263
+ color: #00d4ff;
264
+ }
265
+
266
+ .config-grid {
267
+ display: grid;
268
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
269
+ gap: 15px;
270
+ }
271
+
272
+ .config-item {
273
+ display: flex;
274
+ flex-direction: column;
275
+ gap: 5px;
276
+ }
277
+
278
+ .config-item label {
279
+ font-size: 12px;
280
+ color: #888;
281
+ }
282
+
283
+ .config-item select, .config-item input {
284
+ padding: 8px;
285
+ border: 1px solid #333;
286
+ border-radius: 5px;
287
+ background: #0f0f23;
288
+ color: #eee;
289
+ }
290
+
291
+ .loading-overlay {
292
+ position: fixed;
293
+ top: 0;
294
+ left: 0;
295
+ right: 0;
296
+ bottom: 0;
297
+ background: rgba(0, 0, 0, 0.8);
298
+ display: flex;
299
+ justify-content: center;
300
+ align-items: center;
301
+ z-index: 1000;
302
+ }
303
+
304
+ .loading-overlay.hidden {
305
+ display: none;
306
+ }
307
+
308
+ .loading-content {
309
+ text-align: center;
310
+ }
311
+
312
+ .loading-spinner {
313
+ width: 50px;
314
+ height: 50px;
315
+ border: 3px solid #333;
316
+ border-top-color: #00d4ff;
317
+ border-radius: 50%;
318
+ animation: spin 1s linear infinite;
319
+ margin: 0 auto 20px;
320
+ }
321
+
322
+ @keyframes spin {
323
+ to { transform: rotate(360deg); }
324
+ }
325
+
326
+ .loading-text {
327
+ color: #00d4ff;
328
+ }
329
+
330
+ .error-message {
331
+ background: #ff4444;
332
+ color: white;
333
+ padding: 15px;
334
+ border-radius: 5px;
335
+ margin-bottom: 20px;
336
+ display: none;
337
+ }
338
+
339
+ .error-message.visible {
340
+ display: block;
341
+ }
342
+ </style>
343
+ </head>
344
+ <body>
345
+ <div class="loading-overlay hidden" id="loadingOverlay">
346
+ <div class="loading-content">
347
+ <div class="loading-spinner"></div>
348
+ <div class="loading-text" id="loadingText">Loading models...</div>
349
+ </div>
350
+ </div>
351
+
352
+ <div class="container">
353
+ <h1>Streaming ASR Demo</h1>
354
+
355
+ <div class="error-message" id="errorMessage"></div>
356
+
357
+ <div class="config-section">
358
+ <h3>Configuration</h3>
359
+ <div class="config-grid">
360
+ <div class="config-item">
361
+ <label>Model</label>
362
+ <select id="modelSelect">
363
+ <option value="sleeker">Moonshine Sleeker</option>
364
+ <option value="spindlier">Moonshine Spindlier</option>
365
+ </select>
366
+ </div>
367
+ <div class="config-item">
368
+ <label>ONNX Files URL</label>
369
+ <input type="text" id="onnxUrl" placeholder="e.g., ./models or https://..." value="./models">
370
+ </div>
371
+ <div class="config-item">
372
+ <label>Onset Threshold</label>
373
+ <input type="number" id="onsetThreshold" value="0.4" min="0" max="1" step="0.1">
374
+ </div>
375
+ <div class="config-item">
376
+ <label>Offset Threshold</label>
377
+ <input type="number" id="offsetThreshold" value="0.3" min="0" max="1" step="0.1">
378
+ </div>
379
+ </div>
380
+ </div>
381
+
382
+ <div class="status-bar">
383
+ <div class="status-indicator">
384
+ <div class="status-dot" id="statusDot"></div>
385
+ <span id="statusText">Ready</span>
386
+ </div>
387
+ <div class="controls">
388
+ <button class="btn-primary" id="startBtn">Start Listening</button>
389
+ <button class="btn-danger" id="stopBtn" disabled>Stop</button>
390
+ </div>
391
+ </div>
392
+
393
+ <div class="vad-section">
394
+ <h3>Voice Activity Detection</h3>
395
+ <div class="vad-graph">
396
+ <canvas id="vadCanvas" class="vad-canvas"></canvas>
397
+ </div>
398
+ <div class="vad-bar">
399
+ <span>VAD:</span>
400
+ <div class="vad-bar-container">
401
+ <div class="vad-bar-fill" id="vadBarFill"></div>
402
+ </div>
403
+ <span class="vad-value" id="vadValue">0%</span>
404
+ </div>
405
+ <div class="pipeline-status">
406
+ <span>audio_q: <span id="audioQueueSize">0</span></span>
407
+ <span>features_q: <span id="featuresQueueSize">0</span></span>
408
+ <span>dropped: <span id="droppedChunks">0</span></span>
409
+ </div>
410
+ </div>
411
+
412
+ <div class="transcripts-section">
413
+ <h3>Transcripts</h3>
414
+ <div class="transcripts-list" id="transcriptsList"></div>
415
+ <div class="live-caption" id="liveCaption">
416
+ <div class="live-caption-label">Live Caption</div>
417
+ <div class="live-caption-text placeholder" id="liveCaptionText">
418
+ Waiting for speech...
419
+ </div>
420
+ </div>
421
+ </div>
422
+ </div>
423
+
424
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/ort.min.js"></script>
425
+ <script type="module" src="streaming_asr.js"></script>
426
+ </body>
427
  </html>
streaming_asr.js ADDED
@@ -0,0 +1,801 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Streaming ASR Demo - JavaScript Implementation with Web Workers
 *
 * Thread layout:
 *  - Main thread:    audio capture, VAD, UI updates
 *  - Encoder worker: preprocessor + encoder ONNX inference
 *  - Decoder worker: adapter + decoder ONNX inference
 */

// =============================================================================
// Constants
// =============================================================================

const SAMPLE_RATE = 16000;
const VAD_CHUNK_SAMPLES = 160;      // 10ms - optimal for TenVAD
const ASR_CHUNK_SAMPLES = 320;      // 20ms - Moonshine frame size
const ENCODER_BATCH_SAMPLES = 5120; // 320ms - batch size for encoder

const PRE_BUFFER_CHUNKS = 15; // ~300ms at 20ms chunks
const POST_BUFFER_CHUNKS = 3; // ~60ms at 20ms chunks

// Per-model hyperparameters, keyed by the UI's model selector value.
const MODEL_CONFIGS = {
  sleeker: {
    dim: 336,
    dec_dim: 288,
    depth: 6,
    encoder_depth: 6,
    n_past: 16,
    n_future: 4,
    nheads: 8,
    head_dim: 36,
    vocab_size: 32768,
  },
  spindlier: {
    dim: 620,
    dec_dim: 512,
    depth: 10,
    encoder_depth: 10,
    n_past: 16,
    n_future: 4,
    nheads: 8,
    head_dim: 64,
    vocab_size: 32768,
  },
};
46
+
47
+ // =============================================================================
48
+ // TenVAD - WebAssembly-based Voice Activity Detection
49
+ // =============================================================================
50
+
51
/**
 * TenVAD - frame-level voice activity detection backed by a WASM module.
 * Wraps the ten_vad C API: allocates WASM heap buffers once in init(),
 * converts each float chunk to int16, and returns a speech probability.
 */
class TenVAD {
  /**
   * @param {number} hopSize - samples per VAD frame (160 = 10ms at 16kHz)
   * @param {number} threshold - passed to ten_vad_create; the binary
   *   voiced/unvoiced flag is written to flagPtr but never read here.
   */
  constructor(hopSize = 160, threshold = 0.5) {
    this.hopSize = hopSize;
    this.threshold = threshold;
    this.module = null;    // Emscripten module instance
    this.vadHandle = null; // opaque handle returned by ten_vad_create
    this.audioPtr = null;  // WASM heap buffer for int16 input samples
    this.probPtr = null;   // WASM heap slot for the float probability output
    this.flagPtr = null;   // WASM heap slot for the int voiced-flag output
    this.ready = false;
  }

  /**
   * Load the WASM module and create a VAD instance.
   * @param {string} wasmUrl - URL of the Emscripten JS glue; the .wasm
   *   binary is assumed to live at the same path with a .wasm extension.
   * @throws {Error} when ten_vad_create returns a nonzero status.
   */
  async init(wasmUrl = './ten_vad.js') {
    const wasmBinaryUrl = wasmUrl.replace('.js', '.wasm');

    // Dynamic import of the ES module
    const vadModule = await import(wasmUrl);
    const createTenVadModule = vadModule.default;

    this.module = await createTenVadModule({
      locateFile: (path) => {
        if (path.endsWith('.wasm')) {
          return wasmBinaryUrl;
        }
        return path;
      }
    });

    // Create VAD instance: ten_vad_create writes the handle into a
    // temporary 4-byte out-parameter on the WASM heap.
    const vadHandlePtr = this.module._malloc(4);
    const result = this.module._ten_vad_create(vadHandlePtr, this.hopSize, this.threshold);

    if (result !== 0) {
      this.module._free(vadHandlePtr);
      throw new Error(`Failed to create TenVAD instance: ${result}`);
    }

    this.vadHandle = this.module.HEAP32[vadHandlePtr / 4];
    this.module._free(vadHandlePtr);

    // Allocate reusable buffers: hopSize int16 samples (2 bytes each),
    // plus one 4-byte slot each for the probability and flag outputs.
    this.audioPtr = this.module._malloc(this.hopSize * 2);
    this.probPtr = this.module._malloc(4);
    this.flagPtr = this.module._malloc(4);

    this.ready = true;
  }

  /**
   * Score one hop of audio.
   * @param {Float32Array} audioChunkFloat32 - samples in [-1, 1]; only the
   *   first hopSize samples are used, shorter chunks are zero-padded.
   * @returns {number} speech probability in [0, 1], or -1 if not initialized.
   */
  process(audioChunkFloat32) {
    if (!this.ready) return -1;

    // Convert float [-1, 1] to clamped int16 PCM.
    const int16Data = new Int16Array(this.hopSize);
    for (let i = 0; i < this.hopSize && i < audioChunkFloat32.length; i++) {
      int16Data[i] = Math.max(-32768, Math.min(32767, Math.round(audioChunkFloat32[i] * 32767)));
    }

    this.module.HEAP16.set(int16Data, this.audioPtr / 2);
    this.module._ten_vad_process(this.vadHandle, this.audioPtr, this.hopSize, this.probPtr, this.flagPtr);

    return this.module.HEAPF32[this.probPtr / 4];
  }

  /**
   * Free WASM heap buffers and mark the instance unusable.
   * Safe to call multiple times.
   */
  destroy() {
    if (!this.ready || !this.module) return;

    this.ready = false; // Prevent further use

    try {
      if (this.audioPtr) {
        this.module._free(this.audioPtr);
        this.audioPtr = null;
      }
      if (this.probPtr) {
        this.module._free(this.probPtr);
        this.probPtr = null;
      }
      if (this.flagPtr) {
        this.module._free(this.flagPtr);
        this.flagPtr = null;
      }
      // Skip _ten_vad_destroy as it causes memory access errors
      // The WASM memory will be cleaned up when the module is garbage collected
      this.vadHandle = null;
    } catch (e) {
      console.warn('TenVAD cleanup error:', e);
    }

    this.module = null;
  }
}
141
+
142
+ // Fallback simple energy-based VAD
143
/**
 * Energy-based fallback VAD used when the TenVAD WASM module is unavailable.
 * Tracks a rolling noise floor and maps the chunk's SNR through a sigmoid
 * to produce a pseudo speech probability.
 */
class SimpleVAD {
  constructor(sampleRate = 16000, frameSize = 160) {
    this.frameSize = frameSize;
    this.energyHistory = [];
    this.historySize = 50;
    this.noiseFloor = 0.001;
    this.ready = true;
  }

  // No setup required; present for interface parity with TenVAD.
  async init() {}

  /**
   * @param {Float32Array} audioChunk - samples in [-1, 1]
   * @returns {number} speech probability in (0, 1)
   */
  process(audioChunk) {
    // RMS energy of the chunk.
    let sumSquares = 0;
    for (const sample of audioChunk) {
      sumSquares += sample * sample;
    }
    const rms = Math.sqrt(sumSquares / audioChunk.length);

    this.energyHistory.push(rms);
    if (this.energyHistory.length > this.historySize) {
      this.energyHistory.shift();
    }

    // Once enough history exists, track the noise floor as roughly the
    // 10th percentile of recent energies.
    if (this.energyHistory.length > 10) {
      const ascending = [...this.energyHistory].sort((a, b) => a - b);
      this.noiseFloor = ascending[Math.floor(ascending.length * 0.1)] || 0.001;
    }

    // Sigmoid over SNR, centered at 3x the noise floor.
    const snr = rms / (this.noiseFloor + 1e-10);
    return 1 / (1 + Math.exp(-2 * (snr - 3)));
  }

  // Nothing to release; present for interface parity with TenVAD.
  destroy() {}
}
177
+
178
+ // =============================================================================
179
+ // Pipelined Streaming ASR with Web Workers
180
+ // =============================================================================
181
+
182
+ class PipelinedStreamingASR {
183
+ constructor(config) {
184
+ this.modelName = config.modelName || 'sleeker';
185
+ this.onnxUrl = config.onnxUrl || './models';
186
+ this.onsetThreshold = config.onsetThreshold || 0.5;
187
+ this.offsetThreshold = config.offsetThreshold || 0.3;
188
+ this.emaAlpha = config.emaAlpha || 0.3;
189
+
190
+ this.cfg = MODEL_CONFIGS[this.modelName];
191
+
192
+ // Workers
193
+ this.encoderWorker = null;
194
+ this.decoderWorker = null;
195
+ this.encoderReady = false;
196
+ this.decoderReady = false;
197
+
198
+ // VAD (runs on main thread for low latency)
199
+ this.vad = null;
200
+
201
+ // Audio capture
202
+ this.audioContext = null;
203
+ this.sourceNode = null;
204
+ this.workletNode = null;
205
+
206
+ // State
207
+ this.running = false;
208
+ this.state = 'idle';
209
+ this.currentSegmentId = 0;
210
+ this.emaProb = 0;
211
+ this.onsetCounter = 0;
212
+ this.offsetCounter = 0;
213
+
214
+ // Buffers
215
+ this.vadBuffer = [];
216
+ this.asrBuffer = [];
217
+ this.preBuffer = [];
218
+ this.postBufferRemaining = 0;
219
+ this.encoderBatchBuffer = []; // Accumulate 320ms before sending to encoder
220
+
221
+ // Display state
222
+ this.vadHistory = [];
223
+ this.vadUpdateCounter = 0;
224
+ this.vadUpdateInterval = 5; // Update display every 5 VAD chunks (50ms)
225
+
226
+ // Callbacks
227
+ this.onVadUpdate = null;
228
+ this.onTranscript = null;
229
+ this.onLiveCaption = null;
230
+ this.onStatusUpdate = null;
231
+ this.onQueueUpdate = null;
232
+ }
233
+
234
/**
 * Load the VAD and both inference workers. Must resolve before start().
 * @param {(msg: string) => void} [progressCallback] - optional status reporter
 */
async loadModels(progressCallback) {
  // Prefer the WASM TenVAD; fall back to the energy-based SimpleVAD if the
  // module fails to load (e.g. missing ten_vad.js / ten_vad.wasm).
  try {
    progressCallback?.('Loading TenVAD...');
    this.vad = new TenVAD(VAD_CHUNK_SAMPLES, 0.5);
    await this.vad.init('./ten_vad.js');
    console.log('Using TenVAD');
  } catch (e) {
    console.warn('TenVAD failed, using SimpleVAD:', e.message);
    this.vad = new SimpleVAD(SAMPLE_RATE, VAD_CHUNK_SAMPLES);
    await this.vad.init();
  }

  // Initialize Encoder Worker
  progressCallback?.('Loading encoder...');
  await this.initEncoderWorker();

  // Initialize Decoder Worker
  progressCallback?.('Loading decoder...');
  await this.initDecoderWorker();

  progressCallback?.('Ready!');
}
257
+
258
/**
 * Spawn the encoder worker, wire its messages, and send 'init'.
 * Resolves when the worker posts 'ready'; rejects on 'error'.
 * Encoder output ('segment_start'/'segment_end'/'features') is forwarded
 * straight to the decoder worker, bypassing further main-thread work.
 * @returns {Promise<void>}
 */
initEncoderWorker() {
  return new Promise((resolve, reject) => {
    this.encoderWorker = new Worker('./encoder_worker.js');

    this.encoderWorker.onmessage = (e) => {
      // NOTE(review): `data` is destructured but unused below — the cases
      // read e.data.* directly.
      const { type, data } = e.data;

      switch (type) {
        case 'ready':
          this.encoderReady = true;
          resolve();
          break;
        case 'error':
          // NOTE(review): once resolved, a later reject() is a no-op, so
          // post-init worker errors are effectively dropped here.
          reject(new Error(e.data.message));
          break;
        case 'status':
          // Progress update from worker
          break;
        case 'segment_start':
          this.decoderWorker?.postMessage({ type: 'segment_start', data: { segmentId: e.data.segmentId } });
          break;
        case 'segment_end':
          this.decoderWorker?.postMessage({ type: 'segment_end', data: { segmentId: e.data.segmentId } });
          break;
        case 'features':
          // Forward features to decoder worker, transferring the buffer to
          // avoid copying the Float32Array.
          this.decoderWorker?.postMessage({
            type: 'features',
            data: {
              segmentId: e.data.segmentId,
              features: e.data.features,
              dims: e.data.dims
            }
          }, [e.data.features.buffer]);
          break;
      }
    };

    // Kick off model loading inside the worker.
    this.encoderWorker.postMessage({
      type: 'init',
      data: {
        cfg: this.cfg,
        onnxUrl: this.onnxUrl,
        modelName: this.modelName
      }
    });
  });
}
306
+
307
/**
 * Spawn the decoder worker, wire its messages, and send 'init'.
 * Resolves when the worker posts 'ready'; rejects on 'error'.
 * Transcription results are surfaced through the onTranscript /
 * onLiveCaption callbacks.
 * @returns {Promise<void>}
 */
initDecoderWorker() {
  return new Promise((resolve, reject) => {
    this.decoderWorker = new Worker('./decoder_worker.js');

    this.decoderWorker.onmessage = (e) => {
      const { type } = e.data;

      switch (type) {
        case 'ready':
          this.decoderReady = true;
          resolve();
          break;
        case 'error':
          // NOTE(review): a reject() after resolve() is a no-op, so
          // post-init decoder errors are effectively dropped here.
          reject(new Error(e.data.message));
          break;
        case 'status':
          break;
        case 'transcript':
          // Final text for a completed segment.
          this.onTranscript?.(e.data.text, e.data.segmentId);
          break;
        case 'live_caption':
          // Partial text while a segment is in progress.
          this.onLiveCaption?.(e.data.text);
          break;
      }
    };

    // Kick off model loading inside the worker.
    this.decoderWorker.postMessage({
      type: 'init',
      data: {
        cfg: this.cfg,
        onnxUrl: this.onnxUrl,
        modelName: this.modelName
      }
    });
  });
}
343
+
344
/**
 * Open the microphone and begin streaming audio into the pipeline.
 * Requests raw (unprocessed) mono audio, builds the Web Audio graph, and
 * feeds captured chunks to processAudioChunk (defined elsewhere in this
 * class). No-op if already running.
 * @throws {DOMException} if the user denies microphone access.
 */
async start() {
  if (this.running) return;

  // Disable browser DSP so the VAD and models see unprocessed audio.
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      sampleRate: SAMPLE_RATE,
      channelCount: 1,
      echoCancellation: false,
      noiseSuppression: false,
      autoGainControl: false
    }
  });

  this.audioContext = new AudioContext({ sampleRate: SAMPLE_RATE });

  // Check actual sample rate — browsers may ignore the requested rate.
  console.log(`Requested sample rate: ${SAMPLE_RATE}, Actual: ${this.audioContext.sampleRate}`);
  this.sourceNode = this.audioContext.createMediaStreamSource(stream);

  // Use AudioWorklet for better performance
  try {
    await this.audioContext.audioWorklet.addModule('./audio_processor.js');
    this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');

    // Chunks arrive as { audio: Float32Array } from audio_processor.js.
    this.workletNode.port.onmessage = (e) => {
      if (this.running) {
        this.processAudioChunk(e.data.audio);
      }
    };

    this.sourceNode.connect(this.workletNode);
    // NOTE(review): connecting to destination keeps the node pulled, but
    // may also route mic audio to the speakers — confirm intended.
    this.workletNode.connect(this.audioContext.destination);
  } catch (e) {
    // Fallback to ScriptProcessor (deprecated API) when AudioWorklet is
    // unavailable.
    console.warn('AudioWorklet not available, using ScriptProcessor');
    const bufferSize = 2048;
    this.scriptNode = this.audioContext.createScriptProcessor(bufferSize, 1, 1);

    this.scriptNode.onaudioprocess = (e) => {
      if (this.running) {
        const inputData = e.inputBuffer.getChannelData(0);
        // Copy: getChannelData returns a view into a reused buffer.
        this.processAudioChunk(new Float32Array(inputData));
      }
    };

    this.sourceNode.connect(this.scriptNode);
    this.scriptNode.connect(this.audioContext.destination);
  }

  // Reset the segmentation state machine for a fresh session.
  this.running = true;
  this.state = 'idle';
  this.onsetCounter = 0;
  this.offsetCounter = 0;
  this.emaProb = 0;

  this.onStatusUpdate?.('listening', 'Listening...');
}
401
+
402
+ stop() {
403
+ this.running = false;
404
+
405
+ if (this.workletNode) {
406
+ this.workletNode.disconnect();
407
+ this.workletNode = null;
408
+ }
409
+ if (this.scriptNode) {
410
+ this.scriptNode.disconnect();
411
+ this.scriptNode = null;
412
+ }
413
+ if (this.sourceNode) {
414
+ this.sourceNode.disconnect();
415
+ this.sourceNode = null;
416
+ }
417
+ if (this.audioContext) {
418
+ this.audioContext.close();
419
+ this.audioContext = null;
420
+ }
421
+ if (this.vad) {
422
+ this.vad.destroy();
423
+ }
424
+ if (this.encoderWorker) {
425
+ this.encoderWorker.terminate();
426
+ this.encoderWorker = null;
427
+ }
428
+ if (this.decoderWorker) {
429
+ this.decoderWorker.terminate();
430
+ this.decoderWorker = null;
431
+ }
432
+
433
+ this.onStatusUpdate?.('idle', 'Stopped');
434
+ }
435
+
436
+ processAudioChunk(audioData) {
437
+ // Accumulate for VAD (10ms chunks)
438
+ this.vadBuffer.push(...audioData);
439
+
440
+ // Accumulate for ASR (20ms chunks)
441
+ this.asrBuffer.push(...audioData);
442
+
443
+ // Process VAD chunks
444
+ while (this.vadBuffer.length >= VAD_CHUNK_SAMPLES) {
445
+ const vadChunk = new Float32Array(this.vadBuffer.splice(0, VAD_CHUNK_SAMPLES));
446
+ const prob = this.vad.process(vadChunk);
447
+
448
+ if (prob >= 0) {
449
+ this.emaProb = this.emaAlpha * prob + (1 - this.emaAlpha) * this.emaProb;
450
+
451
+ // Throttle display updates (every 50ms instead of 10ms)
452
+ this.vadUpdateCounter++;
453
+ if (this.vadUpdateCounter >= this.vadUpdateInterval) {
454
+ this.vadUpdateCounter = 0;
455
+ this.vadHistory.push(this.emaProb);
456
+ if (this.vadHistory.length > 100) this.vadHistory.shift();
457
+ this.onVadUpdate?.(this.emaProb, this.vadHistory);
458
+ }
459
+
460
+ this.updateSegmentState();
461
+ }
462
+ }
463
+
464
+ // Extract complete ASR chunks
465
+ while (this.asrBuffer.length >= ASR_CHUNK_SAMPLES) {
466
+ const chunkData = this.asrBuffer.splice(0, ASR_CHUNK_SAMPLES);
467
+ const chunk = new Float32Array(chunkData);
468
+
469
+ if (this.state === 'speech') {
470
+ this.sendAudioToEncoder(chunk);
471
+ } else {
472
+ this.preBuffer.push(chunk);
473
+ if (this.preBuffer.length > PRE_BUFFER_CHUNKS) {
474
+ this.preBuffer.shift();
475
+ }
476
+
477
+ if (this.postBufferRemaining > 0) {
478
+ this.sendAudioToEncoder(chunk);
479
+ this.postBufferRemaining--;
480
+
481
+ if (this.postBufferRemaining === 0) {
482
+ this.finalizeSegmentEnd();
483
+ }
484
+ }
485
+ }
486
+ }
487
+ }
488
+
489
+ sendAudioToEncoder(chunk, flush = false) {
490
+ if (!this.encoderWorker || !this.encoderReady) return;
491
+
492
+ // Accumulate chunks into batch buffer
493
+ this.encoderBatchBuffer.push(...chunk);
494
+
495
+ // Send when we have 320ms worth of audio, or on flush
496
+ if (this.encoderBatchBuffer.length >= ENCODER_BATCH_SAMPLES || flush) {
497
+ if (this.encoderBatchBuffer.length > 0) {
498
+ const batch = new Float32Array(this.encoderBatchBuffer);
499
+ this.encoderBatchBuffer = [];
500
+
501
+ this.encoderWorker.postMessage({
502
+ type: 'audio',
503
+ data: {
504
+ audio: batch,
505
+ segmentId: this.currentSegmentId
506
+ }
507
+ }, [batch.buffer]);
508
+ }
509
+ }
510
+ }
511
+
512
+ updateSegmentState() {
513
+ if (this.state === 'idle') {
514
+ if (this.emaProb >= this.onsetThreshold) {
515
+ this.onsetCounter++;
516
+ if (this.onsetCounter >= 2) {
517
+ this.startSegment();
518
+ }
519
+ } else {
520
+ this.onsetCounter = 0;
521
+ }
522
+ } else if (this.state === 'speech') {
523
+ if (this.emaProb < this.offsetThreshold) {
524
+ this.offsetCounter++;
525
+ if (this.offsetCounter >= 3) {
526
+ this.endSegment();
527
+ }
528
+ } else {
529
+ this.offsetCounter = 0;
530
+ }
531
+ }
532
+ }
533
+
534
+ startSegment() {
535
+ this.currentSegmentId++;
536
+ this.state = 'speech';
537
+ this.onsetCounter = 0;
538
+ this.offsetCounter = 0;
539
+ this.encoderBatchBuffer = []; // Reset batch buffer for new segment
540
+
541
+ // Tell encoder to start new segment
542
+ this.encoderWorker?.postMessage({
543
+ type: 'segment_start',
544
+ data: { segmentId: this.currentSegmentId }
545
+ });
546
+
547
+ // Drain pre-buffer
548
+ while (this.preBuffer.length > 0) {
549
+ const chunk = this.preBuffer.shift();
550
+ this.sendAudioToEncoder(chunk);
551
+ }
552
+
553
+ this.onStatusUpdate?.('recording', 'Recording...');
554
+ }
555
+
556
+ endSegment() {
557
+ this.state = 'idle';
558
+ this.offsetCounter = 0;
559
+ this.postBufferRemaining = POST_BUFFER_CHUNKS;
560
+
561
+ if (this.postBufferRemaining === 0) {
562
+ this.finalizeSegmentEnd();
563
+ }
564
+
565
+ this.onStatusUpdate?.('listening', 'Listening...');
566
+ }
567
+
568
+ finalizeSegmentEnd() {
569
+ // Process remaining complete chunks
570
+ while (this.asrBuffer.length >= ASR_CHUNK_SAMPLES) {
571
+ const chunkData = this.asrBuffer.splice(0, ASR_CHUNK_SAMPLES);
572
+ const chunk = new Float32Array(chunkData);
573
+ this.sendAudioToEncoder(chunk);
574
+ }
575
+
576
+ // Pad and send partial chunk
577
+ if (this.asrBuffer.length > 0) {
578
+ const padded = new Float32Array(ASR_CHUNK_SAMPLES);
579
+ padded.set(this.asrBuffer);
580
+ this.sendAudioToEncoder(padded);
581
+ }
582
+ this.asrBuffer = [];
583
+
584
+ // Flush any remaining audio in the batch buffer
585
+ this.sendAudioToEncoder(new Float32Array(0), true);
586
+
587
+ // Signal segment end
588
+ this.encoderWorker?.postMessage({
589
+ type: 'segment_end',
590
+ data: { segmentId: this.currentSegmentId }
591
+ });
592
+ }
593
+ }
594
+
595
+ // =============================================================================
596
+ // UI Controller
597
+ // =============================================================================
598
+
599
class ASRDemoUI {
  /**
   * Browser UI controller for the streaming ASR demo.
   *
   * Wires the DOM controls (start/stop buttons, model/threshold config) to a
   * PipelinedStreamingASR instance and renders its callbacks: the VAD
   * probability bar and history plot, live captions, finalized transcripts,
   * and the status indicator.
   */
  constructor() {
    this.asr = null; // active PipelinedStreamingASR, or null when stopped
    this.vadCanvas = null;
    this.vadCtx = null;

    this.initElements();
    this.initCanvas();
    this.bindEvents();
  }

  /** Cache references to every DOM element the controller touches. */
  initElements() {
    this.loadingOverlay = document.getElementById('loadingOverlay');
    this.loadingText = document.getElementById('loadingText');
    this.errorMessage = document.getElementById('errorMessage');
    this.statusDot = document.getElementById('statusDot');
    this.statusText = document.getElementById('statusText');
    this.startBtn = document.getElementById('startBtn');
    this.stopBtn = document.getElementById('stopBtn');
    this.vadBarFill = document.getElementById('vadBarFill');
    this.vadValue = document.getElementById('vadValue');
    this.audioQueueSize = document.getElementById('audioQueueSize');
    this.featuresQueueSize = document.getElementById('featuresQueueSize');
    this.droppedChunksEl = document.getElementById('droppedChunks');
    this.transcriptsList = document.getElementById('transcriptsList');
    this.liveCaption = document.getElementById('liveCaption');
    this.liveCaptionText = document.getElementById('liveCaptionText');
    this.modelSelect = document.getElementById('modelSelect');
    this.onnxUrl = document.getElementById('onnxUrl');
    this.onsetThreshold = document.getElementById('onsetThreshold');
    this.offsetThreshold = document.getElementById('offsetThreshold');
  }

  /** Size the VAD canvas backing store for devicePixelRatio so the plot is crisp. */
  initCanvas() {
    this.vadCanvas = document.getElementById('vadCanvas');
    this.vadCtx = this.vadCanvas.getContext('2d');

    const rect = this.vadCanvas.getBoundingClientRect();
    this.vadCanvas.width = rect.width * window.devicePixelRatio;
    this.vadCanvas.height = rect.height * window.devicePixelRatio;
    this.vadCtx.scale(window.devicePixelRatio, window.devicePixelRatio);
  }

  /** Hook up the start/stop buttons. */
  bindEvents() {
    this.startBtn.addEventListener('click', () => this.handleStart());
    this.stopBtn.addEventListener('click', () => this.handleStop());
  }

  /**
   * Handle the Start button: build the pipeline from the current config,
   * load models, and begin capturing. On failure, tears down any partially
   * started pipeline (workers, audio graph, microphone) so a retry works.
   */
  async handleStart() {
    try {
      this.hideError(); // clear any banner left over from a previous attempt
      this.showLoading('Initializing...');

      const config = {
        modelName: this.modelSelect.value,
        onnxUrl: this.onnxUrl.value || './models',
        onsetThreshold: parseFloat(this.onsetThreshold.value),
        offsetThreshold: parseFloat(this.offsetThreshold.value)
      };

      this.asr = new PipelinedStreamingASR(config);

      this.asr.onVadUpdate = (prob, history) => this.updateVadDisplay(prob, history);
      this.asr.onTranscript = (text, segmentId) => this.addTranscript(text, segmentId);
      this.asr.onLiveCaption = (text) => this.updateLiveCaption(text);
      this.asr.onStatusUpdate = (status, text) => this.updateStatus(status, text);

      await this.asr.loadModels((text) => {
        this.loadingText.textContent = text;
      });

      await this.asr.start();

      this.hideLoading();
      this.startBtn.disabled = true;
      this.stopBtn.disabled = false;
      this.disableConfig(true);

    } catch (error) {
      console.error('Start error:', error);
      // Release anything that did get started (workers, mic, audio graph);
      // leaving this.asr set would leak the microphone and block retries.
      if (this.asr) {
        try {
          this.asr.stop();
        } catch (stopError) {
          console.warn('Cleanup after failed start also failed:', stopError);
        }
        this.asr = null;
      }
      this.hideLoading();
      this.showError(`Failed to start: ${error.message}`);
    }
  }

  /** Handle the Stop button: tear down the pipeline and reset the controls. */
  handleStop() {
    if (this.asr) {
      this.asr.stop();
      this.asr = null;
    }

    this.startBtn.disabled = false;
    this.stopBtn.disabled = true;
    this.disableConfig(false);
    this.updateStatus('idle', 'Ready');
  }

  /**
   * Render the VAD probability bar and the scrolling history plot with the
   * onset/offset threshold guide lines.
   *
   * @param {number} prob - smoothed speech probability in [0, 1].
   * @param {number[]} history - recent probabilities, oldest first.
   */
  updateVadDisplay(prob, history) {
    this.vadBarFill.style.width = `${prob * 100}%`;
    this.vadValue.textContent = `${Math.round(prob * 100)}%`;

    const ctx = this.vadCtx;
    const rect = this.vadCanvas.getBoundingClientRect();
    const width = rect.width;
    const height = rect.height;

    ctx.fillStyle = '#0f0f23';
    ctx.fillRect(0, 0, width, height);

    if (history.length < 2) return;

    const onsetY = height * (1 - parseFloat(this.onsetThreshold.value));
    const offsetY = height * (1 - parseFloat(this.offsetThreshold.value));

    // Reset lineWidth: the history trace below sets it to 2 and canvas state
    // persists across frames, which previously thickened these guide lines
    // from the second frame onward.
    ctx.lineWidth = 1;

    ctx.strokeStyle = '#ff444466';
    ctx.beginPath();
    ctx.moveTo(0, onsetY);
    ctx.lineTo(width, onsetY);
    ctx.stroke();

    ctx.strokeStyle = '#00ff8866';
    ctx.beginPath();
    ctx.moveTo(0, offsetY);
    ctx.lineTo(width, offsetY);
    ctx.stroke();

    ctx.strokeStyle = '#00d4ff';
    ctx.lineWidth = 2;
    ctx.beginPath();

    for (let i = 0; i < history.length; i++) {
      const x = (i / (history.length - 1)) * width;
      const y = height * (1 - history[i]);
      if (i === 0) {
        ctx.moveTo(x, y);
      } else {
        ctx.lineTo(x, y);
      }
    }
    ctx.stroke();
  }

  /**
   * Append a finalized transcript line.
   *
   * @param {string} text - decoded text (ignored if empty/whitespace).
   * @param {number} segmentId - id of the segment that produced it.
   */
  addTranscript(text, segmentId) {
    if (!text || !text.trim()) return;

    const item = document.createElement('div');
    item.className = 'transcript-item';
    // Text is HTML-escaped before insertion; segmentId is numeric.
    item.innerHTML = `
      <span class="transcript-duration">#${segmentId}</span>
      <span class="transcript-text">${this.escapeHtml(text)}</span>
    `;
    this.transcriptsList.appendChild(item);
    this.transcriptsList.scrollTop = this.transcriptsList.scrollHeight;
  }

  /**
   * Show the in-progress caption, or the placeholder when text is empty.
   * @param {string} text - current partial hypothesis.
   */
  updateLiveCaption(text) {
    if (text) {
      this.liveCaptionText.textContent = text;
      this.liveCaptionText.classList.remove('placeholder');
      this.liveCaption.classList.add('active');
    } else {
      this.liveCaptionText.textContent = 'Waiting for speech...';
      this.liveCaptionText.classList.add('placeholder');
      this.liveCaption.classList.remove('active');
    }
  }

  /**
   * Update the status indicator.
   * @param {string} status - CSS modifier class (e.g. 'idle', 'listening').
   * @param {string} text - human-readable status message.
   */
  updateStatus(status, text) {
    this.statusDot.className = 'status-dot ' + status;
    this.statusText.textContent = text;
  }

  /** Show the blocking loading overlay with the given message. */
  showLoading(text) {
    this.loadingText.textContent = text;
    this.loadingOverlay.classList.remove('hidden');
  }

  /** Hide the loading overlay. */
  hideLoading() {
    this.loadingOverlay.classList.add('hidden');
  }

  /** Display an error banner (stays visible until hideError is called). */
  showError(message) {
    this.errorMessage.textContent = message;
    this.errorMessage.classList.add('visible');
  }

  /** Clear and hide the error banner. */
  hideError() {
    this.errorMessage.textContent = '';
    this.errorMessage.classList.remove('visible');
  }

  /** Enable/disable the configuration inputs (locked while running). */
  disableConfig(disabled) {
    this.modelSelect.disabled = disabled;
    this.onnxUrl.disabled = disabled;
    this.onsetThreshold.disabled = disabled;
    this.offsetThreshold.disabled = disabled;
  }

  /**
   * HTML-escape arbitrary text via the DOM's own serializer.
   * @param {string} text
   * @returns {string} escaped HTML-safe string.
   */
  escapeHtml(text) {
    const div = document.createElement('div');
    div.textContent = text;
    return div.innerHTML;
  }
}
797
+
798
// Initialize on page load. Checking readyState handles the case where this
// script is evaluated after DOMContentLoaded has already fired (e.g. when
// loaded dynamically), in which case the event listener would never run.
function bootAsrDemo() {
  window.asrDemo = new ASRDemoUI();
}

if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', bootAsrDemo);
} else {
  bootAsrDemo();
}
ten_vad.js ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
// Emscripten-generated (minified) loader for the TEN VAD WebAssembly module.
// Do not hand-edit the minified body below; regenerate from the original
// C/C++ build instead. The factory returns a promise that resolves to the
// initialized module, exposing (per the export wrappers near the bottom):
//   _malloc / _free, _ten_vad_create, _ten_vad_process, _ten_vad_destroy,
//   _ten_vad_get_version, plus the HEAP* typed-array views.
var createVADModule = (() => {
  // Used to locate ten_vad.wasm relative to this module's URL.
  var _scriptDir = import.meta.url;

  return (
function(createVADModule) {
  createVADModule = createVADModule || {};

// Minified Emscripten runtime: environment detection, heap view setup,
// wasm fetch/instantiation (streaming with ArrayBuffer fallback), abort
// handling, and stdout/stderr plumbing. Left byte-for-byte as generated.
var a;a||(a=typeof createVADModule !== 'undefined' ? createVADModule : {});var k,l;a.ready=new Promise(function(b,c){k=b;l=c});var p=Object.assign({},a),r="object"==typeof window,u="function"==typeof importScripts,v="",w;
if(r||u)u?v=self.location.href:"undefined"!=typeof document&&document.currentScript&&(v=document.currentScript.src),_scriptDir&&(v=_scriptDir),0!==v.indexOf("blob:")?v=v.substr(0,v.replace(/[?#].*/,"").lastIndexOf("/")+1):v="",u&&(w=b=>{var c=new XMLHttpRequest;c.open("GET",b,!1);c.responseType="arraybuffer";c.send(null);return new Uint8Array(c.response)});var aa=a.print||console.log.bind(console),x=a.printErr||console.warn.bind(console);Object.assign(a,p);p=null;var y;a.wasmBinary&&(y=a.wasmBinary);
var noExitRuntime=a.noExitRuntime||!0;"object"!=typeof WebAssembly&&z("no native wasm support detected");var A,B=!1,C="undefined"!=typeof TextDecoder?new TextDecoder("utf8"):void 0,D,E,F;function J(){var b=A.buffer;D=b;a.HEAP8=new Int8Array(b);a.HEAP16=new Int16Array(b);a.HEAP32=new Int32Array(b);a.HEAPU8=E=new Uint8Array(b);a.HEAPU16=new Uint16Array(b);a.HEAPU32=F=new Uint32Array(b);a.HEAPF32=new Float32Array(b);a.HEAPF64=new Float64Array(b)}var K=[],L=[],M=[];
function ba(){var b=a.preRun.shift();K.unshift(b)}var N=0,O=null,P=null;function z(b){if(a.onAbort)a.onAbort(b);b="Aborted("+b+")";x(b);B=!0;b=new WebAssembly.RuntimeError(b+". Build with -sASSERTIONS for more info.");l(b);throw b;}function Q(){return R.startsWith("data:application/octet-stream;base64,")}var R;if(a.locateFile){if(R="ten_vad.wasm",!Q()){var S=R;R=a.locateFile?a.locateFile(S,v):v+S}}else R=(new URL("ten_vad.wasm",import.meta.url)).href;
function T(){var b=R;try{if(b==R&&y)return new Uint8Array(y);if(w)return w(b);throw"both async and sync fetching of the wasm failed";}catch(c){z(c)}}function ca(){return y||!r&&!u||"function"!=typeof fetch?Promise.resolve().then(function(){return T()}):fetch(R,{credentials:"same-origin"}).then(function(b){if(!b.ok)throw"failed to load wasm binary file at '"+R+"'";return b.arrayBuffer()}).catch(function(){return T()})}function U(b){for(;0<b.length;)b.shift()(a)}
var da=[null,[],[]],ea={a:function(){z("")},f:function(b,c,m){E.copyWithin(b,c,c+m)},c:function(b){var c=E.length;b>>>=0;if(2147483648<b)return!1;for(var m=1;4>=m;m*=2){var h=c*(1+.2/m);h=Math.min(h,b+100663296);var d=Math;h=Math.max(b,h);d=d.min.call(d,2147483648,h+(65536-h%65536)%65536);a:{try{A.grow(d-D.byteLength+65535>>>16);J();var e=1;break a}catch(W){}e=void 0}if(e)return!0}return!1},e:function(){return 52},b:function(){return 70},d:function(b,c,m,h){for(var d=0,e=0;e<m;e++){var W=F[c>>2],
X=F[c+4>>2];c+=8;for(var G=0;G<X;G++){var f=E[W+G],H=da[b];if(0===f||10===f){f=H;for(var n=0,q=n+NaN,t=n;f[t]&&!(t>=q);)++t;if(16<t-n&&f.buffer&&C)f=C.decode(f.subarray(n,t));else{for(q="";n<t;){var g=f[n++];if(g&128){var I=f[n++]&63;if(192==(g&224))q+=String.fromCharCode((g&31)<<6|I);else{var Y=f[n++]&63;g=224==(g&240)?(g&15)<<12|I<<6|Y:(g&7)<<18|I<<12|Y<<6|f[n++]&63;65536>g?q+=String.fromCharCode(g):(g-=65536,q+=String.fromCharCode(55296|g>>10,56320|g&1023))}}else q+=String.fromCharCode(g)}f=q}(1===
b?aa:x)(f);H.length=0}else H.push(f)}d+=X}F[h>>2]=d;return 0}};
(function(){function b(d){a.asm=d.exports;A=a.asm.g;J();L.unshift(a.asm.h);N--;a.monitorRunDependencies&&a.monitorRunDependencies(N);0==N&&(null!==O&&(clearInterval(O),O=null),P&&(d=P,P=null,d()))}function c(d){b(d.instance)}function m(d){return ca().then(function(e){return WebAssembly.instantiate(e,h)}).then(function(e){return e}).then(d,function(e){x("failed to asynchronously prepare wasm: "+e);z(e)})}var h={a:ea};N++;a.monitorRunDependencies&&a.monitorRunDependencies(N);if(a.instantiateWasm)try{return a.instantiateWasm(h,
b)}catch(d){x("Module.instantiateWasm callback failed with error: "+d),l(d)}(function(){return y||"function"!=typeof WebAssembly.instantiateStreaming||Q()||"function"!=typeof fetch?m(c):fetch(R,{credentials:"same-origin"}).then(function(d){return WebAssembly.instantiateStreaming(d,h).then(c,function(e){x("wasm streaming compile failed: "+e);x("falling back to ArrayBuffer instantiation");return m(c)})})})().catch(l);return{}})();
// Public export wrappers: lazily bind the wasm exports on first call.
a.___wasm_call_ctors=function(){return(a.___wasm_call_ctors=a.asm.h).apply(null,arguments)};a._malloc=function(){return(a._malloc=a.asm.i).apply(null,arguments)};a._free=function(){return(a._free=a.asm.j).apply(null,arguments)};a._ten_vad_create=function(){return(a._ten_vad_create=a.asm.k).apply(null,arguments)};a._ten_vad_process=function(){return(a._ten_vad_process=a.asm.l).apply(null,arguments)};a._ten_vad_destroy=function(){return(a._ten_vad_destroy=a.asm.m).apply(null,arguments)};
a._ten_vad_get_version=function(){return(a._ten_vad_get_version=a.asm.n).apply(null,arguments)};var V;P=function fa(){V||Z();V||(P=fa)};
// Z(): run the Emscripten startup sequence (preRun hooks, ctors, postRun).
function Z(){function b(){if(!V&&(V=!0,a.calledRun=!0,!B)){U(L);k(a);if(a.onRuntimeInitialized)a.onRuntimeInitialized();if(a.postRun)for("function"==typeof a.postRun&&(a.postRun=[a.postRun]);a.postRun.length;){var c=a.postRun.shift();M.unshift(c)}U(M)}}if(!(0<N)){if(a.preRun)for("function"==typeof a.preRun&&(a.preRun=[a.preRun]);a.preRun.length;)ba();U(K);0<N||(a.setStatus?(a.setStatus("Running..."),setTimeout(function(){setTimeout(function(){a.setStatus("")},1);b()},1)):b())}}
if(a.preInit)for("function"==typeof a.preInit&&(a.preInit=[a.preInit]);0<a.preInit.length;)a.preInit.pop()();Z();


  // Resolves with the module object once the wasm runtime has initialized.
  return createVADModule.ready
}
  );
})();
export default createVADModule;
ten_vad.wasm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ec0b9640683987e15a4e54e4ce5642b2447c6e5d82b1be889b5099c75434fc3
3
+ size 283349