andito HF Staff committed
Commit 156b337 · verified · Parent(s): a7d26ce
Update app.py
Files changed (1):
  1. app.py +536 -163
app.py CHANGED
@@ -18,7 +18,7 @@ import numpy as np
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.responses import StreamingResponse
 import uvicorn
-from fastrtc import WebRTC, get_cloudflare_turn_credentials_async
+from fastrtc import WebRTC, StreamHandler, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials

 # Try to import the utility, handle error if running in standalone test without library
 try:
@@ -29,35 +29,45 @@ except ImportError:

 AUDIO_SAMPLE_RATE = 16000  # respeaker samplerate

-import struct
-
+import os
 async def get_credentials():
-    # Will use HF_TOKEN env var inside the Space
-    return await get_cloudflare_turn_credentials_async()
+    # Will use HF_TOKEN env var inside the Space. There is a limit of 10GB per month: https://fastrtc.org/deployment/
+    return await get_cloudflare_turn_credentials_async(hf_token=os.getenv("HF_TOKEN"))

-def gen_wav_header(sample_rate, bits_per_sample, channels):
-    """
-    Generates a generic WAV header.
-    We set the file size to 0xFFFFFFFF (max) to trick the browser into
-    thinking it's a very long file for streaming purposes.
-    """
-    datasize = 0xFFFFFFFF
-    o = bytes("RIFF", 'ascii')                                             # (4byte) Marks file as RIFF
-    o += struct.pack('<I', datasize + 36)                                  # (4byte) File size in bytes excluding "RIFF" and size
-    o += bytes("WAVE", 'ascii')                                            # (4byte) File type
-    o += bytes("fmt ", 'ascii')                                            # (4byte) Format Chunk Marker
-    o += struct.pack('<I', 16)                                             # (4byte) Length of above format data
-    o += struct.pack('<H', 1)                                              # (2byte) Format type (1 - PCM)
-    o += struct.pack('<H', channels)                                       # (2byte)
-    o += struct.pack('<I', sample_rate)                                    # (4byte)
-    o += struct.pack('<I', sample_rate * channels * bits_per_sample // 8)  # (4byte)
-    o += struct.pack('<H', channels * bits_per_sample // 8)                # (2byte)
-    o += struct.pack('<H', bits_per_sample)                                # (2byte)
-    o += bytes("data", 'ascii')                                            # (4byte) Data Chunk Marker
-    o += struct.pack('<I', datasize)                                       # (4byte) Data size in bytes
-    return o
-
-# --- 1. Global State Management ---
+
+# --- 1. Data Models & Presets ---
+@dataclass
+class Movement:
+    name: str
+    x: float = 0
+    y: float = 0
+    z: float = 0
+    roll: float = 0
+    pitch: float = 0
+    yaw: float = 0
+    body_yaw: float = 0
+    left_antenna: Optional[float] = None
+    right_antenna: Optional[float] = None
+    duration: float = 1.0
+
+PRESETS = {
+    "Home": Movement("Home", 0, 0, 0, 0, 0, 0, 0, 0, 0),
+    "Look Left": Movement("Look Left", 0, 0, 0, 0, 0, 30, 1, 0, 0),
+    "Look Right": Movement("Look Right", 0, 0, 0, 0, 0, -30, -1, 0, 0),
+    "Look Up": Movement("Look Up", 0, 0, 0, 0, -20, 0, 0, 0, 0),
+    "Look Down": Movement("Look Down", 0, 0, 0, 0, 15, 0, 0, 0, 0),
+    "Curious": Movement("Curious", 10, 0, 10, 15, -10, -15, 0, 45, -45),
+    "Excited": Movement("Excited", 0, 0, 20, 0, -15, 0, 0, 90, 90),
+    "Shy": Movement("Shy", -10, 0, -10, 10, 10, 20, 0, -30, 30),
+}
+
+SEQUENCES = {
+    "Wave": ["Home", "Look Left", "Look Right", "Look Left", "Look Right", "Home"],
+    "Nod": ["Home", "Look Down", "Look Up", "Look Down", "Home"],
+    "Excited Dance": ["Home", "Excited", "Look Left", "Look Right", "Home"],
+}
+
+# --- 2. Global State Management ---
 class GlobalState:
     """
     Singleton-style class to manage shared state between FastAPI (WebSockets)
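
The preset and sequence tables above are keyed by name, so a sequence is just a list of preset lookups. A minimal standalone sketch of how they compose (trimmed Movement; resolve_sequence is illustrative, not a helper in app.py):

    from dataclasses import dataclass, replace

    @dataclass
    class Movement:                  # trimmed copy of the dataclass in the diff
        name: str
        yaw: float = 0
        duration: float = 1.0

    PRESETS = {"Home": Movement("Home"), "Look Left": Movement("Look Left", yaw=30)}
    SEQUENCES = {"Glance": ["Home", "Look Left", "Home"]}

    def resolve_sequence(seq_name: str, speed: float = 1.0) -> list[Movement]:
        """Expand a sequence of preset names into Movement steps, scaling duration by speed."""
        return [
            replace(PRESETS[name], duration=PRESETS[name].duration / speed)
            for name in SEQUENCES[seq_name]
        ]

    print([m.name for m in resolve_sequence("Glance", speed=2.0)])  # ['Home', 'Look Left', 'Home']
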
@@ -75,9 +85,26 @@ class GlobalState:
         self.latest_frame_bytes = buffer.tobytes()
         self.latest_frame_ts = time.time()

-        # Audio Stream Data
+        # Audio from robot -> browser.
+        # Queue of (sample_rate: int, raw int16 PCM bytes) tuples; see push_audio().
         self.audio_queue = queue.Queue()

+        # Audio from operator/server -> robot
+        self.audio_to_robot_queue = queue.Queue()
+
+        # --- Live pose state (for WASDQE control) ---
+        self.pose_lock = threading.Lock()
+        self.current_pose = Movement(
+            name="Current",
+            x=0, y=0, z=0,
+            roll=0, pitch=0, yaw=0,
+            body_yaw=0,
+            left_antenna=0,
+            right_antenna=0,
+            duration=0.2,
+        )
+
     def set_robot_connection(self, ws: WebSocket, loop: asyncio.AbstractEventLoop):
         self.robot_ws = ws
         self.robot_loop = loop
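
A quick round-trip of that queue contract: the producer side stores (sample_rate, raw int16 bytes), and the consumer rebuilds the (1, N) int16 array FastRTC expects. A standalone sketch, not code from app.py:

    import queue
    import numpy as np

    AUDIO_SAMPLE_RATE = 16000
    audio_queue: "queue.Queue[tuple[int, bytes]]" = queue.Queue()

    # Producer (what push_audio does): raw int16 PCM bytes straight off the socket.
    pcm = (np.sin(np.linspace(0, 1, 160)) * 32767).astype(np.int16).tobytes()
    audio_queue.put((AUDIO_SAMPLE_RATE, pcm))

    # Consumer (what RobotAudioHandler.emit does): bytes -> (1, N) int16 array.
    sample_rate, frame_bytes = audio_queue.get()
    audio = np.frombuffer(frame_bytes, dtype=np.int16).reshape(1, -1)
    print(sample_rate, audio.shape)  # 16000 (1, 160)
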
@@ -92,59 +119,148 @@ class GlobalState:
         Pushes raw audio bytes.
         If the queue is full (meaning we are lagging), throw away the OLDEST audio.
         """
-        # Limit queue to ~0.5 seconds of audio (approx 5-6 chunks of 4096 bytes)
-        MAX_QUEUE_SIZE = 6
-
+        MAX_QUEUE_SIZE = 2  # keep latency low
+
         while self.audio_queue.qsize() >= MAX_QUEUE_SIZE:
             try:
-                # Drop the oldest chunk to make room for the new one (Keep 'Now', drop 'Past')
-                print("Dropping oldest audio, queue size is", self.audio_queue.qsize())
+                print("Dropping oldest audio FROM robot, queue size is", self.audio_queue.qsize())
                 self.audio_queue.get_nowait()
             except queue.Empty:
-                pass
-
-        self.audio_queue.put(audio_bytes)
+                break
+
+        self.audio_queue.put((AUDIO_SAMPLE_RATE, audio_bytes))
+
+    def push_audio_to_robot(self, audio_bytes: bytes):
+        """
+        Audio coming FROM the operator/server, going TO the robot.
+        """
+        MAX_QUEUE_SIZE = 2
+        while self.audio_to_robot_queue.qsize() >= MAX_QUEUE_SIZE:
+            try:
+                print("Dropping oldest audio TO robot, queue size is", self.audio_to_robot_queue.qsize())
+                self.audio_to_robot_queue.get_nowait()
+            except queue.Empty:
+                break
+
+        self.audio_to_robot_queue.put(audio_bytes)
+
+    def get_audio_to_robot_blocking(self) -> bytes:
+        """
+        Blocking get for the sender task in /audio_stream.
+        """
+        return self.audio_to_robot_queue.get()

     def get_connection_status(self) -> str:
         if self.robot_ws:
             return "✅ Robot Connected"
         return "🔴 Waiting for Robot..."

-state = GlobalState()
+    def update_pose(
+        self,
+        dx: float = 0,
+        dy: float = 0,
+        dz: float = 0,
+        droll: float = 0,
+        dpitch: float = 0,
+        dyaw: float = 0,
+        dbody_yaw: float = 0,
+    ) -> Movement:
+        """
+        Apply a small delta to the current pose and return a new Movement.
+        This is what WASDQE will use.
+        """
+        with self.pose_lock:
+            p = self.current_pose
+
+            new = Movement(
+                name="Current",
+                x=p.x + dx,
+                y=p.y + dy,
+                z=p.z + dz,
+                roll=p.roll + droll,
+                pitch=p.pitch + dpitch,
+                yaw=p.yaw + dyaw,
+                body_yaw=p.body_yaw + dbody_yaw,
+                left_antenna=p.left_antenna,
+                right_antenna=p.right_antenna,
+                duration=0.4,
+            )
+
+            # Optional clamping (adjust ranges as you like)
+            new.pitch = float(np.clip(new.pitch, -30, 30))
+            new.yaw = float(np.clip(new.yaw, -180, 180))
+            new.roll = float(np.clip(new.roll, -40, 40))
+            new.body_yaw = float(np.clip(new.body_yaw, -3, 3))
+            new.z = float(np.clip(new.z, -20, 50))
+            new.x = float(np.clip(new.x, -50, 50))
+            new.y = float(np.clip(new.y, -50, 50))
+
+            self.current_pose = new
+            return new
+
+    def reset_pose(self) -> Movement:
+        """Back to neutral / home pose."""
+        with self.pose_lock:
+            self.current_pose = Movement(
+                name="Current",
+                x=0, y=0, z=0,
+                roll=0, pitch=0, yaw=0,
+                body_yaw=0,
+                left_antenna=0,
+                right_antenna=0,
+                duration=0.3,
+            )
+            return self.current_pose
+
+    def get_pose_text(self) -> str:
+        """Human-readable pose info to show in the UI."""
+        with self.pose_lock:
+            p = self.current_pose
+        return (
+            f"Head position:\n"
+            f"  x={p.x:.1f}, y={p.y:.1f}, z={p.z:.1f}\n"
+            f"  roll={p.roll:.1f}, pitch={p.pitch:.1f}, yaw={p.yaw:.1f}\n"
+            f"Body:\n"
+            f"  body_yaw={p.body_yaw:.1f}"
+        )
+
+state = GlobalState()
+
+def send_pose_to_robot(mov: Movement, msg: str = "Move sent"):
+    """
+    Convert Movement -> head pose + body_yaw payload and fire it to the robot.
+    Used by the WASDQE controls.
+    """
+    if not (state.robot_ws and state.robot_loop):
+        return state.get_pose_text(), "⚠️ Robot not connected"
+
+    pose = create_head_pose(
+        x=mov.x, y=mov.y, z=mov.z,
+        roll=mov.roll, pitch=mov.pitch, yaw=mov.yaw,
+        degrees=True, mm=True,
+    )
+
+    payload = {
+        "type": "movement",
+        "movement": {
+            "head": pose.tolist(),
+            "body_yaw": mov.body_yaw,
+            "duration": mov.duration,
+        },
+    }
+
+    if mov.left_antenna is not None and mov.right_antenna is not None:
+        payload["movement"]["antennas"] = [
+            np.deg2rad(mov.right_antenna),
+            np.deg2rad(mov.left_antenna),
+        ]
+
+    asyncio.run_coroutine_threadsafe(
+        state.robot_ws.send_json(payload),
+        state.robot_loop,
+    )
+
+    return state.get_pose_text(), f"✅ {msg}"

-# --- 2. Data Models & Presets ---
-@dataclass
-class Movement:
-    name: str
-    x: float = 0
-    y: float = 0
-    z: float = 0
-    roll: float = 0
-    pitch: float = 0
-    yaw: float = 0
-    body_yaw: float = 0
-    left_antenna: Optional[float] = None
-    right_antenna: Optional[float] = None
-    duration: float = 1.0
-
-PRESETS = {
-    "Home": Movement("Home", 0, 0, 0, 0, 0, 0, 0, 0, 0),
-    "Look Left": Movement("Look Left", 0, 0, 0, 0, 0, 30, 1, 0, 0),
-    "Look Right": Movement("Look Right", 0, 0, 0, 0, 0, -30, -1, 0, 0),
-    "Look Up": Movement("Look Up", 0, 0, 0, 0, -20, 0, 0, 0, 0),
-    "Look Down": Movement("Look Down", 0, 0, 0, 0, 15, 0, 0, 0, 0),
-    "Curious": Movement("Curious", 10, 0, 10, 15, -10, -15, 0, 45, -45),
-    "Excited": Movement("Excited", 0, 0, 20, 0, -15, 0, 0, 90, 90),
-    "Shy": Movement("Shy", -10, 0, -10, 10, 10, 20, 0, -30, 30),
-}
-
-SEQUENCES = {
-    "Wave": ["Home", "Look Left", "Look Right", "Look Left", "Look Right", "Home"],
-    "Nod": ["Home", "Look Down", "Look Up", "Look Down", "Home"],
-    "Excited Dance": ["Home", "Excited", "Look Left", "Look Right", "Home"],
-}

 # --- 3. Controller Logic ---
 class MovementManager:
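
The robot-side consumer of the movement payload built in send_pose_to_robot is not part of this file. A hedged sketch of what decoding it might look like (the 4x4 head matrix is inferred from create_head_pose/pose.tolist(); apply_pose is a stand-in for the actual robot API):

    import json
    import numpy as np

    def handle_server_message(raw: str, apply_pose) -> None:
        """Decode one control message from the server WebSocket (shape per send_pose_to_robot)."""
        msg = json.loads(raw)
        if msg.get("type") != "movement":
            return
        mov = msg["movement"]
        head = np.array(mov["head"])    # presumably a 4x4 homogeneous head pose
        body_yaw = mov["body_yaw"]
        duration = mov["duration"]
        antennas = mov.get("antennas")  # [right, left] in radians, optional
        apply_pose(head, body_yaw, antennas, duration)

    # Example: just log what we would execute.
    handle_server_message(
        '{"type": "movement", "movement": {"head": [[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]], "body_yaw": 0, "duration": 0.4}}',
        lambda head, body_yaw, antennas, duration: print(head.shape, body_yaw, antennas, duration),
    )
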
@@ -336,27 +452,120 @@ async def stream_endpoint(ws: WebSocket):

 @app.websocket("/audio_stream")
 async def audio_endpoint(ws: WebSocket):
-    """Endpoint for Robot/Sim to send raw Audio bytes (int16)."""
+    """Full duplex audio channel between Robot/Sim and server."""
     await ws.accept()
     print("[Audio] Stream Connected")
+
+    async def robot_to_server():
+        """Robot mic -> server -> state.audio_queue (-> WebRTC -> browser)."""
+        try:
+            while True:
+                data = await ws.receive()
+                t = data.get("type")
+
+                if t == "websocket.disconnect":
+                    print("[Audio] Disconnected (recv)")
+                    break
+
+                if t == "websocket.receive":
+                    if data.get("bytes"):
+                        # Audio FROM robot
+                        state.push_audio(data["bytes"])
+                    elif data.get("text") == "ping":
+                        print("[Audio] Received ping")
+                    else:
+                        print(f"[Audio] Received unknown message: {data}")
+        except Exception as e:
+            print(f"[Audio] robot_to_server error: {e}")
+
+    async def server_to_robot():
+        """Server/operator audio -> robot speaker via WebSocket."""
+        loop = asyncio.get_running_loop()
+        try:
+            while True:
+                chunk: bytes = await loop.run_in_executor(
+                    None, state.get_audio_to_robot_blocking
+                )
+                if chunk is None:
+                    continue
+                await ws.send_bytes(chunk)
+        except Exception as e:
+            print(f"[Audio] server_to_robot error: {e}")
+
     try:
-        while True:
-            # Receive bytes directly
-            data = await ws.receive()
-            if data.get('type') == 'websocket.receive' and data.get('bytes'):
-                state.push_audio(data.get('bytes'))
-            elif data.get('type') == 'websocket.receive' and data.get('text') == "ping":
-                print("[Audio] Received ping")
-            elif data.get('type') == 'websocket.disconnect':
-                print("[Audio] Disconnected")
-                break
-            else:
-                print(f"[Audio] Received unknown message: {data}")
-    except Exception as e:
-        print(f"[Audio] Disconnected: {e}")
+        await asyncio.gather(robot_to_server(), server_to_robot())
+    finally:
+        print("[Audio] Stream Closed")

 # --- 5. Gradio Interface ---

+class RobotAudioHandler(StreamHandler):
+    """
+    FastRTC handler that connects browser WebRTC audio to the robot.
+
+    - receive(): audio from browser mic -> state.audio_to_robot_queue (then /audio_stream sends it to the robot)
+    - emit(): audio from state.audio_queue (filled by /audio_stream robot_to_server) -> browser playback
+    """
+
+    def __init__(self) -> None:
+        super().__init__(input_sample_rate=AUDIO_SAMPLE_RATE, output_sample_rate=AUDIO_SAMPLE_RATE)
+
+    def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        """Called whenever the browser sends audio."""
+        if frame is None:
+            return
+
+        sr, array = frame
+        if array is None:
+            return
+
+        arr = np.asarray(array)
+
+        # Ensure mono
+        if arr.ndim > 1:
+            arr = arr[0]
+
+        # Convert to int16 and then to bytes for the robot
+        if arr.dtype != np.int16:
+            if np.issubdtype(arr.dtype, np.floating):
+                arr = np.clip(arr, -1.0, 1.0)
+                arr = (arr * 32767.0).astype(np.int16)
+            else:
+                arr = arr.astype(np.int16)
+
+        state.push_audio_to_robot(arr.tobytes())
+
+    def emit(self):
+        """
+        Called repeatedly by FastRTC to get audio to send to the browser.
+
+        Should return (sample_rate, np.ndarray[int16]) or None.
+        """
+        try:
+            sample_rate, frame_bytes = state.audio_queue.get(timeout=0.5)
+            audio = np.frombuffer(frame_bytes, dtype=np.int16).reshape(1, -1)
+            return sample_rate, audio
+        except queue.Empty:
+            # No audio right now, tell FastRTC to skip sending
+            return None
+
+    def copy(self) -> "RobotAudioHandler":
+        """
+        FastRTC will call this when it needs a new handler for a new session.
+        The handler itself is stateless; it always looks at GlobalState.
+        """
+        return RobotAudioHandler()
+
+    def shutdown(self) -> None:
+        """Called on session shutdown. Nothing to clean up for now."""
+        pass
+
+    def start_up(self) -> None:
+        """Called on session startup. Nothing special to do."""
+        pass
+
+
 def webrtc_audio_generator():
     """
     Generator for FastRTC.
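
The counterpart client for the full-duplex /audio_stream endpoint lives outside this commit. A minimal sketch of the robot side using the websockets library (the host, chunking, and the mic_stream/speaker objects are illustrative assumptions):

    import asyncio
    import websockets

    SERVER = "wss://<space-host>/audio_stream"  # placeholder host

    async def robot_audio_client(mic_stream, speaker):
        """mic_stream yields raw int16 PCM bytes; speaker.play consumes them."""
        async with websockets.connect(SERVER) as ws:
            async def uplink():
                async for chunk in mic_stream:   # robot mic -> server
                    await ws.send(chunk)

            async def downlink():
                async for msg in ws:             # server/operator -> robot speaker
                    if isinstance(msg, bytes):
                        speaker.play(msg)

            await asyncio.gather(uplink(), downlink())
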
@@ -375,7 +584,7 @@ def webrtc_audio_generator():
         try:
             # Wait up to 1 second for data. If no data, loop again.
             # Do NOT use a short timeout combined with silence generation.
-            chunk_bytes = state.audio_queue.get(timeout=1.0)
+            sample_rate, chunk_bytes = state.audio_queue.get(timeout=1.0)
             if chunk_bytes:
                 byte_buffer.extend(chunk_bytes)
         except queue.Empty:
@@ -394,6 +603,35 @@ def webrtc_audio_generator():

     yield (AUDIO_SAMPLE_RATE, audio_int16)

+
+def handle_operator_audio(sr: int, audio: np.ndarray):
+    """
+    Called continuously by FastRTC when the browser sends mic audio.
+
+    `audio` is expected to be shape (channels, samples) or (samples,)
+    with dtype int16 or float32, depending on FastRTC config.
+    """
+    if audio is None:
+        return
+
+    arr = np.asarray(audio)
+    # Ensure mono and int16
+    if arr.ndim > 1:
+        arr = arr[0]  # take first channel
+
+    if arr.dtype != np.int16:
+        # For float32 in [-1, 1]
+        if np.issubdtype(arr.dtype, np.floating):
+            arr = np.clip(arr, -1.0, 1.0)
+            arr = (arr * 32767.0).astype(np.int16)
+        else:
+            arr = arr.astype(np.int16)
+
+    state.push_audio_to_robot(arr.tobytes())
+    # No UI output
+    return
+
+
 def webrtc_video_generator():
     """
     Generator for FastRTC WebRTC (mode='receive', modality='video').
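
The float-to-int16 path above mirrors RobotAudioHandler.receive(); a quick standalone check of the scale-and-clip behaviour:

    import numpy as np

    def to_int16(arr: np.ndarray) -> np.ndarray:
        """Same conversion as handle_operator_audio: clip to [-1, 1], scale to int16 range."""
        if np.issubdtype(arr.dtype, np.floating):
            return (np.clip(arr, -1.0, 1.0) * 32767.0).astype(np.int16)
        return arr.astype(np.int16)

    print(to_int16(np.array([0.0, 0.5, 1.0, 1.5], dtype=np.float32)))
    # [    0 16383 32767 32767] -- overdriven samples clip instead of wrapping around
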
@@ -417,6 +655,64 @@ def webrtc_video_generator():
         # Shape (H, W, 3), dtype uint8
         yield frame

+NUDGE_POS = 5.0     # mm or arbitrary units
+NUDGE_HEIGHT = 5.0  # degrees (used as the W/S pitch step)
+NUDGE_ANGLE = 5.0   # degrees
+NUDGE_BODY = 0.3    # degrees for body_yaw
+
+def move_w():
+    """W: pitch the head up (negative pitch) by one nudge."""
+    mov = state.update_pose(dpitch=-NUDGE_HEIGHT)
+    return send_pose_to_robot(mov, "W (look up)")
+
+def move_s():
+    """S: pitch the head down (positive pitch) by one nudge."""
+    mov = state.update_pose(dpitch=NUDGE_HEIGHT)
+    return send_pose_to_robot(mov, "S (look down)")
+
+def move_a():
+    """A: yaw the head left."""
+    mov = state.update_pose(dyaw=NUDGE_ANGLE * 2)
+    return send_pose_to_robot(mov, "A (turn left)")
+
+def move_d():
+    """D: yaw the head right."""
+    mov = state.update_pose(dyaw=-NUDGE_ANGLE * 2)
+    return send_pose_to_robot(mov, "D (turn right)")
+
+def move_q():
+    """Q: roll the head left (negative roll)."""
+    mov = state.update_pose(droll=-NUDGE_ANGLE)
+    return send_pose_to_robot(mov, "Q (tilt left)")
+
+def move_e():
+    """E: roll the head right (positive roll)."""
+    mov = state.update_pose(droll=NUDGE_ANGLE)
+    return send_pose_to_robot(mov, "E (tilt right)")
+
+def move_body_left():
+    mov = state.update_pose(dbody_yaw=NUDGE_BODY)
+    return send_pose_to_robot(mov, "Body Left (<)")
+
+def move_body_right():
+    mov = state.update_pose(dbody_yaw=-NUDGE_BODY)
+    return send_pose_to_robot(mov, "Body Right (>)")
+
+def center_pose():
+    mov = state.reset_pose()
+    return send_pose_to_robot(mov, "Reset pose")
+
+
 with gr.Blocks(title="Reachy Controller", theme=gr.themes.Soft()) as demo:

     gr.Markdown("## 🤖 Reachy Mini Controller")
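
NUDGE_POS is defined above but never used by these handlers. If x/y translation keys are planned, they would presumably reuse the same plumbing; a hypothetical pair that would drop into app.py next to the other handlers (sign conventions unverified):

    def move_head_left():
        mov = state.update_pose(dy=NUDGE_POS)   # sign convention is an assumption
        return send_pose_to_robot(mov, "Head left")

    def move_head_right():
        mov = state.update_pose(dy=-NUDGE_POS)
        return send_pose_to_robot(mov, "Head right")
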
@@ -425,7 +721,12 @@ with gr.Blocks(title="Reachy Controller", theme=gr.themes.Soft()) as demo:
     # --- LEFT COLUMN: Controls ---
     with gr.Column(scale=1):
         status_box = gr.Textbox(label="System Status", value=state.get_connection_status, every=2)
-
+        pose_box = gr.Textbox(
+            label="Current Pose",
+            value=state.get_pose_text,
+            every=0.5,
+            lines=8,
+        )
         with gr.Group():
             gr.Markdown("### 🎧 Audio Listen")

@@ -436,105 +737,177 @@ with gr.Blocks(title="Reachy Controller", theme=gr.themes.Soft()) as demo:
             robot_audio = WebRTC(
                 label="Robot Audio",
                 modality="audio",
-                mode="receive",
-                # Optional niceties:
-                # icon="phone-solid.svg",
-                # icon_button_color="black",
-                # pulse_color="black",
+                mode="send-receive",
+                rtc_configuration=get_cloudflare_turn_credentials(),
+                server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
+                full_screen=False,
             )

-            # Wire generator to WebRTC component
+            # Use the handler directly, like in the FastRTC docs
             robot_audio.stream(
-                fn=lambda: webrtc_audio_generator(),
-                inputs=None,
+                fn=RobotAudioHandler(),
+                inputs=[robot_audio],
                 outputs=[robot_audio],
-                trigger=listen_btn.click,
+                time_limit=60,
             )

-        with gr.Group():
-            gr.Markdown("### 🎮 Playback")
-            auto_play = gr.Checkbox(label="Auto-play", value=True)
-            speed = gr.Slider(0.5, 2.0, 1.0, label="Speed")
-
-            with gr.Row():
-                play_btn = gr.Button("▶️ Play", variant="primary")
-                stop_btn = gr.Button("⏹️ Stop")
-
-            with gr.Row():
-                clear_btn = gr.Button("🗑️ Clear")
-                undo_btn = gr.Button("↶ Undo")
-
-            queue_display = gr.Textbox(label="Queue", value=manager.get_queue_text, lines=10)
+        # with gr.Group():
+        #     gr.Markdown("### 🎮 Playback")
+        #     auto_play = gr.Checkbox(label="Auto-play", value=True)
+        #     speed = gr.Slider(0.5, 2.0, 1.0, label="Speed")

+        #     with gr.Row():
+        #         play_btn = gr.Button("▶️ Play", variant="primary")
+        #         stop_btn = gr.Button("⏹️ Stop")

+        #     with gr.Row():
+        #         clear_btn = gr.Button("🗑️ Clear")
+        #         undo_btn = gr.Button("↶ Undo")

+        #     queue_display = gr.Textbox(label="Queue", value=manager.get_queue_text, lines=10)
+
+        # --- Live movement control ---
+        with gr.Group():
+            gr.Markdown("### 🕹️ Keyboard Control (WASD + QE)")
+
+            # These buttons will be triggered by keyboard events via JS
+            btn_forward = gr.Button("Look up (W)", elem_id="btn-forward")
+            btn_back = gr.Button("Look down (S)", elem_id="btn-back")
+            btn_left = gr.Button("Left (A)", elem_id="btn-left")
+            btn_right = gr.Button("Right (D)", elem_id="btn-right")
+            btn_tilt_up = gr.Button("Tilt left (Q)", elem_id="btn-tilt-up")
+            btn_tilt_down = gr.Button("Tilt right (E)", elem_id="btn-tilt-down")
+            btn_body_left = gr.Button("Body Left (J)", elem_id="btn-body-left")
+            btn_body_right = gr.Button("Body Right (L)", elem_id="btn-body-right")
+            btn_center = gr.Button("Center (H)", elem_id="btn-center")
+
+            # Each handler returns (pose text, status message), so wire both boxes
+            btn_forward.click(move_w, outputs=[pose_box, status_box])
+            btn_back.click(move_s, outputs=[pose_box, status_box])
+            btn_left.click(move_a, outputs=[pose_box, status_box])
+            btn_right.click(move_d, outputs=[pose_box, status_box])
+            btn_tilt_up.click(move_q, outputs=[pose_box, status_box])
+            btn_tilt_down.click(move_e, outputs=[pose_box, status_box])
+            btn_body_left.click(move_body_left, outputs=[pose_box, status_box])
+            btn_body_right.click(move_body_right, outputs=[pose_box, status_box])
+            btn_center.click(center_pose, outputs=[pose_box, status_box])

     # --- RIGHT COLUMN: View ---
     with gr.Column(scale=2):
-        # robot_video = WebRTC(
-        #     label="Robot Video",
-        #     modality="video",
-        #     mode="receive",
-        # )
-        # robot_video.stream(
-        #     fn=lambda: webrtc_video_generator(),
-        #     inputs=None,
-        #     outputs=[robot_video],
-        #     trigger=listen_btn.click,
-        # )
-        html_code = """
-        <html>
-        <body>
-        <img src="/video_feed" style="width: 100%; max-width: 1080px; border-radius: 8px;">
-        </body>
-        </html>
-        """
-        sim_view = gr.HTML(value=html_code, label="🎬 Robot Simulation")
-
-        # --- Movement Builders ---
-        with gr.Tabs():
-            with gr.Tab("✨ Presets & Sequences"):
-                gr.Markdown("### Quick Actions")
-                with gr.Row(variant="panel"):
-                    for name in PRESETS:
-                        btn = gr.Button(name, size="sm")
-                        btn.click(manager.add_preset, inputs=[gr.State(name)], outputs=[queue_display, status_box])
-
-                gr.Markdown("### Sequences")
-                with gr.Row():
-                    for seq in SEQUENCES:
-                        btn = gr.Button(f"🎬 {seq}", size="sm")
-                        btn.click(manager.add_sequence, inputs=[gr.State(seq)], outputs=[queue_display, status_box])
-
-            with gr.Tab("🛠️ Custom Move"):
-                with gr.Row():
-                    c_x = gr.Slider(-50, 50, 0, label="X")
-                    c_y = gr.Slider(-50, 50, 0, label="Y")
-                    c_z = gr.Slider(-20, 50, 0, label="Z")
-                with gr.Row():
-                    c_r = gr.Slider(-30, 30, 0, label="Roll")
-                    c_p = gr.Slider(-30, 30, 0, label="Pitch")
-                    c_y_aw = gr.Slider(-45, 45, 0, label="Yaw")
-                with gr.Row():
-                    c_la = gr.Slider(-180, 180, 0, label="Left Ant")
-                    c_ra = gr.Slider(-180, 180, 0, label="Right Ant")
-
-                c_dur = gr.Slider(0.1, 5.0, 1.0, label="Duration")
-                c_add = gr.Button("➕ Add Custom Move", variant="primary")
-
-                def _add_custom(x,y,z,r,p,yw,la,ra,d):
-                    m = Movement("Custom", x,y,z,r,p,yw,la,ra,d)
-                    return manager.add_movement(m)
-
-                c_add.click(_add_custom,
-                            inputs=[c_x, c_y, c_z, c_r, c_p, c_y_aw, c_la, c_ra, c_dur],
-                            outputs=[queue_display, status_box])
+        robot_video = WebRTC(
+            label="Robot Video",
+            modality="video",
+            mode="receive",
+            rtc_configuration=get_cloudflare_turn_credentials(),
+            server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000)
+        )
+        robot_video.stream(
+            fn=lambda: webrtc_video_generator(),
+            inputs=[],
+            outputs=[robot_video],
+            trigger=listen_btn.click,
+        )
+        # html_code = """
+        # <html>
+        # <body>
+        # <img src="/video_feed" style="width: 100%; max-width: 1080px; border-radius: 8px;">
+        # </body>
+        # </html>
+        # """
+        # sim_view = gr.HTML(value=html_code, label="🎬 Robot Simulation")
+
+        # # --- Movement Builders ---
+        # with gr.Tabs():
+        #     with gr.Tab("✨ Presets & Sequences"):
+        #         gr.Markdown("### Quick Actions")
+        #         with gr.Row(variant="panel"):
+        #             for name in PRESETS:
+        #                 btn = gr.Button(name, size="sm")
+        #                 btn.click(manager.add_preset, inputs=[gr.State(name)], outputs=[queue_display, status_box])

+        #         gr.Markdown("### Sequences")
+        #         with gr.Row():
+        #             for seq in SEQUENCES:
+        #                 btn = gr.Button(f"🎬 {seq}", size="sm")
+        #                 btn.click(manager.add_sequence, inputs=[gr.State(seq)], outputs=[queue_display, status_box])
+
+        #     with gr.Tab("🛠️ Custom Move"):
+        #         with gr.Row():
+        #             c_x = gr.Slider(-50, 50, 0, label="X")
+        #             c_y = gr.Slider(-50, 50, 0, label="Y")
+        #             c_z = gr.Slider(-20, 50, 0, label="Z")
+        #         with gr.Row():
+        #             c_r = gr.Slider(-30, 30, 0, label="Roll")
+        #             c_p = gr.Slider(-30, 30, 0, label="Pitch")
+        #             c_y_aw = gr.Slider(-45, 45, 0, label="Yaw")
+        #         with gr.Row():
+        #             c_la = gr.Slider(-180, 180, 0, label="Left Ant")
+        #             c_ra = gr.Slider(-180, 180, 0, label="Right Ant")

+        #         c_dur = gr.Slider(0.1, 5.0, 1.0, label="Duration")
+        #         c_add = gr.Button("➕ Add Custom Move", variant="primary")

+        #         def _add_custom(x,y,z,r,p,yw,la,ra,d):
+        #             m = Movement("Custom", x,y,z,r,p,yw,la,ra,d)
+        #             return manager.add_movement(m)

+        #         c_add.click(_add_custom,
+        #                     inputs=[c_x, c_y, c_z, c_r, c_p, c_y_aw, c_la, c_ra, c_dur],
+        #                     outputs=[queue_display, status_box])

     # --- Event Wiring ---
-    auto_play.change(lambda x: setattr(manager, 'auto_play', x), inputs=[auto_play])
-    play_btn.click(manager.play_queue, inputs=[speed], outputs=[queue_display, status_box])
-    stop_btn.click(manager.stop_playback, outputs=[queue_display, status_box])
-    clear_btn.click(manager.clear_queue, outputs=[queue_display, status_box])
-    undo_btn.click(manager.remove_last, outputs=[queue_display, status_box])
+    # auto_play.change(lambda x: setattr(manager, 'auto_play', x), inputs=[auto_play])
+    # play_btn.click(manager.play_queue, inputs=[speed], outputs=[queue_display, status_box])
+    # stop_btn.click(manager.stop_playback, outputs=[queue_display, status_box])
+    # clear_btn.click(manager.clear_queue, outputs=[queue_display, status_box])
+    # undo_btn.click(manager.remove_last, outputs=[queue_display, status_box])
+
+    demo.load(
+        None,
+        None,
+        None,
+        js="""
+        () => {
+            const keyMap = {
+                'w': 'btn-forward',
+                's': 'btn-back',
+                'a': 'btn-left',
+                'd': 'btn-right',
+                'q': 'btn-tilt-up',
+                'e': 'btn-tilt-down',
+                'h': 'btn-center',
+                'j': 'btn-body-left',
+                'l': 'btn-body-right',
+            };
+
+            let lastPressed = {};
+            const REPEAT_MS = 120;  // minimum time between repeated presses
+
+            document.addEventListener('keydown', (ev) => {
+                const key = ev.key.toLowerCase();
+                const id = keyMap[key];
+                if (!id) return;
+
+                const now = Date.now();
+                if (lastPressed[key] && now - lastPressed[key] < REPEAT_MS) {
+                    return;  // simple debounce
+                }
+                lastPressed[key] = now;
+
+                // Prevent default browser actions for handled keys
+                ev.preventDefault();
+
+                const btn = document.getElementById(id);
+                if (btn) {
+                    btn.click();
+                }
+            });
+
+            console.log('Keyboard control ready: W/S pitch, A/D yaw, Q/E roll, J/L body yaw, H center');
+        }
+        """,
+    )

 # --- 6. Mount & Run ---
 app = gr.mount_gradio_app(app, demo, path="/")
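
The diff stops at the mount call; the usual tail for a FastAPI-plus-Gradio Space looks like the following (assumed, not part of this commit):

    # Typical launch for a FastAPI + Gradio Space (port and guard are assumptions).
    if __name__ == "__main__":
        uvicorn.run(app, host="0.0.0.0", port=7860)
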
 