Spaces:
Running
Comprehensive streaming fix: Eliminate all buffering sources
Browse filesIssue: Code streaming still not working - chunks arriving but not displaying
Root causes identified:
1. React 18 automatic batching delays state updates
2. Potential proxy/nginx buffering at various levels
3. DeepSeek using InferenceClient instead of OpenAI client
Complete fix across full stack:
Backend (backend_api.py):
- Convert DeepSeek to use OpenAI client for consistent streaming
- Add comprehensive anti-buffering headers:
* Cache-Control: no-cache, no-transform
* Content-Encoding: none
* Transfer-Encoding: chunked
* X-Accel-Buffering: no (prevents nginx buffering)
- Add logging for MiniMax and DeepSeek initialization
- All models now use OpenAI client (OpenRouter, MiniMax, DeepSeek)
Frontend (page.tsx):
- Import flushSync from react-dom
- Wrap setGeneratedCode in flushSync() to bypass React 18 batching
- Forces immediate synchronous DOM updates for each chunk
- No more delayed/batched state updates
How streaming works now:
1. Backend: LLM generates chunk → yield immediately (no buffer)
2. HTTP: Headers prevent all proxy/nginx buffering
3. Frontend: SSE receives chunk → flushSync forces immediate React update
4. Monaco: Editor updates character-by-character in real-time
This ensures true streaming where every single chunk appears immediately
as it's generated, with zero buffering at any layer of the stack.
- backend_api.py +12 -4
- frontend/src/app/page.tsx +9 -6
|
@@ -412,9 +412,15 @@ async def generate_code(
|
|
| 412 |
)
|
| 413 |
# Add :novita suffix for the API call
|
| 414 |
actual_model_id = "MiniMaxAI/MiniMax-M2:novita"
|
|
|
|
| 415 |
elif actual_model_id.startswith("deepseek-ai/"):
|
| 416 |
-
# DeepSeek models via HuggingFace
|
| 417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
elif actual_model_id == "qwen3-max-preview":
|
| 419 |
# Qwen via DashScope (would need separate implementation)
|
| 420 |
# For now, fall back to HF
|
|
@@ -499,9 +505,11 @@ async def generate_code(
|
|
| 499 |
event_stream(),
|
| 500 |
media_type="text/event-stream",
|
| 501 |
headers={
|
| 502 |
-
"Cache-Control": "no-cache",
|
| 503 |
"Connection": "keep-alive",
|
| 504 |
-
"X-Accel-Buffering": "no"
|
|
|
|
|
|
|
| 505 |
}
|
| 506 |
)
|
| 507 |
|
|
|
|
| 412 |
)
|
| 413 |
# Add :novita suffix for the API call
|
| 414 |
actual_model_id = "MiniMaxAI/MiniMax-M2:novita"
|
| 415 |
+
print(f"[Generate] Using HuggingFace router for MiniMax M2")
|
| 416 |
elif actual_model_id.startswith("deepseek-ai/"):
|
| 417 |
+
# DeepSeek models via HuggingFace - use OpenAI client for better streaming
|
| 418 |
+
from openai import OpenAI
|
| 419 |
+
client = OpenAI(
|
| 420 |
+
base_url="https://api-inference.huggingface.co/v1",
|
| 421 |
+
api_key=os.getenv("HF_TOKEN")
|
| 422 |
+
)
|
| 423 |
+
print(f"[Generate] Using HuggingFace Inference API for DeepSeek")
|
| 424 |
elif actual_model_id == "qwen3-max-preview":
|
| 425 |
# Qwen via DashScope (would need separate implementation)
|
| 426 |
# For now, fall back to HF
|
|
|
|
| 505 |
event_stream(),
|
| 506 |
media_type="text/event-stream",
|
| 507 |
headers={
|
| 508 |
+
"Cache-Control": "no-cache, no-transform",
|
| 509 |
"Connection": "keep-alive",
|
| 510 |
+
"X-Accel-Buffering": "no",
|
| 511 |
+
"Content-Encoding": "none",
|
| 512 |
+
"Transfer-Encoding": "chunked"
|
| 513 |
}
|
| 514 |
)
|
| 515 |
|
|
@@ -1,6 +1,7 @@
|
|
| 1 |
'use client';
|
| 2 |
|
| 3 |
import { useState, useEffect } from 'react';
|
|
|
|
| 4 |
import Header from '@/components/Header';
|
| 5 |
import ChatInterface from '@/components/ChatInterface';
|
| 6 |
import CodeEditor from '@/components/CodeEditor';
|
|
@@ -78,14 +79,16 @@ export default function Home() {
|
|
| 78 |
try {
|
| 79 |
apiClient.generateCodeStream(
|
| 80 |
request,
|
| 81 |
-
// onChunk - Update code editor in real-time
|
| 82 |
(chunk: string) => {
|
| 83 |
console.log('[Stream] Received chunk:', chunk.substring(0, 50), '... (length:', chunk.length, ')');
|
| 84 |
-
// Use
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
});
|
| 90 |
},
|
| 91 |
// onComplete
|
|
|
|
| 1 |
'use client';
|
| 2 |
|
| 3 |
import { useState, useEffect } from 'react';
|
| 4 |
+
import { flushSync } from 'react-dom';
|
| 5 |
import Header from '@/components/Header';
|
| 6 |
import ChatInterface from '@/components/ChatInterface';
|
| 7 |
import CodeEditor from '@/components/CodeEditor';
|
|
|
|
| 79 |
try {
|
| 80 |
apiClient.generateCodeStream(
|
| 81 |
request,
|
| 82 |
+
// onChunk - Update code editor in real-time with immediate flush
|
| 83 |
(chunk: string) => {
|
| 84 |
console.log('[Stream] Received chunk:', chunk.substring(0, 50), '... (length:', chunk.length, ')');
|
| 85 |
+
// Use flushSync to force immediate DOM update without React batching
|
| 86 |
+
flushSync(() => {
|
| 87 |
+
setGeneratedCode((prevCode) => {
|
| 88 |
+
const newCode = prevCode + chunk;
|
| 89 |
+
console.log('[Stream] Total code length:', newCode.length);
|
| 90 |
+
return newCode;
|
| 91 |
+
});
|
| 92 |
});
|
| 93 |
},
|
| 94 |
// onComplete
|