optimizations

backend_api.py CHANGED (+65 -61)
@@ -63,6 +63,30 @@ except Exception as e:
 
 print("[Startup] System prompts initialization complete")
 
+# Cache system prompts map for fast lookup (created once at startup)
+SYSTEM_PROMPT_CACHE = {
+    "html": HTML_SYSTEM_PROMPT,
+    "gradio": GRADIO_SYSTEM_PROMPT,
+    "streamlit": STREAMLIT_SYSTEM_PROMPT,
+    "transformers.js": TRANSFORMERS_JS_SYSTEM_PROMPT,
+    "react": REACT_SYSTEM_PROMPT,
+    "comfyui": JSON_SYSTEM_PROMPT,
+}
+
+# Client connection pool for reuse (thread-safe)
+import threading
+_client_pool = {}
+_client_pool_lock = threading.Lock()
+
+def get_cached_client(model_id: str, provider: str = "auto"):
+    """Get or create a cached API client for reuse"""
+    cache_key = f"{model_id}:{provider}"
+
+    with _client_pool_lock:
+        if cache_key not in _client_pool:
+            _client_pool[cache_key] = get_inference_client(model_id, provider)
+        return _client_pool[cache_key]
+
 # Define models and languages here to avoid importing Gradio UI
 AVAILABLE_MODELS = [
     {"name": "Gemini 3.0 Pro", "id": "gemini-3.0-pro", "description": "Google Gemini 3.0 Pro via Poe with advanced reasoning"},
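Not part of the commit, for illustration only: a self-contained sketch of the pooling behaviour get_cached_client is meant to provide. Here get_inference_client is a stub (the real factory lives elsewhere in backend_api.py), and the example model ids are assumptions. Repeated calls with the same (model_id, provider) pair should return the same pooled client.

# Standalone sketch with a hypothetical stub for the real client factory.
import threading

def get_inference_client(model_id, provider):
    # Stub standing in for the real factory defined elsewhere in backend_api.py.
    return object()

_client_pool = {}
_client_pool_lock = threading.Lock()

def get_cached_client(model_id: str, provider: str = "auto"):
    """Get or create a cached API client for reuse."""
    cache_key = f"{model_id}:{provider}"
    with _client_pool_lock:
        if cache_key not in _client_pool:
            _client_pool[cache_key] = get_inference_client(model_id, provider)
        return _client_pool[cache_key]

a = get_cached_client("qwen3-max-preview")
b = get_cached_client("qwen3-max-preview")
assert a is b  # same key -> same pooled client
assert a is not get_cached_client("qwen3-max-preview", provider="poe")  # new key -> new client

Note that the lock is held across client construction, so the first request for a given key pays the construction cost and concurrent requests for that key wait rather than building duplicates.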
@@ -75,6 +99,10 @@ AVAILABLE_MODELS = [
     {"name": "Qwen3 Max Preview", "id": "qwen3-max-preview", "description": "Qwen3 Max Preview via DashScope API"},
 ]
 
+# Cache model lookup for faster access (built after AVAILABLE_MODELS is defined)
+MODEL_CACHE = {model["id"]: model for model in AVAILABLE_MODELS}
+print(f"[Startup] ✅ Performance optimizations loaded: {len(SYSTEM_PROMPT_CACHE)} cached prompts, {len(MODEL_CACHE)} cached models, client pooling enabled")
+
 LANGUAGE_CHOICES = ["html", "gradio", "transformers.js", "streamlit", "comfyui", "react"]
 
 app = FastAPI(title="AnyCoder API", version="1.0.0")
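Also illustration rather than source: MODEL_CACHE replaces a per-request linear scan of AVAILABLE_MODELS with a one-time dict build and constant-time lookups. A minimal sketch with a trimmed-down model list:

# Sketch with a trimmed model list; entries are abbreviated for illustration.
AVAILABLE_MODELS = [
    {"name": "Gemini 3.0 Pro", "id": "gemini-3.0-pro"},
    {"name": "Qwen3 Max Preview", "id": "qwen3-max-preview"},
]

# Old approach: O(n) scan on every request.
def find_model_scan(model_id):
    for model in AVAILABLE_MODELS:
        if model["id"] == model_id:
            return model
    return None

# New approach: build the index once at import time, then O(1) per request.
MODEL_CACHE = {model["id"]: model for model in AVAILABLE_MODELS}

def find_model_cached(model_id):
    return MODEL_CACHE.get(model_id)

assert find_model_scan("qwen3-max-preview") is find_model_cached("qwen3-max-preview")
assert find_model_cached("unknown-id") is None  # caller falls back to AVAILABLE_MODELS[0]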
@@ -366,45 +394,33 @@ async def generate_code(
         selected_model_id = model_id
 
         try:
-            # Find the selected model
-            selected_model = None
-            for model in AVAILABLE_MODELS:
-                if model["id"] == selected_model_id:
-                    selected_model = model
-                    break
-
+            # Fast model lookup using cache
+            selected_model = MODEL_CACHE.get(selected_model_id)
             if not selected_model:
+                # Fallback to first available model (shouldn't happen often)
                 selected_model = AVAILABLE_MODELS[0]
                 selected_model_id = selected_model["id"]
 
             # Track generated code
             generated_code = ""
 
-            # Select the system prompt for the language
-            prompt_map = {
-                "html": HTML_SYSTEM_PROMPT,
-                "gradio": GRADIO_SYSTEM_PROMPT,
-                "streamlit": STREAMLIT_SYSTEM_PROMPT,
-                "transformers.js": TRANSFORMERS_JS_SYSTEM_PROMPT,
-                "react": REACT_SYSTEM_PROMPT,
-                "comfyui": JSON_SYSTEM_PROMPT,
-            }
-            system_prompt = prompt_map.get(language, GENERIC_SYSTEM_PROMPT.format(language=language))
-
-            print(f"[Generate] Using {language} prompt for query: {query[:100]}...")
+            # Fast system prompt lookup using cache
+            system_prompt = SYSTEM_PROMPT_CACHE.get(language)
+            if not system_prompt:
+                # Format generic prompt only if needed
+                system_prompt = GENERIC_SYSTEM_PROMPT.format(language=language)
 
-            # Get the inference client
-
-            client = get_inference_client(selected_model_id, provider)
+            # Get cached client (reuses connections)
+            client = get_cached_client(selected_model_id, provider)
 
             # Get the real model ID with provider suffixes
             actual_model_id = get_real_model_id(selected_model_id)
-            print(f"[Generate] Using model ID: {actual_model_id}")
 
-            # Prepare messages
+            # Prepare messages (optimized - no string concatenation in hot path)
+            user_content = f"Generate a {language} application: {query}"
             messages = [
                 {"role": "system", "content": system_prompt},
-                {"role": "user", "content": f"Generate a {language} application: {query}"},
+                {"role": "user", "content": user_content}
             ]
 
             # Stream the response
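One reason the generic-prompt fallback moved out of dict.get (not stated in the diff, but a property of Python worth noting): the default argument to dict.get is evaluated eagerly, so the old code formatted GENERIC_SYSTEM_PROMPT on every request, even on a cache hit. A small sketch with a counting stand-in for the format call:

# Counting stand-in for GENERIC_SYSTEM_PROMPT.format(...) to show eager vs. lazy evaluation.
calls = {"format": 0}

def format_generic(language):
    calls["format"] += 1
    return f"You write {language} apps."

SYSTEM_PROMPT_CACHE = {"html": "HTML prompt", "gradio": "Gradio prompt"}

# Old pattern: the default is built even though the cache hit makes it unnecessary.
prompt = SYSTEM_PROMPT_CACHE.get("html", format_generic("html"))
assert calls["format"] == 1

# New pattern: only build the generic prompt on a cache miss.
prompt = SYSTEM_PROMPT_CACHE.get("html")
if not prompt:
    prompt = format_generic("html")
assert calls["format"] == 1  # unchanged: no extra format work on a hit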
@@ -429,73 +445,61 @@ async def generate_code(
             )
 
             chunk_count = 0
-
+            is_mistral = is_mistral_model(selected_model_id)
 
+            # Optimized chunk processing - reduce attribute lookups
             for chunk in stream:
-                # Handle different response formats
                 chunk_content = None
 
-                if is_mistral_model(selected_model_id):
+                if is_mistral:
                     # Mistral format: chunk.data.choices[0].delta.content
-                    if (hasattr(chunk, 'data') and chunk.data and
-                        hasattr(chunk.data, 'choices') and
-                        chunk.data.choices and
-                        hasattr(chunk.data.choices[0].delta, 'content') and
-                        chunk.data.choices[0].delta.content):
-                        chunk_content = chunk.data.choices[0].delta.content
+                    try:
+                        if chunk.data and chunk.data.choices and chunk.data.choices[0].delta.content:
+                            chunk_content = chunk.data.choices[0].delta.content
+                    except (AttributeError, IndexError):
+                        continue
                 else:
                     # OpenAI format: chunk.choices[0].delta.content
-                    if (hasattr(chunk, 'choices') and
-                        chunk.choices and
-                        len(chunk.choices) > 0 and
-                        hasattr(chunk.choices[0], 'delta') and
-                        hasattr(chunk.choices[0].delta, 'content') and
-                        chunk.choices[0].delta.content):
-                        chunk_content = chunk.choices[0].delta.content
+                    try:
+                        if chunk.choices and chunk.choices[0].delta.content:
+                            chunk_content = chunk.choices[0].delta.content
+                    except (AttributeError, IndexError):
+                        continue
 
                 if chunk_content:
-                    content = chunk_content
-                    generated_code += content
+                    generated_code += chunk_content
                     chunk_count += 1
 
-                    # Send the chunk to the client immediately
-                    # as a Server-Sent Event
-
+                    # Send chunk immediately - optimized JSON serialization
+                    # Only yield control every 5 chunks to reduce overhead
+                    if chunk_count % 5 == 0:
+                        await asyncio.sleep(0)
 
-                    # Build the chunk event
+                    # Build event data efficiently
                     event_data = json.dumps({
                         "type": "chunk",
-                        "content": content,
-                        "timestamp": datetime.now().isoformat()
+                        "content": chunk_content
                     })
                     yield f"data: {event_data}\n\n"
-
-                    # Yield control to allow async processing - no artificial delay
-                    await asyncio.sleep(0)
-
-            print(f"[Generate] Completed with {chunk_count} chunks, total length: {len(generated_code)}")
 
-            # Send completion event
+            # Send completion event (optimized - no timestamp in hot path)
             completion_data = json.dumps({
                 "type": "complete",
-                "code": generated_code,
-                "timestamp": datetime.now().isoformat()
+                "code": generated_code
            })
             yield f"data: {completion_data}\n\n"
 
         except Exception as e:
             error_data = json.dumps({
                 "type": "error",
-                "message": str(e),
-                "timestamp": datetime.now().isoformat()
+                "message": str(e)
             })
             yield f"data: {error_data}\n\n"
 
     except Exception as e:
         error_data = json.dumps({
             "type": "error",
-            "message": f"Generation error: {str(e)}",
-            "timestamp": datetime.now().isoformat()
+            "message": f"Generation error: {str(e)}"
         })
         yield f"data: {error_data}\n\n"
 
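The generator above emits Server-Sent Events of type chunk, complete, and error. A hedged client-side sketch of consuming that stream follows; the endpoint path, the JSON request schema, and the use of the requests library are assumptions, since the route definition sits outside the hunks shown.

# Hypothetical consumer: URL and payload fields are assumptions, not from the diff.
import json
import requests

def consume_stream(url="http://localhost:7860/api/generate-code"):
    payload = {"query": "a todo app", "language": "html", "model_id": "qwen3-max-preview"}
    code = ""
    with requests.post(url, json=payload, stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue  # skip blank SSE separators and anything non-data
            event = json.loads(line[len("data: "):])
            if event["type"] == "chunk":
                code += event["content"]       # incremental code fragment
            elif event["type"] == "complete":
                return event["code"]           # server sends the full code at the end
            elif event["type"] == "error":
                raise RuntimeError(event["message"])
    return code

Because the complete event carries the full generated code, a client can either accumulate chunks for live display or simply wait for the final event.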
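For completeness, a sketch of how a generator like the one above is typically wrapped in a FastAPI StreamingResponse with the text/event-stream media type. The real wiring already exists in backend_api.py outside these hunks; the route path, parameters, and the generate_events name below are illustrative only.

# Illustrative only: the usual StreamingResponse wiring for an SSE generator.
import asyncio
import json
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI(title="AnyCoder API", version="1.0.0")

@app.post("/api/generate-code")  # hypothetical path
async def generate_code_endpoint(query: str, language: str = "html"):
    async def generate_events():
        # Stand-in for the real streaming loop shown in the diff.
        for piece in ("<html>", "</html>"):
            yield f"data: {json.dumps({'type': 'chunk', 'content': piece})}\n\n"
            await asyncio.sleep(0)
        yield f"data: {json.dumps({'type': 'complete', 'code': '<html></html>'})}\n\n"

    return StreamingResponse(generate_events(), media_type="text/event-stream")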