akhaliq HF Staff committed on
Commit 990c944 · 1 Parent(s): 3e5404a

optimizations

Files changed (1)
  1. backend_api.py +65 -61
backend_api.py CHANGED
@@ -63,6 +63,30 @@ except Exception as e:
 
 print("[Startup] System prompts initialization complete")
 
+# Cache system prompts map for fast lookup (created once at startup)
+SYSTEM_PROMPT_CACHE = {
+    "html": HTML_SYSTEM_PROMPT,
+    "gradio": GRADIO_SYSTEM_PROMPT,
+    "streamlit": STREAMLIT_SYSTEM_PROMPT,
+    "transformers.js": TRANSFORMERS_JS_SYSTEM_PROMPT,
+    "react": REACT_SYSTEM_PROMPT,
+    "comfyui": JSON_SYSTEM_PROMPT,
+}
+
+# Client connection pool for reuse (thread-safe)
+import threading
+_client_pool = {}
+_client_pool_lock = threading.Lock()
+
+def get_cached_client(model_id: str, provider: str = "auto"):
+    """Get or create a cached API client for reuse"""
+    cache_key = f"{model_id}:{provider}"
+
+    with _client_pool_lock:
+        if cache_key not in _client_pool:
+            _client_pool[cache_key] = get_inference_client(model_id, provider)
+        return _client_pool[cache_key]
+
 # Define models and languages here to avoid importing Gradio UI
 AVAILABLE_MODELS = [
     {"name": "Gemini 3.0 Pro", "id": "gemini-3.0-pro", "description": "Google Gemini 3.0 Pro via Poe with advanced reasoning"},
@@ -75,6 +99,10 @@ AVAILABLE_MODELS = [
     {"name": "Qwen3 Max Preview", "id": "qwen3-max-preview", "description": "Qwen3 Max Preview via DashScope API"},
 ]
 
+# Cache model lookup for faster access (built after AVAILABLE_MODELS is defined)
+MODEL_CACHE = {model["id"]: model for model in AVAILABLE_MODELS}
+print(f"[Startup] ✅ Performance optimizations loaded: {len(SYSTEM_PROMPT_CACHE)} cached prompts, {len(MODEL_CACHE)} cached models, client pooling enabled")
+
 LANGUAGE_CHOICES = ["html", "gradio", "transformers.js", "streamlit", "comfyui", "react"]
 
 app = FastAPI(title="AnyCoder API", version="1.0.0")
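Note on `MODEL_CACHE`: model lookup changes from a linear scan of `AVAILABLE_MODELS` to a single dict access. A rough, self-contained comparison of the two access patterns (the model list here is synthetic, not the real registry):

```python
import timeit

# Synthetic stand-in shaped like AVAILABLE_MODELS; ids are illustrative only.
AVAILABLE_MODELS = [{"id": f"model-{i}", "name": f"Model {i}"} for i in range(10)]
MODEL_CACHE = {m["id"]: m for m in AVAILABLE_MODELS}

def linear_scan(model_id):
    # Pattern removed by this commit: scan the list on every request.
    for model in AVAILABLE_MODELS:
        if model["id"] == model_id:
            return model
    return None

def dict_lookup(model_id):
    # Pattern added by this commit: one hash lookup.
    return MODEL_CACHE.get(model_id)

if __name__ == "__main__":
    print("scan:", timeit.timeit(lambda: linear_scan("model-9"), number=100_000))
    print("dict:", timeit.timeit(lambda: dict_lookup("model-9"), number=100_000))
```

With only a handful of models the absolute savings per request are small; the dict mainly removes the per-request loop from the hot path.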
@@ -366,45 +394,33 @@ async def generate_code(
     selected_model_id = model_id
 
     try:
-        # Find the selected model
-        selected_model = None
-        for model in AVAILABLE_MODELS:
-            if model["id"] == selected_model_id:
-                selected_model = model
-                break
-
+        # Fast model lookup using cache
+        selected_model = MODEL_CACHE.get(selected_model_id)
         if not selected_model:
+            # Fallback to first available model (shouldn't happen often)
             selected_model = AVAILABLE_MODELS[0]
             selected_model_id = selected_model["id"]
 
         # Track generated code
         generated_code = ""
 
-        # Select appropriate system prompt based on language
-        prompt_map = {
-            "html": HTML_SYSTEM_PROMPT,
-            "gradio": GRADIO_SYSTEM_PROMPT,
-            "streamlit": STREAMLIT_SYSTEM_PROMPT,
-            "transformers.js": TRANSFORMERS_JS_SYSTEM_PROMPT,
-            "react": REACT_SYSTEM_PROMPT,
-            "comfyui": JSON_SYSTEM_PROMPT,
-        }
-        system_prompt = prompt_map.get(language, GENERIC_SYSTEM_PROMPT.format(language=language))
-
-        print(f"[Generate] Using {language} prompt for query: {query[:100]}...")
+        # Fast system prompt lookup using cache
+        system_prompt = SYSTEM_PROMPT_CACHE.get(language)
+        if not system_prompt:
+            # Format generic prompt only if needed
+            system_prompt = GENERIC_SYSTEM_PROMPT.format(language=language)
 
-        # Get the client using backend_models
-        print(f"[Generate] Getting client for model: {selected_model_id}")
-        client = get_inference_client(selected_model_id, provider)
+        # Get cached client (reuses connections)
+        client = get_cached_client(selected_model_id, provider)
 
         # Get the real model ID with provider suffixes
        actual_model_id = get_real_model_id(selected_model_id)
-        print(f"[Generate] Using model ID: {actual_model_id}")
 
-        # Prepare messages
+        # Prepare messages (optimized - no string concatenation in hot path)
+        user_content = f"Generate a {language} application: {query}"
         messages = [
             {"role": "system", "content": system_prompt},
-            {"role": "user", "content": f"Generate a {language} application: {query}"}
+            {"role": "user", "content": user_content}
         ]
 
         # Stream the response
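Note on the prompt lookup: the old code passed `GENERIC_SYSTEM_PROMPT.format(language=language)` as the `.get()` default, so the generic template was formatted on every request even when a cached prompt existed; the new code formats it only on a cache miss. A self-contained sketch of both variants (the prompt strings are placeholders, not the real prompts):

```python
SYSTEM_PROMPT_CACHE = {"html": "HTML prompt...", "gradio": "Gradio prompt..."}
GENERIC_SYSTEM_PROMPT = "You write {language} code."  # placeholder template

def old_lookup(language: str) -> str:
    # Old behavior: the fallback is formatted even when the key exists.
    return SYSTEM_PROMPT_CACHE.get(language, GENERIC_SYSTEM_PROMPT.format(language=language))

def new_lookup(language: str) -> str:
    # New behavior: format the generic template only on a cache miss.
    prompt = SYSTEM_PROMPT_CACHE.get(language)
    if not prompt:
        prompt = GENERIC_SYSTEM_PROMPT.format(language=language)
    return prompt

assert old_lookup("html") == new_lookup("html")
assert old_lookup("zig") == new_lookup("zig") == "You write zig code."
```

The two only diverge if a cached prompt were ever an empty string, since the new version treats falsy values as a miss.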
@@ -429,73 +445,61 @@
             )
 
             chunk_count = 0
-            print(f"[Generate] Starting to stream from {actual_model_id}...")
+            is_mistral = is_mistral_model(selected_model_id)
 
+            # Optimized chunk processing - reduce attribute lookups
             for chunk in stream:
-                # Handle different response formats
                 chunk_content = None
 
-                if is_mistral_model(selected_model_id):
+                if is_mistral:
                     # Mistral format: chunk.data.choices[0].delta.content
-                    if (hasattr(chunk, "data") and chunk.data and
-                        hasattr(chunk.data, "choices") and chunk.data.choices and
-                        hasattr(chunk.data.choices[0], "delta") and
-                        hasattr(chunk.data.choices[0].delta, "content") and
-                        chunk.data.choices[0].delta.content is not None):
-                        chunk_content = chunk.data.choices[0].delta.content
+                    try:
+                        if chunk.data and chunk.data.choices and chunk.data.choices[0].delta.content:
+                            chunk_content = chunk.data.choices[0].delta.content
+                    except (AttributeError, IndexError):
+                        continue
                 else:
                     # OpenAI format: chunk.choices[0].delta.content
-                    if (hasattr(chunk, 'choices') and
-                        chunk.choices and
-                        len(chunk.choices) > 0 and
-                        hasattr(chunk.choices[0], 'delta') and
-                        hasattr(chunk.choices[0].delta, 'content') and
-                        chunk.choices[0].delta.content):
-                        chunk_content = chunk.choices[0].delta.content
+                    try:
+                        if chunk.choices and chunk.choices[0].delta.content:
+                            chunk_content = chunk.choices[0].delta.content
+                    except (AttributeError, IndexError):
+                        continue
 
                 if chunk_content:
-                    content = chunk_content
-                    generated_code += content
+                    generated_code += chunk_content
                     chunk_count += 1
 
-                    # Log every 10th chunk to avoid spam
-                    if chunk_count % 10 == 0:
-                        print(f"[Generate] Streamed {chunk_count} chunks, {len(generated_code)} chars total")
+                    # Send chunk immediately - optimized JSON serialization
+                    # Only yield control every 5 chunks to reduce overhead
+                    if chunk_count % 5 == 0:
+                        await asyncio.sleep(0)
 
-                    # Send chunk as Server-Sent Event - yield immediately for instant streaming
+                    # Build event data efficiently
                    event_data = json.dumps({
                         "type": "chunk",
-                        "content": content,
-                        "timestamp": datetime.now().isoformat()
+                        "content": chunk_content
                     })
                     yield f"data: {event_data}\n\n"
-
-                # Yield control to allow async processing - no artificial delay
-                await asyncio.sleep(0)
-
-            print(f"[Generate] Completed with {chunk_count} chunks, total length: {len(generated_code)}")
 
-            # Send completion event
+            # Send completion event (optimized - no timestamp in hot path)
             completion_data = json.dumps({
                 "type": "complete",
-                "code": generated_code,
-                "timestamp": datetime.now().isoformat()
+                "code": generated_code
             })
             yield f"data: {completion_data}\n\n"
 
         except Exception as e:
             error_data = json.dumps({
                 "type": "error",
-                "message": str(e),
-                "timestamp": datetime.now().isoformat()
+                "message": str(e)
             })
             yield f"data: {error_data}\n\n"
 
     except Exception as e:
         error_data = json.dumps({
             "type": "error",
-            "message": f"Generation error: {str(e)}",
-            "timestamp": datetime.now().isoformat()
+            "message": f"Generation error: {str(e)}"
         })
         yield f"data: {error_data}\n\n"
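Note for clients of the stream: this commit drops the `timestamp` field from the `chunk`, `complete`, and `error` events, so consumers should key off `type` plus `content`/`code`/`message` only. A rough consumer sketch using `httpx` follows; the endpoint path and request fields below are assumptions for illustration, only the event payload shapes come from this file.

```python
import json
import httpx

# Hypothetical endpoint path and request body; adjust to the real route.
URL = "http://localhost:8000/generate"
PAYLOAD = {"query": "a todo app", "language": "html", "model_id": "qwen3-max-preview"}

def consume_stream() -> str:
    code = ""
    with httpx.stream("POST", URL, json=PAYLOAD, timeout=None) as resp:
        for line in resp.iter_lines():
            if not line.startswith("data: "):
                continue  # skip blank separator lines between events
            event = json.loads(line[len("data: "):])
            if event["type"] == "chunk":
                code += event["content"]      # incremental code text
            elif event["type"] == "complete":
                return event["code"]          # full generated code
            elif event["type"] == "error":
                raise RuntimeError(event["message"])
    return code

if __name__ == "__main__":
    print(consume_stream()[:200])
```

Because the server now calls `await asyncio.sleep(0)` only every fifth chunk, events may arrive in small bursts rather than strictly one per network read; accumulating on `chunk` and trusting the final `complete` payload, as above, is robust to that.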
 
 