"""Modal vLLM deployment for Spice Bae LLM inference. This deploys an open-source LLM (Llama 3.1 8B) using vLLM on Modal, providing an OpenAI-compatible API endpoint that Spice Bae can use instead of Claude API. Deploy with: modal deploy modal_vllm.py Test locally: modal serve modal_vllm.py Uses Modal's $30/month free credits instead of paid API keys. """ import modal MODELS_DIR = "/llm-models" MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct" def download_model_to_image(model_dir: str, model_name: str): """Download model during image build.""" from huggingface_hub import snapshot_download snapshot_download( model_name, local_dir=model_dir, ignore_patterns=["*.pt", "*.bin"], ) image = ( modal.Image.debian_slim(python_version="3.11") .pip_install( "vllm==0.6.4.post1", "huggingface_hub", "hf_transfer", ) .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) .run_function( download_model_to_image, kwargs={"model_dir": MODELS_DIR, "model_name": MODEL_NAME}, secrets=[modal.Secret.from_name("huggingface-token")], timeout=60 * 20, ) ) app = modal.App("spice-bae-llm") N_GPU = 1 MINUTES = 60 @app.function( image=image, gpu="A10G", scaledown_window=5 * MINUTES, timeout=20 * MINUTES, max_containers=1, ) @modal.web_server(port=8000, startup_timeout=300) def serve(): """Serve vLLM OpenAI-compatible API using built-in server.""" import subprocess cmd = [ "python", "-m", "vllm.entrypoints.openai.api_server", "--model", MODELS_DIR, "--served-model-name", MODEL_NAME, "--host", "0.0.0.0", "--port", "8000", "--gpu-memory-utilization", "0.90", "--max-model-len", "4096", ] subprocess.Popen(cmd) # ============================================================================= # DEPLOYMENT INSTRUCTIONS # ============================================================================= # # 1. Install Modal CLI: # pip install modal # modal setup # # 2. 
#    Create the HuggingFace token secret (used by the image build step;
#    Qwen2.5 is not gated, but gated models such as Llama would also require
#    accepting their license on the model page first):
#    - Get a token from https://huggingface.co/settings/tokens
#      modal secret create huggingface-token HF_TOKEN=hf_xxx
#
# 3. Deploy:
#    modal deploy modal_vllm.py
#
# 4. Your API will be at:
#    https://YOUR_USERNAME--spice-bae-llm-serve.modal.run
#
# 5. Test it (the "model" field must match MODEL_NAME above):
#    curl https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'
#
# 6. Set environment variables for Spice Bae:
#    OPENAI_API_BASE=https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1
#    OPENAI_API_KEY=not-needed
#    USE_OPENAI_COMPATIBLE=true
#
# =============================================================================