"""Modal vLLM deployment for Spice Bae LLM inference.
This deploys an open-source LLM (Llama 3.1 8B) using vLLM on Modal,
providing an OpenAI-compatible API endpoint that Spice Bae can use
instead of Claude API.
Deploy with: modal deploy modal_vllm.py
Test locally: modal serve modal_vllm.py
Uses Modal's $30/month free credits instead of paid API keys.
"""
import modal
MODELS_DIR = "/llm-models"  # weights are baked into the image at this path
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"  # swap in a gated model (e.g. meta-llama/Llama-3.1-8B-Instruct) if you have access
def download_model_to_image(model_dir: str, model_name: str):
    """Download model during image build."""
    from huggingface_hub import snapshot_download

    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin"],
    )
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "vllm==0.6.4.post1",
        "huggingface_hub",
        "hf_transfer",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_image,
        kwargs={"model_dir": MODELS_DIR, "model_name": MODEL_NAME},
        secrets=[modal.Secret.from_name("huggingface-token")],
        timeout=60 * 20,
    )
)
app = modal.App("spice-bae-llm")
N_GPU = 1
MINUTES = 60
@app.function(
    image=image,
    gpu=f"A10G:{N_GPU}",
    scaledown_window=5 * MINUTES,
    timeout=20 * MINUTES,
    max_containers=1,
)
@modal.web_server(port=8000, startup_timeout=5 * MINUTES)
def serve():
    """Serve the vLLM OpenAI-compatible API using vLLM's built-in server."""
    import subprocess

    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", MODELS_DIR,
        "--served-model-name", MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", "8000",
        "--gpu-memory-utilization", "0.90",
        "--max-model-len", "4096",
    ]
    # Launch the server in the background; @modal.web_server waits for the
    # port to open and then forwards incoming requests to it.
    subprocess.Popen(cmd)
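

# A minimal smoke test (a sketch, not part of the deployment). After deploying,
# run `modal run modal_vllm.py --url https://YOUR_USERNAME--spice-bae-llm-serve.modal.run`;
# the default URL below is a placeholder you must replace with your own endpoint.
@app.local_entrypoint()
def smoke_test(url: str = "https://YOUR_USERNAME--spice-bae-llm-serve.modal.run"):
    """Send one chat completion request to the deployed endpoint and print the reply."""
    import json
    import urllib.request

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
    }
    req = urllib.request.Request(
        url.rstrip("/") + "/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        body = json.loads(resp.read())
    print(body["choices"][0]["message"]["content"])
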
# =============================================================================
# DEPLOYMENT INSTRUCTIONS
# =============================================================================
#
# 1. Install Modal CLI:
# pip install modal
# modal setup
#
# 2. Create a HuggingFace token secret (the image build references it; a token is only required for gated models such as Llama):
# - Get a token from https://huggingface.co/settings/tokens
# - If you switch to a gated Llama model, accept its license at https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
# modal secret create huggingface-token HF_TOKEN=hf_xxx
#
# 3. Deploy:
# modal deploy modal_vllm.py
#
# 4. Your API will be at:
# https://YOUR_USERNAME--spice-bae-llm-serve.modal.run
#
# 5. Test it:
# curl https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'
#
# 6. Set environment variable for Spice Bae:
# OPENAI_API_BASE=https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1
# OPENAI_API_KEY=not-needed
# USE_OPENAI_COMPATIBLE=true
#
# =============================================================================
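#
# =============================================================================
# CLIENT-SIDE EXAMPLE (sketch)
# =============================================================================
#
# A hedged sketch of how Spice Bae's client code could consume this endpoint
# with the `openai` Python package, using the environment variables from step 6.
# The `openai` dependency and the exact variable handling are assumptions about
# the client setup, not something this file provides.
#
# import os
# from openai import OpenAI
#
# client = OpenAI(
#     base_url=os.environ["OPENAI_API_BASE"],                  # .../v1
#     api_key=os.environ.get("OPENAI_API_KEY", "not-needed"),  # vLLM does not check it
# )
# reply = client.chat.completions.create(
#     model="Qwen/Qwen2.5-7B-Instruct",
#     messages=[{"role": "user", "content": "Hello!"}],
# )
# print(reply.choices[0].message.content)
#
# =============================================================================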