| """Modal vLLM deployment for Spice Bae LLM inference. | |
| This deploys an open-source LLM (Llama 3.1 8B) using vLLM on Modal, | |
| providing an OpenAI-compatible API endpoint that Spice Bae can use | |
| instead of Claude API. | |
| Deploy with: modal deploy modal_vllm.py | |
| Test locally: modal serve modal_vllm.py | |
| Uses Modal's $30/month free credits instead of paid API keys. | |
| """ | |

import modal

MODELS_DIR = "/llm-models"
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"


def download_model_to_image(model_dir: str, model_name: str):
    """Download model weights into the image at build time."""
    from huggingface_hub import snapshot_download

    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin"],  # skip legacy formats; use safetensors
    )


image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "vllm==0.6.4.post1",
        "huggingface_hub",
        "hf_transfer",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster HuggingFace downloads
    .run_function(
        download_model_to_image,
        kwargs={"model_dir": MODELS_DIR, "model_name": MODEL_NAME},
        secrets=[modal.Secret.from_name("huggingface-token")],
        timeout=60 * 20,  # allow up to 20 minutes for the weight download
    )
)


app = modal.App("spice-bae-llm")

N_GPU = 1
MINUTES = 60


@app.function(
    image=image,
    gpu=f"A10G:{N_GPU}",  # assumed GPU type; a 24 GB A10G fits a 7B model
    timeout=20 * MINUTES,
)
@modal.web_server(port=8000, startup_timeout=5 * MINUTES)
def serve():
    """Serve vLLM's OpenAI-compatible API using its built-in server."""
    import subprocess

    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", MODELS_DIR,  # load the weights baked into the image
        "--served-model-name", MODEL_NAME,  # name clients pass as "model"
        "--host", "0.0.0.0",
        "--port", "8000",
        "--gpu-memory-utilization", "0.90",
        "--max-model-len", "4096",
    ]
    # @modal.web_server waits for the port to accept connections, so it is
    # fine to launch the server in the background and return.
    subprocess.Popen(cmd)
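

# Once the server is up (deployed, or running under `modal serve`), a quick
# readiness check is to list the models it exposes. GET /v1/models is a
# standard route of vLLM's OpenAI-compatible API. This is a sketch: the URL
# is a placeholder for your own deployment URL from step 4 below, and the
# `requests` package is assumed to be installed locally.
#
#   import requests
#   resp = requests.get("https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1/models")
#   print(resp.json())  # expect "Qwen/Qwen2.5-7B-Instruct" among the model ids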


# =============================================================================
# DEPLOYMENT INSTRUCTIONS
# =============================================================================
#
# 1. Install the Modal CLI and authenticate:
#      pip install modal
#      modal setup
#
# 2. Create the HuggingFace token secret. The image build references it, so
#    it must exist even though Qwen2.5-7B-Instruct is not gated (any valid
#    token works). For gated models like Llama, also accept the license at
#    https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct first.
#    - Get a token from https://huggingface.co/settings/tokens
#      modal secret create huggingface-token HF_TOKEN=hf_xxx
#
# 3. Deploy:
#      modal deploy modal_vllm.py
#
# 4. Your API will be at:
#      https://YOUR_USERNAME--spice-bae-llm-serve.modal.run
#
# 5. Test it:
#      curl https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'
#
# 6. Set environment variables for Spice Bae:
#      OPENAI_API_BASE=https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1
#      OPENAI_API_KEY=not-needed
#      USE_OPENAI_COMPATIBLE=true
#
# =============================================================================
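
# =============================================================================
# EXAMPLE CLIENT (sketch)
# =============================================================================
#
# A minimal sketch of how Spice Bae (or any OpenAI-compatible client) could
# call this endpoint with the `openai` Python SDK. The base URL is a
# placeholder; substitute your own deployment URL from step 4.
#
#   from openai import OpenAI
#
#   client = OpenAI(
#       base_url="https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1",
#       api_key="not-needed",  # vLLM does not check the key unless configured to
#   )
#   response = client.chat.completions.create(
#       model="Qwen/Qwen2.5-7B-Instruct",
#       messages=[{"role": "user", "content": "Hello!"}],
#   )
#   print(response.choices[0].message.content)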