"""Modal vLLM deployment for Spice Bae LLM inference.

This deploys an open-source LLM (Qwen2.5 7B Instruct) using vLLM on Modal,
providing an OpenAI-compatible API endpoint that Spice Bae can use
instead of the Claude API.

Deploy with: modal deploy modal_vllm.py
Test locally: modal serve modal_vllm.py

Uses Modal's $30/month free credits instead of paid API keys.
"""

import modal

MODELS_DIR = "/llm-models"
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

def download_model_to_image(model_dir: str, model_name: str):
    """Download model during image build."""
    from huggingface_hub import snapshot_download

    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin"],
    )


image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "vllm==0.6.4.post1",
        "huggingface_hub",
        "hf_transfer",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_image,
        kwargs={"model_dir": MODELS_DIR, "model_name": MODEL_NAME},
        secrets=[modal.Secret.from_name("huggingface-token")],
        timeout=60 * 20,
    )
)

app = modal.App("spice-bae-llm")

N_GPU = 1
MINUTES = 60


@app.function(
    image=image,
    gpu="A10G",
    scaledown_window=5 * MINUTES,
    timeout=20 * MINUTES,
    max_containers=1,
)
@modal.web_server(port=8000, startup_timeout=300)
def serve():
    """Serve vLLM OpenAI-compatible API using built-in server."""
    import subprocess

    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", MODELS_DIR,
        "--served-model-name", MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", "8000",
        "--gpu-memory-utilization", "0.90",
        "--max-model-len", "4096",
    ]

    # Launch the server in the background; @modal.web_server then waits for
    # port 8000 to start accepting connections before routing traffic to it.
    subprocess.Popen(cmd)


# =============================================================================
# DEPLOYMENT INSTRUCTIONS
# =============================================================================
#
# 1. Install Modal CLI:
#    pip install modal
#    modal setup
#
# 2. Create a HuggingFace token secret (the image build step references it;
#    a real token is only needed for gated models such as Llama):
#    - Get a token from https://huggingface.co/settings/tokens
#    - If you switch MODEL_NAME to a gated model, accept its license first
#      (e.g. https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
#    modal secret create huggingface-token HF_TOKEN=hf_xxx
#
# 3. Deploy:
#    modal deploy modal_vllm.py
#
# 4. Your API will be at:
#    https://YOUR_USERNAME--spice-bae-llm-serve.modal.run
#
# 5. Test it:
#    curl https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'
#
# 6. Set environment variables for Spice Bae (see the example client sketch below):
#    OPENAI_API_BASE=https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1
#    OPENAI_API_KEY=not-needed
#    USE_OPENAI_COMPATIBLE=true
#
# =============================================================================
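
# =============================================================================
# EXAMPLE CLIENT (sketch)
# =============================================================================
# A minimal sketch of how Spice Bae (or any OpenAI-compatible client) could
# call this endpoint once deployed. It assumes the `openai` Python package
# (v1+) is installed locally and that the OPENAI_API_BASE / OPENAI_API_KEY
# variables from step 6 are set; the function name `example_chat` is
# illustrative and not part of the deployment itself.


def example_chat(prompt: str) -> str:
    """Send a single chat message to the deployed vLLM endpoint."""
    import os

    from openai import OpenAI

    client = OpenAI(
        base_url=os.environ["OPENAI_API_BASE"],  # e.g. https://.../v1
        api_key=os.environ.get("OPENAI_API_KEY", "not-needed"),  # vLLM does not check the key by default
    )
    response = client.chat.completions.create(
        model=MODEL_NAME,  # must match --served-model-name above
        messages=[{"role": "user", "content": prompt}],
        max_tokens=256,
    )
    return response.choices[0].message.content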