"""Modal vLLM deployment for Spice Bae LLM inference.

This deploys an open-source LLM (Qwen2.5 7B Instruct) using vLLM on Modal,
providing an OpenAI-compatible API endpoint that Spice Bae can use
instead of the Claude API.

Deploy with: modal deploy modal_vllm.py
Test locally: modal serve modal_vllm.py

Uses Modal's $30/month free credits instead of paid API keys.
"""

import modal

MODELS_DIR = "/llm-models"
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

def download_model_to_image(model_dir: str, model_name: str):
    """Download model during image build."""
    from huggingface_hub import snapshot_download

    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin"],
    )


image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "vllm==0.6.4.post1",
        "huggingface_hub",
        "hf_transfer",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_image,
        kwargs={"model_dir": MODELS_DIR, "model_name": MODEL_NAME},
        secrets=[modal.Secret.from_name("huggingface-token")],
        timeout=60 * 20,
    )
)

app = modal.App("spice-bae-llm")

N_GPU = 1
MINUTES = 60


@app.function(
    image=image,
    gpu="A10G",
    scaledown_window=5 * MINUTES,
    timeout=20 * MINUTES,
    max_containers=1,
)
@modal.web_server(port=8000, startup_timeout=300)
def serve():
    """Serve vLLM OpenAI-compatible API using built-in server."""
    import subprocess

    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", MODELS_DIR,
        "--served-model-name", MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", "8000",
        "--gpu-memory-utilization", "0.90",
        "--max-model-len", "4096",
    ]

    # Launch the server in the background; @modal.web_server then waits for
    # port 8000 to start accepting connections before routing traffic to it.
    subprocess.Popen(cmd)


# =============================================================================
# DEPLOYMENT INSTRUCTIONS
# =============================================================================
#
# 1. Install Modal CLI:
#    pip install modal
#    modal setup
#
# 2. Create a HuggingFace token secret (the image build step references it;
#    a real token is only needed for gated models such as Llama):
#    - Get a token from https://huggingface.co/settings/tokens
#    - If you switch MODEL_NAME to a gated model, accept its license first
#      (e.g. https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
#    modal secret create huggingface-token HF_TOKEN=hf_xxx
#
# 3. Deploy:
#    modal deploy modal_vllm.py
#
# 4. Your API will be at:
#    https://YOUR_USERNAME--spice-bae-llm-serve.modal.run
#
# 5. Test it:
#    curl https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'
#
# 6. Set environment variables for Spice Bae (see the example client sketch below):
#    OPENAI_API_BASE=https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1
#    OPENAI_API_KEY=not-needed
#    USE_OPENAI_COMPATIBLE=true
#
# =============================================================================
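
# =============================================================================
# EXAMPLE CLIENT (sketch)
# =============================================================================
# A minimal sketch of how Spice Bae (or any OpenAI-compatible client) could
# call this endpoint once deployed. It assumes the `openai` Python package
# (v1+) is installed locally and that the OPENAI_API_BASE / OPENAI_API_KEY
# variables from step 6 are set; the function name `example_chat` is
# illustrative and not part of the deployment itself.


def example_chat(prompt: str) -> str:
    """Send a single chat message to the deployed vLLM endpoint."""
    import os

    from openai import OpenAI

    client = OpenAI(
        base_url=os.environ["OPENAI_API_BASE"],  # e.g. https://.../v1
        api_key=os.environ.get("OPENAI_API_KEY", "not-needed"),  # vLLM does not check the key by default
    )
    response = client.chat.completions.create(
        model=MODEL_NAME,  # must match --served-model-name above
        messages=[{"role": "user", "content": prompt}],
        max_tokens=256,
    )
    return response.choices[0].message.content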