"""Modal vLLM deployment for Spice Bae LLM inference.
This deploys an open-source LLM (Llama 3.1 8B) using vLLM on Modal,
providing an OpenAI-compatible API endpoint that Spice Bae can use
instead of Claude API.
Deploy with: modal deploy modal_vllm.py
Test locally: modal serve modal_vllm.py
Uses Modal's $30/month free credits instead of paid API keys.
"""
import modal
MODELS_DIR = "/llm-models"  # weights are baked into the image at this path
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"  # swap in a gated model (e.g. meta-llama/Llama-3.1-8B-Instruct) if you have access
def download_model_to_image(model_dir: str, model_name: str):
    """Download model during image build."""
    from huggingface_hub import snapshot_download

    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin"],
    )
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "vllm==0.6.4.post1",
        "huggingface_hub",
        "hf_transfer",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_image,
        kwargs={"model_dir": MODELS_DIR, "model_name": MODEL_NAME},
        secrets=[modal.Secret.from_name("huggingface-token")],
        timeout=60 * 20,
    )
)
app = modal.App("spice-bae-llm")
N_GPU = 1
MINUTES = 60
@app.function(
    image=image,
    gpu=f"A10G:{N_GPU}",
    scaledown_window=5 * MINUTES,
    timeout=20 * MINUTES,
    max_containers=1,
)
@modal.web_server(port=8000, startup_timeout=5 * MINUTES)
def serve():
    """Serve the vLLM OpenAI-compatible API using vLLM's built-in server."""
    import subprocess

    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", MODELS_DIR,
        "--served-model-name", MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", "8000",
        "--gpu-memory-utilization", "0.90",
        "--max-model-len", "4096",
    ]
    # Launch the server in the background; @modal.web_server waits for the
    # port to open and then forwards incoming requests to it.
    subprocess.Popen(cmd)
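

# A minimal smoke test (a sketch, not part of the deployment). After deploying,
# run `modal run modal_vllm.py --url https://YOUR_USERNAME--spice-bae-llm-serve.modal.run`;
# the default URL below is a placeholder you must replace with your own endpoint.
@app.local_entrypoint()
def smoke_test(url: str = "https://YOUR_USERNAME--spice-bae-llm-serve.modal.run"):
    """Send one chat completion request to the deployed endpoint and print the reply."""
    import json
    import urllib.request

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
    }
    req = urllib.request.Request(
        url.rstrip("/") + "/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        body = json.loads(resp.read())
    print(body["choices"][0]["message"]["content"])
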
# =============================================================================
# DEPLOYMENT INSTRUCTIONS
# =============================================================================
#
# 1. Install Modal CLI:
# pip install modal
# modal setup
#
# 2. Create a HuggingFace token secret (the image build references it; a token is only required for gated models such as Llama):
# - Get a token from https://huggingface.co/settings/tokens
# - If you switch to a gated Llama model, accept its license at https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
# modal secret create huggingface-token HF_TOKEN=hf_xxx
#
# 3. Deploy:
# modal deploy modal_vllm.py
#
# 4. Your API will be at:
# https://YOUR_USERNAME--spice-bae-llm-serve.modal.run
#
# 5. Test it:
# curl https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'
#
# 6. Set environment variable for Spice Bae:
# OPENAI_API_BASE=https://YOUR_USERNAME--spice-bae-llm-serve.modal.run/v1
# OPENAI_API_KEY=not-needed
# USE_OPENAI_COMPATIBLE=true
#
# =============================================================================
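#
# =============================================================================
# CLIENT-SIDE EXAMPLE (sketch)
# =============================================================================
#
# A hedged sketch of how Spice Bae's client code could consume this endpoint
# with the `openai` Python package, using the environment variables from step 6.
# The `openai` dependency and the exact variable handling are assumptions about
# the client setup, not something this file provides.
#
# import os
# from openai import OpenAI
#
# client = OpenAI(
#     base_url=os.environ["OPENAI_API_BASE"],                  # .../v1
#     api_key=os.environ.get("OPENAI_API_KEY", "not-needed"),  # vLLM does not check it
# )
# reply = client.chat.completions.create(
#     model="Qwen/Qwen2.5-7B-Instruct",
#     messages=[{"role": "user", "content": "Hello!"}],
# )
# print(reply.choices[0].message.content)
#
# =============================================================================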