import os
import re
import sys
import traceback

import gradio as gr
from huggingface_hub import (
    login,
    HfApi,
    hf_hub_download,
    whoami,
)
from llama_cpp import Llama
from transformers import AutoTokenizer
| """ | |
| Environment variables you can set in your Space (Settings -> Variables & secrets): | |
| Required (pick one of these approaches): | |
| - GGUF_REPO: The Hugging Face repo that contains your .gguf files | |
| - GGUF_FILE: The specific .gguf filename to load from that repo | |
| Optional (recommended): | |
| - MODEL_ID: Base model repo to pull the tokenizer/chat template from. | |
| Use the matching family for your quant: | |
| - Qwen family: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B or -Qwen-7B | |
| - Llama family: deepseek-ai/DeepSeek-R1-Distill-Llama-8B | |
| If MODEL_ID is not set, we will attempt to guess it from GGUF_REPO. | |
| Other optional env vars: | |
| - HF_TOKEN: If your repo is gated/private, add this as a Space secret (read scope). | |
| - PREFER_FAMILY: "qwen" or "llama" (only used if we need to guess MODEL_ID). Default: qwen | |
| - PREFER_SIZE: "1.5b", "7b", or "8b" (only used if we need to guess MODEL_ID). Default: 1.5b | |
| - N_CTX: context window (default 4096) | |
| - N_THREADS: CPU threads (default: half your CPU cores, at least 1) | |
| - N_BATCH: batch size (default 128) | |
| """ | |
# --------------------
# Auth (optional)
# --------------------
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    try:
        login(HF_TOKEN)
        try:
            user = whoami().get("name", "ok")
            print(f"[auth] Logged into Hugging Face as: {user}")
        except Exception:
            print("[auth] Logged in (could not fetch user name).")
    except Exception as e:
        print(f"[auth] Failed to login with HF_TOKEN: {e}")
# --------------------
# Config / Defaults
# --------------------
GGUF_REPO = os.getenv("GGUF_REPO", "").strip()
GGUF_FILE = os.getenv("GGUF_FILE", "").strip()
PREFER_FAMILY = os.getenv("PREFER_FAMILY", "qwen").lower()
PREFER_SIZE = os.getenv("PREFER_SIZE", "1.5b").lower()


# Runtime knobs
def _default_threads():
    try:
        cores = os.cpu_count() or 2
        return max(1, cores // 2)  # be gentle on free CPU
    except Exception:
        return 1


N_CTX = int(os.getenv("N_CTX", "4096"))
N_THREADS = int(os.getenv("N_THREADS", str(_default_threads())))
N_BATCH = int(os.getenv("N_BATCH", "128"))
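# All three knobs can be overridden from the Space variables, e.g. N_CTX=2048,
# N_THREADS=2, N_BATCH=64 (illustrative values for a small CPU Space).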
# --------------------
# Helpers
# --------------------
api = HfApi()


def repo_exists(repo_id: str) -> bool:
    try:
        api.model_info(repo_id)
        return True
    except Exception:
        return False


def pick_q4_file(repo_id: str) -> str:
    """Choose a reasonable 4-bit GGUF from a repo (prefer Q4_K_M, then Q4_0)."""
    info = api.model_info(repo_id)
    ggufs = [s.rfilename for s in info.siblings if s.rfilename.lower().endswith(".gguf")]
    if not ggufs:
        raise FileNotFoundError(f"No .gguf files found in {repo_id}")
    # Prefer Q4_K_M, then any Q4, then Q3 as a last resort
    priority = []
    for f in ggufs:
        fl = f.lower()
        if "q4_k_m" in fl:
            score = 100
        elif "q4_k_s" in fl or "q4_k_l" in fl or "q4_k" in fl:
            score = 95
        elif "q4_0" in fl or "q4" in fl:
            score = 90
        elif "q3_k_m" in fl or "q3" in fl:
            score = 70
        else:
            score = 10
        priority.append((score, f))
    priority.sort(reverse=True, key=lambda x: x[0])
    return priority[0][1]
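# Example (repo name borrowed from the hints further down; verify it exists and
# that you have access): pick_q4_file("bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF")
# should return the Q4_K_M quant if the repo publishes one, otherwise the
# best-scoring .gguf it can find.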
def guess_model_id_from_repo(repo_id: str) -> str:
    """Guess a matching tokenizer/chat-template model based on the GGUF repo name."""
    rid = repo_id.lower()
    # Check for an explicit family marker in the repo name first, so a Llama GGUF
    # is not mapped to a Qwen tokenizer just because PREFER_FAMILY defaults to "qwen".
    if "llama" in rid:
        return "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
    if "qwen" in rid or PREFER_FAMILY == "qwen":
        # Size
        if "1.5" in rid or "1_5" in rid or PREFER_SIZE == "1.5b":
            return "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
        if "7b" in rid or PREFER_SIZE == "7b":
            return "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
        return "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    if PREFER_FAMILY == "llama":
        return "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
    # Fallback
    return "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
def ensure_model_source(repo_id: str | None, filename: str | None) -> tuple[str, str]:
    """
    Ensure we have a valid GGUF repo + file.
    - If both are provided, verify they exist.
    - If only the repo is provided, pick a reasonable Q4 file.
    - If neither is provided, raise with a helpful message.
    """
    if repo_id and filename:
        try:
            info = api.model_info(repo_id)  # raises if missing or no access
        except Exception as e:
            raise FileNotFoundError(
                f"Repo not accessible: {repo_id}\n{e}\n"
                "Check the repo id spelling, your HF token, and license access."
            )
        # Now check that the file exists in the repo
        files = {s.rfilename for s in info.siblings}
        if filename not in files:
            # Try a case-insensitive match
            lower_map = {s.rfilename.lower(): s.rfilename for s in info.siblings}
            if filename.lower() in lower_map:
                filename = lower_map[filename.lower()]
            else:
                raise FileNotFoundError(
                    f"File not found in repo: {filename}\n"
                    f"Available gguf files: {[f for f in files if f.lower().endswith('.gguf')]}"
                )
        return repo_id, filename

    if repo_id and not filename:
        return repo_id, pick_q4_file(repo_id)

    raise ValueError(
        "No GGUF_REPO/GGUF_FILE provided. Set them in your Space Variables.\n"
        "Examples you can try (you must verify these exist and accept access if gated):\n"
        "  - GGUF_REPO = TheBloke/DeepSeek-R1-Distill-Qwen-7B-GGUF\n"
        "    GGUF_FILE = deepseek-r1-distill-qwen-7b.Q4_K_M.gguf\n"
        "  - GGUF_REPO = bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF\n"
        "    GGUF_FILE = deepseek-r1-distill-qwen-1.5b.Q4_K_M.gguf\n"
        "  - GGUF_REPO = MaziyarPanahi/DeepSeek-R1-Distill-Llama-8B-GGUF\n"
        "    GGUF_FILE = deepseek-r1-distill-llama-8b.Q4_K_M.gguf\n"
    )
def build_tokenizer(model_id: str) -> AutoTokenizer:
    print(f"[tokenizer] Loading tokenizer/chat template from {model_id}")
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    return tok


def apply_template(tokenizer: AutoTokenizer, history, message: str) -> str:
    # history: list of [user, assistant] pairs
    msgs = []
    for u, a in history:
        if u:
            msgs.append({"role": "user", "content": u})
        if a:
            msgs.append({"role": "assistant", "content": a})
    msgs.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=True
    )
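# For example, history [["Hi", "Hello!"]] plus message "Thanks" becomes
# [{"role": "user", ...}, {"role": "assistant", ...}, {"role": "user", ...}],
# which the tokenizer's chat template renders into a single prompt string.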
def strip_reasoning(text: str) -> str:
    # Hide DeepSeek-style reasoning if present. The R1 distills typically wrap
    # their chain of thought in <think>...</think>; the original
    # <|begin_of_thought|>...<|end_of_thought|> markers are kept as a fallback.
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return re.sub(
        r"<\|begin_of_thought\|>.*?<\|end_of_thought\|>",
        "",
        text,
        flags=re.DOTALL,
    )
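# Note: while streaming, the closing tag only arrives once the reasoning is
# finished, so strip_reasoning cannot remove partial chain-of-thought text
# until that tag shows up in the accumulated output.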
# --------------------
# Resolve model + file
# --------------------
try:
    GGUF_REPO, GGUF_FILE = ensure_model_source(GGUF_REPO, GGUF_FILE)
    print(f"[gguf] Using repo: {GGUF_REPO}")
    print(f"[gguf] Using file: {GGUF_FILE}")
except Exception as e:
    # Fail fast with a clear error; Gradio will show logs
    print("[startup] Failed to resolve GGUF model source:")
    print(e)
    traceback.print_exc()

    # Provide a minimal dummy UI that shows the error instead of crashing the Space build
    def _error_ui():
        return gr.Markdown(
            f"Cannot start: {e}\n\n"
            "Go to Settings → Variables and set GGUF_REPO and GGUF_FILE to a valid GGUF."
        )

    with gr.Blocks() as demo:
        gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
        _error_ui()

    if __name__ == "__main__":
        demo.launch()
    sys.exit(0)

# Guess MODEL_ID if not provided
MODEL_ID = os.getenv("MODEL_ID", "").strip()
if not MODEL_ID:
    MODEL_ID = guess_model_id_from_repo(GGUF_REPO)
# --------------------
# Download and load
# --------------------
try:
    # Download the exact file; raises if not found or no access
    print(f"[download] Fetching {GGUF_FILE} from {GGUF_REPO} ...")
    model_path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
    print(f"[download] File ready at: {model_path}")
except Exception as e:
    print("[download] Failed to download the GGUF file:")
    print(e)
    traceback.print_exc()

    # Same graceful error UI
    def _error_ui():
        return gr.Markdown(
            f"Download failed: {e}\n\n"
            "Check that GGUF_REPO and GGUF_FILE are correct and your HF_TOKEN has access."
        )

    with gr.Blocks() as demo:
        gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
        _error_ui()

    if __name__ == "__main__":
        demo.launch()
    sys.exit(0)
# Load the tokenizer for the chat template
try:
    tokenizer = build_tokenizer(MODEL_ID)
except Exception as e:
    print("[tokenizer] Failed to load tokenizer/chat template:")
    print(e)
    traceback.print_exc()
    # Still try to continue with a naive prompt if the tokenizer fails
    tokenizer = None

    def naive_template(history, message):
        # Simple ChatML-like format
        parts = []
        for u, a in history:
            if u:
                parts.append(f"<|im_start|>user\n{u}\n<|im_end|>")
            if a:
                parts.append(f"<|im_start|>assistant\n{a}\n<|im_end|>")
        parts.append(f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n")
        return "\n".join(parts)


def make_prompt(history, message):
    if tokenizer is not None:
        return apply_template(tokenizer, history, message)
    return naive_template(history, message)  # type: ignore[name-defined]
# Load llama.cpp
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_batch=N_BATCH,
        n_gpu_layers=0,  # CPU Space
        verbose=False,
    )
    print("[llama] Model loaded.")
except Exception as e:
    print("[llama] Failed to load llama.cpp with the downloaded GGUF:")
    print(e)
    traceback.print_exc()

    def _error_ui():
        return gr.Markdown(f"Failed to load model: {e}")

    with gr.Blocks() as demo:
        gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
        _error_ui()

    if __name__ == "__main__":
        demo.launch()
    sys.exit(0)
# --------------------
# Gradio app
# --------------------
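# chat_fn is a generator: each yield hands Gradio the full text accumulated so
# far, which it re-renders as the assistant message to give a streaming effect.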
def chat_fn(message, history, max_new_tokens, temperature, top_p, show_reasoning):
    try:
        prompt = make_prompt(history, message)
        # Common stop markers; add the tokenizer's eos token if available
        stops = ["<|eot_id|>", "<|im_end|>", "<|end_of_text|>"]
        try:
            if tokenizer is not None and getattr(tokenizer, "eos_token", None):
                eos = tokenizer.eos_token
                if eos and eos not in stops:
                    stops.append(eos)
        except Exception:
            pass
        stream = llm(
            prompt,
            max_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            stop=stops,
            stream=True,
        )
        raw = ""
        for part in stream:
            delta = part["choices"][0]["text"]
            raw += delta
            yield raw if show_reasoning else strip_reasoning(raw)
    except Exception as e:
        err = f"[error] {type(e).__name__}: {e}"
        yield err
header_md = f"""
### DeepSeek R1 Distill (CPU, GGUF)

Loaded:
- GGUF_REPO: `{GGUF_REPO}`
- GGUF_FILE: `{GGUF_FILE}`
- Chat template from: `{MODEL_ID}`
- n_ctx={N_CTX}, n_threads={N_THREADS}, n_batch={N_BATCH}

Tip: If you see a 404/403 at startup, set GGUF_REPO/GGUF_FILE correctly and ensure HF_TOKEN has access.
"""
demo = gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[
        gr.Slider(64, 2048, value=512, step=32, label="Max new tokens"),
        gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Temperature"),
        gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p"),
        gr.Checkbox(label="Show reasoning", value=False),
    ],
    title="DeepSeek R1 Distill (CPU, GGUF)",
    description=header_md,
    examples=[
        "Prove that the sum of two even numbers is even.",
        "A train leaves at 3 PM at 60 km/h. Another at 4 PM at 80 km/h. When will the second catch up?",
    ],
)

if __name__ == "__main__":
    demo.launch()
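# Local smoke test (illustrative): export GGUF_REPO and GGUF_FILE (plus HF_TOKEN
# if the repo is gated), then run this file with Python; Gradio serves the UI on
# its default local port.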