import os
import re
import sys
import traceback
import gradio as gr

from huggingface_hub import (
    login,
    HfApi,
    hf_hub_download,
    whoami,
)
from llama_cpp import Llama
from transformers import AutoTokenizer

"""
Environment variables you can set in your Space (Settings -> Variables & secrets):

Required:
- GGUF_REPO: The Hugging Face repo that contains your .gguf files

Strongly recommended:
- GGUF_FILE: The specific .gguf filename to load from that repo.
  If omitted, a 4-bit quant is auto-selected from GGUF_REPO (Q4_K_M preferred).

Optional (recommended):
- MODEL_ID: Base model repo to pull the tokenizer/chat template from.
  Use the matching family for your quant:
  - Qwen family:  deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B or -Qwen-7B
  - Llama family: deepseek-ai/DeepSeek-R1-Distill-Llama-8B

If MODEL_ID is not set, we will attempt to guess it from GGUF_REPO.

Other optional env vars:
- HF_TOKEN: If your repo is gated/private, add this as a Space secret (read scope).
- PREFER_FAMILY: "qwen" or "llama" (only used if we need to guess MODEL_ID). Default: qwen
- PREFER_SIZE: "1.5b", "7b", or "8b" (only used if we need to guess MODEL_ID). Default: 1.5b
- N_CTX: context window (default 4096)
- N_THREADS: CPU threads (default: half your CPU cores, at least 1)
- N_BATCH: batch size (default 128)
"""

# --------------------
# Auth (optional)
# --------------------
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    try:
        login(HF_TOKEN)
        try:
            user = whoami().get("name", "ok")
            print(f"[auth] Logged into Hugging Face as: {user}")
        except Exception:
            print("[auth] Logged in (could not fetch user name).")
    except Exception as e:
        print(f"[auth] Failed to login with HF_TOKEN: {e}")

# --------------------
# Config / Defaults
# --------------------
GGUF_REPO = os.getenv("GGUF_REPO", "").strip()
GGUF_FILE = os.getenv("GGUF_FILE", "").strip()

PREFER_FAMILY = os.getenv("PREFER_FAMILY", "qwen").lower()
PREFER_SIZE = os.getenv("PREFER_SIZE", "1.5b").lower()

# Runtime knobs
def _default_threads():
    try:
        cores = os.cpu_count() or 2
        return max(1, cores // 2)  # be gentle on free CPU
    except Exception:
        return 1

N_CTX = int(os.getenv("N_CTX", "4096"))
N_THREADS = int(os.getenv("N_THREADS", str(_default_threads())))
N_BATCH = int(os.getenv("N_BATCH", "128"))

# --------------------
# Helpers
# --------------------
api = HfApi()


def repo_exists(repo_id: str) -> bool:
    try:
        api.model_info(repo_id)
        return True
    except Exception:
        return False


def pick_q4_file(repo_id: str) -> str:
    """Choose a reasonable 4-bit GGUF from a repo (prefer Q4_K_M, then Q4_0)."""
    info = api.model_info(repo_id)
    ggufs = [s.rfilename for s in info.siblings if s.rfilename.lower().endswith(".gguf")]

    # Prefer Q4_K_M, then any Q4, then Q3 as last resort
    priority = []
    for f in ggufs:
        fl = f.lower()
        score = 0
        if "q4_k_m" in fl:
            score = 100
        elif "q4_k_s" in fl or "q4_k_l" in fl or "q4_k" in fl:
            score = 95
        elif "q4_0" in fl or "q4" in fl:
            score = 90
        elif "q3_k_m" in fl or "q3" in fl:
            score = 70
        else:
            score = 10
        priority.append((score, f))

    if not priority:
        raise FileNotFoundError(f"No .gguf files found in {repo_id}")

    priority.sort(reverse=True, key=lambda x: x[0])
    chosen = priority[0][1]
    return chosen


def guess_model_id_from_repo(repo_id: str) -> str:
    """Guess a matching tokenizer/chat-template model based on the GGUF repo name."""
    rid = repo_id.lower()
    # Family: trust the repo name first, then fall back to PREFER_FAMILY.
    if "llama" in rid or ("qwen" not in rid and PREFER_FAMILY == "llama"):
        return "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
    # Qwen family. Size: trust the repo name first, then fall back to PREFER_SIZE.
    if "1.5" in rid or "1_5" in rid:
        return "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    if "7b" in rid or PREFER_SIZE == "7b":
        return "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
    # Fallback
    return "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"


def ensure_model_source(repo_id: str | None, filename: str | None) -> tuple[str, str]:
    """
    Ensure we have a valid GGUF repo + file.
    - If both provided, verify they exist.
    - If only repo provided, pick a reasonable Q4 file.
    - If none provided, raise with a helpful message.
    """
    if repo_id and filename:
        try:
            info = api.model_info(repo_id)  # raises if missing or no access
        except Exception as e:
            raise FileNotFoundError(
                f"Repo not accessible: {repo_id}\n{e}\n"
                "Check the repo id spelling, your HF token, and license access."
            )
        # Now check the file exists in the repo (reuse the info we just fetched)
        files = {s.rfilename for s in info.siblings}
        if filename not in files:
            # Try case-insensitive match
            lower_map = {s.rfilename.lower(): s.rfilename for s in info.siblings}
            if filename.lower() in lower_map:
                filename = lower_map[filename.lower()]
            else:
                raise FileNotFoundError(
                    f"File not found in repo: {filename}\n"
                    f"Available gguf files: {[f for f in files if f.lower().endswith('.gguf')]}"
                )
        return repo_id, filename

    if repo_id and not filename:
        return repo_id, pick_q4_file(repo_id)

    raise ValueError(
        "No GGUF_REPO/GGUF_FILE provided. Set them in your Space Variables.\n"
        "Examples you can try (you must verify these exist and accept access if gated):\n"
        "  - GGUF_REPO = TheBloke/DeepSeek-R1-Distill-Qwen-7B-GGUF\n"
        "    GGUF_FILE = deepseek-r1-distill-qwen-7b.Q4_K_M.gguf\n"
        "  - GGUF_REPO = bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF\n"
        "    GGUF_FILE = deepseek-r1-distill-qwen-1.5b.Q4_K_M.gguf\n"
        "  - GGUF_REPO = MaziyarPanahi/DeepSeek-R1-Distill-Llama-8B-GGUF\n"
        "    GGUF_FILE = deepseek-r1-distill-llama-8b.Q4_K_M.gguf\n"
    )


def build_tokenizer(model_id: str) -> AutoTokenizer:
    print(f"[tokenizer] Loading tokenizer/chat template from {model_id}")
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    return tok


def apply_template(tokenizer: AutoTokenizer, history, message: str) -> str:
    # history: list of [user, assistant]
    msgs = []
    for u, a in history:
        if u:
            msgs.append({"role": "user", "content": u})
        if a:
            msgs.append({"role": "assistant", "content": a})
    msgs.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=True
    )


def strip_reasoning(text: str) -> str:
    # Hide DeepSeek-style reasoning if present. R1-distill models typically wrap
    # their chain of thought in <think>...</think>; also strip the
    # <|begin_of_thought|>...<|end_of_thought|> variant just in case.
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    text = re.sub(
        r"<\|begin_of_thought\|>.*?<\|end_of_thought\|>",
        "",
        text,
        flags=re.DOTALL,
    )
    return text


# --------------------
# Resolve model + file
# --------------------
try:
    GGUF_REPO, GGUF_FILE = ensure_model_source(GGUF_REPO, GGUF_FILE)
    print(f"[gguf] Using repo: {GGUF_REPO}")
    print(f"[gguf] Using file: {GGUF_FILE}")
except Exception as e:
    # Fail fast with a clear error; Gradio will show logs
    print("[startup] Failed to resolve GGUF model source:")
    print(e)
    traceback.print_exc()
    # Provide a minimal dummy UI to show the error instead of crashing Space build
    def _error_ui():
        return gr.Markdown(
            f"Cannot start: {e}\n\n"
            "Go to Settings → Variables and set GGUF_REPO and GGUF_FILE to a valid GGUF."
        )
    with gr.Blocks() as demo:
        gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
        _error_ui()
    if __name__ == "__main__":
        demo.launch()
    sys.exit(0)

# Guess MODEL_ID if not provided
MODEL_ID = os.getenv("MODEL_ID", "").strip()
if not MODEL_ID:
    MODEL_ID = guess_model_id_from_repo(GGUF_REPO)

# --------------------
# Download and load
# --------------------
try:
    # Download exact file; raises if not found or no access
    print(f"[download] Fetching {GGUF_FILE} from {GGUF_REPO} ...")
    model_path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
    print(f"[download] File ready at: {model_path}")
except Exception as e:
    print("[download] Failed to download the GGUF file:")
    print(e)
    traceback.print_exc()
    # Same graceful error UI
    def _error_ui():
        return gr.Markdown(
            f"Download failed: {e}\n\n"
            "Check that GGUF_REPO and GGUF_FILE are correct and your HF_TOKEN has access."
        )
    with gr.Blocks() as demo:
        gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
        _error_ui()
    if __name__ == "__main__":
        demo.launch()
    sys.exit(0)

# Load tokenizer for chat template
try:
    tokenizer = build_tokenizer(MODEL_ID)
except Exception as e:
    print("[tokenizer] Failed to load tokenizer/chat template:")
    print(e)
    traceback.print_exc()
    # Still try to continue with a naive prompt if tokenizer fails
    tokenizer = None
    def naive_template(history, message):
        # Simple ChatML-like format
        parts = []
        for u, a in history:
            if u:
                parts.append(f"<|im_start|>user\n{u}\n<|im_end|>")
            if a:
                parts.append(f"<|im_start|>assistant\n{a}\n<|im_end|>")
        parts.append(f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n")
        return "\n".join(parts)

def make_prompt(history, message):
    if tokenizer is not None:
        return apply_template(tokenizer, history, message)
    return naive_template(history, message)  # type: ignore[name-defined]

# Load llama.cpp
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_batch=N_BATCH,
        n_gpu_layers=0,  # CPU Space
        verbose=False,
    )
    print("[llama] Model loaded.")
except Exception as e:
    print("[llama] Failed to load llama.cpp with the downloaded GGUF:")
    print(e)
    traceback.print_exc()
    def _error_ui():
        return gr.Markdown(f"Failed to load model: {e}")
    with gr.Blocks() as demo:
        gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
        _error_ui()
    if __name__ == "__main__":
        demo.launch()
    sys.exit(0)

# --------------------
# Gradio app
# --------------------
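# NOTE: gr.ChatInterface supplies `history` as a list of [user, assistant] pairs
# (its default tuple format); make_prompt/apply_template above rely on that shape.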
def chat_fn(message, history, max_new_tokens, temperature, top_p, show_reasoning):
    try:
        prompt = make_prompt(history, message)
        # Common stop markers; eos from tokenizer if available
        stops = ["<|eot_id|>", "<|im_end|>", "<|end_of_text|>"]
        try:
            if tokenizer is not None and getattr(tokenizer, "eos_token", None):
                eos = tokenizer.eos_token
                if eos and eos not in stops:
                    stops.append(eos)
        except Exception:
            pass

        stream = llm(
            prompt,
            max_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            stop=stops,
            stream=True,
        )
        raw = ""
        for part in stream:
            delta = part["choices"][0]["text"]
            raw += delta
            yield raw if show_reasoning else strip_reasoning(raw)
    except Exception as e:
        err = f"[error] {type(e).__name__}: {e}"
        yield err

header_md = f"""
### DeepSeek R1 Distill (CPU, GGUF)
Loaded:
- GGUF_REPO: `{GGUF_REPO}`
- GGUF_FILE: `{GGUF_FILE}`
- Chat template from: `{MODEL_ID}`
- n_ctx={N_CTX}, n_threads={N_THREADS}, n_batch={N_BATCH}

Tip: If you see a 404/403 at startup, set GGUF_REPO/GGUF_FILE correctly and ensure HF_TOKEN has access.
"""

demo = gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[
        gr.Slider(64, 2048, value=512, step=32, label="Max new tokens"),
        gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Temperature"),
        gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p"),
        gr.Checkbox(label="Show reasoning", value=False),
    ],
    title="DeepSeek R1 Distill (CPU, GGUF)",
    description=header_md,
    examples=[
        "Prove that the sum of two even numbers is even.",
        "A train leaves at 3 PM at 60 km/h. Another at 4 PM at 80 km/h. When will the second catch up?",
    ],
)

if __name__ == "__main__":
    demo.launch()