Spaces:

Cardiosense-AG
/

ai_econsult_demo

Paused

App Files Files Community

Cardiosense-AG committed on Nov 4

Commit

4bb50d6

verified ·

1 Parent(s): ced1708

Update src/model_loader.py

Browse files

Files changed (1) hide show

src/model_loader.py +93 -134

src/model_loader.py CHANGED Viewed

@@ -1,186 +1,145 @@
 # src/model_loader.py
 from __future__ import annotations
-import json
 import os
 import time
 from functools import lru_cache
-from typing import List, Dict, Tuple
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
-try:
-    from transformers import BitsAndBytesConfig
-    _HAS_BNB = True
-except Exception:
-    _HAS_BNB = False
-from .paths import hf_cache_dir
-_LOG_PREFIX = "[model_loader]"
-def _env_flag(name: str) -> bool:
     # True when environment variable `name`, stripped and lower-cased, is one of
     # "1", "true", "yes", "on". An unset variable is read as "" and yields False.
-    return os.getenv(name, "").strip().lower() in {"1", "true", "yes", "on"}
-def _select_model() -> Tuple[str, bool, bool]:
-    """
-    Returns (model_id, use_cuda, use_4bit)
-    - Primary when CUDA available
-    - Fallback to CPU model otherwise or when FORCE_CPU_LLM=1
-    """
     # Model ids are read from MODEL_ID / MODEL_FALLBACK_ID; the literals are the defaults.
-    primary = os.getenv("MODEL_ID", "google/medgemma-27b-text-it")
-    fallback = os.getenv("MODEL_FALLBACK_ID", "google/medgemma-4b-it")
-    force_cpu = _env_flag("FORCE_CPU_LLM")
     # 4-bit is requested only when QUANT_MODE (default "4bit") lower-cases to "4bit".
-    quant4 = (os.getenv("QUANT_MODE", "4bit").lower() == "4bit")
-    if not force_cpu and torch.cuda.is_available():
         # 4-bit is reported/returned as enabled only if the BitsAndBytesConfig
         # import succeeded (_HAS_BNB).
-        print(f"{_LOG_PREFIX} CUDA available. Selecting primary model: {primary} (4-bit={quant4 and _HAS_BNB})")
-        return primary, True, (quant4 and _HAS_BNB)
     else:
         # CPU path never uses CUDA or 4-bit, regardless of QUANT_MODE.
-        print(f"{_LOG_PREFIX} Using CPU fallback model: {fallback}")
-        return fallback, False, False
 @lru_cache(maxsize=1)
 def _load_tokenizer(model_id: str):
     # Loads the tokenizer for `model_id` from the directory returned by hf_cache_dir().
     # lru_cache(maxsize=1) keeps only the result for the most recently requested model_id.
-    cache = str(hf_cache_dir())
-    tok = AutoTokenizer.from_pretrained(
-        model_id,
-        cache_dir=cache,
-        use_fast=True,
-        trust_remote_code=True,
-    )
     # When the tokenizer defines no pad token id but has an EOS id, reuse EOS as pad.
-    if tok.pad_token_id is None and tok.eos_token_id is not None:
-        tok.pad_token = tok.eos_token
-    print(f"{_LOG_PREFIX} Tokenizer loaded: {model_id}  | cache={cache}")
     return tok
-@lru_cache(maxsize=1)
-def _load_model(model_id: str, use_cuda: bool, use_4bit: bool):
     # Loads the causal LM for `model_id`, then logs the quantization mode and load time.
     # lru_cache(maxsize=1) keeps only the result for the most recent argument triple.
-    cache = str(hf_cache_dir())
-    t0 = time.perf_counter()
     # Quantized path: requires CUDA, a 4-bit request, and a successful
     # BitsAndBytesConfig import (_HAS_BNB).
-    if use_cuda and use_4bit and _HAS_BNB:
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_use_double_quant=True,
-        )
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            cache_dir=cache,
-            device_map="auto",
-            torch_dtype=torch.bfloat16,
-            quantization_config=bnb_config,
-            trust_remote_code=True,
-        )
-        quant_txt = "4-bit (bnb, nf4)"
     else:
-        # CPU or non-quantized path
         # With CUDA: device_map="auto" and bfloat16. Without: everything mapped to
         # "cpu" in float32.
-        device_map = "auto" if use_cuda else {"": "cpu"}
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            cache_dir=cache,
-            device_map=device_map,
-            torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
-            low_cpu_mem_usage=True,
-            trust_remote_code=True,
-        )
-        quant_txt = "none"
-    dt = time.perf_counter() - t0
-    print(f"{_LOG_PREFIX} Model loaded: {model_id}  | quant={quant_txt}  | time={dt:.2f}s")
     return model
-def _format_messages(tokenizer, messages: List[Dict[str, str]]):
-    """
-    Bypass chat template and build a simple instruction prompt.
-    This avoids models that output placeholder JSON schemas.
-    """
     # Only "system" and "user" messages are used; every other role is ignored.
     # Messages of the same role are joined with newlines, in input order.
     # Each message must carry both "role" and "content" keys (indexed directly).
-    sys_text = "\n".join([m["content"] for m in messages if m["role"] == "system"])
-    usr_text = "\n".join([m["content"] for m in messages if m["role"] == "user"])
-    prompt = (
-        f"{sys_text.strip()}\n\n"
-        f"---\n"
-        f"{usr_text.strip()}\n\n"
-        f"Respond only with valid JSON for the SOAP draft as described above."
-    )
     # Returns just the "input_ids" entry of the tokenizer output; the other
     # entries of that output are not passed on.
-    tokens = tokenizer(prompt, return_tensors="pt")
-    return tokens["input_ids"]
-def _stub_json_response() -> str:
     """
-    Deterministic JSON for end-to-end UI tests.
     """
     # Fixed payload with four keys: "subjective" and "objective" are strings,
     # "assessment" and "plan" are lists of strings. No input influences the result.
-    obj = {
-        "subjective": "Patient reports intermittent exertional chest tightness for 2 months, no rest pain.",
-        "objective": "BP 132/78, HR 72, BMI 29. No murmurs. LDL 155 mg/dL, A1C 7.8%. eGFR 52.",
-        "assessment": [
-            "Stable angina symptoms with ASCVD risk factors (DM2, hyperlipidemia).",
-            "No red flags on history/exam today."
-        ],
-        "plan": [
-            "Start/continue high‑intensity statin; consider ezetimibe if LDL >70 on maximally tolerated statin.",
-            "Low‑dose aspirin for secondary prevention if established ASCVD; otherwise not routine for primary prevention.",
-            "Cardiology referral if symptoms persist or worsen; consider stress testing."
-        ]
-    }
     # ensure_ascii=False: non-ASCII characters in the text are emitted as-is
     # instead of \u escapes.
-    return json.dumps(obj, ensure_ascii=False)
 def generate_chat(
     messages: List[Dict[str, str]],
     *,
-    max_new_tokens: int = 512,
     temperature: float = 0.2,
     top_p: float = 0.95,
 ) -> str:
-    """
-    Main text generation entry point.
-    - Honors E2E_STUB=1 for deterministic JSON (no model load).
-    - Otherwise loads tokenizer/model (GPU-first, CPU fallback) and generates.
-    """
     # The stub check runs first, so neither tokenizer nor model is touched
     # when E2E_STUB is enabled.
-    if _env_flag("E2E_STUB"):
-        print(f"{_LOG_PREFIX} E2E_STUB=1 — returning deterministic JSON without model load.")
-        return _stub_json_response()
-    model_id, use_cuda, use_4bit = _select_model()
     tok = _load_tokenizer(model_id)
-    model = _load_model(model_id, use_cuda, use_4bit)
     # _format_messages returns only the input_ids tensor; move it to the model's device.
-    inputs = _format_messages(tok, messages)
-    input_ids = inputs.to(model.device)
     # Sampling is always enabled; temperature and top_p come straight from the caller.
-    gen_cfg = dict(
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
         top_p=top_p,
         eos_token_id=tok.eos_token_id,
-        pad_token_id=tok.pad_token_id,
     )
-    t0 = time.perf_counter()
     with torch.no_grad():
-        output_ids = model.generate(
-            input_ids=input_ids,
-            **gen_cfg,
-        )
-    dt = time.perf_counter() - t0
-    # Return only the newly generated tokens
     # Slicing by the prompt's token length drops the echoed prompt from the first sequence.
-    generated = output_ids[0, input_ids.shape[-1]:]
-    text = tok.decode(generated, skip_special_tokens=True)
     # The logged count is the actual number of generated token ids.
-    print(f"{_LOG_PREFIX} Generated {generated.shape[-1]} tokens in {dt:.2f}s (temp={temperature}, top_p={top_p})")
-    return text

 # src/model_loader.py
+# -----------------------------------------------------------------------------
+# Why this change
+# -----------------------------------------------------------------------------
+# - Fix fallback model id → 'google/medgemma-4b-text-it' (previous typo caused
+#   CPU-only runs to fail).
+# - Keep primary on GPU in 4-bit (bnb, nf4) when available; otherwise fallback.
+# - Provide a single generate_chat(messages, **gen_kwargs) entry point with
+#   consistent logging and without relying on chat templates (manual prompt).
+# - Lightweight logs show model choice, cache path, and generation time.
+# -----------------------------------------------------------------------------
 from __future__ import annotations
 import os
 import time
 from functools import lru_cache
+from typing import Dict, List
 import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+)
+HF_CACHE = os.environ.get("HF_HOME") or os.environ.get("TRANSFORMERS_CACHE") or "/data/econsult/hf_cache"
 # HF_CACHE (above): first non-empty of HF_HOME, TRANSFORMERS_CACHE, then the hard-coded
 # path; because of the `or` chain an empty-string variable falls through to the next option.
 # The two model ids below are overridable through the same-named environment variables.
 # NOTE(review): the pre-commit code read the primary id from MODEL_ID; MODEL_PRIMARY_ID
 # is a different variable name — confirm deployments that still set MODEL_ID are updated.
+MODEL_PRIMARY_ID = os.environ.get("MODEL_PRIMARY_ID", "google/medgemma-27b-text-it")
+MODEL_FALLBACK_ID = os.environ.get("MODEL_FALLBACK_ID", "google/medgemma-4b-text-it")  # <-- fixed
+def _pick_device_and_quant() -> Dict[str, object]:
     # Builds the keyword arguments that _load_model splats into
     # AutoModelForCausalLM.from_pretrained(). A fresh dict is returned on every call.
     # The "quantization_config" key is present only on the CUDA branch.
+    cuda = torch.cuda.is_available()
+    if cuda:
+        # Prefer 4-bit NF4 on GPU for the primary model
+        quant = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+        )
+        return {"device_map": "auto", "quantization_config": quant, "torch_dtype": torch.bfloat16}
     else:
+        # CPU path: no bnb quantization. Load smaller fallback model in fp32.
         # This function only chooses load options; which model id is used is
         # decided by the caller.
+        return {"device_map": "auto", "torch_dtype": torch.float32}
 @lru_cache(maxsize=1)
 def _load_tokenizer(model_id: str):
     # Loads the tokenizer for `model_id` using HF_CACHE as cache_dir.
     # lru_cache(maxsize=1) keeps only the result for the most recently requested model_id.
     # Nothing is logged here and no pad token is assigned; generate_chat passes
     # pad_token_id explicitly when generating.
+    tok = AutoTokenizer.from_pretrained(model_id, cache_dir=HF_CACHE, trust_remote_code=True)
     return tok
@lru_cache(maxsize=2)
def _load_model(model_id: str, use_quant: bool):
    """Load the causal LM for ``model_id`` and log how it was loaded.

    Results are memoised per ``(model_id, use_quant)`` pair (two entries at most).

    Args:
        model_id: Hugging Face model id passed to ``from_pretrained``.
        use_quant: Request 4-bit (bnb, nf4) loading. It is honoured only when
            CUDA is available; previously this flag was accepted but ignored,
            so ``use_quant=False`` on a GPU host still loaded in 4-bit.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    opts = _pick_device_and_quant()
    # Quantize only when the caller asked for it AND the options actually carry
    # a quantization config (which _pick_device_and_quant supplies on CUDA only).
    quantized = bool(use_quant) and torch.cuda.is_available() and "quantization_config" in opts
    if not quantized:
        opts.pop("quantization_config", None)

    # perf_counter is monotonic, so the reported duration cannot go negative
    # if the wall clock is adjusted during a long download/load.
    t0 = time.perf_counter()
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        cache_dir=HF_CACHE,
        trust_remote_code=True,
        **opts,
    )
    dt = time.perf_counter() - t0

    if quantized:
        print(f"[model_loader] Model loaded: {model_id}  | quant=4-bit (bnb, nf4)  | time={dt:.2f}s")
    else:
        dtype = "fp32" if opts.get("torch_dtype") == torch.float32 else str(opts.get("torch_dtype"))
        print(f"[model_loader] Model loaded: {model_id}  | dtype={dtype}  | time={dt:.2f}s")
    return model
+def _select_ids() -> str:
     # Returns MODEL_PRIMARY_ID when CUDA is available, MODEL_FALLBACK_ID otherwise,
     # and prints the choice on every call.
     # The "(4-bit=True)" / "(CPU)" suffixes are fixed text in the log message,
     # not values read back from a loaded model.
+    # Prefer primary if CUDA; otherwise fallback
+    if torch.cuda.is_available():
+        print(f"[model_loader] CUDA available. Selecting primary model: {MODEL_PRIMARY_ID} (4-bit=True)")
+        return MODEL_PRIMARY_ID
+    else:
+        print(f"[model_loader] CUDA not available. Selecting fallback model: {MODEL_FALLBACK_ID} (CPU)")
+        return MODEL_FALLBACK_ID
+def _build_prompt(messages: List[Dict[str, str]]) -> str:
     """
+    Manual prompt (avoid chat templates). We keep it simple and instructive.
     """
+    sys = ""
+    turns = []
+    for m in messages:
+        role = m.get("role", "user")
+        content = m.get("content", "")
+        if role == "system":
+            sys = content.strip()
+        elif role == "user":
+            turns.append(f"User: {content.strip()}")
+        elif role == "assistant":
+            turns.append(f"Assistant: {content.strip()}")
+    prompt = (sys + "\n\n" if sys else "") + "\n".join(turns) + "\nAssistant:"
+    return prompt
 def generate_chat(
     messages: List[Dict[str, str]],
     *,
+    max_new_tokens: int = 700,
     temperature: float = 0.2,
     top_p: float = 0.95,
 ) -> str:
+    model_id = _select_ids()
     tok = _load_tokenizer(model_id)
+    model = _load_model(model_id, use_quant=torch.cuda.is_available())
+    prompt = _build_prompt(messages)
+    inputs = tok(prompt, return_tensors="pt").to(model.device)
+    gen_kwargs = dict(
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
         top_p=top_p,
+        pad_token_id=tok.eos_token_id,
         eos_token_id=tok.eos_token_id,
     )
+    t0 = time.time()
     with torch.no_grad():
+        out = model.generate(**inputs, **gen_kwargs)
+    dt = time.time() - t0
+    text = tok.decode(out[0], skip_special_tokens=True)
+    # Strip the prompt
+    generated = text[len(prompt) :].strip()
+    print(f"[model_loader] Generated {max_new_tokens} tokens in {dt:.2f}s (temp={temperature}, top_p={top_p})")
+    print(f"[model_loader] Tokenizer loaded: {model_id}  | cache={HF_CACHE}")
+    return generated