Commit · d76b941
Parent(s): d279e64

hf_backend.py  CHANGED  (+35 -22)
@@ -1,6 +1,7 @@
 # hf_backend.py
 import time, logging
-from …
+from contextlib import nullcontext
+from typing import Any, Dict, AsyncIterable, Tuple

 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

@@ -33,6 +34,7 @@ except Exception as e:

 # ---------------- helpers ----------------
 def _pick_cpu_dtype() -> torch.dtype:
+    # Prefer BF16 if CPU supports it
     if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported"):
         try:
             if torch.cpu.is_bf16_supported():

@@ -45,19 +47,22 @@ def _pick_cpu_dtype() -> torch.dtype:


 # ---------------- global cache ----------------
-_MODEL_CACHE: …
+_MODEL_CACHE: Dict[tuple[str, torch.dtype], AutoModelForCausalLM] = {}


-def _get_model(device: str, dtype: torch.dtype):
-    …
-    …
-    …
+def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
+    # Return model and the effective dtype actually loaded with
+    # (handles CPU BF16 -> FP32 fallback)
+    effective_key = (device, dtype)
+    if effective_key in _MODEL_CACHE:
+        return _MODEL_CACHE[effective_key], dtype

     cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
     if hasattr(cfg, "quantization_config"):
         logger.warning("Removing quantization_config from model config")
-        delattr(cfg, "quantization_config")
+        delattr(cfg, "quantization_config") # delete instead of setting None

+    eff_dtype = dtype
     try:
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,

@@ -69,20 +74,20 @@ def _get_model(device: str, dtype: torch.dtype):
     except Exception as e:
         if device == "cpu" and dtype == torch.bfloat16:
             logger.warning(f"BF16 load failed on CPU: {e}. Retrying with FP32.")
+            eff_dtype = torch.float32
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 config=cfg,
-                torch_dtype=…
+                torch_dtype=eff_dtype,
                 trust_remote_code=True,
                 device_map={"": "cpu"},
             )
-            dtype = torch.float32
         else:
             raise

     model.eval()
-    _MODEL_CACHE[(device, …
-    return model
+    _MODEL_CACHE[(device, eff_dtype)] = model
+    return model, eff_dtype


 # ---------------- Chat Backend ----------------

@@ -105,7 +110,7 @@ class HFChatBackend(ChatBackend):
             logger.debug("Injected X-IP-Token into ZeroGPU headers")

         # Build prompt using chat template if available
-        if hasattr(tokenizer, "apply_chat_template") and tokenizer…
+        if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
             try:
                 prompt = tokenizer.apply_chat_template(
                     messages,

@@ -119,15 +124,20 @@ class HFChatBackend(ChatBackend):
         else:
             prompt = messages[-1]["content"] if messages else "(empty)"

-        def _run_once(prompt: str, device: str, …
-            model = _get_model(device, …
-            …
+        def _run_once(prompt: str, device: str, req_dtype: torch.dtype) -> str:
+            model, eff_dtype = _get_model(device, req_dtype)
+
+            inputs = tokenizer(prompt, return_tensors="pt")
+            inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}

             with torch.inference_mode():
                 if device != "cpu":
-                    autocast_ctx = torch.autocast(device_type=device, dtype=…
+                    autocast_ctx = torch.autocast(device_type=device, dtype=eff_dtype)
                 else:
-                    …
+                    if eff_dtype == torch.bfloat16:
+                        autocast_ctx = torch.cpu.amp.autocast(dtype=torch.bfloat16)
+                    else:
+                        autocast_ctx = nullcontext()

                 with autocast_ctx:
                     outputs = model.generate(

@@ -135,21 +145,24 @@ class HFChatBackend(ChatBackend):
                         max_new_tokens=max_tokens,
                         temperature=temperature,
                         do_sample=True,
+                        use_cache=True,
                     )

             return tokenizer.decode(outputs[0], skip_special_tokens=True)

         if spaces:
-            # …
+            # Always dispatch via ZeroGPU decorator if available.
             @spaces.GPU(duration=120)
             def run_once(prompt: str) -> str:
-                …
+                if torch.cuda.is_available():
+                    return _run_once(prompt, device="cuda", req_dtype=torch.float16)
+                # Fallback to CPU inside the GPU context if CUDA is unavailable
+                return _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())

             text = run_once(prompt)
         else:
-            # …
-            …
-            text = _run_once(prompt, device="cpu", dtype=dtype)
+            # CPU-only runtime
+            text = _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())

         yield {
             "id": rid,
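The caching and dtype-fallback pattern introduced in _get_model is easier to trace in isolation. The sketch below is not part of the commit: it swaps AutoModelForCausalLM.from_pretrained for a hypothetical stub loader (_load_stub, with a local _CACHE and get_model) so the control flow runs without downloading a model. It also makes one consequence of the committed code visible: the cache is keyed by the effective dtype, so after a BF16 -> FP32 fallback a repeated (cpu, bfloat16) request misses the cache and retries the BF16 load.

# Minimal sketch (assumed names, not the commit's code) of the
# (device, dtype) cache plus CPU BF16 -> FP32 fallback used by _get_model.
from typing import Dict, Tuple

import torch

_CACHE: Dict[Tuple[str, torch.dtype], object] = {}


def _load_stub(device: str, dtype: torch.dtype):
    # Hypothetical stand-in for AutoModelForCausalLM.from_pretrained(...):
    # pretend BF16 is unsupported on CPU so the fallback path is exercised.
    if device == "cpu" and dtype == torch.bfloat16:
        raise RuntimeError("BF16 load failed (simulated)")
    return f"model[{device}/{dtype}]"


def get_model(device: str, dtype: torch.dtype) -> Tuple[object, torch.dtype]:
    key = (device, dtype)
    if key in _CACHE:
        return _CACHE[key], dtype

    eff_dtype = dtype
    try:
        model = _load_stub(device, dtype)
    except RuntimeError:
        if device == "cpu" and dtype == torch.bfloat16:
            eff_dtype = torch.float32  # remember what was actually loaded
            model = _load_stub(device, eff_dtype)
        else:
            raise

    # Cache under the *effective* dtype, mirroring _MODEL_CACHE[(device, eff_dtype)]
    _CACHE[(device, eff_dtype)] = model
    return model, eff_dtype


print(get_model("cpu", torch.bfloat16))  # ('model[cpu/torch.float32]', torch.float32) via fallback
print(get_model("cpu", torch.bfloat16))  # falls back again: stored only under the float32 key
print(get_model("cpu", torch.float32))   # cache hit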