johnbridges committed
Commit 849364d · 1 Parent(s): 1d79762
Files changed (1)
  1. hf_backend.py +27 -15
hf_backend.py CHANGED
@@ -45,6 +45,32 @@ def _pick_cpu_dtype() -> torch.dtype:
     return torch.float32
 
 
+# ---------------- global cache ----------------
+_MODEL_CACHE: dict[tuple[str, torch.dtype], AutoModelForCausalLM] = {}
+
+
+def _get_model(device: str, dtype: torch.dtype):
+    key = (device, dtype)
+    if key in _MODEL_CACHE:
+        return _MODEL_CACHE[key]
+
+    cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
+    if hasattr(cfg, "quantization_config"):
+        logger.warning("Removing quantization_config from model config")
+        delattr(cfg, "quantization_config")  # delete instead of setting None
+
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        config=cfg,
+        torch_dtype=dtype,
+        trust_remote_code=True,
+        device_map="auto" if device != "cpu" else {"": "cpu"},
+    )
+    model.eval()
+    _MODEL_CACHE[key] = model
+    return model
+
+
 # ---------------- Chat Backend ----------------
 class HFChatBackend(ChatBackend):
     async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
@@ -66,21 +92,7 @@ class HFChatBackend(ChatBackend):
             logger.debug("Injected X-IP-Token into ZeroGPU headers")
 
         def _run_once(prompt: str, device: str, dtype: torch.dtype) -> str:
-            # Load config and strip any quantization settings (fix FP8 issue)
-            cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
-            if hasattr(cfg, "quantization_config"):
-                logger.warning("Removing quantization_config from model config")
-                cfg.quantization_config = None
-
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_ID,
-                config=cfg,
-                torch_dtype=dtype,
-                trust_remote_code=True,
-                device_map="auto" if device != "cpu" else {"": "cpu"},
-            )
-            model.eval()
-
+            model = _get_model(device, dtype)
             inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
             with torch.inference_mode():
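
The net effect of the commit is that the AutoModelForCausalLM.from_pretrained call is memoized on a (device, dtype) key, so repeated calls to _run_once reuse the already-loaded model instead of reloading it on every request. A minimal, self-contained sketch of the same pattern, with a dummy load_model standing in for the real (expensive) loader — the names below are illustrative only and are not part of hf_backend.py:

    from typing import Any, Dict, Tuple

    _CACHE: Dict[Tuple[str, str], Any] = {}

    def load_model(device: str, dtype: str) -> Any:
        # Stand-in for the expensive from_pretrained call.
        print(f"loading for {device}/{dtype}")
        return object()

    def get_model(device: str, dtype: str) -> Any:
        key = (device, dtype)
        if key not in _CACHE:           # first call pays the load cost
            _CACHE[key] = load_model(device, dtype)
        return _CACHE[key]              # later calls return the cached object

    assert get_model("cpu", "float32") is get_model("cpu", "float32")

The other behavioural change is delattr(cfg, "quantization_config") in place of assigning None: code that gates on hasattr(cfg, "quantization_config") still sees the attribute when it is merely set to None, whereas deleting it makes that check fail as intended.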