johnbridges committed on
Commit 7471f75 · 1 Parent(s): 7f43efb
Files changed (1)
  1. hf_backend.py +57 -65
hf_backend.py CHANGED
@@ -1,6 +1,6 @@
-# hf_backend.py
-import time, logging, os, contextlib
-from typing import Any, Dict, AsyncIterable, List
+# hf_backend.py (patched)
+import time, logging, os
+from typing import Any, Dict, AsyncIterable
 
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -9,23 +9,21 @@ from config import settings
 
 try:
     import spaces
+    from spaces.zero.client import SpaceZeroClient
 except ImportError:
-    spaces = None
+    spaces, SpaceZeroClient = None, None
 
 logger = logging.getLogger(__name__)
 
-# --- Load model/tokenizer on CPU at import time (ZeroGPU safe) ---
 MODEL_ID = settings.LlmHFModelID or "Qwen/Qwen2.5-1.5B-Instruct"
 logger.info(f"Loading {MODEL_ID} on CPU at startup (ZeroGPU safe)...")
 
-tokenizer = None
-model = None
-load_error = None
+tokenizer, model, load_error = None, None, None
 try:
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.float32,  # CPU-safe default
+        torch_dtype=torch.float32,
         trust_remote_code=True,
     )
     model.eval()
@@ -34,11 +32,7 @@ except Exception as e:
     logger.exception(load_error)
 
 
-# --- Device helpers ---
 def pick_device() -> str:
-    forced = os.getenv("FORCE_DEVICE", "").lower().strip()
-    if forced in {"cpu", "cuda", "mps"}:
-        return forced
     if torch.cuda.is_available():
         return "cuda"
     if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
@@ -54,7 +48,6 @@ def pick_dtype(device: str) -> torch.dtype:
     return torch.float32
 
 
-# --- Backend class ---
 class HFChatBackend(ChatBackend):
     async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
         if load_error:
@@ -68,55 +61,54 @@ class HFChatBackend(ChatBackend):
         rid = f"chatcmpl-hf-{int(time.time())}"
         now = int(time.time())
 
-        if spaces:
-            @spaces.GPU(duration=120)  # allow longer run
-            def run_once(prompt: str) -> str:
-                device = pick_device()
-                dtype = pick_dtype(device)
-
-                # Move model to GPU if needed
-                model.to(device=device, dtype=dtype).eval()
-
-                inputs = tokenizer(prompt, return_tensors="pt").to(device)
-                with torch.inference_mode(), torch.autocast(device_type=device, dtype=dtype):
-                    outputs = model.generate(
-                        **inputs,
-                        max_new_tokens=max_tokens,
-                        temperature=temperature,
-                        do_sample=True,
-                    )
-                return tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # --- ✅ Extract X-IP-Token from RabbitMQ message
+        x_ip_token = request.get("x_ip_token")
+        headers = {}
+        if x_ip_token:
+            headers["X-IP-Token"] = x_ip_token
+            logger.info("Using X-IP-Token from request for ZeroGPU attribution")
+
+        def _gpu_inference_fn(prompt: str) -> str:
+            device = pick_device()
+            dtype = pick_dtype(device)
+            model.to(device=device, dtype=dtype).eval()
+
+            inputs = tokenizer(prompt, return_tensors="pt").to(device)
+            with torch.inference_mode(), torch.autocast(device_type=device, dtype=dtype):
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    do_sample=True,
+                )
+            return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        if spaces and SpaceZeroClient:
+            # Use a custom SpaceZeroClient with headers
+            client = SpaceZeroClient(headers=headers or None)
+            try:
+                text = await client.run(_gpu_inference_fn, args=[prompt], duration=120)
+            except Exception:
+                logger.exception("HF inference (ZeroGPU) failed")
+                raise
         else:
-            def run_once(prompt: str) -> str:
-                inputs = tokenizer(prompt, return_tensors="pt")
-                with torch.inference_mode():
-                    outputs = model.generate(
-                        **inputs,
-                        max_new_tokens=max_tokens,
-                        temperature=temperature,
-                        do_sample=True,
-                    )
-                return tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        try:
-            text = run_once(prompt)
-            yield {
-                "id": rid,
-                "object": "chat.completion.chunk",
-                "created": now,
-                "model": MODEL_ID,
-                "choices": [
-                    {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
-                ],
-            }
-        except Exception:
-            logger.exception("HF inference failed")
-            raise
-
-
-class StubImagesBackend(ImagesBackend):
-    async def generate_b64(self, request: Dict[str, Any]) -> str:
-        logger.warning("Image generation not supported in HF backend.")
-        return (
-            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP4BwQACfsD/etCJH0AAAAASUVORK5CYII="
-        )
+            # CPU fallback
+            inputs = tokenizer(prompt, return_tensors="pt")
+            with torch.inference_mode():
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    do_sample=True,
+                )
+            text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        yield {
+            "id": rid,
+            "object": "chat.completion.chunk",
+            "created": now,
+            "model": MODEL_ID,
+            "choices": [
+                {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
+            ],
+        }
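
For context, a minimal consumer sketch of the patched backend follows. It assumes the request dict is the already-decoded RabbitMQ payload, that hf_backend.py is importable as patched above, and that the prompt/max_tokens/temperature handling omitted from this diff reads the usual OpenAI-style request fields; the field values and the asyncio driver are illustrative, not part of the commit.

# consume_stream_sketch.py -- illustrative only, not part of this commit.
# Assumes hf_backend.HFChatBackend as patched above and an OpenAI-style
# request payload; the x_ip_token value would come from the caller's
# X-IP-Token header before the message is queued.
import asyncio

from hf_backend import HFChatBackend


async def main() -> None:
    request = {
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 64,
        "temperature": 0.7,
        # Forwarded so ZeroGPU usage is attributed to the end user.
        "x_ip_token": "<value of the caller's X-IP-Token header>",
    }
    backend = HFChatBackend()
    # stream() is an async generator yielding chat.completion.chunk dicts.
    async for chunk in backend.stream(request):
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())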