Spaces:

Mungert
/

GradLLM

Running

App Files Files Community

johnbridges committed on Sep 18

Commit

bf6d44e

1 Parent(s): 0e8e333

.

Browse files

Files changed (1) hide show

hf_backend.py +84 -38

hf_backend.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # hf_backend.py
-import time, logging
 from contextlib import nullcontext
 from typing import Any, Dict, AsyncIterable, Tuple
@@ -10,15 +10,30 @@ from config import settings
 logger = logging.getLogger(__name__)
 try:
     import spaces
     from spaces.zero import client as zero_client
 except ImportError:
     spaces, zero_client = None, None
-# --- Model setup ---
 MODEL_ID = settings.LlmHFModelID or "Qwen/Qwen2.5-1.5B-Instruct"
-logger.info(f"Preloading tokenizer for {MODEL_ID} on CPU...")
 tokenizer, load_error = None, None
 try:
@@ -27,36 +42,39 @@ try:
         trust_remote_code=True,
         use_fast=False,
     )
 except Exception as e:
     load_error = f"Failed to load tokenizer: {e}"
     logger.exception(load_error)
-# ---------------- helpers ----------------
 def _pick_cpu_dtype() -> torch.dtype:
-    if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported"):
-        try:
-            if torch.cpu.is_bf16_supported():
-                logger.info("CPU BF16 supported, will attempt torch.bfloat16")
-                return torch.bfloat16
-        except Exception:
-            pass
-    logger.info("Falling back to torch.float32 on CPU")
     return torch.float32
-# ---------------- global cache ----------------
 _MODEL_CACHE: Dict[tuple[str, torch.dtype], AutoModelForCausalLM] = {}
 def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
     key = (device, dtype)
     if key in _MODEL_CACHE:
         return _MODEL_CACHE[key], dtype
     cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
     if hasattr(cfg, "quantization_config"):
-        logger.warning("Removing quantization_config from model config")
         delattr(cfg, "quantization_config")
     eff_dtype = dtype
@@ -71,7 +89,7 @@ def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, t
         )
     except Exception as e:
         if device == "cpu" and dtype == torch.bfloat16:
-            logger.warning(f"BF16 load failed on CPU: {e}. Retrying with FP32.")
             eff_dtype = torch.float32
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
@@ -82,92 +100,120 @@ def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, t
                 low_cpu_mem_usage=False,
             )
         else:
             raise
     if device == "cpu":
         model = model.to(device=device, dtype=eff_dtype)
     else:
         model = model.to(device=device)
     model.eval()
     _MODEL_CACHE[(device, eff_dtype)] = model
     return model, eff_dtype
-# ---------------- Chat Backend ----------------
 class HFChatBackend(ChatBackend):
     async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
         if load_error:
             raise RuntimeError(load_error)
         messages = request.get("messages", [])
         temperature = float(request.get("temperature", settings.LlmTemp or 0.7))
         max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))
         rid = f"chatcmpl-hf-{int(time.time())}"
         now = int(time.time())
         x_ip_token = request.get("x_ip_token")
         if x_ip_token and zero_client:
             zero_client.HEADERS["X-IP-Token"] = x_ip_token
-            logger.debug("Injected X-IP-Token into ZeroGPU headers")
         if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
             try:
                 prompt = tokenizer.apply_chat_template(
                     messages,
                     tokenize=False,
                     add_generation_prompt=True,
                 )
-                logger.debug("Applied chat template for prompt")
             except Exception as e:
-                logger.warning(f"Failed to apply chat template: {e}, using fallback")
                 prompt = messages[-1]["content"] if messages else "(empty)"
         else:
             prompt = messages[-1]["content"] if messages else "(empty)"
         def _run_once(prompt: str, device: str, req_dtype: torch.dtype) -> str:
             model, eff_dtype = _get_model(device, req_dtype)
             inputs = tokenizer(prompt, return_tensors="pt")
             inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
             with torch.inference_mode():
                 if device != "cpu":
                     autocast_ctx = torch.autocast(device_type=device, dtype=eff_dtype)
                 else:
-                    if eff_dtype == torch.bfloat16:
-                        autocast_ctx = torch.cpu.amp.autocast(dtype=torch.bfloat16)
-                    else:
-                        autocast_ctx = nullcontext()
                 with autocast_ctx:
-                    outputs = model.generate(
-                        **inputs,
-                        max_new_tokens=max_tokens,
-                        temperature=temperature,
-                        do_sample=True,
-                        use_cache=True,
-                    )
-            # Slice: keep only newly generated tokens
-            input_len = inputs["input_ids"].shape[-1]
             generated_ids = outputs[0][input_len:]
             text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
             return text
         if spaces:
             @spaces.GPU(duration=120)
             def run_once(prompt: str) -> str:
                 if torch.cuda.is_available():
                     return _run_once(prompt, device="cuda", req_dtype=torch.float16)
                 return _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
             text = run_once(prompt)
         else:
             text = _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
-        yield {
             "id": rid,
             "object": "chat.completion.chunk",
             "created": now,
@@ -176,12 +222,12 @@ class HFChatBackend(ChatBackend):
                 {"index": 0, "delta": {"role": "assistant", "content": text}, "finish_reason": "stop"}
             ],
         }
-# ---------------- Stub Images Backend ----------------
 class StubImagesBackend(ImagesBackend):
     async def generate_b64(self, request: Dict[str, Any]) -> str:
         logger.warning("Image generation not supported in HF backend.")
-        return (
-            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP4BwQACfsD/etCJH0AAAAASUVORK5CYII="
-        )

 # hf_backend.py
+import time, logging, json
 from contextlib import nullcontext
 from typing import Any, Dict, AsyncIterable, Tuple
 logger = logging.getLogger(__name__)
+# ---------- logging helpers ----------
+def _snippet(txt: str, n: int = 800) -> str:  # Clip txt to at most n characters for log output.
+    if not isinstance(txt, str):  # non-strings are not rendered; only their type is reported
+        return f"<non-str:{type(txt)}>"
+    return txt if len(txt) <= n else txt[:n] + f"... <+{len(txt)-n} chars>"  # suffix states how many characters were cut
+def _json_snippet(obj: Any, n: int = 800) -> str:  # Render obj as indented JSON, then clip it with _snippet.
+    try:
+        s = json.dumps(obj, ensure_ascii=False, indent=2)
+    except Exception:  # anything json.dumps rejects falls back to str(obj) instead of raising
+        s = str(obj)
+    return _snippet(s, n)  # same clipping rule and suffix format as _snippet
+# ---------- HF Spaces imports ----------
 try:
     import spaces
     from spaces.zero import client as zero_client
 except ImportError:
     spaces, zero_client = None, None
+# ---------- Model setup ----------
 MODEL_ID = settings.LlmHFModelID or "Qwen/Qwen2.5-1.5B-Instruct"
+logger.info(f"[init] MODEL_ID={MODEL_ID}")
 tokenizer, load_error = None, None
 try:
         trust_remote_code=True,
         use_fast=False,
     )
+    has_template = hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None)
+    logger.info(f"[init] tokenizer loaded. chat_template={'yes' if has_template else 'no'}")
 except Exception as e:
     load_error = f"Failed to load tokenizer: {e}"
     logger.exception(load_error)
+# ---------- helpers ----------
 def _pick_cpu_dtype() -> torch.dtype:  # Pick the dtype to request for CPU inference: bfloat16 if the probe says so, else float32.
+    try:
+        if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported") and torch.cpu.is_bf16_supported():  # hasattr guards: only probe when this torch build exposes the function
+            logger.info("[dtype] CPU BF16 supported -> torch.bfloat16")
+            return torch.bfloat16
+    except Exception as e:  # a failing probe is logged and treated the same as "not supported"
+        logger.warning(f"[dtype] BF16 probe failed: {e}")
+    logger.info("[dtype] fallback -> torch.float32")
     return torch.float32
+# ---------- global cache ----------
 _MODEL_CACHE: Dict[tuple[str, torch.dtype], AutoModelForCausalLM] = {}
 def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:  # Return (model, effective dtype) for MODEL_ID, loading and caching on first use.
     key = (device, dtype)  # lookup key uses the *requested* dtype
     if key in _MODEL_CACHE:
+        logger.info(f"[cache] hit model for device={device} dtype={dtype}")
         return _MODEL_CACHE[key], dtype
+    logger.info(f"[load] begin from_pretrained device={device} dtype={dtype}")
     cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
     if hasattr(cfg, "quantization_config"):
+        logger.warning("[load] removing quantization_config from config to avoid FP8 path")
         delattr(cfg, "quantization_config")
     eff_dtype = dtype  # changes only if the CPU BF16 retry below kicks in
         )
     except Exception as e:
         if device == "cpu" and dtype == torch.bfloat16:  # only a CPU BF16 failure is retried; everything else re-raises
+            logger.warning(f"[load] BF16 load failed on CPU ({e}). retry FP32.")
             eff_dtype = torch.float32
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 low_cpu_mem_usage=False,
             )
         else:
+            logger.exception("[load] from_pretrained failed")
             raise
     if device == "cpu":
+        logger.info(f"[load] casting all weights to CPU dtype={eff_dtype}")
         model = model.to(device=device, dtype=eff_dtype)
     else:
+        logger.info(f"[load] moving model to device={device} (no recast)")
         model = model.to(device=device)
     model.eval()
+    try:
+        first_dtype = next(model.parameters()).dtype  # diagnostic only; the value is just logged
+        logger.info(f"[load] ready. effective_dtype={eff_dtype} first_param_dtype={first_dtype}")
+    except Exception:
+        logger.info(f"[load] ready. effective_dtype={eff_dtype} (param dtype probe failed)")
     _MODEL_CACHE[(device, eff_dtype)] = model  # NOTE(review): stored under eff_dtype but looked up under the requested dtype, so after a BF16->FP32 retry a repeated (cpu, bfloat16) request misses the cache
     return model, eff_dtype
+# ---------- Chat Backend ----------
 class HFChatBackend(ChatBackend):  # Chat backend that generates a full reply with the local model and emits it as one chunk.
     async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:  # async generator; yields exactly one chunk dict (see the yield at the end)
         if load_error:  # set at import time when the tokenizer failed to load
             raise RuntimeError(load_error)
         messages = request.get("messages", [])
+        tools = request.get("tools")
         temperature = float(request.get("temperature", settings.LlmTemp or 0.7))  # settings/0.7 default applies only when the request has no "temperature" key
         max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))
         rid = f"chatcmpl-hf-{int(time.time())}"  # NOTE(review): second-resolution timestamp, so requests within the same second share an id
         now = int(time.time())
+        logger.info(f"[req] rid={rid} temp={temperature} max_tokens={max_tokens} "
+                    f"msgs={len(messages)} tools={'yes' if tools else 'no'} "
+                    f"spaces={'yes' if spaces else 'no'} cuda={'yes' if torch.cuda.is_available() else 'no'}")
+        # X-IP-Token for ZeroGPU
         x_ip_token = request.get("x_ip_token")
         if x_ip_token and zero_client:
             zero_client.HEADERS["X-IP-Token"] = x_ip_token  # NOTE(review): HEADERS is an attribute of the imported module, not per-request state - confirm behaviour under concurrent requests
+            logger.info("[req] injected X-IP-Token into ZeroGPU headers")
+        # Build prompt
         if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
             try:
                 prompt = tokenizer.apply_chat_template(
                     messages,
+                    tools=tools,
                     tokenize=False,
                     add_generation_prompt=True,
                 )
+                logger.info(f"[prompt] built via chat_template. len={len(prompt)}\n{_snippet(prompt, 1200)}")
             except Exception as e:
+                logger.warning(f"[prompt] chat_template failed -> fallback. err={e}")
                 prompt = messages[-1]["content"] if messages else "(empty)"  # fallback keeps only the last message's content; earlier turns are dropped
+                logger.info(f"[prompt] fallback content len={len(prompt)}\n{_snippet(prompt, 800)}")
         else:
             prompt = messages[-1]["content"] if messages else "(empty)"
+            logger.info(f"[prompt] no template. using last user text len={len(prompt)}\n{_snippet(prompt, 800)}")
         def _run_once(prompt: str, device: str, req_dtype: torch.dtype) -> str:  # tokenize, generate, and decode only the newly generated tokens
             model, eff_dtype = _get_model(device, req_dtype)
             inputs = tokenizer(prompt, return_tensors="pt")
+            input_ids = inputs["input_ids"]
+            logger.info(f"[gen] device={device} dtype={eff_dtype} input_tokens={input_ids.shape[-1]}")
             inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}  # move every tensor-like entry to the target device
             with torch.inference_mode():
                 if device != "cpu":
                     autocast_ctx = torch.autocast(device_type=device, dtype=eff_dtype)
                 else:
+                    autocast_ctx = torch.cpu.amp.autocast(dtype=torch.bfloat16) if eff_dtype == torch.bfloat16 else nullcontext()  # CPU: autocast only for BF16, otherwise a no-op context
+                gen_kwargs = dict(
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    do_sample=True,  # sampling is always on, whatever temperature was requested
+                    use_cache=True,
+                )
+                logger.info(f"[gen] kwargs={gen_kwargs}")
                 with autocast_ctx:
+                    outputs = model.generate(**inputs, **gen_kwargs)
+            # Only decode newly generated tokens
+            input_len = input_ids.shape[-1]
             generated_ids = outputs[0][input_len:]  # skip the first input_len positions, i.e. the prompt
+            logger.info(f"[gen] new_tokens={generated_ids.shape[-1]}")
             text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+            logger.info(f"[gen] text len={len(text)}\n{_snippet(text, 1200)}")
             return text
+        # Dispatch with or without ZeroGPU
         if spaces:
             @spaces.GPU(duration=120)  # run_once is defined inside stream(), so the decorator is applied on every call
             def run_once(prompt: str) -> str:
                 if torch.cuda.is_available():
+                    logger.info("[path] ZeroGPU + CUDA")
                     return _run_once(prompt, device="cuda", req_dtype=torch.float16)
+                logger.info("[path] ZeroGPU but no CUDA -> CPU fallback")
                 return _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
             text = run_once(prompt)  # synchronous call inside an async method; nothing is awaited here
         else:
+            logger.info("[path] CPU-only runtime")
             text = _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
+        # Emit single OpenAI-style chunk
+        chunk = {
             "id": rid,
             "object": "chat.completion.chunk",
             "created": now,
                 {"index": 0, "delta": {"role": "assistant", "content": text}, "finish_reason": "stop"}
             ],
         }
+        logger.info(f"[out] chunk summary -> id={rid} content_len={len(text)}")
+        yield chunk  # the whole reply goes out in this single chunk, already marked finish_reason "stop"
+# ---------- Stub Images Backend ----------
 class StubImagesBackend(ImagesBackend):  # Images backend stub: logs a warning and returns a fixed base64 string.
     async def generate_b64(self, request: Dict[str, Any]) -> str:  # request is accepted but never read
         logger.warning("Image generation not supported in HF backend.")
+        return "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP4BwQACfsD/etCJH0AAAAASUVORK5CYII="  # constant payload; presumably a tiny placeholder PNG - TODO confirm