johnbridges committed
Commit be6d3d6 · 1 Parent(s): bf6d44e
Files changed (1)
  1. hf_backend.py +63 -40
hf_backend.py CHANGED
@@ -10,46 +10,29 @@ from config import settings
 
 logger = logging.getLogger(__name__)
 
-# ---------- logging helpers ----------
 def _snippet(txt: str, n: int = 800) -> str:
     if not isinstance(txt, str):
         return f"<non-str:{type(txt)}>"
     return txt if len(txt) <= n else txt[:n] + f"... <+{len(txt)-n} chars>"
 
-def _json_snippet(obj: Any, n: int = 800) -> str:
-    try:
-        s = json.dumps(obj, ensure_ascii=False, indent=2)
-    except Exception:
-        s = str(obj)
-    return _snippet(s, n)
-
-
-# ---------- HF Spaces imports ----------
 try:
     import spaces
     from spaces.zero import client as zero_client
 except ImportError:
     spaces, zero_client = None, None
 
-# ---------- Model setup ----------
 MODEL_ID = settings.LlmHFModelID or "Qwen/Qwen2.5-1.5B-Instruct"
 logger.info(f"[init] MODEL_ID={MODEL_ID}")
 
 tokenizer, load_error = None, None
 try:
-    tokenizer = AutoTokenizer.from_pretrained(
-        MODEL_ID,
-        trust_remote_code=True,
-        use_fast=False,
-    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
     has_template = hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None)
     logger.info(f"[init] tokenizer loaded. chat_template={'yes' if has_template else 'no'}")
 except Exception as e:
     load_error = f"Failed to load tokenizer: {e}"
     logger.exception(load_error)
 
-
-# ---------- helpers ----------
 def _pick_cpu_dtype() -> torch.dtype:
     try:
         if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported") and torch.cpu.is_bf16_supported():
@@ -60,11 +43,8 @@ def _pick_cpu_dtype() -> torch.dtype:
     logger.info("[dtype] fallback -> torch.float32")
     return torch.float32
 
-
-# ---------- global cache ----------
 _MODEL_CACHE: Dict[tuple[str, torch.dtype], AutoModelForCausalLM] = {}
 
-
 def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
     key = (device, dtype)
     if key in _MODEL_CACHE:
@@ -120,8 +100,40 @@ def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, t
     _MODEL_CACHE[(device, eff_dtype)] = model
     return model, eff_dtype
 
+def _max_context(model, tokenizer) -> int:
+    # Prefer model config; fallback to tokenizer hint
+    mc = getattr(getattr(model, "config", None), "max_position_embeddings", None)
+    if isinstance(mc, int) and mc > 0:
+        return mc
+    tk = getattr(tokenizer, "model_max_length", None)
+    if isinstance(tk, int) and tk > 0 and tk < 10**12:
+        return tk
+    return 32768  # safe default for Qwen3
+
+def _build_inputs_with_truncation(prompt: str, device: str, max_new_tokens: int, model, tokenizer):
+    toks = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+    input_ids = toks["input_ids"]
+    attn = toks.get("attention_mask", None)
+
+    ctx = _max_context(model, tokenizer)
+    limit = max(8, ctx - max_new_tokens)
+    in_len = input_ids.shape[-1]
+    if in_len > limit:
+        # left-truncate to fit context
+        cut = in_len - limit
+        input_ids = input_ids[:, -limit:]
+        if attn is not None:
+            attn = attn[:, -limit:]
+        logger.warning(f"[truncate] prompt_tokens={in_len} > limit={limit}. truncated_left_by={cut} to fit ctx={ctx}, new_input={input_ids.shape[-1]}, max_new={max_new_tokens}")
+
+    inputs = {"input_ids": input_ids}
+    if attn is not None:
+        inputs["attention_mask"] = attn
+
+    # move to device
+    inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
+    return inputs, in_len, ctx, limit
 
-# ---------- Chat Backend ----------
 class HFChatBackend(ChatBackend):
     async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
         if load_error:
@@ -130,16 +142,15 @@ class HFChatBackend(ChatBackend):
         messages = request.get("messages", [])
         tools = request.get("tools")
         temperature = float(request.get("temperature", settings.LlmTemp or 0.7))
-        max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))
+        req_max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))
 
         rid = f"chatcmpl-hf-{int(time.time())}"
         now = int(time.time())
 
-        logger.info(f"[req] rid={rid} temp={temperature} max_tokens={max_tokens} "
+        logger.info(f"[req] rid={rid} temp={temperature} req_max_tokens={req_max_tokens} "
                     f"msgs={len(messages)} tools={'yes' if tools else 'no'} "
                     f"spaces={'yes' if spaces else 'no'} cuda={'yes' if torch.cuda.is_available() else 'no'}")
 
-        # X-IP-Token for ZeroGPU
         x_ip_token = request.get("x_ip_token")
         if x_ip_token and zero_client:
             zero_client.HEADERS["X-IP-Token"] = x_ip_token
@@ -150,11 +161,11 @@ class HFChatBackend(ChatBackend):
         try:
             prompt = tokenizer.apply_chat_template(
                 messages,
-                tools=tools,
+                #tools=tools,
                 tokenize=False,
                 add_generation_prompt=True,
             )
-            logger.info(f"[prompt] built via chat_template. len={len(prompt)}\n{_snippet(prompt, 1200)}")
+            logger.info(f"[prompt] built via chat_template. len={len(prompt)}\n{_snippet(prompt, 800)}")
         except Exception as e:
             logger.warning(f"[prompt] chat_template failed -> fallback. err={e}")
             prompt = messages[-1]["content"] if messages else "(empty)"
@@ -166,11 +177,25 @@ class HFChatBackend(ChatBackend):
         def _run_once(prompt: str, device: str, req_dtype: torch.dtype) -> str:
             model, eff_dtype = _get_model(device, req_dtype)
 
-            inputs = tokenizer(prompt, return_tensors="pt")
-            input_ids = inputs["input_ids"]
-            logger.info(f"[gen] device={device} dtype={eff_dtype} input_tokens={input_ids.shape[-1]}")
+            # Clamp max_new_tokens for CPU to prevent stalls
+            if device == "cpu":
+                max_new_tokens = min(req_max_tokens, 512)
+            else:
+                max_new_tokens = req_max_tokens
+
+            # Build inputs with context-aware truncation
+            inputs, orig_in_len, ctx, limit = _build_inputs_with_truncation(prompt, device, max_new_tokens, model, tokenizer)
+
+            logger.info(f"[gen] device={device} dtype={eff_dtype} input_tokens={inputs['input_ids'].shape[-1]} "
+                        f"(orig={orig_in_len}) max_ctx={ctx} limit_for_input={limit} max_new_tokens={max_new_tokens}")
 
-            inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
+            # Sampling settings
+            do_sample = temperature > 1e-6
+            temp = max(1e-5, temperature) if do_sample else 0.0
+
+            # ids
+            eos_id = tokenizer.eos_token_id
+            pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
 
             with torch.inference_mode():
                 if device != "cpu":
@@ -179,25 +204,26 @@ class HFChatBackend(ChatBackend):
                     autocast_ctx = torch.cpu.amp.autocast(dtype=torch.bfloat16) if eff_dtype == torch.bfloat16 else nullcontext()
 
                 gen_kwargs = dict(
-                    max_new_tokens=max_tokens,
-                    temperature=temperature,
-                    do_sample=True,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temp,
+                    do_sample=do_sample,
                     use_cache=True,
+                    eos_token_id=eos_id,
+                    pad_token_id=pad_id,
                 )
                 logger.info(f"[gen] kwargs={gen_kwargs}")
 
                 with autocast_ctx:
                     outputs = model.generate(**inputs, **gen_kwargs)
 
-            # Only decode newly generated tokens
-            input_len = input_ids.shape[-1]
+            # Slice generated continuation only
+            input_len = inputs["input_ids"].shape[-1]
             generated_ids = outputs[0][input_len:]
             logger.info(f"[gen] new_tokens={generated_ids.shape[-1]}")
             text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
             logger.info(f"[gen] text len={len(text)}\n{_snippet(text, 1200)}")
             return text
 
-        # Dispatch with or without ZeroGPU
         if spaces:
             @spaces.GPU(duration=120)
             def run_once(prompt: str) -> str:
@@ -206,13 +232,11 @@ class HFChatBackend(ChatBackend):
                     return _run_once(prompt, device="cuda", req_dtype=torch.float16)
                 logger.info("[path] ZeroGPU but no CUDA -> CPU fallback")
                 return _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
-
             text = run_once(prompt)
         else:
             logger.info("[path] CPU-only runtime")
             text = _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
 
-        # Emit single OpenAI-style chunk
         chunk = {
             "id": rid,
             "object": "chat.completion.chunk",
@@ -226,7 +250,6 @@ class HFChatBackend(ChatBackend):
         yield chunk
 
 
-# ---------- Stub Images Backend ----------
 class StubImagesBackend(ImagesBackend):
     async def generate_b64(self, request: Dict[str, Any]) -> str:
         logger.warning("Image generation not supported in HF backend.")
 