Commit · b416f51
1 Parent(s): 552430d

- hf_backend.py  +6 -13

hf_backend.py  CHANGED
@@ -1,5 +1,5 @@
 # hf_backend.py
-import time, logging, json
+import time, logging, json, asyncio
 from contextlib import nullcontext
 from typing import Any, Dict, AsyncIterable, Tuple
 
@@ -101,7 +101,6 @@ def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, t
     return model, eff_dtype
 
 def _max_context(model, tokenizer) -> int:
-    # Prefer model config; fallback to tokenizer hint
     mc = getattr(getattr(model, "config", None), "max_position_embeddings", None)
     if isinstance(mc, int) and mc > 0:
         return mc
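For orientation, a minimal sketch of how a helper like _max_context can resolve the limit end to end, assuming the fallback hinted at by the removed comment is the tokenizer's model_max_length (only the first branch appears in this hunk; the name max_context_sketch and the default value are invented):

def max_context_sketch(model, tokenizer, default: int = 2048) -> int:
    # Prefer the model config's positional limit when it is a sane integer.
    mc = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    if isinstance(mc, int) and mc > 0:
        return mc
    # Fallback: tokenizer hint; some tokenizers report a huge sentinel value instead.
    tm = getattr(tokenizer, "model_max_length", None)
    if isinstance(tm, int) and 0 < tm < 10**9:
        return tm
    return default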
@@ -119,7 +118,6 @@ def _build_inputs_with_truncation(prompt: str, device: str, max_new_tokens: int,
     limit = max(8, ctx - max_new_tokens)
     in_len = input_ids.shape[-1]
     if in_len > limit:
-        # left-truncate to fit context
         cut = in_len - limit
         input_ids = input_ids[:, -limit:]
         if attn is not None:
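The truncation keeps the newest tokens when prompt length plus max_new_tokens would overflow the context window. A standalone sketch of the same arithmetic (function name invented, torch assumed):

import torch

def left_truncate(input_ids: torch.Tensor, ctx: int, max_new_tokens: int) -> torch.Tensor:
    # Reserve room for generation, but never allow fewer than 8 prompt tokens.
    limit = max(8, ctx - max_new_tokens)
    if input_ids.shape[-1] > limit:
        # Drop the oldest tokens; keep only the most recent `limit` tokens.
        input_ids = input_ids[:, -limit:]
    return input_ids

# Example: ctx=4096, max_new_tokens=512 -> limit=3584,
# so a 4000-token prompt keeps only its last 3584 tokens.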
@@ -130,7 +128,6 @@ def _build_inputs_with_truncation(prompt: str, device: str, max_new_tokens: int,
     if attn is not None:
         inputs["attention_mask"] = attn
 
-    # move to device
     inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
     return inputs, in_len, ctx, limit
 
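The hasattr(v, "to") guard lets non-tensor entries pass through when the batch is moved to the target device. A tiny illustration with invented values:

import torch

inputs = {"input_ids": torch.tensor([[1, 2, 3]]), "some_flag": True}  # invented example
device = "cpu"
# Tensors are moved; plain Python values are left untouched.
inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}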
@@ -156,7 +153,7 @@ class HFChatBackend(ChatBackend):
             zero_client.HEADERS["X-IP-Token"] = x_ip_token
             logger.info("[req] injected X-IP-Token into ZeroGPU headers")
 
-        # Build prompt
+        # Build prompt (pass tools to template)
        if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
             try:
                 prompt = tokenizer.apply_chat_template(
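The updated comment suggests the template call now forwards a tool list. In recent transformers versions apply_chat_template accepts a tools= argument; a hedged sketch of such a call, assuming tokenizer is this backend's already-loaded tokenizer and with the messages and tool schema invented for illustration:

messages = [{"role": "user", "content": "What is the weather in Paris?"}]
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}]
prompt = tokenizer.apply_chat_template(
    messages,
    tools=tools,                   # only honored by chat templates that support tools
    tokenize=False,
    add_generation_prompt=True,
)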
@@ -176,20 +173,16 @@ class HFChatBackend(ChatBackend):
 
         def _run_once(prompt: str, device: str, req_dtype: torch.dtype) -> str:
             model, eff_dtype = _get_model(device, req_dtype)
-
             max_new_tokens = req_max_tokens
 
-            # Build inputs with context-aware truncation
             inputs, orig_in_len, ctx, limit = _build_inputs_with_truncation(prompt, device, max_new_tokens, model, tokenizer)
 
             logger.info(f"[gen] device={device} dtype={eff_dtype} input_tokens={inputs['input_ids'].shape[-1]} "
                         f"(orig={orig_in_len}) max_ctx={ctx} limit_for_input={limit} max_new_tokens={max_new_tokens}")
 
-            # Sampling settings
             do_sample = temperature > 1e-6
             temp = max(1e-5, temperature) if do_sample else 0.0
 
-            # ids
             eos_id = tokenizer.eos_token_id
             pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
 
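The do_sample / temp pair presumably feeds the gen_kwargs passed to model.generate a few lines later (that block sits between hunks and is not shown). One consistent way to assemble it, as an assumption rather than the file's exact code:

gen_kwargs = dict(
    max_new_tokens=max_new_tokens,
    do_sample=do_sample,              # greedy decoding when temperature is ~0
    eos_token_id=eos_id,
    pad_token_id=pad_id,
)
if do_sample:
    gen_kwargs["temperature"] = temp  # clamped away from 0 to keep sampling stable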
@@ -212,7 +205,6 @@ class HFChatBackend(ChatBackend):
             with autocast_ctx:
                 outputs = model.generate(**inputs, **gen_kwargs)
 
-            # Slice generated continuation only
             input_len = inputs["input_ids"].shape[-1]
             generated_ids = outputs[0][input_len:]
             logger.info(f"[gen] new_tokens={generated_ids.shape[-1]}")
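Slicing outputs[0][input_len:] isolates the continuation because generate returns the prompt tokens followed by the new ones. The decode that produces text falls outside this hunk; a plausible one-liner, stated as an assumption:

text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()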
@@ -220,18 +212,19 @@ class HFChatBackend(ChatBackend):
             logger.info(f"[gen] text len={len(text)}\n{_snippet(text, 1200)}")
             return text
 
+        # Offload heavy work to a worker thread so asyncio heartbeats continue
         if spaces:
             @spaces.GPU(duration=120)
-            def
+            def run_once_sync(prompt: str) -> str:
                 if torch.cuda.is_available():
                     logger.info("[path] ZeroGPU + CUDA")
                     return _run_once(prompt, device="cuda", req_dtype=torch.float16)
                 logger.info("[path] ZeroGPU but no CUDA -> CPU fallback")
                 return _run_once(prompt, device="cpu", req_dtype=_pick_cpu_dtype())
-            text =
+            text = await asyncio.to_thread(run_once_sync, prompt)
         else:
             logger.info("[path] CPU-only runtime")
-            text = _run_once
+            text = await asyncio.to_thread(_run_once, prompt, "cpu", _pick_cpu_dtype())
 
         chunk = {
             "id": rid,
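This is the substantive change in the commit: the blocking generation call is wrapped in asyncio.to_thread, so the event loop (and any keep-alive/heartbeat work) stays responsive while model.generate runs in a worker thread. A self-contained sketch of the pattern with invented names:

import asyncio, time

def blocking_generate(prompt: str) -> str:
    time.sleep(5)                      # stands in for model.generate(...)
    return f"response to: {prompt}"

async def heartbeat() -> None:
    while True:
        print("heartbeat")             # e.g. an SSE keep-alive chunk
        await asyncio.sleep(1)

async def handle_request(prompt: str) -> str:
    hb = asyncio.create_task(heartbeat())
    try:
        # Offload the blocking call; the loop keeps running heartbeat() meanwhile.
        return await asyncio.to_thread(blocking_generate, prompt)
    finally:
        hb.cancel()

# asyncio.run(handle_request("hello"))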