NS-Y committed on
Commit 107e86b · verified · 1 Parent(s): 1da1de0

Upload 3 files

Files changed (2)
  1. README.md +2 -2
  2. app.py +59 -93
README.md CHANGED
@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 ---
 
-A Gradio Space that applies the Appendix-style prompt: the model must prioritize the given *Context* and answer in plain text with two sections — **Analysis** and **Response**.
+A Gradio Space that applies the Appendix-style prompt (Phi-3.5 instruct-style chat). The model must prioritize the given *Context* and answer in plain text with two sections — **Analysis** and **Response**.
 
 **Environment variables (optional)**
 - `EXOSKELETON_MODEL_ID` (default: `Inpris/humains-junior`)
@@ -22,6 +22,6 @@ A Gradio Space that applies the Appendix-style prompt: the model must prioritize
 - `HF_TOKEN` — required if the model is gated.
 
 **Files**
-- `app.py` — Gradio app (slow tokenizer forced to avoid tokenizer.json schema mismatches)
+- `app.py` — Gradio app (forces slow tokenizer using LLaMA tokenizer if needed; Phi-3.5 fallback)
 - `requirements.txt` — dependencies (pins transformers 4.43.3, accelerate 0.32.1)
 - `examples/` — (optional) assets/presets
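As an illustration of the configuration documented above, here is a minimal sketch (not part of the commit) that overrides the environment variables `app.py` reads before importing it; the `hf_xxx` token value is a placeholder and only matters for gated repos.

```python
import os

# Sketch: app.py reads these with os.environ.get(...) at import time,
# so set them before the import below.
os.environ["EXOSKELETON_MODEL_ID"] = "Inpris/humains-junior"  # documented default
os.environ["MAX_NEW_TOKENS"] = "256"
os.environ["TEMPERATURE"] = "0.2"
os.environ["TOP_P"] = "0.9"
os.environ["HF_TOKEN"] = "hf_xxx"  # placeholder; only needed if the model repo is gated

import app           # builds the Gradio Blocks UI without launching it
app.demo.launch()    # start the local server
```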
app.py CHANGED
@@ -1,11 +1,11 @@
+
 import os
+import json
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.models.llama import LlamaTokenizer  # force slow llama if needed
 import gradio as gr
 
-# -----------------------------
-# Config
-# -----------------------------
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 
 DEFAULT_MODEL = os.environ.get("EXOSKELETON_MODEL_ID", "Inpris/humains-junior")
@@ -13,11 +13,8 @@ DEVICE_MAP = os.environ.get("DEVICE_MAP", "auto")
 MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "512"))
 TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.3"))
 TOP_P = float(os.environ.get("TOP_P", "0.95"))
-USE_AUTH_TOKEN = os.environ.get("HF_TOKEN")  # optional for gated repos
+USE_AUTH_TOKEN = os.environ.get("HF_TOKEN")
 
-# -----------------------------
-# Appendix-style rules + Phi-3.5 instruct chat prompt
-# -----------------------------
 APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.
 
 Response Format:
@@ -48,64 +45,6 @@ Analysis: The query asks for the capital of France. The context states it is Lon
 Response: The capital of France is London.
 """
 
-def build_messages(question: str, context: str):
-    """Phi-3.5-instruct style: system + user; we keep a 1-shot in the system block as in Appendix."""
-    system = APPENDIX_RULES
-    user = f"""Client: {question.strip()} Answer based on the context.
-
-Context:
-{context.strip()}"""
-    return [
-        {"role": "system", "content": system},
-        {"role": "user", "content": user},
-    ]
-
-# -----------------------------
-# Model loading (use the repo's own tokenizer)
-# -----------------------------
-_tokenizer = None
-_model = None
-
-def load_model(model_id: str = DEFAULT_MODEL):
-    global _tokenizer, _model
-    if _tokenizer is not None and _model is not None:
-        return _tokenizer, _model
-
-    auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and USE_AUTH_TOKEN.strip()) else None
-
-    # IMPORTANT:
-    # - trust_remote_code=True so custom tokenizer/model classes from the repo are used.
-    # - use_fast=False to avoid tokenizer.json schema mismatches; many custom repos only ship a slow tokenizer.
-    _tokenizer = AutoTokenizer.from_pretrained(
-        model_id,
-        use_auth_token=auth,
-        trust_remote_code=True,
-        use_fast=False,
-    )
-
-    _model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map=DEVICE_MAP,
-        use_auth_token=auth,
-        trust_remote_code=True,
-    )
-
-    if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
-        _tokenizer.pad_token_id = _tokenizer.eos_token_id
-
-    # Prefer a static cache; and we will pass use_cache=False at generation to avoid DynamicCache issues
-    try:
-        _model.generation_config.cache_implementation = "static"
-    except Exception:
-        pass
-
-    return _tokenizer, _model
-
-# -----------------------------
-# Prompting via chat template
-# -----------------------------
-# If the repo doesn't ship a chat template, we inject a Phi-3.5-instruct style template.
 PHI3_TEMPLATE = """{% for message in messages -%}
 {% if message['role'] == 'system' -%}
 <|system|>
@@ -124,6 +63,14 @@ PHI3_TEMPLATE = """{% for message in messages -%}
 <|assistant|>
 """
 
+def build_messages(question: str, context: str):
+    system = APPENDIX_RULES
+    user = f"""Client: {question.strip()} Answer based on the context.
+
+Context:
+{context.strip()}"""
+    return [{"role":"system","content":system},{"role":"user","content":user}]
+
 def ensure_chat_template(tok):
     try:
         tmpl = tok.chat_template
@@ -135,20 +82,54 @@ def ensure_chat_template(tok):
 def encode_messages(tokenizer, messages: list):
     ensure_chat_template(tokenizer)
     return tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_tensors="pt"
+        messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
+    )
+
+_tokenizer = None
+_model = None
+
+def load_tokenizer_robust(model_id: str, auth):
+    try:
+        return AutoTokenizer.from_pretrained(model_id, use_auth_token=auth, trust_remote_code=False, use_fast=False)
+    except Exception as e1:
+        last_err = e1
+    try:
+        return LlamaTokenizer.from_pretrained(model_id, use_auth_token=auth)
+    except Exception as e2:
+        last_err = e2
+    try:
+        return AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", use_auth_token=auth, trust_remote_code=False, use_fast=False)
+    except Exception as e3:
+        raise last_err
+
+def load_model(model_id: str = DEFAULT_MODEL):
+    global _tokenizer, _model
+    if _tokenizer is not None and _model is not None:
+        return _tokenizer, _model
+
+    auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and USE_AUTH_TOKEN.strip()) else None
+
+    _tokenizer = load_tokenizer_robust(model_id, auth)
+    if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
+        _tokenizer.pad_token_id = _tokenizer.eos_token_id
+
+    _model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map=DEVICE_MAP,
+        use_auth_token=auth,
+        trust_remote_code=True,
     )
+    try:
+        _model.generation_config.cache_implementation = "static"
+    except Exception:
+        pass
+    return _tokenizer, _model
 
-# -----------------------------
-# Generation
-# -----------------------------
 def generate_text(question: str, context: str, temperature: float, top_p: float, max_new_tokens: int, model_id: str):
     tokenizer, model = load_model(model_id)
     messages = build_messages(question, context)
     inputs = encode_messages(tokenizer, messages).to(model.device)
-
     with torch.no_grad():
         output_ids = model.generate(
             inputs,
@@ -157,11 +138,10 @@ def generate_text(question: str, context: str, temperature: float, top_p: float,
             top_p=top_p,
             max_new_tokens=max_new_tokens,
             pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
-            use_cache=False,  # critical for compatibility with some remote-code cache implementations
+            use_cache=False,
         )
     text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-    # Extract the last "Analysis:" + "Response:" sections
     analysis, response = "", ""
     a_idx = text.rfind("Analysis:")
     r_idx = text.rfind("Response:")
@@ -175,20 +155,11 @@
         response = text.strip()
     return analysis, response, text
 
-# -----------------------------
-# UI
-# -----------------------------
 PRESET_Q = "What are the health effects of coffee? Answer based on the context."
-PRESET_CTX = (
-    "Coffee contains caffeine, which can increase alertness. Excess intake may cause "
-    "jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
-)
+PRESET_CTX = "Coffee contains caffeine, which can increase alertness. Excess intake may cause jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
 
 with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
-    gr.Markdown(
-        "# Exoskeleton Reasoning — Appendix-Style Prompt\n"
-        "The model must **prioritize the provided context**, and reply in plain text with two sections: **Analysis** and **Response**."
-    )
+    gr.Markdown("# Exoskeleton Reasoning — Appendix-Style Prompt\nThe model must **prioritize the provided context**, and reply in plain text with two sections: **Analysis** and **Response**.")
     with gr.Row():
         with gr.Column(scale=3):
             q = gr.Textbox(label="Client question", value=PRESET_Q, lines=4)
@@ -200,9 +171,7 @@ with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
             max_new = gr.Slider(64, 1024, value=MAX_NEW_TOKENS, step=16, label="Max new tokens")
            model_id = gr.Textbox(label="Model ID", value=DEFAULT_MODEL)
             run = gr.Button("Run", variant="primary")
-            gr.Markdown(
-                'Secrets/vars: set **HF_TOKEN** if the model is gated · Override `EXOSKELETON_MODEL_ID` to change default.'
-            )
+            gr.Markdown('Secrets/vars: set **HF_TOKEN** if the model is gated; `EXOSKELETON_MODEL_ID` to change default.')
         with gr.Column(scale=4):
             with gr.Accordion("Analysis", open=True):
                 analysis_box = gr.Textbox(lines=6, label="Analysis (model)")
@@ -210,16 +179,13 @@ with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
                 response_box = gr.Textbox(lines=6, label="Response (model)")
             with gr.Accordion("Raw output", open=False):
                 raw_box = gr.Textbox(lines=8, label="Raw text")
-
     def infer_fn(question, context, temperature, top_p, max_new_tokens, model_id):
         if not question.strip() or not context.strip():
             gr.Warning("Please provide both a Client question and Context.")
             return "", "", ""
        a, r, raw = generate_text(question, context, temperature, top_p, max_new_tokens, model_id)
         return a, r, raw
-
-    run.click(fn=infer_fn, inputs=[q, ctx, temp, topp, max_new, model_id],
-              outputs=[analysis_box, response_box, raw_box])
+    run.click(fn=infer_fn, inputs=[q, ctx, temp, topp, max_new, model_id], outputs=[analysis_box, response_box, raw_box])
 
 if __name__ == "__main__":
     demo.launch()
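For reference, a minimal smoke-test sketch (not part of the commit) that calls the refactored generation path directly, using the presets defined in `app.py`; it assumes the pinned dependencies are installed and that the model fits on the local device (the first call downloads the weights).

```python
# Sketch: exercise load_tokenizer_robust/load_model indirectly via generate_text,
# bypassing the Gradio UI. Importing app builds the Blocks layout but does not
# launch a server, since demo.launch() is guarded by __main__.
from app import DEFAULT_MODEL, PRESET_CTX, PRESET_Q, generate_text

analysis, response, raw = generate_text(
    question=PRESET_Q,       # coffee preset question
    context=PRESET_CTX,      # coffee preset context
    temperature=0.3,
    top_p=0.95,
    max_new_tokens=128,
    model_id=DEFAULT_MODEL,  # honors EXOSKELETON_MODEL_ID if set
)
print("Analysis:", analysis)
print("Response:", response)
```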