NS-Y committed on
Commit 107e86b · verified · 1 Parent(s): 1da1de0

Upload 3 files

Files changed (2)
  1. README.md +2 -2
  2. app.py +59 -93
README.md CHANGED
@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 ---
 
-A Gradio Space that applies the Appendix-style prompt: the model must prioritize the given *Context* and answer in plain text with two sections — **Analysis** and **Response**.
+A Gradio Space that applies the Appendix-style prompt (Phi-3.5 instruct-style chat). The model must prioritize the given *Context* and answer in plain text with two sections — **Analysis** and **Response**.
 
 **Environment variables (optional)**
 - `EXOSKELETON_MODEL_ID` (default: `Inpris/humains-junior`)
@@ -22,6 +22,6 @@ A Gradio Space that applies the Appendix-style prompt: the model must prioritize
 - `HF_TOKEN` — required if the model is gated.
 
 **Files**
-- `app.py` — Gradio app (slow tokenizer forced to avoid tokenizer.json schema mismatches)
+- `app.py` — Gradio app (forces slow tokenizer using LLaMA tokenizer if needed; Phi-3.5 fallback)
 - `requirements.txt` — dependencies (pins transformers 4.43.3, accelerate 0.32.1)
 - `examples/` — (optional) assets/presets
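As an illustration of the configuration documented above, here is a minimal sketch (not part of the commit) that overrides the environment variables `app.py` reads before importing it; the `hf_xxx` token value is a placeholder and only matters for gated repos.

```python
import os

# Sketch: app.py reads these with os.environ.get(...) at import time,
# so set them before the import below.
os.environ["EXOSKELETON_MODEL_ID"] = "Inpris/humains-junior"  # documented default
os.environ["MAX_NEW_TOKENS"] = "256"
os.environ["TEMPERATURE"] = "0.2"
os.environ["TOP_P"] = "0.9"
os.environ["HF_TOKEN"] = "hf_xxx"  # placeholder; only needed if the model repo is gated

import app           # builds the Gradio Blocks UI without launching it
app.demo.launch()    # start the local server
```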
app.py CHANGED
@@ -1,11 +1,11 @@
+
 import os
+import json
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.models.llama import LlamaTokenizer  # force slow llama if needed
 import gradio as gr
 
-# -----------------------------
-# Config
-# -----------------------------
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 
 DEFAULT_MODEL = os.environ.get("EXOSKELETON_MODEL_ID", "Inpris/humains-junior")
@@ -13,11 +13,8 @@ DEVICE_MAP = os.environ.get("DEVICE_MAP", "auto")
 MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "512"))
 TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.3"))
 TOP_P = float(os.environ.get("TOP_P", "0.95"))
-USE_AUTH_TOKEN = os.environ.get("HF_TOKEN")  # optional for gated repos
+USE_AUTH_TOKEN = os.environ.get("HF_TOKEN")
 
-# -----------------------------
-# Appendix-style rules + Phi-3.5 instruct chat prompt
-# -----------------------------
 APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.
 
 Response Format:
@@ -48,64 +45,6 @@ Analysis: The query asks for the capital of France. The context states it is Lon
 Response: The capital of France is London.
 """
 
-def build_messages(question: str, context: str):
-    """Phi-3.5-instruct style: system + user; we keep a 1-shot in the system block as in Appendix."""
-    system = APPENDIX_RULES
-    user = f"""Client: {question.strip()} Answer based on the context.
-
-Context:
-{context.strip()}"""
-    return [
-        {"role": "system", "content": system},
-        {"role": "user", "content": user},
-    ]
-
-# -----------------------------
-# Model loading (use the repo's own tokenizer)
-# -----------------------------
-_tokenizer = None
-_model = None
-
-def load_model(model_id: str = DEFAULT_MODEL):
-    global _tokenizer, _model
-    if _tokenizer is not None and _model is not None:
-        return _tokenizer, _model
-
-    auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and USE_AUTH_TOKEN.strip()) else None
-
-    # IMPORTANT:
-    # - trust_remote_code=True so custom tokenizer/model classes from the repo are used.
-    # - use_fast=False to avoid tokenizer.json schema mismatches; many custom repos only ship a slow tokenizer.
-    _tokenizer = AutoTokenizer.from_pretrained(
-        model_id,
-        use_auth_token=auth,
-        trust_remote_code=True,
-        use_fast=False,
-    )
-
-    _model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map=DEVICE_MAP,
-        use_auth_token=auth,
-        trust_remote_code=True,
-    )
-
-    if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
-        _tokenizer.pad_token_id = _tokenizer.eos_token_id
-
-    # Prefer a static cache; and we will pass use_cache=False at generation to avoid DynamicCache issues
-    try:
-        _model.generation_config.cache_implementation = "static"
-    except Exception:
-        pass
-
-    return _tokenizer, _model
-
-# -----------------------------
-# Prompting via chat template
-# -----------------------------
-# If the repo doesn't ship a chat template, we inject a Phi-3.5-instruct style template.
 PHI3_TEMPLATE = """{% for message in messages -%}
 {% if message['role'] == 'system' -%}
 <|system|>
@@ -124,6 +63,14 @@ PHI3_TEMPLATE = """{% for message in messages -%}
 <|assistant|>
 """
 
+def build_messages(question: str, context: str):
+    system = APPENDIX_RULES
+    user = f"""Client: {question.strip()} Answer based on the context.
+
+Context:
+{context.strip()}"""
+    return [{"role":"system","content":system},{"role":"user","content":user}]
+
 def ensure_chat_template(tok):
     try:
         tmpl = tok.chat_template
@@ -135,20 +82,54 @@ def ensure_chat_template(tok):
 def encode_messages(tokenizer, messages: list):
     ensure_chat_template(tokenizer)
     return tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_tensors="pt"
+        messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
+    )
+
+_tokenizer = None
+_model = None
+
+def load_tokenizer_robust(model_id: str, auth):
+    try:
+        return AutoTokenizer.from_pretrained(model_id, use_auth_token=auth, trust_remote_code=False, use_fast=False)
+    except Exception as e1:
+        last_err = e1
+    try:
+        return LlamaTokenizer.from_pretrained(model_id, use_auth_token=auth)
+    except Exception as e2:
+        last_err = e2
+    try:
+        return AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", use_auth_token=auth, trust_remote_code=False, use_fast=False)
+    except Exception as e3:
+        raise last_err
+
+def load_model(model_id: str = DEFAULT_MODEL):
+    global _tokenizer, _model
+    if _tokenizer is not None and _model is not None:
+        return _tokenizer, _model
+
+    auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and USE_AUTH_TOKEN.strip()) else None
+
+    _tokenizer = load_tokenizer_robust(model_id, auth)
+    if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
+        _tokenizer.pad_token_id = _tokenizer.eos_token_id
+
+    _model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map=DEVICE_MAP,
+        use_auth_token=auth,
+        trust_remote_code=True,
     )
+    try:
+        _model.generation_config.cache_implementation = "static"
+    except Exception:
+        pass
+    return _tokenizer, _model
 
-# -----------------------------
-# Generation
-# -----------------------------
 def generate_text(question: str, context: str, temperature: float, top_p: float, max_new_tokens: int, model_id: str):
     tokenizer, model = load_model(model_id)
     messages = build_messages(question, context)
     inputs = encode_messages(tokenizer, messages).to(model.device)
-
     with torch.no_grad():
         output_ids = model.generate(
             inputs,
@@ -157,11 +138,10 @@ def generate_text(question: str, context: str, temperature: float, top_p: float,
             top_p=top_p,
             max_new_tokens=max_new_tokens,
             pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
-            use_cache=False,  # critical for compatibility with some remote-code cache implementations
+            use_cache=False,
         )
     text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-    # Extract the last "Analysis:" + "Response:" sections
     analysis, response = "", ""
     a_idx = text.rfind("Analysis:")
     r_idx = text.rfind("Response:")
@@ -175,20 +155,11 @@
         response = text.strip()
     return analysis, response, text
 
-# -----------------------------
-# UI
-# -----------------------------
 PRESET_Q = "What are the health effects of coffee? Answer based on the context."
-PRESET_CTX = (
-    "Coffee contains caffeine, which can increase alertness. Excess intake may cause "
-    "jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
-)
+PRESET_CTX = "Coffee contains caffeine, which can increase alertness. Excess intake may cause jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
 
 with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
-    gr.Markdown(
-        "# Exoskeleton Reasoning — Appendix-Style Prompt\n"
-        "The model must **prioritize the provided context**, and reply in plain text with two sections: **Analysis** and **Response**."
-    )
+    gr.Markdown("# Exoskeleton Reasoning — Appendix-Style Prompt\nThe model must **prioritize the provided context**, and reply in plain text with two sections: **Analysis** and **Response**.")
     with gr.Row():
         with gr.Column(scale=3):
             q = gr.Textbox(label="Client question", value=PRESET_Q, lines=4)
@@ -200,9 +171,7 @@ with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
             max_new = gr.Slider(64, 1024, value=MAX_NEW_TOKENS, step=16, label="Max new tokens")
            model_id = gr.Textbox(label="Model ID", value=DEFAULT_MODEL)
             run = gr.Button("Run", variant="primary")
-            gr.Markdown(
-                'Secrets/vars: set **HF_TOKEN** if the model is gated · Override `EXOSKELETON_MODEL_ID` to change default.'
-            )
+            gr.Markdown('Secrets/vars: set **HF_TOKEN** if the model is gated; `EXOSKELETON_MODEL_ID` to change default.')
         with gr.Column(scale=4):
             with gr.Accordion("Analysis", open=True):
                 analysis_box = gr.Textbox(lines=6, label="Analysis (model)")
@@ -210,16 +179,13 @@ with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
                 response_box = gr.Textbox(lines=6, label="Response (model)")
             with gr.Accordion("Raw output", open=False):
                 raw_box = gr.Textbox(lines=8, label="Raw text")
-
     def infer_fn(question, context, temperature, top_p, max_new_tokens, model_id):
         if not question.strip() or not context.strip():
             gr.Warning("Please provide both a Client question and Context.")
             return "", "", ""
        a, r, raw = generate_text(question, context, temperature, top_p, max_new_tokens, model_id)
         return a, r, raw
-
-    run.click(fn=infer_fn, inputs=[q, ctx, temp, topp, max_new, model_id],
-              outputs=[analysis_box, response_box, raw_box])
+    run.click(fn=infer_fn, inputs=[q, ctx, temp, topp, max_new, model_id], outputs=[analysis_box, response_box, raw_box])
 
 if __name__ == "__main__":
     demo.launch()
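For reference, a minimal smoke-test sketch (not part of the commit) that calls the refactored generation path directly, using the presets defined in `app.py`; it assumes the pinned dependencies are installed and that the model fits on the local device (the first call downloads the weights).

```python
# Sketch: exercise load_tokenizer_robust/load_model indirectly via generate_text,
# bypassing the Gradio UI. Importing app builds the Blocks layout but does not
# launch a server, since demo.launch() is guarded by __main__.
from app import DEFAULT_MODEL, PRESET_CTX, PRESET_Q, generate_text

analysis, response, raw = generate_text(
    question=PRESET_Q,       # coffee preset question
    context=PRESET_CTX,      # coffee preset context
    temperature=0.3,
    top_p=0.95,
    max_new_tokens=128,
    model_id=DEFAULT_MODEL,  # honors EXOSKELETON_MODEL_ID if set
)
print("Analysis:", analysis)
print("Response:", response)
```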