Markus Clauss DIRU Vetsuisse Claude committed
Commit ed1e41a · 1 Parent(s): e055772

Switch to CPU-only version for stable persistent model


- Remove all @spaces.GPU decorators
- Remove ensure_model_loaded helper (not needed)
- Simplify all functions - model stays persistent in memory
- Model loads once on startup and remains available
- All features work on CPU (just slower than GPU)
- Fixes all "Model not loaded" errors permanently

Benefits:
- Free forever (CPU Basic tier)
- Model persists across all function calls
- No ZeroGPU isolation issues
- Simpler, more stable code

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
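
The pattern this commit adopts is simple: load the model once at startup into module-level globals, and let every Gradio handler read those globals directly. Without `@spaces.GPU`, handlers run in the same process as the startup code, so the globals survive between calls. A minimal sketch of that pattern, simplified from app.py (the real file wires these handlers into a Gradio UI and adds many more analysis functions):

```python
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "swiss-ai/Apertus-8B-Instruct-2509"
model = None
tokenizer = None

def load_model():
    """Load once at startup; the model then stays resident in memory."""
    global model, tokenizer
    token = os.environ.get("HF_TOKEN")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=token,
        torch_dtype=torch.float32,   # CPU-only: stick to float32
        device_map="cpu",
        low_cpu_mem_usage=True,
    )

def chat(message, max_tokens=300):
    # No @spaces.GPU decorator, so this runs in the same process that
    # called load_model() -- the globals are still populated.
    if model is None or tokenizer is None:
        return "❌ Model not loaded. Please wait for initialization or refresh the page."
    inputs = tokenizer(message, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

load_model()  # runs once when the Space starts; never reloaded per call
```

The trade-off is speed: an 8B model on the free CPU Basic tier generates slowly, but nothing is ever unloaded between calls, which is what eliminates the "Model not loaded" failures.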

Files changed (1): app.py (+17, -72)

app.py CHANGED
@@ -14,7 +14,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 import warnings
 import os
 import time # For timing measurements
-import spaces
+# import spaces # Disabled - CPU-only version for persistent model
 
 # Advanced ML components (2024 State-of-the-Art)
 try:
@@ -54,37 +54,6 @@ model_loaded = False
 HF_TOKEN = os.environ.get('HF_TOKEN', None)
 print(f"🔐 HF_TOKEN available: {bool(HF_TOKEN)}")
 
-def ensure_model_loaded():
-    """Helper function to ensure model is loaded for ZeroGPU"""
-    global model, tokenizer
-
-    if model is None or tokenizer is None:
-        hf_token = HF_TOKEN
-        if not hf_token:
-            return False, "❌ No HuggingFace token found. Please set HF_TOKEN environment variable."
-
-        model_name = "swiss-ai/Apertus-8B-Instruct-2509"
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
-
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                token=hf_token,
-                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto" if torch.cuda.is_available() else "cpu",
-                low_cpu_mem_usage=True,
-                output_attentions=True,
-                output_hidden_states=True,
-                trust_remote_code=True
-            )
-            return True, "✅ Model loaded"
-        except Exception as e:
-            return False, f"❌ Failed to load model: {str(e)}"
-    return True, "✅ Model ready"
-
-@spaces.GPU(duration=120, enable_queue=True)
 def load_model():
     """Load Apertus model with HuggingFace token from environment"""
     global model, tokenizer, model_loaded
@@ -174,15 +143,12 @@ def load_model():
         print(f"📋 Full traceback:\n{traceback.format_exc()}")
         return f"❌ Failed to load model: {str(e)}\n💡 Check your token and model access permissions."
 
-@spaces.GPU(duration=60, enable_queue=True)
 def chat_with_apertus(message, max_tokens=300):
     """Simple chat function"""
     global model, tokenizer
 
-    # Ensure model is loaded
-    success, msg = ensure_model_loaded()
-    if not success:
-        return msg
+    if model is None or tokenizer is None:
+        return "❌ Model not loaded. Please wait for initialization or refresh the page."
 
     try:
         formatted_prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
@@ -221,15 +187,12 @@ You are Apertus, a helpful Swiss AI assistant. You are transparent, multilingual
     except Exception as e:
         return f"❌ Error: {str(e)}"
 
-@spaces.GPU(duration=30)
 def analyze_attention(text, layer=15):
     """Analyze attention patterns"""
     global model, tokenizer
 
-    # Ensure model is loaded for ZeroGPU
-    success, msg = ensure_model_loaded()
-    if not success:
-        return None, msg
+    if model is None or tokenizer is None:
+        return None, "❌ Model not loaded. Please wait for initialization or refresh the page."
 
     try:
         inputs = tokenizer(text, return_tensors="pt")
@@ -278,15 +241,12 @@ def analyze_attention(text, layer=15):
     except Exception as e:
         return None, f"❌ Error analyzing attention: {str(e)}"
 
-@spaces.GPU(duration=30)
 def analyze_token_predictions(text):
     """Analyze next token predictions"""
     global model, tokenizer
 
-    # Ensure model is loaded for ZeroGPU
-    success, msg = ensure_model_loaded()
-    if not success:
-        return None, msg
+    if model is None or tokenizer is None:
+        return None, "❌ Model not loaded. Please wait for initialization or refresh the page."
 
     try:
         inputs = tokenizer(text, return_tensors="pt")
@@ -334,15 +294,12 @@ def analyze_token_predictions(text):
     except Exception as e:
         return None, f"❌ Error analyzing predictions: {str(e)}"
 
-@spaces.GPU(duration=30)
 def analyze_layer_evolution(text):
     """Analyze how representations evolve through layers"""
     global model, tokenizer
 
-    # Ensure model is loaded for ZeroGPU
-    success, msg = ensure_model_loaded()
-    if not success:
-        return None, msg
+    if model is None or tokenizer is None:
+        return None, "❌ Model not loaded. Please wait for initialization or refresh the page."
 
     try:
         inputs = tokenizer(text, return_tensors="pt")
@@ -405,15 +362,12 @@ def analyze_layer_evolution(text):
     except Exception as e:
         return None, f"❌ Error analyzing layer evolution: {str(e)}"
 
-@spaces.GPU(duration=30)
 def analyze_weights(layer_num, layer_type):
     """Analyze weight distribution with research-based metrics"""
     global model
 
-    # Ensure model is loaded for ZeroGPU
-    success, msg = ensure_model_loaded()
-    if not success:
-        return None, msg
+    if model is None:
+        return None, "❌ Model not loaded. Please wait for initialization or refresh the page."
 
     try:
         selected_layer = f"model.layers.{layer_num}.{layer_type}"
@@ -856,15 +810,12 @@ def goldfish_loss_function(logits, targets, k=0.1, temperature=1.0):
     else:
         return masked_loss.sum()
 
-@spaces.GPU(duration=30)
 def analyze_memorization_patterns(text, k_values=[0.0, 0.1, 0.2, 0.3]):
     """Analyze how Goldfish Loss affects memorization"""
     global model, tokenizer
 
-    # Ensure model is loaded for ZeroGPU
-    success, msg = ensure_model_loaded()
-    if not success:
-        return None, msg
+    if model is None or tokenizer is None:
+        return None, "❌ Model not loaded. Please wait for initialization or refresh the page."
 
     try:
         inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
@@ -1163,15 +1114,12 @@ def simulate_optimizer_comparison(baseline_loss, num_steps):
 # 🧠 DECISION PROCESS & GERMAN LANGUAGE ANALYSIS
 # =============================================================================
 
-@spaces.GPU(duration=30)
 def analyze_decision_process(text, max_steps=10):
     """Step-by-step decision process like CLI script"""
     global model, tokenizer
 
-    # Ensure model is loaded for ZeroGPU
-    success, msg = ensure_model_loaded()
-    if not success:
-        return None, msg
+    if model is None or tokenizer is None:
+        return None, "❌ Model not loaded. Please wait for initialization or refresh the page."
 
     try:
         inputs = tokenizer(text, return_tensors="pt", max_length=256, truncation=True)
@@ -1299,15 +1247,12 @@ def analyze_decision_process(text, max_steps=10):
     except Exception as e:
         return None, f"❌ Error analyzing decision process: {str(e)}"
 
-@spaces.GPU(duration=30)
 def analyze_german_compounds(text_input=""):
     """Analyze German compound words with multi-tokenizer comparison"""
    global model, tokenizer
 
-    # Ensure model is loaded for ZeroGPU
-    success, msg = ensure_model_loaded()
-    if not success:
-        return None, msg
+    if model is None or tokenizer is None:
+        return None, "❌ Model not loaded. Please wait for initialization or refresh the page."
 
     # Swiss/German compound examples if no input
     if not text_input.strip():
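
One detail of the new version worth noting: the identical `if model is None` guard now opens every analysis function. That repetition is the price of the simpler design. A hypothetical decorator, not part of this commit, could factor it out for the functions that return `(data, message)` pairs (`chat_with_apertus` returns a bare string, so it would need its own variant):

```python
from functools import wraps

def requires_model(fn):
    """Reject calls until load_model() has populated the module globals."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        if model is None or tokenizer is None:
            return None, "❌ Model not loaded. Please wait for initialization or refresh the page."
        return fn(*args, **kwargs)
    return wrapper

@requires_model
def analyze_attention(text, layer=15):
    """Analyze attention patterns (guard handled by the decorator)."""
    ...
```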