jeanbaptdzd committed on
Commit
f372eea
Β·
1 Parent(s): 6541672

Add EOS token fix verification test

Browse files
Files changed (2) hide show
  1. test_eos_fix.py +148 -0
  2. test_french_finance.py +128 -0
test_eos_fix.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test that the EOS token fix is working properly
4
+ Verify: no regressions, better completion, proper finish_reason
5
+ """
6
+ import httpx
7
+ import json
8
+ import time
9
+
10
+ BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
11
+
12
def check_space_status():
    """Return True if the Space is up and serving the Transformers backend.

    Best-effort health probe against the Space root endpoint: any network,
    HTTP, or JSON-decoding failure is treated as "not ready" rather than
    raised, because callers only need a boolean readiness signal.
    """
    try:
        response = httpx.get(f"{BASE_URL}/", timeout=10.0)
        data = response.json()
        return data.get("status") == "ok" and data.get("backend") == "Transformers"
    except (httpx.HTTPError, ValueError):
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit. httpx.HTTPError covers transport/timeout failures;
        # ValueError covers JSON decode errors from response.json().
        return False
20
+
21
+ print("="*80)
22
+ print("TESTING EOS TOKEN FIX")
23
+ print("="*80)
24
+
25
+ if not check_space_status():
26
+ print("❌ Space not ready. Please wait for rebuild.")
27
+ exit(1)
28
+
29
+ print("βœ… Space is ready\n")
30
+
31
+ # Test 1: Check finish_reason is accurate
32
+ print("[TEST 1] Verify finish_reason accuracy")
33
+ print("-" * 80)
34
+
35
+ response = httpx.post(
36
+ f"{BASE_URL}/v1/chat/completions",
37
+ json={
38
+ "model": "DragonLLM/qwen3-8b-fin-v1.0",
39
+ "messages": [{"role": "user", "content": "What is 2+2? Answer in 5 words."}],
40
+ "max_tokens": 50,
41
+ "temperature": 0.3
42
+ },
43
+ timeout=60.0
44
+ )
45
+
46
+ data = response.json()
47
+ finish = data["choices"][0]["finish_reason"]
48
+ content = data["choices"][0]["message"]["content"]
49
+ tokens = data.get("usage", {}).get("completion_tokens", 0)
50
+
51
+ print(f"Max tokens: 50")
52
+ print(f"Generated: {tokens} tokens")
53
+ print(f"Finish reason: {finish}")
54
+ print(f"Response: {content[:150]}...")
55
+
56
+ if finish == "stop" and tokens < 50:
57
+ print("βœ… PASS: Stopped naturally with EOS token (not length limit)")
58
+ elif finish == "length" and tokens >= 50:
59
+ print("βœ… PASS: Correctly detected length limit")
60
+ else:
61
+ print(f"⚠️ Unexpected: finish={finish}, tokens={tokens}")
62
+
63
+ # Test 2: Check complete French answer
64
+ print("\n[TEST 2] Complete French answer")
65
+ print("-" * 80)
66
+
67
+ response = httpx.post(
68
+ f"{BASE_URL}/v1/chat/completions",
69
+ json={
70
+ "model": "DragonLLM/qwen3-8b-fin-v1.0",
71
+ "messages": [{"role": "user", "content": "Qu'est-ce qu'une obligation? Soyez concis."}],
72
+ "max_tokens": 300,
73
+ "temperature": 0.3
74
+ },
75
+ timeout=60.0
76
+ )
77
+
78
+ data = response.json()
79
+ content = data["choices"][0]["message"]["content"]
80
+ finish = data["choices"][0]["finish_reason"]
81
+ tokens = data.get("usage", {}).get("completion_tokens", 0)
82
+
83
+ # Extract answer
84
+ if "</think>" in content:
85
+ answer = content.split("</think>")[1].strip()
86
+ else:
87
+ answer = content
88
+
89
+ print(f"Generated: {tokens} tokens")
90
+ print(f"Finish reason: {finish}")
91
+ print(f"\nFull answer:\n{answer}\n")
92
+
93
+ # Check completeness
94
+ ends_properly = answer.rstrip().endswith((".", "!", "?", ")", "]"))
95
+ has_french = any(c in answer for c in ["Γ©", "Γ¨", "Γ ", "Γ§"])
96
+
97
+ print(f"Ends properly: {ends_properly}")
98
+ print(f"Is French: {has_french}")
99
+ print(f"Finish: {finish}")
100
+
101
+ if ends_properly and finish == "stop" and has_french:
102
+ print("βœ… PASS: Complete French answer with proper EOS")
103
+ else:
104
+ print(f"⚠️ Check: ends={ends_properly}, finish={finish}, french={has_french}")
105
+
106
+ # Test 3: Long answer completeness
107
+ print("\n[TEST 3] Long answer completeness")
108
+ print("-" * 80)
109
+
110
+ response = httpx.post(
111
+ f"{BASE_URL}/v1/chat/completions",
112
+ json={
113
+ "model": "DragonLLM/qwen3-8b-fin-v1.0",
114
+ "messages": [{"role": "user", "content": "Expliquez en dΓ©tail le nantissement de compte-titres."}],
115
+ "temperature": 0.3
116
+ # Use default max_tokens (1500)
117
+ },
118
+ timeout=90.0
119
+ )
120
+
121
+ data = response.json()
122
+ content = data["choices"][0]["message"]["content"]
123
+ finish = data["choices"][0]["finish_reason"]
124
+ tokens = data.get("usage", {}).get("completion_tokens", 0)
125
+
126
+ if "</think>" in content:
127
+ answer = content.split("</think>")[1].strip()
128
+ else:
129
+ answer = content
130
+
131
+ print(f"Generated: {tokens} tokens (default max: 1500)")
132
+ print(f"Finish reason: {finish}")
133
+ print(f"Answer length: {len(answer)} chars")
134
+ print(f"Last 150 chars: ...{answer[-150:]}")
135
+
136
+ if finish == "stop":
137
+ print("βœ… PASS: Model stopped naturally at EOS (complete answer)")
138
+ elif finish == "length":
139
+ print(f"⚠️ Hit token limit - may need higher max_tokens for complex questions")
140
+ else:
141
+ print(f"❌ Unexpected finish_reason: {finish}")
142
+
143
+ print("\n" + "="*80)
144
+ print("SUMMARY")
145
+ print("="*80)
146
+ print("If all tests show 'stop' finish_reason and proper sentence endings,")
147
+ print("the EOS token fix is working correctly!")
148
+
test_french_finance.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test French finance queries against the OpenAI-compatible API.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import asyncio
9
+ import httpx
10
+ from typing import Dict, Any
11
+
12
+ # Default API URL (can be overridden with API_URL env var)
13
+ API_URL = os.getenv("API_URL", "http://localhost:7860/v1")
14
+ API_KEY = os.getenv("SERVICE_API_KEY")
15
+
16
+ # French finance test questions
17
# French finance test cases, defined as (name, question, max_tokens) rows
# and expanded into the dict shape the query runner expects.
FRENCH_QUESTS = [
    {"name": name, "question": question, "max_tokens": max_tokens}
    for name, question, max_tokens in (
        ("Obligations", "Qu'est-ce qu'une obligation?", 400),
        ("SICAV", "Qu'est-ce qu'une SICAV?", 400),
        ("CAC 40", "Expliquez le CAC 40", 500),
        ("VaR", "Qu'est-ce que la Value at Risk (VaR) et comment la calcule-t-on?", 600),
    )
]
39
+
40
+
41
+ async def test_french_query(client: httpx.AsyncClient, test: Dict[str, Any]) -> Dict[str, Any]:
42
+ """Test a single French finance query."""
43
+ headers = {"Content-Type": "application/json"}
44
+ if API_KEY:
45
+ headers["x-api-key"] = API_KEY
46
+
47
+ payload = {
48
+ "model": "DragonLLM/qwen3-8b-fin-v1.0",
49
+ "messages": [{"role": "user", "content": test["question"]}],
50
+ "temperature": 0.7,
51
+ "max_tokens": test["max_tokens"],
52
+ }
53
+
54
+ try:
55
+ response = await client.post(
56
+ f"{API_URL}/chat/completions",
57
+ json=payload,
58
+ headers=headers,
59
+ timeout=120.0,
60
+ )
61
+ response.raise_for_status()
62
+ data = response.json()
63
+
64
+ return {
65
+ "name": test["name"],
66
+ "success": True,
67
+ "question": test["question"],
68
+ "answer": data["choices"][0]["message"]["content"],
69
+ "finish_reason": data["choices"][0]["finish_reason"],
70
+ "tokens": data["usage"]["completion_tokens"],
71
+ "total_tokens": data["usage"]["total_tokens"],
72
+ }
73
+ except Exception as e:
74
+ return {
75
+ "name": test["name"],
76
+ "success": False,
77
+ "question": test["question"],
78
+ "error": str(e),
79
+ }
80
+
81
+
82
async def main():
    """Execute every French finance test case and print a pass/fail summary.

    Returns a process exit code: 0 when all cases pass, 1 otherwise.
    """
    banner = "=" * 70
    print(banner)
    print("French Finance Test Suite")
    print(banner)
    print(f"API URL: {API_URL}")
    print()

    results = []
    async with httpx.AsyncClient() as client:
        total = len(FRENCH_QUESTS)
        for index, case in enumerate(FRENCH_QUESTS, 1):
            print(f"[{index}/{total}] Testing: {case['name']}")
            print(f" Question: {case['question']}")
            outcome = await test_french_query(client, case)
            results.append(outcome)

            if not outcome["success"]:
                print(f" βœ— Failed: {outcome['error']}")
            else:
                answer = outcome["answer"]
                preview = answer[:150] + "..." if len(answer) > 150 else answer
                print(" βœ“ Success")
                print(f" Finish reason: {outcome['finish_reason']}")
                print(f" Tokens: {outcome['tokens']}")
                print(f" Answer preview: {preview}")
            print()

    # Summary
    print(banner)
    print("Summary")
    print(banner)
    passed = sum(1 for r in results if r["success"])
    print(f"Passed: {passed}/{len(results)}")

    if passed == len(results):
        print("βœ“ All tests passed!")
        return 0

    print("βœ— Some tests failed")
    for r in results:
        if not r["success"]:
            print(f" - {r['name']}: {r['error']}")
    return 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))