Spaces:

jeanbaptdzd
/

open-finance-llm-8b

Paused

App Files Files Community

jeanbaptdzd committed on Nov 2

Commit

f372eea

1 Parent(s): 6541672

Add EOS token fix verification test

Browse files

Files changed (2) hide show

test_eos_fix.py +148 -0
test_french_finance.py +128 -0

test_eos_fix.py ADDED Viewed

	@@ -0,0 +1,148 @@

+#!/usr/bin/env python3
+"""
+Test that the EOS token fix is working properly
+Verify: no regressions, better completion, proper finish_reason
+"""
+import httpx
+import json
+import time
+BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"  # hard-coded base URL of the deployed Space that every request in this script targets
+def check_space_status():
+    """Check if Space is running"""
+    try:
+        response = httpx.get(f"{BASE_URL}/", timeout=10.0)
+        data = response.json()
+        return data.get("status") == "ok" and data.get("backend") == "Transformers"
+    except:
+        return False
+print("="*80)
+print("TESTING EOS TOKEN FIX")
+print("="*80)
+if not check_space_status():
+    print("❌ Space not ready. Please wait for rebuild.")
+    exit(1)
+print("✅ Space is ready\n")
+# Test 1: Check finish_reason is accurate
+print("[TEST 1] Verify finish_reason accuracy")
+print("-" * 80)
+response = httpx.post(
+    f"{BASE_URL}/v1/chat/completions",
+    json={
+        "model": "DragonLLM/qwen3-8b-fin-v1.0",
+        "messages": [{"role": "user", "content": "What is 2+2? Answer in 5 words."}],
+        "max_tokens": 50,
+        "temperature": 0.3
+    },
+    timeout=60.0
+)
+data = response.json()
+finish = data["choices"][0]["finish_reason"]
+content = data["choices"][0]["message"]["content"]
+tokens = data.get("usage", {}).get("completion_tokens", 0)
+print(f"Max tokens: 50")
+print(f"Generated: {tokens} tokens")
+print(f"Finish reason: {finish}")
+print(f"Response: {content[:150]}...")
+if finish == "stop" and tokens < 50:
+    print("✅ PASS: Stopped naturally with EOS token (not length limit)")
+elif finish == "length" and tokens >= 50:
+    print("✅ PASS: Correctly detected length limit")
+else:
+    print(f"⚠️  Unexpected: finish={finish}, tokens={tokens}")
+# Test 2: Check complete French answer
+print("\n[TEST 2] Complete French answer")
+print("-" * 80)
+response = httpx.post(
+    f"{BASE_URL}/v1/chat/completions",
+    json={
+        "model": "DragonLLM/qwen3-8b-fin-v1.0",
+        "messages": [{"role": "user", "content": "Qu'est-ce qu'une obligation? Soyez concis."}],
+        "max_tokens": 300,
+        "temperature": 0.3
+    },
+    timeout=60.0
+)
+data = response.json()
+content = data["choices"][0]["message"]["content"]
+finish = data["choices"][0]["finish_reason"]
+tokens = data.get("usage", {}).get("completion_tokens", 0)
+# Extract answer
+if "</think>" in content:
+    answer = content.split("</think>")[1].strip()
+else:
+    answer = content
+print(f"Generated: {tokens} tokens")
+print(f"Finish reason: {finish}")
+print(f"\nFull answer:\n{answer}\n")
+# Check completeness
+ends_properly = answer.rstrip().endswith((".", "!", "?", ")", "]"))
+has_french = any(c in answer for c in ["é", "è", "à", "ç"])
+print(f"Ends properly: {ends_properly}")
+print(f"Is French: {has_french}")
+print(f"Finish: {finish}")
+if ends_properly and finish == "stop" and has_french:
+    print("✅ PASS: Complete French answer with proper EOS")
+else:
+    print(f"⚠️  Check: ends={ends_properly}, finish={finish}, french={has_french}")
+# Test 3: Long answer completeness
+print("\n[TEST 3] Long answer completeness")
+print("-" * 80)
+response = httpx.post(
+    f"{BASE_URL}/v1/chat/completions",
+    json={
+        "model": "DragonLLM/qwen3-8b-fin-v1.0",
+        "messages": [{"role": "user", "content": "Expliquez en détail le nantissement de compte-titres."}],
+        "temperature": 0.3
+        # Use default max_tokens (1500)
+    },
+    timeout=90.0
+)
+data = response.json()
+content = data["choices"][0]["message"]["content"]
+finish = data["choices"][0]["finish_reason"]
+tokens = data.get("usage", {}).get("completion_tokens", 0)
+if "</think>" in content:
+    answer = content.split("</think>")[1].strip()
+else:
+    answer = content
+print(f"Generated: {tokens} tokens (default max: 1500)")
+print(f"Finish reason: {finish}")
+print(f"Answer length: {len(answer)} chars")
+print(f"Last 150 chars: ...{answer[-150:]}")
+if finish == "stop":
+    print("✅ PASS: Model stopped naturally at EOS (complete answer)")
+elif finish == "length":
+    print(f"⚠️  Hit token limit - may need higher max_tokens for complex questions")
+else:
+    print(f"❌ Unexpected finish_reason: {finish}")
+print("\n" + "="*80)
+print("SUMMARY")
+print("="*80)
+print("If all tests show 'stop' finish_reason and proper sentence endings,")
+print("the EOS token fix is working correctly!")

test_french_finance.py ADDED Viewed

	@@ -0,0 +1,128 @@

+#!/usr/bin/env python3
+"""
+Test French finance queries against the OpenAI-compatible API.
+"""
+import os
+import sys
+import asyncio
+import httpx
+from typing import Dict, Any
+# Base URL of the OpenAI-compatible API; defaults to a local server and can be overridden with the API_URL env var
+API_URL = os.getenv("API_URL", "http://localhost:7860/v1")
+API_KEY = os.getenv("SERVICE_API_KEY")  # None when unset; the x-api-key header is only sent when this is truthy
+# French finance test cases: each entry has a display name, the prompt text, and a per-request max_tokens budget
+FRENCH_QUESTS = [
+    {
+        "name": "Obligations",
+        "question": "Qu'est-ce qu'une obligation?",
+        "max_tokens": 400,
+    },
+    {
+        "name": "SICAV",
+        "question": "Qu'est-ce qu'une SICAV?",
+        "max_tokens": 400,
+    },
+    {
+        "name": "CAC 40",
+        "question": "Expliquez le CAC 40",
+        "max_tokens": 500,
+    },
+    {
+        "name": "VaR",
+        "question": "Qu'est-ce que la Value at Risk (VaR) et comment la calcule-t-on?",
+        "max_tokens": 600,
+    },
+]
+async def test_french_query(client: httpx.AsyncClient, test: Dict[str, Any]) -> Dict[str, Any]:
+    """Test a single French finance query."""
+    headers = {"Content-Type": "application/json"}
+    if API_KEY:
+        headers["x-api-key"] = API_KEY
+    payload = {
+        "model": "DragonLLM/qwen3-8b-fin-v1.0",
+        "messages": [{"role": "user", "content": test["question"]}],
+        "temperature": 0.7,
+        "max_tokens": test["max_tokens"],
+    }
+    try:
+        response = await client.post(
+            f"{API_URL}/chat/completions",
+            json=payload,
+            headers=headers,
+            timeout=120.0,
+        )
+        response.raise_for_status()
+        data = response.json()
+        return {
+            "name": test["name"],
+            "success": True,
+            "question": test["question"],
+            "answer": data["choices"][0]["message"]["content"],
+            "finish_reason": data["choices"][0]["finish_reason"],
+            "tokens": data["usage"]["completion_tokens"],
+            "total_tokens": data["usage"]["total_tokens"],
+        }
+    except Exception as e:
+        return {
+            "name": test["name"],
+            "success": False,
+            "question": test["question"],
+            "error": str(e),
+        }
+async def main():
+    """Run all French finance tests."""
+    print("=" * 70)
+    print("French Finance Test Suite")
+    print("=" * 70)
+    print(f"API URL: {API_URL}")
+    print()
+    async with httpx.AsyncClient() as client:
+        results = []
+        for i, test in enumerate(FRENCH_QUESTS, 1):
+            print(f"[{i}/{len(FRENCH_QUESTS)}] Testing: {test['name']}")
+            print(f"  Question: {test['question']}")
+            result = await test_french_query(client, test)
+            results.append(result)
+            if result["success"]:
+                answer_preview = result["answer"][:150] + "..." if len(result["answer"]) > 150 else result["answer"]
+                print(f"  ✓ Success")
+                print(f"  Finish reason: {result['finish_reason']}")
+                print(f"  Tokens: {result['tokens']}")
+                print(f"  Answer preview: {answer_preview}")
+            else:
+                print(f"  ✗ Failed: {result['error']}")
+            print()
+    # Summary
+    print("=" * 70)
+    print("Summary")
+    print("=" * 70)
+    passed = sum(1 for r in results if r["success"])
+    print(f"Passed: {passed}/{len(results)}")
+    if passed == len(results):
+        print("✓ All tests passed!")
+        return 0
+    else:
+        print("✗ Some tests failed")
+        for r in results:
+            if not r["success"]:
+                print(f"  - {r['name']}: {r['error']}")
+        return 1
+if __name__ == "__main__":
+    sys.exit(asyncio.run(main()))