#!/usr/bin/env python3
"""Smoke-test the HF Space after the context-window expansion (128h -> 2160h).

Requests a 7-day smoke-test forecast from the Space, loads the returned
parquet file and checks the AT_DE median forecast for length, missing values,
variation and diversity.  VRAM usage is NOT measured by this script; it only
reminds the operator to check it in the HF Space logs.

Requires the HF_TOKEN environment variable.  The process exits with status 1
as soon as the module is loaded when the variable is missing.
"""
import os
import sys
import traceback
from pathlib import Path

import numpy as np
import polars as pl
from gradio_client import Client

# Get HF token from environment
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("[ERROR] HF_TOKEN environment variable not set")
    sys.exit(1)

SPACE_ID = "evgueni-p/fbmc-chronos2"
EXPECTED_HOURS = 168  # 7 days of hourly values
MIN_STD_MW = 1.0  # below this the forecast is considered (nearly) flat
MIN_UNIQUE_VALUES = 50  # below this the forecast is considered low-diversity
BANNER_WIDTH = 80


def _print_banner(title, *, leading_blank=False):
    """Print *title* framed by two '=' rules, optionally after a blank line."""
    if leading_blank:
        print()
    print("=" * BANNER_WIDTH)
    print(title)
    print("=" * BANNER_WIDTH)


def _print_hour_preview(values, hours=48, per_row=12):
    """Print the first *hours* entries of *values*, *per_row* per line."""
    preview = values[:hours]
    for start in range(0, len(preview), per_row):
        row = preview[start:start + per_row]
        cells = "".join(f"{value:7.0f} " for value in row)
        # Only a full row is terminated here; a partial last row is closed
        # by the trailing print() below.
        line_end = "\n" if len(row) == per_row else ""
        print(f" Hour {start:3d}-{start + per_row - 1:3d}: {cells}", end=line_end)
    print()


def test_hf_space_smoke() -> bool:
    """Run the smoke forecast on the HF Space and validate the result.

    Returns:
        True when the length, variation and diversity checks all pass.
        False when the API call fails, the result file is missing, the
        forecast has the wrong length, lacks the expected median column,
        contains missing/non-finite values, or any summary check fails.
    """
    _print_banner("HF SPACE SMOKE TEST: Context Window Expansion (128h -> 2160h)")

    # Initialize client
    print("\nConnecting to HF Space...")
    client = Client(SPACE_ID, hf_token=HF_TOKEN)
    print(f"[OK] Connected to {SPACE_ID}")

    # Test parameters
    run_date = "2024-09-30"
    test_border = "AT_DE"
    forecast_type = "smoke_test"  # 7 days, 1 border

    print("\nTest configuration:")
    print(f" Border: {test_border}")
    print(f" Run date: {run_date}")
    print(f" Forecast type: {forecast_type}")
    print(" Expected context: 2160 hours (90 days)")
    print(" Expected batch_size: 48")

    # Run forecast
    print("\nRunning forecast via API...")
    try:
        result = client.predict(
            run_date_str=run_date,
            forecast_type=forecast_type,
            api_name="/forecast_api",
        )
    except Exception as e:
        # Top-level boundary of the smoke test: report any failure and bail.
        print(f"[FAIL] API call failed: {e}")
        traceback.print_exc()
        return False
    else:
        print("[OK] Forecast completed")
        print(f" Result file: {result}")

    # Download and validate forecast
    print("\nValidating forecast results...")
    # Guard the type first: os.path.exists() raises TypeError on None/tuples,
    # which would crash the test instead of reporting a failure.
    if not isinstance(result, (str, os.PathLike)) or not os.path.exists(result):
        print(f"[FAIL] Forecast file not found: {result}")
        return False

    # Load forecast
    df = pl.read_parquet(result)
    print("[OK] Loaded forecast file")
    print(f" Shape: {df.shape}")
    print(f" Columns: {df.columns}")

    # Expected: 168 hours (7 days). Only the row count is verified here.
    expected_hours = EXPECTED_HOURS
    if len(df) != expected_hours:
        print("[FAIL] Forecast length mismatch:")
        print(f" Expected: {expected_hours} hours")
        print(f" Got: {len(df)} hours")
        return False
    print(f"[OK] Forecast length: {len(df)} hours (correct)")

    # Extract median forecast for AT_DE
    median_col = f"{test_border}_median"
    if median_col not in df.columns:
        print(f"[FAIL] Column {median_col} not found in forecast")
        return False

    median_forecast = df[median_col].to_numpy()

    # Missing values would turn every statistic below into NaN, and NaN
    # compares False against the thresholds, so fail explicitly instead.
    non_finite = int(
        np.count_nonzero(~np.isfinite(median_forecast.astype(np.float64)))
    )
    if non_finite:
        print(
            f"[FAIL] Forecast contains {non_finite} missing or non-finite values"
        )
        return False

    # Check variation statistics
    mean_val = np.mean(median_forecast)
    std_val = np.std(median_forecast)
    min_val = np.min(median_forecast)
    max_val = np.max(median_forecast)
    range_val = max_val - min_val
    unique_values = len(np.unique(median_forecast))

    print("\n[CHECK] Forecast statistics:")
    print(f" Mean: {mean_val:.2f} MW")
    print(f" Std Dev: {std_val:.2f} MW")
    print(f" Min: {min_val:.2f} MW")
    print(f" Max: {max_val:.2f} MW")
    print(f" Range: {range_val:.2f} MW")

    # Validation 1: Check for variation
    if std_val < MIN_STD_MW:
        print(f"\n[WARNING] Low variation detected (std={std_val:.4f} MW)")
        print(f" Unique values in forecast: {unique_values}/{len(median_forecast)}")
        if unique_values < 5:
            print(f"\n[FAIL] Forecast appears constant (only {unique_values} unique values)")
            print(f" First 24 values: {median_forecast[:24]}")
            return False
    else:
        print(f"\n[OK] Forecast shows variation (std={std_val:.2f} MW)")

    # Validation 2: Check unique values count
    print(f"\n[CHECK] Unique values: {unique_values}/{len(median_forecast)}")
    if unique_values < MIN_UNIQUE_VALUES:
        print(f"[WARNING] Low diversity (expected >{MIN_UNIQUE_VALUES} unique values)")
    else:
        print("[OK] Good diversity in forecast")

    # Validation 3: Check data type (informational only; never fails the test)
    dtype = df.schema[median_col]
    print(f"\n[CHECK] Data type: {dtype}")
    if "Int" in str(dtype):
        print("[OK] MW values converted to integers")
    else:
        print("[INFO] MW values still float (expected Int32)")

    # Display first 48 hours
    print("\n[CHECK] First 48 hours of median forecast:")
    _print_hour_preview(median_forecast)

    # Summary
    _print_banner("SMOKE TEST VALIDATION SUMMARY", leading_blank=True)

    # Check 1: Length. A mismatch already returned above, so it passes here.
    checks_passed = [f"Forecast length ({expected_hours} hours)"]
    checks_failed = []

    # Check 2: Variation
    if std_val >= MIN_STD_MW:
        checks_passed.append(f"Variation (std={std_val:.2f} MW)")
    else:
        checks_failed.append(f"Low variation (std={std_val:.4f} MW)")

    # Check 3: Diversity
    if unique_values >= MIN_UNIQUE_VALUES:
        checks_passed.append(f"Diversity ({unique_values} unique values)")
    else:
        checks_failed.append(f"Low diversity ({unique_values} unique values)")

    print(f"\n[PASSED] {len(checks_passed)} checks:")
    for check in checks_passed:
        print(f" + {check}")

    if checks_failed:
        print(f"\n[FAILED] {len(checks_failed)} checks:")
        for check in checks_failed:
            print(f" - {check}")

    # Overall result
    if checks_failed:
        _print_banner(
            "[PARTIAL SUCCESS] Some checks failed - investigate before full evaluation",
            leading_blank=True,
        )
        return False

    _print_banner(
        "[SUCCESS] ALL CHECKS PASSED - Ready for full 38-border evaluation",
        leading_blank=True,
    )
    print("\nNext steps:")
    print("1. Check HF Space logs for VRAM usage (should be ~76% = 36.6 GB / 48 GB)")
    print("2. Run full 38-border evaluation")
    print("3. Compare to Session 12 baseline (15.92 MW D+1 MAE)")
    return True


if __name__ == "__main__":
    success = test_hf_space_smoke()
    sys.exit(0 if success else 1)