#!/usr/bin/env python3
"""
Test HF Space with expanded context window (128h -> 2160h).
Validates VRAM usage and forecast variation patterns.
"""
import os
import sys
from pathlib import Path
import polars as pl
import numpy as np
from gradio_client import Client
# Get HF token from environment
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
print("[ERROR] HF_TOKEN environment variable not set")
sys.exit(1)
def test_hf_space_smoke():
"""Run smoke test on HF Space and validate results"""
print("=" * 80)
print("HF SPACE SMOKE TEST: Context Window Expansion (128h -> 2160h)")
print("=" * 80)
# Initialize client
print("\nConnecting to HF Space...")
client = Client("evgueni-p/fbmc-chronos2", hf_token=HF_TOKEN)
print("[OK] Connected to evgueni-p/fbmc-chronos2")
# Test parameters
run_date = "2024-09-30"
test_border = "AT_DE"
forecast_type = "smoke_test" # 7 days, 1 border
print(f"\nTest configuration:")
print(f" Border: {test_border}")
print(f" Run date: {run_date}")
print(f" Forecast type: {forecast_type}")
print(f" Expected context: 2160 hours (90 days)")
print(f" Expected batch_size: 48")
# Run forecast
print(f"\nRunning forecast via API...")
try:
result = client.predict(
run_date_str=run_date,
forecast_type=forecast_type,
api_name="/forecast_api"
)
print(f"[OK] Forecast completed")
print(f" Result file: {result}")
except Exception as e:
print(f"[FAIL] API call failed: {e}")
import traceback
traceback.print_exc()
return False
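
    # Note (library behavior, not project-specific): gradio_client downloads file
    # outputs from the Space to a local path and returns that path, which is why
    # the forecast parquet can be validated directly from disk below.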
    # Download and validate forecast
    print(f"\nValidating forecast results...")
    if not os.path.exists(result):
        print(f"[FAIL] Forecast file not found: {result}")
        return False

    # Load forecast
    df = pl.read_parquet(result)
    print(f"[OK] Loaded forecast file")
    print(f" Shape: {df.shape}")
    print(f" Columns: {df.columns}")

    # Expected: 168 hours (7 days), 4 columns (timestamp + median + q10 + q90)
    expected_hours = 168
    if len(df) != expected_hours:
        print(f"[FAIL] Forecast length mismatch:")
        print(f" Expected: {expected_hours} hours")
        print(f" Got: {len(df)} hours")
        return False
    print(f"[OK] Forecast length: {len(df)} hours (correct)")

    # Extract median forecast for AT_DE
    median_col = f"{test_border}_median"
    if median_col not in df.columns:
        print(f"[FAIL] Column {median_col} not found in forecast")
        return False
    median_forecast = df[median_col].to_numpy()

    # Check variation statistics
    mean_val = np.mean(median_forecast)
    std_val = np.std(median_forecast)
    min_val = np.min(median_forecast)
    max_val = np.max(median_forecast)
    range_val = max_val - min_val

    print(f"\n[CHECK] Forecast statistics:")
    print(f" Mean: {mean_val:.2f} MW")
    print(f" Std Dev: {std_val:.2f} MW")
    print(f" Min: {min_val:.2f} MW")
    print(f" Max: {max_val:.2f} MW")
    print(f" Range: {range_val:.2f} MW")
    # Validation 1: Check for variation
    if std_val < 1.0:
        print(f"\n[WARNING] Low variation detected (std={std_val:.4f} MW)")
        unique_values = len(np.unique(median_forecast))
        print(f" Unique values in forecast: {unique_values}/{len(median_forecast)}")
        if unique_values < 5:
            print(f"\n[FAIL] Forecast appears constant (only {unique_values} unique values)")
            print(f" First 24 values: {median_forecast[:24]}")
            return False
    else:
        print(f"\n[OK] Forecast shows variation (std={std_val:.2f} MW)")

    # Validation 2: Check unique values count
    unique_values = len(np.unique(median_forecast))
    print(f"\n[CHECK] Unique values: {unique_values}/{len(median_forecast)}")
    if unique_values < 50:
        print(f"[WARNING] Low diversity (expected >50 unique values)")
    else:
        print(f"[OK] Good diversity in forecast")

    # Validation 3: Check data type (should be integers now)
    if median_col in df.columns:
        dtype = df.schema[median_col]
        print(f"\n[CHECK] Data type: {dtype}")
        if "Int" in str(dtype):
            print(f"[OK] MW values converted to integers")
        else:
            print(f"[INFO] MW values still float (expected Int32)")

    # Display first 48 hours
    print(f"\n[CHECK] First 48 hours of median forecast:")
    for i in range(min(48, len(median_forecast))):
        if i % 12 == 0:
            print(f" Hour {i:3d}-{i+11:3d}: ", end="")
        print(f"{median_forecast[i]:7.0f} ", end="")
        if (i + 1) % 12 == 0:
            print()
    print()
    # Summary
    print("\n" + "=" * 80)
    print("SMOKE TEST VALIDATION SUMMARY")
    print("=" * 80)

    checks_passed = []
    checks_failed = []

    # Check 1: Length
    if len(df) == expected_hours:
        checks_passed.append("Forecast length (168 hours)")
    else:
        checks_failed.append(f"Forecast length ({len(df)} != {expected_hours})")

    # Check 2: Variation
    if std_val >= 1.0:
        checks_passed.append(f"Variation (std={std_val:.2f} MW)")
    else:
        checks_failed.append(f"Low variation (std={std_val:.4f} MW)")

    # Check 3: Diversity
    if unique_values >= 50:
        checks_passed.append(f"Diversity ({unique_values} unique values)")
    else:
        checks_failed.append(f"Low diversity ({unique_values} unique values)")

    print(f"\n[PASSED] {len(checks_passed)} checks:")
    for check in checks_passed:
        print(f" + {check}")

    if checks_failed:
        print(f"\n[FAILED] {len(checks_failed)} checks:")
        for check in checks_failed:
            print(f" - {check}")

    # Overall result
    if len(checks_failed) == 0:
        print("\n" + "=" * 80)
        print("[SUCCESS] ALL CHECKS PASSED - Ready for full 38-border evaluation")
        print("=" * 80)
        print("\nNext steps:")
        print("1. Check HF Space logs for VRAM usage (should be ~76% = 36.6 GB / 48 GB)")
        print("2. Run full 38-border evaluation")
        print("3. Compare to Session 12 baseline (15.92 MW D+1 MAE)")
        return True
    else:
        print("\n" + "=" * 80)
        print("[PARTIAL SUCCESS] Some checks failed - investigate before full evaluation")
        print("=" * 80)
        return False

if __name__ == "__main__":
    success = test_hf_space_smoke()
    sys.exit(0 if success else 1)