# fbmc-chronos2 / scripts/evaluate_october_2024.py
# Author: Evgueni Poloukarov
# Commit 67808ce — fix: CRITICAL timestamp bug causing 313 vs 336 hour mismatch + validation
#!/usr/bin/env python3
"""
October 2024 Evaluation - Multivariate Chronos-2
Run date: 2024-09-30 (forecast Oct 1-14, 2024)
Compares 38-border × 14-day forecast against actuals
Calculates D+1 through D+14 MAE for each border
"""
import sys
# Reconfigure stdout before anything prints: guarantees UTF-8 output with
# replacement on un-encodable characters (Windows consoles default to cp1252).
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
import os
import time
import numpy as np
import polars as pl
from datetime import datetime, timedelta
from pathlib import Path
from dotenv import load_dotenv
from gradio_client import Client
# Load HF_TOKEN (and any other secrets) from a local .env file into os.environ.
load_dotenv()
def main():
    """Evaluate the multivariate Chronos-2 forecast for October 1-14, 2024.

    Pipeline:
      1. Connect to the HuggingFace Space hosting the model.
      2. Request a full 14-day, 38-border forecast (run date 2024-09-30).
      3. Load the returned forecast parquet.
      4. Load local actuals and slice out Oct 1-14, 2024 (336 hours).
      5. Compute overall and per-day (D+1..D+14) MAE/RMSE per border,
         with validation warnings for missing hours, suspiciously low MAE,
         and constant forecasts.
      6. Print a summary and write CSV results plus a text report.

    Exits with status 1 when required data is missing.
    """
    print("="*70)
    print("OCTOBER 2024 MULTIVARIATE CHRONOS-2 EVALUATION")
    print("="*70)
    total_start = time.time()
    # Step 1: Connect to HF Space
    print("\n[1/6] Connecting to HuggingFace Space...")
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN not found in environment")
    client = Client("evgueni-p/fbmc-chronos2", hf_token=hf_token)
    print("[OK] Connected to HF Space")
    # Step 2: Run full 14-day forecast for Oct 1-14, 2024
    print("\n[2/6] Running full 38-border forecast via HF Space API...")
    print("   Run date: 2024-09-30")
    print("   Forecast period: Oct 1-14, 2024 (336 hours)")
    print("   This may take 5-10 minutes...")
    forecast_start_time = time.time()
    result_file = client.predict(
        "2024-09-30",  # run_date
        "full_14day",  # forecast_type
    )
    forecast_time = time.time() - forecast_start_time
    print(f"[OK] Forecast complete in {forecast_time/60:.2f} minutes")
    print(f"   Result file: {result_file}")
    # Step 3: Load forecast results
    print("\n[3/6] Loading forecast results...")
    forecast_df = pl.read_parquet(result_file)
    print(f"[OK] Loaded forecast with shape: {forecast_df.shape}")
    print(f"   Columns: {len(forecast_df.columns)} (timestamp + {len(forecast_df.columns)-1} forecast columns)")
    # Identify border columns (median forecasts); border name is the column
    # stem once the '_median' suffix is stripped.
    median_cols = [col for col in forecast_df.columns if col.endswith('_median')]
    borders = [col.replace('_median', '') for col in median_cols]
    print(f"[OK] Found {len(borders)} borders")
    # Step 4: Load actuals from local dataset
    print("\n[4/6] Loading actual values from local dataset...")
    local_data_path = Path('data/processed/features_unified_24month.parquet')
    if not local_data_path.exists():
        print(f"[ERROR] Local dataset not found at: {local_data_path}")
        sys.exit(1)
    df = pl.read_parquet(local_data_path)
    print(f"[OK] Loaded dataset: {len(df)} rows")
    print(f"   Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    # Extract October 1-14, 2024 actuals (inclusive hour range: 00:00 Oct 1
    # through 23:00 Oct 14 = 336 hourly rows).
    oct_start = datetime(2024, 10, 1, 0, 0, 0)
    oct_end = datetime(2024, 10, 14, 23, 0, 0)
    actual_df = df.filter(
        (pl.col('timestamp') >= oct_start) &
        (pl.col('timestamp') <= oct_end)
    )
    if len(actual_df) == 0:
        print("[ERROR] No actual data found for October 2024!")
        print("   Dataset may not contain October 2024 data.")
        print("   Available date range in dataset:")
        print(f"   {df['timestamp'].min()} to {df['timestamp'].max()}")
        sys.exit(1)
    print(f"[OK] Extracted {len(actual_df)} hours of actual values")
    # Step 5: Calculate metrics for each border
    print(f"\n[5/6] Calculating MAE metrics for {len(borders)} borders...")
    print("   Progress:")
    results = []
    for i, border in enumerate(borders, 1):
        # Get forecast for this border (median)
        forecast_col = f"{border}_median"
        if forecast_col not in forecast_df.columns:
            print(f"   [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no forecast)")
            continue
        # Get actual values for this border
        target_col = f'target_border_{border}'
        if target_col not in actual_df.columns:
            print(f"   [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no actuals)")
            continue
        # Merge forecast with actuals on timestamp (left join keeps every
        # forecast hour so the missing-hours validation below is meaningful).
        merged = forecast_df.select(['timestamp', forecast_col]).join(
            actual_df.select(['timestamp', target_col]),
            on='timestamp',
            how='left'
        )
        # Calculate overall MAE (all 336 hours)
        valid_data = merged.filter(
            pl.col(forecast_col).is_not_null() &
            pl.col(target_col).is_not_null()
        )
        if len(valid_data) == 0:
            print(f"   [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no valid data)")
            continue
        # Validation: Check for expected 336 hours (14 days x 24 hours)
        if len(valid_data) != 336:
            print(f"   [{i:2d}/{len(borders)}] {border:15s} - WARNING: Only {len(valid_data)}/336 hours (missing {336-len(valid_data)} hours)")
        # Calculate overall metrics
        mae = (valid_data[forecast_col] - valid_data[target_col]).abs().mean()
        rmse = ((valid_data[forecast_col] - valid_data[target_col])**2).mean()**0.5
        # Validation: Flag suspicious MAE values (near-zero error usually
        # means the target leaked into the features).
        if mae < 0.001:
            print(f"   [{i:2d}/{len(borders)}] {border:15s} - WARNING: Suspiciously low MAE = {mae:.2e} MW (possible data leakage)")
        # Calculate per-day MAE (D+1 through D+14); NaN marks days with no
        # overlapping valid hours.
        per_day_mae = []
        for day in range(1, 15):
            day_start = oct_start + timedelta(days=day-1)
            day_end = day_start + timedelta(days=1) - timedelta(hours=1)
            day_data = valid_data.filter(
                (pl.col('timestamp') >= day_start) &
                (pl.col('timestamp') <= day_end)
            )
            if len(day_data) > 0:
                day_mae = (day_data[forecast_col] - day_data[target_col]).abs().mean()
                per_day_mae.append(day_mae)
            else:
                per_day_mae.append(np.nan)
        # Build results dict with all 14 days
        result_dict = {
            'border': border,
            'mae_overall': mae,
            'rmse_overall': rmse,
            'n_hours': len(valid_data),
        }
        # Add MAE for each day (D+1 through D+14)
        for day_idx in range(14):
            day_num = day_idx + 1
            result_dict[f'mae_d{day_num}'] = per_day_mae[day_idx] if len(per_day_mae) > day_idx else np.nan
        # Validation: Check for constant forecasts (same MAE across all days)
        valid_mae_values = [m for m in per_day_mae if not np.isnan(m)]
        if len(valid_mae_values) >= 5:  # Need at least 5 days to check
            mae_std = np.std(valid_mae_values)
            mae_mean = np.mean(valid_mae_values)
            if mae_std < 0.01 and mae_mean > 0:  # Essentially zero variation but non-zero MAE
                print(f"   [{i:2d}/{len(borders)}] {border:15s} - WARNING: Constant forecast detected (MAE std={mae_std:.2e})")
        results.append(result_dict)
        # Status indicator
        d1_mae = per_day_mae[0] if len(per_day_mae) > 0 else np.inf
        status = "[OK]" if d1_mae <= 150 else "[!]"
        print(f"   [{i:2d}/{len(borders)}] {border:15s} - D+1 MAE: {d1_mae:6.1f} MW {status}")
    # Step 6: Summary statistics
    print("\n[6/6] Generating summary report...")
    if not results:
        print("[ERROR] No results to summarize")
        sys.exit(1)
    results_df = pl.DataFrame(results)
    # Calculate summary statistics.
    # FIX: mae_d* columns may contain np.nan (missing days). Polars
    # aggregations propagate NaN (unlike null), so a single NaN would poison
    # every headline number. Convert NaN -> null first so it is excluded.
    d1_series = results_df['mae_d1'].fill_nan(None)
    mean_mae_d1 = d1_series.mean()
    median_mae_d1 = d1_series.median()
    min_mae_d1 = d1_series.min()
    max_mae_d1 = d1_series.max()
    if mean_mae_d1 is None:
        print("[ERROR] No valid D+1 MAE values to summarize")
        sys.exit(1)
    # Save results to CSV
    output_file = Path('results/october_2024_multivariate.csv')
    # FIX: parents=True so a missing intermediate directory does not raise.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    results_df.write_csv(output_file)
    print(f"[OK] Results saved to: {output_file}")
    # Generate summary report
    print("\n" + "="*70)
    print("EVALUATION RESULTS SUMMARY - OCTOBER 2024")
    print("="*70)
    print(f"\nBorders evaluated: {len(results)}/{len(borders)}")
    print(f"Total forecast time: {forecast_time/60:.2f} minutes")
    print(f"Total evaluation time: {(time.time() - total_start)/60:.2f} minutes")
    print(f"\n*** D+1 MAE (PRIMARY METRIC) ***")
    print(f"Mean: {mean_mae_d1:.2f} MW (Target: [<=]134 MW)")
    print(f"Median: {median_mae_d1:.2f} MW")
    print(f"Min: {min_mae_d1:.2f} MW")
    print(f"Max: {max_mae_d1:.2f} MW")
    # Target achievement (NaN <= 150 evaluates false, so missing-D+1 borders
    # never count as passing)
    below_target = (results_df['mae_d1'] <= 150).sum()
    print(f"\n*** TARGET ACHIEVEMENT ***")
    print(f"Borders with D+1 MAE [<=]150 MW: {below_target}/{len(results)} ({below_target/len(results)*100:.1f}%)")
    # Best and worst performers
    print(f"\n*** TOP 5 BEST PERFORMERS (Lowest D+1 MAE) ***")
    best = results_df.sort('mae_d1').head(5)
    for row in best.iter_rows(named=True):
        print(f"   {row['border']:15s}: D+1 MAE={row['mae_d1']:6.1f} MW, Overall MAE={row['mae_overall']:6.1f} MW")
    print(f"\n*** TOP 5 WORST PERFORMERS (Highest D+1 MAE) ***")
    worst = results_df.sort('mae_d1', descending=True).head(5)
    for row in worst.iter_rows(named=True):
        print(f"   {row['border']:15s}: D+1 MAE={row['mae_d1']:6.1f} MW, Overall MAE={row['mae_overall']:6.1f} MW")
    # MAE degradation over forecast horizon
    print(f"\n*** MAE DEGRADATION OVER FORECAST HORIZON (ALL 14 DAYS) ***")
    for day in range(1, 15):
        col_name = f'mae_d{day}'
        # FIX: NaN -> null before mean (same reason as above), and guard the
        # all-missing case — formatting None with :6.2f would raise TypeError.
        mean_mae_day = results_df[col_name].fill_nan(None).mean()
        if mean_mae_day is None:
            print(f"D+{day:2d}: (no data)")
            continue
        delta = mean_mae_day - mean_mae_d1 if day > 1 else 0
        delta_pct = (delta / mean_mae_d1 * 100) if day > 1 and mean_mae_d1 > 0 else 0
        if day == 1:
            print(f"D+{day:2d}: {mean_mae_day:6.2f} MW (baseline)")
        else:
            print(f"D+{day:2d}: {mean_mae_day:6.2f} MW (+{delta:5.2f} MW, +{delta_pct:5.1f}%)")
    # Final verdict
    print("\n" + "="*70)
    if mean_mae_d1 <= 134:
        print("[OK] TARGET ACHIEVED! Mean D+1 MAE [<=]134 MW")
        print("   Zero-shot multivariate forecasting successful!")
    elif mean_mae_d1 <= 150:
        print("[~] CLOSE TO TARGET. Mean D+1 MAE [<=]150 MW")
        print("   Zero-shot baseline established. Fine-tuning recommended.")
    else:
        print(f"[!] TARGET NOT MET. Mean D+1 MAE: {mean_mae_d1:.2f} MW (Target: [<=]134 MW)")
        print("   Fine-tuning strongly recommended for Phase 2")
    print("="*70)
    # Save summary report (same directory as the CSV, created above)
    report_file = Path('results/october_2024_evaluation_report.txt')
    with open(report_file, 'w', encoding='utf-8', errors='replace') as f:
        f.write("="*70 + "\n")
        f.write("OCTOBER 2024 MULTIVARIATE CHRONOS-2 EVALUATION REPORT\n")
        f.write("="*70 + "\n\n")
        f.write(f"Run date: 2024-09-30\n")
        f.write(f"Forecast period: Oct 1-14, 2024 (336 hours)\n")
        f.write(f"Model: amazon/chronos-2 (multivariate, 615 features)\n")
        f.write(f"Borders evaluated: {len(results)}/{len(borders)}\n")
        f.write(f"Forecast time: {forecast_time/60:.2f} minutes\n\n")
        f.write(f"D+1 MAE RESULTS:\n")
        f.write(f"  Mean: {mean_mae_d1:.2f} MW\n")
        f.write(f"  Median: {median_mae_d1:.2f} MW\n")
        f.write(f"  Min: {min_mae_d1:.2f} MW\n")
        f.write(f"  Max: {max_mae_d1:.2f} MW\n\n")
        f.write(f"Target achievement: {below_target}/{len(results)} borders with MAE [<=]150 MW\n\n")
        if mean_mae_d1 <= 134:
            f.write("[OK] TARGET ACHIEVED!\n")
        else:
            f.write(f"[!] Target not met - Fine-tuning recommended\n")
    print(f"\n[OK] Summary report saved to: {report_file}")
    print(f"\nTotal evaluation time: {(time.time() - total_start)/60:.1f} minutes")
# Script entry point: run the full evaluation when executed directly.
if __name__ == '__main__':
    main()