# fbmc-chronos2 / scripts / compare_hourly_mae.py
# Author: Evgueni Poloukarov
# Commit ff9fbcf — revert: remove hour-aware adaptive quantile selection (61% MAE degradation)
#!/usr/bin/env python3
"""
Compare hourly MAE: baseline vs hour-aware adaptive selection.
Loads both forecasts and compares MAE per hour-of-day to measure improvement.
"""
import polars as pl
import numpy as np
from pathlib import Path
from datetime import datetime
# Paths — everything is resolved relative to the repository root (scripts/..).
PROJECT_ROOT = Path(__file__).parent.parent
# Baseline 14-day September forecast (median quantile per border).
BASELINE_FORECAST = PROJECT_ROOT / 'results' / 'september_2025_forecast_full_14day.parquet'
# Hour-aware adaptive-quantile forecast for the same window.
HOUR_AWARE_FORECAST = PROJECT_ROOT / 'results' / 'september_2025_forecast_hour_aware_ACTUAL.parquet'
# NOTE(review): not referenced anywhere below — possibly left over from an earlier version.
BASELINE_SUMMARY = PROJECT_ROOT / 'results' / 'september_2025_hourly_summary.csv'
# Destination for the per-hour comparison table written by main().
OUTPUT_PATH = PROJECT_ROOT / 'results' / 'hourly_mae_comparison.csv'
def load_actuals():
    """Fetch actual flows from the HuggingFace dataset, trimmed to Sep 2-15, 2025.

    Returns:
        polars DataFrame of actuals restricted to the forecast window.
    """
    print('[INFO] Loading actuals from HuggingFace dataset...')
    import os
    from datasets import load_dataset

    # Private dataset: authenticate via the HF_TOKEN environment variable.
    dataset = load_dataset(
        'evgueni-p/fbmc-features-24month',
        split='train',
        token=os.environ.get('HF_TOKEN'),
    )
    df_actuals_full = pl.from_arrow(dataset.data.table)

    # Keep only the forecast window: Sep 2 inclusive through Sep 16 exclusive.
    window_start = datetime(2025, 9, 2)
    window_end = datetime(2025, 9, 16)
    in_window = (pl.col('timestamp') >= window_start) & (pl.col('timestamp') < window_end)
    df_actuals = df_actuals_full.filter(in_window)

    print(f'[INFO] Actuals filtered: {df_actuals.shape[0]} hours')
    return df_actuals
def compute_hourly_mae(df_forecast, df_actuals, label):
    """Compute MAE per hour-of-day for every border present in both frames.

    Args:
        df_forecast: polars DataFrame with a 'timestamp' column plus one
            forecast column per border, suffixed '_adaptive' (hour-aware
            runs) or '_median' (baseline runs).
        df_actuals: polars DataFrame with 'timestamp' plus actual columns
            named 'target_border_<border>'.
        label: version tag stored in the output 'version' column.

    Returns:
        polars DataFrame with columns: border, hour, mae, n_hours, version.
    """
    print(f'[INFO] Computing hourly MAE for {label}...')
    # Detect the forecast flavor from the column suffix.
    # FIX: the original additionally substring-tested df_forecast.columns[0]
    # ("'_adaptive' in ..."), which is normally 'timestamp' — that check was
    # redundant with the any() and fragile, so it is dropped.
    if any(c.endswith('_adaptive') for c in df_forecast.columns):
        col_suffix = '_adaptive'
    else:
        col_suffix = '_median'
    forecast_cols = [c for c in df_forecast.columns if c.endswith(col_suffix)]
    # FIX: use suffix slicing instead of str.replace, which would also strip
    # the marker if it ever appeared inside a border name.
    border_names = [c[:-len(col_suffix)] for c in forecast_cols]
    print(f'[INFO] Using forecast column suffix: {col_suffix}')

    hourly_results = []
    for border in border_names:
        forecast_col = f'{border}{col_suffix}'
        actual_col = f'target_border_{border}'
        if actual_col not in df_actuals.columns:
            # Border was forecasted but has no actuals — skip silently,
            # matching the original best-effort behavior.
            continue
        # Align forecast and actuals on timestamp; inner join drops gaps.
        df_border = df_forecast.select(['timestamp', forecast_col]).join(
            df_actuals.select(['timestamp', actual_col]),
            on='timestamp',
            how='inner',
        ).with_columns(
            pl.col('timestamp').dt.hour().alias('hour')
        )
        # MAE per hour-of-day (0-23); hours absent from the data are skipped.
        for hour in range(24):
            hour_df = df_border.filter(pl.col('hour') == hour)
            if len(hour_df) == 0:
                continue
            mae = (hour_df[forecast_col] - hour_df[actual_col]).abs().mean()
            hourly_results.append({
                'border': border,
                'hour': hour,
                'mae': mae,
                'n_hours': len(hour_df),
                'version': label,
            })
    return pl.DataFrame(hourly_results)
def compare_results(df_baseline_hourly, df_hour_aware_hourly):
    """Compare baseline vs hour-aware hourly MAE.

    Prints a per-hour comparison table, overall and problem-hour summaries,
    best/worst hours, and a success-criteria verdict; returns the joined
    per-hour comparison frame.

    Args:
        df_baseline_hourly: per-border/per-hour MAE rows for the baseline run
            (as produced by compute_hourly_mae).
        df_hour_aware_hourly: same layout for the hour-aware run.

    Returns:
        polars DataFrame keyed by hour with baseline/hour-aware mean and
        median MAE, mae_reduction, and improvement_pct columns.
    """
    print('\n' + '='*80)
    print('HOURLY MAE COMPARISON: Baseline vs Hour-Aware Adaptive Selection')
    print('='*80)
    # Aggregate across borders for each version (unweighted mean/median of
    # the per-border MAE values for each hour-of-day).
    baseline_stats = df_baseline_hourly.group_by('hour').agg([
        pl.col('mae').mean().alias('baseline_mae'),
        pl.col('mae').median().alias('baseline_median_mae'),
        pl.col('border').count().alias('n_borders')
    ]).sort('hour')
    hour_aware_stats = df_hour_aware_hourly.group_by('hour').agg([
        pl.col('mae').mean().alias('hour_aware_mae'),
        pl.col('mae').median().alias('hour_aware_median_mae')
    ]).sort('hour')
    # Join for comparison (inner: only hours present in both runs survive).
    comparison = baseline_stats.join(hour_aware_stats, on='hour', how='inner')
    # Calculate improvement; positive improvement_pct means hour-aware beat
    # the baseline for that hour.
    comparison = comparison.with_columns([
        (pl.col('baseline_mae') - pl.col('hour_aware_mae')).alias('mae_reduction'),
        ((pl.col('baseline_mae') - pl.col('hour_aware_mae')) / pl.col('baseline_mae') * 100).alias('improvement_pct')
    ])
    print('\n[INFO] Hour-by-Hour Comparison:')
    print(comparison)
    # Overall statistics: mean over every border/hour cell, unweighted.
    overall_baseline = df_baseline_hourly['mae'].mean()
    overall_hour_aware = df_hour_aware_hourly['mae'].mean()
    overall_improvement = (overall_baseline - overall_hour_aware) / overall_baseline * 100
    print(f'\n[INFO] Overall MAE:')
    print(f'  Baseline:   {overall_baseline:.2f} MW')
    print(f'  Hour-Aware: {overall_hour_aware:.2f} MW')
    print(f'  Improvement: {overall_improvement:.2f}%')
    # Problem hours analysis (15-21) — presumably the evening hours where the
    # baseline was known to struggle; confirm against the project notes.
    problem_hours = [15, 16, 17, 18, 19, 20, 21]
    problem_baseline = comparison.filter(pl.col('hour').is_in(problem_hours))['baseline_mae'].mean()
    problem_hour_aware = comparison.filter(pl.col('hour').is_in(problem_hours))['hour_aware_mae'].mean()
    problem_improvement = (problem_baseline - problem_hour_aware) / problem_baseline * 100
    print(f'\n[INFO] Problem Hours (15-21) MAE:')
    print(f'  Baseline:   {problem_baseline:.2f} MW')
    print(f'  Hour-Aware: {problem_hour_aware:.2f} MW')
    print(f'  Improvement: {problem_improvement:.2f}%')
    # Best/worst hours by relative improvement.
    print('\n[INFO] Top 5 Most Improved Hours:')
    best_improvements = comparison.sort('improvement_pct', descending=True).head(5)
    print(best_improvements.select(['hour', 'baseline_mae', 'hour_aware_mae', 'improvement_pct']))
    print('\n[INFO] Top 5 Least Improved (or Degraded) Hours:')
    worst_improvements = comparison.sort('improvement_pct').head(5)
    print(worst_improvements.select(['hour', 'baseline_mae', 'hour_aware_mae', 'improvement_pct']))
    # Success criteria check: >=5% overall improvement is the stated target.
    print('\n' + '='*80)
    if overall_improvement >= 5.0:
        print(f'[SUCCESS] Hour-aware selection achieved {overall_improvement:.1f}% improvement (target: 5-10%)')
        print('[RECOMMENDATION] Proceed to Phase 4: AutoGluon fine-tuning with sample weighting')
    elif overall_improvement >= 3.0:
        print(f'[PARTIAL SUCCESS] {overall_improvement:.1f}% improvement - marginal gain')
        print('[RECOMMENDATION] Consider proceeding to fine-tuning, may provide larger gains')
    else:
        print(f'[INSUFFICIENT] Only {overall_improvement:.1f}% improvement (target: 5-10%)')
        print('[RECOMMENDATION] Skip to Phase 4: AutoGluon fine-tuning with sample weighting')
    print('='*80)
    return comparison
def main():
    """Load both forecasts and the actuals, score them, and save the comparison CSV."""
    print('[START] Hourly MAE Comparison Analysis')
    print(f'[INFO] Baseline forecast: {BASELINE_FORECAST}')
    print(f'[INFO] Hour-aware forecast: {HOUR_AWARE_FORECAST}')

    # Actuals are loaded once; both forecast variants are scored against them.
    df_actuals = load_actuals()

    # Read both forecast parquets, echoing shapes as a quick sanity check.
    print(f'\n[INFO] Loading baseline forecast...')
    df_baseline = pl.read_parquet(BASELINE_FORECAST)
    print(f'[INFO] Baseline shape: {df_baseline.shape}')
    print(f'\n[INFO] Loading hour-aware forecast...')
    df_hour_aware = pl.read_parquet(HOUR_AWARE_FORECAST)
    print(f'[INFO] Hour-aware shape: {df_hour_aware.shape}')

    # Score each variant per border/hour, then compare head-to-head.
    hourly_baseline = compute_hourly_mae(df_baseline, df_actuals, 'baseline')
    hourly_hour_aware = compute_hourly_mae(df_hour_aware, df_actuals, 'hour_aware')
    comparison = compare_results(hourly_baseline, hourly_hour_aware)

    # Persist the per-hour table for downstream analysis.
    comparison.write_csv(OUTPUT_PATH)
    print(f'\n[INFO] Detailed comparison saved to: {OUTPUT_PATH}')
    print('\n[SUCCESS] Hourly MAE comparison complete!')


if __name__ == '__main__':
    main()