#!/usr/bin/env python3
"""
Compare hourly MAE: baseline vs hour-aware adaptive selection.

Loads both forecasts and compares MAE per hour-of-day to measure improvement.
"""
import polars as pl
import numpy as np
from pathlib import Path
from datetime import datetime

# Paths
PROJECT_ROOT = Path(__file__).parent.parent
BASELINE_FORECAST = PROJECT_ROOT / 'results' / 'september_2025_forecast_full_14day.parquet'
HOUR_AWARE_FORECAST = PROJECT_ROOT / 'results' / 'september_2025_forecast_hour_aware_ACTUAL.parquet'
BASELINE_SUMMARY = PROJECT_ROOT / 'results' / 'september_2025_hourly_summary.csv'
OUTPUT_PATH = PROJECT_ROOT / 'results' / 'hourly_mae_comparison.csv'


def _detect_forecast_columns(columns):
    """Infer forecast columns and their suffix from a list of column names.

    Hour-aware forecasts store predictions in ``*_adaptive`` columns;
    baseline forecasts use ``*_median``. The ``_adaptive`` suffix takes
    precedence when both are present (matches the original detection order).

    Args:
        columns: iterable of column-name strings.

    Returns:
        (border_names, suffix): list of border names (column name with the
        suffix stripped) and the suffix that was matched. Falls back to
        ``([], '_median')`` when neither suffix is found.
    """
    for suffix in ('_adaptive', '_median'):
        matched = [col for col in columns if col.endswith(suffix)]
        if matched:
            return [col[: -len(suffix)] for col in matched], suffix
    return [], '_median'


def load_actuals():
    """Load actuals from HuggingFace dataset, filtered to Sep 2-15, 2025.

    Returns:
        polars DataFrame of actuals restricted to the forecast window
        [2025-09-02, 2025-09-16).
    """
    print('[INFO] Loading actuals from HuggingFace dataset...')
    # Imported lazily: `datasets` is heavy and only needed here.
    from datasets import load_dataset
    import os

    dataset = load_dataset(
        'evgueni-p/fbmc-features-24month',
        split='train',
        token=os.environ.get('HF_TOKEN'),
    )
    df_actuals_full = pl.from_arrow(dataset.data.table)

    # Filter to September 2-15, 2025 (end bound exclusive).
    forecast_start = datetime(2025, 9, 2)
    forecast_end = datetime(2025, 9, 16)
    df_actuals = df_actuals_full.filter(
        (pl.col('timestamp') >= forecast_start)
        & (pl.col('timestamp') < forecast_end)
    )
    print(f'[INFO] Actuals filtered: {df_actuals.shape[0]} hours')
    return df_actuals


def compute_hourly_mae(df_forecast, df_actuals, label):
    """Compute MAE per hour-of-day for all borders.

    Args:
        df_forecast: forecast DataFrame with a 'timestamp' column plus
            per-border prediction columns (``*_adaptive`` or ``*_median``).
        df_actuals: actuals DataFrame with 'timestamp' and
            ``target_border_<name>`` columns.
        label: version tag ('baseline' or 'hour_aware') written to output.

    Returns:
        polars DataFrame with columns border / hour / mae / n_hours / version.
    """
    print(f'[INFO] Computing hourly MAE for {label}...')

    # For hour-aware forecasts use the '_adaptive' columns; baseline uses
    # '_median'. (Fixes the fragile substring check on columns[0].)
    border_names, col_suffix = _detect_forecast_columns(df_forecast.columns)
    print(f'[INFO] Using forecast column suffix: {col_suffix}')

    hourly_results = []
    for border in border_names:
        forecast_col = f'{border}{col_suffix}'
        actual_col = f'target_border_{border}'
        if actual_col not in df_actuals.columns:
            # No ground truth for this border; skip it.
            continue

        # Align forecast and actuals on timestamp.
        df_border = df_forecast.select(['timestamp', forecast_col]).join(
            df_actuals.select(['timestamp', actual_col]),
            on='timestamp',
            how='inner',
        )

        # One grouped pass over the data instead of 24 sequential filters.
        hourly = (
            df_border
            .with_columns(pl.col('timestamp').dt.hour().alias('hour'))
            .group_by('hour')
            .agg([
                (pl.col(forecast_col) - pl.col(actual_col)).abs().mean().alias('mae'),
                pl.len().alias('n_hours'),
            ])
            .sort('hour')
        )
        for row in hourly.iter_rows(named=True):
            hourly_results.append({
                'border': border,
                'hour': row['hour'],
                'mae': row['mae'],
                'n_hours': row['n_hours'],
                'version': label,
            })

    # Explicit schema so an empty result still has the expected columns
    # (a bare pl.DataFrame([]) would have no schema and break downstream).
    return pl.DataFrame(
        hourly_results,
        schema={
            'border': pl.Utf8,
            'hour': pl.Int64,
            'mae': pl.Float64,
            'n_hours': pl.Int64,
            'version': pl.Utf8,
        },
    )


def compare_results(df_baseline_hourly, df_hour_aware_hourly):
    """Compare baseline vs hour-aware hourly MAE and print a report.

    Args:
        df_baseline_hourly: output of compute_hourly_mae(..., 'baseline').
        df_hour_aware_hourly: output of compute_hourly_mae(..., 'hour_aware').

    Returns:
        polars DataFrame joined per hour with mae_reduction / improvement_pct.
    """
    print('\n' + '=' * 80)
    print('HOURLY MAE COMPARISON: Baseline vs Hour-Aware Adaptive Selection')
    print('=' * 80)

    # Aggregate across borders for each version.
    baseline_stats = df_baseline_hourly.group_by('hour').agg([
        pl.col('mae').mean().alias('baseline_mae'),
        pl.col('mae').median().alias('baseline_median_mae'),
        pl.col('border').count().alias('n_borders'),
    ]).sort('hour')

    hour_aware_stats = df_hour_aware_hourly.group_by('hour').agg([
        pl.col('mae').mean().alias('hour_aware_mae'),
        pl.col('mae').median().alias('hour_aware_median_mae'),
    ]).sort('hour')

    # Join for comparison.
    comparison = baseline_stats.join(hour_aware_stats, on='hour', how='inner')

    # Calculate absolute and relative improvement per hour.
    comparison = comparison.with_columns([
        (pl.col('baseline_mae') - pl.col('hour_aware_mae')).alias('mae_reduction'),
        (
            (pl.col('baseline_mae') - pl.col('hour_aware_mae'))
            / pl.col('baseline_mae') * 100
        ).alias('improvement_pct'),
    ])

    print('\n[INFO] Hour-by-Hour Comparison:')
    print(comparison)

    # Overall statistics (mean across all border/hour cells).
    overall_baseline = df_baseline_hourly['mae'].mean()
    overall_hour_aware = df_hour_aware_hourly['mae'].mean()
    overall_improvement = (overall_baseline - overall_hour_aware) / overall_baseline * 100

    print('\n[INFO] Overall MAE:')
    print(f'  Baseline: {overall_baseline:.2f} MW')
    print(f'  Hour-Aware: {overall_hour_aware:.2f} MW')
    print(f'  Improvement: {overall_improvement:.2f}%')

    # Problem hours analysis (15-21): the evening ramp the hour-aware
    # selection was designed to fix.
    problem_hours = [15, 16, 17, 18, 19, 20, 21]
    problem_baseline = comparison.filter(
        pl.col('hour').is_in(problem_hours)
    )['baseline_mae'].mean()
    problem_hour_aware = comparison.filter(
        pl.col('hour').is_in(problem_hours)
    )['hour_aware_mae'].mean()
    problem_improvement = (problem_baseline - problem_hour_aware) / problem_baseline * 100

    print('\n[INFO] Problem Hours (15-21) MAE:')
    print(f'  Baseline: {problem_baseline:.2f} MW')
    print(f'  Hour-Aware: {problem_hour_aware:.2f} MW')
    print(f'  Improvement: {problem_improvement:.2f}%')

    # Best/worst hours.
    print('\n[INFO] Top 5 Most Improved Hours:')
    best_improvements = comparison.sort('improvement_pct', descending=True).head(5)
    print(best_improvements.select(
        ['hour', 'baseline_mae', 'hour_aware_mae', 'improvement_pct']
    ))

    print('\n[INFO] Top 5 Least Improved (or Degraded) Hours:')
    worst_improvements = comparison.sort('improvement_pct').head(5)
    print(worst_improvements.select(
        ['hour', 'baseline_mae', 'hour_aware_mae', 'improvement_pct']
    ))

    # Success criteria check against the 5-10% improvement target.
    print('\n' + '=' * 80)
    if overall_improvement >= 5.0:
        print(f'[SUCCESS] Hour-aware selection achieved {overall_improvement:.1f}% improvement (target: 5-10%)')
        print('[RECOMMENDATION] Proceed to Phase 4: AutoGluon fine-tuning with sample weighting')
    elif overall_improvement >= 3.0:
        print(f'[PARTIAL SUCCESS] {overall_improvement:.1f}% improvement - marginal gain')
        print('[RECOMMENDATION] Consider proceeding to fine-tuning, may provide larger gains')
    else:
        print(f'[INSUFFICIENT] Only {overall_improvement:.1f}% improvement (target: 5-10%)')
        print('[RECOMMENDATION] Skip to Phase 4: AutoGluon fine-tuning with sample weighting')
    print('=' * 80)

    return comparison


def main():
    """Main comparison workflow: load, score, compare, save CSV."""
    print('[START] Hourly MAE Comparison Analysis')
    print(f'[INFO] Baseline forecast: {BASELINE_FORECAST}')
    print(f'[INFO] Hour-aware forecast: {HOUR_AWARE_FORECAST}')

    # Load data.
    df_actuals = load_actuals()

    print('\n[INFO] Loading baseline forecast...')
    df_baseline = pl.read_parquet(BASELINE_FORECAST)
    print(f'[INFO] Baseline shape: {df_baseline.shape}')

    print('\n[INFO] Loading hour-aware forecast...')
    df_hour_aware = pl.read_parquet(HOUR_AWARE_FORECAST)
    print(f'[INFO] Hour-aware shape: {df_hour_aware.shape}')

    # Compute hourly MAE for both versions.
    df_baseline_hourly = compute_hourly_mae(df_baseline, df_actuals, 'baseline')
    df_hour_aware_hourly = compute_hourly_mae(df_hour_aware, df_actuals, 'hour_aware')

    # Compare results.
    comparison = compare_results(df_baseline_hourly, df_hour_aware_hourly)

    # Save detailed comparison.
    comparison.write_csv(OUTPUT_PATH)
    print(f'\n[INFO] Detailed comparison saved to: {OUTPUT_PATH}')
    print('\n[SUCCESS] Hourly MAE comparison complete!')


if __name__ == '__main__':
    main()