#!/usr/bin/env python3
"""
Analyze hourly MAE patterns to establish a baseline before optimization.

This script loads September 2025 forecast results and computes MAE per
hour-of-day to identify which hours have the highest errors (likely the
ramping hours: 5-9 and 17-21).
"""

import os
from datetime import datetime
from pathlib import Path

import polars as pl

# Paths
PROJECT_ROOT = Path(__file__).parent.parent
FORECAST_PATH = PROJECT_ROOT / 'results' / 'september_2025_forecast_full_14day.parquet'
OUTPUT_PATH = PROJECT_ROOT / 'results' / 'september_2025_hourly_mae_baseline.csv'


def load_data():
    """Load forecast and actual data."""
    print('[INFO] Loading forecast results...')
    df_forecast = pl.read_parquet(FORECAST_PATH)
    print(f'[INFO] Forecast shape: {df_forecast.shape}')
    print(f'[INFO] Forecast period: {df_forecast["timestamp"].min()} to {df_forecast["timestamp"].max()}')

    # Load actuals from the HuggingFace dataset (lazy import keeps startup fast
    # when only the forecast file is being inspected)
    print('[INFO] Loading actuals from HuggingFace dataset...')
    from datasets import load_dataset

    dataset = load_dataset(
        'evgueni-p/fbmc-features-24month',
        split='train',
        token=os.environ.get('HF_TOKEN'),
    )
    df_actuals_full = pl.from_arrow(dataset.data.table)

    # Filter actuals to the forecast period (Sept 2-15, 2025; end bound exclusive)
    forecast_start = datetime(2025, 9, 2)
    forecast_end = datetime(2025, 9, 16)
    df_actuals = df_actuals_full.filter(
        (pl.col('timestamp') >= forecast_start) & (pl.col('timestamp') < forecast_end)
    )
    print(f'[INFO] Actuals filtered: {df_actuals.shape[0]} hours')

    return df_forecast, df_actuals


def compute_hourly_mae(df_forecast, df_actuals):
    """Compute MAE per hour-of-day for all borders."""
    print('[INFO] Computing hourly MAE...')

    # Extract border names from the forecast's '{border}_median' columns
    forecast_cols = [col for col in df_forecast.columns if col.endswith('_median')]
    border_names = [col.replace('_median', '') for col in forecast_cols]
    print(f'[INFO] Processing {len(border_names)} borders...')

    hourly_results = []
    for border in border_names:
        forecast_col = f'{border}_median'
        actual_col = f'target_border_{border}'

        # Skip if the actual column is missing
        if actual_col not in df_actuals.columns:
            print(f'[WARNING] Skipping {border} - no actual data')
            continue

        # Align forecast and actuals on timestamp
        df_border = df_forecast.select(['timestamp', forecast_col]).join(
            df_actuals.select(['timestamp', actual_col]),
            on='timestamp',
            how='inner'
        )

        # Add hour-of-day
        df_border = df_border.with_columns([
            pl.col('timestamp').dt.hour().alias('hour')
        ])

        # Compute MAE per hour
        for hour in range(24):
            hour_df = df_border.filter(pl.col('hour') == hour)
            if len(hour_df) == 0:
                continue

            mae = (hour_df[forecast_col] - hour_df[actual_col]).abs().mean()
            hourly_results.append({
                'border': border,
                'hour': hour,
                'mae': mae,
                'n_hours': len(hour_df)
            })

    return pl.DataFrame(hourly_results)
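
# Note: the explicit 24-iteration loop above is clear but runs one filter per
# hour per border. A vectorized sketch of the same per-hour MAE in polars,
# assuming the same df_border layout (empty hours drop out of group_by
# naturally; pl.len() needs a recent polars, older versions use pl.count()):
#
#   df_border.with_columns(
#       (pl.col(forecast_col) - pl.col(actual_col)).abs().alias('abs_err'),
#   ).group_by('hour').agg(
#       pl.col('abs_err').mean().alias('mae'),
#       pl.len().alias('n_hours'),
#   )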


def analyze_patterns(df_hourly):
    """Analyze hourly MAE patterns."""
    print('\n' + '=' * 60)
    print('HOURLY MAE ANALYSIS')
    print('=' * 60)

    # Overall statistics per hour (aggregated across all borders)
    hourly_stats = df_hourly.group_by('hour').agg([
        pl.col('mae').mean().alias('mean_mae'),
        pl.col('mae').median().alias('median_mae'),
        pl.col('mae').std().alias('std_mae'),
        pl.col('mae').min().alias('min_mae'),
        pl.col('mae').max().alias('max_mae'),
        pl.col('border').count().alias('n_borders')
    ]).sort('hour')

    print('\n[INFO] MAE by Hour-of-Day (Averaged Across All Borders):')
    print(hourly_stats)

    # Identify problem hours (highest MAE)
    print('\n[INFO] Top 5 Worst Hours (Highest MAE):')
    worst_hours = hourly_stats.sort('mean_mae', descending=True).head(5)
    print(worst_hours)

    # Identify best hours (lowest MAE)
    print('\n[INFO] Top 5 Best Hours (Lowest MAE):')
    best_hours = hourly_stats.sort('mean_mae').head(5)
    print(best_hours)

    # Ramping hour analysis
    ramping_hours = [5, 6, 7, 8, 9, 17, 18, 19, 20, 21]
    non_ramping_hours = [h for h in range(24) if h not in ramping_hours]

    ramping_mae = hourly_stats.filter(pl.col('hour').is_in(ramping_hours))['mean_mae'].mean()
    non_ramping_mae = hourly_stats.filter(pl.col('hour').is_in(non_ramping_hours))['mean_mae'].mean()

    print(f'\n[INFO] Ramping hours (5-9, 17-21) MAE: {ramping_mae:.2f} MW')
    print(f'[INFO] Non-ramping hours MAE: {non_ramping_mae:.2f} MW')
    print(f'[INFO] Ramping penalty: {(ramping_mae - non_ramping_mae) / non_ramping_mae * 100:.1f}% higher')

    # Peak hour analysis
    peak_hours = [7, 8, 9, 17, 18, 19, 20]
    peak_mae = hourly_stats.filter(pl.col('hour').is_in(peak_hours))['mean_mae'].mean()
    print(f'\n[INFO] Peak hours (7-9, 17-20) MAE: {peak_mae:.2f} MW')

    # Night hour analysis
    night_hours = [22, 23, 0, 1, 2, 3, 4]
    night_mae = hourly_stats.filter(pl.col('hour').is_in(night_hours))['mean_mae'].mean()
    print(f'[INFO] Night hours (22-4) MAE: {night_mae:.2f} MW')

    return hourly_stats


def identify_problematic_borders(df_hourly):
    """Identify borders with the largest hour-to-hour MAE variation."""
    print('\n[INFO] Borders with Highest Hourly MAE Variation:')

    border_variation = df_hourly.group_by('border').agg([
        pl.col('mae').mean().alias('mean_mae'),
        pl.col('mae').std().alias('std_mae'),
        pl.col('mae').max().alias('max_mae'),
        (pl.col('mae').max() - pl.col('mae').min()).alias('range_mae')
    ]).sort('std_mae', descending=True)

    print(border_variation.head(10))

    return border_variation


def main():
    """Main analysis workflow."""
    print('[START] Hourly MAE Baseline Analysis')
    print(f'[INFO] Forecast file: {FORECAST_PATH}')

    # Load data
    df_forecast, df_actuals = load_data()

    # Compute hourly MAE
    df_hourly = compute_hourly_mae(df_forecast, df_actuals)
    print(f'\n[INFO] Computed hourly MAE for {df_hourly["border"].n_unique()} borders')

    # Analyze patterns
    hourly_stats = analyze_patterns(df_hourly)

    # Identify problematic borders
    border_variation = identify_problematic_borders(df_hourly)

    # Save detailed results
    df_hourly.write_csv(OUTPUT_PATH)
    print(f'\n[INFO] Detailed hourly MAE saved to: {OUTPUT_PATH}')

    # Save summary stats
    summary_path = PROJECT_ROOT / 'results' / 'september_2025_hourly_summary.csv'
    hourly_stats.write_csv(summary_path)
    print(f'[INFO] Hourly summary saved to: {summary_path}')

    print('\n[SUCCESS] Hourly MAE baseline analysis complete!')


if __name__ == '__main__':
    main()
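
# Usage sketch (assumptions: this file lives one directory below the project
# root, e.g. scripts/, matching PROJECT_ROOT above; the filename here is
# hypothetical; HF_TOKEN is only needed if the dataset requires authentication):
#
#   export HF_TOKEN=...                        # optional, for gated dataset access
#   python scripts/analyze_hourly_mae.py       # hypothetical filename
#
# Outputs:
#   results/september_2025_hourly_mae_baseline.csv  - per-border, per-hour MAE
#   results/september_2025_hourly_summary.csv       - per-hour stats across borders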