Spaces:
Sleeping
Sleeping
Evgueni Poloukarov
fix: CRITICAL timestamp bug causing 313 vs 336 hour mismatch + validation
67808ce
#!/usr/bin/env python3
"""
October 2024 Evaluation - Multivariate Chronos-2
Run date: 2024-09-30 (forecast Oct 1-14, 2024)
Compares 38-border × 14-day forecast against actuals
Calculates D+1 through D+14 MAE for each border
"""
import sys

# Reconfigure stdout before anything prints: errors='replace' substitutes any
# character the console encoding cannot represent instead of raising.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
import os
import time
import numpy as np
import polars as pl
from datetime import datetime, timedelta
from pathlib import Path
from dotenv import load_dotenv
from gradio_client import Client

# Populate os.environ from a local .env file (source of HF_TOKEN below).
load_dotenv()
def main() -> None:
    """Run the October 2024 multivariate Chronos-2 evaluation end to end.

    Pipeline (mirrors the [1/6]..[6/6] progress messages):
      1. Connect to the ``evgueni-p/fbmc-chronos2`` HuggingFace Space.
      2. Request a full 14-day, 38-border forecast for run date 2024-09-30.
      3. Load the returned forecast parquet and discover the ``*_median`` columns.
      4. Load local actuals and slice out Oct 1-14, 2024 (336 hourly rows).
      5. Per border: join forecast to actuals on timestamp, compute overall
         MAE/RMSE and per-day (D+1..D+14) MAE, with sanity-check warnings.
      6. Write ``results/october_2024_multivariate.csv`` plus a text report,
         and print a summary against the 134 MW / 150 MW targets.

    Raises:
        ValueError: if HF_TOKEN is not present in the environment.
        SystemExit: (exit code 1) if the local dataset is missing, contains no
            October 2024 rows, or no border produced a result.
    """
    print("="*70)
    print("OCTOBER 2024 MULTIVARIATE CHRONOS-2 EVALUATION")
    print("="*70)
    total_start = time.time()

    # Step 1: Connect to HF Space
    print("\n[1/6] Connecting to HuggingFace Space...")
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN not found in environment")
    client = Client("evgueni-p/fbmc-chronos2", hf_token=hf_token)
    print("[OK] Connected to HF Space")

    # Step 2: Run full 14-day forecast for Oct 1-14, 2024
    print("\n[2/6] Running full 38-border forecast via HF Space API...")
    print(" Run date: 2024-09-30")
    print(" Forecast period: Oct 1-14, 2024 (336 hours)")
    print(" This may take 5-10 minutes...")
    forecast_start_time = time.time()
    # client.predict blocks until the Space finishes and returns a path to a
    # parquet file of forecasts (one timestamp column + per-border columns).
    result_file = client.predict(
        "2024-09-30",  # run_date
        "full_14day",  # forecast_type
    )
    forecast_time = time.time() - forecast_start_time
    print(f"[OK] Forecast complete in {forecast_time/60:.2f} minutes")
    print(f" Result file: {result_file}")

    # Step 3: Load forecast results
    print("\n[3/6] Loading forecast results...")
    forecast_df = pl.read_parquet(result_file)
    print(f"[OK] Loaded forecast with shape: {forecast_df.shape}")
    print(f" Columns: {len(forecast_df.columns)} (timestamp + {len(forecast_df.columns)-1} forecast columns)")
    # Identify border columns (median forecasts); border names are the column
    # names with the '_median' suffix stripped.
    median_cols = [col for col in forecast_df.columns if col.endswith('_median')]
    borders = [col.replace('_median', '') for col in median_cols]
    print(f"[OK] Found {len(borders)} borders")

    # Step 4: Load actuals from local dataset
    print("\n[4/6] Loading actual values from local dataset...")
    local_data_path = Path('data/processed/features_unified_24month.parquet')
    if not local_data_path.exists():
        print(f"[ERROR] Local dataset not found at: {local_data_path}")
        sys.exit(1)
    df = pl.read_parquet(local_data_path)
    print(f"[OK] Loaded dataset: {len(df)} rows")
    print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    # Extract October 1-14, 2024 actuals (inclusive bounds: 00:00 Oct 1 through
    # 23:00 Oct 14 = 336 hourly rows if the dataset is complete).
    # NOTE(review): bounds are naive datetimes — assumes the dataset's
    # 'timestamp' column is naive in the same reference (presumably UTC); verify.
    oct_start = datetime(2024, 10, 1, 0, 0, 0)
    oct_end = datetime(2024, 10, 14, 23, 0, 0)
    actual_df = df.filter(
        (pl.col('timestamp') >= oct_start) &
        (pl.col('timestamp') <= oct_end)
    )
    if len(actual_df) == 0:
        print("[ERROR] No actual data found for October 2024!")
        print(" Dataset may not contain October 2024 data.")
        print(" Available date range in dataset:")
        print(f" {df['timestamp'].min()} to {df['timestamp'].max()}")
        sys.exit(1)
    print(f"[OK] Extracted {len(actual_df)} hours of actual values")

    # Step 5: Calculate metrics for each border
    print(f"\n[5/6] Calculating MAE metrics for {len(borders)} borders...")
    print(" Progress:")
    results = []
    for i, border in enumerate(borders, 1):
        # Get forecast for this border (median)
        forecast_col = f"{border}_median"
        if forecast_col not in forecast_df.columns:
            print(f" [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no forecast)")
            continue
        # Get actual values for this border
        target_col = f'target_border_{border}'
        if target_col not in actual_df.columns:
            print(f" [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no actuals)")
            continue
        # Merge forecast with actuals on timestamp (left join keeps every
        # forecast hour; hours without an actual become null targets).
        merged = forecast_df.select(['timestamp', forecast_col]).join(
            actual_df.select(['timestamp', target_col]),
            on='timestamp',
            how='left'
        )
        # Calculate overall MAE (all 336 hours) — restrict to rows where both
        # forecast and actual are present.
        valid_data = merged.filter(
            pl.col(forecast_col).is_not_null() &
            pl.col(target_col).is_not_null()
        )
        if len(valid_data) == 0:
            print(f" [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no valid data)")
            continue
        # Validation: Check for expected 336 hours (14 days × 24 hours)
        if len(valid_data) != 336:
            print(f" [{i:2d}/{len(borders)}] {border:15s} - WARNING: Only {len(valid_data)}/336 hours (missing {336-len(valid_data)} hours)")
        # Calculate overall metrics
        mae = (valid_data[forecast_col] - valid_data[target_col]).abs().mean()
        rmse = ((valid_data[forecast_col] - valid_data[target_col])**2).mean()**0.5
        # Validation: Flag suspicious MAE values (a near-zero error on real
        # cross-border data suggests the model saw the answers).
        if mae < 0.001:
            print(f" [{i:2d}/{len(borders)}] {border:15s} - WARNING: Suspiciously low MAE = {mae:.2e} MW (possible data leakage)")
        # Calculate per-day MAE (D+1 through D+14); each day is the inclusive
        # window [day_start, day_start + 23h].
        per_day_mae = []
        for day in range(1, 15):
            day_start = oct_start + timedelta(days=day-1)
            day_end = day_start + timedelta(days=1) - timedelta(hours=1)
            day_data = valid_data.filter(
                (pl.col('timestamp') >= day_start) &
                (pl.col('timestamp') <= day_end)
            )
            if len(day_data) > 0:
                day_mae = (day_data[forecast_col] - day_data[target_col]).abs().mean()
                per_day_mae.append(day_mae)
            else:
                # No valid hours for this day: record NaN so downstream stats
                # can distinguish "missing" from a real 0.0 error.
                per_day_mae.append(np.nan)
        # Build results dict with all 14 days
        result_dict = {
            'border': border,
            'mae_overall': mae,
            'rmse_overall': rmse,
            'n_hours': len(valid_data),
        }
        # Add MAE for each day (D+1 through D+14)
        for day_idx in range(14):
            day_num = day_idx + 1
            result_dict[f'mae_d{day_num}'] = per_day_mae[day_idx] if len(per_day_mae) > day_idx else np.nan
        # Validation: Check for constant forecasts (same MAE across all days)
        valid_mae_values = [m for m in per_day_mae if not np.isnan(m)]
        if len(valid_mae_values) >= 5:  # Need at least 5 days to check
            mae_std = np.std(valid_mae_values)
            mae_mean = np.mean(valid_mae_values)
            if mae_std < 0.01 and mae_mean > 0:  # Essentially zero variation but non-zero MAE
                print(f" [{i:2d}/{len(borders)}] {border:15s} - WARNING: Constant forecast detected (MAE std={mae_std:.2e})")
        results.append(result_dict)
        # Status indicator: flag borders whose day-1 MAE misses the 150 MW bar.
        d1_mae = per_day_mae[0] if len(per_day_mae) > 0 else np.inf
        status = "[OK]" if d1_mae <= 150 else "[!]"
        print(f" [{i:2d}/{len(borders)}] {border:15s} - D+1 MAE: {d1_mae:6.1f} MW {status}")

    # Step 6: Summary statistics
    print("\n[6/6] Generating summary report...")
    if not results:
        print("[ERROR] No results to summarize")
        sys.exit(1)
    results_df = pl.DataFrame(results)
    # Calculate summary statistics over the primary metric (D+1 MAE).
    mean_mae_d1 = results_df['mae_d1'].mean()
    median_mae_d1 = results_df['mae_d1'].median()
    min_mae_d1 = results_df['mae_d1'].min()
    max_mae_d1 = results_df['mae_d1'].max()
    # Save results to CSV
    output_file = Path('results/october_2024_multivariate.csv')
    output_file.parent.mkdir(exist_ok=True)
    results_df.write_csv(output_file)
    print(f"[OK] Results saved to: {output_file}")
    # Generate summary report
    print("\n" + "="*70)
    print("EVALUATION RESULTS SUMMARY - OCTOBER 2024")
    print("="*70)
    print(f"\nBorders evaluated: {len(results)}/{len(borders)}")
    print(f"Total forecast time: {forecast_time/60:.2f} minutes")
    print(f"Total evaluation time: {(time.time() - total_start)/60:.2f} minutes")
    print(f"\n*** D+1 MAE (PRIMARY METRIC) ***")
    print(f"Mean: {mean_mae_d1:.2f} MW (Target: [<=]134 MW)")
    print(f"Median: {median_mae_d1:.2f} MW")
    print(f"Min: {min_mae_d1:.2f} MW")
    print(f"Max: {max_mae_d1:.2f} MW")
    # Target achievement
    below_target = (results_df['mae_d1'] <= 150).sum()
    print(f"\n*** TARGET ACHIEVEMENT ***")
    print(f"Borders with D+1 MAE [<=]150 MW: {below_target}/{len(results)} ({below_target/len(results)*100:.1f}%)")
    # Best and worst performers
    print(f"\n*** TOP 5 BEST PERFORMERS (Lowest D+1 MAE) ***")
    best = results_df.sort('mae_d1').head(5)
    for row in best.iter_rows(named=True):
        print(f" {row['border']:15s}: D+1 MAE={row['mae_d1']:6.1f} MW, Overall MAE={row['mae_overall']:6.1f} MW")
    print(f"\n*** TOP 5 WORST PERFORMERS (Highest D+1 MAE) ***")
    worst = results_df.sort('mae_d1', descending=True).head(5)
    for row in worst.iter_rows(named=True):
        print(f" {row['border']:15s}: D+1 MAE={row['mae_d1']:6.1f} MW, Overall MAE={row['mae_overall']:6.1f} MW")
    # MAE degradation over forecast horizon: how much worse each day is
    # relative to the D+1 baseline, absolute and in percent.
    print(f"\n*** MAE DEGRADATION OVER FORECAST HORIZON (ALL 14 DAYS) ***")
    for day in range(1, 15):
        col_name = f'mae_d{day}'
        mean_mae_day = results_df[col_name].mean()
        delta = mean_mae_day - mean_mae_d1 if day > 1 else 0
        delta_pct = (delta / mean_mae_d1 * 100) if day > 1 and mean_mae_d1 > 0 else 0
        if day == 1:
            print(f"D+{day:2d}: {mean_mae_day:6.2f} MW (baseline)")
        else:
            print(f"D+{day:2d}: {mean_mae_day:6.2f} MW (+{delta:5.2f} MW, +{delta_pct:5.1f}%)")
    # Final verdict against the two thresholds (134 MW hard target, 150 MW
    # "close enough" band).
    print("\n" + "="*70)
    if mean_mae_d1 <= 134:
        print("[OK] TARGET ACHIEVED! Mean D+1 MAE [<=]134 MW")
        print(" Zero-shot multivariate forecasting successful!")
    elif mean_mae_d1 <= 150:
        print("[~] CLOSE TO TARGET. Mean D+1 MAE [<=]150 MW")
        print(" Zero-shot baseline established. Fine-tuning recommended.")
    else:
        print(f"[!] TARGET NOT MET. Mean D+1 MAE: {mean_mae_d1:.2f} MW (Target: [<=]134 MW)")
        print(" Fine-tuning strongly recommended for Phase 2")
    print("="*70)
    # Save summary report (plain text mirror of the console summary).
    report_file = Path('results/october_2024_evaluation_report.txt')
    with open(report_file, 'w', encoding='utf-8', errors='replace') as f:
        f.write("="*70 + "\n")
        f.write("OCTOBER 2024 MULTIVARIATE CHRONOS-2 EVALUATION REPORT\n")
        f.write("="*70 + "\n\n")
        f.write(f"Run date: 2024-09-30\n")
        f.write(f"Forecast period: Oct 1-14, 2024 (336 hours)\n")
        f.write(f"Model: amazon/chronos-2 (multivariate, 615 features)\n")
        f.write(f"Borders evaluated: {len(results)}/{len(borders)}\n")
        f.write(f"Forecast time: {forecast_time/60:.2f} minutes\n\n")
        f.write(f"D+1 MAE RESULTS:\n")
        f.write(f" Mean: {mean_mae_d1:.2f} MW\n")
        f.write(f" Median: {median_mae_d1:.2f} MW\n")
        f.write(f" Min: {min_mae_d1:.2f} MW\n")
        f.write(f" Max: {max_mae_d1:.2f} MW\n\n")
        f.write(f"Target achievement: {below_target}/{len(results)} borders with MAE [<=]150 MW\n\n")
        if mean_mae_d1 <= 134:
            f.write("[OK] TARGET ACHIEVED!\n")
        else:
            f.write(f"[!] Target not met - Fine-tuning recommended\n")
    print(f"\n[OK] Summary report saved to: {report_file}")
    print(f"\nTotal evaluation time: {(time.time() - total_start)/60:.1f} minutes")
# Only run the evaluation when executed as a script, not when imported.
if __name__ == '__main__':
    main()