File size: 11,686 Bytes
a321b61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67808ce
 
 
 
a321b61
 
 
 
67808ce
 
 
 
a321b61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2daca7
 
a321b61
 
 
 
b2daca7
 
 
 
 
 
 
67808ce
 
 
 
 
 
 
 
b2daca7
a321b61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2daca7
 
 
 
 
 
 
 
 
 
 
 
a321b61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
#!/usr/bin/env python3
"""
October 2024 Evaluation - Multivariate Chronos-2
Run date: 2024-09-30 (forecast Oct 1-14, 2024)
Compares 38-border × 14-day forecast against actuals
Calculates D+1 through D+14 MAE for each border
"""
import sys
sys.stdout.reconfigure(encoding='utf-8', errors='replace')

import os
import time
import numpy as np
import polars as pl
from datetime import datetime, timedelta
from pathlib import Path
from dotenv import load_dotenv
from gradio_client import Client

load_dotenv()

def main():
    print("="*70)
    print("OCTOBER 2024 MULTIVARIATE CHRONOS-2 EVALUATION")
    print("="*70)

    total_start = time.time()

    # Step 1: Connect to HF Space
    print("\n[1/6] Connecting to HuggingFace Space...")
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN not found in environment")

    client = Client("evgueni-p/fbmc-chronos2", hf_token=hf_token)
    print("[OK] Connected to HF Space")

    # Step 2: Run full 14-day forecast for Oct 1-14, 2024
    print("\n[2/6] Running full 38-border forecast via HF Space API...")
    print("      Run date: 2024-09-30")
    print("      Forecast period: Oct 1-14, 2024 (336 hours)")
    print("      This may take 5-10 minutes...")

    forecast_start_time = time.time()
    result_file = client.predict(
        "2024-09-30",  # run_date
        "full_14day",  # forecast_type
    )
    forecast_time = time.time() - forecast_start_time

    print(f"[OK] Forecast complete in {forecast_time/60:.2f} minutes")
    print(f"     Result file: {result_file}")

    # Step 3: Load forecast results
    print("\n[3/6] Loading forecast results...")
    forecast_df = pl.read_parquet(result_file)
    print(f"[OK] Loaded forecast with shape: {forecast_df.shape}")
    print(f"     Columns: {len(forecast_df.columns)} (timestamp + {len(forecast_df.columns)-1} forecast columns)")

    # Identify border columns (median forecasts)
    median_cols = [col for col in forecast_df.columns if col.endswith('_median')]
    borders = [col.replace('_median', '') for col in median_cols]
    print(f"[OK] Found {len(borders)} borders")

    # Step 4: Load actuals from local dataset
    print("\n[4/6] Loading actual values from local dataset...")
    local_data_path = Path('data/processed/features_unified_24month.parquet')

    if not local_data_path.exists():
        print(f"[ERROR] Local dataset not found at: {local_data_path}")
        sys.exit(1)

    df = pl.read_parquet(local_data_path)

    print(f"[OK] Loaded dataset: {len(df)} rows")
    print(f"     Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

    # Extract October 1-14, 2024 actuals
    oct_start = datetime(2024, 10, 1, 0, 0, 0)
    oct_end = datetime(2024, 10, 14, 23, 0, 0)

    actual_df = df.filter(
        (pl.col('timestamp') >= oct_start) &
        (pl.col('timestamp') <= oct_end)
    )

    if len(actual_df) == 0:
        print("[ERROR] No actual data found for October 2024!")
        print("        Dataset may not contain October 2024 data.")
        print("        Available date range in dataset:")
        print(f"        {df['timestamp'].min()} to {df['timestamp'].max()}")
        sys.exit(1)

    print(f"[OK] Extracted {len(actual_df)} hours of actual values")

    # Step 5: Calculate metrics for each border
    print(f"\n[5/6] Calculating MAE metrics for {len(borders)} borders...")
    print("      Progress:")

    results = []

    for i, border in enumerate(borders, 1):
        # Get forecast for this border (median)
        forecast_col = f"{border}_median"

        if forecast_col not in forecast_df.columns:
            print(f"      [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no forecast)")
            continue

        # Get actual values for this border
        target_col = f'target_border_{border}'

        if target_col not in actual_df.columns:
            print(f"      [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no actuals)")
            continue

        # Merge forecast with actuals on timestamp
        merged = forecast_df.select(['timestamp', forecast_col]).join(
            actual_df.select(['timestamp', target_col]),
            on='timestamp',
            how='left'
        )

        # Calculate overall MAE (all 336 hours)
        valid_data = merged.filter(
            pl.col(forecast_col).is_not_null() &
            pl.col(target_col).is_not_null()
        )

        if len(valid_data) == 0:
            print(f"      [{i:2d}/{len(borders)}] {border:15s} - SKIPPED (no valid data)")
            continue

        # Validation: Check for expected 336 hours (14 days × 24 hours)
        if len(valid_data) != 336:
            print(f"      [{i:2d}/{len(borders)}] {border:15s} - WARNING: Only {len(valid_data)}/336 hours (missing {336-len(valid_data)} hours)")

        # Calculate overall metrics
        mae = (valid_data[forecast_col] - valid_data[target_col]).abs().mean()
        rmse = ((valid_data[forecast_col] - valid_data[target_col])**2).mean()**0.5

        # Validation: Flag suspicious MAE values
        if mae < 0.001:
            print(f"      [{i:2d}/{len(borders)}] {border:15s} - WARNING: Suspiciously low MAE = {mae:.2e} MW (possible data leakage)")

        # Calculate per-day MAE (D+1 through D+14)
        per_day_mae = []
        for day in range(1, 15):
            day_start = oct_start + timedelta(days=day-1)
            day_end = day_start + timedelta(days=1) - timedelta(hours=1)

            day_data = valid_data.filter(
                (pl.col('timestamp') >= day_start) &
                (pl.col('timestamp') <= day_end)
            )

            if len(day_data) > 0:
                day_mae = (day_data[forecast_col] - day_data[target_col]).abs().mean()
                per_day_mae.append(day_mae)
            else:
                per_day_mae.append(np.nan)

        # Build results dict with all 14 days
        result_dict = {
            'border': border,
            'mae_overall': mae,
            'rmse_overall': rmse,
            'n_hours': len(valid_data),
        }

        # Add MAE for each day (D+1 through D+14)
        for day_idx in range(14):
            day_num = day_idx + 1
            result_dict[f'mae_d{day_num}'] = per_day_mae[day_idx] if len(per_day_mae) > day_idx else np.nan

        # Validation: Check for constant forecasts (same MAE across all days)
        valid_mae_values = [m for m in per_day_mae if not np.isnan(m)]
        if len(valid_mae_values) >= 5:  # Need at least 5 days to check
            mae_std = np.std(valid_mae_values)
            mae_mean = np.mean(valid_mae_values)
            if mae_std < 0.01 and mae_mean > 0:  # Essentially zero variation but non-zero MAE
                print(f"      [{i:2d}/{len(borders)}] {border:15s} - WARNING: Constant forecast detected (MAE std={mae_std:.2e})")

        results.append(result_dict)

        # Status indicator
        d1_mae = per_day_mae[0] if len(per_day_mae) > 0 else np.inf
        status = "[OK]" if d1_mae <= 150 else "[!]"

        print(f"      [{i:2d}/{len(borders)}] {border:15s} - D+1 MAE: {d1_mae:6.1f} MW  {status}")

    # Step 6: Summary statistics
    print("\n[6/6] Generating summary report...")

    if not results:
        print("[ERROR] No results to summarize")
        sys.exit(1)

    results_df = pl.DataFrame(results)

    # Calculate summary statistics
    mean_mae_d1 = results_df['mae_d1'].mean()
    median_mae_d1 = results_df['mae_d1'].median()
    min_mae_d1 = results_df['mae_d1'].min()
    max_mae_d1 = results_df['mae_d1'].max()

    # Save results to CSV
    output_file = Path('results/october_2024_multivariate.csv')
    output_file.parent.mkdir(exist_ok=True)
    results_df.write_csv(output_file)
    print(f"[OK] Results saved to: {output_file}")

    # Generate summary report
    print("\n" + "="*70)
    print("EVALUATION RESULTS SUMMARY - OCTOBER 2024")
    print("="*70)

    print(f"\nBorders evaluated: {len(results)}/{len(borders)}")
    print(f"Total forecast time: {forecast_time/60:.2f} minutes")
    print(f"Total evaluation time: {(time.time() - total_start)/60:.2f} minutes")

    print(f"\n*** D+1 MAE (PRIMARY METRIC) ***")
    print(f"Mean:   {mean_mae_d1:.2f} MW  (Target: [<=]134 MW)")
    print(f"Median: {median_mae_d1:.2f} MW")
    print(f"Min:    {min_mae_d1:.2f} MW")
    print(f"Max:    {max_mae_d1:.2f} MW")

    # Target achievement
    below_target = (results_df['mae_d1'] <= 150).sum()
    print(f"\n*** TARGET ACHIEVEMENT ***")
    print(f"Borders with D+1 MAE [<=]150 MW: {below_target}/{len(results)} ({below_target/len(results)*100:.1f}%)")

    # Best and worst performers
    print(f"\n*** TOP 5 BEST PERFORMERS (Lowest D+1 MAE) ***")
    best = results_df.sort('mae_d1').head(5)
    for row in best.iter_rows(named=True):
        print(f"  {row['border']:15s}: D+1 MAE={row['mae_d1']:6.1f} MW, Overall MAE={row['mae_overall']:6.1f} MW")

    print(f"\n*** TOP 5 WORST PERFORMERS (Highest D+1 MAE) ***")
    worst = results_df.sort('mae_d1', descending=True).head(5)
    for row in worst.iter_rows(named=True):
        print(f"  {row['border']:15s}: D+1 MAE={row['mae_d1']:6.1f} MW, Overall MAE={row['mae_overall']:6.1f} MW")

    # MAE degradation over forecast horizon
    print(f"\n*** MAE DEGRADATION OVER FORECAST HORIZON (ALL 14 DAYS) ***")

    for day in range(1, 15):
        col_name = f'mae_d{day}'
        mean_mae_day = results_df[col_name].mean()
        delta = mean_mae_day - mean_mae_d1 if day > 1 else 0
        delta_pct = (delta / mean_mae_d1 * 100) if day > 1 and mean_mae_d1 > 0 else 0

        if day == 1:
            print(f"D+{day:2d}: {mean_mae_day:6.2f} MW (baseline)")
        else:
            print(f"D+{day:2d}: {mean_mae_day:6.2f} MW (+{delta:5.2f} MW, +{delta_pct:5.1f}%)")

    # Final verdict
    print("\n" + "="*70)
    if mean_mae_d1 <= 134:
        print("[OK] TARGET ACHIEVED! Mean D+1 MAE [<=]134 MW")
        print("     Zero-shot multivariate forecasting successful!")
    elif mean_mae_d1 <= 150:
        print("[~] CLOSE TO TARGET. Mean D+1 MAE [<=]150 MW")
        print("    Zero-shot baseline established. Fine-tuning recommended.")
    else:
        print(f"[!] TARGET NOT MET. Mean D+1 MAE: {mean_mae_d1:.2f} MW (Target: [<=]134 MW)")
        print("    Fine-tuning strongly recommended for Phase 2")
    print("="*70)

    # Save summary report
    report_file = Path('results/october_2024_evaluation_report.txt')
    with open(report_file, 'w', encoding='utf-8', errors='replace') as f:
        f.write("="*70 + "\n")
        f.write("OCTOBER 2024 MULTIVARIATE CHRONOS-2 EVALUATION REPORT\n")
        f.write("="*70 + "\n\n")
        f.write(f"Run date: 2024-09-30\n")
        f.write(f"Forecast period: Oct 1-14, 2024 (336 hours)\n")
        f.write(f"Model: amazon/chronos-2 (multivariate, 615 features)\n")
        f.write(f"Borders evaluated: {len(results)}/{len(borders)}\n")
        f.write(f"Forecast time: {forecast_time/60:.2f} minutes\n\n")
        f.write(f"D+1 MAE RESULTS:\n")
        f.write(f"  Mean:   {mean_mae_d1:.2f} MW\n")
        f.write(f"  Median: {median_mae_d1:.2f} MW\n")
        f.write(f"  Min:    {min_mae_d1:.2f} MW\n")
        f.write(f"  Max:    {max_mae_d1:.2f} MW\n\n")
        f.write(f"Target achievement: {below_target}/{len(results)} borders with MAE [<=]150 MW\n\n")
        if mean_mae_d1 <= 134:
            f.write("[OK] TARGET ACHIEVED!\n")
        else:
            f.write(f"[!] Target not met - Fine-tuning recommended\n")

    print(f"\n[OK] Summary report saved to: {report_file}")
    print(f"\nTotal evaluation time: {(time.time() - total_start)/60:.1f} minutes")


if __name__ == '__main__':
    main()