| """Engineer ~1,600 JAO features for FBMC forecasting. | |
| Transforms unified JAO data into model-ready features across 10 categories: | |
| 1. Tier-1 CNEC historical (1,000 features) | |
| 2. Tier-2 CNEC historical (360 features) | |
| 3. LTA future covariates (40 features) | |
| 4. NetPos historical lags (48 features) | |
| 5. MaxBEX historical lags (40 features) | |
| 6. Temporal encoding (20 features) | |
| 7. System aggregates (20 features) | |
| 8. Regional proxies (36 features) | |
| 9. PCA clusters (10 features) | |
| 10. Additional lags (27 features) | |
| Author: Claude | |
| Date: 2025-11-06 | |
| """ | |
from pathlib import Path
from typing import List

import numpy as np
import polars as pl
from sklearn.decomposition import PCA


# =========================================================================
# Feature Category 1: Tier-1 CNEC Historical Features
# =========================================================================
def engineer_tier1_cnec_features(
    cnec_hourly: pl.DataFrame,
    tier1_eics: List[str],
    unified: pl.DataFrame
) -> pl.DataFrame:
    """Engineer Tier-1 CNEC historical features.

    For the 54 Tier-1 CNECs (46 physical + 8 Alegro):
    - Binding status, RAM and RAM-utilisation pivots (current values)
    - Binding status: 1-hour lag per CNEC (54 features)
    - RAM: lags 1/3/7/24/168 h for the first 10 CNECs (50 features)
    - Rolling binding aggregates (mean/max/min over 7 d and 30 d) for the
      first 50 CNECs (300 features)
    """
    print("\n[1/11] Engineering Tier-1 CNEC features...")
    # Filter CNEC data to Tier-1 only
    tier1_cnecs = cnec_hourly.filter(pl.col('cnec_eic').is_in(tier1_eics))
    # Create is_binding column (shadow_price > 0 means binding)
    tier1_cnecs = tier1_cnecs.with_columns([
        (pl.col('shadow_price') > 0).cast(pl.Int64).alias('is_binding')
    ])
    # Pivot to wide format: one row per timestamp, one column per CNEC.
    # Key columns: cnec_eic, mtu, is_binding, ram (Remaining Available
    # Margin), fmax (capacity)

    # Pivot binding status
    binding_wide = tier1_cnecs.pivot(
        values='is_binding',
        index='mtu',
        on='cnec_eic',
        aggregate_function='first'
    )
    # Rename columns to cnec_t1_binding_<eic>
    binding_cols = [c for c in binding_wide.columns if c != 'mtu']
    binding_wide = binding_wide.rename({
        c: f'cnec_t1_binding_{c}' for c in binding_cols
    })
    # Pivot RAM
    ram_wide = tier1_cnecs.pivot(
        values='ram',
        index='mtu',
        on='cnec_eic',
        aggregate_function='first'
    )
    ram_cols = [c for c in ram_wide.columns if c != 'mtu']
    ram_wide = ram_wide.rename({
        c: f'cnec_t1_ram_{c}' for c in ram_cols
    })
    # Pivot RAM utilisation (ram / fmax), rounded to 4 decimals; fmax is
    # clipped to >= 1 to avoid division by zero
    tier1_cnecs = tier1_cnecs.with_columns([
        (pl.col('ram') / pl.col('fmax').clip(lower_bound=1)).round(4).alias('ram_util')
    ])
    ram_util_wide = tier1_cnecs.pivot(
        values='ram_util',
        index='mtu',
        on='cnec_eic',
        aggregate_function='first'
    )
    ram_util_cols = [c for c in ram_util_wide.columns if c != 'mtu']
    ram_util_wide = ram_util_wide.rename({
        c: f'cnec_t1_util_{c}' for c in ram_util_cols
    })
    # Join all Tier-1 pivots
    tier1_features = binding_wide.join(ram_wide, on='mtu', how='left')
    tier1_features = tier1_features.join(ram_util_wide, on='mtu', how='left')
    # Create lags (L1 for binding, L1-L168 for RAM)
    tier1_features = tier1_features.sort('mtu')
    # Add 1-hour lag for binding (one feature per CNEC)
    for col in binding_cols:
        binding_col = f'cnec_t1_binding_{col}'
        tier1_features = tier1_features.with_columns([
            pl.col(binding_col).shift(1).alias(f'{binding_col}_L1')
        ])
    # Add 1, 3, 7, 24, 168 hour lags for RAM. Only the first 10 CNECs are
    # lagged, to limit the feature count (10 * 5 = 50 features).
    for col in ram_cols[:10]:
        ram_col = f'cnec_t1_ram_{col}'
        for lag in [1, 3, 7, 24, 168]:
            tier1_features = tier1_features.with_columns([
                pl.col(ram_col).shift(lag).alias(f'{ram_col}_L{lag}')
            ])
    # Add rolling binding-frequency aggregates (mean/max/min over 7d, 30d)
    # for the first 50 Tier-1 CNECs (50 * 6 = 300 features)
    for col in binding_cols[:50]:
        binding_col = f'cnec_t1_binding_{col}'
        tier1_features = tier1_features.with_columns([
            pl.col(binding_col).rolling_mean(window_size=168, min_samples=1).round(3).alias(f'{binding_col}_mean_7d'),
            pl.col(binding_col).rolling_max(window_size=168, min_samples=1).round(3).alias(f'{binding_col}_max_7d'),
            pl.col(binding_col).rolling_min(window_size=168, min_samples=1).round(3).alias(f'{binding_col}_min_7d'),
            pl.col(binding_col).rolling_mean(window_size=720, min_samples=1).round(3).alias(f'{binding_col}_mean_30d'),
            pl.col(binding_col).rolling_max(window_size=720, min_samples=1).round(3).alias(f'{binding_col}_max_30d'),
            pl.col(binding_col).rolling_min(window_size=720, min_samples=1).round(3).alias(f'{binding_col}_min_30d')
        ])
    # Join with unified timeline
    features = unified.select(['mtu']).join(tier1_features, on='mtu', how='left')
    print(f"  Tier-1 CNEC features: {len([c for c in features.columns if c.startswith('cnec_t1_')])} features")
    return features
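

# A hedged, minimal illustration of the pivot -> lag pattern used above, on
# toy data. The EIC names, timestamps and RAM values are made up; this only
# demonstrates the reshaping mechanics, not real JAO content.
def _demo_pivot_and_lag() -> pl.DataFrame:
    """Long CNEC records -> one column per CNEC, plus a 1-hour lag."""
    toy = pl.DataFrame({
        'mtu': [1, 1, 2, 2, 3, 3],           # stand-in hourly timestamps
        'cnec_eic': ['EIC_A', 'EIC_B'] * 3,  # two hypothetical CNECs
        'ram': [100.0, 80.0, 90.0, 85.0, 95.0, 70.0],
    })
    wide = toy.pivot(values='ram', index='mtu', on='cnec_eic',
                     aggregate_function='first')
    # One column per CNEC; shift(1) exposes the previous hour's value
    return wide.sort('mtu').with_columns([
        pl.col('EIC_A').shift(1).alias('EIC_A_L1'),
        pl.col('EIC_B').shift(1).alias('EIC_B_L1'),
    ])
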

# =========================================================================
# Feature Category 2: Tier-2 CNEC Historical Features
# =========================================================================

def engineer_tier2_cnec_features(
    cnec_hourly: pl.DataFrame,
    tier2_eics: List[str],
    unified: pl.DataFrame
) -> pl.DataFrame:
    """Engineer Tier-2 CNEC historical features.

    For the 122 Tier-2 CNECs (less granular than Tier-1):
    - Binding and RAM pivots (current values, all 122 CNECs)
    - Binding status: 1-hour lag for the first 50 CNECs (50 features)
    - RAM: 1-hour lag for the first 50 CNECs (50 features)
    - Rolling 7-day binding frequency for the first 20 CNECs (20 features)
    """
    print("\n[2/11] Engineering Tier-2 CNEC features...")
    # Filter CNEC data to Tier-2 only
    tier2_cnecs = cnec_hourly.filter(pl.col('cnec_eic').is_in(tier2_eics))
    # Create is_binding column (shadow_price > 0 means binding)
    tier2_cnecs = tier2_cnecs.with_columns([
        (pl.col('shadow_price') > 0).cast(pl.Int64).alias('is_binding')
    ])
    # Pivot binding status
    binding_wide = tier2_cnecs.pivot(
        values='is_binding',
        index='mtu',
        on='cnec_eic',
        aggregate_function='first'
    )
    binding_cols = [c for c in binding_wide.columns if c != 'mtu']
    binding_wide = binding_wide.rename({
        c: f'cnec_t2_binding_{c}' for c in binding_cols
    })
    # Pivot RAM
    ram_wide = tier2_cnecs.pivot(
        values='ram',
        index='mtu',
        on='cnec_eic',
        aggregate_function='first'
    )
    ram_cols = [c for c in ram_wide.columns if c != 'mtu']
    ram_wide = ram_wide.rename({
        c: f'cnec_t2_ram_{c}' for c in ram_cols
    })
    # Join Tier-2 pivots
    tier2_features = binding_wide.join(ram_wide, on='mtu', how='left')
    tier2_features = tier2_features.sort('mtu')
    # Add 1-hour lag for binding (first 50 CNECs, to limit feature count)
    for col in binding_cols[:50]:
        binding_col = f'cnec_t2_binding_{col}'
        tier2_features = tier2_features.with_columns([
            pl.col(binding_col).shift(1).alias(f'{binding_col}_L1')
        ])
    # Add 1-hour lag for RAM (first 50 CNECs)
    for col in ram_cols[:50]:
        ram_col = f'cnec_t2_ram_{col}'
        tier2_features = tier2_features.with_columns([
            pl.col(ram_col).shift(1).alias(f'{ram_col}_L1')
        ])
    # Add rolling 7-day binding frequency (first 20 CNECs)
    for col in binding_cols[:20]:
        binding_col = f'cnec_t2_binding_{col}'
        tier2_features = tier2_features.with_columns([
            pl.col(binding_col).rolling_mean(window_size=168, min_samples=1).alias(f'{binding_col}_mean_7d')
        ])
    # Join with unified timeline
    features = unified.select(['mtu']).join(tier2_features, on='mtu', how='left')
    print(f"  Tier-2 CNEC features: {len([c for c in features.columns if c.startswith('cnec_t2_')])} features")
    return features

# =========================================================================
# Feature Category 3: PTDF (Power Transfer Distribution Factors)
# =========================================================================

def engineer_ptdf_features(
    cnec_hourly: pl.DataFrame,
    tier1_eics: List[str],
    tier2_eics: List[str],
    unified: pl.DataFrame
) -> pl.DataFrame:
    """Engineer PTDF features.

    PTDFs quantify how a 1 MW injection in a zone changes the flow on a
    CNEC. Critical for understanding cross-border coupling.

    Categories:
    1. Tier-1 individual PTDFs: 54 CNECs x 12 zones = 648 features
    2. Tier-2 aggregated PTDFs (MVP: statistics across all Tier-2 CNECs,
       not per-border groups): 12 zones x 5 stats = 60 features
    3. PTDF-NetPos interactions: up to 12 features (one per zone)

    Total: ~720 features
    """
    print("\n[3/11] Engineering PTDF features...")
    # PTDF zone columns (12 Core FBMC zones)
    ptdf_cols = ['ptdf_AT', 'ptdf_BE', 'ptdf_CZ', 'ptdf_DE', 'ptdf_FR',
                 'ptdf_HR', 'ptdf_HU', 'ptdf_NL', 'ptdf_PL', 'ptdf_RO',
                 'ptdf_SI', 'ptdf_SK']

    # --- Tier-1 individual PTDFs ---
    print("  Processing Tier-1 individual PTDFs...")
    tier1_cnecs = cnec_hourly.filter(pl.col('cnec_eic').is_in(tier1_eics))
    # For each PTDF column, pivot across Tier-1 CNECs
    ptdf_t1_features = unified.select(['mtu'])
    for ptdf_col in ptdf_cols:
        # Pivot PTDF values for this zone
        ptdf_wide = tier1_cnecs.pivot(
            values=ptdf_col,
            index='mtu',
            on='cnec_eic',
            aggregate_function='first'
        )
        # Rename columns: cnec_eic -> cnec_t1_ptdf_<ZONE>_<EIC>
        zone = ptdf_col.replace('ptdf_', '')
        ptdf_wide = ptdf_wide.rename({
            c: f'cnec_t1_ptdf_{zone}_{c}' for c in ptdf_wide.columns if c != 'mtu'
        })
        # Join to features
        ptdf_t1_features = ptdf_t1_features.join(ptdf_wide, on='mtu', how='left')
    tier1_ptdf_count = len([c for c in ptdf_t1_features.columns if c.startswith('cnec_t1_ptdf_')])
    print(f"  Tier-1 PTDF features: {tier1_ptdf_count}")

    # --- Tier-2 aggregated PTDFs ---
    print("  Processing Tier-2 aggregated PTDFs...")
    tier2_cnecs = cnec_hourly.filter(pl.col('cnec_eic').is_in(tier2_eics))
    # Grouping Tier-2 CNECs by border would require border metadata (e.g.
    # parsed from cnec_name or a direction column). For the MVP we instead
    # aggregate PTDFs across all Tier-2 CNECs per timestamp.
    ptdf_t2_features = unified.select(['mtu'])
    for ptdf_col in ptdf_cols:
        zone = ptdf_col.replace('ptdf_', '')
        # Aggregate Tier-2 PTDFs per timestamp: mean, max, min, std, |mean|
        tier2_ptdf_agg = tier2_cnecs.group_by('mtu').agg([
            pl.col(ptdf_col).mean().alias(f'cnec_t2_ptdf_{zone}_mean'),
            pl.col(ptdf_col).max().alias(f'cnec_t2_ptdf_{zone}_max'),
            pl.col(ptdf_col).min().alias(f'cnec_t2_ptdf_{zone}_min'),
            pl.col(ptdf_col).std().alias(f'cnec_t2_ptdf_{zone}_std'),
            pl.col(ptdf_col).abs().mean().alias(f'cnec_t2_ptdf_{zone}_abs_mean')
        ])
        # Join to features
        ptdf_t2_features = ptdf_t2_features.join(tier2_ptdf_agg, on='mtu', how='left')
    tier2_ptdf_count = len([c for c in ptdf_t2_features.columns if c.startswith('cnec_t2_ptdf_')])
    print(f"  Tier-2 PTDF features: {tier2_ptdf_count}")

    # --- PTDF-NetPos interactions ---
    print("  Processing PTDF-NetPos interactions...")
    # For each zone, interact the aggregated PTDF with the zone's net
    # position. Note: this expects netpos_<ZONE> columns in unified; if net
    # positions are stored only as min*/max* columns (as
    # engineer_netpos_features assumes), this block yields zero features.
    ptdf_netpos_features = unified.select(['mtu'])
    for zone in ['AT', 'BE', 'CZ', 'DE', 'FR', 'HR', 'HU', 'NL', 'PL', 'RO', 'SI', 'SK']:
        netpos_col = f'netpos_{zone}'
        if netpos_col in unified.columns:
            # Extract zone PTDF aggregate from ptdf_t2_features
            ptdf_mean_col = f'cnec_t2_ptdf_{zone}_mean'
            if ptdf_mean_col in ptdf_t2_features.columns:
                # Interaction: PTDF_mean x NetPos
                interaction = (
                    ptdf_t2_features[ptdf_mean_col].fill_null(0) *
                    unified[netpos_col].fill_null(0)
                ).alias(f'ptdf_netpos_{zone}')
                ptdf_netpos_features = ptdf_netpos_features.with_columns([interaction])
    ptdf_netpos_count = len([c for c in ptdf_netpos_features.columns if c.startswith('ptdf_netpos_')])
    print(f"  PTDF-NetPos features: {ptdf_netpos_count}")

    # --- Combine all PTDF features ---
    all_ptdf_features = ptdf_t1_features.join(ptdf_t2_features, on='mtu', how='left')
    all_ptdf_features = all_ptdf_features.join(ptdf_netpos_features, on='mtu', how='left')
    total_ptdf_features = len([c for c in all_ptdf_features.columns if c != 'mtu'])
    print(f"  Total PTDF features: {total_ptdf_features}")
    return all_ptdf_features
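

# Why the PTDF x NetPos interaction is informative: in the FBMC linearisation
# the flow induced on a CNEC is approximately the dot product of its zonal
# PTDFs with the zonal net positions. The feature above (mean Tier-2 PTDF x
# NetPos per zone) is a coarse proxy for one term of that sum. A hedged toy
# sketch with made-up numbers:
def _demo_ptdf_flow_contribution() -> float:
    """Toy dot product: flow on a CNEC ~ sum_z PTDF_z * NetPos_z (MW)."""
    ptdf = {'DE': 0.30, 'FR': -0.10, 'NL': 0.05}        # made-up sensitivities
    netpos = {'DE': 2000.0, 'FR': -500.0, 'NL': 300.0}  # made-up positions, MW
    # 0.30*2000 + (-0.10)*(-500) + 0.05*300 = 600 + 50 + 15 = 665 MW
    return sum(ptdf[z] * netpos[z] for z in ptdf)
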

# =========================================================================
# Feature Category 4: LTA Future Covariates
# =========================================================================

def engineer_lta_features(unified: pl.DataFrame) -> pl.DataFrame:
    """Engineer ~40 LTA future covariate features.

    LTA (Long Term Allocations) are known years in advance via auctions:
    - 38 border columns (one per border)
    - Forward-looking (D+1 to D+14 known at forecast time)
    - No lags needed (future covariates)

    Total: ~40 features
    """
    print("\n[4/11] Engineering LTA future covariate features...")
    # LTA columns: border_* columns tagged 'lta'. This mirrors
    # engineer_maxbex_features below, which takes exactly the border_*
    # columns WITHOUT 'lta' as MaxBEX.
    lta_cols = [c for c in unified.columns if c.startswith('border_') and 'lta' in c.lower()]
    # LTAs are future covariates - use as-is (no lags).
    # Add aggregate features: total and mean allocated capacity per hour.
    lta_sum = unified.select(lta_cols).sum_horizontal().alias('lta_total_allocated')
    lta_mean = unified.select(lta_cols).mean_horizontal().alias('lta_mean_allocated')
    features = unified.select(['mtu']).with_columns([
        lta_sum,
        lta_mean
    ])
    # Add individual LTA borders (38 features)
    for col in lta_cols:
        features = features.with_columns([
            unified[col].alias(f'lta_{col}')
        ])
    print(f"  LTA features: {len([c for c in features.columns if c.startswith('lta_')])} features")
    return features
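

# Hedged illustration of the row-wise aggregates used above: Polars
# sum_horizontal/mean_horizontal reduce across columns within each row.
# Column names and values here are made up.
def _demo_horizontal_aggregates() -> pl.DataFrame:
    """Row-wise total and mean across two toy LTA border columns."""
    toy = pl.DataFrame({
        'border_AT_DE_lta': [100.0, 120.0],
        'border_DE_FR_lta': [200.0, 180.0],
    })
    return toy.with_columns([
        toy.sum_horizontal().alias('lta_total_allocated'),   # 300, 300
        toy.mean_horizontal().alias('lta_mean_allocated'),   # 150, 150
    ])
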

# =========================================================================
# Feature Categories 5-11: Remaining feature categories
# =========================================================================

def engineer_netpos_features(unified: pl.DataFrame) -> pl.DataFrame:
    """Engineer 84 Net Position features (28 current + 56 lags).

    Net Positions are zone-level scheduled positions (long/short MW):
    - min/max values for each of the 12 Core FBMC zones
    - plus the ALEGrO virtual hubs (ALBE, ALDE)
    - L24 and L72 lags (L1 is skipped: it adds little value for net
      positions)

    Total: 28 current + 56 lags = 84 features
    """
    print("\n[5/11] Engineering NetPos features...")
    # Net Position columns (min/max for each zone)
    netpos_cols = [c for c in unified.columns if c.startswith('min') or c.startswith('max')]
    print(f"  Found {len(netpos_cols)} Net Position columns")
    # Start with current values
    features = unified.select(['mtu'] + netpos_cols)
    # Add L24 and L72 lags for all Net Position columns
    for col in netpos_cols:
        features = features.with_columns([
            pl.col(col).shift(24).alias(f'{col}_L24'),
            pl.col(col).shift(72).alias(f'{col}_L72')
        ])
    netpos_feature_count = len([c for c in features.columns if c != 'mtu'])
    print(f"  NetPos features: {netpos_feature_count} features")
    return features


def engineer_maxbex_features(unified: pl.DataFrame) -> pl.DataFrame:
    """Engineer 76 MaxBEX lag features (38 borders x 2 lags).

    MaxBEX historical lags provide:
    - L24: 24-hour lag (yesterday, same hour)
    - L72: 72-hour lag (3 days ago, same hour)

    Total: 38 borders x 2 lags = 76 features
    """
    print("\n[6/11] Engineering MaxBEX features...")
    # MaxBEX border columns (border_* without the 'lta' tag)
    maxbex_cols = [c for c in unified.columns if c.startswith('border_') and 'lta' not in c.lower()]
    print(f"  Found {len(maxbex_cols)} MaxBEX border columns")
    features = unified.select(['mtu'])
    # Add L24 and L72 lags for all 38 borders
    for col in maxbex_cols:
        features = features.with_columns([
            unified[col].shift(24).alias(f'{col}_L24'),
            unified[col].shift(72).alias(f'{col}_L72')
        ])
    maxbex_feature_count = len([c for c in features.columns if c != 'mtu'])
    print(f"  MaxBEX lag features: {maxbex_feature_count} features")
    return features


def engineer_temporal_features(unified: pl.DataFrame) -> pl.DataFrame:
    """Engineer ~20 temporal encoding features."""
    print("\n[7/11] Engineering temporal features...")
    # Extract temporal features from mtu
    features = unified.select(['mtu']).with_columns([
        pl.col('mtu').dt.hour().alias('hour'),
        pl.col('mtu').dt.day().alias('day'),
        pl.col('mtu').dt.month().alias('month'),
        pl.col('mtu').dt.weekday().alias('weekday'),
        pl.col('mtu').dt.year().alias('year'),
        # Polars weekday is ISO: Monday=1 ... Sunday=7, so weekend is >= 6
        (pl.col('mtu').dt.weekday() >= 6).cast(pl.Int64).alias('is_weekend'),
        # Cyclic encoding for hour (sin/cos)
        (pl.col('mtu').dt.hour() * 2 * np.pi / 24).sin().alias('hour_sin'),
        (pl.col('mtu').dt.hour() * 2 * np.pi / 24).cos().alias('hour_cos'),
        # Cyclic encoding for month
        (pl.col('mtu').dt.month() * 2 * np.pi / 12).sin().alias('month_sin'),
        (pl.col('mtu').dt.month() * 2 * np.pi / 12).cos().alias('month_cos'),
        # Cyclic encoding for weekday
        (pl.col('mtu').dt.weekday() * 2 * np.pi / 7).sin().alias('weekday_sin'),
        (pl.col('mtu').dt.weekday() * 2 * np.pi / 7).cos().alias('weekday_cos')
    ])
    print(f"  Temporal features: {len([c for c in features.columns if c != 'mtu'])} features")
    return features
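

# Why the sin/cos encoding above helps: raw hour values put 23:00 and 00:00
# a distance of 23 apart, while on the unit circle they are adjacent. A
# small self-contained check (standard library only):
def _demo_cyclic_hour_encoding() -> float:
    """Distance between hour 23 and hour 0 in (sin, cos) space (~0.26)."""
    import math

    def enc(h: int) -> tuple:
        return (math.sin(2 * math.pi * h / 24), math.cos(2 * math.pi * h / 24))

    (s1, c1), (s2, c2) = enc(23), enc(0)
    # Chord length between adjacent points on the unit circle:
    # 2*sin(pi/24) ~= 0.261
    return math.hypot(s1 - s2, c1 - c2)
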

def engineer_system_aggregates(unified: pl.DataFrame) -> pl.DataFrame:
    """Engineer ~20 system aggregate features."""
    print("\n[8/11] Engineering system aggregate features...")
    # Planned: total capacity, utilization, regional sums.
    # Placeholder: returns mtu only for now (a possible sketch follows below).
    return unified.select(['mtu'])
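

# A possible implementation sketch for the system-aggregate placeholder
# above: row-wise totals over the MaxBEX border columns. It assumes the
# 'border_*' naming used elsewhere in this script and is NOT wired into the
# pipeline.
def _sketch_system_aggregates(unified: pl.DataFrame) -> pl.DataFrame:
    """Hourly system-wide MaxBEX total and mean (sketch, not called)."""
    maxbex_cols = [c for c in unified.columns
                   if c.startswith('border_') and 'lta' not in c.lower()]
    if not maxbex_cols:
        return unified.select(['mtu'])
    return unified.select(['mtu']).with_columns([
        unified.select(maxbex_cols).sum_horizontal().alias('sys_maxbex_total'),
        unified.select(maxbex_cols).mean_horizontal().alias('sys_maxbex_mean'),
    ])
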

def engineer_regional_proxies(unified: pl.DataFrame) -> pl.DataFrame:
    """Engineer ~36 regional proxy features."""
    print("\n[9/11] Engineering regional proxy features...")
    # Planned: regional capacity sums (North, South, East, West).
    # Placeholder: returns mtu only for now (a possible sketch follows below).
    return unified.select(['mtu'])
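

# A possible sketch for the regional-proxy placeholder above. The zone ->
# region grouping here is hypothetical; the real mapping would come from the
# project's own definitions. NOT wired into the pipeline.
def _sketch_regional_proxies(unified: pl.DataFrame) -> pl.DataFrame:
    """Sum MaxBEX capacity over borders touching each (made-up) region."""
    regions = {'north': ['DE', 'NL', 'PL'], 'south': ['SI', 'HR', 'RO']}
    out = unified.select(['mtu'])
    for region, zones in regions.items():
        cols = [c for c in unified.columns
                if c.startswith('border_') and 'lta' not in c.lower()
                and any(z in c for z in zones)]
        if cols:
            out = out.with_columns([
                unified.select(cols).sum_horizontal().alias(f'region_{region}_capacity')
            ])
    return out
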

def engineer_pca_clusters(unified: pl.DataFrame, cnec_hourly: pl.DataFrame) -> pl.DataFrame:
    """Engineer ~10 PCA cluster features."""
    print("\n[10/11] Engineering PCA cluster features...")
    # Planned: PCA on CNEC binding patterns.
    # Placeholder: returns mtu only for now (a possible sketch follows below).
    return unified.select(['mtu'])
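

# A possible sketch for the PCA placeholder above: principal components of
# the hourly CNEC binding matrix, using the PCA import at the top of this
# file. It assumes the same shadow_price/cnec_eic columns used elsewhere;
# n_components is illustrative. NOT wired into the pipeline.
def _sketch_pca_clusters(cnec_hourly: pl.DataFrame,
                         n_components: int = 10) -> pl.DataFrame:
    """PCA scores of the hours x CNECs binding matrix (sketch, not called)."""
    binding = cnec_hourly.with_columns([
        (pl.col('shadow_price') > 0).cast(pl.Int64).alias('is_binding')
    ]).pivot(values='is_binding', index='mtu', on='cnec_eic',
             aggregate_function='first').sort('mtu')
    X = binding.drop('mtu').fill_null(0).to_numpy()
    k = min(n_components, X.shape[1])
    scores = PCA(n_components=k).fit_transform(X)
    pca_df = pl.DataFrame({f'pca_binding_{i}': scores[:, i] for i in range(k)})
    return pl.concat([binding.select(['mtu']), pca_df], how='horizontal')
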

def engineer_additional_lags(unified: pl.DataFrame) -> pl.DataFrame:
    """Engineer ~27 additional lag features."""
    print("\n[11/11] Engineering additional lag features...")
    # Planned: extra lags for key features.
    # Placeholder: returns mtu only for now (a possible sketch follows below).
    return unified.select(['mtu'])
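

# A possible sketch for the additional-lag placeholder above. The lag
# choices (L48, L168) are illustrative, not taken from the source. NOT
# wired into the pipeline.
def _sketch_additional_lags(unified: pl.DataFrame) -> pl.DataFrame:
    """2-day and 1-week lags on the MaxBEX borders (sketch, not called)."""
    maxbex_cols = [c for c in unified.columns
                   if c.startswith('border_') and 'lta' not in c.lower()]
    out = unified.select(['mtu'] + maxbex_cols).sort('mtu')
    for col in maxbex_cols:
        out = out.with_columns([
            pl.col(col).shift(48).alias(f'{col}_L48'),
            pl.col(col).shift(168).alias(f'{col}_L168'),
        ])
    return out.drop(maxbex_cols)
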

# =========================================================================
# Main Feature Engineering Pipeline
# =========================================================================

def engineer_jao_features(
    unified_path: Path,
    cnec_hourly_path: Path,
    master_cnec_path: Path,
    output_dir: Path
) -> pl.DataFrame:
    """Engineer all JAO features using the master CNEC list (176 unique).

    Args:
        unified_path: Path to unified JAO data
        cnec_hourly_path: Path to CNEC hourly data
        master_cnec_path: Path to master CNEC list (176 unique: 168 physical + 8 Alegro)
        output_dir: Directory to save features

    Returns:
        DataFrame with all engineered features and target columns
    """
    print("\n" + "=" * 80)
    print("JAO FEATURE ENGINEERING (MASTER CNEC LIST - 176 UNIQUE)")
    print("=" * 80)

    # Load data
    print("\nLoading data...")
    unified = pl.read_parquet(unified_path)
    cnec_hourly = pl.read_parquet(cnec_hourly_path)
    master_cnecs = pl.read_csv(master_cnec_path)
    print(f"  Unified data: {unified.shape}")
    print(f"  CNEC hourly: {cnec_hourly.shape}")
    print(f"  Master CNEC list: {len(master_cnecs)} unique CNECs")

    # Validate master list
    unique_eics = master_cnecs['cnec_eic'].n_unique()
    assert unique_eics == 176, f"Expected 176 unique CNECs, got {unique_eics}"
    assert len(master_cnecs) == 176, f"Expected 176 rows in master list, got {len(master_cnecs)}"

    # Get CNEC EIC lists by tier
    # Tier 1: "Tier 1" OR "Tier 1 (Alegro)" = 46 physical + 8 Alegro = 54 total
    tier1_cnecs = master_cnecs.filter(pl.col('tier').str.contains('Tier 1'))
    tier1_eics = tier1_cnecs['cnec_eic'].to_list()
    # Tier 2: "Tier 2" only = 122 physical
    tier2_cnecs = master_cnecs.filter(pl.col('tier').str.contains('Tier 2'))
    tier2_eics = tier2_cnecs['cnec_eic'].to_list()

    # Validation checks
    print("\n  CNEC Breakdown:")
    print(f"    Tier-1 (includes 8 Alegro): {len(tier1_eics)} CNECs")
    print(f"    Tier-2 (physical only): {len(tier2_eics)} CNECs")
    print(f"    Total unique: {len(tier1_eics) + len(tier2_eics)} CNECs")
    assert len(tier1_eics) == 54, f"Expected 54 Tier-1 CNECs (46 physical + 8 Alegro), got {len(tier1_eics)}"
    assert len(tier2_eics) == 122, f"Expected 122 Tier-2 CNECs, got {len(tier2_eics)}"
    assert len(tier1_eics) + len(tier2_eics) == 176, "Tier counts don't sum to 176!"

    # Engineer features by category
    print("\nEngineering features...")
    feat_tier1 = engineer_tier1_cnec_features(cnec_hourly, tier1_eics, unified)
    feat_tier2 = engineer_tier2_cnec_features(cnec_hourly, tier2_eics, unified)
    feat_ptdf = engineer_ptdf_features(cnec_hourly, tier1_eics, tier2_eics, unified)
    feat_lta = engineer_lta_features(unified)
    feat_netpos = engineer_netpos_features(unified)
    feat_maxbex = engineer_maxbex_features(unified)
    feat_temporal = engineer_temporal_features(unified)
    feat_system = engineer_system_aggregates(unified)
    feat_regional = engineer_regional_proxies(unified)
    feat_pca = engineer_pca_clusters(unified, cnec_hourly)
    feat_lags = engineer_additional_lags(unified)

    # Combine all features
    print("\nCombining all feature categories...")
    # Start with Tier-1 (has mtu)
    all_features = feat_tier1.clone()
    # Join all other feature sets on mtu
    for feat_df in [feat_tier2, feat_ptdf, feat_lta, feat_netpos, feat_maxbex,
                    feat_temporal, feat_system, feat_regional, feat_pca, feat_lags]:
        all_features = all_features.join(feat_df, on='mtu', how='left')

    # Add target variables from DIRECTIONAL FLOWS (not MaxBEX capacity).
    # JAO provides directional flows (CZ>PL, PL>CZ, ...) as separate
    # columns; each direction becomes its own forecast target, e.g.
    #   target_border_CZ_PL = flow from CZ to PL (the CZ>PL column)
    #   target_border_PL_CZ = flow from PL to CZ (the PL>CZ column)
    directional_cols = [c for c in unified.columns if '>' in c]
    print(f"\n[INFO] Creating targets for {len(directional_cols)} directional flows...")
    # Create one target per directional flow (CZ>PL -> target_border_CZ_PL)
    for col in sorted(directional_cols):
        from_country, to_country = col.split('>')
        # Target name: target_border_{FROM}_{TO} (preserves direction)
        target_name = f'target_border_{from_country}_{to_country}'
        all_features = all_features.with_columns([
            unified[col].alias(target_name)
        ])

    print(f"\n[INFO] Adding volatility features for {len(directional_cols)} directional borders...")
    print("[INFO] Volatility features help the model capture hour-to-hour variation")
    # Volatility features for each directional border: these teach the
    # model about hourly change patterns and volatility magnitude.
    for col in sorted(directional_cols):
        from_country, to_country = col.split('>')
        border_code = f'{from_country}_{to_country}'
        # The target series created above
        target_col = f'target_border_{border_code}'
        # 1. Hour-over-hour delta: target[t] - target[t-1]
        #    Helps the model learn rapid swings and ramps
        delta_col = f'target_delta_h1_{border_code}'
        all_features = all_features.with_columns([
            (pl.col(target_col) - pl.col(target_col).shift(1)).alias(delta_col)
        ])
        # 2. 24-hour rolling volatility: rolling_std(target, 24h)
        #    Informs the model about the magnitude of daily variation
        vol_col = f'target_vol_24h_{border_code}'
        all_features = all_features.with_columns([
            pl.col(target_col).rolling_std(window_size=24, min_samples=1).alias(vol_col)
        ])
        # 3. 6-hour range: rolling_max(6h) - rolling_min(6h)
        #    Detects intraday swings (morning/evening ramps, peak/off-peak)
        range_col = f'target_range_6h_{border_code}'
        all_features = all_features.with_columns([
            (pl.col(target_col).rolling_max(window_size=6, min_samples=1) -
             pl.col(target_col).rolling_min(window_size=6, min_samples=1)).alias(range_col)
        ])
    print(f"[OK] Added {len(directional_cols) * 3} volatility features:")
    print(f"  - {len(directional_cols)} hour-over-hour delta features")
    print(f"  - {len(directional_cols)} 24-hour rolling volatility features")
    print(f"  - {len(directional_cols)} 6-hour range features")

    # Remove duplicate join columns if any
    if 'mtu_right' in all_features.columns:
        all_features = all_features.drop([c for c in all_features.columns if c.endswith('_right')])

    # Final validation
    print("\n" + "=" * 80)
    print("FEATURE ENGINEERING COMPLETE")
    print("=" * 80)
    print(f"Total features: {all_features.shape[1] - 1} (excluding mtu)")
    print(f"Total rows: {len(all_features):,}")
    print(f"Null count: {all_features.null_count().sum_horizontal()[0]:,}")

    # Save features
    output_path = output_dir / 'features_jao_24month.parquet'
    all_features.write_parquet(output_path)
    print(f"\nFeatures saved: {output_path}")
    print(f"File size: {output_path.stat().st_size / (1024**2):.2f} MB")
    print("=" * 80)
    print()
    return all_features
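

# Hedged worked example of the three volatility features created in the
# pipeline above (delta, rolling std, rolling range), on a toy series with
# made-up values and a scaled-down window of 3 instead of 24/6 hours.
def _demo_volatility_features() -> pl.DataFrame:
    """delta_h1, vol_3h and range_3h on a 6-point toy flow series."""
    toy = pl.DataFrame({'flow': [100.0, 140.0, 90.0, 130.0, 95.0, 150.0]})
    return toy.with_columns([
        (pl.col('flow') - pl.col('flow').shift(1)).alias('delta_h1'),
        pl.col('flow').rolling_std(window_size=3, min_samples=1).alias('vol_3h'),
        (pl.col('flow').rolling_max(window_size=3, min_samples=1) -
         pl.col('flow').rolling_min(window_size=3, min_samples=1)).alias('range_3h'),
    ])
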

def main():
    """Main execution using the master CNEC list (176 unique)."""
    # Paths
    base_dir = Path.cwd()
    processed_dir = base_dir / 'data' / 'processed'
    unified_path = processed_dir / 'unified_jao_24month.parquet'
    cnec_hourly_path = processed_dir / 'cnec_hourly_24month.parquet'
    master_cnec_path = processed_dir / 'cnecs_master_176.csv'
    # Verify files exist
    for path in [unified_path, cnec_hourly_path, master_cnec_path]:
        if not path.exists():
            raise FileNotFoundError(f"Required file not found: {path}")
    # Engineer features
    features = engineer_jao_features(
        unified_path,
        cnec_hourly_path,
        master_cnec_path,
        processed_dir
    )
    print("SUCCESS: JAO features re-engineered with deduplicated 176 CNECs and saved to data/processed/")


if __name__ == '__main__':
    main()