fbmc-chronos2 / src /forecasting /feature_availability.py
Evgueni Poloukarov
feat: add dynamic forecast system to prevent data leakage
f4be780
raw
history blame
13.6 kB
#!/usr/bin/env python3
"""
Feature Availability Module
Categorizes 2,514 features by their availability windows for forecasting.
Purpose: Prevent data leakage by clearly defining what features are available
at run time for different forecast horizons.
Categories:
1. Full-horizon D+14 (known or forward-filled for all 336 hours): temporal, weather, CNEC outages, LTA (forward-filled from D+0)
2. Partial D+1 only (masked D+2-D+14): load forecasts
3. Historical only (not available): prices, generation, demand, lags, etc.
"""
from typing import Dict, List, Tuple, Set
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
class FeatureAvailability:
    """
    Defines availability windows for all features in the dataset.

    Each column name is matched against the categories in
    ``AVAILABILITY_WINDOWS`` in declaration order; the first category that
    matches decides the column's availability horizon.

    Availability Horizons:
    - D+14: Available for full 14-day forecast (temporal, weather, outages, LTA)
    - D+1: Available for day-ahead only (load forecasts)
    - D+0: Current value only, forward-filled (LTA)
    - Historical: Not available for future (prices, generation, demand, lags)
    """

    # Number of hours in the full 14-day forecast window used for bucketing.
    _FULL_HORIZON_HOURS = 336

    # Feature categories with their availability windows.
    # 'horizon_hours' semantics (as consumed by the methods below):
    #   inf / >= 336 -> available over the whole forecast
    #   0 < h < 336  -> available for the first h hours only
    #   0            -> current value only, forward-filled over the forecast
    #   < 0          -> historical only, never available in the future
    AVAILABILITY_WINDOWS = {
        # FULL HORIZON - D+14 (336 hours)
        'temporal': {
            'horizon_hours': float('inf'),  # Always computable
            'description': 'Time-based features (hour, day, month, weekday, etc.)',
            'patterns': ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend'],
            'suffixes': ['_sin', '_cos'],
            'expected_count': 12,
        },
        'weather': {
            'horizon_hours': 336,  # D+14 weather forecasts
            'description': 'Weather forecasts (temp, wind, solar, cloud, pressure)',
            'prefixes': ['temp_', 'wind_', 'wind10m_', 'wind100m_', 'winddir_', 'solar_', 'cloud_', 'pressure_'],
            'expected_count': 375,  # Approximate (52 grid points × ~7 variables)
        },
        'cnec_outages': {
            'horizon_hours': 336,  # D+14+ planned transmission outages
            'description': 'Planned CNEC transmission outages (published weeks ahead)',
            'prefixes': ['outage_cnec_'],
            'expected_count': 176,
        },
        'lta': {
            'horizon_hours': 0,  # D+0 only (current value)
            'description': 'Long-term allocations (forward-filled from D+0)',
            'prefixes': ['lta_'],
            'expected_count': 40,
            'forward_fill': True,  # Special handling: forward-fill current value
        },
        # PARTIAL HORIZON - D+1 only (24 hours)
        'load_forecast': {
            'horizon_hours': 24,  # D+1 only, masked D+2-D+14
            'description': 'Day-ahead load forecasts (published D-1)',
            'prefixes': ['load_forecast_'],
            'expected_count': 12,
            'requires_masking': True,  # Mask hours 25-336
        },
        # HISTORICAL ONLY - Not available for forecasting
        'prices': {
            'horizon_hours': -1,  # Historical only
            'description': 'Day-ahead electricity prices (determined D-1)',
            'prefixes': ['price_'],
            'expected_count': 24,
        },
        'generation': {
            'horizon_hours': -1,
            'description': 'Actual generation by fuel type',
            'prefixes': ['gen_'],
            'expected_count': 183,  # 12 zones × ~15 fuel types
        },
        'demand': {
            'horizon_hours': -1,
            'description': 'Actual electricity demand',
            'prefixes': ['demand_'],
            'expected_count': 24,  # 12 zones + aggregates
        },
        'border_lags': {
            'horizon_hours': -1,
            'description': 'Lagged cross-border flows',
            'patterns': ['_lag_', '_L', 'border_'],
            'expected_count': 264,  # 38 borders × 7 lags (1h, 3h, 6h, 12h, 24h, 168h, 720h)
        },
        'cnec_flows': {
            'horizon_hours': -1,
            'description': 'Historical CNEC flows and constraints',
            'prefixes': ['cnec_'],
            'patterns': ['_flow', '_binding', '_margin', '_ram'],
            'expected_count': 1000,  # Tier-1 CNECs with multiple metrics
        },
        'netpos': {
            'horizon_hours': -1,
            'description': 'Historical net positions',
            'prefixes': ['netpos_'],
            'expected_count': 48,  # 12 zones × 4 metrics
        },
        'system_agg': {
            'horizon_hours': -1,
            'description': 'System-level aggregates',
            'prefixes': ['total_', 'avg_', 'max', 'min', 'std_', 'mean_', 'sum_'],
            'expected_count': 353,  # Various aggregations
        },
        'pumped_storage': {
            'horizon_hours': -1,
            'description': 'Pumped hydro storage generation',
            'prefixes': ['pumped_'],
            'expected_count': 7,  # Countries with pumped storage
        },
        'hydro_storage': {
            'horizon_hours': -1,
            'description': 'Hydro reservoir levels (weekly data)',
            'prefixes': ['hydro_storage_'],
            'expected_count': 7,
        },
    }

    @classmethod
    def categorize_features(cls, columns: List[str]) -> Dict[str, List[str]]:
        """
        Categorize all features by their availability windows.

        The columns 'timestamp' and anything starting with 'target_border_'
        are treated as metadata and skipped entirely. Every other column ends
        up in exactly one of the returned lists.

        Args:
            columns: All column names from dataset

        Returns:
            Dictionary with categories:
            - full_horizon_d14: Available for full 14-day forecast
              (includes forward-filled horizon-0 features such as LTA)
            - partial_d1: Available for only the first part of the forecast
              (requires masking); any horizon strictly between 0 and 336 hours
            - historical: Not available for forecasting
            - uncategorized: Features that don't match any pattern
        """
        buckets: Dict[str, List[str]] = {
            'full_horizon_d14': [],
            'partial_d1': [],
            'historical': [],
            'uncategorized': [],
        }
        for col in columns:
            # Skip metadata columns
            if col == 'timestamp' or col.startswith('target_border_'):
                continue
            config = cls._find_category_config(col)
            if config is None:
                buckets['uncategorized'].append(col)
                continue
            bucket = cls._bucket_for_horizon(config['horizon_hours'])
            buckets[bucket].append(col)
        return buckets

    @classmethod
    def _find_category_config(cls, col: str):
        """Return the config of the first category matching ``col``, or None."""
        for config in cls.AVAILABILITY_WINDOWS.values():
            if cls._matches_category(col, config):
                return config
        return None

    @classmethod
    def _bucket_for_horizon(cls, horizon: float) -> str:
        """Map a category's ``horizon_hours`` to a categorize_features() bucket."""
        if horizon < 0:
            return 'historical'
        # Horizon 0 means "forward-fill the current value", which makes the
        # feature usable across the whole forecast. float('inf') is covered
        # by the >= comparison.
        if horizon == 0 or horizon >= cls._FULL_HORIZON_HOURS:
            return 'full_horizon_d14'
        # Any other positive horizon is only partially available. Previously
        # only exactly 24 was handled here and other values were silently
        # dropped from every bucket.
        return 'partial_d1'

    @classmethod
    def _matches_category(cls, col: str, config: Dict) -> bool:
        """
        Check if column matches category patterns.

        A column matches when any of these hold:
        - one of config['patterns'] occurs anywhere in the name (substring),
        - the name starts with one of config['prefixes'],
        - the name ends with one of config['suffixes'].
        Keys absent from ``config`` are simply not checked.
        """
        # NOTE(review): patterns are substring matches, so a short pattern such
        # as 'day' or '_L' also matches unrelated columns that merely contain
        # it. Combined with first-match-wins ordering this can put a column in
        # an earlier category than intended - verify against the real column
        # names.
        if any(pattern in col for pattern in config.get('patterns', ())):
            return True
        # str.startswith / str.endswith accept a tuple of candidates.
        if col.startswith(tuple(config.get('prefixes', ()))):
            return True
        if col.endswith(tuple(config.get('suffixes', ()))):
            return True
        return False

    @classmethod
    def create_availability_mask(
        cls,
        feature_name: str,
        forecast_horizon_hours: int = 336
    ) -> np.ndarray:
        """
        Create binary availability mask for a feature across forecast horizon.

        Args:
            feature_name: Name of the feature
            forecast_horizon_hours: Length of forecast (default 336 = 14 days)

        Returns:
            float32 array of length ``forecast_horizon_hours``:
            1 = available, 0 = masked/unavailable

        Raises:
            ValueError: If ``forecast_horizon_hours`` is negative (raised by
                numpy when allocating the array).
        """
        config = cls._find_category_config(feature_name)
        # Unknown feature: assume historical (conservative)
        if config is None:
            return np.zeros(forecast_horizon_hours, dtype=np.float32)

        horizon = config['horizon_hours']
        # Historical only
        if horizon < 0:
            return np.zeros(forecast_horizon_hours, dtype=np.float32)
        # Forward-fill (LTA: D+0), full horizon, or infinite (temporal)
        if horizon == 0 or horizon >= forecast_horizon_hours:
            return np.ones(forecast_horizon_hours, dtype=np.float32)
        # Partial horizon (e.g., D+1 = 24 hours): only the leading hours are known
        mask = np.zeros(forecast_horizon_hours, dtype=np.float32)
        mask[:int(horizon)] = 1.0
        return mask

    @classmethod
    def validate_categorization(
        cls,
        categories: Dict[str, List[str]],
        verbose: bool = True
    ) -> Tuple[bool, List[str]]:
        """
        Validate feature categorization against expected counts.

        Checks the total feature count, the plausibility range of the
        full-horizon bucket, the exact size of the partial D+1 bucket, and
        that nothing is left uncategorized.

        Args:
            categories: Output from categorize_features()
            verbose: Print validation details

        Returns:
            (is_valid, warnings) - is_valid is True only when no warning
            was produced.
        """
        warnings = []

        # Total feature count (excl. timestamp + 38 targets)
        total_features = sum(len(v) for v in categories.values())
        expected_total = 2514  # 2,553 columns - 1 timestamp - 38 targets
        if total_features != expected_total:
            warnings.append(
                f"Feature count mismatch: {total_features} vs expected {expected_total}"
            )

        # Check full-horizon D+14 features
        full_d14 = len(categories['full_horizon_d14'])
        # Expected: temporal (12) + weather (~375) + outages (176) + LTA (40) = ~603
        min_full_d14, max_full_d14 = 200, 700
        if full_d14 < min_full_d14 or full_d14 > max_full_d14:
            # Report the bounds that are actually enforced (the message used to
            # quote a different range than the check).
            warnings.append(
                f"Full-horizon D+14 count unusual: {full_d14} "
                f"(expected {min_full_d14}-{max_full_d14})"
            )

        # Check partial D+1 features
        partial_d1 = len(categories['partial_d1'])
        if partial_d1 != 12:
            warnings.append(
                f"Partial D+1 count: {partial_d1} (expected 12 load forecasts)"
            )

        # Check uncategorized
        uncategorized = categories['uncategorized']
        if uncategorized:
            warnings.append(
                f"Uncategorized features: {len(uncategorized)} "
                f"(first 5: {uncategorized[:5]})"
            )

        if verbose:
            print("="*60)
            print("FEATURE CATEGORIZATION VALIDATION")
            print("="*60)
            print(f"Full-horizon D+14: {full_d14:4d} features")
            print(f"Partial D+1: {partial_d1:4d} features")
            print(f"Historical only: {len(categories['historical']):4d} features")
            print(f"Uncategorized: {len(uncategorized):4d} features")
            print(f"Total: {total_features:4d} features")
            if warnings:
                print("\n[!] WARNINGS:")
                for w in warnings:
                    print(f" - {w}")
            else:
                print("\n[OK] Validation passed!")
            print("="*60)

        return len(warnings) == 0, warnings

    @classmethod
    def get_category_summary(cls, categories: Dict[str, List[str]]) -> pd.DataFrame:
        """
        Generate summary table of feature categorization.

        Args:
            categories: Output from categorize_features()

        Returns:
            DataFrame with category, count, availability, masking and up to
            three sample features per bucket. The 'Uncategorized' row is only
            present when that bucket is non-empty.
        """
        # (bucket key, display name, availability text, masking text)
        sections = [
            ('full_horizon_d14', 'Full-horizon D+14', 'D+1 to D+14 (336 hours)', 'None'),
            ('partial_d1', 'Partial D+1', 'D+1 only (24 hours)', 'Mask D+2 to D+14'),
            ('historical', 'Historical only', 'Not available for forecasting', 'All zeros'),
        ]
        if categories['uncategorized']:
            sections.append((
                'uncategorized',
                'Uncategorized',
                'Unknown (conservative: historical)',
                'All zeros (conservative)',
            ))

        summary = [
            {
                'Category': label,
                'Count': len(categories[key]),
                'Availability': availability,
                'Masking': masking,
                'Sample Features': ', '.join(categories[key][:3]),
            }
            for key, label, availability, masking in sections
        ]
        return pd.DataFrame(summary)