Spaces:
Sleeping
Sleeping
File size: 13,583 Bytes
f4be780 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 |
#!/usr/bin/env python3
"""
Feature Availability Module
Categorizes 2,514 features by their availability windows for forecasting.
Purpose: Prevent data leakage by clearly defining what features are available
at run time for different forecast horizons.
Categories:
1. Full-horizon D+14 (always known): temporal, weather, CNEC outages, LTA
2. Partial D+1 only (masked D+2-D+14): load forecasts
3. Historical only (not available): prices, generation, demand, lags, etc.
"""
from typing import Dict, List, Tuple, Set
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
class FeatureAvailability:
    """
    Defines availability windows for all features in the dataset.

    Every feature column is resolved to the FIRST entry of
    ``AVAILABILITY_WINDOWS`` (in declaration order) whose patterns,
    prefixes or suffixes match the column name. The entry's
    ``horizon_hours`` then decides how far into the forecast window the
    feature may be used.

    Availability Horizons:
    - D+14: Available for full 14-day forecast (temporal, weather, outages, LTA)
    - D+1: Available for day-ahead only (load forecasts)
    - D+0: Current value only, forward-filled (LTA)
    - Historical: Not available for future (prices, generation, demand, lags)
    """

    # Number of hours regarded as the "full" forecast horizon (14 days).
    FULL_HORIZON_HOURS = 336

    # Feature categories with their availability windows.
    #
    # Keys read by the methods of this class:
    #   horizon_hours - inf / >= 336: full horizon, 0: forward-filled,
    #                   other positive value: available for that many hours,
    #                   negative: historical only
    #   patterns      - substrings searched anywhere in the column name
    #   prefixes      - column name must start with one of these
    #   suffixes      - column name must end with one of these
    # The remaining keys (description, expected_count, forward_fill,
    # requires_masking) are informational; no method in this class reads them.
    #
    # Declaration order matters: the first matching category wins.
    AVAILABILITY_WINDOWS = {
        # FULL HORIZON - D+14 (336 hours)
        'temporal': {
            'horizon_hours': float('inf'),  # Always computable
            'description': 'Time-based features (hour, day, month, weekday, etc.)',
            'patterns': ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend'],
            'suffixes': ['_sin', '_cos'],
            'expected_count': 12,
        },
        'weather': {
            'horizon_hours': 336,  # D+14 weather forecasts
            'description': 'Weather forecasts (temp, wind, solar, cloud, pressure)',
            'prefixes': ['temp_', 'wind_', 'wind10m_', 'wind100m_', 'winddir_', 'solar_', 'cloud_', 'pressure_'],
            'expected_count': 375,  # Approximate (52 grid points × ~7 variables)
        },
        'cnec_outages': {
            'horizon_hours': 336,  # D+14+ planned transmission outages
            'description': 'Planned CNEC transmission outages (published weeks ahead)',
            'prefixes': ['outage_cnec_'],
            'expected_count': 176,
        },
        'lta': {
            'horizon_hours': 0,  # D+0 only (current value)
            'description': 'Long-term allocations (forward-filled from D+0)',
            'prefixes': ['lta_'],
            'expected_count': 40,
            'forward_fill': True,  # Special handling: forward-fill current value
        },
        # PARTIAL HORIZON - D+1 only (24 hours)
        'load_forecast': {
            'horizon_hours': 24,  # D+1 only, masked D+2-D+14
            'description': 'Day-ahead load forecasts (published D-1)',
            'prefixes': ['load_forecast_'],
            'expected_count': 12,
            'requires_masking': True,  # Mask hours 25-336
        },
        # HISTORICAL ONLY - Not available for forecasting
        'prices': {
            'horizon_hours': -1,  # Historical only
            'description': 'Day-ahead electricity prices (determined D-1)',
            'prefixes': ['price_'],
            'expected_count': 24,
        },
        'generation': {
            'horizon_hours': -1,
            'description': 'Actual generation by fuel type',
            'prefixes': ['gen_'],
            'expected_count': 183,  # 12 zones × ~15 fuel types
        },
        'demand': {
            'horizon_hours': -1,
            'description': 'Actual electricity demand',
            'prefixes': ['demand_'],
            'expected_count': 24,  # 12 zones + aggregates
        },
        'border_lags': {
            'horizon_hours': -1,
            'description': 'Lagged cross-border flows',
            'patterns': ['_lag_', '_L', 'border_'],
            'expected_count': 264,  # 38 borders × 7 lags (1h, 3h, 6h, 12h, 24h, 168h, 720h)
        },
        'cnec_flows': {
            'horizon_hours': -1,
            'description': 'Historical CNEC flows and constraints',
            'prefixes': ['cnec_'],
            'patterns': ['_flow', '_binding', '_margin', '_ram'],
            'expected_count': 1000,  # Tier-1 CNECs with multiple metrics
        },
        'netpos': {
            'horizon_hours': -1,
            'description': 'Historical net positions',
            'prefixes': ['netpos_'],
            'expected_count': 48,  # 12 zones × 4 metrics
        },
        'system_agg': {
            'horizon_hours': -1,
            'description': 'System-level aggregates',
            # NOTE(review): 'max' and 'min' have no trailing underscore, unlike
            # their siblings - confirm whether that is intentional.
            'prefixes': ['total_', 'avg_', 'max', 'min', 'std_', 'mean_', 'sum_'],
            'expected_count': 353,  # Various aggregations
        },
        'pumped_storage': {
            'horizon_hours': -1,
            'description': 'Pumped hydro storage generation',
            'prefixes': ['pumped_'],
            'expected_count': 7,  # Countries with pumped storage
        },
        'hydro_storage': {
            'horizon_hours': -1,
            'description': 'Hydro reservoir levels (weekly data)',
            'prefixes': ['hydro_storage_'],
            'expected_count': 7,
        },
    }

    @classmethod
    def _find_config(cls, col: str):
        """
        Return the config dict of the first category matching ``col``.

        Categories are tried in the declaration order of
        ``AVAILABILITY_WINDOWS``. Returns ``None`` when nothing matches.
        Shared by ``categorize_features`` and ``create_availability_mask`` so
        both always resolve a column to the same category.
        """
        for config in cls.AVAILABILITY_WINDOWS.values():
            if cls._matches_category(col, config):
                return config
        return None

    @classmethod
    def categorize_features(cls, columns: List[str]) -> Dict[str, List[str]]:
        """
        Categorize all features by their availability windows.

        The column ``'timestamp'`` and every column starting with
        ``'target_border_'`` are skipped and appear in no bucket.

        Args:
            columns: All column names from dataset

        Returns:
            Dictionary with categories:
            - full_horizon_d14: Available for full 14-day forecast
              (horizon >= 336 h, infinite, or 0 = forward-filled)
            - partial_d1: Available for only part of the horizon
              (any positive horizon below 336 h; 24 h for load forecasts)
            - historical: Not available for forecasting (negative horizon)
            - uncategorized: Features that don't match any pattern
        """
        buckets: Dict[str, List[str]] = {
            'full_horizon_d14': [],
            'partial_d1': [],
            'historical': [],
            'uncategorized': [],
        }

        for col in columns:
            # Skip metadata columns
            if col == 'timestamp' or col.startswith('target_border_'):
                continue

            config = cls._find_config(col)
            if config is None:
                buckets['uncategorized'].append(col)
                continue

            horizon = config['horizon_hours']
            if horizon < 0:
                bucket = 'historical'
            elif horizon == 0 or horizon >= cls.FULL_HORIZON_HOURS:
                # horizon == 0 is the forward-filled case (LTA): the current
                # value is carried forward, so it is treated as full horizon.
                bucket = 'full_horizon_d14'
            else:
                # Any other positive horizon is only partially available.
                # Previously only exactly 24 was handled and every other
                # value made the column disappear from all buckets.
                bucket = 'partial_d1'
            buckets[bucket].append(col)

        return buckets

    @classmethod
    def _matches_category(cls, col: str, config: Dict) -> bool:
        """
        Check if column matches category patterns.

        A column matches when ANY of the following holds:
        - one of ``config['patterns']`` occurs anywhere in the name
          (substring test, which also covers an exact match),
        - the name starts with one of ``config['prefixes']``,
        - the name ends with one of ``config['suffixes']``.
        Missing keys are treated as empty.

        NOTE(review): the substring test is loose. Because 'temporal' is
        declared first and lists 'day'/'hour'/..., a column such as
        'price_day_ahead' resolves to 'temporal' (full horizon) instead of
        'prices' (historical). Verify against the real column list that no
        historical feature is captured this way.
        """
        if any(pattern in col for pattern in config.get('patterns', ())):
            return True
        # str.startswith / str.endswith accept a tuple of alternatives and
        # return False for an empty tuple.
        if col.startswith(tuple(config.get('prefixes', ()))):
            return True
        return col.endswith(tuple(config.get('suffixes', ())))

    @classmethod
    def create_availability_mask(
        cls,
        feature_name: str,
        forecast_horizon_hours: int = 336
    ) -> np.ndarray:
        """
        Create binary availability mask for a feature across forecast horizon.

        Args:
            feature_name: Name of the feature
            forecast_horizon_hours: Length of forecast (default 336 = 14 days)

        Returns:
            float32 array of length ``forecast_horizon_hours``:
            1 = available, 0 = masked/unavailable.
            - unknown feature or negative horizon: all zeros (conservative)
            - horizon 0 (forward-filled) or horizon covering the whole
              forecast (including infinite): all ones
            - otherwise: ones for the first ``horizon`` hours, zeros after
        """
        mask = np.zeros(forecast_horizon_hours, dtype=np.float32)

        config = cls._find_config(feature_name)
        if config is None:
            # Unknown feature: assume historical (conservative)
            return mask

        horizon = config['horizon_hours']
        if horizon < 0:
            # Historical only
            return mask

        if horizon == 0 or horizon >= forecast_horizon_hours:
            # Forward-fill (LTA: D+0), full horizon, or infinite (temporal)
            mask[:] = 1.0
        else:
            # Partial horizon (e.g., D+1 = 24 hours)
            mask[:int(horizon)] = 1.0
        return mask

    @classmethod
    def validate_categorization(
        cls,
        categories: Dict[str, List[str]],
        verbose: bool = True,
        expected_total: int = 2514,
        expected_partial_d1: int = 12
    ) -> Tuple[bool, List[str]]:
        """
        Validate feature categorization against expected counts.

        Args:
            categories: Output from categorize_features()
            verbose: Print validation details
            expected_total: Expected number of features summed over all
                buckets (default 2514 = 2,553 columns - 1 timestamp - 38 targets)
            expected_partial_d1: Expected size of the 'partial_d1' bucket
                (default 12 load forecasts)

        Returns:
            (is_valid, warnings) - ``is_valid`` is True only when the list of
            warning messages is empty.
        """
        # Accepted range for the full-horizon bucket.
        # Expected: temporal (12) + weather (~375) + outages (176) + LTA (40) = ~603
        full_min, full_max = 200, 700

        full_d14 = len(categories.get('full_horizon_d14', []))
        partial_d1 = len(categories.get('partial_d1', []))
        historical = len(categories.get('historical', []))
        uncategorized = categories.get('uncategorized', [])

        issues: List[str] = []

        # Total feature count (excl. timestamp + 38 targets)
        total_features = sum(len(v) for v in categories.values())
        if total_features != expected_total:
            issues.append(
                f"Feature count mismatch: {total_features} vs expected {expected_total}"
            )

        # Check full-horizon D+14 features; the message reports the bounds
        # that are actually enforced.
        if full_d14 < full_min or full_d14 > full_max:
            issues.append(
                f"Full-horizon D+14 count unusual: {full_d14} "
                f"(expected {full_min}-{full_max})"
            )

        # Check partial D+1 features
        if partial_d1 != expected_partial_d1:
            issues.append(
                f"Partial D+1 count: {partial_d1} "
                f"(expected {expected_partial_d1} load forecasts)"
            )

        # Check uncategorized
        if uncategorized:
            issues.append(
                f"Uncategorized features: {len(uncategorized)} "
                f"(first 5: {uncategorized[:5]})"
            )

        if verbose:
            print("="*60)
            print("FEATURE CATEGORIZATION VALIDATION")
            print("="*60)
            print(f"Full-horizon D+14: {full_d14:4d} features")
            print(f"Partial D+1: {partial_d1:4d} features")
            print(f"Historical only: {historical:4d} features")
            print(f"Uncategorized: {len(uncategorized):4d} features")
            print(f"Total: {total_features:4d} features")
            if issues:
                print("\n[!] WARNINGS:")
                for w in issues:
                    print(f" - {w}")
            else:
                print("\n[OK] Validation passed!")
            print("="*60)

        return not issues, issues

    @classmethod
    def get_category_summary(cls, categories: Dict[str, List[str]]) -> pd.DataFrame:
        """
        Generate summary table of feature categorization.

        Args:
            categories: Output from categorize_features()

        Returns:
            DataFrame with one row per bucket and the columns Category,
            Count, Availability, Masking and Sample Features (first three
            names of the bucket). The 'Uncategorized' row is only present
            when that bucket is non-empty.
        """
        # (bucket key, label, availability text, masking text)
        layout = [
            ('full_horizon_d14', 'Full-horizon D+14',
             'D+1 to D+14 (336 hours)', 'None'),
            ('partial_d1', 'Partial D+1',
             'D+1 only (24 hours)', 'Mask D+2 to D+14'),
            ('historical', 'Historical only',
             'Not available for forecasting', 'All zeros'),
            ('uncategorized', 'Uncategorized',
             'Unknown (conservative: historical)', 'All zeros (conservative)'),
        ]

        summary = []
        for key, label, availability, masking in layout:
            features = categories.get(key, [])
            # The uncategorized row is omitted when there is nothing to report.
            if key == 'uncategorized' and not features:
                continue
            summary.append({
                'Category': label,
                'Count': len(features),
                'Availability': availability,
                'Masking': masking,
                'Sample Features': ', '.join(features[:3]),
            })
        return pd.DataFrame(summary)
|