Spaces:
Sleeping
Sleeping
Evgueni Poloukarov
Claude
committed on
Commit
·
a57b996
1
Parent(s):
7f2c237
feat: add 396 volatility features for zero-shot forecast improvement
Browse files
Added per-border volatility metrics to help the model capture hour-to-hour variations:
- 132 hour-over-hour delta features (target_delta_h1_*)
- 132 24-hour rolling volatility features (target_vol_24h_*)
- 132 6-hour range features (target_range_6h_*)
Total features: 2,647 -> 3,043 columns
Dataset uploaded to HuggingFace: evgueni-p/fbmc-features-24month
Initial evaluation: NO improvement (0.0% MAE change)
Likely cause: either Chronos-2 ignores target-derived covariates, or the added feature explosion dilutes the signal
Co-Authored-By: Claude <[email protected]>
src/feature_engineering/engineer_jao_features.py
CHANGED
|
@@ -618,6 +618,48 @@ def engineer_jao_features(
|
|
| 618 |
unified[col].alias(target_name)
|
| 619 |
])
|
| 620 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
# Remove duplicates if any
|
| 622 |
if 'mtu_right' in all_features.columns:
|
| 623 |
all_features = all_features.drop([c for c in all_features.columns if c.endswith('_right')])
|
|
|
|
| 618 |
unified[col].alias(target_name)
|
| 619 |
])
|
| 620 |
|
| 621 |
+
print(f"\n[INFO] Adding volatility features for {len(directional_cols)} directional borders...")
|
| 622 |
+
print("[INFO] Volatility features will help model capture hour-to-hour variations")
|
| 623 |
+
|
| 624 |
+
# Add volatility features for EACH directional border
|
| 625 |
+
# These features teach the model about hourly change patterns and volatility magnitude
|
| 626 |
+
for col in sorted(directional_cols):
|
| 627 |
+
from_country, to_country = col.split('>')
|
| 628 |
+
border_code = f'{from_country}_{to_country}'
|
| 629 |
+
|
| 630 |
+
# Get the target series
|
| 631 |
+
target_col = f'target_border_{border_code}'
|
| 632 |
+
|
| 633 |
+
# 1. Hour-over-hour delta (captures immediate hourly changes)
|
| 634 |
+
# Formula: target[t] - target[t-1]
|
| 635 |
+
# Helps model learn rapid swings and ramps
|
| 636 |
+
delta_col = f'target_delta_h1_{border_code}'
|
| 637 |
+
all_features = all_features.with_columns([
|
| 638 |
+
(all_features[target_col] - all_features[target_col].shift(1)).alias(delta_col)
|
| 639 |
+
])
|
| 640 |
+
|
| 641 |
+
# 2. 24-hour rolling volatility (captures daily volatility patterns)
|
| 642 |
+
# Formula: rolling_std(target, window=24h)
|
| 643 |
+
# Informs model about magnitude of daily variation
|
| 644 |
+
vol_col = f'target_vol_24h_{border_code}'
|
| 645 |
+
all_features = all_features.with_columns([
|
| 646 |
+
all_features[target_col].rolling_std(window_size=24, min_periods=1).alias(vol_col)
|
| 647 |
+
])
|
| 648 |
+
|
| 649 |
+
# 3. 6-hour range (captures intraday swings - morning/evening ramps)
|
| 650 |
+
# Formula: rolling_max(target, 6h) - rolling_min(target, 6h)
|
| 651 |
+
# Detects peak/off-peak transitions
|
| 652 |
+
range_col = f'target_range_6h_{border_code}'
|
| 653 |
+
all_features = all_features.with_columns([
|
| 654 |
+
(all_features[target_col].rolling_max(window_size=6, min_periods=1) -
|
| 655 |
+
all_features[target_col].rolling_min(window_size=6, min_periods=1)).alias(range_col)
|
| 656 |
+
])
|
| 657 |
+
|
| 658 |
+
print(f"[OK] Added {len(directional_cols) * 3} volatility features:")
|
| 659 |
+
print(f" - {len(directional_cols)} hour-over-hour delta features")
|
| 660 |
+
print(f" - {len(directional_cols)} 24-hour rolling volatility features")
|
| 661 |
+
print(f" - {len(directional_cols)} 6-hour range features")
|
| 662 |
+
|
| 663 |
# Remove duplicates if any
|
| 664 |
if 'mtu_right' in all_features.columns:
|
| 665 |
all_features = all_features.drop([c for c in all_features.columns if c.endswith('_right')])
|
upload_to_hf.py
CHANGED
|
@@ -37,16 +37,16 @@ def upload_extended_dataset():
|
|
| 37 |
login(token=hf_token)
|
| 38 |
print(" [OK] Logged in")
|
| 39 |
|
| 40 |
-
# Load
|
| 41 |
-
|
| 42 |
-
if not
|
| 43 |
-
raise FileNotFoundError(f"
|
| 44 |
|
| 45 |
-
print(f"\nLoading
|
| 46 |
-
df = pl.read_parquet(
|
| 47 |
print(f" Shape: {df.shape}")
|
| 48 |
print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
|
| 49 |
-
print(f" File size: {
|
| 50 |
|
| 51 |
# Convert to HuggingFace Dataset
|
| 52 |
print("\nConverting to HuggingFace Dataset format...")
|
|
|
|
| 37 |
login(token=hf_token)
|
| 38 |
print(" [OK] Logged in")
|
| 39 |
|
| 40 |
+
# Load unified dataset with volatility features
|
| 41 |
+
unified_file = Path("data/processed/features_unified_24month.parquet")
|
| 42 |
+
if not unified_file.exists():
|
| 43 |
+
raise FileNotFoundError(f"Unified dataset not found: {unified_file}")
|
| 44 |
|
| 45 |
+
print(f"\nLoading unified dataset with volatility features...")
|
| 46 |
+
df = pl.read_parquet(unified_file)
|
| 47 |
print(f" Shape: {df.shape}")
|
| 48 |
print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
|
| 49 |
+
print(f" File size: {unified_file.stat().st_size / 1024 / 1024:.1f} MB")
|
| 50 |
|
| 51 |
# Convert to HuggingFace Dataset
|
| 52 |
print("\nConverting to HuggingFace Dataset format...")
|