"""Download all FBMC data from HuggingFace Datasets. This script downloads all required datasets from HuggingFace Datasets to local storage. Used for setting up new environments (HF Space, analyst handover, etc.) """ from pathlib import Path from hf_datasets_manager import FBMCDatasetManager import sys def setup_data(data_dir: Path = Path("data/raw"), force_redownload: bool = False): """Download all datasets if not present locally. Args: data_dir: Directory to store downloaded data (default: data/raw) force_redownload: Re-download even if files exist (default: False) """ print("=" * 60) print("FBMC Data Setup - Download from HuggingFace Datasets") print("=" * 60) manager = FBMCDatasetManager() # Expected datasets (will be created during Day 1) datasets_to_download = { "fbmc-cnecs-2024-2025": "cnecs_2024_2025.parquet", "fbmc-weather-2024-2025": "weather_2024_2025.parquet", "fbmc-entsoe-2024-2025": "entsoe_2024_2025.parquet", } data_dir.mkdir(parents=True, exist_ok=True) success_count = 0 skip_count = 0 fail_count = 0 for dataset_name, filename in datasets_to_download.items(): output_path = data_dir / filename print(f"\n[{filename}]") if output_path.exists() and not force_redownload: file_size_mb = output_path.stat().st_size / (1024 * 1024) print(f"✅ Already exists ({file_size_mb:.1f} MB), skipping") skip_count += 1 continue try: df = manager.download_dataset(dataset_name, output_path) if df is not None: success_count += 1 else: fail_count += 1 except Exception as e: print(f"❌ Failed to download {dataset_name}: {e}") print(f" You may need to run Day 1 data collection first") fail_count += 1 print("\n" + "=" * 60) print("Download Summary:") print(f" ✅ Downloaded: {success_count}") print(f" ⏭️ Skipped: {skip_count}") print(f" ❌ Failed: {fail_count}") print("=" * 60) if fail_count > 0: print("\n⚠️ Some datasets failed to download.") print(" Run Day 1 data collection to create these datasets.") return False else: print("\n✅ Data setup complete!") return True if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Download FBMC datasets from HuggingFace") parser.add_argument( "--data-dir", type=Path, default=Path("data/raw"), help="Directory to store data (default: data/raw)" ) parser.add_argument( "--force", action="store_true", help="Force re-download even if files exist" ) args = parser.parse_args() success = setup_data(data_dir=args.data_dir, force_redownload=args.force) sys.exit(0 if success else 1)