"""Upload extended dataset to HuggingFace Datasets. Uploads features_unified_extended.parquet (17,880 rows) to replace existing 24-month dataset (17,544 rows) on HuggingFace. Dataset: evgueni-p/fbmc-features-24month New date range: Oct 1, 2023 - Oct 14, 2025 Author: Claude Date: 2025-11-14 """ from pathlib import Path import os from datasets import Dataset import polars as pl from huggingface_hub import login import sys # Load environment variables from .env file from dotenv import load_dotenv load_dotenv() def upload_extended_dataset(): """Upload extended dataset to HuggingFace.""" print("\n" + "=" * 80) print("UPLOADING EXTENDED DATASET TO HUGGINGFACE") print("=" * 80) # Load HF token hf_token = os.getenv("HF_TOKEN") if not hf_token: raise ValueError("HF_TOKEN environment variable not set - check .env file") # Login to HuggingFace print("\nAuthenticating with HuggingFace...") login(token=hf_token) print(" [OK] Logged in") # Load unified dataset with volatility features unified_file = Path("data/processed/features_unified_24month.parquet") if not unified_file.exists(): raise FileNotFoundError(f"Unified dataset not found: {unified_file}") print(f"\nLoading unified dataset with volatility features...") df = pl.read_parquet(unified_file) print(f" Shape: {df.shape}") print(f" Date range: {df['timestamp'].min()} to {df['timestamp'].max()}") print(f" File size: {unified_file.stat().st_size / 1024 / 1024:.1f} MB") # Convert to HuggingFace Dataset print("\nConverting to HuggingFace Dataset format...") hf_dataset = Dataset.from_polars(df) print(f" [OK] Converted: {hf_dataset}") # Upload to HuggingFace dataset_name = "evgueni-p/fbmc-features-24month" print(f"\nUploading to HuggingFace: {dataset_name}") print(" This may take a few minutes...") hf_dataset.push_to_hub( dataset_name, token=hf_token, private=False # Make public ) print(f"\n[OK] Dataset uploaded successfully!") print(f" URL: https://huggingface.co/datasets/{dataset_name}") print(f" Rows: {len(hf_dataset)}") print(f" Columns: {len(hf_dataset.column_names)}") return dataset_name def verify_upload(dataset_name: str): """Verify uploaded dataset by downloading and checking shape.""" print("\n" + "=" * 80) print("VERIFYING UPLOAD") print("=" * 80) from datasets import load_dataset hf_token = os.getenv("HF_TOKEN") print(f"\nDownloading dataset from HuggingFace...") print(f" Dataset: {dataset_name}") downloaded = load_dataset( dataset_name, split="train", token=hf_token ) print(f"\n[OK] Downloaded successfully!") print(f" Shape: {downloaded.shape}") # Convert to Polars for inspection df_check = pl.from_arrow(downloaded.data.table) print(f" Date range: {df_check['timestamp'].min()} to {df_check['timestamp'].max()}") # Validate expected_rows = 17880 expected_cols = 2553 issues = [] if downloaded.shape[0] != expected_rows: issues.append(f"Row mismatch: {downloaded.shape[0]} != {expected_rows}") if downloaded.shape[1] != expected_cols: issues.append(f"Column mismatch: {downloaded.shape[1]} != {expected_cols}") if issues: print("\n[WARNING] Validation issues:") for issue in issues: print(f" - {issue}") return False else: print("\n[OK] Upload verified successfully!") return True def main(): """Main execution: Upload and verify extended dataset.""" print("\n" + "=" * 80) print("HUGGINGFACE DATASET UPLOAD") print("Uploading extended dataset (17,880 rows)") print("=" * 80) try: # Upload dataset dataset_name = upload_extended_dataset() # Verify upload verification_passed = verify_upload(dataset_name) if verification_passed: print("\n" + "=" * 80) print("SUCCESS: Dataset uploaded and verified!") print("=" * 80) print(f"\nDataset URL: https://huggingface.co/datasets/{dataset_name}") print("\nNext steps:") print(" 1. Create inference notebooks (.ipynb)") print(" 2. Create HF Space README.md") print(" 3. Deploy notebooks to HF Space") print(" 4. Test inference on GPU") else: print("\n[ERROR] Verification failed") sys.exit(1) except Exception as e: error_msg = str(e).encode('ascii', 'replace').decode('ascii') print(f"\n[ERROR] Upload failed: {error_msg}") import traceback traceback.print_exc() sys.exit(1) if __name__ == "__main__": main()