"""Final validation of complete 24-month LTA + Net Positions datasets.""" import polars as pl from pathlib import Path print("\n" + "=" * 80) print("FINAL DATA COLLECTION VALIDATION") print("=" * 80) # ========================================================================= # LTA Dataset # ========================================================================= lta_path = Path('data/raw/phase1_24month/jao_lta.parquet') lta = pl.read_parquet(lta_path) print("\n[1/2] LTA (Long Term Allocations)") print("-" * 80) print(f" Records: {len(lta):,}") print(f" Columns: {len(lta.columns)} (1 timestamp + {len(lta.columns)-3} borders + 2 masking flags)") print(f" File size: {lta_path.stat().st_size / (1024**2):.2f} MB") print(f" Date range: {lta['mtu'].min()} to {lta['mtu'].max()}") print(f" Unique timestamps: {lta['mtu'].n_unique():,}") # Check October 2023 oct_2023 = lta.filter((pl.col('mtu').dt.year() == 2023) & (pl.col('mtu').dt.month() == 10)) days_2023 = sorted(oct_2023['mtu'].dt.day().unique().to_list()) masked_2023 = oct_2023.filter(pl.col('is_masked') == True) print(f"\n October 2023:") print(f" Days present: {days_2023}") print(f" Total records: {len(oct_2023)}") print(f" Masked records: {len(masked_2023)} ({len(masked_2023)/len(lta)*100:.3f}%)") # Check October 2024 oct_2024 = lta.filter((pl.col('mtu').dt.year() == 2024) & (pl.col('mtu').dt.month() == 10)) days_2024 = sorted(oct_2024['mtu'].dt.day().unique().to_list()) print(f"\n October 2024:") print(f" Days present: {days_2024}") print(f" Total records: {len(oct_2024)}") # ========================================================================= # Net Positions Dataset # ========================================================================= np_path = Path('data/raw/phase1_24month/jao_net_positions.parquet') np_df = pl.read_parquet(np_path) print("\n[2/2] Net Positions (Domain Boundaries)") print("-" * 80) print(f" Records: {len(np_df):,}") print(f" Columns: {len(np_df.columns)} (1 timestamp + 28 zones + 1 collection_date)") print(f" File size: {np_path.stat().st_size / (1024**2):.2f} MB") print(f" Date range: {np_df['mtu'].min()} to {np_df['mtu'].max()}") print(f" Unique dates: {np_df['mtu'].dt.date().n_unique()}") # Expected: Oct 1, 2023 to Sep 30, 2025 = 731 days expected_days = 731 print(f" Expected days: {expected_days}") print(f" Coverage: {np_df['mtu'].dt.date().n_unique() / expected_days * 100:.1f}%") # ========================================================================= # Summary # ========================================================================= print("\n" + "=" * 80) print("COLLECTION STATUS") print("=" * 80) lta_complete = (days_2023 == list(range(1, 32))) and (days_2024 == list(range(1, 32))) np_complete = (np_df['mtu'].dt.date().n_unique() >= expected_days - 1) # Allow 1 day variance if lta_complete and np_complete: print("[SUCCESS] Data collection complete!") print(f" ✓ LTA: {len(lta):,} records with {len(masked_2023)} masked (Oct 27-31, 2023)") print(f" ✓ Net Positions: {len(np_df):,} records covering {np_df['mtu'].dt.date().n_unique()} days") else: print("[WARNING] Data collection incomplete:") if not lta_complete: print(f" - LTA October coverage issue") if not np_complete: print(f" - Net Positions has {np_df['mtu'].dt.date().n_unique()}/{expected_days} expected days") print("=" * 80) print()