"""Inspect the structure of the JAO sample data.

Quick visual inspection of the MaxBEX and CNECs/PTDFs sample parquet files.
The report is written to a UTF-8 text file instead of the console to avoid
terminal encoding issues; only a short confirmation is printed to stdout.
"""

import contextlib
import sys  # kept: other parts of the file (outside this view) may use it
from pathlib import Path

import polars as pl

# Report destination and sample inputs.
output_file = Path('data/raw/sample/data_inspection.txt')
maxbex_path = Path('data/raw/sample/maxbex_sample_sept2025.parquet')
cnecs_path = Path('data/raw/sample/cnecs_sample_sept2025.parquet')

_RULE = "=" * 80


def _banner(title, *, leading_blank=True):
    """Print *title* framed by two 80-character rules.

    Args:
        title: Text shown between the rules.
        leading_blank: When true, emit an empty line before the first rule.
    """
    print(("\n" if leading_blank else "") + _RULE)
    print(title)
    print(_RULE)


def _total_nulls(df):
    """Return the number of null cells summed over every column of *df*."""
    return sum(df[col].null_count() for col in df.columns)


# Redirect stdout for the duration of the report only.  The context managers
# guarantee the file is flushed/closed and the *previous* stdout is restored
# even if loading or inspecting the data raises.
with open(output_file, 'w', encoding='utf-8') as report:
    with contextlib.redirect_stdout(report):
        _banner("JAO SAMPLE DATA INSPECTION", leading_blank=False)

        # ====================================================================
        # 1. MaxBEX DATA (TARGET VARIABLE)
        # ====================================================================
        _banner("1. MaxBEX DATA (TARGET VARIABLE)")

        maxbex_df = pl.read_parquet(maxbex_path)

        print(f"\nShape: {maxbex_df.shape[0]} rows x {maxbex_df.shape[1]} columns")
        print("\nColumn names (first 20 border directions):")
        print(maxbex_df.columns[:20])

        print("\nDataFrame Schema:")
        print(maxbex_df.schema)

        print("\nFirst 5 rows:")
        print(maxbex_df.head(5))

        print("\nBasic Statistics (first 10 borders):")
        print(maxbex_df.select(maxbex_df.columns[:10]).describe())

        total_nulls = _total_nulls(maxbex_df)
        print(f"\nNull Values: {total_nulls} total across all columns")

        # ====================================================================
        # 2. CNECs/PTDFs DATA
        # ====================================================================
        _banner("2. CNECs/PTDFs DATA (Active Constraints)")

        cnecs_df = pl.read_parquet(cnecs_path)

        print(f"\nShape: {cnecs_df.shape[0]} rows x {cnecs_df.shape[1]} columns")
        print("\nColumn names:")
        print(cnecs_df.columns)

        print("\nDataFrame Schema:")
        print(cnecs_df.schema)

        print("\nFirst 5 rows:")
        print(cnecs_df.head(5))

        print("\nBasic Statistics (numeric columns):")
        # Accept every numeric dtype (Float32, Int32, UInt*, ...), not just
        # Float64/Int64, so no numeric column is silently left out.
        numeric_cols = [
            name for name, dtype in cnecs_df.schema.items() if dtype.is_numeric()
        ]
        if numeric_cols:
            print(cnecs_df.select(numeric_cols).describe())
        else:
            # describe() cannot summarise a frame with no columns.
            print("(no numeric columns)")

        total_nulls_cnecs = _total_nulls(cnecs_df)
        print(f"\nNull Values: {total_nulls_cnecs} total across all columns")

        # ====================================================================
        # 3. KEY INSIGHTS
        # ====================================================================
        _banner("3. KEY INSIGHTS")

        # NOTE(review): the "Index is datetime" and "All float64" lines below
        # are fixed text, not derived from the loaded schema - confirm against
        # the schema printed in section 1.
        print("\nMaxBEX Data:")
        print(" - Time series format: Index is datetime")
        print(f" - Border directions: {maxbex_df.shape[1]} total")
        print(" - Wide format: Each column = one border direction")
        print(" - Data type: All float64 (MW capacity values)")

        # Computed once and reused for both the count and the listing.
        ptdf_cols = [c for c in cnecs_df.columns if c.startswith('ptdf_')]

        print("\nCNECs/PTDFs Data:")
        print(f" - Unique CNECs: {cnecs_df['cnec_name'].n_unique()}")
        print(f" - Unique TSOs: {cnecs_df['tso'].n_unique()}")
        print(f" - PTDF columns: {len(ptdf_cols)}")
        print(f" - Has shadow prices: {'shadow_price' in cnecs_df.columns}")
        print(f" - Has RAM values: {'ram' in cnecs_df.columns}")

        # maintain_order=True keeps first-appearance order so the sample is
        # reproducible between runs (plain unique() has no guaranteed order).
        print("\nSample CNEC names (first 10):")
        sample_names = cnecs_df['cnec_name'].unique(maintain_order=True).head(10)
        for i, name in enumerate(sample_names, start=1):
            print(f" {i}. {name}")

        print(f"\nPTDF columns ({len(ptdf_cols)} zones):")
        print(f" {ptdf_cols}")

        _banner("INSPECTION COMPLETE")

# stdout is back to whatever it was before the redirect.
print(f"[OK] Data inspection saved to: {output_file}")
print(f" View with: cat {output_file}")