""" David Training Pipeline ======================== Author: AbstractPhil Assistant: Claude Sonnet 4.5 ------------------------------------------------------------= Training pipeline for David multi-scale feature classifier. Will be placed officially at: geovocab2/train/model/core/david_trainer.py Or run from: scripts/train_david.py Runs on colab without hassle, set your repo and your HF_TOKEN as a userdata secret in colab. Features: - Pure fp32 training (no mixed precision for geometric stability) - Can enable mixed if you want speed. - Adaptive training controller (freeze/unfreeze scales) - Gradient analysis and scaling - SafeTensors checkpointing and epoch control support - Enhanced loss component tracking - Proper weight organization: weights/model_name/timestamp/ - Accuracy in filenames and comprehensive tracking - Saves models into a shared index (MODELS_INDEX.json) in the repo. - Parses a readme if one exists, creates a repo if one doesn't. """ import torch import torch.nn.functional as F from torch.utils.data import Dataset, DataLoader from torch.utils.tensorboard import SummaryWriter from datasets import load_dataset from huggingface_hub import HfApi, create_repo, upload_folder, upload_file import numpy as np import os import json import time import tempfile from datetime import datetime from tqdm.auto import tqdm from pathlib import Path from typing import Dict, List, Optional, Tuple, Union from dataclasses import dataclass, field, asdict # Import David components from geovocab2.train.config.david_config import ( DavidArchitectureConfig, DavidPresets, SharingMode, FusionMode ) from geovocab2.train.model.core.david import ( David, MultiScaleCrystalLoss, ) # Import SimplexFactory from geovocab2.shapes.factory import SimplexFactory # ============================================================================ # TRAINING CONFIGURATION # ============================================================================ @dataclass class DavidTrainingConfig: """ Complete training configuration for David. Separate from model architecture config. 
""" # Metadata name: str = "david_training" run_id: str = "" # Auto-generated timestamp # Dataset dataset_name: str = "AbstractPhil/imagenet-clip-features-orderly" model_variant: str = "clip_vit_b16" num_classes: int = 1000 # Model architecture (references to david_config) preset: Optional[str] = "balanced" # Or None to use custom config custom_config_path: Optional[str] = None # Path to custom david_config.json # Architecture overrides (applied to preset or custom config) num_classes_override: Optional[int] = None use_belly_override: Optional[bool] = None belly_expand_override: Optional[float] = None progressive_training_override: Optional[bool] = None # Override progressive training scale_warmup_epochs_override: Optional[Dict[int, int]] = None # Custom warmup schedule # Training hyperparameters num_epochs: int = 50 batch_size: int = 512 learning_rate: float = 5e-3 weight_decay: float = 1e-5 warmup_epochs: int = 3 # Loss weights use_rose_loss: bool = True rose_initial_weight: float = 0.01 rose_max_weight: float = 0.1 rose_weight_schedule: str = "adaptive" use_cayley_loss: bool = False cayley_weight: float = 0.001 scale_loss_balance: Optional[Dict[int, float]] = None # Optimization use_mixed_precision: bool = False # Keep False for stability gradient_clip: float = 5.0 scheduler_type: str = "cosine_restarts" min_lr: float = 1e-6 # Adaptive training (safer defaults) freeze_strategy: str = "never" # "performance" or "never" freeze_threshold: float = 90.0 # Only freeze when scale hits 90% accuracy unfreeze_on_plateau: bool = True patience: int = 10 # Gradient monitoring track_gradients: bool = True gradient_scale_threshold: float = 1e-5 gradient_scale_multiplier: float = 10.0 # Logging log_interval: int = 50 val_interval: int = 1 save_interval: int = 5 log_fusion_weights: bool = True log_loss_components: bool = True # Checkpointing save_format: str = "both" # "pytorch", "safetensors", or "both" # HuggingFace Hub (optional) hf_repo: Optional[str] = "YourName/Repo" #"AbstractPhil/gated-david" # Your HF repo upload_to_hub: bool = False # Local paths base_dir: str = "./david_training" # Hardware num_workers: int = 10 pin_memory: bool = True prefetch_factor: int = 4 persistent_workers: bool = True def __post_init__(self): """Generate run_id if not provided.""" if not self.run_id: self.run_id = datetime.now().strftime('%Y%m%d_%H%M%S') def to_dict(self) -> dict: """Convert to dictionary.""" return asdict(self) @classmethod def from_dict(cls, data: dict) -> 'DavidTrainingConfig': """Create from dictionary.""" return cls(**data) def to_json(self, path: str): """Save to JSON.""" data = self.to_dict() # Convert any nested dicts with int keys to str keys if data.get('scale_loss_balance'): data['scale_loss_balance'] = { str(k): v for k, v in data['scale_loss_balance'].items() } if data.get('scale_warmup_epochs_override'): data['scale_warmup_epochs_override'] = { str(k): v for k, v in data['scale_warmup_epochs_override'].items() } with open(path, 'w') as f: json.dump(data, f, indent=2) @classmethod def from_json(cls, path: str) -> 'DavidTrainingConfig': """Load from JSON.""" with open(path, 'r') as f: data = json.load(f) # Convert str keys back to int for scale_loss_balance if 'scale_loss_balance' in data and data['scale_loss_balance']: data['scale_loss_balance'] = { int(k): v for k, v in data['scale_loss_balance'].items() } # Convert str keys back to int for scale_warmup_epochs_override if 'scale_warmup_epochs_override' in data and data['scale_warmup_epochs_override']: data['scale_warmup_epochs_override'] 
= { int(k): v for k, v in data['scale_warmup_epochs_override'].items() } return cls(**data) # ============================================================================ # ADAPTIVE TRAINING CONTROLLER # ============================================================================ class AdaptiveTrainingController: """Manages adaptive training strategies for multi-scale model.""" def __init__(self, model: David, config: DavidTrainingConfig): self.model = model self.config = config scales = model.scales self.scale_history = {scale: [] for scale in scales} self.best_scale_acc = {scale: 0.0 for scale in scales} self.scales_frozen = {scale: False for scale in scales} self.overall_history = [] self.plateau_counter = 0 self.best_overall = 0.0 def update_metrics(self, scale_accuracies: Dict[int, float], overall_accuracy: float): """Update metrics and best scores.""" for scale, acc in scale_accuracies.items(): self.scale_history[scale].append(acc) if acc > self.best_scale_acc[scale]: self.best_scale_acc[scale] = acc self.overall_history.append(overall_accuracy) if overall_accuracy > self.best_overall: self.best_overall = overall_accuracy self.plateau_counter = 0 else: self.plateau_counter += 1 def should_freeze_scale(self, scale: int, current_acc: float) -> bool: """Determine if a scale should be frozen.""" if self.config.freeze_strategy == "never": return False if self.scales_frozen[scale]: return False if self.config.freeze_strategy == "performance": return current_acc >= self.config.freeze_threshold return False def should_unfreeze_scales(self) -> bool: """Check if scales should be unfrozen due to plateau.""" if not self.config.unfreeze_on_plateau: return False return self.plateau_counter >= 5 def apply_adaptive_strategies(self, scale_accuracies: Dict[int, float], epoch: int): """Apply freeze/unfreeze based on performance.""" active_scales = self.model.get_active_scales() # Don't freeze scales if it would leave no trainable parameters for scale, acc in scale_accuracies.items(): if self.should_freeze_scale(scale, acc): # Count how many active scales would remain unfrozen active_unfrozen = [s for s in active_scales if not self.scales_frozen.get(s, False)] if len(active_unfrozen) <= 1: print(f"[āš ļø] Skipping freeze of scale {scale} (would leave no active trainable scales)") continue self.model.freeze_scale(scale) self.scales_frozen[scale] = True print(f"[ā„ļø] Froze scale {scale} (acc={acc:.2f}%)") if self.should_unfreeze_scales() and any(self.scales_frozen.values()): for scale in self.model.scales: if self.scales_frozen[scale]: self.model.unfreeze_scale(scale) self.scales_frozen[scale] = False self.plateau_counter = 0 print(f"[šŸ”„] Unfroze all scales due to plateau") # ============================================================================ # OPTIMIZER & SCHEDULER CREATION # ============================================================================ def create_optimizer(david: David, config: DavidTrainingConfig) -> torch.optim.Optimizer: """Create optimizer with parameter groups.""" param_groups = [] # Shared parameters (if exists) if hasattr(david, 'shared_extractor'): param_groups.append({ 'params': david.shared_extractor.parameters(), 'lr': config.learning_rate, 'name': 'shared' }) elif hasattr(david, 'shared_base'): param_groups.append({ 'params': david.shared_base.parameters(), 'lr': config.learning_rate, 'name': 'shared' }) # Scale-specific parameters for scale in david.scales: scale_params = [] if david.sharing_mode == SharingMode.HIERARCHICAL: head = getattr(david, 
f'head_{scale}', None) if head: scale_params.extend(head.parameters()) refine = getattr(david, f'refine_{scale}', None) if refine: scale_params.extend(refine.parameters()) else: scale_params.extend(david.heads[str(scale)].parameters()) if scale_params: param_groups.append({ 'params': scale_params, 'lr': config.learning_rate, 'name': f'scale_{scale}' }) # Fusion parameters if hasattr(david, 'fusion'): param_groups.append({ 'params': david.fusion.parameters(), 'lr': config.learning_rate * 0.5, 'name': 'fusion' }) elif hasattr(david, 'fusion_weights'): param_groups.append({ 'params': [david.fusion_weights], 'lr': config.learning_rate * 0.5, 'name': 'fusion' }) return torch.optim.AdamW(param_groups, weight_decay=config.weight_decay) def create_scheduler(optimizer: torch.optim.Optimizer, config: DavidTrainingConfig) -> torch.optim.lr_scheduler._LRScheduler: """Create learning rate scheduler.""" if config.scheduler_type == "cosine_restarts": return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts( optimizer, T_0=10, T_mult=2, eta_min=config.min_lr ) elif config.scheduler_type == "cosine": return torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=config.num_epochs, eta_min=config.min_lr ) else: return None # ============================================================================ # GRADIENT ANALYSIS # ============================================================================ def analyze_gradients(model: David, config: DavidTrainingConfig) -> Dict[str, float]: """Analyze gradient magnitudes for debugging.""" grad_stats = { 'mean': 0.0, 'max': 0.0, 'min': float('inf'), 'num_zero': 0, 'num_small': 0, 'total': 0 } for name, param in model.named_parameters(): if param.grad is not None: grad_norm = param.grad.norm().item() grad_stats['mean'] += grad_norm grad_stats['max'] = max(grad_stats['max'], grad_norm) grad_stats['min'] = min(grad_stats['min'], grad_norm) grad_stats['total'] += 1 if grad_norm < 1e-10: grad_stats['num_zero'] += 1 elif grad_norm < config.gradient_scale_threshold: grad_stats['num_small'] += 1 if grad_stats['total'] > 0: grad_stats['mean'] /= grad_stats['total'] return grad_stats def scale_small_gradients(model: David, config: DavidTrainingConfig): """Scale up very small gradients to prevent vanishing.""" if not config.track_gradients: return for param in model.parameters(): if param.grad is not None: grad_norm = param.grad.norm() if grad_norm < config.gradient_scale_threshold and grad_norm > 0: param.grad.mul_(config.gradient_scale_multiplier) # ============================================================================ # HUGGINGFACE HUB UTILITIES # ============================================================================ def generate_model_readme( config: DavidTrainingConfig, david_config: DavidArchitectureConfig, best_metrics: Dict, run_id: str ) -> str: """Generate README.md for model card.""" readme = f"""--- language: en license: mit tags: - image-classification - imagenet - multi-scale - feature-geometry - david datasets: - imagenet-1k metrics: - accuracy model-index: - name: David-{david_config.sharing_mode}-{david_config.fusion_mode} results: - task: type: image-classification dataset: name: ImageNet-1K type: imagenet-1k metrics: - type: accuracy value: {best_metrics.get('best_val_acc', 0.0):.2f} --- # David: Multi-Scale Feature Classifier **David** is a multi-scale deep learning classifier that uses feature geometry (pentachora/4-simplexes) as class prototypes with role-weighted similarity computation (Rose Loss). 
## Model Details ### Architecture - **Preset**: {config.preset} - **Sharing Mode**: {david_config.sharing_mode} - **Fusion Mode**: {david_config.fusion_mode} - **Scales**: {david_config.scales} - **Feature Dim**: {david_config.feature_dim} - **Parameters**: {best_metrics.get('parameters', 0):,} ### Training Configuration - **Dataset**: {config.dataset_name} - **Model Variant**: {config.model_variant} - **Epochs**: {config.num_epochs} - **Batch Size**: {config.batch_size} - **Learning Rate**: {config.learning_rate} - **Rose Loss Weight**: {config.rose_initial_weight} → {config.rose_max_weight} - **Cayley Loss**: {config.use_cayley_loss} ## Performance ### Best Results - **Validation Accuracy**: {best_metrics.get('best_val_acc', 0.0):.2f}% - **Best Epoch**: {best_metrics.get('best_epoch', 0)} - **Final Train Accuracy**: {best_metrics.get('final_train_acc', 0.0):.2f}% ### Per-Scale Performance """ if 'scale_accuracies' in best_metrics: for scale, acc in best_metrics['scale_accuracies'].items(): readme += f"- **Scale {scale}**: {acc:.2f}%\n" readme += f""" ## Usage ### Quick Model Lookup **Check `MODELS_INDEX.json` in the repo root** - it lists all trained models sorted by accuracy with links to weights and configs. ### Repository Structure ``` {config.hf_repo if config.hf_repo else 'AbstractPhil/david'}/ ā”œā”€ā”€ MODELS_INDEX.json # šŸ“Š Master index of all models (sorted by accuracy) ā”œā”€ā”€ README.md # This file ā”œā”€ā”€ best_model.json # Latest best model info ā”œā”€ā”€ weights/ │ └── {david_config.name}/ │ └── {run_id}/ │ ā”œā”€ā”€ MODEL_SUMMARY.txt # šŸŽÆ Human-readable performance summary │ ā”œā”€ā”€ training_history.json # šŸ“ˆ Epoch-by-epoch training curve │ ā”œā”€ā”€ best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}.safetensors # ⭐ Accuracy in filename! │ ā”œā”€ā”€ best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}_metadata.json │ ā”œā”€ā”€ final_model.safetensors │ ā”œā”€ā”€ checkpoint_epoch_X_accYY.YY.safetensors │ ā”œā”€ā”€ david_config.json │ └── train_config.json └── runs/ └── {david_config.name}/ └── {run_id}/ └── events.out.tfevents.* # TensorBoard logs ``` ### Loading the Model ```python from geovocab2.train.model.core.david import David, DavidArchitectureConfig from huggingface_hub import hf_hub_download # Browse available models in MODELS_INDEX.json first! # Specify model variant and run model_name = "{david_config.name}" run_id = "{run_id}" accuracy = "{best_metrics.get('best_val_acc', 0.0):.2f}" # From MODELS_INDEX.json # Download config config_path = hf_hub_download( repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}", filename=f"weights/{{model_name}}/{{run_id}}/david_config.json" ) config = DavidArchitectureConfig.from_json(config_path) # Download weights (accuracy in filename!) 
weights_path = hf_hub_download( repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}", filename=f"weights/{{model_name}}/{{run_id}}/best_model_acc{{accuracy}}.safetensors" ) # Download training history (optional - see full training curve) history_path = hf_hub_download( repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}", filename=f"weights/{{model_name}}/{{run_id}}/training_history.json" ) # Load model from safetensors.torch import load_file david = David.from_config(config) david.load_state_dict(load_file(weights_path)) david.eval() ``` ### Inference ```python import torch import torch.nn.functional as F # Assuming you have CLIP features (512-dim for ViT-B/16) features = get_clip_features(image) # [1, 512] # Load anchors anchors_dict = torch.load("anchors.pth") # Forward pass with torch.no_grad(): logits, _ = david(features, anchors_dict) predictions = logits.argmax(dim=-1) ``` ## Architecture Overview ### Multi-Scale Processing David processes inputs at multiple scales ({', '.join(map(str, david_config.scales))}), allowing it to capture both coarse and fine-grained features. ### Feature Geometry Each class is represented by a pentachoron (4-simplex) in embedding space with 5 vertices: - **Anchor**: Primary class representative - **Need**: Complementary direction - **Relation**: Contextual alignment - **Purpose**: Functional direction - **Observer**: Meta-perspective ### Rose Loss Similarity computation uses role-weighted cosine similarities: ``` score = w_anchor * sim(z, anchor) + w_need * sim(z, need) + ... ``` ### Fusion Strategy **{david_config.fusion_mode}**: Intelligently combines predictions from multiple scales. ## Training Details ### Loss Components - **Cross-Entropy**: Standard classification loss - **Rose Loss**: Pentachora role-weighted margin loss (weight: {config.rose_initial_weight}→{config.rose_max_weight}) - **Cayley Loss**: Geometric regularization ({'enabled' if config.use_cayley_loss else 'disabled'}) ### Optimization - **Optimizer**: AdamW - **Weight Decay**: {config.weight_decay} - **Scheduler**: {config.scheduler_type} - **Gradient Clip**: {config.gradient_clip} - **Mixed Precision**: {config.use_mixed_precision} ## Citation ```bibtex Modify specifically to include your name for citation if you modify the code. @software{{david_classifier_2025, title = {{David: Multi-Scale Feature Classifier}}, author = {{AbstractPhil}}, year = {{2025}}, url = {{https://huggingface.co/{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}}}, note = {{Run ID: {run_id}}} }} ``` ## License MIT License ## Acknowledgments Model origin author: AbstractPhil Built with lattice geometry and multi-scale deep learning. Special thanks to Claude Sonnet 4.5 (Anthropic) for debugging assistance. 
--- *Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}* """ return readme def save_best_model_json( filepath: str, metrics: Dict, config: DavidTrainingConfig, david_config: DavidArchitectureConfig ): """Save best_model.json with comprehensive metrics.""" model_name = f"David-{david_config.sharing_mode}-{david_config.fusion_mode}" best_model_info = { "model_name": model_name, "run_id": config.run_id, "timestamp": datetime.now().isoformat(), # Best metrics "best_val_acc": metrics.get('best_val_acc', 0.0), "best_epoch": metrics.get('best_epoch', 0), "final_train_acc": metrics.get('final_train_acc', 0.0), "final_train_loss": metrics.get('final_train_loss', 0.0), # Per-scale performance "scale_accuracies": metrics.get('scale_accuracies', {}), # Architecture "architecture": { "preset": config.preset, "sharing_mode": david_config.sharing_mode, "fusion_mode": david_config.fusion_mode, "scales": david_config.scales, "feature_dim": david_config.feature_dim, "num_classes": david_config.num_classes, "use_belly": david_config.use_belly, "belly_expand": david_config.belly_expand, }, # Training config "training": { "dataset": config.dataset_name, "model_variant": config.model_variant, "num_epochs": config.num_epochs, "batch_size": config.batch_size, "learning_rate": config.learning_rate, "rose_weight": f"{config.rose_initial_weight}→{config.rose_max_weight}", "cayley_loss": config.use_cayley_loss, "optimizer": "AdamW", "scheduler": config.scheduler_type, }, # Files (organized by model/run) "files": { "weights_safetensors": f"weights/{model_name}/{config.run_id}/best_model_acc{metrics.get('best_val_acc', 0.0):.2f}.safetensors", "weights_pytorch": f"weights/{model_name}/{config.run_id}/best_model.pth", "config": f"weights/{model_name}/{config.run_id}/david_config.json", "training_config": f"weights/{model_name}/{config.run_id}/train_config.json", "tensorboard": f"runs/{model_name}/{config.run_id}/" } } with open(filepath, 'w') as f: json.dump(best_model_info, f, indent=2) print(f"[šŸ“„] Saved best_model.json: {filepath}") def create_model_summary( weights_dir: str, config: DavidTrainingConfig, david_config: DavidArchitectureConfig, best_metrics: Dict, model_name: str ): """Create prominent model summary with accuracy front and center.""" summary_path = os.path.join(weights_dir, 'MODEL_SUMMARY.txt') best_acc = best_metrics.get('best_val_acc', 0.0) training_history = best_metrics.get('training_history', {}) summary = f""" ╔══════════════════════════════════════════════════════════════╗ ā•‘ DAVID MODEL SUMMARY ā•‘ ╠══════════════════════════════════════════════════════════════╣ ā•‘ ā•‘ ā•‘ šŸŽÆ VALIDATION ACCURACY: {best_acc:.2f}% ā•‘ ā•‘ ā•‘ ā•šā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā• MODEL: {model_name} RUN ID: {config.run_id} BEST EPOCH: {best_metrics.get('best_epoch', 0) + 1}/{config.num_epochs} ═══════════════════════════════════════════════════════════════ šŸ“Š PERFORMANCE BREAKDOWN Final Training Accuracy: {best_metrics.get('final_train_acc', 0.0):.2f}% Best Validation Accuracy: {best_acc:.2f}% Per-Scale Accuracies: """ scale_accs = best_metrics.get('scale_accuracies', {}) for scale in sorted(scale_accs.keys()): acc = scale_accs[scale] summary += f" • Scale {scale:4d}: {acc:.2f}%\n" summary += f""" ═══════════════════════════════════════════════════════════════ šŸ—ļø ARCHITECTURE Preset: {config.preset} Sharing Mode: {david_config.sharing_mode} 
Fusion Mode: {david_config.fusion_mode} Scales: {len(david_config.scales)} scales - {david_config.scales} Feature Dim: {david_config.feature_dim} Parameters: {best_metrics.get('parameters', 0):,} ═══════════════════════════════════════════════════════════════ šŸ“ˆ TRAINING CURVE """ if training_history and 'val_acc' in training_history: summary += "Epoch | Train Acc | Val Acc | Learning Rate\n" summary += "------|-----------|----------|--------------\n" for i, epoch in enumerate(training_history.get('epochs', [])): train_acc = training_history['train_acc'][i] if i < len(training_history['train_acc']) else 0 val_acc = training_history['val_acc'][i] if i < len(training_history['val_acc']) else 0 lr = training_history['lr'][i] if i < len(training_history['lr']) else 0 marker = " šŸ‘‘" if val_acc == best_acc else "" summary += f"{epoch:5d} | {train_acc:8.2f}% | {val_acc:7.2f}%{marker} | {lr:.2e}\n" summary += f""" ═══════════════════════════════════════════════════════════════ šŸ“ FILES Best Model: best_model_acc{best_acc:.2f}.safetensors Config: david_config.json Training Cfg: train_config.json History: training_history.json ═══════════════════════════════════════════════════════════════ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} """ with open(summary_path, 'w') as f: f.write(summary) print(f"[šŸ“„] Created MODEL_SUMMARY.txt") return summary_path def update_models_index( config: DavidTrainingConfig, david_config: DavidArchitectureConfig, best_metrics: Dict, model_name: str ): """Update master models index file tracking all trained models.""" if not config.upload_to_hub or not config.hf_repo: return try: from huggingface_hub import hf_hub_download api = HfApi() # Try to download existing index try: index_path = hf_hub_download( repo_id=config.hf_repo, filename="MODELS_INDEX.json", repo_type="model" ) with open(index_path, 'r') as f: models_index = json.load(f) except: # Create new index if doesn't exist models_index = { "repository": config.hf_repo, "updated": datetime.now().isoformat(), "models": [] } # Add current model entry model_entry = { "model_name": model_name, "run_id": config.run_id, "timestamp": datetime.now().isoformat(), "best_val_acc": best_metrics.get('best_val_acc', 0.0), "best_epoch": best_metrics.get('best_epoch', 0), "num_scales": len(david_config.scales), "scales": david_config.scales, "parameters": best_metrics.get('parameters', 0), "sharing_mode": david_config.sharing_mode, "fusion_mode": david_config.fusion_mode, "preset": config.preset, "weights_path": f"weights/{model_name}/{config.run_id}/best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}.safetensors", "config_path": f"weights/{model_name}/{config.run_id}/david_config.json", "history_path": f"weights/{model_name}/{config.run_id}/training_history.json" } # Remove old entry for same run_id if exists (update) models_index["models"] = [m for m in models_index["models"] if m.get("run_id") != config.run_id] models_index["models"].append(model_entry) # Sort by accuracy (descending) models_index["models"].sort(key=lambda x: x.get("best_val_acc", 0), reverse=True) models_index["updated"] = datetime.now().isoformat() models_index["total_models"] = len(models_index["models"]) # Save locally with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f: json.dump(models_index, f, indent=2) temp_path = f.name # Upload to hub root api.upload_file( path_or_fileobj=temp_path, path_in_repo="MODELS_INDEX.json", repo_id=config.hf_repo, commit_message=f"Update models index - {model_name} @ 
{best_metrics.get('best_val_acc', 0.0):.2f}%" ) os.unlink(temp_path) print(f"[šŸ“Š] Updated MODELS_INDEX.json - {len(models_index['models'])} models tracked") except Exception as e: print(f"[āš ļø] Failed to update models index: {e}") def upload_to_huggingface( local_dir: str, repo_id: str, commit_message: str, path_in_repo: Optional[str] = None, patterns: Optional[List[str]] = None ): """Upload directory to HuggingFace Hub.""" try: api = HfApi() # Create repo if it doesn't exist try: create_repo(repo_id, exist_ok=True, repo_type="model") print(f"[šŸ¤—] Repo ready: {repo_id}") except Exception as e: print(f"[āš ļø] Repo exists or creation failed: {e}") # Upload folder if patterns: # Upload specific patterns for pattern in patterns: matching_files = list(Path(local_dir).rglob(pattern)) for file_path in matching_files: rel_path = file_path.relative_to(local_dir) if path_in_repo: repo_path = f"{path_in_repo}/{rel_path}" else: repo_path = str(rel_path) api.upload_file( path_or_fileobj=str(file_path), path_in_repo=repo_path, repo_id=repo_id, commit_message=commit_message ) else: # Upload entire folder api.upload_folder( folder_path=local_dir, repo_id=repo_id, path_in_repo=path_in_repo, commit_message=commit_message ) print(f"[āœ…] Uploaded to Hub: https://huggingface.co/{repo_id}") except Exception as e: print(f"[āŒ] Hub upload failed: {e}") print(f" Continuing training (files saved locally)") def prepare_hub_upload( weights_dir: str, runs_dir: str, config: DavidTrainingConfig, david_config: DavidArchitectureConfig, best_metrics: Dict, model_name: str ): """Prepare and upload all artifacts to HuggingFace Hub.""" if not config.upload_to_hub or not config.hf_repo: return print("\n[šŸ¤—] Preparing HuggingFace Hub upload...") # Create model summary file summary_path = create_model_summary(weights_dir, config, david_config, best_metrics, model_name) # Update master models index update_models_index(config, david_config, best_metrics, model_name) api = HfApi() try: create_repo(config.hf_repo, exist_ok=True, repo_type="model") except: pass # Create temporary directory for root files with tempfile.TemporaryDirectory() as temp_dir: # Generate README at root readme_path = os.path.join(temp_dir, "README.md") readme_content = generate_model_readme(config, david_config, best_metrics, config.run_id) with open(readme_path, 'w') as f: f.write(readme_content) print(f"[šŸ“] Generated README.md") # Save best_model.json at root best_json_path = os.path.join(temp_dir, "best_model.json") save_best_model_json(best_json_path, best_metrics, config, david_config) # Upload root files (README.md, best_model.json) print(f"[šŸ“¤] Uploading root files...") api.upload_file( path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=config.hf_repo, commit_message=f"Update README - Run {config.run_id}" ) api.upload_file( path_or_fileobj=best_json_path, path_in_repo="best_model.json", repo_id=config.hf_repo, commit_message=f"Update metrics - Run {config.run_id}" ) # Upload ONLY essential weight files (not entire directory!) 
weights_repo_path = f"weights/{model_name}/{config.run_id}" best_acc = best_metrics.get('best_val_acc', 0.0) print(f"[šŸ“¤] Uploading essential files to {weights_repo_path}...") # List of specific files to upload (not entire directory) files_to_upload = [ ('MODEL_SUMMARY.txt', 'MODEL_SUMMARY.txt'), ('training_history.json', 'training_history.json'), ('david_config.json', 'david_config.json'), ('train_config.json', 'train_config.json'), (f'best_model_acc{best_acc:.2f}.safetensors', f'best_model_acc{best_acc:.2f}.safetensors'), (f'best_model_acc{best_acc:.2f}_metadata.json', f'best_model_acc{best_acc:.2f}_metadata.json'), ] for local_filename, repo_filename in files_to_upload: local_path = os.path.join(weights_dir, local_filename) if os.path.exists(local_path): try: api.upload_file( path_or_fileobj=local_path, path_in_repo=f"{weights_repo_path}/{repo_filename}", repo_id=config.hf_repo, commit_message=f"Update {repo_filename} - Run {config.run_id}" ) except Exception as e: print(f"[āš ļø] Failed to upload {repo_filename}: {e}") print(f"[āœ…] Uploaded to Hub: https://huggingface.co/{config.hf_repo}") # Upload tensorboard logs (only if they exist and it's final upload) # Skip TensorBoard during training to avoid huge uploads every epoch # if os.path.exists(runs_dir): # runs_repo_path = f"runs/{model_name}/{config.run_id}" # print(f"[šŸ“¤] Uploading TensorBoard logs to {runs_repo_path}...") # upload_to_huggingface( # local_dir=runs_dir, # repo_id=config.hf_repo, # commit_message=f"Upload TensorBoard logs - {model_name} - Run {config.run_id}", # path_in_repo=runs_repo_path # ) # ============================================================================ # CHECKPOINT UTILITIES # ============================================================================ def save_checkpoint( filepath: str, david: David, optimizer: torch.optim.Optimizer, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler], epoch: int, metrics: Dict, train_config: DavidTrainingConfig ): """Save checkpoint in PyTorch and/or SafeTensors format.""" checkpoint = { 'epoch': epoch, 'model_state_dict': david.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'scheduler_state_dict': scheduler.state_dict() if scheduler else None, 'metrics': metrics, 'train_config': train_config.to_dict(), } # Add accuracy to filename if available val_acc = metrics.get('best_val_acc') or metrics.get('val_acc') if val_acc: acc_suffix = f"_acc{val_acc:.2f}" filepath = filepath + acc_suffix if train_config.save_format in ['pytorch', 'both']: torch.save(checkpoint, filepath + '.pth') print(f"[šŸ’¾] Saved PyTorch: {filepath}.pth") if train_config.save_format in ['safetensors', 'both']: try: from safetensors.torch import save_file # Save model state model_state = {k: v.contiguous() for k, v in david.state_dict().items()} save_file(model_state, filepath + '.safetensors') # Save metadata separately (now includes full training history) metadata = {k: v for k, v in checkpoint.items() if k not in ['model_state_dict']} with open(filepath + '_metadata.json', 'w') as f: json.dump(metadata, f, indent=2, default=str) print(f"[šŸ’¾] Saved SafeTensors: {filepath}.safetensors") except ImportError: print(f"[āš ļø] SafeTensors not available, skipping") def load_checkpoint( checkpoint_path: str, david: David, optimizer: Optional[torch.optim.Optimizer] = None, scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, device: str = "cuda" ) -> Tuple[int, Dict]: """Load checkpoint and return epoch and metrics.""" if 
checkpoint_path.endswith('.safetensors'): # Load SafeTensors format try: from safetensors.torch import load_file model_state = load_file(checkpoint_path, device=device) david.load_state_dict(model_state) # Load metadata metadata_path = checkpoint_path.replace('.safetensors', '_metadata.json') with open(metadata_path, 'r') as f: metadata = json.load(f) epoch = metadata.get('epoch', 0) metrics = metadata.get('metrics', {}) if optimizer and 'optimizer_state_dict' in metadata: optimizer.load_state_dict(metadata['optimizer_state_dict']) if scheduler and 'scheduler_state_dict' in metadata and metadata['scheduler_state_dict']: scheduler.load_state_dict(metadata['scheduler_state_dict']) print(f"[āœ…] Loaded from SafeTensors: {checkpoint_path}") return epoch, metrics except ImportError: raise ImportError("safetensors not installed") else: # Load PyTorch format checkpoint = torch.load(checkpoint_path, map_location=device) david.load_state_dict(checkpoint['model_state_dict']) if optimizer and 'optimizer_state_dict' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer_state_dict']) if scheduler and 'scheduler_state_dict' in checkpoint and checkpoint['scheduler_state_dict']: scheduler.load_state_dict(checkpoint['scheduler_state_dict']) print(f"[āœ…] Loaded from PyTorch: {checkpoint_path}") return checkpoint['epoch'], checkpoint.get('metrics', {}) # ============================================================================ # DATASET # ============================================================================ class ImageNetHFDataset(Dataset): """PyTorch Dataset wrapper for HuggingFace ImageNet features.""" def __init__(self, dataset_name: str, model_variant: str, split: str = "train"): # Load only the specific split to avoid downloading all data print(f"[šŸ“„] Loading {split} split for {model_variant}...") self.dataset = load_dataset( dataset_name, name=model_variant, # Dataset configuration/variant name split=split # Only load this specific split ) self.length = len(self.dataset) print(f"[āœ…] Loaded {self.length:,} samples from {split} split") def __len__(self): return self.length def __getitem__(self, idx): item = self.dataset[idx] features = torch.tensor(item['clip_features'], dtype=torch.float32) label = torch.tensor(item['label'], dtype=torch.long) return features, label def create_dataloaders(config: DavidTrainingConfig): """Create train and validation dataloaders.""" train_dataset = ImageNetHFDataset( config.dataset_name, config.model_variant, "train" ) val_dataset = ImageNetHFDataset( config.dataset_name, config.model_variant, "validation" ) train_loader = DataLoader( train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers, pin_memory=config.pin_memory, prefetch_factor=config.prefetch_factor, persistent_workers=config.persistent_workers ) val_loader = DataLoader( val_dataset, batch_size=config.batch_size * 2, shuffle=False, num_workers=config.num_workers, pin_memory=config.pin_memory, prefetch_factor=config.prefetch_factor, persistent_workers=config.persistent_workers ) return train_loader, val_loader # ============================================================================ # CRYSTAL GENERATOR # ============================================================================ class CrystalGenerator: """Generate crystals for all scales.""" def __init__(self, num_classes: int, scales: List[int], device: str = "cuda"): self.num_classes = num_classes self.scales = scales self.device = device self.factories = { scale: SimplexFactory(k=4, 
embed_dim=scale, method="random") for scale in scales } def generate(self, seed: int = 42) -> Tuple[Dict[int, torch.Tensor], Dict[int, torch.Tensor]]: """Generate anchors and crystals for all scales.""" anchors_dict = {} crystals_dict = {} for scale in tqdm(self.scales, desc="Generating crystals"): factory = self.factories[scale] batch_crystals = [] for class_idx in range(self.num_classes): crystal = factory.build( backend="torch", device=self.device, dtype=torch.float32, seed=seed + class_idx, validate=True ) batch_crystals.append(crystal) crystals = torch.stack(batch_crystals) anchors = F.normalize(crystals[:, 0, :], dim=-1) # Verify anchor diversity anchor_sims = anchors @ anchors.T off_diag = anchor_sims[~torch.eye(self.num_classes, dtype=bool, device=anchors.device)] max_sim = off_diag.max().item() mean_sim = off_diag.mean().item() print(f" Scale {scale}: max_sim={max_sim:.4f}, mean_sim={mean_sim:.4f}") if max_sim > 0.99: print(f" āš ļø WARNING: Anchors too similar at scale {scale}!") anchors_dict[scale] = anchors crystals_dict[scale] = crystals return anchors_dict, crystals_dict # ============================================================================ # TRAINING LOOP # ============================================================================ def train_epoch( david: David, train_loader: DataLoader, optimizer: torch.optim.Optimizer, criterion: MultiScaleCrystalLoss, anchors_dict: Dict[int, torch.Tensor], crystals_dict: Dict[int, torch.Tensor], epoch: int, config: DavidTrainingConfig, writer: Optional[SummaryWriter], global_step: int ) -> Tuple[float, float, int, Dict]: """Train for one epoch - Pure FP32.""" david.train() david.update_epoch(epoch) total_loss = 0 correct = 0 total = 0 loss_components_sum = {} active_scales = david.get_active_scales() pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_epochs}") for batch_idx, (features, labels) in enumerate(pbar): features = features.cuda(non_blocking=True) labels = labels.cuda(non_blocking=True) # Zero gradients optimizer.zero_grad() # Forward pass - Pure FP32, no autocast combined, logits_list, features_list, fusion_weights = david( features, anchors_dict, return_all_scales=True ) # Compute loss losses = criterion( combined, logits_list, features_list, labels, crystals_dict, epoch ) # Backward losses['total'].backward() # Gradient analysis if config.track_gradients and batch_idx % config.log_interval == 0: grad_stats = analyze_gradients(david, config) if writer: step = global_step + batch_idx writer.add_scalar('train/grad_mean', grad_stats['mean'], step) writer.add_scalar('train/grad_max', grad_stats['max'], step) writer.add_scalar('train/grad_num_small', grad_stats['num_small'], step) # Scale small gradients scale_small_gradients(david, config) # Gradient clipping torch.nn.utils.clip_grad_norm_(david.parameters(), config.gradient_clip) # Optimizer step optimizer.step() # Metrics total_loss += losses['total'].item() _, predicted = torch.max(combined, 1) total += labels.size(0) correct += (predicted == labels).sum().item() # Accumulate loss components for key, value in losses.items(): if key not in loss_components_sum: loss_components_sum[key] = 0.0 loss_components_sum[key] += value.item() # Logging if writer and batch_idx % config.log_interval == 0: step = global_step + batch_idx writer.add_scalar('train/loss_batch', losses['total'].item(), step) writer.add_scalar('train/acc_batch', 100 * correct / total, step) if config.log_loss_components: for key, value in losses.items(): if key != 'total': 
writer.add_scalar(f'train/loss_{key}', value.item(), step) if config.log_fusion_weights and fusion_weights is not None: if fusion_weights.dim() == 2: mean_weights = fusion_weights.mean(dim=0) for i, w in enumerate(mean_weights): if i < len(active_scales): writer.add_scalar( f'train/fusion_weight_{active_scales[i]}', w.item(), step ) writer.add_scalar('train/lr', optimizer.param_groups[0]['lr'], step) pbar.set_postfix({ 'loss': f'{total_loss / (batch_idx + 1):.4f}', 'acc': f'{100 * correct / total:.2f}%' }) global_step += 1 # Average loss components avg_components = {k: v / len(train_loader) for k, v in loss_components_sum.items()} return ( total_loss / len(train_loader), 100 * correct / total, global_step, avg_components ) @torch.no_grad() def validate( david: David, val_loader: DataLoader, anchors_dict: Dict[int, torch.Tensor], config: DavidTrainingConfig ) -> Tuple[float, Dict[int, float]]: """Validate model - Pure FP32.""" david.eval() correct = 0 total = 0 active_scales = david.get_active_scales() scale_correct = {scale: 0 for scale in active_scales} for features, labels in tqdm(val_loader, desc="Validation", leave=False): features = features.cuda(non_blocking=True) labels = labels.cuda(non_blocking=True) # Forward pass - no autocast combined, logits_list, _, _ = david( features, anchors_dict, return_all_scales=True ) _, predicted = torch.max(combined, 1) total += labels.size(0) correct += (predicted == labels).sum().item() for i, scale in enumerate(active_scales): if i < len(logits_list): _, scale_pred = torch.max(logits_list[i], 1) scale_correct[scale] += (scale_pred == labels).sum().item() accuracy = 100 * correct / total scale_accs = {s: 100 * scale_correct[s] / total for s in scale_correct} return accuracy, scale_accs # ============================================================================ # MAIN TRAINING FUNCTION # ============================================================================ def train_david(config: DavidTrainingConfig): """Main training pipeline.""" # Enable TensorFloat32 for better performance on Ampere+ GPUs torch.set_float32_matmul_precision('high') print("="*80) print("🌟 DAVID TRAINING PIPELINE") print("="*80) print(f"Run ID: {config.run_id}") print(f"Preset: {config.preset}") print(f"Batch Size: {config.batch_size}") print(f"Learning Rate: {config.learning_rate}") print(f"Mixed Precision: {config.use_mixed_precision}") print(f"TensorFloat32: Enabled (high precision)") print("="*80) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Load or create David config FIRST (needed for model_name) if config.custom_config_path: david_config = DavidArchitectureConfig.from_json(config.custom_config_path) print(f"[šŸ“] Loaded custom config: {config.custom_config_path}") elif config.preset: david_config = DavidPresets.get_preset(config.preset) print(f"[āš™ļø] Using preset: {config.preset}") else: raise ValueError("Must specify either preset or custom_config_path") # Create model name from architecture model_name = f"David-{david_config.sharing_mode}-{david_config.fusion_mode}" print(f"[šŸ·ļø] Model: {model_name}") # Setup directories with proper hierarchy: weights/model_name/timestamp/ weights_dir = os.path.join(config.base_dir, "weights", model_name, config.run_id) runs_dir = os.path.join(config.base_dir, "runs", model_name, config.run_id) os.makedirs(weights_dir, exist_ok=True) os.makedirs(runs_dir, exist_ok=True) print(f"[šŸ“] Weights: {weights_dir}") print(f"[šŸ“] Logs: {runs_dir}") writer = SummaryWriter(runs_dir) # Apply overrides if 
config.num_classes_override: david_config.num_classes = config.num_classes_override if config.use_belly_override is not None: david_config.use_belly = config.use_belly_override if config.belly_expand_override is not None: david_config.belly_expand = config.belly_expand_override if config.progressive_training_override is not None: david_config.progressive_training = config.progressive_training_override if not david_config.progressive_training: # Disable warmup if progressive training disabled david_config.scale_warmup_epochs = {s: 0 for s in david_config.scales} # Override scale warmup schedule if provided if config.scale_warmup_epochs_override is not None: david_config.scale_warmup_epochs = config.scale_warmup_epochs_override # Enable progressive training if custom schedule provided if not david_config.progressive_training: print(f"[āš™ļø] Enabling progressive training (custom warmup schedule provided)") david_config.progressive_training = True print(f"[āš™ļø] Progressive training: {david_config.progressive_training}") if david_config.progressive_training: print(f" Scale warmup schedule: {david_config.scale_warmup_epochs}") # Save configs david_config_path = os.path.join(weights_dir, "david_config.json") david_config.to_json(david_config_path) print(f"[šŸ’¾] Saved David config: {david_config_path}") train_config_path = os.path.join(weights_dir, "train_config.json") config.to_json(train_config_path) print(f"[šŸ’¾] Saved training config: {train_config_path}") # Initialize David david = David.from_config(david_config).cuda() print(f"\n{david}\n") # Count parameters total_params = sum(p.numel() for p in david.parameters()) trainable_params = sum(p.numel() for p in david.parameters() if p.requires_grad) print(f"[šŸ“Š] Total Parameters: {total_params:,}") print(f"[šŸ“Š] Trainable Parameters: {trainable_params:,}") # Load data train_loader, val_loader = create_dataloaders(config) # Generate crystals crystal_gen = CrystalGenerator( david_config.num_classes, david_config.scales, str(device) ) anchors_dict, crystals_dict = crystal_gen.generate() # Setup training criterion = MultiScaleCrystalLoss( scales=david_config.scales, num_classes=david_config.num_classes, use_rose_loss=config.use_rose_loss, use_cayley_loss=config.use_cayley_loss, rose_initial_weight=config.rose_initial_weight, rose_max_weight=config.rose_max_weight, cayley_weight=config.cayley_weight, scale_loss_balance=config.scale_loss_balance ).cuda() optimizer = create_optimizer(david, config) scheduler = create_scheduler(optimizer, config) controller = AdaptiveTrainingController(david, config) # Tracking best_val_acc = 0.0 best_epoch = 0 best_scale_accs = {} global_step = 0 final_train_acc = 0.0 final_train_loss = 0.0 # Training history for epoch-by-epoch tracking training_history = { 'epochs': [], 'train_loss': [], 'train_acc': [], 'val_acc': [], 'scale_accs': {}, 'lr': [] } # DIAGNOSTIC: Test one forward/backward pass before training print("\n[šŸ”] Running diagnostic forward/backward pass...") #david.compile() david.train() # Get a small batch for features_test, labels_test in train_loader: features_test = features_test.cuda(non_blocking=True)[:8] # Just 8 samples labels_test = labels_test.cuda(non_blocking=True)[:8] # Forward combined_test, logits_test, features_test_out, _ = david( features_test, anchors_dict, return_all_scales=True ) # Loss losses_test = criterion( combined_test, logits_test, features_test_out, labels_test, crystals_dict, epoch=0 ) print(f" Initial loss: {losses_test['total'].item():.6f}") print(f" Loss 
components:") for key, value in losses_test.items(): if key != 'total': print(f" {key}: {value.item():.6f}") # Backward optimizer.zero_grad() losses_test['total'].backward() # Check gradients grad_count = sum(1 for p in david.parameters() if p.grad is not None and p.grad.norm() > 0) total_grad_params = sum(1 for p in david.parameters() if p.requires_grad) print(f" Parameters with non-zero gradients: {grad_count}/{total_grad_params}") if grad_count == 0: print(f" āŒ ERROR: No gradients! Training will not work.") return None, 0.0 elif grad_count < total_grad_params * 0.5: print(f" āš ļø WARNING: Less than 50% of parameters have gradients") else: print(f" āœ… Gradients look good") break # Only test one batch print("\n[šŸš€] Starting training...\n") for epoch in range(config.num_epochs): epoch_start = time.time() # Train train_loss, train_acc, global_step, loss_components = train_epoch( david, train_loader, optimizer, criterion, anchors_dict, crystals_dict, epoch, config, writer, global_step ) # Validate val_acc, scale_accs = validate(david, val_loader, anchors_dict, config) # Update controller controller.update_metrics(scale_accs, val_acc) controller.apply_adaptive_strategies(scale_accs, epoch) # Step scheduler if scheduler: scheduler.step() epoch_time = time.time() - epoch_start # Print print(f"\nšŸ“Š Epoch {epoch+1}/{config.num_epochs} ({epoch_time:.1f}s)") print(f" Train: Loss={train_loss:.4f}, Acc={train_acc:.2f}%") print(f" Val: Acc={val_acc:.2f}% (Best: {best_val_acc:.2f}%)") print(f" Active scales: {david.get_active_scales()}") print(f" LR: {optimizer.param_groups[0]['lr']:.2e}") if config.log_loss_components and loss_components: print(f" Loss breakdown:") for key, value in sorted(loss_components.items()): if key != 'total': print(f" {key:20s}: {value:.6f}") for scale, acc in scale_accs.items(): frozen = "ā„ļø" if controller.scales_frozen.get(scale, False) else "šŸ”„" print(f" {frozen} Scale {scale}: {acc:.2f}%") # Update tracking final_train_acc = train_acc final_train_loss = train_loss # Record training history training_history['epochs'].append(epoch + 1) training_history['train_loss'].append(train_loss) training_history['train_acc'].append(train_acc) training_history['val_acc'].append(val_acc) training_history['lr'].append(optimizer.param_groups[0]['lr']) # Record per-scale accuracies for scale, acc in scale_accs.items(): if scale not in training_history['scale_accs']: training_history['scale_accs'][scale] = [] training_history['scale_accs'][scale].append(acc) # TensorBoard writer.add_scalar('train/loss', train_loss, epoch) writer.add_scalar('train/acc', train_acc, epoch) writer.add_scalar('val/acc', val_acc, epoch) for scale, acc in scale_accs.items(): writer.add_scalar(f'val/acc_scale_{scale}', acc, epoch) # Save best if val_acc > best_val_acc: best_val_acc = val_acc best_epoch = epoch best_scale_accs = scale_accs.copy() # Save training history alongside best model history_path = os.path.join(weights_dir, 'training_history.json') with open(history_path, 'w') as f: json.dump(training_history, f, indent=2) save_checkpoint( os.path.join(weights_dir, 'best_model'), david, optimizer, scheduler, epoch, { 'best_val_acc': best_val_acc, 'best_epoch': best_epoch, 'scale_accuracies': best_scale_accs, 'training_history': training_history }, config ) # Upload to hub when best model improves if config.upload_to_hub: best_metrics = { 'best_val_acc': best_val_acc, 'best_epoch': best_epoch, 'scale_accuracies': best_scale_accs, 'final_train_acc': train_acc, 'final_train_loss': train_loss, 
                    'training_history': training_history,
                    'parameters': total_params
                }
                prepare_hub_upload(weights_dir, runs_dir, config, david_config, best_metrics, model_name)

        # Periodic save
        if (epoch + 1) % config.save_interval == 0:
            save_checkpoint(
                os.path.join(weights_dir, f'checkpoint_epoch_{epoch+1}'),
                david, optimizer, scheduler, epoch,
                {'val_acc': val_acc},
                config
            )

    # Final save
    save_checkpoint(
        os.path.join(weights_dir, 'final_model'),
        david, optimizer, scheduler, config.num_epochs - 1,
        {'final_val_acc': val_acc},
        config
    )

    writer.close()

    # Final hub upload with all artifacts
    if config.upload_to_hub:
        print("\n[šŸ¤—] Performing final HuggingFace Hub upload...")
        final_metrics = {
            'best_val_acc': best_val_acc,
            'best_epoch': best_epoch,
            'scale_accuracies': best_scale_accs,
            'final_train_acc': final_train_acc,
            'final_train_loss': final_train_loss,
            'training_history': training_history,
            'parameters': total_params
        }
        prepare_hub_upload(weights_dir, runs_dir, config, david_config, final_metrics, model_name)

    print("\n" + "="*80)
    print(f"šŸŽ‰ Training Complete!")
    print(f"   Best Val Acc: {best_val_acc:.2f}% (Epoch {best_epoch+1})")
    print(f"   Final Train Acc: {final_train_acc:.2f}%")
    print(f"   Weights: {weights_dir}")
    if config.upload_to_hub:
        print(f"   Hub: https://huggingface.co/{config.hf_repo}")
    print("="*80)

    return david, best_val_acc


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    configs = []

    config = DavidTrainingConfig(
        preset="clip_vit_bigg14",  # Uses progressive training by default
        model_variant="clip_vit_laion_bigg14",

        num_epochs=10,
        batch_size=1024,
        learning_rate=1e-3,

        use_mixed_precision=False,  # Leave off; mixed precision kills accuracy with the Rose loss
        gradient_clip=10.0,

        use_rose_loss=True,
        rose_initial_weight=0.1,
        rose_max_weight=0.5,
        use_cayley_loss=False,

        progressive_training_override=False,

        # Adaptive training (disabled by default for stability)
        freeze_strategy="never",  # Set to "performance" to enable freezing
        freeze_threshold=90.0,

        save_format="safetensors",

        # HuggingFace Hub upload
        # DO NOT put your HF token in the config; load it via os.environ or another secure mechanism.
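        # A minimal sketch for supplying the token (assumes you are running in Google Colab
        # with an HF_TOKEN userdata secret set, as mentioned in the module docstring):
        #     from google.colab import userdata
        #     os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
        # Outside Colab, export HF_TOKEN in your shell instead; huggingface_hub reads it
        # from the environment.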
upload_to_hub=False, # set to true for your repo hf_repo= "YourName/Repo" # "AbstractPhil/gated-david", ) #configs.append(DavidTrainingConfig( # preset="high_accuracy", # Uses progressive training by default # model_variant="clip_vit_laion_b32", # # num_epochs=20, # batch_size=1024, # learning_rate=1e-3, # # use_mixed_precision=False, # gradient_clip=10.0, # # use_rose_loss=True, # rose_initial_weight=0.1, # rose_max_weight=0.5, # use_cayley_loss=False, # # # Adaptive training (disabled by default for stability) # freeze_strategy="never", # Set to "performance" to enable # freeze_threshold=90.0, # # save_format="safetensors", # # # HuggingFace Hub upload # upload_to_hub=True, # hf_repo="AbstractPhil/gated-david", #)) #configs.append(DavidTrainingConfig( # preset="balanced", # Uses progressive training by default # model_variant="clip_vit_laion_b32", # # num_epochs=20, # batch_size=1024, # learning_rate=1e-3, # # # Custom scale warmup schedule (overrides preset) # #scale_warmup_epochs_override={ # # 384: 0, # Sca+le 256 active from epoch 0 # # #512: 1, # Scale 512 active from epoch 2 # # 768: 1, # Scale 768 active from epoch 5 # # 1024: 2, # Scale 1024 active from epoch 8 # # 1280: 3 # Scale 1280 active from epoch 10 # #}, # #scale_warmup_epochs_override={ # # 256: 0, # # 512: 1, # # 768: 2, # # 1024: 3, # # 1280: 4, # # 1536: 5, # # 1792: 6, # # 2048: 7, # # 2304: 8, # # 2560: 9 # #}, # # use_mixed_precision=False, # gradient_clip=10.0, # # use_rose_loss=True, # rose_initial_weight=0.1, # rose_max_weight=0.5, # use_cayley_loss=False, # # # Adaptive training (disabled by default for stability) # freeze_strategy="never", # Set to "performance" to enable # freeze_threshold=90.0, # # save_format="safetensors", # # # HuggingFace Hub upload # upload_to_hub=True, # hf_repo="AbstractPhil/gated-david", #)) # #configs.append(DavidTrainingConfig( # preset="clip_vit_l14_ultra_deep", # Uses progressive training by default # model_variant="clip_vit_l14", # # num_epochs=10, # batch_size=1024, # learning_rate=1e-3, # # # Custom scale warmup schedule (overrides preset) # #scale_warmup_epochs_override={ # # 384: 0, # Scale 256 active from epoch 0 # # #512: 1, # Scale 512 active from epoch 2 # # 768: 1, # Scale 768 active from epoch 5 # # 1024: 2, # Scale 1024 active from epoch 8 # # 1280: 3 # Scale 1280 active from epoch 10 # #}, # #scale_warmup_epochs_override={ # # 256: 0, # # 512: 1, # # 768: 2, # # 1024: 3, # # 1280: 4, # # 1536: 5, # # 1792: 6, # # 2048: 7, # # 2304: 8, # # 2560: 9 # #}, # # use_mixed_precision=False, # gradient_clip=10.0, # # use_rose_loss=True, # rose_initial_weight=0.1, # rose_max_weight=0.5, # use_cayley_loss=False, # # # Adaptive training (disabled by default for stability) # freeze_strategy="never", # Set to "performance" to enable # freeze_threshold=90.0, # # save_format="safetensors", # # # HuggingFace Hub upload # upload_to_hub=True, # hf_repo="AbstractPhil/gated-david", #)) #for config in configs: # print("Starting train") # try: david, best_acc = train_david(config) #except Exception as e: # print(f"Error during training: {e}") #print("train complete")
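
    # ------------------------------------------------------------------------
    # Optional: resuming from a saved checkpoint.
    # A sketch only, using the load_checkpoint() helper defined above; the paths
    # are placeholders and must point at files produced by a previous run.
    #
    # david_config = DavidArchitectureConfig.from_json(
    #     "./david_training/weights/<model_name>/<run_id>/david_config.json"
    # )
    # david = David.from_config(david_config).cuda()
    # optimizer = create_optimizer(david, config)
    # scheduler = create_scheduler(optimizer, config)
    # start_epoch, metrics = load_checkpoint(
    #     "./david_training/weights/<model_name>/<run_id>/best_model_acc<acc>.safetensors",
    #     david, optimizer, scheduler, device="cuda"
    # )
    # print(f"Resumed at epoch {start_epoch} with metrics: {metrics}")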