# File size: 30,607 Bytes

# 09b6e4d

"""
ImageNet Multi-CLIP Collective Experiment
==========================================
Uses pre-extracted CLIP features from multiple model variants.
No image processing - pure feature routing at A100 speeds.

Dataset: AbstractPhil/imagenet-clip-features (ImageNetCollectiveConfig.dataset_name)
Streams: b32, b16, l14, laion_b32, laion_bigg14 (laion_h14 is optional and commented out in the config)

Each CLIP variant becomes an expert stream with:
- Learnable translation head
- Own router with unique fingerprint
- Hierarchical coordination via mailbox

Training:
- AMP mixed precision
- 8 workers total, pinned, persistent
- Hierarchical chain topology

Author: AbstractPhil
Date: December 2025
License: Apache 2.0
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from datasets import load_dataset
from dataclasses import dataclass, field
from typing import Dict, Tuple, List, Optional
from collections import defaultdict
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# =============================================================================
# IMPORTS FROM GEOFRACTAL
# =============================================================================

from geofractal.model.blocks.router.global_fractal_router import (
    GlobalFractalRouter,
    GlobalFractalRouterConfig,
    get_registry,
    RouterMailbox,
)

# =============================================================================
# CONFIG
# =============================================================================

@dataclass
class ImageNetCollectiveConfig:
    """Hyper-parameters and runtime settings for the multi-CLIP collective.

    Every field has a default, so ``ImageNetCollectiveConfig()`` is a
    ready-to-use configuration.
    """

    # Dataset
    dataset_name: str = "AbstractPhil/imagenet-clip-features"
    num_classes: int = 1000

    # CLIP variants mapped to the width of their pre-extracted feature vectors.
    # Iteration order of this mapping is the order streams are built in.
    clip_variants: Dict[str, int] = field(default_factory=lambda: {
        'clip_vit_b32': 512,
        'clip_vit_b16': 512,
        'clip_vit_l14': 768,
        'clip_vit_laion_b32': 512,
        'clip_vit_laion_bigg14': 1280,
        # 'clip_vit_laion_h14': 1024,  # Can add if memory permits
    })

    # Feature dimensions
    feature_dim: int = 512  # Internal routing dimension
    fingerprint_dim: int = 64

    # Router
    num_anchors: int = 16
    num_routes: int = 8
    num_slots: int = 16  # Sequence length for routing

    # Training
    batch_size: int = 256
    epochs: int = 20
    lr: float = 3e-4
    weight_decay: float = 0.01
    warmup_epochs: int = 2

    # DataLoader - A100 optimized
    num_workers: int = 8  # Total across all loaders
    pin_memory: bool = True
    persistent_workers: bool = True
    prefetch_factor: int = 4

    # AMP
    use_amp: bool = True

    # Resolved once, when the class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

    def workers_per_loader(self) -> int:
        """Return an even share of ``num_workers`` per CLIP variant.

        The share is computed with integer division and is never below 1.
        An empty ``clip_variants`` mapping is treated as a single loader so
        the call cannot raise ``ZeroDivisionError``.
        """
        n_loaders = max(1, len(self.clip_variants))
        return max(1, self.num_workers // n_loaders)


# =============================================================================
# DATASET
# =============================================================================

class CLIPFeatureDataset(Dataset):
    """
    Map-style view over one indexable dataset of pre-extracted features
    (e.g. a HuggingFace split).

    Indexing yields ``(features, label)``: the feature column converted to a
    float32 tensor and the label column passed through unchanged.
    """

    def __init__(
        self,
        hf_dataset,
        feature_column: str = 'clip_features',
        label_column: str = 'label',
    ):
        # Column names are kept so __getitem__ knows which fields to read.
        self.feature_column, self.label_column = feature_column, label_column
        self.dataset = hf_dataset

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(float32 feature tensor, raw label)`` for row ``idx``."""
        row = self.dataset[idx]
        return (
            torch.tensor(row[self.feature_column], dtype=torch.float32),
            row[self.label_column],
        )


class MultiCLIPDataset(Dataset):
    """
    Index-aligned view over the feature splits of several CLIP variants.

    One split named ``"{variant}_{split_prefix}"`` is loaded per variant.
    Indexing returns ``(features, label)`` where ``features`` maps each
    successfully loaded variant to a float32 tensor and ``label`` is taken
    from the first loaded variant's row.
    """

    def __init__(
        self,
        dataset_name: str,
        split_prefix: str,  # e.g., 'train' or 'validation'
        clip_variants: Dict[str, int],
    ):
        """
        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.
            split_prefix: Suffix of the split name, e.g. 'train'.
            clip_variants: Mapping whose keys are the variant names to load.

        Raises:
            RuntimeError: If no split could be loaded at all.
            ValueError: If the loaded splits differ in length.
        """
        # All requested variants; `self.datasets` holds only those that loaded.
        self.variants = list(clip_variants.keys())
        self.datasets = {}

        print(f"Loading {split_prefix} splits...")
        for variant in tqdm(self.variants, desc="Loading variants"):
            split_name = f"{variant}_{split_prefix}"
            try:
                ds = load_dataset(dataset_name, split=split_name)
            except Exception as e:
                # Best effort: a variant that cannot be loaded is reported and
                # skipped so the remaining variants can still be used.
                print(f"  WARNING: Could not load {split_name}: {e}")
            else:
                self.datasets[variant] = ds
                print(f"  {variant}: {len(ds):,} samples")

        # Rows are matched purely by index, so every split must be equally long.
        self.length = self._common_length(self.datasets)

    @staticmethod
    def _common_length(datasets) -> int:
        """Return the length shared by every dataset in ``datasets``.

        Raises:
            RuntimeError: If ``datasets`` is empty (nothing was loaded).
            ValueError: If any dataset's length differs from the first one's.
        """
        if not datasets:
            raise RuntimeError(
                "No CLIP feature splits could be loaded; see the warnings above."
            )

        lengths = {name: len(ds) for name, ds in datasets.items()}
        expected = next(iter(lengths.values()))
        for name, n_rows in lengths.items():
            # Explicit raise instead of `assert`, which `python -O` removes.
            if n_rows != expected:
                raise ValueError(f"{name} has {n_rows} != {expected}")
        return expected

    def __len__(self):
        """Number of aligned rows."""
        return self.length

    def __getitem__(self, idx):
        """Return ``({variant: float32 tensor}, label)`` for row ``idx``."""
        features = {}
        label = None

        for variant, ds in self.datasets.items():
            item = ds[idx]
            features[variant] = torch.tensor(item['clip_features'], dtype=torch.float32)
            # The label is read once, from the first variant that provides one.
            if label is None:
                label = item['label']

        return features, label


def collate_multi_clip(batch):
    """Collate ``(features_dict, label)`` samples into batched tensors.

    Defined at module level (not as a closure) so DataLoader worker processes
    can pickle it regardless of the multiprocessing start method.

    Args:
        batch: Non-empty sequence of ``({variant: tensor}, label)`` samples.

    Returns:
        ``(features, labels)`` where ``features`` maps each variant name to the
        stacked tensors of that variant and ``labels`` is an int64 tensor.
    """
    feature_dicts, labels = zip(*batch)
    # Variant names come from the samples themselves, so a variant that is
    # absent from the dataset never produces an empty list to stack.
    variant_names = list(feature_dicts[0])
    features = {
        name: torch.stack([sample[name] for sample in feature_dicts])
        for name in variant_names
    }
    return features, torch.tensor(labels, dtype=torch.long)


def get_dataloaders(config: ImageNetCollectiveConfig):
    """Create train and validation dataloaders.

    Both loaders are built over a ``MultiCLIPDataset`` ('train' and
    'validation' splits) and each one is given the full
    ``config.num_workers``. The training loader shuffles and drops the last
    incomplete batch; the validation loader does neither.

    Returns:
        ``(train_loader, val_loader)``.
    """
    train_dataset = MultiCLIPDataset(
        config.dataset_name,
        'train',
        config.clip_variants,
    )

    val_dataset = MultiCLIPDataset(
        config.dataset_name,
        'validation',
        config.clip_variants,
    )

    loader_kwargs = dict(
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory,
        collate_fn=collate_multi_clip,
    )
    # Worker-only options are passed only when workers exist; otherwise the
    # DataLoader defaults apply instead of an explicit prefetch_factor=None.
    if config.num_workers > 0:
        loader_kwargs.update(
            persistent_workers=config.persistent_workers,
            prefetch_factor=config.prefetch_factor,
        )

    train_loader = DataLoader(
        train_dataset,
        shuffle=True,
        drop_last=True,
        **loader_kwargs,
    )

    val_loader = DataLoader(
        val_dataset,
        shuffle=False,
        **loader_kwargs,
    )

    return train_loader, val_loader


# =============================================================================
# FEATURE STREAM (No CLIP model - just translation + routing)
# =============================================================================

class FeatureStream(nn.Module):
    """
    Expert stream for one CLIP variant's pre-extracted features.

    No CLIP model is involved. A translation MLP expands each feature vector
    into ``num_slots`` tokens of width ``feature_dim``, a learnable per-stream
    slot embedding is added, and the tokens are passed to this stream's own
    ``GlobalFractalRouter``.
    """

    def __init__(
        self,
        config: ImageNetCollectiveConfig,
        variant_name: str,
        input_dim: int,
        parent_id: Optional[str] = None,
    ):
        """
        Args:
            config: Shared experiment configuration.
            variant_name: CLIP variant name; also used as the router's name.
            input_dim: Width of this variant's feature vectors.
            parent_id: Forwarded to the router as ``parent_id``.
        """
        super().__init__()
        self.config = config
        self.variant_name = variant_name
        self.input_dim = input_dim

        # Translation head: CLIP dim → routing space
        # (one flat vector of feature_dim * num_slots values per sample).
        self.translation = nn.Sequential(
            nn.Linear(input_dim, config.feature_dim * 2),
            nn.LayerNorm(config.feature_dim * 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(config.feature_dim * 2, config.feature_dim * config.num_slots),
        )

        # Learnable slot embeddings (unique per stream); the leading dim of 1
        # broadcasts over the batch.
        self.slot_embed = nn.Parameter(
            torch.randn(1, config.num_slots, config.feature_dim) * 0.02
        )

        # Router with unique fingerprint
        router_config = GlobalFractalRouterConfig(
            feature_dim=config.feature_dim,
            fingerprint_dim=config.fingerprint_dim,
            num_anchors=config.num_anchors,
            num_routes=config.num_routes,
            use_adjacent_gating=True,
            use_cantor_prior=True,
            grid_size=(config.num_slots, 1),
        )

        self.router = GlobalFractalRouter(
            config=router_config,
            parent_id=parent_id,
            cooperation_group="imagenet_collective",
            name=variant_name,
        )

    @property
    def fingerprint(self) -> torch.Tensor:
        """The router's fingerprint."""
        return self.router.fingerprint

    @property
    def module_id(self) -> str:
        """The router's module id."""
        return self.router.module_id

    @staticmethod
    def _route_entropy(weights: torch.Tensor) -> float:
        """Mean entropy of ``weights`` over its last dimension, as a float.

        The computation is done in float32. In float16 the ``1e-8`` epsilon
        rounds to zero, so a weight of exactly 0 would yield
        ``0 * log(0) = NaN``; casting first keeps the epsilon effective.
        The result is a logging metric only, so the tensor is detached.
        """
        w = weights.detach().float()
        return -(w * (w + 1e-8).log()).sum(dim=-1).mean().item()

    def forward(
        self,
        features: torch.Tensor,
        mailbox: RouterMailbox,
        target_fingerprint: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Dict]:
        """
        Args:
            features: [B, input_dim] pre-extracted CLIP features
            mailbox: Shared mailbox, forwarded to the router
            target_fingerprint: Next stream's fingerprint, forwarded to the router

        Returns:
            routed: Third element of the router's output
                (expected [B, num_slots, feature_dim] — defined by the router)
            info: Dict with the key 'route_entropy' (Python float)
        """
        B = features.shape[0]

        # Translate to routing space
        translated = self.translation(features)  # [B, feature_dim * num_slots]
        slots = translated.view(B, self.config.num_slots, self.config.feature_dim)

        # Add slot embeddings
        slots = slots + self.slot_embed

        # Route; the first returned element is not needed here.
        _routes, weights, routed = self.router(
            slots,
            mailbox=mailbox,
            target_fingerprint=target_fingerprint,
            skip_first=False,
        )

        info = {
            'route_entropy': self._route_entropy(weights),
        }

        return routed, info


# =============================================================================
# MULTI-CLIP COLLECTIVE
# =============================================================================

class ImageNetCollective(nn.Module):
    """
    Collective of pre-extracted CLIP features from multiple variants.

    One ``FeatureStream`` is built per entry of ``config.clip_variants``. The
    streams are chained (each stream is constructed with the previous stream's
    module id as ``parent_id``) and share a single ``RouterMailbox``. Their
    slot-pooled outputs are concatenated, fused by an MLP and classified.
    """

    def __init__(self, config: ImageNetCollectiveConfig):
        """Build streams, mailbox, fusion MLP and classification heads.

        Note: this resets the registry returned by ``get_registry()``.
        """
        super().__init__()
        self.config = config

        # Reset registry for fresh start
        get_registry().reset()

        # Build streams in hierarchical chain
        self.streams = nn.ModuleDict()
        self.stream_order = list(config.clip_variants.keys())

        parent_id = None
        for variant_name, input_dim in config.clip_variants.items():
            stream = FeatureStream(
                config=config,
                variant_name=variant_name,
                input_dim=input_dim,
                parent_id=parent_id,
            )
            self.streams[variant_name] = stream

            # Log the parent this stream was actually built with *before*
            # advancing the chain; otherwise the stream's own id is printed
            # and the first stream is never reported as 'root'.
            parent_label = parent_id[:8] if parent_id else 'root'
            print(f"  Stream: {variant_name} ({input_dim}D) -> parent: {parent_label}...")
            parent_id = stream.module_id

        # Shared mailbox
        router_config = GlobalFractalRouterConfig(
            feature_dim=config.feature_dim,
            fingerprint_dim=config.fingerprint_dim,
        )
        self.mailbox = RouterMailbox(router_config)

        # Fusion layer: concatenated per-stream vectors -> feature_dim
        num_streams = len(config.clip_variants)
        self.fusion = nn.Sequential(
            nn.Linear(config.feature_dim * num_streams, config.feature_dim * 2),
            nn.LayerNorm(config.feature_dim * 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(config.feature_dim * 2, config.feature_dim),
            nn.LayerNorm(config.feature_dim),
        )

        # Classification head
        self.classifier = nn.Linear(config.feature_dim, config.num_classes)

        # Per-stream classifiers (for measuring individual contribution).
        # forward() only evaluates them when return_individual=True.
        self.stream_classifiers = nn.ModuleDict({
            name: nn.Linear(config.feature_dim, config.num_classes)
            for name in config.clip_variants.keys()
        })

    def forward(
        self,
        features: Dict[str, torch.Tensor],
        return_individual: bool = False,
    ) -> Tuple[torch.Tensor, Dict]:
        """
        Args:
            features: Dict mapping variant name to [B, clip_dim] features;
                must contain every name in ``self.stream_order``
            return_individual: Also return per-stream predictions

        Returns:
            logits: [B, num_classes]
            info: Dict with 'stream_infos', 'mailbox_messages',
                'mean_route_entropy' and, when ``return_individual`` is set,
                'individual_logits' (variant name -> per-stream logits)
        """
        # Clear mailbox so messages from a previous call do not carry over
        self.mailbox.clear()

        # Process streams in order
        stream_features = {}
        stream_infos = {}

        for i, name in enumerate(self.stream_order):
            stream = self.streams[name]

            # Get target fingerprint (next stream or None for the last one)
            if i < len(self.stream_order) - 1:
                next_name = self.stream_order[i + 1]
                target_fp = self.streams[next_name].fingerprint
            else:
                target_fp = None

            # Forward
            routed, info = stream(features[name], self.mailbox, target_fp)

            # Pool across slots
            pooled = routed.mean(dim=1)  # [B, feature_dim]
            stream_features[name] = pooled
            stream_infos[name] = info

        # Fuse all streams (concatenation follows stream_order)
        fused = torch.cat([stream_features[n] for n in self.stream_order], dim=-1)
        fused = self.fusion(fused)

        # Classify
        logits = self.classifier(fused)

        info = {
            'stream_infos': stream_infos,
            'mailbox_messages': len(self.mailbox.messages),
            'mean_route_entropy': np.mean([i['route_entropy'] for i in stream_infos.values()]),
        }

        if return_individual:
            individual_logits = {
                name: self.stream_classifiers[name](stream_features[name])
                for name in self.stream_order
            }
            info['individual_logits'] = individual_logits

        return logits, info


# =============================================================================
# SINGLE STREAM BASELINE
# =============================================================================

class SingleStreamBaseline(nn.Module):
    """Routing-free baseline: a small MLP classifier over one CLIP variant."""

    def __init__(self, config: ImageNetCollectiveConfig, variant_name: str, input_dim: int):
        super().__init__()
        self.variant_name = variant_name

        hidden_dim = config.feature_dim
        # Linear -> LayerNorm -> GELU -> Dropout -> Linear, in that order.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, config.num_classes),
        ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Map a batch of feature vectors to class logits."""
        logits = self.classifier(features)
        return logits


# =============================================================================
# TRAINING
# =============================================================================

def warmup_cosine_factor(step: int, warmup_steps: int, total_steps: int) -> float:
    """Return the learning-rate multiplier for ``step``.

    Linear warmup from 0 to 1 over ``warmup_steps``, then cosine decay from
    1 to 0 over the remaining ``total_steps - warmup_steps`` steps. If there
    are no decay steps (``total_steps <= warmup_steps``) the factor stays at
    1.0 after warmup instead of dividing by zero.
    """
    if step < warmup_steps:
        return step / warmup_steps
    decay_steps = max(1, total_steps - warmup_steps)
    progress = min(1.0, (step - warmup_steps) / decay_steps)
    return float(0.5 * (1 + np.cos(np.pi * progress)))


def train_collective(
    model: ImageNetCollective,
    train_loader: DataLoader,
    val_loader: DataLoader,
    config: ImageNetCollectiveConfig,
    aux_stream_weight: float = 0.0,
):
    """Train collective with AMP.

    Uses AdamW, a per-step warmup + cosine schedule, gradient-norm clipping
    at 1.0 and, when ``config.use_amp`` is set, autocast with a GradScaler.
    The model is validated with ``evaluate_collective`` after every epoch.

    Args:
        model: The collective to train (already on ``config.device``).
        train_loader: Yields ``(features_dict, labels)`` training batches.
        val_loader: Validation batches in the same format.
        config: Supplies epochs, lr, weight decay, warmup, AMP and device.
        aux_stream_weight: Weight of an optional auxiliary loss, the sum of
            cross-entropies of the per-stream logits. With the default 0.0
            only the fused logits are trained, which means nothing in this
            function backpropagates through ``model.stream_classifiers`` and
            the per-stream accuracies reported by validation come from heads
            that were never updated.
            NOTE(review): confirm whether a non-zero default was intended.

    Returns:
        ``(history, best_acc)`` where ``history`` has the per-epoch lists
        'train_loss', 'train_acc', 'val_acc' and 'stream_accs', and
        ``best_acc`` is the highest validation accuracy seen.
    """
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.lr,
        weight_decay=config.weight_decay,
    )

    # Warmup + cosine schedule, stepped once per batch
    total_steps = len(train_loader) * config.epochs
    warmup_steps = len(train_loader) * config.warmup_epochs

    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda step: warmup_cosine_factor(step, warmup_steps, total_steps),
    )
    scaler = GradScaler() if config.use_amp else None

    use_aux = aux_stream_weight > 0

    def forward_losses(features, labels):
        """Return (fused logits, fused-logit loss, objective to backprop)."""
        logits, info = model(features, return_individual=use_aux)
        loss = F.cross_entropy(logits, labels)
        objective = loss
        if use_aux:
            aux = sum(
                F.cross_entropy(stream_logits, labels)
                for stream_logits in info['individual_logits'].values()
            )
            objective = loss + aux_stream_weight * aux
        return logits, loss, objective

    history = defaultdict(list)
    best_acc = 0

    for epoch in range(config.epochs):
        model.train()
        epoch_loss = 0
        correct = 0
        total = 0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.epochs}")

        for features, labels in pbar:
            # Move to device
            features = {k: v.to(config.device, non_blocking=True) for k, v in features.items()}
            labels = labels.to(config.device, non_blocking=True)

            optimizer.zero_grad()

            if config.use_amp:
                with autocast():
                    logits, loss, objective = forward_losses(features, labels)

                scaler.scale(objective).backward()
                # Unscale first so the clipping threshold applies to true gradients.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                logits, loss, objective = forward_losses(features, labels)
                objective.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            scheduler.step()

            # Read the loss once per step; the reported loss is always the
            # fused-logit cross-entropy, with or without the auxiliary term.
            loss_value = loss.item()
            epoch_loss += loss_value * labels.size(0)
            correct += (logits.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)

            pbar.set_postfix({
                'loss': f"{loss_value:.4f}",
                'acc': f"{correct/total*100:.1f}%",
                'lr': f"{scheduler.get_last_lr()[0]:.2e}",
            })

        # Validate
        val_acc, val_stream_accs = evaluate_collective(model, val_loader, config)

        # An empty training loader yields 0 samples; avoid dividing by zero.
        seen = max(total, 1)
        mean_loss = epoch_loss / seen

        history['train_loss'].append(mean_loss)
        history['train_acc'].append(correct / seen)
        history['val_acc'].append(val_acc)
        history['stream_accs'].append(val_stream_accs)

        # Log
        stream_str = ' | '.join([f"{k[:4]}: {v*100:.1f}%" for k, v in val_stream_accs.items()])
        tqdm.write(f"Epoch {epoch+1:3d} | Loss: {mean_loss:.4f} | "
                  f"Val: {val_acc*100:.2f}% | {stream_str}")

        if val_acc > best_acc:
            best_acc = val_acc
            tqdm.write(f"  ★ New best: {best_acc*100:.2f}%")

    return dict(history), best_acc


def evaluate_collective(
    model: ImageNetCollective,
    loader: DataLoader,
    config: ImageNetCollectiveConfig,
) -> Tuple[float, Dict[str, float]]:
    """Measure top-1 accuracy of the fused logits and of each per-stream head.

    The model is put in eval mode and run without gradients over ``loader``
    (under autocast when ``config.use_amp`` is set).

    Returns:
        ``(accuracy, stream_accuracies)`` as fractions of all samples seen;
        ``stream_accuracies`` is keyed by variant name.
    """
    model.eval()

    n_seen = 0
    n_hits = 0
    hits_per_stream = defaultdict(int)

    with torch.no_grad():
        for batch_features, targets in tqdm(loader, desc="Eval", leave=False):
            on_device = {}
            for variant, tensor in batch_features.items():
                on_device[variant] = tensor.to(config.device, non_blocking=True)
            targets = targets.to(config.device, non_blocking=True)

            # Per-stream logits are only produced when explicitly requested.
            if config.use_amp:
                with autocast():
                    fused_logits, extras = model(on_device, return_individual=True)
            else:
                fused_logits, extras = model(on_device, return_individual=True)

            predictions = fused_logits.argmax(dim=1)
            n_hits += (predictions == targets).sum().item()
            n_seen += targets.size(0)

            for variant, head_logits in extras['individual_logits'].items():
                head_predictions = head_logits.argmax(dim=1)
                hits_per_stream[variant] += (head_predictions == targets).sum().item()

    accuracy = n_hits / n_seen
    stream_accuracies = {
        variant: hits / n_seen for variant, hits in hits_per_stream.items()
    }
    return accuracy, stream_accuracies


def train_baseline(
    variant_name: str,
    input_dim: int,
    train_loader: DataLoader,
    val_loader: DataLoader,
    config: ImageNetCollectiveConfig,
):
    """Train single stream baseline.

    Builds a ``SingleStreamBaseline`` for ``variant_name`` and trains it with
    AdamW and an epoch-wise cosine schedule, reading only that variant's
    tensor from each multi-variant batch. Validation runs after every epoch.

    Args:
        variant_name: Key of the variant to read from each batch's feature dict.
        input_dim: Width of that variant's feature vectors.
        train_loader: Yields ``(features_dict, labels)`` training batches.
        val_loader: Validation batches in the same format.
        config: Supplies epochs, lr, weight decay, AMP flag and device.

    Returns:
        ``(history, best_acc)`` where ``history`` has the per-epoch lists
        'train_loss', 'train_acc' and 'val_acc', and ``best_acc`` is the
        highest validation accuracy seen.
    """
    model = SingleStreamBaseline(config, variant_name, input_dim).to(config.device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.epochs)
    scaler = GradScaler() if config.use_amp else None

    history = defaultdict(list)
    best_acc = 0

    for epoch in range(config.epochs):
        model.train()
        epoch_loss = 0
        correct = 0
        total = 0

        for features, labels in tqdm(train_loader, desc=f"{variant_name} E{epoch+1}", leave=False):
            feat = features[variant_name].to(config.device, non_blocking=True)
            labels = labels.to(config.device, non_blocking=True)

            optimizer.zero_grad()

            if config.use_amp:
                with autocast():
                    logits = model(feat)
                    loss = F.cross_entropy(logits, labels)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                logits = model(feat)
                loss = F.cross_entropy(logits, labels)
                loss.backward()
                optimizer.step()

            epoch_loss += loss.item() * labels.size(0)
            correct += (logits.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)

        scheduler.step()

        # Training statistics were previously accumulated and then dropped;
        # record them so the history matches train_collective's.
        seen = max(total, 1)  # an empty loader must not divide by zero
        history['train_loss'].append(epoch_loss / seen)
        history['train_acc'].append(correct / seen)

        # Validate
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for features, labels in val_loader:
                feat = features[variant_name].to(config.device, non_blocking=True)
                labels = labels.to(config.device, non_blocking=True)

                if config.use_amp:
                    with autocast():
                        logits = model(feat)
                else:
                    logits = model(feat)

                val_correct += (logits.argmax(dim=1) == labels).sum().item()
                val_total += labels.size(0)

        val_acc = val_correct / val_total
        history['val_acc'].append(val_acc)

        if val_acc > best_acc:
            best_acc = val_acc

        # Log the first epoch and every fifth one.
        if (epoch + 1) % 5 == 0 or epoch == 0:
            tqdm.write(f"{variant_name} Epoch {epoch+1:3d} | Val: {val_acc*100:.2f}%")

    return dict(history), best_acc


# =============================================================================
# VISUALIZATION
# =============================================================================

def plot_results(
    collective_history: Dict,
    baseline_results: Dict[str, float],
    config: ImageNetCollectiveConfig,
    save_path: str = "imagenet_collective_results.png",
):
    """Render a 2x2 summary figure, save it to ``save_path`` and show it.

    Panels: validation curves (collective and per-stream heads), final
    collective accuracy next to the baselines, per-stream collective vs
    standalone bars, and a monospace text summary.

    Args:
        collective_history: Needs the per-epoch lists 'val_acc' and
            'stream_accs' (the latter holding dicts keyed by variant name).
        baseline_results: Variant name -> baseline accuracy as a fraction.
        config: Supplies the variant names via ``clip_variants``.
        save_path: Output image path.
    """
    def short(name):
        # Drop the common prefix so tick labels stay readable.
        return name.replace('clip_vit_', '')

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    val_curve = collective_history['val_acc']
    stream_curve = collective_history['stream_accs']
    epochs = range(1, len(val_curve) + 1)

    # --- Top-left: validation accuracy over time ---
    ax = axes[0, 0]
    ax.plot(epochs, [acc * 100 for acc in val_curve], 'b-',
            label='Collective', linewidth=2)
    for variant in config.clip_variants.keys():
        per_epoch = [snapshot[variant] * 100 for snapshot in stream_curve]
        ax.plot(epochs, per_epoch, '--', label=f'{variant} (in coll.)', alpha=0.7)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Validation Accuracy (%)')
    ax.set_title('Training Progress')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

    # Last-epoch numbers, in percent
    final_collective = val_curve[-1] * 100
    final_streams = {k: v * 100 for k, v in stream_curve[-1].items()}

    # --- Top-right: collective vs individual baselines ---
    ax = axes[0, 1]

    bar_names = ['Collective', *baseline_results.keys()]
    bar_values = [final_collective, *(v * 100 for v in baseline_results.values())]
    bar_colors = ['steelblue'] + ['coral'] * len(baseline_results)
    positions = range(len(bar_names))

    bars = ax.bar(positions, bar_values, color=bar_colors)
    ax.set_xticks(positions)
    ax.set_xticklabels([short(n).replace('_', '\n') for n in bar_names], fontsize=8)
    ax.set_ylabel('Validation Accuracy (%)')
    ax.set_title('Final Accuracy: Collective vs Individual Baselines')

    # Value label just above each bar
    for bar, val in zip(bars, bar_values):
        x_center = bar.get_x() + bar.get_width()/2
        ax.text(x_center, bar.get_height() + 0.3,
                f'{val:.1f}%', ha='center', va='bottom', fontsize=8)

    # --- Bottom-left: per-stream accuracy in collective vs baseline ---
    ax = axes[1, 0]

    stream_names = list(config.clip_variants.keys())
    x = np.arange(len(stream_names))
    width = 0.35

    in_collective = [final_streams[n] for n in stream_names]
    standalone = [baseline_results[n]*100 for n in stream_names]

    ax.bar(x - width/2, in_collective, width, label='In Collective', color='steelblue')
    ax.bar(x + width/2, standalone, width, label='Standalone', color='coral')

    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Per-Stream: Collective vs Standalone')
    ax.set_xticks(x)
    ax.set_xticklabels([short(n) for n in stream_names], fontsize=8, rotation=45)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # --- Bottom-right: text summary ---
    ax = axes[1, 1]
    ax.axis('off')

    best_baseline = max(baseline_results.values()) * 100
    improvement = final_collective - best_baseline

    summary = f"""
    IMAGENET COLLECTIVE RESULTS
    ════════════════════════════════════════════════════════
    
    Collective:           {final_collective:.2f}%
    Best Individual:      {best_baseline:.2f}%
    
    Improvement:          {improvement:+.2f}%
    
    ════════════════════════════════════════════════════════
    
    Per-stream in collective:
    """

    # final_streams is already in percent; baseline_results holds fractions.
    for name, acc in final_streams.items():
        short_name = short(name)
        summary += f"\n      {short_name:<15}: {acc:.2f}%"

    summary += """
    
    ════════════════════════════════════════════════════════
    
    Individual baselines:
    """

    for name, acc in baseline_results.items():
        short_name = short(name)
        summary += f"\n      {short_name:<15}: {acc*100:.2f}%"

    ax.text(0.05, 0.95, summary, fontsize=10, family='monospace',
            verticalalignment='top', transform=ax.transAxes)

    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"\nSaved: {save_path}")


# =============================================================================
# MAIN
# =============================================================================

def main():
    """Run the full experiment: load data, train the collective, train one
    baseline per variant, print a comparison and plot it.

    Returns:
        ``(collective, collective_history, baseline_results)``.
    """
    rule = "=" * 70

    def section(title):
        # Section banner preceded by a blank line.
        print("\n" + rule)
        print(title)
        print(rule)

    print(rule)
    print("  ImageNet Multi-CLIP Collective Experiment")
    print("  Pre-extracted Features via GlobalFractalRouter")
    print(rule)

    config = ImageNetCollectiveConfig()

    print("\nConfig:")
    print(f"  Dataset: {config.dataset_name}")
    print(f"  Variants: {len(config.clip_variants)}")
    for name, dim in config.clip_variants.items():
        print(f"    - {name}: {dim}D")
    print(f"  Feature dim: {config.feature_dim}")
    print(f"  Epochs: {config.epochs}")
    print(f"  Batch size: {config.batch_size}")
    print(f"  AMP: {config.use_amp}")
    print(f"  Device: {config.device}")

    # ---- Data ----
    section("  Loading Data")

    train_loader, val_loader = get_dataloaders(config)
    print(f"\n  Train batches: {len(train_loader)}")
    print(f"  Val batches: {len(val_loader)}")

    # ---- Collective ----
    section("  Training COLLECTIVE")

    collective = ImageNetCollective(config).to(config.device)

    n_params = sum(p.numel() for p in collective.parameters())
    print(f"\n  Parameters: {n_params:,}")

    collective_history, collective_best = train_collective(
        collective, train_loader, val_loader, config
    )

    # ---- Baselines (one per variant, same loaders) ----
    section("  Training BASELINES (Individual Streams)")

    baseline_results = {}

    for variant_name, input_dim in config.clip_variants.items():
        print(f"\n  Training: {variant_name}")
        _, variant_best = train_baseline(
            variant_name, input_dim, train_loader, val_loader, config
        )
        baseline_results[variant_name] = variant_best
        print(f"  {variant_name} best: {variant_best*100:.2f}%")

    # ---- Results ----
    section("  FINAL RESULTS")

    best_individual = max(baseline_results.values())
    print(f"\n  Collective:      {collective_best*100:.2f}%")
    print(f"  Best individual: {best_individual*100:.2f}%")
    print(f"  Improvement:     {(collective_best - best_individual)*100:+.2f}%")

    print("\n  Per-stream final (in collective):")
    last_epoch_streams = collective_history['stream_accs'][-1]
    for name, acc in last_epoch_streams.items():
        print(f"    {name}: {acc*100:.2f}%")

    print("\n  Individual baselines:")
    for name, acc in baseline_results.items():
        print(f"    {name}: {acc*100:.2f}%")

    plot_results(collective_history, baseline_results, config)

    return collective, collective_history, baseline_results


# Script entry point: run the experiment only when executed directly, and keep
# main()'s return value in the module-level name `results`.
if __name__ == "__main__":
    results = main()