Trainer v2 update: necessary elements included

trainer_v2.py  CHANGED  (+296 −147)
@@ -1,7 +1,7 @@
 # =====================================================================================
 # SD1.5 Flow-Matching Trainer — David-Driven Adaptive Timestep Sampling
 # Quartermaster: Mirel
-#
+# FIXED: David nested output handling + reliability filtering + clean checkpoint loading
 # =====================================================================================
 from __future__ import annotations
 import os, json, math, random, re, shutil
@@ -86,7 +86,8 @@ class BaseConfig:
     timestep_shift: float = 3.0          # SD3-style shift (higher = bias toward clean)
     base_jitter: int = 5                 # Base ±jitter around bin center
     adaptive_chaos: bool = True          # Scale jitter by pattern difficulty
-    profile_samples: int =
+    profile_samples: int = 2500          # Samples to profile David's difficulty
+    reliability_threshold: float = 0.15  # Minimum accuracy to trust David's guidance

     # Scheduler
     num_train_timesteps: int = 1000
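A note on the `timestep_shift` knob: the shift formula itself never appears in this diff, so the sketch below uses the standard SD3 mapping as an assumption; which end of the range counts as "clean" depends on the trainer's timestep convention, and the function name is illustrative.

# Sketch of an SD3-style timestep shift (ASSUMPTION: this trainer uses the
# standard SD3 form; the actual mapping is not visible in this diff).
import torch

def sd3_shift(u: torch.Tensor, shift: float = 3.0) -> torch.Tensor:
    # u is a uniform draw in [0, 1]; shift > 1 biases samples toward u = 1
    return (shift * u) / (1.0 + (shift - 1.0) * u)

u = torch.rand(8)
t = (sd3_shift(u, shift=3.0) * 999).round().long()  # biased timesteps in [0, 999]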
@@ -114,13 +115,16 @@ class BaseConfig:
 class DavidWeightedTimestepSampler:
     """
     Samples timesteps weighted by David's inherent difficulty + SD3 shift + adaptive chaos.
+    FIXED: Properly handles nested GeoDavidCollective output structure.
+    FIXED: Filters out unreliable bins (accuracy < threshold).
     """
-    def __init__(self, num_timesteps=1000, num_bins=100, shift=3.0, base_jitter=5, adaptive_chaos=True):
+    def __init__(self, num_timesteps=1000, num_bins=100, shift=3.0, base_jitter=5, adaptive_chaos=True, reliability_threshold=0.15):
        self.num_timesteps = num_timesteps
        self.num_bins = num_bins
        self.shift = shift
        self.base_jitter = base_jitter
        self.adaptive_chaos = adaptive_chaos
+       self.reliability_threshold = reliability_threshold

        self.difficulty_weights = None   # Timestep difficulty
        self.pattern_difficulty = None   # Pattern confusion per bin
@@ -162,38 +166,38 @@ class DavidWeightedTimestepSampler:
         # Pool features
         pooled = {name: f.mean(dim=(2, 3)) for name, f in feats.items()}

-        # Get David's outputs
+        # Get David's outputs (NESTED STRUCTURE!)
         outputs = david(pooled, t.float())

+        # ================================================================
+        # FIXED: Aggregate across blocks
+        # ================================================================
+
         # 1. Timestep difficulty (from classification error)
-
-        for
-        if
-
-        break
+        timestep_logits_list = []
+        for block_name, block_out in outputs.items():
+            if 'timestep_logits' in block_out:
+                timestep_logits_list.append(block_out['timestep_logits'])

-        if
-
-
-        ts_logits = torch.stack(list(ts_logits.values())).mean(0)
-
+        if timestep_logits_list:
+            # Average predictions across blocks
+            ts_logits = torch.stack(timestep_logits_list).mean(0)
             preds = ts_logits.argmax(dim=-1)
+
             for pred, true_bin in zip(preds, t_bins):
                 bin_idx = true_bin.item()
                 correct_per_bin[bin_idx] += (pred == true_bin).float().item()
                 total_per_bin[bin_idx] += 1

         # 2. Pattern difficulty (from entropy)
-
-        for
-        if
-
-        break
+        pattern_logits_list = []
+        for block_name, block_out in outputs.items():
+            if 'pattern_logits' in block_out:
+                pattern_logits_list.append(block_out['pattern_logits'])

-        if
-
-
-        pt_logits = torch.stack(list(pt_logits.values())).mean(0)
+        if pattern_logits_list:
+            # Average predictions across blocks
+            pt_logits = torch.stack(pattern_logits_list).mean(0)

             P = pt_logits.softmax(-1)
             ent = -(P * P.clamp_min(1e-9).log()).sum(-1)
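The shape of this fix is easiest to see on a toy nested output. Block names and head sizes below are hypothetical, not taken from GeoDavidCollective:

# Toy nested output in the Dict[block_name, Dict[str, Tensor]] layout the
# loop above expects (block names and sizes are made up for illustration).
import torch

outputs = {
    "down_0": {"timestep_logits": torch.randn(4, 100), "pattern_logits": torch.randn(4, 16)},
    "mid":    {"timestep_logits": torch.randn(4, 100)},   # no pattern head here
}

ts_list = [o["timestep_logits"] for o in outputs.values() if "timestep_logits" in o]
ts_logits = torch.stack(ts_list).mean(0)   # average over blocks -> (4, 100)
preds = ts_logits.argmax(dim=-1)           # one predicted timestep bin per sample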
@@ -204,23 +208,78 @@ class DavidWeightedTimestepSampler:
             entropy_per_bin[bin_idx] += norm_ent[i].item()
             entropy_count_per_bin[bin_idx] += 1

-        # Compute
+        # Compute accuracy per bin
         accuracy_per_bin = correct_per_bin / (total_per_bin.clamp(min=1))
-
+
+        # ========================================================================
+        # RELIABILITY FILTERING: Disable bins with accuracy < threshold
+        # ========================================================================
+        reliable_mask = accuracy_per_bin >= self.reliability_threshold
+        num_reliable = reliable_mask.sum().item()
+        num_disabled = self.num_bins - num_reliable
+
+        print(f"\n🎯 Reliability Analysis:")
+        print(f"   Threshold: {self.reliability_threshold:.0%}")
+        print(f"   Reliable bins: {num_reliable}/{self.num_bins}")
+        print(f"   Disabled bins: {num_disabled}/{self.num_bins}")
+
+        if num_disabled > 0:
+            disabled_bins = torch.where(~reliable_mask)[0].tolist()
+            disabled_accs = [accuracy_per_bin[i].item() for i in disabled_bins]
+            print(f"   Disabled: {disabled_bins[:10]}{'...' if len(disabled_bins) > 10 else ''}")
+            print(f"   (accuracies: {[f'{a:.1%}' for a in disabled_accs[:10]]})")
+
+        # Create difficulty weights ONLY for reliable bins
+        if num_reliable == 0:
+            print("\n⚠️ WARNING: No reliable bins found! Falling back to uniform sampling.")
+            self.difficulty_weights = torch.ones(self.num_bins) / self.num_bins
+            self.pattern_difficulty = torch.ones(self.num_bins) * 0.5
+            return self.difficulty_weights
+
+        # Compute difficulty (inverse accuracy) for reliable bins
+        timestep_difficulty = torch.zeros(self.num_bins)
+        timestep_difficulty[reliable_mask] = (1.0 - accuracy_per_bin[reliable_mask]) + 0.1
+
+        # Zero out unreliable bins (won't be sampled)
+        timestep_difficulty[~reliable_mask] = 0.0
+
+        # Normalize weights over reliable bins only
         self.difficulty_weights = timestep_difficulty / timestep_difficulty.sum()

         # Compute pattern difficulty (average entropy per bin)
         self.pattern_difficulty = entropy_per_bin / (entropy_count_per_bin.clamp(min=1))
         self.pattern_difficulty = self.pattern_difficulty.clamp(min=0.1, max=1.0)

-
-
-
-
-
+        # Set entropy to 0.5 (neutral) for disabled bins
+        self.pattern_difficulty[~reliable_mask] = 0.5
+
+        # ========================================================================
+        # REPORT
+        # ========================================================================
+        print(f"\n✓ David difficulty map computed (filtered):")
+        print(f"   Avg timestep accuracy (all bins): {accuracy_per_bin.mean():.2%}")
+        print(f"   Avg timestep accuracy (reliable): {accuracy_per_bin[reliable_mask].mean():.2%}")
+
+        # Find hardest/easiest among reliable bins
+        reliable_indices = torch.where(reliable_mask)[0]
+        if len(reliable_indices) > 0:
+            hardest_idx = reliable_indices[accuracy_per_bin[reliable_mask].argmin()].item()
+            easiest_idx = reliable_indices[accuracy_per_bin[reliable_mask].argmax()].item()
+
+            print(f"   Hardest reliable bin: {hardest_idx} ({accuracy_per_bin[hardest_idx]:.2%} acc)")
+            print(f"   Easiest reliable bin: {easiest_idx} ({accuracy_per_bin[easiest_idx]:.2%} acc)")
+
+        print(f"   Avg pattern entropy (reliable): {self.pattern_difficulty[reliable_mask].mean():.3f}")
+
+        # Show sampling distribution (top 10 weighted bins)
+        top_weights, top_bins = self.difficulty_weights.topk(10)
+        print(f"\n📊 Top 10 sampled bins (by difficulty weight):")
+        for i, (bin_idx, weight) in enumerate(zip(top_bins.tolist(), top_weights.tolist())):
+            acc = accuracy_per_bin[bin_idx].item()
+            print(f"   {i+1}. Bin {bin_idx:2d}: weight={weight:.3f} (acc={acc:.1%})")

         return self.difficulty_weights
-
+
     def sample(self, batch_size: int) -> List[int]:
         """Sample timesteps with David weighting + shift + adaptive chaos."""
         if self.difficulty_weights is None:
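On a toy 10-bin accuracy vector, the filtering above reduces to a few lines. This is a minimal numeric check, not trainer code:

# Minimal numeric check of the reliability filter (threshold 0.15, 10 bins).
import torch

acc = torch.tensor([0.02, 0.30, 0.55, 0.10, 0.80, 0.20, 0.05, 0.45, 0.90, 0.15])
mask = acc >= 0.15                         # bins 0, 3, 6 fall below -> disabled
w = torch.zeros_like(acc)
w[mask] = (1.0 - acc[mask]) + 0.1          # harder (less accurate) bins weigh more
w = w / w.sum()                            # renormalized over reliable bins only
assert w[~mask].eq(0).all()                # disabled bins can never be drawn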
@@ -424,6 +483,12 @@ class DavidLoader:
         cfg.block_weights = self.hf_config["block_weights"]

 class DavidAssessor(nn.Module):
+    """
+    CORRECTED: Properly handles GeoDavidCollective's nested multi-block output structure.
+
+    GeoDavidCollective returns: Dict[block_name, Dict[str, Tensor]]
+    Not a flat Dict[str, Tensor]!
+    """
     def __init__(self, gdc: GeoDavidCollective, pooling: str):
         super().__init__()
         self.gdc = gdc
@@ -435,56 +500,88 @@ class DavidAssessor(nn.Module):
     @torch.no_grad()
     def forward(self, feats_student: Dict[str, torch.Tensor], t: torch.LongTensor
                 ) -> Tuple[Dict[str,float], Dict[str,float], Dict[str,float]]:
+        """
+        Assess student features using David's geometric knowledge.
+
+        Returns:
+            e_t: Dict[block_name, timestep_error] - classification error per block
+            e_p: Dict[block_name, pattern_entropy] - normalized entropy per block
+            coh: Dict[block_name, coherence] - geometric coherence per block
+        """
+        # Pool spatial features
         Zs = self._pool(feats_student)
+
+        # Forward through GeoDavidCollective
+        # Returns: Dict[block_name, Dict[str, Tensor]]
         outs = self.gdc(Zs, t.float())
+
+        # Initialize output dicts
         e_t, e_p, coh = {}, {}, {}
-
-        ts_key = None
-        for key in ["timestep_logits", "logits_timestep", "timestep_head_logits"]:
-            if key in outs: ts_key = key; break

-
-        for key in ["pattern_logits", "logits_pattern", "pattern_head_logits"]:
-            if key in outs: pt_key = key; break
-
+        # Compute timestep bins for targets
         t_bins = (t // 10).to(next(self.gdc.parameters()).device)
-
-
-
+
+        # ====================================================================
+        # TIMESTEP ERROR - Per-block
+        # ====================================================================
+        for block_name, block_out in outs.items():
+            if 'timestep_logits' in block_out:
+                ts_logits = block_out['timestep_logits']
                 ce = F.cross_entropy(ts_logits, t_bins, reduction="mean")
-
-
-
+                e_t[block_name] = float(ce.item())
+
+        # If no timestep predictions, set all errors to 0
+        if not e_t:
+            for name in Zs.keys():
+                e_t[name] = 0.0
+
+        # ====================================================================
+        # PATTERN ENTROPY - Per-block
+        # ====================================================================
+        for block_name, block_out in outs.items():
+            if 'pattern_logits' in block_out:
+                pt_logits = block_out['pattern_logits']
+
+                # Compute normalized entropy
                 P = pt_logits.softmax(-1)
                 ent = -(P * (P.clamp_min(1e-9)).log()).sum(-1).mean()
-
-
+                norm_ent = ent / math.log(P.shape[-1])  # Normalize by max entropy
+
+                e_p[block_name] = float(norm_ent.item())
+
+        # If no pattern predictions, set all entropies to 0
+        if not e_p:
+            for name in Zs.keys():
+                e_p[name] = 0.0
+
+        # ====================================================================
+        # COHERENCE (from Cantor alphas)
+        # ====================================================================
         try:
             alphas = self.gdc.get_cantor_alphas()
+            # Alphas should be close to 0.5 for good coherence
+            # Map to coherence: 1.0 = perfect (alpha=0.5), lower = worse
+            for name, alpha in alphas.items():
+                # Coherence = 1 - 2*|alpha - 0.5|
+                # When alpha=0.5: coherence=1.0
+                # When alpha=0 or 1: coherence=0.0
+                coherence = 1.0 - 2.0 * abs(alpha - 0.5)
+                coh[name] = max(0.0, min(1.0, coherence))
         except Exception:
-
-
+            # Fallback: assume perfect coherence
+            for name in Zs.keys():
+                coh[name] = 1.0
+
+        # Ensure all input blocks have values (fill missing with block averages)
         for name in Zs.keys():
-
-
+            if name not in e_t:
+                # Use average of available blocks
+                e_t[name] = sum(e_t.values()) / max(len(e_t), 1) if e_t else 0.0
+            if name not in e_p:
+                e_p[name] = sum(e_p.values()) / max(len(e_p), 1) if e_p else 0.0
+            if name not in coh:
+                coh[name] = sum(coh.values()) / max(len(coh), 1) if coh else 1.0
+
         return e_t, e_p, coh

 class BlockPenaltyFusion:
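Both per-block signals are bounded by construction, which a quick standalone check makes visible. The 16-class pattern head below is a made-up size:

# Standalone check of the normalized-entropy and alpha -> coherence mappings
# used above (toy logits; 16 pattern classes is hypothetical).
import math
import torch

logits = torch.randn(4, 16)
P = logits.softmax(-1)
ent = -(P * P.clamp_min(1e-9).log()).sum(-1).mean()
norm_ent = (ent / math.log(16)).item()     # 0 = fully confident, 1 = uniform

for alpha in (0.5, 0.25, 1.0):
    coherence = 1.0 - 2.0 * abs(alpha - 0.5)
    print(f"alpha={alpha:.2f} -> coherence={coherence:.2f}")  # 1.00, 0.50, 0.00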
@@ -528,13 +625,15 @@ class FlowMatchDavidTrainer:
         print(f"   SD3 shift: {cfg.timestep_shift}")
         print(f"   Base jitter: ±{cfg.base_jitter}")
         print(f"   Adaptive chaos: {cfg.adaptive_chaos}")
+        print(f"   Reliability threshold: {cfg.reliability_threshold:.0%}")

         self.timestep_sampler = DavidWeightedTimestepSampler(
             num_timesteps=cfg.num_train_timesteps,
             num_bins=100,
             shift=cfg.timestep_shift if cfg.use_david_weights else 0.0,
             base_jitter=cfg.base_jitter,
-            adaptive_chaos=cfg.adaptive_chaos
+            adaptive_chaos=cfg.adaptive_chaos,
+            reliability_threshold=cfg.reliability_threshold
         )

         if cfg.use_david_weights:
@@ -558,95 +657,43 @@ class FlowMatchDavidTrainer:
         self.sched = torch.optim.lr_scheduler.CosineAnnealingLR(self.opt, T_max=cfg.epochs * len(self.loader))
         self.scaler = torch.cuda.amp.GradScaler(enabled=cfg.amp)

-        # Load
-
-        if not emergency_path.exists():
-            print("\n🔍 Emergency checkpoint not found locally, checking HuggingFace...")
-            emergency_path = self._download_emergency_checkpoint()
-
-        if emergency_path and emergency_path.exists():
-            self._load_emergency_checkpoint(emergency_path)
-        elif cfg.continue_training:
+        # Load latest checkpoint from HuggingFace if continuing training
+        if cfg.continue_training:
             self._load_latest_from_hf()

         self.writer = SummaryWriter(log_dir=os.path.join(cfg.out_dir, cfg.run_name))

-    def _download_emergency_checkpoint(self) -> Optional[Path]:
-        """Download emergency checkpoint from HuggingFace backup repo."""
-        emergency_repo = "AbstractPhil/sd15-flow-emergency-backup"
-        emergency_file = "EMERGENCY_SAVE_SUCCESS.pt"
-
-        try:
-            print(f"📥 Downloading emergency checkpoint from {emergency_repo}...")
-            local_path = hf_hub_download(
-                repo_id=emergency_repo,
-                filename=emergency_file,
-                repo_type="model",
-                cache_dir="./_emergency_cache"
-            )
-
-            target_path = Path("./EMERGENCY_SAVE_SUCCESS.pt")
-            shutil.copy(local_path, target_path)
-
-            size_mb = target_path.stat().st_size / 1e6
-            print(f"✅ Downloaded emergency checkpoint ({size_mb:.1f} MB)")
-            return target_path
-
-        except Exception as e:
-            print(f"⚠️ Could not download emergency checkpoint: {e}")
-            return None
-
-    def _load_emergency_checkpoint(self, path: Path):
-        """Load emergency checkpoint with student_unet structure."""
-        try:
-            print(f"\n🚨 Found emergency checkpoint: {path}")
-            checkpoint = torch.load(path, map_location='cpu')
-
-            if 'student_unet' in checkpoint:
-                print("📦 Loading emergency checkpoint format...")
-                missing, unexpected = self.student.unet.load_state_dict(checkpoint['student_unet'], strict=False)
-                print(f"✓ Loaded student UNet")
-
-            if 'opt' in checkpoint:
-                self.opt.load_state_dict(checkpoint['opt'])
-                print("✓ Loaded optimizer state")
-
-            if 'sched' in checkpoint:
-                self.sched.load_state_dict(checkpoint['sched'])
-                print("✓ Loaded scheduler state")
-
-            if 'gstep' in checkpoint:
-                self.start_gstep = checkpoint['gstep']
-                self.start_epoch = self.start_gstep // len(self.loader)
-                print(f"✓ Resuming from global step {self.start_gstep} (epoch ~{self.start_epoch})")
-
-            print("✅ Emergency checkpoint loaded successfully!")
-
-        except Exception as e:
-            print(f"⚠️ Failed to load emergency checkpoint: {e}")
-
     def _load_latest_from_hf(self):
+        """Load the most recent checkpoint from HuggingFace repo."""
         if not self.cfg.hf_repo_id:
+            print("ℹ️ No HuggingFace repo specified, starting from scratch\n")
             return

         try:
             api = HfApi()
             print(f"\n🔍 Searching for latest checkpoint in {self.cfg.hf_repo_id}...")

+            # List all files in the repo
             files = api.list_repo_files(repo_id=self.cfg.hf_repo_id, repo_type="model")
+
+            # Find all epoch checkpoints (format: {run_name}_e{epoch}.pt)
             epochs = []
             for f in files:
-                if f.endswith('.pt'):
+                if f.endswith('.pt') and 'final' not in f.lower():
                     match = re.search(r'_e(\d+)\.pt$', f)
                     if match:
-
+                        epoch_num = int(match.group(1))
+                        epochs.append((epoch_num, f))

             if not epochs:
+                print("ℹ️ No previous checkpoints found, starting from scratch\n")
                 return

+            # Get the latest epoch
             latest_epoch, latest_file = max(epochs, key=lambda x: x[0])
-            print(f"📥
+            print(f"📥 Found latest checkpoint: {latest_file} (epoch {latest_epoch})")

+            # Download checkpoint
             local_path = hf_hub_download(
                 repo_id=self.cfg.hf_repo_id,
                 filename=latest_file,
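The checkpoint scan resolves the newest epoch purely from filenames. A self-contained run of the same logic, with hypothetical names:

# How the `_e{epoch}.pt` scan above picks the newest checkpoint
# (filenames are hypothetical).
import re

files = ["run_e3.pt", "run_e12.pt", "run_final.pt", "notes.md"]
epochs = []
for f in files:
    if f.endswith(".pt") and "final" not in f.lower():
        m = re.search(r"_e(\d+)\.pt$", f)
        if m:
            epochs.append((int(m.group(1)), f))

latest_epoch, latest_file = max(epochs, key=lambda x: x[0])
print(latest_epoch, latest_file)           # 12 run_e12.pt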
@@ -654,27 +701,58 @@ class FlowMatchDavidTrainer:
                 cache_dir=self.cfg.ckpt_dir
             )

+            # Load checkpoint
+            print(f"📦 Loading checkpoint...")
             checkpoint = torch.load(local_path, map_location='cpu')

-
-
-
-
+            # Load student state dict
+            if 'student' in checkpoint:
+                missing, unexpected = self.student.load_state_dict(checkpoint['student'], strict=False)
+                if missing:
+                    print(f"   ⚠️ Missing keys: {len(missing)}")
+                if unexpected:
+                    print(f"   ⚠️ Unexpected keys: {len(unexpected)}")
+                print(f"   ✓ Loaded student model")
+            else:
+                print(f"   ⚠️ Warning: 'student' key not found in checkpoint")
+                return

+            # Load optimizer state
             if 'opt' in checkpoint:
-
+                try:
+                    self.opt.load_state_dict(checkpoint['opt'])
+                    print("   ✓ Loaded optimizer state")
+                except Exception as e:
+                    print(f"   ⚠️ Failed to load optimizer state: {e}")
+
+            # Load scheduler state
             if 'sched' in checkpoint:
-
+                try:
+                    self.sched.load_state_dict(checkpoint['sched'])
+                    print("   ✓ Loaded scheduler state")
+                except Exception as e:
+                    print(f"   ⚠️ Failed to load scheduler state: {e}")

-
-
+            # Set starting epoch and global step
+            if 'gstep' in checkpoint:
+                self.start_gstep = checkpoint['gstep']
+                self.start_epoch = latest_epoch
+                print(f"   ✓ Resuming from epoch {self.start_epoch + 1}, global step {self.start_gstep}")
+            else:
+                # Fallback: estimate from epoch number
+                self.start_epoch = latest_epoch
+                self.start_gstep = latest_epoch * len(self.loader)
+                print(f"   ✓ Resuming from epoch {self.start_epoch + 1} (estimated step {self.start_gstep})")

-
+            # Cleanup
             del checkpoint
             torch.cuda.empty_cache()

+            print(f"✅ Successfully resumed from checkpoint!\n")
+
         except Exception as e:
-            print(f"⚠️ Failed to load
+            print(f"⚠️ Failed to load checkpoint: {e}")
+            print("   Starting training from scratch...\n")

     def _v_star(self, x_t, t, eps_hat):
         alpha, sigma = self.teacher.alpha_sigma(t)
@@ -692,7 +770,45 @@ class FlowMatchDavidTrainer:
         cfg = self.cfg
         gstep = self.start_gstep

+        # Test prompts for monitoring progress
+        test_prompts = [
+            "a castle at sunset",
+            "a mountain landscape with trees",
+            "a city street at night"
+        ]
+
         for ep in range(self.start_epoch, cfg.epochs):
+            # Sample before epoch to monitor progress
+            if ep > 0 or self.start_epoch > 0:  # Skip first ever epoch
+                print(f"\n🎨 Sampling test images before epoch {ep+1}...")
+                try:
+                    test_imgs = self.sample(test_prompts, steps=30, guidance=7.5)
+
+                    # Save individual images
+                    sample_dir = Path(cfg.out_dir) / "samples"
+                    sample_dir.mkdir(exist_ok=True, parents=True)
+
+                    for i, (img, prompt) in enumerate(zip(test_imgs, test_prompts)):
+                        # Convert to PIL
+                        img_np = ((img.cpu().permute(1,2,0).numpy() + 1) / 2 * 255).astype('uint8')
+                        from PIL import Image
+                        pil_img = Image.fromarray(img_np)
+
+                        # Save with epoch number
+                        safe_prompt = prompt.replace(" ", "_")[:30]
+                        img_path = sample_dir / f"e{ep}_p{i}_{safe_prompt}.png"
+                        pil_img.save(img_path)
+
+                        # Log to tensorboard
+                        self.writer.add_image(f"samples/{safe_prompt}",
+                                              (img + 1) / 2,  # Normalize to [0,1]
+                                              global_step=ep)
+
+                    print(f"✓ Saved {len(test_imgs)} test images to {sample_dir}")
+
+                except Exception as e:
+                    print(f"⚠️ Sampling failed: {e}")
+
             self.student.train()
             pbar = tqdm(self.loader, desc=f"Epoch {ep+1}/{cfg.epochs}",
                         dynamic_ncols=True, leave=True, position=0)
@@ -778,6 +894,33 @@ class FlowMatchDavidTrainer:
             self._save(ep+1, gstep)

         self._save("final", gstep)
+
+        # Final comprehensive sampling
+        print("\n🎨 Generating final test samples...")
+        final_prompts = [
+            "a castle at sunset",
+            "a mountain landscape with trees",
+            "a city street at night",
+            "a portrait of a person",
+            "abstract geometric shapes"
+        ]
+        try:
+            final_imgs = self.sample(final_prompts, steps=30, guidance=7.5)
+
+            sample_dir = Path(cfg.out_dir) / "samples"
+            sample_dir.mkdir(exist_ok=True, parents=True)
+
+            for i, (img, prompt) in enumerate(zip(final_imgs, final_prompts)):
+                from PIL import Image
+                img_np = ((img.cpu().permute(1,2,0).numpy() + 1) / 2 * 255).astype('uint8')
+                pil_img = Image.fromarray(img_np)
+                safe_prompt = prompt.replace(" ", "_")[:30]
+                pil_img.save(sample_dir / f"final_{safe_prompt}.png")
+
+            print(f"✓ Saved {len(final_imgs)} final images to {sample_dir}")
+        except Exception as e:
+            print(f"⚠️ Final sampling failed: {e}")
+
         self.writer.close()

     def _save(self, tag, gstep):
@@ -820,11 +963,17 @@ class FlowMatchDavidTrainer:
     def sample(self, prompts: List[str], steps: Optional[int]=None, guidance: Optional[float]=None) -> torch.Tensor:
         steps = steps or self.cfg.sample_steps
         guidance = guidance if guidance is not None else self.cfg.guidance_scale
+
+        # Get model dtype from student
+        model_dtype = next(self.student.unet.parameters()).dtype
+
         cond_e = self.teacher.encode(prompts)
         uncond_e = self.teacher.encode([""]*len(prompts))
         sched = self.teacher.sched
         sched.set_timesteps(steps, device=self.device)
-
+
+        # Create latents with correct dtype
+        x_t = torch.randn(len(prompts), 4, 64, 64, device=self.device, dtype=model_dtype)

         for t_scalar in sched.timesteps:
             t = torch.full((x_t.shape[0],), t_scalar, device=self.device, dtype=torch.long)
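The dtype fix in the last hunk matters whenever the student runs in fp16: PyTorch requires input and weight dtypes to match, so fp32 latents into an fp16 UNet raise an error on the first layer. A minimal sketch with a stand-in module:

# Why x_t takes its dtype from the UNet weights (nn.Conv2d here is only a
# stand-in for the student UNet, not the trainer's actual model).
import torch
import torch.nn as nn

unet = nn.Conv2d(4, 4, 3, padding=1).to(torch.float16)
model_dtype = next(unet.parameters()).dtype         # torch.float16
x_t = torch.randn(1, 4, 64, 64, dtype=model_dtype)  # latents match the weights
assert x_t.dtype == model_dtype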