Update app.py
app.py CHANGED
@@ -309,37 +309,18 @@ class SDXLFlowMatchingPipeline:
         print(f"[Lyra Debug] T5 input: shape={t5_embeds.shape}, mean={t5_embeds.mean():.4f}, std={t5_embeds.std():.4f}")

         with torch.no_grad():
-            #
-
-            t5_only_inputs = {
+            # Full forward pass with all modalities (model requires all)
+            modality_inputs = {
+                'clip_l': clip_l_embeds.float(),
+                'clip_g': clip_g_embeds.float(),
                 't5_xl_l': t5_embeds.float(),
                 't5_xl_g': t5_embeds.float()
             }
-
-
-
-
-
-                mu, logvar = self.lyra_model.encode(t5_only_inputs)
-                z = mu  # Use mean for deterministic output
-                print(f"[Lyra Debug] Latent z: shape={z.shape}, mean={z.mean():.4f}, std={z.std():.4f}")
-
-                # Decode to CLIP space
-                reconstructions = self.lyra_model.decode(z, target_modalities=['clip_l', 'clip_g'])
-            else:
-                print("[Lyra Debug] Using forward pass with all modalities")
-                # Fall back to full forward pass with all modalities
-                modality_inputs = {
-                    'clip_l': clip_l_embeds.float(),
-                    'clip_g': clip_g_embeds.float(),
-                    't5_xl_l': t5_embeds.float(),
-                    't5_xl_g': t5_embeds.float()
-                }
-                reconstructions, mu, logvar, _ = self.lyra_model(
-                    modality_inputs,
-                    target_modalities=['clip_l', 'clip_g']
-                )
-                print(f"[Lyra Debug] Latent mu: shape={mu.shape}, mean={mu.mean():.4f}, std={mu.std():.4f}")
+            reconstructions, mu, logvar, _ = self.lyra_model(
+                modality_inputs,
+                target_modalities=['clip_l', 'clip_g']
+            )
+            print(f"[Lyra Debug] Latent mu: shape={mu.shape}, mean={mu.mean():.4f}, std={mu.std():.4f}")

         lyra_clip_l = reconstructions['clip_l'].to(prompt_embeds.dtype)
         lyra_clip_g = reconstructions['clip_g'].to(prompt_embeds.dtype)
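The hunk above drops the T5-only encode/decode branch and always runs a single full forward pass through the Lyra model, feeding all four modalities and requesting only the CLIP reconstructions. Below is a minimal sketch of that call pattern, assuming only the interface visible in the diff (a callable model that takes a dict of modality tensors plus target_modalities and returns (reconstructions, mu, logvar, extra)); the wrapper function name and argument names are illustrative and not part of app.py.

import torch

def lyra_refine_clip(lyra_model, clip_l_embeds, clip_g_embeds, t5_embeds):
    # Assemble every modality the model expects; per the diff's comment, the
    # model requires all of them even when only CLIP outputs are wanted.
    modality_inputs = {
        'clip_l': clip_l_embeds.float(),
        'clip_g': clip_g_embeds.float(),
        't5_xl_l': t5_embeds.float(),
        't5_xl_g': t5_embeds.float(),  # the same T5 features feed both T5 slots, as in the diff
    }
    with torch.no_grad():
        # Single full forward pass; only CLIP-L/G reconstructions are requested.
        reconstructions, mu, logvar, _ = lyra_model(
            modality_inputs,
            target_modalities=['clip_l', 'clip_g'],
        )
    # Cast back to the pipeline's working dtype before re-injecting into the SDXL conditioning.
    lyra_clip_l = reconstructions['clip_l'].to(clip_l_embeds.dtype)
    lyra_clip_g = reconstructions['clip_g'].to(clip_g_embeds.dtype)
    return lyra_clip_l, lyra_clip_g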
@@ -348,7 +329,6 @@ class SDXLFlowMatchingPipeline:
         print(f"[Lyra Debug] Lyra CLIP-G output: mean={lyra_clip_g.mean():.4f}, std={lyra_clip_g.std():.4f}")

         # Check if reconstruction stats are wildly different from input
-        # If so, we may need to normalize
         clip_l_std_ratio = lyra_clip_l.std() / (clip_l_embeds.std() + 1e-8)
         clip_g_std_ratio = lyra_clip_g.std() / (clip_g_embeds.std() + 1e-8)
         print(f"[Lyra Debug] Std ratio CLIP-L: {clip_l_std_ratio:.4f}, CLIP-G: {clip_g_std_ratio:.4f}")
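The std ratio in this hunk is a scale diagnostic: it compares the spread of the Lyra reconstruction to the spread of the original CLIP embedding, and the negative-prompt hunk below only rescales when that ratio leaves the 0.5-2.0 band. A toy check with made-up tensors (the 77x768 shape matches SDXL's CLIP-L token embeddings; the scales are invented for illustration only):

import torch

clip_l_embeds = torch.randn(1, 77, 768) * 0.30   # stand-in for the original CLIP-L embedding scale
lyra_clip_l = torch.randn(1, 77, 768) * 0.95     # stand-in for a reconstruction that came back too "hot"

clip_l_std_ratio = lyra_clip_l.std() / (clip_l_embeds.std() + 1e-8)
print(f"[Lyra Debug] Std ratio CLIP-L: {clip_l_std_ratio:.4f}")   # roughly 3.2 with these scales

# Same band the commit uses for the negative embeddings: rescale only when badly off.
needs_rescale = clip_l_std_ratio > 2.0 or clip_l_std_ratio < 0.5
print("rescale" if needs_rescale else "leave as-is")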
@@ -393,27 +373,25 @@ class SDXLFlowMatchingPipeline:
         neg_clip_l = negative_prompt_embeds[..., :clip_l_dim]
         neg_clip_g = negative_prompt_embeds[..., clip_l_dim:]

-
-
-
-
-
-
-
-
-                    't5_xl_l': t5_embeds_neg.float(),
-                    't5_xl_g': t5_embeds_neg.float()
-                }
-                recon_neg, _, _, _ = self.lyra_model(modality_inputs_neg, target_modalities=['clip_l', 'clip_g'])
+        # Full forward pass (model requires all modalities)
+        modality_inputs_neg = {
+            'clip_l': neg_clip_l.float(),
+            'clip_g': neg_clip_g.float(),
+            't5_xl_l': t5_embeds_neg.float(),
+            't5_xl_g': t5_embeds_neg.float()
+        }
+        recon_neg, _, _, _ = self.lyra_model(modality_inputs_neg, target_modalities=['clip_l', 'clip_g'])

         lyra_neg_l = recon_neg['clip_l'].to(negative_prompt_embeds.dtype)
         lyra_neg_g = recon_neg['clip_g'].to(negative_prompt_embeds.dtype)

         # Normalize if needed
-
+        neg_l_ratio = lyra_neg_l.std() / (neg_clip_l.std() + 1e-8)
+        neg_g_ratio = lyra_neg_g.std() / (neg_clip_g.std() + 1e-8)
+        if neg_l_ratio > 2.0 or neg_l_ratio < 0.5:
             lyra_neg_l = (lyra_neg_l - lyra_neg_l.mean()) / (lyra_neg_l.std() + 1e-8)
             lyra_neg_l = lyra_neg_l * neg_clip_l.std() + neg_clip_l.mean()
-        if
+        if neg_g_ratio > 2.0 or neg_g_ratio < 0.5:
             lyra_neg_g = (lyra_neg_g - lyra_neg_g.mean()) / (lyra_neg_g.std() + 1e-8)
             lyra_neg_g = lyra_neg_g * neg_clip_g.std() + neg_clip_g.mean()

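The negative-prompt hunk adds a guarded statistics-matching step: when the reconstruction's std drifts outside 0.5x-2x of the reference embedding's std, it is re-standardized and then shifted and scaled onto the reference's mean and std. A small helper capturing that logic; packaging it as a standalone function and its name are mine, while the formula, the epsilon, and the thresholds come from the diff.

import torch

def match_stats_if_needed(recon: torch.Tensor,
                          reference: torch.Tensor,
                          hi: float = 2.0,
                          lo: float = 0.5,
                          eps: float = 1e-8) -> torch.Tensor:
    """Rescale recon onto reference's global mean/std when its scale has drifted too far."""
    ratio = recon.std() / (reference.std() + eps)
    if ratio > hi or ratio < lo:
        # Re-standardize, then match the reference statistics (same formula as the diff).
        recon = (recon - recon.mean()) / (recon.std() + eps)
        recon = recon * reference.std() + reference.mean()
    return recon

# Hypothetical usage mirroring the negative-prompt branch above:
# lyra_neg_l = match_stats_if_needed(lyra_neg_l, neg_clip_l)
# lyra_neg_g = match_stats_if_needed(lyra_neg_g, neg_clip_g)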