Spaces:

AbstractPhil
/

lyra-xl-playground

Running on Zero

App Files Files Community

AbstractPhil committed on 16 days ago

Commit

a7aafe6

verified ·

1 Parent(s): ee10b5a

Update app.py

Browse files

Files changed (1) hide show

app.py +383 -1102

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ Supports Illustrious XL, standard SDXL, and SD1.5 variants.
 Lyra VAE Versions:
 - v1: SD1.5 (768 dim CLIP + T5-base) - geofractal.model.vae.vae_lyra
-- v2: SDXL/Illustrious (768 CLIP-L + 2048 T5-XL) - geofractal.model.vae.vae_lyra_v2
 """
 import os
@@ -25,9 +25,10 @@ from diffusers import (
     UNet2DConditionModel,
     AutoencoderKL,
     EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler
 )
-from diffusers.models import UNet2DConditionModel as DiffusersUNet
 from transformers import (
     CLIPTextModel,
     CLIPTokenizer,
@@ -37,74 +38,90 @@ from transformers import (
 )
 from huggingface_hub import hf_hub_download
-# Import Lyra VAE v1 (SD1.5) from geofractal
-try:
-    from geofractal.model.vae.vae_lyra import MultiModalVAE as LyraV1, MultiModalVAEConfig as LyraV1Config
-    LYRA_V1_AVAILABLE = True
-except ImportError:
-    print("⚠️ Lyra VAE v1 not available")
-    LYRA_V1_AVAILABLE = False
-# Import Lyra VAE v2 (SDXL/Illustrious) from geofractal
-try:
-    from geofractal.model.vae.vae_lyra_v2 import MultiModalVAE as LyraV2, MultiModalVAEConfig as LyraV2Config
-    LYRA_V2_AVAILABLE = True
-except ImportError:
-    print("⚠️ Lyra VAE v2 not available")
-    LYRA_V2_AVAILABLE = False
 # ============================================================================
 # CONSTANTS
 # ============================================================================
-# Model architectures
 ARCH_SD15 = "sd15"
 ARCH_SDXL = "sdxl"
-# ComfyUI key prefixes for SDXL single-file checkpoints
-COMFYUI_UNET_PREFIX = "model.diffusion_model."
-COMFYUI_CLIP_L_PREFIX = "conditioner.embedders.0.transformer."
-COMFYUI_CLIP_G_PREFIX = "conditioner.embedders.1.model."
-COMFYUI_VAE_PREFIX = "first_stage_model."
 # ============================================================================
-# MODEL LOADING UTILITIES
 # ============================================================================
-def extract_comfyui_components(state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
-    """Extract UNet, CLIP-L, CLIP-G, and VAE from ComfyUI single-file checkpoint."""
-    components = {
-        "unet": {},
-        "clip_l": {},
-        "clip_g": {},
-        "vae": {}
-    }
-    for key, value in state_dict.items():
-        if key.startswith(COMFYUI_UNET_PREFIX):
-            new_key = key[len(COMFYUI_UNET_PREFIX):]
-            components["unet"][new_key] = value
-        elif key.startswith(COMFYUI_CLIP_L_PREFIX):
-            new_key = key[len(COMFYUI_CLIP_L_PREFIX):]
-            components["clip_l"][new_key] = value
-        elif key.startswith(COMFYUI_CLIP_G_PREFIX):
-            new_key = key[len(COMFYUI_CLIP_G_PREFIX):]
-            components["clip_g"][new_key] = value
-        elif key.startswith(COMFYUI_VAE_PREFIX):
-            new_key = key[len(COMFYUI_VAE_PREFIX):]
-            components["vae"][new_key] = value
-    print(f"  Extracted components:")
-    print(f"    UNet: {len(components['unet'])} keys")
-    print(f"    CLIP-L: {len(components['clip_l'])} keys")
-    print(f"    CLIP-G: {len(components['clip_g'])} keys")
-    print(f"    VAE: {len(components['vae'])} keys")
-    return components
 def get_clip_hidden_state(
     model_output,
@@ -116,13 +133,180 @@ def get_clip_hidden_state(
         return model_output.last_hidden_state
     if hasattr(model_output, 'hidden_states') and model_output.hidden_states is not None:
-        # hidden_states is tuple: (embedding, layer1, ..., layerN)
-        # clip_skip=2 means penultimate layer = hidden_states[-2]
         return model_output.hidden_states[-clip_skip]
     return model_output.last_hidden_state
 # ============================================================================
 # SDXL PIPELINE
 # ============================================================================
@@ -133,16 +317,15 @@ class SDXLFlowMatchingPipeline:
     def __init__(
         self,
         vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,  # CLIP-L
-        text_encoder_2: CLIPTextModelWithProjection,  # CLIP-G
         tokenizer: CLIPTokenizer,
         tokenizer_2: CLIPTokenizer,
         unet: UNet2DConditionModel,
         scheduler,
         device: str = "cuda",
-        t5_encoder: Optional[T5EncoderModel] = None,
-        t5_tokenizer: Optional[T5Tokenizer] = None,
-        lyra_model: Optional[any] = None,
         clip_skip: int = 1
     ):
         self.vae = vae
@@ -154,16 +337,31 @@ class SDXLFlowMatchingPipeline:
         self.scheduler = scheduler
         self.device = device
-        # Lyra components
-        self.t5_encoder = t5_encoder
-        self.t5_tokenizer = t5_tokenizer
-        self.lyra_model = lyra_model
         # Settings
         self.clip_skip = clip_skip
-        self.vae_scale_factor = 0.13025  # SDXL VAE scaling
         self.arch = ARCH_SDXL
     def encode_prompt(
         self,
         prompt: str,
@@ -206,11 +404,8 @@ class SDXLFlowMatchingPipeline:
                 output_hidden_states=output_hidden_states
             )
             prompt_embeds_g = get_clip_hidden_state(clip_g_output, clip_skip, output_hidden_states)
-            # Get pooled output from CLIP-G
             pooled_prompt_embeds = clip_g_output.text_embeds
-        # Concatenate CLIP-L and CLIP-G embeddings
         prompt_embeds = torch.cat([prompt_embeds_l, prompt_embeds_g], dim=-1)
         # Negative prompt
@@ -262,14 +457,8 @@ class SDXLFlowMatchingPipeline:
         t5_summary: str = "",
         lyra_strength: float = 0.3
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Encode prompts using Lyra VAE v2 fusion (CLIP + T5).
-        Uses cross-modal translation: encode T5 → decode to CLIP space,
-        then blend with original CLIP embeddings.
-        Args:
-            lyra_strength: Blend factor (0.0 = pure CLIP, 1.0 = pure Lyra reconstruction)
-        """
         if self.lyra_model is None or self.t5_encoder is None:
             raise ValueError("Lyra VAE components not initialized")
@@ -278,7 +467,7 @@ class SDXLFlowMatchingPipeline:
             prompt, negative_prompt, clip_skip
         )
-        # Format T5 input with pilcrow separator (¶)
         SUMMARY_SEPARATOR = "¶"
         if t5_summary.strip():
             t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {t5_summary}"
@@ -298,18 +487,10 @@ class SDXLFlowMatchingPipeline:
             t5_embeds = self.t5_encoder(**t5_inputs).last_hidden_state
         clip_l_dim = 768
-        clip_g_dim = 1280
         clip_l_embeds = prompt_embeds[..., :clip_l_dim]
         clip_g_embeds = prompt_embeds[..., clip_l_dim:]
-        # Debug: print input stats
-        print(f"[Lyra Debug] CLIP-L input: shape={clip_l_embeds.shape}, mean={clip_l_embeds.mean():.4f}, std={clip_l_embeds.std():.4f}")
-        print(f"[Lyra Debug] CLIP-G input: shape={clip_g_embeds.shape}, mean={clip_g_embeds.mean():.4f}, std={clip_g_embeds.std():.4f}")
-        print(f"[Lyra Debug] T5 input: shape={t5_embeds.shape}, mean={t5_embeds.mean():.4f}, std={t5_embeds.std():.4f}")
         with torch.no_grad():
-            # Full forward pass with all modalities (model requires all)
             modality_inputs = {
                 'clip_l': clip_l_embeds.float(),
                 'clip_g': clip_g_embeds.float(),
@@ -320,90 +501,30 @@ class SDXLFlowMatchingPipeline:
                 modality_inputs,
                 target_modalities=['clip_l', 'clip_g']
             )
-            print(f"[Lyra Debug] Latent mu: shape={mu.shape}, mean={mu.mean():.4f}, std={mu.std():.4f}")
             lyra_clip_l = reconstructions['clip_l'].to(prompt_embeds.dtype)
             lyra_clip_g = reconstructions['clip_g'].to(prompt_embeds.dtype)
-            print(f"[Lyra Debug] Lyra CLIP-L output: mean={lyra_clip_l.mean():.4f}, std={lyra_clip_l.std():.4f}")
-            print(f"[Lyra Debug] Lyra CLIP-G output: mean={lyra_clip_g.mean():.4f}, std={lyra_clip_g.std():.4f}")
-            # Check if reconstruction stats are wildly different from input
             clip_l_std_ratio = lyra_clip_l.std() / (clip_l_embeds.std() + 1e-8)
             clip_g_std_ratio = lyra_clip_g.std() / (clip_g_embeds.std() + 1e-8)
-            print(f"[Lyra Debug] Std ratio CLIP-L: {clip_l_std_ratio:.4f}, CLIP-G: {clip_g_std_ratio:.4f}")
-            # Normalize reconstructions to match input statistics if needed
             if clip_l_std_ratio > 2.0 or clip_l_std_ratio < 0.5:
-                print("[Lyra Debug] Normalizing CLIP-L reconstruction to match input stats")
                 lyra_clip_l = (lyra_clip_l - lyra_clip_l.mean()) / (lyra_clip_l.std() + 1e-8)
                 lyra_clip_l = lyra_clip_l * clip_l_embeds.std() + clip_l_embeds.mean()
             if clip_g_std_ratio > 2.0 or clip_g_std_ratio < 0.5:
-                print("[Lyra Debug] Normalizing CLIP-G reconstruction to match input stats")
                 lyra_clip_g = (lyra_clip_g - lyra_clip_g.mean()) / (lyra_clip_g.std() + 1e-8)
                 lyra_clip_g = lyra_clip_g * clip_g_embeds.std() + clip_g_embeds.mean()
-        # Blend original CLIP with Lyra reconstruction
         fused_clip_l = (1 - lyra_strength) * clip_l_embeds + lyra_strength * lyra_clip_l
         fused_clip_g = (1 - lyra_strength) * clip_g_embeds + lyra_strength * lyra_clip_g
-        print(f"[Lyra Debug] Final fused CLIP-L: mean={fused_clip_l.mean():.4f}, std={fused_clip_l.std():.4f}")
-        print(f"[Lyra Debug] lyra_strength={lyra_strength}")
         prompt_embeds_fused = torch.cat([fused_clip_l, fused_clip_g], dim=-1)
-        # Process negative prompt (simpler - just use original CLIP for negative)
-        if negative_prompt:
-            # For negative, blend less aggressively
-            neg_strength = lyra_strength
-            t5_neg_prompt = f"{negative_prompt} {SUMMARY_SEPARATOR} {negative_prompt}"
-            t5_inputs_neg = self.t5_tokenizer(
-                t5_neg_prompt,
-                max_length=512,
-                padding='max_length',
-                truncation=True,
-                return_tensors='pt'
-            ).to(self.device)
-            with torch.no_grad():
-                t5_embeds_neg = self.t5_encoder(**t5_inputs_neg).last_hidden_state
-            neg_clip_l = negative_prompt_embeds[..., :clip_l_dim]
-            neg_clip_g = negative_prompt_embeds[..., clip_l_dim:]
-            # Full forward pass (model requires all modalities)
-            modality_inputs_neg = {
-                'clip_l': neg_clip_l.float(),
-                'clip_g': neg_clip_g.float(),
-                't5_xl_l': t5_embeds_neg.float(),
-                't5_xl_g': t5_embeds_neg.float()
-            }
-            recon_neg, _, _, _ = self.lyra_model(modality_inputs_neg, target_modalities=['clip_l', 'clip_g'])
-            lyra_neg_l = recon_neg['clip_l'].to(negative_prompt_embeds.dtype)
-            lyra_neg_g = recon_neg['clip_g'].to(negative_prompt_embeds.dtype)
-            # Normalize if needed
-            neg_l_ratio = lyra_neg_l.std() / (neg_clip_l.std() + 1e-8)
-            neg_g_ratio = lyra_neg_g.std() / (neg_clip_g.std() + 1e-8)
-            if neg_l_ratio > 2.0 or neg_l_ratio < 0.5:
-                lyra_neg_l = (lyra_neg_l - lyra_neg_l.mean()) / (lyra_neg_l.std() + 1e-8)
-                lyra_neg_l = lyra_neg_l * neg_clip_l.std() + neg_clip_l.mean()
-            if neg_g_ratio > 2.0 or neg_g_ratio < 0.5:
-                lyra_neg_g = (lyra_neg_g - lyra_neg_g.mean()) / (lyra_neg_g.std() + 1e-8)
-                lyra_neg_g = lyra_neg_g * neg_clip_g.std() + neg_clip_g.mean()
-            fused_neg_l = (1 - neg_strength) * neg_clip_l + neg_strength * lyra_neg_l
-            fused_neg_g = (1 - neg_strength) * neg_clip_g + neg_strength * lyra_neg_g
-            negative_prompt_embeds_fused = torch.cat([fused_neg_l, fused_neg_g], dim=-1)
-        else:
-            negative_prompt_embeds_fused = torch.zeros_like(prompt_embeds_fused)
-        return prompt_embeds_fused, negative_prompt_embeds_fused, pooled, negative_pooled
     def _get_add_time_ids(
         self,
@@ -424,28 +545,24 @@ class SDXLFlowMatchingPipeline:
         negative_prompt: str = "",
         height: int = 1024,
         width: int = 1024,
-        num_inference_steps: int = 20,
-        guidance_scale: float = 7.5,
-        shift: float = 0.0,
-        use_flow_matching: bool = False,
-        prediction_type: str = "epsilon",
         seed: Optional[int] = None,
         use_lyra: bool = False,
-        clip_skip: int = 1,
         t5_summary: str = "",
         lyra_strength: float = 1.0,
         progress_callback=None
     ):
         """Generate image using SDXL architecture."""
-        # Set seed
         if seed is not None:
             generator = torch.Generator(device=self.device).manual_seed(seed)
         else:
             generator = None
         # Encode prompts
-        if use_lyra and self.lyra_model is not None:
             prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
                 prompt, negative_prompt, clip_skip, t5_summary, lyra_strength
             )
@@ -470,11 +587,9 @@ class SDXLFlowMatchingPipeline:
         self.scheduler.set_timesteps(num_inference_steps, device=self.device)
         timesteps = self.scheduler.timesteps
-        # Scale initial latents
-        if not use_flow_matching:
-            latents = latents * self.scheduler.init_noise_sigma
-        # Prepare added time embeddings for SDXL
         original_size = (height, width)
         target_size = (height, width)
         crops_coords_top_left = (0, 0)
@@ -482,29 +597,18 @@ class SDXLFlowMatchingPipeline:
         add_time_ids = self._get_add_time_ids(
             original_size, crops_coords_top_left, target_size, dtype=torch.float16
         )
-        negative_add_time_ids = add_time_ids  # Same for negative
         # Denoising loop
         for i, t in enumerate(timesteps):
             if progress_callback:
                 progress_callback(i, num_inference_steps, f"Step {i+1}/{num_inference_steps}")
-            # Expand for CFG
             latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1.0 else latents
-            # Flow matching scaling
-            if use_flow_matching and shift > 0:
-                sigma = t.float() / 1000.0
-                sigma_shifted = (shift * sigma) / (1 + (shift - 1) * sigma)
-                scaling = torch.sqrt(1 + sigma_shifted ** 2)
-                latent_model_input = latent_model_input / scaling
-            else:
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-            # Prepare timestep
             timestep = t.expand(latent_model_input.shape[0])
-            # Prepare added conditions
             if guidance_scale > 1.0:
                 text_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
                 add_text_embeds = torch.cat([negative_pooled, pooled])
@@ -514,13 +618,11 @@ class SDXLFlowMatchingPipeline:
                 add_text_embeds = pooled
                 add_time_ids_input = add_time_ids
-            # Prepare added cond kwargs for SDXL UNet
             added_cond_kwargs = {
                 "text_embeds": add_text_embeds,
                 "time_ids": add_time_ids_input
             }
-            # Predict noise
             noise_pred = self.unet(
                 latent_model_input,
                 timestep,
@@ -529,28 +631,11 @@ class SDXLFlowMatchingPipeline:
                 return_dict=False
             )[0]
-            # CFG
             if guidance_scale > 1.0:
                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-            # Step
-            if use_flow_matching:
-                sigma = t.float() / 1000.0
-                sigma_shifted = (shift * sigma) / (1 + (shift - 1) * sigma)
-                if prediction_type == "v_prediction":
-                    v_pred = noise_pred
-                    alpha_t = torch.sqrt(1 - sigma_shifted ** 2)
-                    sigma_t = sigma_shifted
-                    noise_pred = alpha_t * v_pred + sigma_t * latents
-                dt = -1.0 / num_inference_steps
-                latents = latents + dt * noise_pred
-            else:
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, return_dict=False
-                )[0]
         # Decode
         latents = latents / self.vae_scale_factor
@@ -558,255 +643,6 @@ class SDXLFlowMatchingPipeline:
         with torch.no_grad():
             image = self.vae.decode(latents.to(self.vae.dtype)).sample
-        # Convert to PIL
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        image = (image * 255).round().astype("uint8")
-        image = Image.fromarray(image[0])
-        return image
-# ============================================================================
-# SD1.5 PIPELINE (Original)
-# ============================================================================
-class SD15FlowMatchingPipeline:
-    """Pipeline for SD1.5-based flow-matching inference."""
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler,
-        device: str = "cuda",
-        t5_encoder: Optional[T5EncoderModel] = None,
-        t5_tokenizer: Optional[T5Tokenizer] = None,
-        lyra_model: Optional[any] = None
-    ):
-        self.vae = vae
-        self.text_encoder = text_encoder
-        self.tokenizer = tokenizer
-        self.unet = unet
-        self.scheduler = scheduler
-        self.device = device
-        self.t5_encoder = t5_encoder
-        self.t5_tokenizer = t5_tokenizer
-        self.lyra_model = lyra_model
-        self.vae_scale_factor = 0.18215
-        self.arch = ARCH_SD15
-        self.is_lune_model = False
-    def encode_prompt(self, prompt: str, negative_prompt: str = ""):
-        """Encode text prompts to embeddings."""
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids.to(self.device)
-        with torch.no_grad():
-            prompt_embeds = self.text_encoder(text_input_ids)[0]
-        if negative_prompt:
-            uncond_inputs = self.tokenizer(
-                negative_prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_input_ids = uncond_inputs.input_ids.to(self.device)
-            with torch.no_grad():
-                negative_prompt_embeds = self.text_encoder(uncond_input_ids)[0]
-        else:
-            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
-        return prompt_embeds, negative_prompt_embeds
-    def encode_prompt_lyra(self, prompt: str, negative_prompt: str = ""):
-        """Encode using Lyra VAE (CLIP + T5 fusion)."""
-        if self.lyra_model is None or self.t5_encoder is None:
-            raise ValueError("Lyra VAE components not initialized")
-        # CLIP
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids.to(self.device)
-        with torch.no_grad():
-            clip_embeds = self.text_encoder(text_input_ids)[0]
-        # T5
-        t5_inputs = self.t5_tokenizer(
-            prompt,
-            max_length=77,
-            padding='max_length',
-            truncation=True,
-            return_tensors='pt'
-        ).to(self.device)
-        with torch.no_grad():
-            t5_embeds = self.t5_encoder(**t5_inputs).last_hidden_state
-        # Fuse
-        modality_inputs = {'clip': clip_embeds, 't5': t5_embeds}
-        with torch.no_grad():
-            reconstructions, mu, logvar = self.lyra_model(
-                modality_inputs,
-                target_modalities=['clip']
-            )
-            prompt_embeds = reconstructions['clip']
-        # Negative
-        if negative_prompt:
-            uncond_inputs = self.tokenizer(
-                negative_prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_input_ids = uncond_inputs.input_ids.to(self.device)
-            with torch.no_grad():
-                clip_embeds_uncond = self.text_encoder(uncond_input_ids)[0]
-            t5_inputs_uncond = self.t5_tokenizer(
-                negative_prompt,
-                max_length=77,
-                padding='max_length',
-                truncation=True,
-                return_tensors='pt'
-            ).to(self.device)
-            with torch.no_grad():
-                t5_embeds_uncond = self.t5_encoder(**t5_inputs_uncond).last_hidden_state
-            modality_inputs_uncond = {'clip': clip_embeds_uncond, 't5': t5_embeds_uncond}
-            with torch.no_grad():
-                reconstructions_uncond, _, _ = self.lyra_model(
-                    modality_inputs_uncond,
-                    target_modalities=['clip']
-                )
-                negative_prompt_embeds = reconstructions_uncond['clip']
-        else:
-            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
-        return prompt_embeds, negative_prompt_embeds
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: str,
-        negative_prompt: str = "",
-        height: int = 512,
-        width: int = 512,
-        num_inference_steps: int = 20,
-        guidance_scale: float = 7.5,
-        shift: float = 2.5,
-        use_flow_matching: bool = True,
-        prediction_type: str = "epsilon",
-        seed: Optional[int] = None,
-        use_lyra: bool = False,
-        clip_skip: int = 1,  # Unused for SD1.5 but kept for API consistency
-        progress_callback=None
-    ):
-        """Generate image."""
-        if seed is not None:
-            generator = torch.Generator(device=self.device).manual_seed(seed)
-        else:
-            generator = None
-        if use_lyra and self.lyra_model is not None:
-            prompt_embeds, negative_prompt_embeds = self.encode_prompt_lyra(prompt, negative_prompt)
-        else:
-            prompt_embeds, negative_prompt_embeds = self.encode_prompt(prompt, negative_prompt)
-        latent_channels = 4
-        latent_height = height // 8
-        latent_width = width // 8
-        latents = torch.randn(
-            (1, latent_channels, latent_height, latent_width),
-            generator=generator,
-            device=self.device,
-            dtype=torch.float32
-        )
-        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
-        timesteps = self.scheduler.timesteps
-        if not use_flow_matching:
-            latents = latents * self.scheduler.init_noise_sigma
-        for i, t in enumerate(timesteps):
-            if progress_callback:
-                progress_callback(i, num_inference_steps, f"Step {i+1}/{num_inference_steps}")
-            latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1.0 else latents
-            if use_flow_matching and shift > 0:
-                sigma = t.float() / 1000.0
-                sigma_shifted = (shift * sigma) / (1 + (shift - 1) * sigma)
-                scaling = torch.sqrt(1 + sigma_shifted ** 2)
-                latent_model_input = latent_model_input / scaling
-            else:
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-            timestep = t.expand(latent_model_input.shape[0])
-            text_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) if guidance_scale > 1.0 else prompt_embeds
-            noise_pred = self.unet(
-                latent_model_input,
-                timestep,
-                encoder_hidden_states=text_embeds,
-                return_dict=False
-            )[0]
-            if guidance_scale > 1.0:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-            if use_flow_matching:
-                sigma = t.float() / 1000.0
-                sigma_shifted = (shift * sigma) / (1 + (shift - 1) * sigma)
-                if prediction_type == "v_prediction":
-                    v_pred = noise_pred
-                    alpha_t = torch.sqrt(1 - sigma_shifted ** 2)
-                    sigma_t = sigma_shifted
-                    noise_pred = alpha_t * v_pred + sigma_t * latents
-                dt = -1.0 / num_inference_steps
-                latents = latents + dt * noise_pred
-            else:
-                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-        latents = latents / self.vae_scale_factor
-        if self.is_lune_model:
-            latents = latents * 5.52
-        with torch.no_grad():
-            image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         image = (image * 255).round().astype("uint8")
@@ -819,59 +655,26 @@ class SD15FlowMatchingPipeline:
 # MODEL LOADERS
 # ============================================================================
-def load_lune_checkpoint(repo_id: str, filename: str, device: str = "cuda"):
-    """Load Lune checkpoint from .pt file."""
-    print(f"📥 Downloading: {repo_id}/{filename}")
-    checkpoint_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
-    checkpoint = torch.load(checkpoint_path, map_location="cpu")
-    print(f"🏗️ Initializing SD1.5 UNet...")
-    unet = UNet2DConditionModel.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
-        subfolder="unet",
-        torch_dtype=torch.float32
-    )
-    student_state_dict = checkpoint["student"]
-    cleaned_dict = {}
-    for key, value in student_state_dict.items():
-        if key.startswith("unet."):
-            cleaned_dict[key[5:]] = value
-        else:
-            cleaned_dict[key] = value
-    unet.load_state_dict(cleaned_dict, strict=False)
-    step = checkpoint.get("gstep", "unknown")
-    print(f"✅ Loaded Lune from step {step}")
-    return unet.to(device)
 def load_illustrious_xl(
-    repo_id: str = "AbstractPhil/vae-lyra-xl-adaptive-cantor-illustrious",
     filename: str = "illustriousXL_v01.safetensors",
     device: str = "cuda"
 ) -> Tuple[UNet2DConditionModel, AutoencoderKL, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPTokenizer]:
-    """Load Illustrious XL from single safetensors file using diffusers' single-file loader."""
     from diffusers import StableDiffusionXLPipeline
     print(f"📥 Loading Illustrious XL: {repo_id}/{filename}")
-    # Download the checkpoint
     checkpoint_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
     print(f"✓ Downloaded: {checkpoint_path}")
-    # Use diffusers' built-in single-file loader which handles key remapping
-    print("📦 Loading with StableDiffusionXLPipeline.from_single_file()...")
     pipe = StableDiffusionXLPipeline.from_single_file(
         checkpoint_path,
         torch_dtype=torch.float16,
         use_safetensors=True,
     )
-    # Extract components
     unet = pipe.unet.to(device)
     vae = pipe.vae.to(device)
     text_encoder = pipe.text_encoder.to(device)
@@ -879,404 +682,72 @@ def load_illustrious_xl(
     tokenizer = pipe.tokenizer
     tokenizer_2 = pipe.tokenizer_2
-    # Clean up the pipeline to free memory
     del pipe
     torch.cuda.empty_cache()
     print("✅ Illustrious XL loaded!")
-    print(f"   UNet params: {sum(p.numel() for p in unet.parameters()):,}")
-    print(f"   VAE params: {sum(p.numel() for p in vae.parameters()):,}")
     return unet, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2
-def load_sdxl_base(device: str = "cuda"):
-    """Load standard SDXL base model.
-
-    Loads the UNet, VAE and both CLIP text encoders from
-    "stabilityai/stable-diffusion-xl-base-1.0" in float16 and moves each of
-    them to `device`; the two tokenizers are loaded from the same repo.
-
-    Returns:
-        (unet, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
-    """
-    print("📥 Loading SDXL Base 1.0...")
-    unet = UNet2DConditionModel.from_pretrained(
-        "stabilityai/stable-diffusion-xl-base-1.0",
-        subfolder="unet",
-        torch_dtype=torch.float16
-    ).to(device)
-    vae = AutoencoderKL.from_pretrained(
-        "stabilityai/stable-diffusion-xl-base-1.0",
-        subfolder="vae",
-        torch_dtype=torch.float16
-    ).to(device)
-    # First text encoder (plain CLIPTextModel) lives in "text_encoder".
-    text_encoder = CLIPTextModel.from_pretrained(
-        "stabilityai/stable-diffusion-xl-base-1.0",
-        subfolder="text_encoder",
-        torch_dtype=torch.float16
-    ).to(device)
-    # Second text encoder (with projection head) lives in "text_encoder_2".
-    text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
-        "stabilityai/stable-diffusion-xl-base-1.0",
-        subfolder="text_encoder_2",
-        torch_dtype=torch.float16
-    ).to(device)
-    # Tokenizers are not moved to `device`.
-    tokenizer = CLIPTokenizer.from_pretrained(
-        "stabilityai/stable-diffusion-xl-base-1.0",
-        subfolder="tokenizer"
-    )
-    tokenizer_2 = CLIPTokenizer.from_pretrained(
-        "stabilityai/stable-diffusion-xl-base-1.0",
-        subfolder="tokenizer_2"
-    )
-    print("✅ SDXL Base loaded!")
-    return unet, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2
-def load_lyra_vae(repo_id: str = "AbstractPhil/vae-lyra", device: str = "cuda"):
-    """Load Lyra VAE v1 (SD1.5 version) from HuggingFace.
-
-    Best-effort loader: returns the model in eval mode on `device`, or None
-    when LYRA_V1_AVAILABLE is false or when any step raises (the error and a
-    traceback are printed instead of propagating).
-
-    Args:
-        repo_id: Hub model repo holding config.json and the weights file.
-        device: Target device passed to `.to()`.
-    """
-    if not LYRA_V1_AVAILABLE:
-        print("⚠️ Lyra VAE v1 not available")
-        return None
-    print(f"🎵 Loading Lyra VAE v1 from {repo_id}...")
-    try:
-        # Try to download config.json first
-        # NOTE(review): this inner try also covers open()/json.load, so a
-        # malformed config.json silently falls back to the defaults below.
-        try:
-            print("  📥 Downloading config.json...")
-            config_path = hf_hub_download(
-                repo_id=repo_id,
-                filename="config.json",
-                repo_type="model"
-            )
-            with open(config_path, 'r') as f:
-                config_dict = json.load(f)
-            print(f"  ✓ Config loaded: {config_dict.get('fusion_strategy', 'unknown')} fusion")
-        except Exception:
-            # Fallback to defaults if no config.json
-            print("  ⚠️ No config.json found, using defaults")
-            config_dict = {
-                'modality_dims': {"clip": 768, "t5": 768},
-                'latent_dim': 768,
-                'seq_len': 77,
-                'encoder_layers': 3,
-                'decoder_layers': 3,
-                'hidden_dim': 1024,
-                'dropout': 0.1,
-                'fusion_strategy': 'cantor',
-                'fusion_heads': 8,
-                'fusion_dropout': 0.1
-            }
-        # Download model weights
-        print("  📥 Downloading model weights...")
-        try:
-            checkpoint_path = hf_hub_download(
-                repo_id=repo_id,
-                filename="model.pt",
-                repo_type="model"
-            )
-        except Exception:
-            # Fallback to best_model.pt
-            checkpoint_path = hf_hub_download(
-                repo_id=repo_id,
-                filename="best_model.pt",
-                repo_type="model"
-            )
-        # NOTE(review): torch.load is called without weights_only; depending on
-        # the torch version this may unpickle arbitrary objects -- confirm the
-        # repo is trusted.
-        checkpoint = torch.load(checkpoint_path, map_location="cpu")
-        # Every field is re-defaulted here, so a config.json that omits keys
-        # still yields a complete config (same defaults as the fallback dict).
-        vae_config = LyraV1Config(
-            modality_dims=config_dict.get('modality_dims', {"clip": 768, "t5": 768}),
-            latent_dim=config_dict.get('latent_dim', 768),
-            seq_len=config_dict.get('seq_len', 77),
-            encoder_layers=config_dict.get('encoder_layers', 3),
-            decoder_layers=config_dict.get('decoder_layers', 3),
-            hidden_dim=config_dict.get('hidden_dim', 1024),
-            dropout=config_dict.get('dropout', 0.1),
-            fusion_strategy=config_dict.get('fusion_strategy', 'cantor'),
-            fusion_heads=config_dict.get('fusion_heads', 8),
-            fusion_dropout=config_dict.get('fusion_dropout', 0.1)
-        )
-        lyra_model = LyraV1(vae_config)
-        # The checkpoint is either a dict wrapping the weights under
-        # 'model_state_dict' or the bare state dict itself.
-        if 'model_state_dict' in checkpoint:
-            lyra_model.load_state_dict(checkpoint['model_state_dict'])
-        else:
-            lyra_model.load_state_dict(checkpoint)
-        lyra_model.to(device)
-        lyra_model.eval()
-        print(f"✅ Lyra VAE v1 loaded")
-        print(f"   Fusion: {config_dict.get('fusion_strategy')}")
-        print(f"   Latent dim: {config_dict.get('latent_dim')}")
-        if 'global_step' in checkpoint:
-            print(f"   Step: {checkpoint['global_step']:,}")
-        return lyra_model
-    except Exception as e:
-        # Deliberate best-effort: callers get None rather than an exception.
-        print(f"❌ Failed to load Lyra VAE v1: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
-def load_lyra_vae_xl(
-    repo_id: str = "AbstractPhil/vae-lyra-xl-adaptive-cantor-illustrious",
-    checkpoint_filename: str = None,  # Auto-detect if None
-    device: str = "cuda"
-):
-    """Load Lyra VAE v2 (SDXL/Illustrious version) from HuggingFace.
-
-    Best-effort loader: returns the model in eval mode on `device`, or None
-    when LYRA_V2_AVAILABLE is false or when any step raises (the error and a
-    traceback are printed instead of propagating).
-
-    Args:
-        repo_id: Hub model repo holding config.json and the checkpoint(s).
-        checkpoint_filename: Checkpoint file to load; when None one is chosen
-            from the repo file listing (see the auto-detect step below).
-        device: Target device passed to `.to()`.
-    """
-    if not LYRA_V2_AVAILABLE:
-        print("⚠️ Lyra VAE v2 not available")
-        return None
-    print(f"🎵 Loading Lyra VAE v2 from {repo_id}...")
-    try:
-        from huggingface_hub import list_repo_files
-        # Download config.json
-        # Unlike the v1 loader there is no fallback here: a missing or
-        # unreadable config.json lands in the outer except and returns None.
-        print("  📥 Downloading config.json...")
-        config_path = hf_hub_download(
-            repo_id=repo_id,
-            filename="config.json",
-            repo_type="model"
-        )
-        with open(config_path, 'r') as f:
-            config_dict = json.load(f)
-        print(f"  ✓ Config: {config_dict.get('fusion_strategy', 'unknown')} fusion, latent_dim={config_dict.get('latent_dim')}")
-        # Auto-detect checkpoint if not specified
-        if checkpoint_filename is None:
-            repo_files = list_repo_files(repo_id, repo_type="model")
-            # Keep .pt/.safetensors files whose name mentions "checkpoint" or "model".
-            checkpoint_files = [f for f in repo_files if f.endswith('.pt') or f.endswith('.safetensors')]
-            checkpoint_files = [f for f in checkpoint_files if 'checkpoint' in f.lower() or 'model' in f.lower()]
-            if not checkpoint_files:
-                raise FileNotFoundError(f"No checkpoint found in {repo_id}")
-            # Prefer newest checkpoint (highest step number)
-            # The step is the digit run immediately before ".pt"; names without
-            # one (including every .safetensors file) sort as step 0.
-            def extract_step(name):
-                import re
-                match = re.search(r'(\d+)\.pt', name)
-                return int(match.group(1)) if match else 0
-            checkpoint_files.sort(key=extract_step, reverse=True)
-            checkpoint_filename = checkpoint_files[0]
-            print(f"  ✓ Auto-selected checkpoint: {checkpoint_filename}")
-        # Download checkpoint
-        print(f"  📥 Downloading {checkpoint_filename}...")
-        checkpoint_path = hf_hub_download(
-            repo_id=repo_id,
-            filename=checkpoint_filename,
-            repo_type="model"
-        )
-        # NOTE(review): the filter above admits .safetensors files, but the
-        # file is always read with torch.load, which presumably cannot parse
-        # that format -- confirm. torch.load is also called without
-        # weights_only; confirm the repo is trusted.
-        checkpoint = torch.load(checkpoint_path, map_location="cpu")
-        # Build config with all v2 fields
-        # Each field falls back to a hard-coded default when config.json omits it.
-        vae_config = LyraV2Config(
-            modality_dims=config_dict.get('modality_dims', {
-                "clip_l": 768, "clip_g": 1280,
-                "t5_xl_l": 2048, "t5_xl_g": 2048
-            }),
-            modality_seq_lens=config_dict.get('modality_seq_lens', {
-                "clip_l": 77, "clip_g": 77,
-                "t5_xl_l": 512, "t5_xl_g": 512
-            }),
-            binding_config=config_dict.get('binding_config', {
-                "clip_l": {"t5_xl_l": 0.3},
-                "clip_g": {"t5_xl_g": 0.3},
-                "t5_xl_l": {},
-                "t5_xl_g": {}
-            }),
-            latent_dim=config_dict.get('latent_dim', 2048),
-            seq_len=config_dict.get('seq_len', 77),
-            encoder_layers=config_dict.get('encoder_layers', 3),
-            decoder_layers=config_dict.get('decoder_layers', 3),
-            hidden_dim=config_dict.get('hidden_dim', 2048),
-            dropout=config_dict.get('dropout', 0.1),
-            fusion_strategy=config_dict.get('fusion_strategy', 'adaptive_cantor'),
-            fusion_heads=config_dict.get('fusion_heads', 8),
-            fusion_dropout=config_dict.get('fusion_dropout', 0.1),
-            cantor_depth=config_dict.get('cantor_depth', 8),
-            cantor_local_window=config_dict.get('cantor_local_window', 3),
-            alpha_init=config_dict.get('alpha_init', 1.0),
-            beta_init=config_dict.get('beta_init', 0.3),
-            alpha_lr_scale=config_dict.get('alpha_lr_scale', 0.1),
-            beta_lr_scale=config_dict.get('beta_lr_scale', 1.0),
-            beta_kl=config_dict.get('beta_kl', 0.1),
-            beta_reconstruction=config_dict.get('beta_reconstruction', 1.0),
-            beta_cross_modal=config_dict.get('beta_cross_modal', 0.0),
-            beta_alpha_regularization=config_dict.get('beta_alpha_regularization', 0.01),
-            kl_clamp_max=config_dict.get('kl_clamp_max', 1.0),
-            logvar_clamp_min=config_dict.get('logvar_clamp_min', -10.0),
-            logvar_clamp_max=config_dict.get('logvar_clamp_max', 10.0),
-        )
-        # Initialize model
-        lyra_model = LyraV2(vae_config)
-        # Load weights
-        # Accepts either a wrapper dict with 'model_state_dict' or a bare state
-        # dict. strict=False: key mismatches are only reported by count below.
-        state_dict = checkpoint.get('model_state_dict', checkpoint)
-        missing, unexpected = lyra_model.load_state_dict(state_dict, strict=False)
-        if missing:
-            print(f"  ⚠️ Missing keys: {len(missing)} (using initialized weights)")
-        if unexpected:
-            print(f"  ⚠️ Unexpected keys: {len(unexpected)} (ignored)")
-        lyra_model.to(device)
-        lyra_model.eval()
-        # Print summary
-        total_params = sum(p.numel() for p in lyra_model.parameters())
-        print(f"✅ Lyra VAE v2 loaded ({total_params/1e6:.1f}M params)")
-        print(f"   Fusion: {vae_config.fusion_strategy}")
-        print(f"   Latent: {vae_config.latent_dim}, Hidden: {vae_config.hidden_dim}")
-        if 'global_step' in checkpoint:
-            print(f"   Trained steps: {checkpoint['global_step']:,}")
-        if 'best_loss' in checkpoint:
-            print(f"   Best loss: {checkpoint['best_loss']:.4f}")
-        # Print binding info
-        # Alphas are passed through a sigmoid for display only.
-        fusion_params = lyra_model.get_fusion_params()
-        if fusion_params.get('alphas'):
-            alpha_vals = {k: torch.sigmoid(v).item() for k, v in fusion_params['alphas'].items()}
-            print(f"   Alphas: {alpha_vals}")
-        return lyra_model
-    except Exception as e:
-        # Deliberate best-effort: callers get None rather than an exception.
-        print(f"❌ Failed to load Lyra VAE v2: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
 # ============================================================================
 # PIPELINE INITIALIZATION
 # ============================================================================
-def initialize_pipeline(model_choice: str, device: str = "cuda"):
-    """Initialize the complete pipeline based on model choice.
-
-    The architecture is picked by substring match on `model_choice`:
-    "Illustrious" or "SDXL" selects the SDXL branch (float16 components,
-    flan-t5-xl, Lyra v2); anything else selects the SD1.5 branch (float32
-    components, t5-base, Lyra v1), where "Lune" additionally swaps in the
-    Lune UNet checkpoint.
-
-    Returns:
-        An SDXLFlowMatchingPipeline or SD15FlowMatchingPipeline instance.
-    """
     print(f"🚀 Initializing {model_choice} pipeline...")
-    # Determine architecture
-    is_sdxl = "Illustrious" in model_choice or "SDXL" in model_choice
-    is_lune = "Lune" in model_choice
-    if is_sdxl:
-        # SDXL-based models
-        if "Illustrious" in model_choice:
-            unet, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2 = load_illustrious_xl(device=device)
-        else:
-            unet, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2 = load_sdxl_base(device=device)
-        # T5-XL for Lyra
-        print("Loading T5-XL encoder...")
-        t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
-        t5_encoder = T5EncoderModel.from_pretrained(
-            "google/flan-t5-xl",
-            torch_dtype=torch.float16
-        ).to(device)
-        t5_encoder.eval()
-        print("✓ T5-XL loaded")
-        # Lyra XL
-        # May be None: load_lyra_vae_xl returns None on failure.
-        lyra_model = load_lyra_vae_xl(device=device)
-        # Scheduler (epsilon for SDXL)
-        # The scheduler config always comes from the SDXL base repo, including
-        # for Illustrious.
-        scheduler = EulerDiscreteScheduler.from_pretrained(
-            "stabilityai/stable-diffusion-xl-base-1.0",
-            subfolder="scheduler"
-        )
-        pipeline = SDXLFlowMatchingPipeline(
-            vae=vae,
-            text_encoder=text_encoder,
-            text_encoder_2=text_encoder_2,
-            tokenizer=tokenizer,
-            tokenizer_2=tokenizer_2,
-            unet=unet,
-            scheduler=scheduler,
-            device=device,
-            t5_encoder=t5_encoder,
-            t5_tokenizer=t5_tokenizer,
-            lyra_model=lyra_model,
-            clip_skip=1
-        )
     else:
-        # SD1.5-based models
-        vae = AutoencoderKL.from_pretrained(
-            "runwayml/stable-diffusion-v1-5",
-            subfolder="vae",
-            torch_dtype=torch.float32
-        ).to(device)
-        text_encoder = CLIPTextModel.from_pretrained(
-            "openai/clip-vit-large-patch14",
-            torch_dtype=torch.float32
-        ).to(device)
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-        # T5-base for SD1.5 Lyra
-        print("Loading T5-base encoder...")
-        t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
-        t5_encoder = T5EncoderModel.from_pretrained(
-            "t5-base",
-            torch_dtype=torch.float32
-        ).to(device)
-        t5_encoder.eval()
-        print("✓ T5-base loaded")
-        # Lyra (SD1.5 version)
-        # May be None: load_lyra_vae returns None on failure.
-        lyra_model = load_lyra_vae(device=device)
-        # Load UNet
-        if is_lune:
-            repo_id = "AbstractPhil/sd15-flow-lune"
-            filename = "sd15_flow_lune_e34_s34000.pt"
-            unet = load_lune_checkpoint(repo_id, filename, device)
-        else:
-            unet = UNet2DConditionModel.from_pretrained(
-                "runwayml/stable-diffusion-v1-5",
-                subfolder="unet",
-                torch_dtype=torch.float32
-            ).to(device)
-        scheduler = EulerDiscreteScheduler.from_pretrained(
-            "runwayml/stable-diffusion-v1-5",
-            subfolder="scheduler"
-        )
-        pipeline = SD15FlowMatchingPipeline(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            device=device,
-            t5_encoder=t5_encoder,
-            t5_tokenizer=t5_tokenizer,
-            lyra_model=lyra_model
         )
-        # The Lune flag is only attached on the SD1.5 branch.
-        pipeline.is_lune_model = is_lune
-    print("✅ Pipeline initialized!")
     return pipeline
@@ -1286,15 +757,20 @@ def initialize_pipeline(model_choice: str, device: str = "cuda"):
 # Module-level cache read and updated by get_pipeline(): the pipeline built
 # most recently and the model name it was built for.
 CURRENT_PIPELINE = None
 CURRENT_MODEL = None
-def get_pipeline(model_choice: str):
     """Get or create pipeline for selected model."""
-    global CURRENT_PIPELINE, CURRENT_MODEL
     # Rebuild only on first use or when the requested model differs from the
     # cached one; otherwise the cached pipeline is returned unchanged.
     if CURRENT_PIPELINE is None or CURRENT_MODEL != model_choice:
-        CURRENT_PIPELINE = initialize_pipeline(model_choice, device="cuda")
         CURRENT_MODEL = model_choice
     return CURRENT_PIPELINE
@@ -1303,35 +779,18 @@ def get_pipeline(model_choice: str):
 # INFERENCE
 # ============================================================================
-def estimate_duration(num_steps: int, width: int, height: int, use_lyra: bool = False, is_sdxl: bool = False) -> int:
-    """Estimate GPU duration.
-
-    Heuristic: steps * per-step cost * (pixel area relative to 512x512),
-    doubled plus 3 when Lyra is enabled, plus a flat 20 of headroom, then
-    truncated to int. Units are presumably seconds (the result is passed to
-    `spaces.GPU(duration=...)`) -- TODO confirm.
-    """
-    base_time_per_step = 0.5 if is_sdxl else 0.3  # SDXL steps are budgeted higher
-    resolution_factor = (width * height) / (512 * 512)  # 1.0 at 512x512
-    estimated = num_steps * base_time_per_step * resolution_factor
-    if use_lyra:
-        # Lyra budget: twice the base estimate plus a fixed 3.
-        estimated *= 2
-        estimated += 3
-    return int(estimated + 20)  # flat 20 added before truncation
-@spaces.GPU(duration=lambda *args: estimate_duration(
-    args[5], args[7], args[8], args[11],
-    "SDXL" in args[3] or "Illustrious" in args[3]
-))
 def generate_image(
     prompt: str,
     t5_summary: str,
     negative_prompt: str,
     model_choice: str,
     clip_skip: int,
     num_steps: int,
     cfg_scale: float,
     width: int,
     height: int,
-    shift: float,
-    use_flow_matching: bool,
     use_lyra: bool,
     lyra_strength: float,
     seed: int,
@@ -1347,16 +806,9 @@ def generate_image(
         progress((step + 1) / total, desc=desc)
     try:
-        pipeline = get_pipeline(model_choice)
-        # Determine prediction type based on model
-        is_sdxl = "SDXL" in model_choice or "Illustrious" in model_choice
-        prediction_type = "epsilon"  # SDXL always uses epsilon
-        if not is_sdxl and "Lune" in model_choice:
-            prediction_type = "v_prediction"
-        if not use_lyra or pipeline.lyra_model is None:
             progress(0.05, desc="Generating...")
             image = pipeline(
@@ -1366,9 +818,6 @@ def generate_image(
                 width=width,
                 num_inference_steps=num_steps,
                 guidance_scale=cfg_scale,
-                shift=shift,
-                use_flow_matching=use_flow_matching,
-                prediction_type=prediction_type,
                 seed=seed,
                 use_lyra=False,
                 clip_skip=clip_skip,
@@ -1388,16 +837,13 @@ def generate_image(
                 width=width,
                 num_inference_steps=num_steps,
                 guidance_scale=cfg_scale,
-                shift=shift,
-                use_flow_matching=use_flow_matching,
-                prediction_type=prediction_type,
                 seed=seed,
                 use_lyra=False,
                 clip_skip=clip_skip,
                 progress_callback=lambda s, t, d: progress(0.05 + (s/t) * 0.45, desc=d)
             )
-            progress(0.5, desc="Generating Lyra fusion...")
             image_lyra = pipeline(
                 prompt=prompt,
@@ -1406,9 +852,6 @@ def generate_image(
                 width=width,
                 num_inference_steps=num_steps,
                 guidance_scale=cfg_scale,
-                shift=shift,
-                use_flow_matching=use_flow_matching,
-                prediction_type=prediction_type,
                 seed=seed,
                 use_lyra=True,
                 clip_skip=clip_skip,
@@ -1422,6 +865,8 @@ def generate_image(
     except Exception as e:
         print(f"❌ Generation failed: {e}")
         raise e
@@ -1434,251 +879,93 @@ def create_demo():
     with gr.Blocks() as demo:
         gr.Markdown("""
-        # 🌙 Lyra/Lune Flow-Matching Image Generation
         **Geometric crystalline diffusion** by [AbstractPhil](https://huggingface.co/AbstractPhil)
-        Generate images using SD1.5 and SDXL-based models with geometric deep learning:
         | Model | Architecture | Lyra Version | Best For |
         |-------|-------------|--------------|----------|
         | **Illustrious XL** | SDXL | v2 (T5-XL) | Anime/illustration, high detail |
         | **SDXL Base** | SDXL | v2 (T5-XL) | Photorealistic, general purpose |
-        | **Flow-Lune** | SD1.5 | v1 (T5-base) | Fast flow matching (15-25 steps) |
-        | **SD1.5 Base** | SD1.5 | v1 (T5-base) | Baseline comparison |
-        **Lyra VAE** fuses CLIP + T5 embeddings using:
-        - **Prompt (Tags)**: Booru-style tags for CLIP encoding
-        - **T5 Summary**: Natural language description for T5 (format: `tags ¶ summary`)
-        Enable **Lyra VAE** for side-by-side comparison!
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 prompt = gr.TextArea(
-                    label="Prompt (Tags for CLIP)",
                     value="masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
                     lines=3
                 )
                 t5_summary = gr.TextArea(
-                    label="T5 Summary (Natural Language for Lyra)",
-                    value="A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
                     lines=2,
-                    info="Used after ¶ separator for T5. Leave empty to use tags only."
                 )
                 negative_prompt = gr.TextArea(
                     label="Negative Prompt",
-                    value="lowres, bad anatomy, bad hands, text, error, cropped, worst quality, low quality",
                     lines=2
                 )
-                model_choice = gr.Dropdown(
-                    label="Model",
-                    choices=[
-                        "Illustrious XL",
-                        "SDXL Base",
-                        "Flow-Lune (SD1.5)",
-                        "SD1.5 Base"
-                    ],
-                    value="Illustrious XL"
-                )
                 clip_skip = gr.Slider(
                     label="CLIP Skip",
-                    minimum=1,
-                    maximum=4,
-                    value=2,
-                    step=1,
-                    info="2 recommended for Illustrious, 1 for others"
                 )
                 use_lyra = gr.Checkbox(
-                    label="Enable Lyra VAE (CLIP+T5 Fusion)",
-                    value=True,
                     info="Compare standard vs geometric fusion"
                 )
                 lyra_strength = gr.Slider(
                     label="Lyra Blend Strength",
-                    minimum=0.0,
-                    maximum=3.0,
-                    value=1.0,
-                    step=0.05,
-                    info="0.0 = pure CLIP, 1.0 = pure Lyra reconstruction, 3.0 = way too much but try it anyway"
                 )
                 with gr.Accordion("Generation Settings", open=True):
-                    num_steps = gr.Slider(
-                        label="Steps",
-                        minimum=1,
-                        maximum=50,
-                        value=25,
-                        step=1
-                    )
-                    cfg_scale = gr.Slider(
-                        label="CFG Scale",
-                        minimum=1.0,
-                        maximum=20.0,
-                        value=7.0,
-                        step=0.5
-                    )
                     with gr.Row():
-                        width = gr.Slider(
-                            label="Width",
-                            minimum=512,
-                            maximum=1536,
-                            value=1024,
-                            step=64
-                        )
-                        height = gr.Slider(
-                            label="Height",
-                            minimum=512,
-                            maximum=1536,
-                            value=1024,
-                            step=64
-                        )
-                    seed = gr.Slider(
-                        label="Seed",
-                        minimum=0,
-                        maximum=2**32 - 1,
-                        value=42,
-                        step=1
-                    )
-                    randomize_seed = gr.Checkbox(
-                        label="Randomize Seed",
-                        value=True
-                    )
-                with gr.Accordion("Advanced (Flow Matching)", open=False):
-                    use_flow_matching = gr.Checkbox(
-                        label="Enable Flow Matching",
-                        value=False,
-                        info="Use flow matching ODE (for Lune only)"
-                    )
-                    shift = gr.Slider(
-                        label="Shift",
-                        minimum=0.0,
-                        maximum=5.0,
-                        value=0.0,
-                        step=0.1,
-                        info="Flow matching shift (0=disabled)"
-                    )
                 generate_btn = gr.Button("🎨 Generate", variant="primary", size="lg")
             with gr.Column(scale=1):
                 with gr.Row():
-                    output_image_standard = gr.Image(
-                        label="Standard",
-                        type="pil"
-                    )
-                    output_image_lyra = gr.Image(
-                        label="Lyra Fusion 🎵",
-                        type="pil",
-                        visible=True
-                    )
                 output_seed = gr.Number(label="Seed", precision=0)
-                gr.Markdown("""
-                ### Tips
-                - **Illustrious XL**: Use CLIP skip 2, booru-style tags
-                - **SDXL Base**: Natural language prompts work well
-                - **Flow-Lune**: Enable flow matching, shift ~2.5, fewer steps
-                - **Lyra v2**: SDXL models use T5-XL for richer semantics
-                - **Lyra v1**: SD1.5 models use T5-base
-                ### Model Info
-                - SDXL models use **epsilon** prediction
-                - Lune uses **v_prediction** with flow matching
-                - Lyra fuses CLIP + T5 via geometric Cantor attention
-                """)
-        # Examples
-        gr.Examples(
-            examples=[
-                [
-                    "masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
-                    "A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
-                    "lowres, bad anatomy, worst quality, low quality",
-                    "Illustrious XL",
-                    2, 25, 7.0, 1024, 1024, 0.0, False, True, 0.8, 42, False
-                ],
-                [
-                    "A majestic mountain landscape at golden hour, crystal clear lake, photorealistic, 8k",
-                    "A breathtaking mountain vista bathed in warm golden light at sunset, with a perfectly still crystal clear lake reflecting the peaks",
-                    "blurry, low quality",
-                    "SDXL Base",
-                    1, 30, 7.5, 1024, 1024, 0.0, False, True, 0.8, 123, False
-                ],
-                [
-                    "cyberpunk city at night, neon lights, rain, highly detailed",
-                    "A futuristic cyberpunk metropolis at night with vibrant neon lights reflecting off rain-slicked streets",
-                    "low quality, blurry",
-                    "Flow-Lune (SD1.5)",
-                    1, 20, 7.5, 512, 512, 2.5, True, True, 0.8, 456, False
-                ],
-            ],
-            inputs=[
-                prompt, t5_summary, negative_prompt, model_choice, clip_skip,
-                num_steps, cfg_scale, width, height, shift,
-                use_flow_matching, use_lyra, lyra_strength, seed, randomize_seed
-            ],
-            outputs=[output_image_standard, output_image_lyra, output_seed],
-            fn=generate_image,
-            cache_examples=False
-        )
         # Event handlers
-        def on_model_change(model_name):
-            """Update defaults based on model."""
-            if "Illustrious" in model_name:
-                return {
-                    clip_skip: gr.update(value=2),
-                    width: gr.update(value=1024),
-                    height: gr.update(value=1024),
-                    num_steps: gr.update(value=25),
-                    use_flow_matching: gr.update(value=False),
-                    shift: gr.update(value=0.0)
-                }
-            elif "SDXL" in model_name:
-                return {
-                    clip_skip: gr.update(value=1),
-                    width: gr.update(value=1024),
-                    height: gr.update(value=1024),
-                    num_steps: gr.update(value=30),
-                    use_flow_matching: gr.update(value=False),
-                    shift: gr.update(value=0.0)
-                }
-            elif "Lune" in model_name:
-                return {
-                    clip_skip: gr.update(value=1),
-                    width: gr.update(value=512),
-                    height: gr.update(value=512),
-                    num_steps: gr.update(value=20),
-                    use_flow_matching: gr.update(value=True),
-                    shift: gr.update(value=2.5)
-                }
-            else:  # SD1.5 Base
-                return {
-                    clip_skip: gr.update(value=1),
-                    width: gr.update(value=512),
-                    height: gr.update(value=512),
-                    num_steps: gr.update(value=30),
-                    use_flow_matching: gr.update(value=False),
-                    shift: gr.update(value=0.0)
-                }
         def on_lyra_toggle(enabled):
-            """Show/hide Lyra comparison."""
             if enabled:
                 return {
                     output_image_standard: gr.update(visible=True, label="Standard"),
@@ -1690,12 +977,6 @@ def create_demo():
                     output_image_lyra: gr.update(visible=False)
                 }
-        model_choice.change(
-            fn=on_model_change,
-            inputs=[model_choice],
-            outputs=[clip_skip, width, height, num_steps, use_flow_matching, shift]
-        )
         use_lyra.change(
             fn=on_lyra_toggle,
             inputs=[use_lyra],
@@ -1705,9 +986,9 @@ def create_demo():
         generate_btn.click(
             fn=generate_image,
             inputs=[
-                prompt, t5_summary, negative_prompt, model_choice, clip_skip,
-                num_steps, cfg_scale, width, height, shift,
-                use_flow_matching, use_lyra, lyra_strength, seed, randomize_seed
             ],
             outputs=[output_image_standard, output_image_lyra, output_seed]
         )

 Lyra VAE Versions:
 - v1: SD1.5 (768 dim CLIP + T5-base) - geofractal.model.vae.vae_lyra
+- v2: SDXL/Illustrious (768 CLIP-L + 1280 CLIP-G + 2048 T5-XL) - geofractal.model.vae.vae_lyra_v2
 """
 import os
     UNet2DConditionModel,
     AutoencoderKL,
     EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+    DPMSolverSDEScheduler,
 )
 from transformers import (
     CLIPTextModel,
     CLIPTokenizer,
 )
 from huggingface_hub import hf_hub_download
+# Lazy imports for Lyra
+LYRA_V1_AVAILABLE = False
+LYRA_V2_AVAILABLE = False
+LyraV1 = None
+LyraV1Config = None
+LyraV2 = None
+LyraV2Config = None
+def _load_lyra_imports():
+    """Lazy load Lyra VAE modules."""
+    global LYRA_V1_AVAILABLE, LYRA_V2_AVAILABLE
+    global LyraV1, LyraV1Config, LyraV2, LyraV2Config
+    try:
+        from geofractal.model.vae.vae_lyra import MultiModalVAE as _LyraV1, MultiModalVAEConfig as _LyraV1Config
+        LyraV1 = _LyraV1
+        LyraV1Config = _LyraV1Config
+        LYRA_V1_AVAILABLE = True
+    except ImportError:
+        print("⚠️ Lyra VAE v1 not available")
+    try:
+        from geofractal.model.vae.vae_lyra_v2 import MultiModalVAE as _LyraV2, MultiModalVAEConfig as _LyraV2Config
+        LyraV2 = _LyraV2
+        LyraV2Config = _LyraV2Config
+        LYRA_V2_AVAILABLE = True
+    except ImportError:
+        print("⚠️ Lyra VAE v2 not available")
 # ============================================================================
 # CONSTANTS
 # ============================================================================
 ARCH_SD15 = "sd15"
 ARCH_SDXL = "sdxl"
# Display names for the supported samplers; these exact strings are what
# get_scheduler() compares against.
SCHEDULER_EULER_A = "Euler Ancestral"
SCHEDULER_EULER = "Euler"
SCHEDULER_DPM_2M_SDE = "DPM++ 2M SDE"
SCHEDULER_DPM_2M = "DPM++ 2M"

# Ordered list of every scheduler name available for SDXL models.
SDXL_SCHEDULERS = [
    SCHEDULER_EULER_A,
    SCHEDULER_EULER,
    SCHEDULER_DPM_2M_SDE,
    SCHEDULER_DPM_2M,
]
 # ============================================================================
+# SCHEDULER FACTORY
 # ============================================================================
def get_scheduler(scheduler_name: str, config_path: str = "stabilityai/stable-diffusion-xl-base-1.0"):
    """Create a diffusers scheduler from its display name.

    Args:
        scheduler_name: One of the ``SCHEDULER_*`` display names. Any other
            value falls back to Euler Ancestral.
        config_path: Repo id or path whose ``scheduler`` subfolder provides
            the base scheduler config.

    Returns:
        A scheduler instance built with ``from_pretrained``.
    """
    overrides = {}
    if scheduler_name == SCHEDULER_EULER:
        scheduler_cls = EulerDiscreteScheduler
    elif scheduler_name == SCHEDULER_DPM_2M_SDE:
        # "DPM++ 2M SDE" is the multistep DPM-Solver++ run with its SDE
        # variant. DPMSolverSDEScheduler is a different sampler (DPM++ SDE)
        # and has no algorithm_type / solver_order options, so those
        # overrides would not take effect on it.
        scheduler_cls = DPMSolverMultistepScheduler
        overrides = {"algorithm_type": "sde-dpmsolver++", "solver_order": 2}
    elif scheduler_name == SCHEDULER_DPM_2M:
        scheduler_cls = DPMSolverMultistepScheduler
        overrides = {"algorithm_type": "dpmsolver++", "solver_order": 2}
    else:
        # Euler Ancestral is both an explicit choice and the default for
        # unrecognised names.
        scheduler_cls = EulerAncestralDiscreteScheduler

    return scheduler_cls.from_pretrained(config_path, subfolder="scheduler", **overrides)
+# ============================================================================
+# MODEL LOADING UTILITIES
+# ============================================================================
 def get_clip_hidden_state(
     model_output,
         return model_output.last_hidden_state
     if hasattr(model_output, 'hidden_states') and model_output.hidden_states is not None:
         return model_output.hidden_states[-clip_skip]
     return model_output.last_hidden_state
+# ============================================================================
+# LAZY LOADERS
+# ============================================================================
class LazyT5Encoder:
    """Deferred holder for a T5 encoder and its tokenizer.

    Construction only records the model name and target device. The encoder
    and the tokenizer are each created the first time their property is read
    and are then cached on the instance.
    """

    def __init__(self, model_name: str = "google/flan-t5-xl", device: str = "cuda"):
        self._tokenizer = None
        self._encoder = None
        self.device = device
        self.model_name = model_name

    @property
    def encoder(self):
        """Return the cached encoder, loading it in float16 on first access."""
        if self._encoder is not None:
            return self._encoder

        print(f"📥 Loading T5 encoder: {self.model_name}...")
        loaded = T5EncoderModel.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16
        )
        self._encoder = loaded.to(self.device)
        self._encoder.eval()
        print("✓ T5 encoder loaded")
        return self._encoder

    @property
    def tokenizer(self):
        """Return the cached tokenizer, creating it on first access."""
        if self._tokenizer is not None:
            return self._tokenizer

        print(f"📥 Loading T5 tokenizer: {self.model_name}...")
        self._tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        print("✓ T5 tokenizer loaded")
        return self._tokenizer

    def is_loaded(self):
        """Report whether the encoder (not the tokenizer) has been loaded."""
        return not (self._encoder is None)
class LazyLyraModel:
    """Lazy loader for the Lyra VAE.

    Nothing is imported or downloaded at construction. The first read of
    ``model`` resolves the Lyra imports and builds the model for the
    configured version. A failed load yields ``None`` and is not cached, so
    the next read of ``model`` tries again.
    """

    def __init__(self, repo_id: str, device: str = "cuda", version: int = 2):
        self.repo_id = repo_id
        self.device = device
        self.version = version
        self._model = None

    @property
    def model(self):
        """Return the Lyra model, loading it on first access (``None`` on failure)."""
        if self._model is None:
            _load_lyra_imports()
            if self.version == 2:
                self._model = self._load_v2()
            else:
                self._model = self._load_v1()
        return self._model

    @staticmethod
    def _select_checkpoint(repo_files):
        """Return the checkpoint file with the highest step number, or ``None``.

        Only names ending in ``.pt`` that contain "checkpoint"
        (case-insensitive) are considered. The step is the first run of digits
        immediately followed by ``.pt``; names without one rank as step 0.
        Ties keep the name that appears first in ``repo_files``.
        """
        import re

        step_pattern = re.compile(r'(\d+)\.pt')

        def step_of(name):
            found = step_pattern.search(name)
            return int(found.group(1)) if found else 0

        candidates = [
            name for name in repo_files
            if name.endswith('.pt') and 'checkpoint' in name.lower()
        ]
        # max() returns the first maximal element, matching a stable
        # descending sort followed by taking index 0.
        return max(candidates, key=step_of) if candidates else None

    @staticmethod
    def _v2_config_kwargs(config_dict):
        """Return ``LyraV2Config`` keyword arguments from a parsed ``config.json``.

        Every key missing from ``config_dict`` falls back to the default
        listed here. The defaults are rebuilt on each call so the nested
        dicts are never shared between configs.
        """
        defaults = {
            'modality_dims': {
                "clip_l": 768, "clip_g": 1280,
                "t5_xl_l": 2048, "t5_xl_g": 2048
            },
            'modality_seq_lens': {
                "clip_l": 77, "clip_g": 77,
                "t5_xl_l": 512, "t5_xl_g": 512
            },
            'binding_config': {
                "clip_l": {"t5_xl_l": 0.3},
                "clip_g": {"t5_xl_g": 0.3},
                "t5_xl_l": {},
                "t5_xl_g": {}
            },
            'latent_dim': 2048,
            'seq_len': 77,
            'encoder_layers': 3,
            'decoder_layers': 3,
            'hidden_dim': 2048,
            'dropout': 0.1,
            'fusion_strategy': 'adaptive_cantor',
            'fusion_heads': 8,
            'fusion_dropout': 0.1,
            'cantor_depth': 8,
            'cantor_local_window': 3,
            'alpha_init': 1.0,
            'beta_init': 0.3,
        }
        return {key: config_dict.get(key, fallback) for key, fallback in defaults.items()}

    def _load_v2(self):
        """Download and build Lyra VAE v2.

        Returns:
            The model moved to ``self.device`` in eval mode, or ``None`` when
            the v2 classes are unavailable or any step of the load fails (the
            error and traceback are printed).
        """
        if not LYRA_V2_AVAILABLE:
            print("⚠️ Lyra VAE v2 not available")
            return None
        print(f"🎵 Loading Lyra VAE v2 from {self.repo_id}...")
        try:
            from huggingface_hub import list_repo_files

            config_path = hf_hub_download(
                repo_id=self.repo_id,
                filename="config.json",
                repo_type="model"
            )
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(config_path, 'r', encoding="utf-8") as f:
                config_dict = json.load(f)
            print(f"  ✓ Config: {config_dict.get('fusion_strategy', 'unknown')} fusion")

            # Auto-detect the newest checkpoint in the repo.
            repo_files = list_repo_files(self.repo_id, repo_type="model")
            checkpoint_filename = self._select_checkpoint(repo_files)
            if checkpoint_filename is None:
                raise FileNotFoundError(f"No checkpoint found in {self.repo_id}")
            print(f"  ✓ Using: {checkpoint_filename}")
            checkpoint_path = hf_hub_download(
                repo_id=self.repo_id,
                filename=checkpoint_filename,
                repo_type="model"
            )

            # NOTE(security): depending on the installed torch version's
            # `weights_only` default, torch.load may unpickle arbitrary
            # objects from this downloaded file. Only point repo_id at a
            # trusted repository, or switch to weights_only=True once the
            # checkpoint format is confirmed to be tensors-only.
            checkpoint = torch.load(checkpoint_path, map_location="cpu")

            vae_config = LyraV2Config(**self._v2_config_kwargs(config_dict))
            lyra_model = LyraV2(vae_config)

            # Accept either a training checkpoint wrapping the weights or a
            # bare state dict.
            state_dict = checkpoint.get('model_state_dict', checkpoint)
            missing, unexpected = lyra_model.load_state_dict(state_dict, strict=False)
            if missing:
                print(f"  ⚠️ Missing keys: {len(missing)}")
            if unexpected:
                print(f"  ⚠️ Unexpected keys: {len(unexpected)}")

            lyra_model.to(self.device)
            lyra_model.eval()
            total_params = sum(p.numel() for p in lyra_model.parameters())
            print(f"✅ Lyra VAE v2 loaded ({total_params/1e6:.1f}M params)")
            return lyra_model
        except Exception as e:
            # Top-level boundary for the lazy load: report and degrade to
            # "no Lyra" instead of crashing the caller.
            print(f"❌ Failed to load Lyra VAE v2: {e}")
            import traceback
            traceback.print_exc()
            return None

    def _load_v1(self):
        """Placeholder for the v1 loader; always returns ``None``."""
        if not LYRA_V1_AVAILABLE:
            print("⚠️ Lyra VAE v1 not available")
            return None
        # The v1 loading path is not implemented; say so instead of silently
        # returning None.
        print("⚠️ Lyra VAE v1 loading is not implemented")
        return None

    def is_loaded(self):
        """Report whether a model has been successfully loaded and cached."""
        return self._model is not None
 # ============================================================================
 # SDXL PIPELINE
 # ============================================================================
     def __init__(
         self,
         vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
         tokenizer: CLIPTokenizer,
         tokenizer_2: CLIPTokenizer,
         unet: UNet2DConditionModel,
         scheduler,
         device: str = "cuda",
+        t5_loader: Optional[LazyT5Encoder] = None,
+        lyra_loader: Optional[LazyLyraModel] = None,
         clip_skip: int = 1
     ):
         self.vae = vae
         self.scheduler = scheduler
         self.device = device
+        # Lazy loaders
+        self.t5_loader = t5_loader
+        self.lyra_loader = lyra_loader
         # Settings
         self.clip_skip = clip_skip
+        self.vae_scale_factor = 0.13025
         self.arch = ARCH_SDXL
+    def set_scheduler(self, scheduler_name: str):
+        """Switch scheduler."""
+        self.scheduler = get_scheduler(scheduler_name)
+    @property
+    def t5_encoder(self):
+        return self.t5_loader.encoder if self.t5_loader else None
+    @property
+    def t5_tokenizer(self):
+        return self.t5_loader.tokenizer if self.t5_loader else None
+    @property
+    def lyra_model(self):
+        return self.lyra_loader.model if self.lyra_loader else None
     def encode_prompt(
         self,
         prompt: str,
                 output_hidden_states=output_hidden_states
             )
             prompt_embeds_g = get_clip_hidden_state(clip_g_output, clip_skip, output_hidden_states)
             pooled_prompt_embeds = clip_g_output.text_embeds
         prompt_embeds = torch.cat([prompt_embeds_l, prompt_embeds_g], dim=-1)
         # Negative prompt
         t5_summary: str = "",
         lyra_strength: float = 0.3
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Encode prompts using Lyra VAE v2 fusion (CLIP + T5)."""
         if self.lyra_model is None or self.t5_encoder is None:
             raise ValueError("Lyra VAE components not initialized")
             prompt, negative_prompt, clip_skip
         )
+        # Format T5 input
         SUMMARY_SEPARATOR = "¶"
         if t5_summary.strip():
             t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {t5_summary}"
             t5_embeds = self.t5_encoder(**t5_inputs).last_hidden_state
         clip_l_dim = 768
         clip_l_embeds = prompt_embeds[..., :clip_l_dim]
         clip_g_embeds = prompt_embeds[..., clip_l_dim:]
         with torch.no_grad():
             modality_inputs = {
                 'clip_l': clip_l_embeds.float(),
                 'clip_g': clip_g_embeds.float(),
                 modality_inputs,
                 target_modalities=['clip_l', 'clip_g']
             )
             lyra_clip_l = reconstructions['clip_l'].to(prompt_embeds.dtype)
             lyra_clip_g = reconstructions['clip_g'].to(prompt_embeds.dtype)
+            # Normalize if stats are off
             clip_l_std_ratio = lyra_clip_l.std() / (clip_l_embeds.std() + 1e-8)
             clip_g_std_ratio = lyra_clip_g.std() / (clip_g_embeds.std() + 1e-8)
             if clip_l_std_ratio > 2.0 or clip_l_std_ratio < 0.5:
                 lyra_clip_l = (lyra_clip_l - lyra_clip_l.mean()) / (lyra_clip_l.std() + 1e-8)
                 lyra_clip_l = lyra_clip_l * clip_l_embeds.std() + clip_l_embeds.mean()
             if clip_g_std_ratio > 2.0 or clip_g_std_ratio < 0.5:
                 lyra_clip_g = (lyra_clip_g - lyra_clip_g.mean()) / (lyra_clip_g.std() + 1e-8)
                 lyra_clip_g = lyra_clip_g * clip_g_embeds.std() + clip_g_embeds.mean()
+        # Blend
         fused_clip_l = (1 - lyra_strength) * clip_l_embeds + lyra_strength * lyra_clip_l
         fused_clip_g = (1 - lyra_strength) * clip_g_embeds + lyra_strength * lyra_clip_g
         prompt_embeds_fused = torch.cat([fused_clip_l, fused_clip_g], dim=-1)
+        # Negative prompt - just use original CLIP
+        return prompt_embeds_fused, negative_prompt_embeds, pooled, negative_pooled
     def _get_add_time_ids(
         self,
         negative_prompt: str = "",
         height: int = 1024,
         width: int = 1024,
+        num_inference_steps: int = 25,
+        guidance_scale: float = 7.0,
         seed: Optional[int] = None,
         use_lyra: bool = False,
+        clip_skip: int = 2,
         t5_summary: str = "",
         lyra_strength: float = 1.0,
         progress_callback=None
     ):
         """Generate image using SDXL architecture."""
         if seed is not None:
             generator = torch.Generator(device=self.device).manual_seed(seed)
         else:
             generator = None
         # Encode prompts
+        if use_lyra and self.lyra_loader is not None:
             prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
                 prompt, negative_prompt, clip_skip, t5_summary, lyra_strength
             )
         self.scheduler.set_timesteps(num_inference_steps, device=self.device)
         timesteps = self.scheduler.timesteps
+        latents = latents * self.scheduler.init_noise_sigma
+        # Time embeddings for SDXL
         original_size = (height, width)
         target_size = (height, width)
         crops_coords_top_left = (0, 0)
         add_time_ids = self._get_add_time_ids(
             original_size, crops_coords_top_left, target_size, dtype=torch.float16
         )
+        negative_add_time_ids = add_time_ids
         # Denoising loop
         for i, t in enumerate(timesteps):
             if progress_callback:
                 progress_callback(i, num_inference_steps, f"Step {i+1}/{num_inference_steps}")
             latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1.0 else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
             timestep = t.expand(latent_model_input.shape[0])
             if guidance_scale > 1.0:
                 text_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
                 add_text_embeds = torch.cat([negative_pooled, pooled])
                 add_text_embeds = pooled
                 add_time_ids_input = add_time_ids
             added_cond_kwargs = {
                 "text_embeds": add_text_embeds,
                 "time_ids": add_time_ids_input
             }
             noise_pred = self.unet(
                 latent_model_input,
                 timestep,
                 return_dict=False
             )[0]
             if guidance_scale > 1.0:
                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
         # Decode
         latents = latents / self.vae_scale_factor
         with torch.no_grad():
             image = self.vae.decode(latents.to(self.vae.dtype)).sample
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         image = (image * 255).round().astype("uint8")
 # MODEL LOADERS
 # ============================================================================
 def load_illustrious_xl(
+    repo_id: str = "AbstractPhil/illustrious-xl-v1",
     filename: str = "illustriousXL_v01.safetensors",
     device: str = "cuda"
 ) -> Tuple[UNet2DConditionModel, AutoencoderKL, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, CLIPTokenizer]:
+    """Load Illustrious XL from single safetensors file."""
     from diffusers import StableDiffusionXLPipeline
     print(f"📥 Loading Illustrious XL: {repo_id}/{filename}")
     checkpoint_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
     print(f"✓ Downloaded: {checkpoint_path}")
+    print("📦 Loading pipeline...")
     pipe = StableDiffusionXLPipeline.from_single_file(
         checkpoint_path,
         torch_dtype=torch.float16,
         use_safetensors=True,
     )
     unet = pipe.unet.to(device)
     vae = pipe.vae.to(device)
     text_encoder = pipe.text_encoder.to(device)
     tokenizer = pipe.tokenizer
     tokenizer_2 = pipe.tokenizer_2
     del pipe
     torch.cuda.empty_cache()
     print("✅ Illustrious XL loaded!")
     return unet, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2
 # ============================================================================
 # PIPELINE INITIALIZATION
 # ============================================================================
def initialize_sdxl_pipeline(
    model_choice: str,
    scheduler_name: str = SCHEDULER_EULER_A,
    device: str = "cuda"
):
    """Build an SDXL pipeline whose T5 and Lyra components load lazily.

    Args:
        model_choice: Model label. Any label containing "Illustrious" loads
            Illustrious XL; every other label loads SDXL Base.
        scheduler_name: Display name passed to ``get_scheduler``.
        device: Device the UNet, VAE and text encoders are moved to.

    Returns:
        The assembled ``SDXLFlowMatchingPipeline``.
    """
    print(f"🚀 Initializing {model_choice} pipeline...")

    # Base model components, in the order load_illustrious_xl returns them.
    if "Illustrious" in model_choice:
        components = load_illustrious_xl(device=device)
    else:
        # SDXL Base
        from diffusers import StableDiffusionXLPipeline
        base = StableDiffusionXLPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch.float16,
        )
        components = (
            base.unet.to(device),
            base.vae.to(device),
            base.text_encoder.to(device),
            base.text_encoder_2.to(device),
            base.tokenizer,
            base.tokenizer_2,
        )
        # Drop the wrapper pipeline; only the individual modules are kept.
        del base
        torch.cuda.empty_cache()
    unet, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2 = components

    # Lazy loaders only record their settings here; nothing is downloaded yet.
    t5_loader = LazyT5Encoder(model_name="google/flan-t5-xl", device=device)
    lyra_loader = LazyLyraModel(
        repo_id="AbstractPhil/vae-lyra-xl-adaptive-cantor-illustrious",
        device=device,
        version=2
    )

    scheduler = get_scheduler(scheduler_name)

    pipeline = SDXLFlowMatchingPipeline(
        vae=vae,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        unet=unet,
        scheduler=scheduler,
        device=device,
        t5_loader=t5_loader,
        lyra_loader=lyra_loader,
        clip_skip=2
    )
    print("✅ Pipeline initialized (T5/Lyra will load on first use)")
    return pipeline
 CURRENT_PIPELINE = None
 CURRENT_MODEL = None
+CURRENT_SCHEDULER = None
def get_pipeline(model_choice: str, scheduler_name: str = SCHEDULER_EULER_A):
    """Return the cached pipeline, rebuilding or re-scheduling it when needed.

    A new pipeline is built when none exists yet or when ``model_choice``
    differs from the cached model. If only the scheduler name differs, the
    cached pipeline's scheduler is swapped in place. Otherwise the cached
    pipeline is returned unchanged.
    """
    global CURRENT_PIPELINE, CURRENT_MODEL, CURRENT_SCHEDULER

    needs_rebuild = CURRENT_PIPELINE is None or CURRENT_MODEL != model_choice
    if needs_rebuild:
        CURRENT_PIPELINE = initialize_sdxl_pipeline(model_choice, scheduler_name, device="cuda")
        CURRENT_MODEL = model_choice
    elif CURRENT_SCHEDULER != scheduler_name:
        CURRENT_PIPELINE.set_scheduler(scheduler_name)
    else:
        # Same model and same scheduler: nothing to update.
        return CURRENT_PIPELINE

    # Reached after a rebuild or a scheduler swap.
    CURRENT_SCHEDULER = scheduler_name
    return CURRENT_PIPELINE
 # INFERENCE
 # ============================================================================
+@spaces.GPU(duration=120)
 def generate_image(
     prompt: str,
     t5_summary: str,
     negative_prompt: str,
     model_choice: str,
+    scheduler_name: str,
     clip_skip: int,
     num_steps: int,
     cfg_scale: float,
     width: int,
     height: int,
     use_lyra: bool,
     lyra_strength: float,
     seed: int,
         progress((step + 1) / total, desc=desc)
     try:
+        pipeline = get_pipeline(model_choice, scheduler_name)
+        if not use_lyra or pipeline.lyra_loader is None:
             progress(0.05, desc="Generating...")
             image = pipeline(
                 width=width,
                 num_inference_steps=num_steps,
                 guidance_scale=cfg_scale,
                 seed=seed,
                 use_lyra=False,
                 clip_skip=clip_skip,
                 width=width,
                 num_inference_steps=num_steps,
                 guidance_scale=cfg_scale,
                 seed=seed,
                 use_lyra=False,
                 clip_skip=clip_skip,
                 progress_callback=lambda s, t, d: progress(0.05 + (s/t) * 0.45, desc=d)
             )
+            progress(0.5, desc="Loading Lyra + T5 (first run only)...")
             image_lyra = pipeline(
                 prompt=prompt,
                 width=width,
                 num_inference_steps=num_steps,
                 guidance_scale=cfg_scale,
                 seed=seed,
                 use_lyra=True,
                 clip_skip=clip_skip,
     except Exception as e:
         print(f"❌ Generation failed: {e}")
+        import traceback
+        traceback.print_exc()
         raise e
     with gr.Blocks() as demo:
         gr.Markdown("""
+        # 🌙 Lyra/Illustrious XL Image Generation
         **Geometric crystalline diffusion** by [AbstractPhil](https://huggingface.co/AbstractPhil)
         | Model | Architecture | Lyra Version | Best For |
         |-------|-------------|--------------|----------|
         | **Illustrious XL** | SDXL | v2 (T5-XL) | Anime/illustration, high detail |
         | **SDXL Base** | SDXL | v2 (T5-XL) | Photorealistic, general purpose |
+        **Lyra VAE** fuses CLIP + T5-XL embeddings using adaptive Cantor attention.
+        T5 and Lyra only load when you enable the Lyra checkbox!
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 prompt = gr.TextArea(
+                    label="Prompt",
                     value="masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
                     lines=3
                 )
                 t5_summary = gr.TextArea(
+                    label="T5 Summary (for Lyra)",
+                    value="A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms",
                     lines=2,
+                    info="Natural language description for T5. Leave empty to use prompt."
                 )
                 negative_prompt = gr.TextArea(
                     label="Negative Prompt",
+                    value="lowres, bad anatomy, bad hands, text, error, worst quality, low quality",
                     lines=2
                 )
+                with gr.Row():
+                    model_choice = gr.Dropdown(
+                        label="Model",
+                        choices=["Illustrious XL", "SDXL Base"],
+                        value="Illustrious XL"
+                    )
+                    scheduler_name = gr.Dropdown(
+                        label="Scheduler",
+                        choices=SDXL_SCHEDULERS,
+                        value=SCHEDULER_EULER_A
+                    )
                 clip_skip = gr.Slider(
                     label="CLIP Skip",
+                    minimum=1, maximum=4, value=2, step=1,
+                    info="2 recommended for Illustrious"
                 )
                 use_lyra = gr.Checkbox(
+                    label="Enable Lyra VAE (loads T5-XL on first use)",
+                    value=False,
                     info="Compare standard vs geometric fusion"
                 )
                 lyra_strength = gr.Slider(
                     label="Lyra Blend Strength",
+                    minimum=0.0, maximum=2.0, value=1.0, step=0.05,
+                    info="0.0 = pure CLIP, 1.0 = pure Lyra"
                 )
                 with gr.Accordion("Generation Settings", open=True):
+                    num_steps = gr.Slider(label="Steps", minimum=1, maximum=50, value=25, step=1)
+                    cfg_scale = gr.Slider(label="CFG Scale", minimum=1.0, maximum=15.0, value=7.0, step=0.5)
                     with gr.Row():
+                        width = gr.Slider(label="Width", minimum=512, maximum=1536, value=1024, step=64)
+                        height = gr.Slider(label="Height", minimum=512, maximum=1536, value=1024, step=64)
+                    seed = gr.Slider(label="Seed", minimum=0, maximum=2**32 - 1, value=42, step=1)
+                    randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                 generate_btn = gr.Button("🎨 Generate", variant="primary", size="lg")
             with gr.Column(scale=1):
                 with gr.Row():
+                    output_image_standard = gr.Image(label="Standard", type="pil")
+                    output_image_lyra = gr.Image(label="Lyra Fusion 🎵", type="pil", visible=True)
                 output_seed = gr.Number(label="Seed", precision=0)
         # Event handlers
         def on_lyra_toggle(enabled):
             if enabled:
                 return {
                     output_image_standard: gr.update(visible=True, label="Standard"),
                     output_image_lyra: gr.update(visible=False)
                 }
         use_lyra.change(
             fn=on_lyra_toggle,
             inputs=[use_lyra],
         generate_btn.click(
             fn=generate_image,
             inputs=[
+                prompt, t5_summary, negative_prompt, model_choice, scheduler_name,
+                clip_skip, num_steps, cfg_scale, width, height,
+                use_lyra, lyra_strength, seed, randomize_seed
             ],
             outputs=[output_image_standard, output_image_lyra, output_seed]
         )