Backup-bdg committed on
Commit
39827f9
·
verified ·
1 Parent(s): 1fcfda0

Update model weights after training (epoch 7, loss 5.3543)

Browse files
audio_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d817de2ba9f31539807a8d57d1ad5441f33794329008e0a6b9e01764b831f909
3
  size 1458410612
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1628ac8faba5b54287f88aae6cd7885ccfa61f06ea0a09b08f01e91463b96df6
3
  size 1458410612
config.json CHANGED
@@ -49,7 +49,7 @@
49
  "image_size_step": 32,
50
  "video_min_size": 128,
51
  "video_max_size": 320,
52
- "video_base_size": 128,
53
  "video_size_step": 32,
54
  "video_min_frames": 8,
55
  "video_max_frames": 8,
 
49
  "image_size_step": 32,
50
  "video_min_size": 128,
51
  "video_max_size": 320,
52
+ "video_base_size": 320,
53
  "video_size_step": 32,
54
  "video_min_frames": 8,
55
  "video_max_frames": 8,
configuration_xoron.py CHANGED
@@ -12,11 +12,11 @@ Usage:
12
  config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
13
  """
14
 
15
- from transformers import PreTrainedConfig
16
- from typing import List, Tuple, Union
17
 
18
 
19
- class XoronConfig(PreTrainedConfig):
20
  """
21
  Configuration class for Xoron-Dev multimodal model.
22
 
@@ -57,317 +57,354 @@ class XoronConfig(PreTrainedConfig):
57
  - FP16-native numerical stability
58
  - Multi-scale training for variable resolution handling
59
  """
60
-
61
- model_type = "xoron"
62
-
63
- def __init__(
64
- self,
65
- # Model identification
66
- model_name: str = "Xoron-Dev-MultiMoE",
67
-
68
- # LLM Architecture
69
- hidden_size: int = 1024,
70
- num_layers: int = 12,
71
- num_heads: int = 16,
72
- intermediate_size: int = 2048,
73
- vocab_size: int = 151643,
74
- max_position_embeddings: int = 131072,
75
- rms_norm_eps: float = 1e-6,
76
-
77
- # Ring Attention
78
- use_ring_attention: bool = True,
79
- ring_attention_chunk_size: int = 4096,
80
-
81
- # Tie word embeddings
82
- tie_word_embeddings: bool = True,
83
-
84
- # MoE Configuration
85
- use_moe: bool = True,
86
- num_experts: int = 8,
87
- num_experts_per_tok: int = 2,
88
- moe_layer_freq: int = 2,
89
- use_shared_expert: bool = True,
90
- moe_capacity_factor: float = 1.25,
91
- use_aux_lossless: bool = True,
92
-
93
- # Vision Configuration
94
- vision_model_name: str = "google/siglip-so400m-patch14-384",
95
- freeze_vision: bool = False,
96
- num_vision_tokens: int = 64,
97
- projector_type: str = "perceiver",
98
-
99
- # Vision Encoder SOTA Features
100
- use_vision_dual_stream: bool = True,
101
- use_vision_titok: bool = True,
102
- num_vision_titok_tokens: int = 256,
103
- num_vision_dual_stream_layers: int = 2,
104
-
105
- # Video Encoder SOTA Features
106
- use_video_3d_rope: bool = True,
107
- use_video_temporal_moe: bool = True,
108
- num_video_encoder_layers: int = 4,
109
- num_video_experts: int = 4,
110
- use_video_vidtok: bool = True,
111
- vidtok_latent_channels: int = 4,
112
- vidtok_temporal_compression: int = 4,
113
- vidtok_spatial_compression: int = 8,
114
- vidtok_causal: bool = True,
115
- vidtok_use_fsq: bool = False,
116
-
117
- # VideoTiTokTokenizer Configuration (SOTA: TiTok-style 1D tokenization for video)
118
- use_video_titok: bool = True,
119
- num_video_titok_tokens: int = 64,
120
- num_video_titok_layers: int = 2,
121
- num_video_titok_heads: int = 8,
122
- video_titok_dropout: float = 0.1,
123
-
124
- # Continuous-Scale Training Configuration
125
- use_multi_scale: bool = True,
126
- use_continuous_scale: bool = True,
127
- image_min_size: int = 128,
128
- image_max_size: int = 384,
129
- image_base_size: int = 256,
130
- image_size_step: int = 32,
131
- video_min_size: int = 128,
132
- video_max_size: int = 320,
133
- video_base_size: int = 192,
134
- video_size_step: int = 32,
135
- video_min_frames: int = 8,
136
- video_max_frames: int = 24,
137
- video_base_frames: int = 16,
138
- video_frame_step: int = 4,
139
- multi_scale_strategy: str = "adaptive",
140
- multi_scale_warmup_epochs: int = 3,
141
- adaptive_scale_oom_penalty: float = 0.5,
142
- adaptive_scale_success_boost: float = 0.1,
143
- generation_supported_sizes: Union[List[int], Tuple[int, ...]] = (192, 256, 320, 384),
144
- generation_supported_frames: Union[List[int], Tuple[int, ...]] = (8, 12, 16, 20, 24),
145
-
146
- # Image Generation Configuration
147
- enable_generation: bool = True,
148
- generation_latent_channels: int = 4,
149
- generation_base_channels: int = 128,
150
- generation_inference_steps: int = 50,
151
- generation_cfg_scale: float = 7.5,
152
- generation_use_flow_matching: bool = True,
153
- generation_num_experts: int = 4,
154
- generation_use_dual_stream: bool = True,
155
-
156
- # Video Generation Configuration
157
- generation_video_cfg_scale: float = 7.5,
158
- generation_video_use_flow_matching: bool = True,
159
- generation_video_num_experts: int = 4,
160
- generation_video_use_3d_rope: bool = True,
161
- generation_video_use_temporal_moe: bool = True,
162
-
163
- # Audio Configuration
164
- audio_sample_rate: int = 16000,
165
- audio_n_mels: int = 80,
166
- audio_max_length: int = 625, # Max mel frames (10 seconds at 16kHz with hop=256)
167
- audio_max_waveform_samples: int = 160000, # Max raw waveform (10 seconds at 16kHz)
168
- audio_num_speakers: int = 256,
169
- use_raw_waveform: bool = True,
170
- audio_kv_lora_rank: int = 256,
171
- audio_speaker_embed_dim: int = 256,
172
- use_mas: bool = True,
173
- use_in_context_audio_prompting: bool = True,
174
-
175
- # Tokenizer Configuration
176
- tokenizer_name: str = "Qwen/Qwen2.5-1.5B",
177
-
178
- # LoRA Configuration
179
- use_lora: bool = True,
180
- lora_r: int = 32,
181
- lora_alpha: int = 64,
182
- lora_dropout: float = 0.05,
183
- lora_target_modules: Union[List[str], Tuple[str, ...]] = (
184
- 'q_proj', 'k_proj', 'v_proj', 'o_proj',
185
- 'gate_proj', 'up_proj', 'down_proj',
186
- ),
187
- train_lora_only: bool = False,
188
- use_rslora: bool = True,
189
- use_dora: bool = False,
190
- lora_plus_lr_ratio: float = 4.0,
191
-
192
- # Cross-Attention Configuration
193
- use_cross_attention: bool = True,
194
- cross_attention_layers: int = 4,
195
- cross_attention_heads: int = 8,
196
- cross_attention_dropout: float = 0.1,
197
-
198
- # Flash Attention Configuration
199
- use_flash_attention: bool = True,
200
-
201
- # Architecture flags (set during save to track what components exist)
202
- has_audio_encoder: bool = True,
203
- has_audio_decoder: bool = True,
204
- has_waveform_decoder: bool = True,
205
- has_vision_encoder: bool = True,
206
- has_video_encoder: bool = True,
207
- has_generator: bool = True,
208
- has_video_generator: bool = True,
209
- has_cross_attention: bool = True,
210
- lora_applied: bool = False,
211
- architecture_version: int = 2,
212
-
213
- # Output path (used during training)
214
- output_dir: str = "./xoron-model",
215
-
216
- # Training Configuration
217
- modality_dropout_prob: float = 0.0,
218
-
219
- **kwargs,
220
  ):
221
- # Call parent init
222
- super().__init__(**kwargs)
223
-
224
- # Training Configuration
225
- self.modality_dropout_prob = modality_dropout_prob
226
-
227
- # Model identification
228
- self.model_name = model_name
229
-
230
- # LLM Architecture
231
- self.hidden_size = hidden_size
232
- self.num_layers = num_layers
233
- self.num_heads = num_heads
234
- self.intermediate_size = intermediate_size
235
- self.vocab_size = vocab_size
236
- self.max_position_embeddings = max_position_embeddings
237
- self.rms_norm_eps = rms_norm_eps
238
-
239
- # Ring Attention
240
- self.use_ring_attention = use_ring_attention
241
- self.ring_attention_chunk_size = ring_attention_chunk_size
242
-
243
- # Tie word embeddings
244
- self.tie_word_embeddings = tie_word_embeddings
245
-
246
- # MoE Configuration
247
- self.use_moe = use_moe
248
- self.num_experts = num_experts
249
- self.num_experts_per_tok = num_experts_per_tok
250
- self.moe_layer_freq = moe_layer_freq
251
- self.use_shared_expert = use_shared_expert
252
- self.moe_capacity_factor = moe_capacity_factor
253
- self.use_aux_lossless = use_aux_lossless
254
-
255
- # Vision Configuration
256
- self.vision_model_name = vision_model_name
257
- self.freeze_vision = freeze_vision
258
- self.num_vision_tokens = num_vision_tokens
259
- self.projector_type = projector_type
260
-
261
- # Vision Encoder SOTA Features
262
- self.use_vision_dual_stream = use_vision_dual_stream
263
- self.use_vision_titok = use_vision_titok
264
- self.num_vision_titok_tokens = num_vision_titok_tokens
265
- self.num_vision_dual_stream_layers = num_vision_dual_stream_layers
266
-
267
- # Video Encoder SOTA Features
268
- self.use_video_3d_rope = use_video_3d_rope
269
- self.use_video_temporal_moe = use_video_temporal_moe
270
- self.num_video_encoder_layers = num_video_encoder_layers
271
- self.num_video_experts = num_video_experts
272
- self.use_video_vidtok = use_video_vidtok
273
- self.vidtok_latent_channels = vidtok_latent_channels
274
- self.vidtok_temporal_compression = vidtok_temporal_compression
275
- self.vidtok_spatial_compression = vidtok_spatial_compression
276
- self.vidtok_causal = vidtok_causal
277
- self.vidtok_use_fsq = vidtok_use_fsq
278
-
279
- # VideoTiTokTokenizer Configuration
280
- self.use_video_titok = use_video_titok
281
- self.num_video_titok_tokens = num_video_titok_tokens
282
- self.num_video_titok_layers = num_video_titok_layers
283
- self.num_video_titok_heads = num_video_titok_heads
284
- self.video_titok_dropout = video_titok_dropout
285
-
286
- # Continuous-Scale Training Configuration
287
- self.use_multi_scale = use_multi_scale
288
- self.use_continuous_scale = use_continuous_scale
289
- self.image_min_size = image_min_size
290
- self.image_max_size = image_max_size
291
- self.image_base_size = image_base_size
292
- self.image_size_step = image_size_step
293
- self.video_min_size = video_min_size
294
- self.video_max_size = video_max_size
295
- self.video_base_size = video_base_size
296
- self.video_size_step = video_size_step
297
- self.video_min_frames = video_min_frames
298
- self.video_max_frames = video_max_frames
299
- self.video_base_frames = video_base_frames
300
- self.video_frame_step = video_frame_step
301
- self.multi_scale_strategy = multi_scale_strategy
302
- self.multi_scale_warmup_epochs = multi_scale_warmup_epochs
303
- self.adaptive_scale_oom_penalty = adaptive_scale_oom_penalty
304
- self.adaptive_scale_success_boost = adaptive_scale_success_boost
305
- self.generation_supported_sizes = list(generation_supported_sizes) if not isinstance(generation_supported_sizes, list) else generation_supported_sizes
306
- self.generation_supported_frames = list(generation_supported_frames) if not isinstance(generation_supported_frames, list) else generation_supported_frames
307
-
308
- # Image Generation Configuration
309
- self.enable_generation = enable_generation
310
- self.generation_latent_channels = generation_latent_channels
311
- self.generation_base_channels = generation_base_channels
312
- self.generation_inference_steps = generation_inference_steps
313
- self.generation_cfg_scale = generation_cfg_scale
314
- self.generation_use_flow_matching = generation_use_flow_matching
315
- self.generation_num_experts = generation_num_experts
316
- self.generation_use_dual_stream = generation_use_dual_stream
317
-
318
- # Video Generation Configuration
319
- self.generation_video_cfg_scale = generation_video_cfg_scale
320
- self.generation_video_use_flow_matching = generation_video_use_flow_matching
321
- self.generation_video_num_experts = generation_video_num_experts
322
- self.generation_video_use_3d_rope = generation_video_use_3d_rope
323
- self.generation_video_use_temporal_moe = generation_video_use_temporal_moe
324
-
325
- # Audio Configuration
326
- self.audio_sample_rate = audio_sample_rate
327
- self.audio_n_mels = audio_n_mels
328
- self.audio_max_length = audio_max_length
329
- self.audio_max_waveform_samples = audio_max_waveform_samples
330
- self.audio_num_speakers = audio_num_speakers
331
- self.use_raw_waveform = use_raw_waveform
332
- self.audio_kv_lora_rank = audio_kv_lora_rank
333
- self.audio_speaker_embed_dim = audio_speaker_embed_dim
334
- self.use_mas = use_mas
335
- self.use_in_context_audio_prompting = use_in_context_audio_prompting
336
-
337
- # Tokenizer Configuration
338
- self.tokenizer_name = tokenizer_name
339
-
340
- # LoRA Configuration
341
- self.use_lora = use_lora
342
- self.lora_r = lora_r
343
- self.lora_alpha = lora_alpha
344
- self.lora_dropout = lora_dropout
345
- self.lora_target_modules = list(lora_target_modules) if not isinstance(lora_target_modules, list) else lora_target_modules
346
- self.train_lora_only = train_lora_only
347
- self.use_rslora = use_rslora
348
- self.use_dora = use_dora
349
- self.lora_plus_lr_ratio = lora_plus_lr_ratio
350
-
351
- # Cross-Attention Configuration
352
- self.use_cross_attention = use_cross_attention
353
- self.cross_attention_layers = cross_attention_layers
354
- self.cross_attention_heads = cross_attention_heads
355
- self.cross_attention_dropout = cross_attention_dropout
356
-
357
- # Flash Attention Configuration
358
- self.use_flash_attention = use_flash_attention
359
-
360
- # Architecture flags
361
- self.has_audio_encoder = has_audio_encoder
362
- self.has_audio_decoder = has_audio_decoder
363
- self.has_waveform_decoder = has_waveform_decoder
364
- self.has_vision_encoder = has_vision_encoder
365
- self.has_video_encoder = has_video_encoder
366
- self.has_generator = has_generator
367
- self.has_video_generator = has_video_generator
368
- self.has_cross_attention = has_cross_attention
369
- self.lora_applied = lora_applied
370
- self.architecture_version = architecture_version
371
-
372
- # Output path
373
- self.output_dir = output_dir
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
13
  """
14
 
15
+ from transformers import PreTrainedConfig
16
+ from typing import List ,Tuple ,Union
17
 
18
 
19
+ class XoronConfig (PreTrainedConfig ):
20
  """
21
  Configuration class for Xoron-Dev multimodal model.
22
 
 
57
  - FP16-native numerical stability
58
  - Multi-scale training for variable resolution handling
59
  """
60
+
61
+ model_type ="xoron"
62
+
63
+ def __init__ (
64
+ self ,
65
+
66
+ model_name :str ="Xoron-Dev-MultiMoE",
67
+
68
+
69
+ hidden_size :int =1024 ,
70
+ num_layers :int =12 ,
71
+ num_heads :int =16 ,
72
+ intermediate_size :int =2048 ,
73
+ vocab_size :int =151643 ,
74
+ max_position_embeddings :int =131072 ,
75
+ rms_norm_eps :float =1e-6 ,
76
+
77
+
78
+ use_ring_attention :bool =True ,
79
+ ring_attention_chunk_size :int =4096 ,
80
+
81
+
82
+ tie_word_embeddings :bool =True ,
83
+
84
+
85
+ use_moe :bool =True ,
86
+ num_experts :int =8 ,
87
+ num_experts_per_tok :int =2 ,
88
+ moe_layer_freq :int =2 ,
89
+ use_shared_expert :bool =True ,
90
+ moe_capacity_factor :float =1.25 ,
91
+ use_aux_lossless :bool =True ,
92
+
93
+
94
+ vision_model_name :str ="google/siglip-so400m-patch14-384",
95
+ freeze_vision :bool =False ,
96
+ num_vision_tokens :int =64 ,
97
+ projector_type :str ="perceiver",
98
+
99
+
100
+ use_vision_dual_stream :bool =True ,
101
+ use_vision_titok :bool =True ,
102
+ num_vision_titok_tokens :int =256 ,
103
+ num_vision_dual_stream_layers :int =2 ,
104
+
105
+
106
+ use_video_3d_rope :bool =True ,
107
+ use_video_temporal_moe :bool =True ,
108
+ num_video_encoder_layers :int =4 ,
109
+ num_video_experts :int =4 ,
110
+ use_video_vidtok :bool =True ,
111
+ vidtok_latent_channels :int =4 ,
112
+ vidtok_temporal_compression :int =4 ,
113
+ vidtok_spatial_compression :int =8 ,
114
+ vidtok_causal :bool =True ,
115
+ vidtok_use_fsq :bool =False ,
116
+
117
+
118
+ use_video_titok :bool =True ,
119
+ num_video_titok_tokens :int =64 ,
120
+ num_video_titok_layers :int =2 ,
121
+ num_video_titok_heads :int =8 ,
122
+ video_titok_dropout :float =0.1 ,
123
+
124
+
125
+ use_multi_scale :bool =True ,
126
+ use_continuous_scale :bool =True ,
127
+ image_min_size :int =128 ,
128
+ image_max_size :int =384 ,
129
+ image_base_size :int =256 ,
130
+ image_size_step :int =32 ,
131
+ video_min_size :int =128 ,
132
+ video_max_size :int =320 ,
133
+ video_base_size :int =192 ,
134
+ video_size_step :int =32 ,
135
+ video_min_frames :int =8 ,
136
+ video_max_frames :int =24 ,
137
+ video_base_frames :int =16 ,
138
+ video_frame_step :int =4 ,
139
+ multi_scale_strategy :str ="adaptive",
140
+ multi_scale_warmup_epochs :int =3 ,
141
+ adaptive_scale_oom_penalty :float =0.5 ,
142
+ adaptive_scale_success_boost :float =0.1 ,
143
+ generation_supported_sizes :Union [List [int ],Tuple [int ,...]]=(192 ,256 ,320 ,384 ),
144
+ generation_supported_frames :Union [List [int ],Tuple [int ,...]]=(8 ,12 ,16 ,20 ,24 ),
145
+
146
+
147
+ enable_generation :bool =True ,
148
+ generation_latent_channels :int =4 ,
149
+ generation_base_channels :int =128 ,
150
+ generation_inference_steps :int =50 ,
151
+ generation_cfg_scale :float =7.5 ,
152
+ generation_use_flow_matching :bool =True ,
153
+ generation_num_experts :int =4 ,
154
+ generation_use_dual_stream :bool =True ,
155
+
156
+
157
+ generation_video_cfg_scale :float =7.5 ,
158
+ generation_video_use_flow_matching :bool =True ,
159
+ generation_video_num_experts :int =4 ,
160
+ generation_video_use_3d_rope :bool =True ,
161
+ generation_video_use_temporal_moe :bool =True ,
162
+
163
+
164
+ audio_sample_rate :int =16000 ,
165
+ audio_n_mels :int =80 ,
166
+ audio_max_length :int =625 ,
167
+ audio_max_waveform_samples :int =160000 ,
168
+ audio_num_speakers :int =256 ,
169
+ use_raw_waveform :bool =True ,
170
+ audio_kv_lora_rank :int =256 ,
171
+ audio_speaker_embed_dim :int =256 ,
172
+ use_mas :bool =True ,
173
+ use_in_context_audio_prompting :bool =True ,
174
+
175
+
176
+ tokenizer_name :str ="Qwen/Qwen2.5-1.5B",
177
+
178
+
179
+ use_lora :bool =True ,
180
+ lora_r :int =32 ,
181
+ lora_alpha :int =64 ,
182
+ lora_dropout :float =0.05 ,
183
+ lora_target_modules :Union [List [str ],Tuple [str ,...]]=(
184
+ 'q_proj','k_proj','v_proj','o_proj',
185
+ 'gate_proj','up_proj','down_proj',
186
+ ),
187
+ train_lora_only :bool =False ,
188
+ use_rslora :bool =True ,
189
+ use_dora :bool =False ,
190
+ lora_plus_lr_ratio :float =4.0 ,
191
+
192
+
193
+ use_cross_attention :bool =True ,
194
+ cross_attention_layers :int =4 ,
195
+ cross_attention_heads :int =8 ,
196
+ cross_attention_dropout :float =0.1 ,
197
+
198
+
199
+ use_flash_attention :bool =True ,
200
+
201
+
202
+ has_audio_encoder :bool =True ,
203
+ has_audio_decoder :bool =True ,
204
+ has_waveform_decoder :bool =True ,
205
+ has_vision_encoder :bool =True ,
206
+ has_video_encoder :bool =True ,
207
+ has_generator :bool =True ,
208
+ has_video_generator :bool =True ,
209
+ has_cross_attention :bool =True ,
210
+ lora_applied :bool =False ,
211
+ architecture_version :int =2 ,
212
+
213
+
214
+ output_dir :str ="./xoron-model",
215
+
216
+
217
+ modality_dropout_prob :float =0.0 ,
218
+
219
+ **kwargs ,
220
  ):
221
+
222
+ super ().__init__ (**kwargs )
223
+
224
+
225
+ self .modality_dropout_prob =modality_dropout_prob
226
+
227
+
228
+ self .model_name =model_name
229
+
230
+
231
+ self .hidden_size =hidden_size
232
+ self .num_layers =num_layers
233
+ self .num_heads =num_heads
234
+ self .intermediate_size =intermediate_size
235
+ self .vocab_size =vocab_size
236
+ self .max_position_embeddings =max_position_embeddings
237
+ self .rms_norm_eps =rms_norm_eps
238
+
239
+
240
+ self .use_ring_attention =use_ring_attention
241
+ self .ring_attention_chunk_size =ring_attention_chunk_size
242
+
243
+
244
+ self .tie_word_embeddings =tie_word_embeddings
245
+
246
+
247
+ self .use_moe =use_moe
248
+ self .num_experts =num_experts
249
+ self .num_experts_per_tok =num_experts_per_tok
250
+ self .moe_layer_freq =moe_layer_freq
251
+ self .use_shared_expert =use_shared_expert
252
+ self .moe_capacity_factor =moe_capacity_factor
253
+ self .use_aux_lossless =use_aux_lossless
254
+
255
+
256
+ self .vision_model_name =vision_model_name
257
+ self .freeze_vision =freeze_vision
258
+ self .num_vision_tokens =num_vision_tokens
259
+ self .projector_type =projector_type
260
+
261
+
262
+ self .use_vision_dual_stream =use_vision_dual_stream
263
+ self .use_vision_titok =use_vision_titok
264
+ self .num_vision_titok_tokens =num_vision_titok_tokens
265
+ self .num_vision_dual_stream_layers =num_vision_dual_stream_layers
266
+
267
+
268
+ self .use_video_3d_rope =use_video_3d_rope
269
+ self .use_video_temporal_moe =use_video_temporal_moe
270
+ self .num_video_encoder_layers =num_video_encoder_layers
271
+ self .num_video_experts =num_video_experts
272
+ self .use_video_vidtok =use_video_vidtok
273
+ self .vidtok_latent_channels =vidtok_latent_channels
274
+ self .vidtok_temporal_compression =vidtok_temporal_compression
275
+ self .vidtok_spatial_compression =vidtok_spatial_compression
276
+ self .vidtok_causal =vidtok_causal
277
+ self .vidtok_use_fsq =vidtok_use_fsq
278
+
279
+
280
+ self .use_video_titok =use_video_titok
281
+ self .num_video_titok_tokens =num_video_titok_tokens
282
+ self .num_video_titok_layers =num_video_titok_layers
283
+ self .num_video_titok_heads =num_video_titok_heads
284
+ self .video_titok_dropout =video_titok_dropout
285
+
286
+
287
+ self .use_multi_scale =use_multi_scale
288
+ self .use_continuous_scale =use_continuous_scale
289
+ self .image_min_size =image_min_size
290
+ self .image_max_size =image_max_size
291
+ self .image_base_size =image_base_size
292
+ self .image_size_step =image_size_step
293
+ self .video_min_size =video_min_size
294
+ self .video_max_size =video_max_size
295
+ self .video_base_size =video_base_size
296
+ self .video_size_step =video_size_step
297
+ self .video_min_frames =video_min_frames
298
+ self .video_max_frames =video_max_frames
299
+ self .video_base_frames =video_base_frames
300
+ self .video_frame_step =video_frame_step
301
+ self .multi_scale_strategy =multi_scale_strategy
302
+ self .multi_scale_warmup_epochs =multi_scale_warmup_epochs
303
+ self .adaptive_scale_oom_penalty =adaptive_scale_oom_penalty
304
+ self .adaptive_scale_success_boost =adaptive_scale_success_boost
305
+ self .generation_supported_sizes =list (generation_supported_sizes )if not isinstance (generation_supported_sizes ,list )else generation_supported_sizes
306
+ self .generation_supported_frames =list (generation_supported_frames )if not isinstance (generation_supported_frames ,list )else generation_supported_frames
307
+
308
+
309
+ self .enable_generation =enable_generation
310
+ self .generation_latent_channels =generation_latent_channels
311
+ self .generation_base_channels =generation_base_channels
312
+ self .generation_inference_steps =generation_inference_steps
313
+ self .generation_cfg_scale =generation_cfg_scale
314
+ self .generation_use_flow_matching =generation_use_flow_matching
315
+ self .generation_num_experts =generation_num_experts
316
+ self .generation_use_dual_stream =generation_use_dual_stream
317
+
318
+
319
+ self .generation_video_cfg_scale =generation_video_cfg_scale
320
+ self .generation_video_use_flow_matching =generation_video_use_flow_matching
321
+ self .generation_video_num_experts =generation_video_num_experts
322
+ self .generation_video_use_3d_rope =generation_video_use_3d_rope
323
+ self .generation_video_use_temporal_moe =generation_video_use_temporal_moe
324
+
325
+
326
+ self .audio_sample_rate =audio_sample_rate
327
+ self .audio_n_mels =audio_n_mels
328
+ self .audio_max_length =audio_max_length
329
+ self .audio_max_waveform_samples =audio_max_waveform_samples
330
+ self .audio_num_speakers =audio_num_speakers
331
+ self .use_raw_waveform =use_raw_waveform
332
+ self .audio_kv_lora_rank =audio_kv_lora_rank
333
+ self .audio_speaker_embed_dim =audio_speaker_embed_dim
334
+ self .use_mas =use_mas
335
+ self .use_in_context_audio_prompting =use_in_context_audio_prompting
336
+
337
+
338
+ self .tokenizer_name =tokenizer_name
339
+
340
+
341
+ self .use_lora =use_lora
342
+ self .lora_r =lora_r
343
+ self .lora_alpha =lora_alpha
344
+ self .lora_dropout =lora_dropout
345
+ self .lora_target_modules =list (lora_target_modules )if not isinstance (lora_target_modules ,list )else lora_target_modules
346
+ self .train_lora_only =train_lora_only
347
+ self .use_rslora =use_rslora
348
+ self .use_dora =use_dora
349
+ self .lora_plus_lr_ratio =lora_plus_lr_ratio
350
+
351
+
352
+ self .use_cross_attention =use_cross_attention
353
+ self .cross_attention_layers =cross_attention_layers
354
+ self .cross_attention_heads =cross_attention_heads
355
+ self .cross_attention_dropout =cross_attention_dropout
356
+
357
+
358
+ self .use_flash_attention =use_flash_attention
359
+
360
+
361
+ self .has_audio_encoder =has_audio_encoder
362
+ self .has_audio_decoder =has_audio_decoder
363
+ self .has_waveform_decoder =has_waveform_decoder
364
+ self .has_vision_encoder =has_vision_encoder
365
+ self .has_video_encoder =has_video_encoder
366
+ self .has_generator =has_generator
367
+ self .has_video_generator =has_video_generator
368
+ self .has_cross_attention =has_cross_attention
369
+ self .lora_applied =lora_applied
370
+ self .architecture_version =architecture_version
371
+
372
+
373
+ self .output_dir =output_dir
374
+
375
+ @classmethod
376
+ def from_pretrained (cls ,pretrained_model_name_or_path :str ,**kwargs ):
377
+ """
378
+ SOTA: Load config from directory, filtering out keys that don't match XoronConfig.
379
+ This enables loading configs from newer/different versions gracefully.
380
+ """
381
+ import json
382
+ import os
383
+
384
+
385
+ if os .path .isdir (pretrained_model_name_or_path ):
386
+ config_path =os .path .join (pretrained_model_name_or_path ,"config.json")
387
+ else :
388
+ config_path =pretrained_model_name_or_path
389
+
390
+ if os .path .exists (config_path ):
391
+ with open (config_path ,"r")as f :
392
+ config_dict =json .load (f )
393
+
394
+
395
+ import inspect
396
+ sig =inspect .signature (cls .__init__ )
397
+ valid_keys =set (sig .parameters .keys ())
398
+
399
+ filtered_config ={k :v for k ,v in config_dict .items ()if k in valid_keys }
400
+
401
+
402
+ filtered_config .update (kwargs )
403
+ return cls (**filtered_config )
404
+
405
+
406
+ try :
407
+ return super ().from_pretrained (pretrained_model_name_or_path ,**kwargs )
408
+ except Exception :
409
+
410
+ return cls (**kwargs )
cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6beff1e6cfb37ea461f112bf9d138ca007c01e24ac716b997a92000813aa8de5
3
  size 174191400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01f2c0d12d9a882e71b37a268aa426e99b03cbc1372629eb283a28a10d05d5c6
3
  size 174191400
llm.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b168f1e28965acb01ab0375c81614f3af6cd312b27c630633ce21c555d8ab3b5
3
  size 1506832040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcb8a06e9fc3b7ab14df9d1e54eea7fe4732a0ece031f4af02f4bed76416c620
3
  size 1506832040
modeling_xoron.py CHANGED
The diff for this file is too large to render. See raw diff
 
streaming_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "epoch": 85,
3
  "unique_samples": 400,
4
  "total_yields": 800,
5
  "dataset_positions": {
@@ -87,14 +87,14 @@
87
  "Pexels-I2V-350k": 650,
88
  "SmolTalk-OpenHermes": 250,
89
  "SmolTalk-All": 250,
90
- "Cosmopedia-AutoMath": 250,
91
- "OpenMathInstruct-1": 250,
92
- "NuminaMath-CoT": 250,
93
- "UltraData-Math-Conv": 250,
94
- "Cosmopedia-KhanAcademy": 250,
95
- "NuminaMath-TIR": 250,
96
- "UltraData-Math-QA": 250,
97
- "Cosmopedia-OpenStax": 250,
98
  "MedMCQA": 350,
99
  "Medical-Reasoning-SFT-Mega": 350,
100
  "Medical-O1-Reasoning-EN": 350
@@ -135,14 +135,14 @@
135
  "OpenAssistant": 450,
136
  "SmolTalk-OpenHermes": 250,
137
  "SmolTalk-All": 250,
138
- "Cosmopedia-AutoMath": 250,
139
- "OpenMathInstruct-1": 250,
140
- "NuminaMath-CoT": 250,
141
- "UltraData-Math-Conv": 250,
142
- "Cosmopedia-KhanAcademy": 250,
143
- "NuminaMath-TIR": 250,
144
- "UltraData-Math-QA": 250,
145
- "Cosmopedia-OpenStax": 250,
146
  "MedMCQA": 350,
147
  "Medical-Reasoning-SFT-Mega": 350,
148
  "Medical-O1-Reasoning-EN": 350
@@ -170,9 +170,9 @@
170
  "audio": {}
171
  },
172
  "modality_counts": {
173
- "text": 0,
174
  "image": 0,
175
- "video": 250,
176
  "audio": 0
177
  },
178
  "last_modality": null
 
1
  {
2
+ "epoch": 98,
3
  "unique_samples": 400,
4
  "total_yields": 800,
5
  "dataset_positions": {
 
87
  "Pexels-I2V-350k": 650,
88
  "SmolTalk-OpenHermes": 250,
89
  "SmolTalk-All": 250,
90
+ "Cosmopedia-AutoMath": 600,
91
+ "OpenMathInstruct-1": 600,
92
+ "NuminaMath-CoT": 600,
93
+ "UltraData-Math-Conv": 600,
94
+ "Cosmopedia-KhanAcademy": 600,
95
+ "NuminaMath-TIR": 600,
96
+ "UltraData-Math-QA": 600,
97
+ "Cosmopedia-OpenStax": 600,
98
  "MedMCQA": 350,
99
  "Medical-Reasoning-SFT-Mega": 350,
100
  "Medical-O1-Reasoning-EN": 350
 
135
  "OpenAssistant": 450,
136
  "SmolTalk-OpenHermes": 250,
137
  "SmolTalk-All": 250,
138
+ "Cosmopedia-AutoMath": 600,
139
+ "OpenMathInstruct-1": 600,
140
+ "NuminaMath-CoT": 600,
141
+ "UltraData-Math-Conv": 600,
142
+ "Cosmopedia-KhanAcademy": 600,
143
+ "NuminaMath-TIR": 600,
144
+ "UltraData-Math-QA": 600,
145
+ "Cosmopedia-OpenStax": 600,
146
  "MedMCQA": 350,
147
  "Medical-Reasoning-SFT-Mega": 350,
148
  "Medical-O1-Reasoning-EN": 350
 
170
  "audio": {}
171
  },
172
  "modality_counts": {
173
+ "text": 400,
174
  "image": 0,
175
+ "video": 0,
176
  "audio": 0
177
  },
178
  "last_modality": null
trainer_state.json CHANGED
@@ -1,32 +1,32 @@
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
- "best_metric": 3.398919365755515,
4
- "epoch": 1,
5
- "epochs_completed": 1,
6
- "global_step": 31,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
- "max_steps": 31,
12
- "num_train_epochs": 1,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
15
  "effective_batch_size": 16,
16
  "learning_rate": 0.0001,
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
19
- "vision",
20
- "video",
21
  "llm",
22
  "cross_attention",
23
- "video_generation",
24
  "modality_markers"
25
  ],
26
  "frozen_components": [
 
 
27
  "audio",
28
  "speech",
29
- "image_generation"
 
30
  ],
31
  "trial_name": null,
32
  "trial_params": null
 
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
+ "best_metric": 5.354270628392697,
4
+ "epoch": 7,
5
+ "epochs_completed": 7,
6
+ "global_step": 350,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
+ "max_steps": 350,
12
+ "num_train_epochs": 7,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
15
  "effective_batch_size": 16,
16
  "learning_rate": 0.0001,
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
 
 
19
  "llm",
20
  "cross_attention",
 
21
  "modality_markers"
22
  ],
23
  "frozen_components": [
24
+ "vision",
25
+ "video",
26
  "audio",
27
  "speech",
28
+ "image_generation",
29
+ "video_generation"
30
  ],
31
  "trial_name": null,
32
  "trial_params": null
training_state.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b224a38701068628ea2346719232695d255cff3500d63df4b888e5a94eab7ab4
3
- size 3426643671
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148961e9ff451a61d54b8edf577bcc3dd080efec47d5d54d58a5870199de86a1
3
+ size 1514912171