Backup-bdg committed on
Commit
39827f9
·
verified ·
1 Parent(s): 1fcfda0

Update model weights after training (epoch 7, loss 5.3543)

Browse files
audio_decoder.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d817de2ba9f31539807a8d57d1ad5441f33794329008e0a6b9e01764b831f909
3
  size 1458410612
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1628ac8faba5b54287f88aae6cd7885ccfa61f06ea0a09b08f01e91463b96df6
3
  size 1458410612
config.json CHANGED
@@ -49,7 +49,7 @@
49
  "image_size_step": 32,
50
  "video_min_size": 128,
51
  "video_max_size": 320,
52
- "video_base_size": 128,
53
  "video_size_step": 32,
54
  "video_min_frames": 8,
55
  "video_max_frames": 8,
 
49
  "image_size_step": 32,
50
  "video_min_size": 128,
51
  "video_max_size": 320,
52
+ "video_base_size": 320,
53
  "video_size_step": 32,
54
  "video_min_frames": 8,
55
  "video_max_frames": 8,
configuration_xoron.py CHANGED
@@ -12,11 +12,11 @@ Usage:
12
  config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
13
  """
14
 
15
- from transformers import PreTrainedConfig
16
- from typing import List, Tuple, Union
17
 
18
 
19
- class XoronConfig(PreTrainedConfig):
20
  """
21
  Configuration class for Xoron-Dev multimodal model.
22
 
@@ -57,317 +57,354 @@ class XoronConfig(PreTrainedConfig):
57
  - FP16-native numerical stability
58
  - Multi-scale training for variable resolution handling
59
  """
60
-
61
- model_type = "xoron"
62
-
63
- def __init__(
64
- self,
65
- # Model identification
66
- model_name: str = "Xoron-Dev-MultiMoE",
67
-
68
- # LLM Architecture
69
- hidden_size: int = 1024,
70
- num_layers: int = 12,
71
- num_heads: int = 16,
72
- intermediate_size: int = 2048,
73
- vocab_size: int = 151643,
74
- max_position_embeddings: int = 131072,
75
- rms_norm_eps: float = 1e-6,
76
-
77
- # Ring Attention
78
- use_ring_attention: bool = True,
79
- ring_attention_chunk_size: int = 4096,
80
-
81
- # Tie word embeddings
82
- tie_word_embeddings: bool = True,
83
-
84
- # MoE Configuration
85
- use_moe: bool = True,
86
- num_experts: int = 8,
87
- num_experts_per_tok: int = 2,
88
- moe_layer_freq: int = 2,
89
- use_shared_expert: bool = True,
90
- moe_capacity_factor: float = 1.25,
91
- use_aux_lossless: bool = True,
92
-
93
- # Vision Configuration
94
- vision_model_name: str = "google/siglip-so400m-patch14-384",
95
- freeze_vision: bool = False,
96
- num_vision_tokens: int = 64,
97
- projector_type: str = "perceiver",
98
-
99
- # Vision Encoder SOTA Features
100
- use_vision_dual_stream: bool = True,
101
- use_vision_titok: bool = True,
102
- num_vision_titok_tokens: int = 256,
103
- num_vision_dual_stream_layers: int = 2,
104
-
105
- # Video Encoder SOTA Features
106
- use_video_3d_rope: bool = True,
107
- use_video_temporal_moe: bool = True,
108
- num_video_encoder_layers: int = 4,
109
- num_video_experts: int = 4,
110
- use_video_vidtok: bool = True,
111
- vidtok_latent_channels: int = 4,
112
- vidtok_temporal_compression: int = 4,
113
- vidtok_spatial_compression: int = 8,
114
- vidtok_causal: bool = True,
115
- vidtok_use_fsq: bool = False,
116
-
117
- # VideoTiTokTokenizer Configuration (SOTA: TiTok-style 1D tokenization for video)
118
- use_video_titok: bool = True,
119
- num_video_titok_tokens: int = 64,
120
- num_video_titok_layers: int = 2,
121
- num_video_titok_heads: int = 8,
122
- video_titok_dropout: float = 0.1,
123
-
124
- # Continuous-Scale Training Configuration
125
- use_multi_scale: bool = True,
126
- use_continuous_scale: bool = True,
127
- image_min_size: int = 128,
128
- image_max_size: int = 384,
129
- image_base_size: int = 256,
130
- image_size_step: int = 32,
131
- video_min_size: int = 128,
132
- video_max_size: int = 320,
133
- video_base_size: int = 192,
134
- video_size_step: int = 32,
135
- video_min_frames: int = 8,
136
- video_max_frames: int = 24,
137
- video_base_frames: int = 16,
138
- video_frame_step: int = 4,
139
- multi_scale_strategy: str = "adaptive",
140
- multi_scale_warmup_epochs: int = 3,
141
- adaptive_scale_oom_penalty: float = 0.5,
142
- adaptive_scale_success_boost: float = 0.1,
143
- generation_supported_sizes: Union[List[int], Tuple[int, ...]] = (192, 256, 320, 384),
144
- generation_supported_frames: Union[List[int], Tuple[int, ...]] = (8, 12, 16, 20, 24),
145
-
146
- # Image Generation Configuration
147
- enable_generation: bool = True,
148
- generation_latent_channels: int = 4,
149
- generation_base_channels: int = 128,
150
- generation_inference_steps: int = 50,
151
- generation_cfg_scale: float = 7.5,
152
- generation_use_flow_matching: bool = True,
153
- generation_num_experts: int = 4,
154
- generation_use_dual_stream: bool = True,
155
-
156
- # Video Generation Configuration
157
- generation_video_cfg_scale: float = 7.5,
158
- generation_video_use_flow_matching: bool = True,
159
- generation_video_num_experts: int = 4,
160
- generation_video_use_3d_rope: bool = True,
161
- generation_video_use_temporal_moe: bool = True,
162
-
163
- # Audio Configuration
164
- audio_sample_rate: int = 16000,
165
- audio_n_mels: int = 80,
166
- audio_max_length: int = 625, # Max mel frames (10 seconds at 16kHz with hop=256)
167
- audio_max_waveform_samples: int = 160000, # Max raw waveform (10 seconds at 16kHz)
168
- audio_num_speakers: int = 256,
169
- use_raw_waveform: bool = True,
170
- audio_kv_lora_rank: int = 256,
171
- audio_speaker_embed_dim: int = 256,
172
- use_mas: bool = True,
173
- use_in_context_audio_prompting: bool = True,
174
-
175
- # Tokenizer Configuration
176
- tokenizer_name: str = "Qwen/Qwen2.5-1.5B",
177
-
178
- # LoRA Configuration
179
- use_lora: bool = True,
180
- lora_r: int = 32,
181
- lora_alpha: int = 64,
182
- lora_dropout: float = 0.05,
183
- lora_target_modules: Union[List[str], Tuple[str, ...]] = (
184
- 'q_proj', 'k_proj', 'v_proj', 'o_proj',
185
- 'gate_proj', 'up_proj', 'down_proj',
186
- ),
187
- train_lora_only: bool = False,
188
- use_rslora: bool = True,
189
- use_dora: bool = False,
190
- lora_plus_lr_ratio: float = 4.0,
191
-
192
- # Cross-Attention Configuration
193
- use_cross_attention: bool = True,
194
- cross_attention_layers: int = 4,
195
- cross_attention_heads: int = 8,
196
- cross_attention_dropout: float = 0.1,
197
-
198
- # Flash Attention Configuration
199
- use_flash_attention: bool = True,
200
-
201
- # Architecture flags (set during save to track what components exist)
202
- has_audio_encoder: bool = True,
203
- has_audio_decoder: bool = True,
204
- has_waveform_decoder: bool = True,
205
- has_vision_encoder: bool = True,
206
- has_video_encoder: bool = True,
207
- has_generator: bool = True,
208
- has_video_generator: bool = True,
209
- has_cross_attention: bool = True,
210
- lora_applied: bool = False,
211
- architecture_version: int = 2,
212
-
213
- # Output path (used during training)
214
- output_dir: str = "./xoron-model",
215
-
216
- # Training Configuration
217
- modality_dropout_prob: float = 0.0,
218
-
219
- **kwargs,
220
  ):
221
- # Call parent init
222
- super().__init__(**kwargs)
223
-
224
- # Training Configuration
225
- self.modality_dropout_prob = modality_dropout_prob
226
-
227
- # Model identification
228
- self.model_name = model_name
229
-
230
- # LLM Architecture
231
- self.hidden_size = hidden_size
232
- self.num_layers = num_layers
233
- self.num_heads = num_heads
234
- self.intermediate_size = intermediate_size
235
- self.vocab_size = vocab_size
236
- self.max_position_embeddings = max_position_embeddings
237
- self.rms_norm_eps = rms_norm_eps
238
-
239
- # Ring Attention
240
- self.use_ring_attention = use_ring_attention
241
- self.ring_attention_chunk_size = ring_attention_chunk_size
242
-
243
- # Tie word embeddings
244
- self.tie_word_embeddings = tie_word_embeddings
245
-
246
- # MoE Configuration
247
- self.use_moe = use_moe
248
- self.num_experts = num_experts
249
- self.num_experts_per_tok = num_experts_per_tok
250
- self.moe_layer_freq = moe_layer_freq
251
- self.use_shared_expert = use_shared_expert
252
- self.moe_capacity_factor = moe_capacity_factor
253
- self.use_aux_lossless = use_aux_lossless
254
-
255
- # Vision Configuration
256
- self.vision_model_name = vision_model_name
257
- self.freeze_vision = freeze_vision
258
- self.num_vision_tokens = num_vision_tokens
259
- self.projector_type = projector_type
260
-
261
- # Vision Encoder SOTA Features
262
- self.use_vision_dual_stream = use_vision_dual_stream
263
- self.use_vision_titok = use_vision_titok
264
- self.num_vision_titok_tokens = num_vision_titok_tokens
265
- self.num_vision_dual_stream_layers = num_vision_dual_stream_layers
266
-
267
- # Video Encoder SOTA Features
268
- self.use_video_3d_rope = use_video_3d_rope
269
- self.use_video_temporal_moe = use_video_temporal_moe
270
- self.num_video_encoder_layers = num_video_encoder_layers
271
- self.num_video_experts = num_video_experts
272
- self.use_video_vidtok = use_video_vidtok
273
- self.vidtok_latent_channels = vidtok_latent_channels
274
- self.vidtok_temporal_compression = vidtok_temporal_compression
275
- self.vidtok_spatial_compression = vidtok_spatial_compression
276
- self.vidtok_causal = vidtok_causal
277
- self.vidtok_use_fsq = vidtok_use_fsq
278
-
279
- # VideoTiTokTokenizer Configuration
280
- self.use_video_titok = use_video_titok
281
- self.num_video_titok_tokens = num_video_titok_tokens
282
- self.num_video_titok_layers = num_video_titok_layers
283
- self.num_video_titok_heads = num_video_titok_heads
284
- self.video_titok_dropout = video_titok_dropout
285
-
286
- # Continuous-Scale Training Configuration
287
- self.use_multi_scale = use_multi_scale
288
- self.use_continuous_scale = use_continuous_scale
289
- self.image_min_size = image_min_size
290
- self.image_max_size = image_max_size
291
- self.image_base_size = image_base_size
292
- self.image_size_step = image_size_step
293
- self.video_min_size = video_min_size
294
- self.video_max_size = video_max_size
295
- self.video_base_size = video_base_size
296
- self.video_size_step = video_size_step
297
- self.video_min_frames = video_min_frames
298
- self.video_max_frames = video_max_frames
299
- self.video_base_frames = video_base_frames
300
- self.video_frame_step = video_frame_step
301
- self.multi_scale_strategy = multi_scale_strategy
302
- self.multi_scale_warmup_epochs = multi_scale_warmup_epochs
303
- self.adaptive_scale_oom_penalty = adaptive_scale_oom_penalty
304
- self.adaptive_scale_success_boost = adaptive_scale_success_boost
305
- self.generation_supported_sizes = list(generation_supported_sizes) if not isinstance(generation_supported_sizes, list) else generation_supported_sizes
306
- self.generation_supported_frames = list(generation_supported_frames) if not isinstance(generation_supported_frames, list) else generation_supported_frames
307
-
308
- # Image Generation Configuration
309
- self.enable_generation = enable_generation
310
- self.generation_latent_channels = generation_latent_channels
311
- self.generation_base_channels = generation_base_channels
312
- self.generation_inference_steps = generation_inference_steps
313
- self.generation_cfg_scale = generation_cfg_scale
314
- self.generation_use_flow_matching = generation_use_flow_matching
315
- self.generation_num_experts = generation_num_experts
316
- self.generation_use_dual_stream = generation_use_dual_stream
317
-
318
- # Video Generation Configuration
319
- self.generation_video_cfg_scale = generation_video_cfg_scale
320
- self.generation_video_use_flow_matching = generation_video_use_flow_matching
321
- self.generation_video_num_experts = generation_video_num_experts
322
- self.generation_video_use_3d_rope = generation_video_use_3d_rope
323
- self.generation_video_use_temporal_moe = generation_video_use_temporal_moe
324
-
325
- # Audio Configuration
326
- self.audio_sample_rate = audio_sample_rate
327
- self.audio_n_mels = audio_n_mels
328
- self.audio_max_length = audio_max_length
329
- self.audio_max_waveform_samples = audio_max_waveform_samples
330
- self.audio_num_speakers = audio_num_speakers
331
- self.use_raw_waveform = use_raw_waveform
332
- self.audio_kv_lora_rank = audio_kv_lora_rank
333
- self.audio_speaker_embed_dim = audio_speaker_embed_dim
334
- self.use_mas = use_mas
335
- self.use_in_context_audio_prompting = use_in_context_audio_prompting
336
-
337
- # Tokenizer Configuration
338
- self.tokenizer_name = tokenizer_name
339
-
340
- # LoRA Configuration
341
- self.use_lora = use_lora
342
- self.lora_r = lora_r
343
- self.lora_alpha = lora_alpha
344
- self.lora_dropout = lora_dropout
345
- self.lora_target_modules = list(lora_target_modules) if not isinstance(lora_target_modules, list) else lora_target_modules
346
- self.train_lora_only = train_lora_only
347
- self.use_rslora = use_rslora
348
- self.use_dora = use_dora
349
- self.lora_plus_lr_ratio = lora_plus_lr_ratio
350
-
351
- # Cross-Attention Configuration
352
- self.use_cross_attention = use_cross_attention
353
- self.cross_attention_layers = cross_attention_layers
354
- self.cross_attention_heads = cross_attention_heads
355
- self.cross_attention_dropout = cross_attention_dropout
356
-
357
- # Flash Attention Configuration
358
- self.use_flash_attention = use_flash_attention
359
-
360
- # Architecture flags
361
- self.has_audio_encoder = has_audio_encoder
362
- self.has_audio_decoder = has_audio_decoder
363
- self.has_waveform_decoder = has_waveform_decoder
364
- self.has_vision_encoder = has_vision_encoder
365
- self.has_video_encoder = has_video_encoder
366
- self.has_generator = has_generator
367
- self.has_video_generator = has_video_generator
368
- self.has_cross_attention = has_cross_attention
369
- self.lora_applied = lora_applied
370
- self.architecture_version = architecture_version
371
-
372
- # Output path
373
- self.output_dir = output_dir
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
13
  """
14
 
15
+ from transformers import PreTrainedConfig
16
+ from typing import List ,Tuple ,Union
17
 
18
 
19
+ class XoronConfig (PreTrainedConfig ):
20
  """
21
  Configuration class for Xoron-Dev multimodal model.
22
 
 
57
  - FP16-native numerical stability
58
  - Multi-scale training for variable resolution handling
59
  """
60
+
61
+ model_type ="xoron"
62
+
63
+ def __init__ (
64
+ self ,
65
+
66
+ model_name :str ="Xoron-Dev-MultiMoE",
67
+
68
+
69
+ hidden_size :int =1024 ,
70
+ num_layers :int =12 ,
71
+ num_heads :int =16 ,
72
+ intermediate_size :int =2048 ,
73
+ vocab_size :int =151643 ,
74
+ max_position_embeddings :int =131072 ,
75
+ rms_norm_eps :float =1e-6 ,
76
+
77
+
78
+ use_ring_attention :bool =True ,
79
+ ring_attention_chunk_size :int =4096 ,
80
+
81
+
82
+ tie_word_embeddings :bool =True ,
83
+
84
+
85
+ use_moe :bool =True ,
86
+ num_experts :int =8 ,
87
+ num_experts_per_tok :int =2 ,
88
+ moe_layer_freq :int =2 ,
89
+ use_shared_expert :bool =True ,
90
+ moe_capacity_factor :float =1.25 ,
91
+ use_aux_lossless :bool =True ,
92
+
93
+
94
+ vision_model_name :str ="google/siglip-so400m-patch14-384",
95
+ freeze_vision :bool =False ,
96
+ num_vision_tokens :int =64 ,
97
+ projector_type :str ="perceiver",
98
+
99
+
100
+ use_vision_dual_stream :bool =True ,
101
+ use_vision_titok :bool =True ,
102
+ num_vision_titok_tokens :int =256 ,
103
+ num_vision_dual_stream_layers :int =2 ,
104
+
105
+
106
+ use_video_3d_rope :bool =True ,
107
+ use_video_temporal_moe :bool =True ,
108
+ num_video_encoder_layers :int =4 ,
109
+ num_video_experts :int =4 ,
110
+ use_video_vidtok :bool =True ,
111
+ vidtok_latent_channels :int =4 ,
112
+ vidtok_temporal_compression :int =4 ,
113
+ vidtok_spatial_compression :int =8 ,
114
+ vidtok_causal :bool =True ,
115
+ vidtok_use_fsq :bool =False ,
116
+
117
+
118
+ use_video_titok :bool =True ,
119
+ num_video_titok_tokens :int =64 ,
120
+ num_video_titok_layers :int =2 ,
121
+ num_video_titok_heads :int =8 ,
122
+ video_titok_dropout :float =0.1 ,
123
+
124
+
125
+ use_multi_scale :bool =True ,
126
+ use_continuous_scale :bool =True ,
127
+ image_min_size :int =128 ,
128
+ image_max_size :int =384 ,
129
+ image_base_size :int =256 ,
130
+ image_size_step :int =32 ,
131
+ video_min_size :int =128 ,
132
+ video_max_size :int =320 ,
133
+ video_base_size :int =192 ,
134
+ video_size_step :int =32 ,
135
+ video_min_frames :int =8 ,
136
+ video_max_frames :int =24 ,
137
+ video_base_frames :int =16 ,
138
+ video_frame_step :int =4 ,
139
+ multi_scale_strategy :str ="adaptive",
140
+ multi_scale_warmup_epochs :int =3 ,
141
+ adaptive_scale_oom_penalty :float =0.5 ,
142
+ adaptive_scale_success_boost :float =0.1 ,
143
+ generation_supported_sizes :Union [List [int ],Tuple [int ,...]]=(192 ,256 ,320 ,384 ),
144
+ generation_supported_frames :Union [List [int ],Tuple [int ,...]]=(8 ,12 ,16 ,20 ,24 ),
145
+
146
+
147
+ enable_generation :bool =True ,
148
+ generation_latent_channels :int =4 ,
149
+ generation_base_channels :int =128 ,
150
+ generation_inference_steps :int =50 ,
151
+ generation_cfg_scale :float =7.5 ,
152
+ generation_use_flow_matching :bool =True ,
153
+ generation_num_experts :int =4 ,
154
+ generation_use_dual_stream :bool =True ,
155
+
156
+
157
+ generation_video_cfg_scale :float =7.5 ,
158
+ generation_video_use_flow_matching :bool =True ,
159
+ generation_video_num_experts :int =4 ,
160
+ generation_video_use_3d_rope :bool =True ,
161
+ generation_video_use_temporal_moe :bool =True ,
162
+
163
+
164
+ audio_sample_rate :int =16000 ,
165
+ audio_n_mels :int =80 ,
166
+ audio_max_length :int =625 ,
167
+ audio_max_waveform_samples :int =160000 ,
168
+ audio_num_speakers :int =256 ,
169
+ use_raw_waveform :bool =True ,
170
+ audio_kv_lora_rank :int =256 ,
171
+ audio_speaker_embed_dim :int =256 ,
172
+ use_mas :bool =True ,
173
+ use_in_context_audio_prompting :bool =True ,
174
+
175
+
176
+ tokenizer_name :str ="Qwen/Qwen2.5-1.5B",
177
+
178
+
179
+ use_lora :bool =True ,
180
+ lora_r :int =32 ,
181
+ lora_alpha :int =64 ,
182
+ lora_dropout :float =0.05 ,
183
+ lora_target_modules :Union [List [str ],Tuple [str ,...]]=(
184
+ 'q_proj','k_proj','v_proj','o_proj',
185
+ 'gate_proj','up_proj','down_proj',
186
+ ),
187
+ train_lora_only :bool =False ,
188
+ use_rslora :bool =True ,
189
+ use_dora :bool =False ,
190
+ lora_plus_lr_ratio :float =4.0 ,
191
+
192
+
193
+ use_cross_attention :bool =True ,
194
+ cross_attention_layers :int =4 ,
195
+ cross_attention_heads :int =8 ,
196
+ cross_attention_dropout :float =0.1 ,
197
+
198
+
199
+ use_flash_attention :bool =True ,
200
+
201
+
202
+ has_audio_encoder :bool =True ,
203
+ has_audio_decoder :bool =True ,
204
+ has_waveform_decoder :bool =True ,
205
+ has_vision_encoder :bool =True ,
206
+ has_video_encoder :bool =True ,
207
+ has_generator :bool =True ,
208
+ has_video_generator :bool =True ,
209
+ has_cross_attention :bool =True ,
210
+ lora_applied :bool =False ,
211
+ architecture_version :int =2 ,
212
+
213
+
214
+ output_dir :str ="./xoron-model",
215
+
216
+
217
+ modality_dropout_prob :float =0.0 ,
218
+
219
+ **kwargs ,
220
  ):
221
+
222
+ super ().__init__ (**kwargs )
223
+
224
+
225
+ self .modality_dropout_prob =modality_dropout_prob
226
+
227
+
228
+ self .model_name =model_name
229
+
230
+
231
+ self .hidden_size =hidden_size
232
+ self .num_layers =num_layers
233
+ self .num_heads =num_heads
234
+ self .intermediate_size =intermediate_size
235
+ self .vocab_size =vocab_size
236
+ self .max_position_embeddings =max_position_embeddings
237
+ self .rms_norm_eps =rms_norm_eps
238
+
239
+
240
+ self .use_ring_attention =use_ring_attention
241
+ self .ring_attention_chunk_size =ring_attention_chunk_size
242
+
243
+
244
+ self .tie_word_embeddings =tie_word_embeddings
245
+
246
+
247
+ self .use_moe =use_moe
248
+ self .num_experts =num_experts
249
+ self .num_experts_per_tok =num_experts_per_tok
250
+ self .moe_layer_freq =moe_layer_freq
251
+ self .use_shared_expert =use_shared_expert
252
+ self .moe_capacity_factor =moe_capacity_factor
253
+ self .use_aux_lossless =use_aux_lossless
254
+
255
+
256
+ self .vision_model_name =vision_model_name
257
+ self .freeze_vision =freeze_vision
258
+ self .num_vision_tokens =num_vision_tokens
259
+ self .projector_type =projector_type
260
+
261
+
262
+ self .use_vision_dual_stream =use_vision_dual_stream
263
+ self .use_vision_titok =use_vision_titok
264
+ self .num_vision_titok_tokens =num_vision_titok_tokens
265
+ self .num_vision_dual_stream_layers =num_vision_dual_stream_layers
266
+
267
+
268
+ self .use_video_3d_rope =use_video_3d_rope
269
+ self .use_video_temporal_moe =use_video_temporal_moe
270
+ self .num_video_encoder_layers =num_video_encoder_layers
271
+ self .num_video_experts =num_video_experts
272
+ self .use_video_vidtok =use_video_vidtok
273
+ self .vidtok_latent_channels =vidtok_latent_channels
274
+ self .vidtok_temporal_compression =vidtok_temporal_compression
275
+ self .vidtok_spatial_compression =vidtok_spatial_compression
276
+ self .vidtok_causal =vidtok_causal
277
+ self .vidtok_use_fsq =vidtok_use_fsq
278
+
279
+
280
+ self .use_video_titok =use_video_titok
281
+ self .num_video_titok_tokens =num_video_titok_tokens
282
+ self .num_video_titok_layers =num_video_titok_layers
283
+ self .num_video_titok_heads =num_video_titok_heads
284
+ self .video_titok_dropout =video_titok_dropout
285
+
286
+
287
+ self .use_multi_scale =use_multi_scale
288
+ self .use_continuous_scale =use_continuous_scale
289
+ self .image_min_size =image_min_size
290
+ self .image_max_size =image_max_size
291
+ self .image_base_size =image_base_size
292
+ self .image_size_step =image_size_step
293
+ self .video_min_size =video_min_size
294
+ self .video_max_size =video_max_size
295
+ self .video_base_size =video_base_size
296
+ self .video_size_step =video_size_step
297
+ self .video_min_frames =video_min_frames
298
+ self .video_max_frames =video_max_frames
299
+ self .video_base_frames =video_base_frames
300
+ self .video_frame_step =video_frame_step
301
+ self .multi_scale_strategy =multi_scale_strategy
302
+ self .multi_scale_warmup_epochs =multi_scale_warmup_epochs
303
+ self .adaptive_scale_oom_penalty =adaptive_scale_oom_penalty
304
+ self .adaptive_scale_success_boost =adaptive_scale_success_boost
305
+ self .generation_supported_sizes =list (generation_supported_sizes )if not isinstance (generation_supported_sizes ,list )else generation_supported_sizes
306
+ self .generation_supported_frames =list (generation_supported_frames )if not isinstance (generation_supported_frames ,list )else generation_supported_frames
307
+
308
+
309
+ self .enable_generation =enable_generation
310
+ self .generation_latent_channels =generation_latent_channels
311
+ self .generation_base_channels =generation_base_channels
312
+ self .generation_inference_steps =generation_inference_steps
313
+ self .generation_cfg_scale =generation_cfg_scale
314
+ self .generation_use_flow_matching =generation_use_flow_matching
315
+ self .generation_num_experts =generation_num_experts
316
+ self .generation_use_dual_stream =generation_use_dual_stream
317
+
318
+
319
+ self .generation_video_cfg_scale =generation_video_cfg_scale
320
+ self .generation_video_use_flow_matching =generation_video_use_flow_matching
321
+ self .generation_video_num_experts =generation_video_num_experts
322
+ self .generation_video_use_3d_rope =generation_video_use_3d_rope
323
+ self .generation_video_use_temporal_moe =generation_video_use_temporal_moe
324
+
325
+
326
+ self .audio_sample_rate =audio_sample_rate
327
+ self .audio_n_mels =audio_n_mels
328
+ self .audio_max_length =audio_max_length
329
+ self .audio_max_waveform_samples =audio_max_waveform_samples
330
+ self .audio_num_speakers =audio_num_speakers
331
+ self .use_raw_waveform =use_raw_waveform
332
+ self .audio_kv_lora_rank =audio_kv_lora_rank
333
+ self .audio_speaker_embed_dim =audio_speaker_embed_dim
334
+ self .use_mas =use_mas
335
+ self .use_in_context_audio_prompting =use_in_context_audio_prompting
336
+
337
+
338
+ self .tokenizer_name =tokenizer_name
339
+
340
+
341
+ self .use_lora =use_lora
342
+ self .lora_r =lora_r
343
+ self .lora_alpha =lora_alpha
344
+ self .lora_dropout =lora_dropout
345
+ self .lora_target_modules =list (lora_target_modules )if not isinstance (lora_target_modules ,list )else lora_target_modules
346
+ self .train_lora_only =train_lora_only
347
+ self .use_rslora =use_rslora
348
+ self .use_dora =use_dora
349
+ self .lora_plus_lr_ratio =lora_plus_lr_ratio
350
+
351
+
352
+ self .use_cross_attention =use_cross_attention
353
+ self .cross_attention_layers =cross_attention_layers
354
+ self .cross_attention_heads =cross_attention_heads
355
+ self .cross_attention_dropout =cross_attention_dropout
356
+
357
+
358
+ self .use_flash_attention =use_flash_attention
359
+
360
+
361
+ self .has_audio_encoder =has_audio_encoder
362
+ self .has_audio_decoder =has_audio_decoder
363
+ self .has_waveform_decoder =has_waveform_decoder
364
+ self .has_vision_encoder =has_vision_encoder
365
+ self .has_video_encoder =has_video_encoder
366
+ self .has_generator =has_generator
367
+ self .has_video_generator =has_video_generator
368
+ self .has_cross_attention =has_cross_attention
369
+ self .lora_applied =lora_applied
370
+ self .architecture_version =architecture_version
371
+
372
+
373
+ self .output_dir =output_dir
374
+
375
+ @classmethod
376
+ def from_pretrained (cls ,pretrained_model_name_or_path :str ,**kwargs ):
377
+ """
378
+ SOTA: Load config from directory, filtering out keys that don't match XoronConfig.
379
+ This enables loading configs from newer/different versions gracefully.
380
+ """
381
+ import json
382
+ import os
383
+
384
+
385
+ if os .path .isdir (pretrained_model_name_or_path ):
386
+ config_path =os .path .join (pretrained_model_name_or_path ,"config.json")
387
+ else :
388
+ config_path =pretrained_model_name_or_path
389
+
390
+ if os .path .exists (config_path ):
391
+ with open (config_path ,"r")as f :
392
+ config_dict =json .load (f )
393
+
394
+
395
+ import inspect
396
+ sig =inspect .signature (cls .__init__ )
397
+ valid_keys =set (sig .parameters .keys ())
398
+
399
+ filtered_config ={k :v for k ,v in config_dict .items ()if k in valid_keys }
400
+
401
+
402
+ filtered_config .update (kwargs )
403
+ return cls (**filtered_config )
404
+
405
+
406
+ try :
407
+ return super ().from_pretrained (pretrained_model_name_or_path ,**kwargs )
408
+ except Exception :
409
+
410
+ return cls (**kwargs )
cross_attention.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6beff1e6cfb37ea461f112bf9d138ca007c01e24ac716b997a92000813aa8de5
3
  size 174191400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01f2c0d12d9a882e71b37a268aa426e99b03cbc1372629eb283a28a10d05d5c6
3
  size 174191400
llm.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b168f1e28965acb01ab0375c81614f3af6cd312b27c630633ce21c555d8ab3b5
3
  size 1506832040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcb8a06e9fc3b7ab14df9d1e54eea7fe4732a0ece031f4af02f4bed76416c620
3
  size 1506832040
modeling_xoron.py CHANGED
The diff for this file is too large to render. See raw diff
 
streaming_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "epoch": 85,
3
  "unique_samples": 400,
4
  "total_yields": 800,
5
  "dataset_positions": {
@@ -87,14 +87,14 @@
87
  "Pexels-I2V-350k": 650,
88
  "SmolTalk-OpenHermes": 250,
89
  "SmolTalk-All": 250,
90
- "Cosmopedia-AutoMath": 250,
91
- "OpenMathInstruct-1": 250,
92
- "NuminaMath-CoT": 250,
93
- "UltraData-Math-Conv": 250,
94
- "Cosmopedia-KhanAcademy": 250,
95
- "NuminaMath-TIR": 250,
96
- "UltraData-Math-QA": 250,
97
- "Cosmopedia-OpenStax": 250,
98
  "MedMCQA": 350,
99
  "Medical-Reasoning-SFT-Mega": 350,
100
  "Medical-O1-Reasoning-EN": 350
@@ -135,14 +135,14 @@
135
  "OpenAssistant": 450,
136
  "SmolTalk-OpenHermes": 250,
137
  "SmolTalk-All": 250,
138
- "Cosmopedia-AutoMath": 250,
139
- "OpenMathInstruct-1": 250,
140
- "NuminaMath-CoT": 250,
141
- "UltraData-Math-Conv": 250,
142
- "Cosmopedia-KhanAcademy": 250,
143
- "NuminaMath-TIR": 250,
144
- "UltraData-Math-QA": 250,
145
- "Cosmopedia-OpenStax": 250,
146
  "MedMCQA": 350,
147
  "Medical-Reasoning-SFT-Mega": 350,
148
  "Medical-O1-Reasoning-EN": 350
@@ -170,9 +170,9 @@
170
  "audio": {}
171
  },
172
  "modality_counts": {
173
- "text": 0,
174
  "image": 0,
175
- "video": 250,
176
  "audio": 0
177
  },
178
  "last_modality": null
 
1
  {
2
+ "epoch": 98,
3
  "unique_samples": 400,
4
  "total_yields": 800,
5
  "dataset_positions": {
 
87
  "Pexels-I2V-350k": 650,
88
  "SmolTalk-OpenHermes": 250,
89
  "SmolTalk-All": 250,
90
+ "Cosmopedia-AutoMath": 600,
91
+ "OpenMathInstruct-1": 600,
92
+ "NuminaMath-CoT": 600,
93
+ "UltraData-Math-Conv": 600,
94
+ "Cosmopedia-KhanAcademy": 600,
95
+ "NuminaMath-TIR": 600,
96
+ "UltraData-Math-QA": 600,
97
+ "Cosmopedia-OpenStax": 600,
98
  "MedMCQA": 350,
99
  "Medical-Reasoning-SFT-Mega": 350,
100
  "Medical-O1-Reasoning-EN": 350
 
135
  "OpenAssistant": 450,
136
  "SmolTalk-OpenHermes": 250,
137
  "SmolTalk-All": 250,
138
+ "Cosmopedia-AutoMath": 600,
139
+ "OpenMathInstruct-1": 600,
140
+ "NuminaMath-CoT": 600,
141
+ "UltraData-Math-Conv": 600,
142
+ "Cosmopedia-KhanAcademy": 600,
143
+ "NuminaMath-TIR": 600,
144
+ "UltraData-Math-QA": 600,
145
+ "Cosmopedia-OpenStax": 600,
146
  "MedMCQA": 350,
147
  "Medical-Reasoning-SFT-Mega": 350,
148
  "Medical-O1-Reasoning-EN": 350
 
170
  "audio": {}
171
  },
172
  "modality_counts": {
173
+ "text": 400,
174
  "image": 0,
175
+ "video": 0,
176
  "audio": 0
177
  },
178
  "last_modality": null
trainer_state.json CHANGED
@@ -1,32 +1,32 @@
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
- "best_metric": 3.398919365755515,
4
- "epoch": 1,
5
- "epochs_completed": 1,
6
- "global_step": 31,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
- "max_steps": 31,
12
- "num_train_epochs": 1,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
15
  "effective_batch_size": 16,
16
  "learning_rate": 0.0001,
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
19
- "vision",
20
- "video",
21
  "llm",
22
  "cross_attention",
23
- "video_generation",
24
  "modality_markers"
25
  ],
26
  "frozen_components": [
 
 
27
  "audio",
28
  "speech",
29
- "image_generation"
 
30
  ],
31
  "trial_name": null,
32
  "trial_params": null
 
1
  {
2
  "best_model_checkpoint": "/kaggle/working/xoron-final",
3
+ "best_metric": 5.354270628392697,
4
+ "epoch": 7,
5
+ "epochs_completed": 7,
6
+ "global_step": 350,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [],
10
  "logging_steps": 50,
11
+ "max_steps": 350,
12
+ "num_train_epochs": 7,
13
  "total_flos": 0,
14
  "train_batch_size": 1,
15
  "effective_batch_size": 16,
16
  "learning_rate": 0.0001,
17
  "max_grad_norm": 1.0,
18
  "trainable_components": [
 
 
19
  "llm",
20
  "cross_attention",
 
21
  "modality_markers"
22
  ],
23
  "frozen_components": [
24
+ "vision",
25
+ "video",
26
  "audio",
27
  "speech",
28
+ "image_generation",
29
+ "video_generation"
30
  ],
31
  "trial_name": null,
32
  "trial_params": null
training_state.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b224a38701068628ea2346719232695d255cff3500d63df4b888e5a94eab7ab4
3
- size 3426643671
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148961e9ff451a61d54b8edf577bcc3dd080efec47d5d54d58a5870199de86a1
3
+ size 1514912171