Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
Update model weights after training (epoch 7, loss 5.3543)
Browse files- audio_decoder.safetensors +1 -1
- config.json +1 -1
- configuration_xoron.py +353 -316
- cross_attention.safetensors +1 -1
- llm.safetensors +1 -1
- modeling_xoron.py +0 -0
- streaming_state.json +19 -19
- trainer_state.json +10 -10
- training_state.pt +2 -2
audio_decoder.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1458410612
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1628ac8faba5b54287f88aae6cd7885ccfa61f06ea0a09b08f01e91463b96df6
|
| 3 |
size 1458410612
|
config.json
CHANGED
|
@@ -49,7 +49,7 @@
|
|
| 49 |
"image_size_step": 32,
|
| 50 |
"video_min_size": 128,
|
| 51 |
"video_max_size": 320,
|
| 52 |
-
"video_base_size":
|
| 53 |
"video_size_step": 32,
|
| 54 |
"video_min_frames": 8,
|
| 55 |
"video_max_frames": 8,
|
|
|
|
| 49 |
"image_size_step": 32,
|
| 50 |
"video_min_size": 128,
|
| 51 |
"video_max_size": 320,
|
| 52 |
+
"video_base_size": 320,
|
| 53 |
"video_size_step": 32,
|
| 54 |
"video_min_frames": 8,
|
| 55 |
"video_max_frames": 8,
|
configuration_xoron.py
CHANGED
|
@@ -12,11 +12,11 @@ Usage:
|
|
| 12 |
config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
|
| 13 |
"""
|
| 14 |
|
| 15 |
-
from transformers import PreTrainedConfig
|
| 16 |
-
from typing import List
|
| 17 |
|
| 18 |
|
| 19 |
-
class XoronConfig(PreTrainedConfig):
|
| 20 |
"""
|
| 21 |
Configuration class for Xoron-Dev multimodal model.
|
| 22 |
|
|
@@ -57,317 +57,354 @@ class XoronConfig(PreTrainedConfig):
|
|
| 57 |
- FP16-native numerical stability
|
| 58 |
- Multi-scale training for variable resolution handling
|
| 59 |
"""
|
| 60 |
-
|
| 61 |
-
model_type =
|
| 62 |
-
|
| 63 |
-
def __init__(
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
):
|
| 221 |
-
|
| 222 |
-
super().__init__(**kwargs)
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
self.modality_dropout_prob =
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
self.model_name =
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
self.hidden_size =
|
| 232 |
-
self.num_layers =
|
| 233 |
-
self.num_heads =
|
| 234 |
-
self.intermediate_size =
|
| 235 |
-
self.vocab_size =
|
| 236 |
-
self.max_position_embeddings =
|
| 237 |
-
self.rms_norm_eps =
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
self.use_ring_attention =
|
| 241 |
-
self.ring_attention_chunk_size =
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
self.tie_word_embeddings =
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
self.use_moe =
|
| 248 |
-
self.num_experts =
|
| 249 |
-
self.num_experts_per_tok =
|
| 250 |
-
self.moe_layer_freq =
|
| 251 |
-
self.use_shared_expert =
|
| 252 |
-
self.moe_capacity_factor =
|
| 253 |
-
self.use_aux_lossless =
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
self.vision_model_name =
|
| 257 |
-
self.freeze_vision =
|
| 258 |
-
self.num_vision_tokens =
|
| 259 |
-
self.projector_type =
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
self.use_vision_dual_stream =
|
| 263 |
-
self.use_vision_titok =
|
| 264 |
-
self.num_vision_titok_tokens =
|
| 265 |
-
self.num_vision_dual_stream_layers =
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
self.use_video_3d_rope =
|
| 269 |
-
self.use_video_temporal_moe =
|
| 270 |
-
self.num_video_encoder_layers =
|
| 271 |
-
self.num_video_experts =
|
| 272 |
-
self.use_video_vidtok =
|
| 273 |
-
self.vidtok_latent_channels =
|
| 274 |
-
self.vidtok_temporal_compression =
|
| 275 |
-
self.vidtok_spatial_compression =
|
| 276 |
-
self.vidtok_causal =
|
| 277 |
-
self.vidtok_use_fsq =
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
self.use_video_titok =
|
| 281 |
-
self.num_video_titok_tokens =
|
| 282 |
-
self.num_video_titok_layers =
|
| 283 |
-
self.num_video_titok_heads =
|
| 284 |
-
self.video_titok_dropout =
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
self.use_multi_scale =
|
| 288 |
-
self.use_continuous_scale =
|
| 289 |
-
self.image_min_size =
|
| 290 |
-
self.image_max_size =
|
| 291 |
-
self.image_base_size =
|
| 292 |
-
self.image_size_step =
|
| 293 |
-
self.video_min_size =
|
| 294 |
-
self.video_max_size =
|
| 295 |
-
self.video_base_size =
|
| 296 |
-
self.video_size_step =
|
| 297 |
-
self.video_min_frames =
|
| 298 |
-
self.video_max_frames =
|
| 299 |
-
self.video_base_frames =
|
| 300 |
-
self.video_frame_step =
|
| 301 |
-
self.multi_scale_strategy =
|
| 302 |
-
self.multi_scale_warmup_epochs =
|
| 303 |
-
self.adaptive_scale_oom_penalty =
|
| 304 |
-
self.adaptive_scale_success_boost =
|
| 305 |
-
self.generation_supported_sizes =
|
| 306 |
-
self.generation_supported_frames =
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
self.enable_generation =
|
| 310 |
-
self.generation_latent_channels =
|
| 311 |
-
self.generation_base_channels =
|
| 312 |
-
self.generation_inference_steps =
|
| 313 |
-
self.generation_cfg_scale =
|
| 314 |
-
self.generation_use_flow_matching =
|
| 315 |
-
self.generation_num_experts =
|
| 316 |
-
self.generation_use_dual_stream =
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
self.generation_video_cfg_scale =
|
| 320 |
-
self.generation_video_use_flow_matching =
|
| 321 |
-
self.generation_video_num_experts =
|
| 322 |
-
self.generation_video_use_3d_rope =
|
| 323 |
-
self.generation_video_use_temporal_moe =
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
self.audio_sample_rate =
|
| 327 |
-
self.audio_n_mels =
|
| 328 |
-
self.audio_max_length =
|
| 329 |
-
self.audio_max_waveform_samples =
|
| 330 |
-
self.audio_num_speakers =
|
| 331 |
-
self.use_raw_waveform =
|
| 332 |
-
self.audio_kv_lora_rank =
|
| 333 |
-
self.audio_speaker_embed_dim =
|
| 334 |
-
self.use_mas =
|
| 335 |
-
self.use_in_context_audio_prompting =
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
self.tokenizer_name =
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
self.use_lora =
|
| 342 |
-
self.lora_r =
|
| 343 |
-
self.lora_alpha =
|
| 344 |
-
self.lora_dropout =
|
| 345 |
-
self.lora_target_modules =
|
| 346 |
-
self.train_lora_only =
|
| 347 |
-
self.use_rslora =
|
| 348 |
-
self.use_dora =
|
| 349 |
-
self.lora_plus_lr_ratio =
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
self.use_cross_attention =
|
| 353 |
-
self.cross_attention_layers =
|
| 354 |
-
self.cross_attention_heads =
|
| 355 |
-
self.cross_attention_dropout =
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
self.use_flash_attention =
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
self.has_audio_encoder =
|
| 362 |
-
self.has_audio_decoder =
|
| 363 |
-
self.has_waveform_decoder =
|
| 364 |
-
self.has_vision_encoder =
|
| 365 |
-
self.has_video_encoder =
|
| 366 |
-
self.has_generator =
|
| 367 |
-
self.has_video_generator =
|
| 368 |
-
self.has_cross_attention =
|
| 369 |
-
self.lora_applied =
|
| 370 |
-
self.architecture_version =
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
self.output_dir =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
config = AutoConfig.from_pretrained("your-repo/xoron-model", trust_remote_code=True)
|
| 13 |
"""
|
| 14 |
|
| 15 |
+
from transformers import PreTrainedConfig
|
| 16 |
+
from typing import List ,Tuple ,Union
|
| 17 |
|
| 18 |
|
| 19 |
+
class XoronConfig (PreTrainedConfig ):
|
| 20 |
"""
|
| 21 |
Configuration class for Xoron-Dev multimodal model.
|
| 22 |
|
|
|
|
| 57 |
- FP16-native numerical stability
|
| 58 |
- Multi-scale training for variable resolution handling
|
| 59 |
"""
|
| 60 |
+
|
| 61 |
+
model_type ="xoron"
|
| 62 |
+
|
| 63 |
+
def __init__(
    self,
    # Model identity
    model_name: str = "Xoron-Dev-MultiMoE",
    # Core transformer dimensions
    hidden_size: int = 1024,
    num_layers: int = 12,
    num_heads: int = 16,
    intermediate_size: int = 2048,
    vocab_size: int = 151643,
    max_position_embeddings: int = 131072,
    rms_norm_eps: float = 1e-6,
    # Ring attention
    use_ring_attention: bool = True,
    ring_attention_chunk_size: int = 4096,
    # Embedding tying
    tie_word_embeddings: bool = True,
    # Mixture-of-experts
    use_moe: bool = True,
    num_experts: int = 8,
    num_experts_per_tok: int = 2,
    moe_layer_freq: int = 2,
    use_shared_expert: bool = True,
    moe_capacity_factor: float = 1.25,
    use_aux_lossless: bool = True,
    # Vision encoder
    vision_model_name: str = "google/siglip-so400m-patch14-384",
    freeze_vision: bool = False,
    num_vision_tokens: int = 64,
    projector_type: str = "perceiver",
    # Vision dual-stream / TiTok options
    use_vision_dual_stream: bool = True,
    use_vision_titok: bool = True,
    num_vision_titok_tokens: int = 256,
    num_vision_dual_stream_layers: int = 2,
    # Video encoder / VidTok options
    use_video_3d_rope: bool = True,
    use_video_temporal_moe: bool = True,
    num_video_encoder_layers: int = 4,
    num_video_experts: int = 4,
    use_video_vidtok: bool = True,
    vidtok_latent_channels: int = 4,
    vidtok_temporal_compression: int = 4,
    vidtok_spatial_compression: int = 8,
    vidtok_causal: bool = True,
    vidtok_use_fsq: bool = False,
    # Video TiTok options
    use_video_titok: bool = True,
    num_video_titok_tokens: int = 64,
    num_video_titok_layers: int = 2,
    num_video_titok_heads: int = 8,
    video_titok_dropout: float = 0.1,
    # Multi-scale options
    use_multi_scale: bool = True,
    use_continuous_scale: bool = True,
    image_min_size: int = 128,
    image_max_size: int = 384,
    image_base_size: int = 256,
    image_size_step: int = 32,
    video_min_size: int = 128,
    video_max_size: int = 320,
    video_base_size: int = 192,
    video_size_step: int = 32,
    video_min_frames: int = 8,
    video_max_frames: int = 24,
    video_base_frames: int = 16,
    video_frame_step: int = 4,
    multi_scale_strategy: str = "adaptive",
    multi_scale_warmup_epochs: int = 3,
    adaptive_scale_oom_penalty: float = 0.5,
    adaptive_scale_success_boost: float = 0.1,
    generation_supported_sizes: Union[List[int], Tuple[int, ...]] = (192, 256, 320, 384),
    generation_supported_frames: Union[List[int], Tuple[int, ...]] = (8, 12, 16, 20, 24),
    # Generation options
    enable_generation: bool = True,
    generation_latent_channels: int = 4,
    generation_base_channels: int = 128,
    generation_inference_steps: int = 50,
    generation_cfg_scale: float = 7.5,
    generation_use_flow_matching: bool = True,
    generation_num_experts: int = 4,
    generation_use_dual_stream: bool = True,
    # Video generation options
    generation_video_cfg_scale: float = 7.5,
    generation_video_use_flow_matching: bool = True,
    generation_video_num_experts: int = 4,
    generation_video_use_3d_rope: bool = True,
    generation_video_use_temporal_moe: bool = True,
    # Audio options
    audio_sample_rate: int = 16000,
    audio_n_mels: int = 80,
    audio_max_length: int = 625,
    audio_max_waveform_samples: int = 160000,
    audio_num_speakers: int = 256,
    use_raw_waveform: bool = True,
    audio_kv_lora_rank: int = 256,
    audio_speaker_embed_dim: int = 256,
    use_mas: bool = True,
    use_in_context_audio_prompting: bool = True,
    # Tokenizer
    tokenizer_name: str = "Qwen/Qwen2.5-1.5B",
    # LoRA options
    use_lora: bool = True,
    lora_r: int = 32,
    lora_alpha: int = 64,
    lora_dropout: float = 0.05,
    lora_target_modules: Union[List[str], Tuple[str, ...]] = (
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ),
    train_lora_only: bool = False,
    use_rslora: bool = True,
    use_dora: bool = False,
    lora_plus_lr_ratio: float = 4.0,
    # Cross-attention options
    use_cross_attention: bool = True,
    cross_attention_layers: int = 4,
    cross_attention_heads: int = 8,
    cross_attention_dropout: float = 0.1,
    # Flash attention
    use_flash_attention: bool = True,
    # Component presence flags and bookkeeping
    has_audio_encoder: bool = True,
    has_audio_decoder: bool = True,
    has_waveform_decoder: bool = True,
    has_vision_encoder: bool = True,
    has_video_encoder: bool = True,
    has_generator: bool = True,
    has_video_generator: bool = True,
    has_cross_attention: bool = True,
    lora_applied: bool = False,
    architecture_version: int = 2,
    # Output location
    output_dir: str = "./xoron-model",
    # Modality dropout
    modality_dropout_prob: float = 0.0,
    **kwargs,
):
    """
    Build the configuration from keyword arguments.

    ``kwargs`` are forwarded to the parent initializer first. Every named
    argument is then stored on the instance under the same attribute name.
    The three sequence-valued arguments (``generation_supported_sizes``,
    ``generation_supported_frames``, ``lora_target_modules``) are stored as
    lists: a value that is already a ``list`` is kept as-is, anything else
    is converted with ``list(...)``.
    """

    def _as_list(value):
        # Keep an existing list object untouched; copy other iterables
        # (e.g. the tuple defaults) into a new list.
        return value if isinstance(value, list) else list(value)

    super().__init__(**kwargs)

    # Modality dropout
    self.modality_dropout_prob = modality_dropout_prob

    # Model identity
    self.model_name = model_name

    # Core transformer dimensions
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.intermediate_size = intermediate_size
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.rms_norm_eps = rms_norm_eps

    # Ring attention
    self.use_ring_attention = use_ring_attention
    self.ring_attention_chunk_size = ring_attention_chunk_size

    # Embedding tying (assigned after the parent initializer has run)
    self.tie_word_embeddings = tie_word_embeddings

    # Mixture-of-experts
    self.use_moe = use_moe
    self.num_experts = num_experts
    self.num_experts_per_tok = num_experts_per_tok
    self.moe_layer_freq = moe_layer_freq
    self.use_shared_expert = use_shared_expert
    self.moe_capacity_factor = moe_capacity_factor
    self.use_aux_lossless = use_aux_lossless

    # Vision encoder
    self.vision_model_name = vision_model_name
    self.freeze_vision = freeze_vision
    self.num_vision_tokens = num_vision_tokens
    self.projector_type = projector_type

    # Vision dual-stream / TiTok options
    self.use_vision_dual_stream = use_vision_dual_stream
    self.use_vision_titok = use_vision_titok
    self.num_vision_titok_tokens = num_vision_titok_tokens
    self.num_vision_dual_stream_layers = num_vision_dual_stream_layers

    # Video encoder / VidTok options
    self.use_video_3d_rope = use_video_3d_rope
    self.use_video_temporal_moe = use_video_temporal_moe
    self.num_video_encoder_layers = num_video_encoder_layers
    self.num_video_experts = num_video_experts
    self.use_video_vidtok = use_video_vidtok
    self.vidtok_latent_channels = vidtok_latent_channels
    self.vidtok_temporal_compression = vidtok_temporal_compression
    self.vidtok_spatial_compression = vidtok_spatial_compression
    self.vidtok_causal = vidtok_causal
    self.vidtok_use_fsq = vidtok_use_fsq

    # Video TiTok options
    self.use_video_titok = use_video_titok
    self.num_video_titok_tokens = num_video_titok_tokens
    self.num_video_titok_layers = num_video_titok_layers
    self.num_video_titok_heads = num_video_titok_heads
    self.video_titok_dropout = video_titok_dropout

    # Multi-scale options
    self.use_multi_scale = use_multi_scale
    self.use_continuous_scale = use_continuous_scale
    self.image_min_size = image_min_size
    self.image_max_size = image_max_size
    self.image_base_size = image_base_size
    self.image_size_step = image_size_step
    self.video_min_size = video_min_size
    self.video_max_size = video_max_size
    self.video_base_size = video_base_size
    self.video_size_step = video_size_step
    self.video_min_frames = video_min_frames
    self.video_max_frames = video_max_frames
    self.video_base_frames = video_base_frames
    self.video_frame_step = video_frame_step
    self.multi_scale_strategy = multi_scale_strategy
    self.multi_scale_warmup_epochs = multi_scale_warmup_epochs
    self.adaptive_scale_oom_penalty = adaptive_scale_oom_penalty
    self.adaptive_scale_success_boost = adaptive_scale_success_boost
    self.generation_supported_sizes = _as_list(generation_supported_sizes)
    self.generation_supported_frames = _as_list(generation_supported_frames)

    # Generation options
    self.enable_generation = enable_generation
    self.generation_latent_channels = generation_latent_channels
    self.generation_base_channels = generation_base_channels
    self.generation_inference_steps = generation_inference_steps
    self.generation_cfg_scale = generation_cfg_scale
    self.generation_use_flow_matching = generation_use_flow_matching
    self.generation_num_experts = generation_num_experts
    self.generation_use_dual_stream = generation_use_dual_stream

    # Video generation options
    self.generation_video_cfg_scale = generation_video_cfg_scale
    self.generation_video_use_flow_matching = generation_video_use_flow_matching
    self.generation_video_num_experts = generation_video_num_experts
    self.generation_video_use_3d_rope = generation_video_use_3d_rope
    self.generation_video_use_temporal_moe = generation_video_use_temporal_moe

    # Audio options
    self.audio_sample_rate = audio_sample_rate
    self.audio_n_mels = audio_n_mels
    self.audio_max_length = audio_max_length
    self.audio_max_waveform_samples = audio_max_waveform_samples
    self.audio_num_speakers = audio_num_speakers
    self.use_raw_waveform = use_raw_waveform
    self.audio_kv_lora_rank = audio_kv_lora_rank
    self.audio_speaker_embed_dim = audio_speaker_embed_dim
    self.use_mas = use_mas
    self.use_in_context_audio_prompting = use_in_context_audio_prompting

    # Tokenizer
    self.tokenizer_name = tokenizer_name

    # LoRA options
    self.use_lora = use_lora
    self.lora_r = lora_r
    self.lora_alpha = lora_alpha
    self.lora_dropout = lora_dropout
    self.lora_target_modules = _as_list(lora_target_modules)
    self.train_lora_only = train_lora_only
    self.use_rslora = use_rslora
    self.use_dora = use_dora
    self.lora_plus_lr_ratio = lora_plus_lr_ratio

    # Cross-attention options
    self.use_cross_attention = use_cross_attention
    self.cross_attention_layers = cross_attention_layers
    self.cross_attention_heads = cross_attention_heads
    self.cross_attention_dropout = cross_attention_dropout

    # Flash attention
    self.use_flash_attention = use_flash_attention

    # Component presence flags and bookkeeping
    self.has_audio_encoder = has_audio_encoder
    self.has_audio_decoder = has_audio_decoder
    self.has_waveform_decoder = has_waveform_decoder
    self.has_vision_encoder = has_vision_encoder
    self.has_video_encoder = has_video_encoder
    self.has_generator = has_generator
    self.has_video_generator = has_video_generator
    self.has_cross_attention = has_cross_attention
    self.lora_applied = lora_applied
    self.architecture_version = architecture_version

    # Output location
    self.output_dir = output_dir
|
| 374 |
+
|
| 375 |
+
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
    """
    Load a config, ignoring JSON keys that ``__init__`` does not name.

    If ``pretrained_model_name_or_path`` is a directory, the file
    ``config.json`` inside it is used; otherwise the argument itself is
    treated as the path of a JSON file. When that file exists it is read,
    keys that are not named parameters of ``cls.__init__`` are dropped (so
    configs written by newer/different versions still load), ``kwargs`` are
    applied on top, and ``cls(**merged)`` is returned.

    When no such local file exists, loading is delegated to the parent
    class's ``from_pretrained``. If that raises, the failure is logged and a
    config built from ``kwargs`` alone is returned (best-effort fallback).

    Args:
        pretrained_model_name_or_path: Directory containing ``config.json``,
            path to a JSON file, or any identifier the parent loader accepts.
        **kwargs: Overrides applied on top of the file contents on the
            local-file path; forwarded unchanged to the parent loader
            otherwise.

    Returns:
        An instance of ``cls`` on the local-file and fallback paths;
        otherwise whatever the parent ``from_pretrained`` returns.
    """
    import inspect
    import json
    import logging
    import os

    if os.path.isdir(pretrained_model_name_or_path):
        config_path = os.path.join(pretrained_model_name_or_path, "config.json")
    else:
        config_path = pretrained_model_name_or_path

    if os.path.exists(config_path):
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)

        # Only real, named keyword parameters are valid config keys. The
        # first parameter (the instance) and the ``*args`` / ``**kwargs``
        # catch-alls are excluded: letting a JSON key such as "self" through
        # would make ``cls(**...)`` raise TypeError.
        init_params = list(inspect.signature(cls.__init__).parameters.values())[1:]
        valid_keys = {
            param.name
            for param in init_params
            if param.kind in (param.POSITIONAL_OR_KEYWORD, param.KEYWORD_ONLY)
        }

        filtered_config = {k: v for k, v in config_dict.items() if k in valid_keys}

        # Caller-supplied overrides win over values read from the file.
        filtered_config.update(kwargs)
        return cls(**filtered_config)

    try:
        return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
    except Exception as exc:
        # Deliberate best-effort fallback, but never silent: the returned
        # config is built from kwargs/defaults only, not from the requested
        # source, so the caller must be able to see why.
        logging.getLogger(__name__).warning(
            "Could not load config from %r (%s: %s); "
            "falling back to a config built from keyword arguments only.",
            pretrained_model_name_or_path,
            type(exc).__name__,
            exc,
        )
        return cls(**kwargs)
|
cross_attention.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 174191400
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01f2c0d12d9a882e71b37a268aa426e99b03cbc1372629eb283a28a10d05d5c6
|
| 3 |
size 174191400
|
llm.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1506832040
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dcb8a06e9fc3b7ab14df9d1e54eea7fe4732a0ece031f4af02f4bed76416c620
|
| 3 |
size 1506832040
|
modeling_xoron.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
streaming_state.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"epoch":
|
| 3 |
"unique_samples": 400,
|
| 4 |
"total_yields": 800,
|
| 5 |
"dataset_positions": {
|
|
@@ -87,14 +87,14 @@
|
|
| 87 |
"Pexels-I2V-350k": 650,
|
| 88 |
"SmolTalk-OpenHermes": 250,
|
| 89 |
"SmolTalk-All": 250,
|
| 90 |
-
"Cosmopedia-AutoMath":
|
| 91 |
-
"OpenMathInstruct-1":
|
| 92 |
-
"NuminaMath-CoT":
|
| 93 |
-
"UltraData-Math-Conv":
|
| 94 |
-
"Cosmopedia-KhanAcademy":
|
| 95 |
-
"NuminaMath-TIR":
|
| 96 |
-
"UltraData-Math-QA":
|
| 97 |
-
"Cosmopedia-OpenStax":
|
| 98 |
"MedMCQA": 350,
|
| 99 |
"Medical-Reasoning-SFT-Mega": 350,
|
| 100 |
"Medical-O1-Reasoning-EN": 350
|
|
@@ -135,14 +135,14 @@
|
|
| 135 |
"OpenAssistant": 450,
|
| 136 |
"SmolTalk-OpenHermes": 250,
|
| 137 |
"SmolTalk-All": 250,
|
| 138 |
-
"Cosmopedia-AutoMath":
|
| 139 |
-
"OpenMathInstruct-1":
|
| 140 |
-
"NuminaMath-CoT":
|
| 141 |
-
"UltraData-Math-Conv":
|
| 142 |
-
"Cosmopedia-KhanAcademy":
|
| 143 |
-
"NuminaMath-TIR":
|
| 144 |
-
"UltraData-Math-QA":
|
| 145 |
-
"Cosmopedia-OpenStax":
|
| 146 |
"MedMCQA": 350,
|
| 147 |
"Medical-Reasoning-SFT-Mega": 350,
|
| 148 |
"Medical-O1-Reasoning-EN": 350
|
|
@@ -170,9 +170,9 @@
|
|
| 170 |
"audio": {}
|
| 171 |
},
|
| 172 |
"modality_counts": {
|
| 173 |
-
"text":
|
| 174 |
"image": 0,
|
| 175 |
-
"video":
|
| 176 |
"audio": 0
|
| 177 |
},
|
| 178 |
"last_modality": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"epoch": 98,
|
| 3 |
"unique_samples": 400,
|
| 4 |
"total_yields": 800,
|
| 5 |
"dataset_positions": {
|
|
|
|
| 87 |
"Pexels-I2V-350k": 650,
|
| 88 |
"SmolTalk-OpenHermes": 250,
|
| 89 |
"SmolTalk-All": 250,
|
| 90 |
+
"Cosmopedia-AutoMath": 600,
|
| 91 |
+
"OpenMathInstruct-1": 600,
|
| 92 |
+
"NuminaMath-CoT": 600,
|
| 93 |
+
"UltraData-Math-Conv": 600,
|
| 94 |
+
"Cosmopedia-KhanAcademy": 600,
|
| 95 |
+
"NuminaMath-TIR": 600,
|
| 96 |
+
"UltraData-Math-QA": 600,
|
| 97 |
+
"Cosmopedia-OpenStax": 600,
|
| 98 |
"MedMCQA": 350,
|
| 99 |
"Medical-Reasoning-SFT-Mega": 350,
|
| 100 |
"Medical-O1-Reasoning-EN": 350
|
|
|
|
| 135 |
"OpenAssistant": 450,
|
| 136 |
"SmolTalk-OpenHermes": 250,
|
| 137 |
"SmolTalk-All": 250,
|
| 138 |
+
"Cosmopedia-AutoMath": 600,
|
| 139 |
+
"OpenMathInstruct-1": 600,
|
| 140 |
+
"NuminaMath-CoT": 600,
|
| 141 |
+
"UltraData-Math-Conv": 600,
|
| 142 |
+
"Cosmopedia-KhanAcademy": 600,
|
| 143 |
+
"NuminaMath-TIR": 600,
|
| 144 |
+
"UltraData-Math-QA": 600,
|
| 145 |
+
"Cosmopedia-OpenStax": 600,
|
| 146 |
"MedMCQA": 350,
|
| 147 |
"Medical-Reasoning-SFT-Mega": 350,
|
| 148 |
"Medical-O1-Reasoning-EN": 350
|
|
|
|
| 170 |
"audio": {}
|
| 171 |
},
|
| 172 |
"modality_counts": {
|
| 173 |
+
"text": 400,
|
| 174 |
"image": 0,
|
| 175 |
+
"video": 0,
|
| 176 |
"audio": 0
|
| 177 |
},
|
| 178 |
"last_modality": null
|
trainer_state.json
CHANGED
|
@@ -1,32 +1,32 @@
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
-
"best_metric":
|
| 4 |
-
"epoch":
|
| 5 |
-
"epochs_completed":
|
| 6 |
-
"global_step":
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
-
"max_steps":
|
| 12 |
-
"num_train_epochs":
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
| 15 |
"effective_batch_size": 16,
|
| 16 |
"learning_rate": 0.0001,
|
| 17 |
"max_grad_norm": 1.0,
|
| 18 |
"trainable_components": [
|
| 19 |
-
"vision",
|
| 20 |
-
"video",
|
| 21 |
"llm",
|
| 22 |
"cross_attention",
|
| 23 |
-
"video_generation",
|
| 24 |
"modality_markers"
|
| 25 |
],
|
| 26 |
"frozen_components": [
|
|
|
|
|
|
|
| 27 |
"audio",
|
| 28 |
"speech",
|
| 29 |
-
"image_generation"
|
|
|
|
| 30 |
],
|
| 31 |
"trial_name": null,
|
| 32 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_model_checkpoint": "/kaggle/working/xoron-final",
|
| 3 |
+
"best_metric": 5.354270628392697,
|
| 4 |
+
"epoch": 7,
|
| 5 |
+
"epochs_completed": 7,
|
| 6 |
+
"global_step": 350,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
| 9 |
"log_history": [],
|
| 10 |
"logging_steps": 50,
|
| 11 |
+
"max_steps": 350,
|
| 12 |
+
"num_train_epochs": 7,
|
| 13 |
"total_flos": 0,
|
| 14 |
"train_batch_size": 1,
|
| 15 |
"effective_batch_size": 16,
|
| 16 |
"learning_rate": 0.0001,
|
| 17 |
"max_grad_norm": 1.0,
|
| 18 |
"trainable_components": [
|
|
|
|
|
|
|
| 19 |
"llm",
|
| 20 |
"cross_attention",
|
|
|
|
| 21 |
"modality_markers"
|
| 22 |
],
|
| 23 |
"frozen_components": [
|
| 24 |
+
"vision",
|
| 25 |
+
"video",
|
| 26 |
"audio",
|
| 27 |
"speech",
|
| 28 |
+
"image_generation",
|
| 29 |
+
"video_generation"
|
| 30 |
],
|
| 31 |
"trial_name": null,
|
| 32 |
"trial_params": null
|
training_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:148961e9ff451a61d54b8edf577bcc3dd080efec47d5d54d58a5870199de86a1
|
| 3 |
+
size 1514912171
|