Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
{
  "best_model_checkpoint": "/kaggle/working/xoron-final",
  "best_metric": 5.891898287038009,
  "epoch": 4,
  "epochs_completed": 4,
  "global_step": 72,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [],
  "logging_steps": 50,
  "max_steps": 72,
  "num_train_epochs": 4,
  "total_flos": 0,
  "train_batch_size": 1,
  "effective_batch_size": 16,
  "learning_rate": 0.0001,
  "max_grad_norm": 1.0,
  "trainable_components": [
    "llm",
    "cross_attention",
    "modality_markers"
  ],
  "frozen_components": [
    "vision",
    "video",
    "audio",
    "speech",
    "image_generation",
    "video_generation"
  ],
  "trial_name": null,
  "trial_params": null
}