chatterbox-turbo / t3_turbo_v1.yaml
ollieollie's picture
Upload folder using huggingface_hub
3c40542 verified
# Flat configuration mapping; top-level keys appear in ASCII sort order.
# This run covers `align_f0` through `cudnn_deterministic`.
align_f0: false
align_loss_weight: 1.0
asc_loss_weight: 0.02
attention_mechanism: graves
augment_sr: false
# Explicit null: no base model is named.
base_model: null
bit_depth: 9
causal_convs: false
causal_decoder: false
clap_dims: 512
compat_dcnar_f0_std_cond: false
# NOTE(review): `!!python/tuple` is a Python-specific tag. Safe loaders reject
# it, so this file needs a loader that can construct Python tuples. Left as-is
# because consumers may rely on receiving a tuple rather than a list.
# The four values are successive powers of 3 (3^0 .. 3^3).
conv_stack_dilation: !!python/tuple
- 1
- 3
- 9
- 27
convbn_bias: false
cudnn_deterministic: false
# Settings sharing the `dcnar_` prefix.
# NOTE(review): the prefix is not expanded anywhere in this file; confirm which
# component reads these keys before changing them.
dcnar_1d_discrim: false
dcnar_aligner_kernel: 5
# Explicit null; how the consumer treats an unset aligner type is not visible here.
dcnar_aligner_type: null
dcnar_allow_trivial_speaker_table: true
dcnar_batch_size: 24
# `dcnar_conformer` is false, yet the `dcnar_conformer_attn_*` values below
# remain set (two of them explicitly null).
dcnar_conformer: false
dcnar_conformer_attn_chunk_size: null
dcnar_conformer_attn_dim_head: 64
dcnar_conformer_attn_ff_mult: 4
dcnar_conformer_attn_win_size: null
dcnar_conv_weight_grouping: 1
dcnar_df0_loss_weight: 0.5
dcnar_dim_lrg: 512
dcnar_dim_sml: 256
dcnar_dim_style: 32
dcnar_discrim_tanh: false
# Integer literal: loads as int 1, unlike float-valued weights such as
# `dcnar_mel_loss_weight: 10.0`.
dcnar_dtw_loss_weight: 1
dcnar_dur_loss_weight: 0.1
dcnar_dur_pred_scale: linear
dcnar_f0_cond_mel_decoding: false
dcnar_f0_cond_mel_decoding_teacher_forcing: true
dcnar_f0_loss_weight: 0.5
dcnar_gan_dims: 64
dcnar_global_style: true
dcnar_hard_gumbel_tones: false
dcnar_hubert_downsample: 1
# `dcnar_inpaint_vae` is false, yet its `dcnar_inpaint_vae_*` values remain set.
dcnar_inpaint_vae: false
dcnar_inpaint_vae_kld_loss_weight: 0
dcnar_inpaint_vae_latent_dim: 32
dcnar_inpaint_vae_warmup_steps: 5000
dcnar_inpaint_vae_weight_step_size: 0.0002
# The three `dcnar_local_*` flags are false while `dcnar_global_style` is true.
dcnar_local_f0: false
dcnar_local_intensity: false
dcnar_local_style: false
dcnar_lr: 0.0001
dcnar_mel_adv: false
dcnar_mel_loss_weight: 10.0
dcnar_mixed_sr_loss: false
dcnar_n_terminal_tones: 0
dcnar_ph_f0_loss_weight: 1.0
dcnar_ph_hubert_loss_weight: 1.0
dcnar_ph_intensity_loss_weight: 1.0
dcnar_pitch_adv: false
dcnar_prosody_adv: false
dcnar_prosody_stats_cond: false
# Six integer `dcnar_pstat_weight_*` values; `intensity_std` is 0.
dcnar_pstat_weight_f0_mean: 10
dcnar_pstat_weight_f0_std: 100
dcnar_pstat_weight_intensity_mean: 10
dcnar_pstat_weight_intensity_std: 0
dcnar_pstat_weight_phdur_mean: 1
dcnar_pstat_weight_phdur_std: 1
dcnar_reverb_label: false
dcnar_sampler: default
dcnar_sr_label: false
# Both terminal-tone weights are 0, and `dcnar_n_terminal_tones` is 0 as well.
dcnar_terminal_tone_usl_weight: 0
dcnar_terminal_tone_weight: 0
dcnar_upsampling: gaussian
dcnar_use_log_f0_frames: false
dcnar_use_toucan_utt_embs: false
# All boolean `dcnar_usl_*` flags are false; the two `*_dim` values stay set.
dcnar_usl_mfcc: false
dcnar_usl_mfcc_deltas: false
dcnar_usl_mfcc_dim: 12
dcnar_usl_mfcc_var_dec: false
dcnar_usl_slim: false
dcnar_usl_slim_dim: 16
dcnar_usl_with_f0: false
dcnar_utt_dur_loss_weight: 0
dcnar_vc_local_hubert: false
dcnar_vc_mode: nn
dcnar_vc_text_predict: false
dcnar_vuv_loss_weight: 0.5
# Settings sharing the `dcvoc_` prefix.
# NOTE(review): the prefix is not expanded in this file — presumably a vocoder
# component; confirm against the consumer.
# `dcvoc_causal` is false while a lookahead value is still provided.
dcvoc_causal: false
dcvoc_causal_lookahead: 3
dcvoc_channel_downsample_mode: interleave
dcvoc_convs_per_scale: 8
# Among the boolean `dcvoc_disc_*` flags, `mpwd` and `pdd` are true;
# `mrsd` and `phase_aug` are false.
dcvoc_disc_duplicates: 1
dcvoc_disc_mpwd: true
dcvoc_disc_mrsd: false
dcvoc_disc_pdd: true
dcvoc_disc_phase_aug: false
dcvoc_discriminator_bound: 1.01
dcvoc_groups_init: 8
dcvoc_halfres_conv: true
dcvoc_hidden_init: 1024
dcvoc_hop: 8
dcvoc_kernel: 7
dcvoc_mel_bneck: 256
dcvoc_smpwd_hidden_max: 1024
# Untagged block sequence of seven integers; loads as a plain list.
dcvoc_smpwd_periods:
- 2
- 3
- 5
- 7
- 9
- 11
- 13
dcvoc_upsample_method: linear
# Miscellaneous keys from `denoise` through `encoder_type`, followed by the
# `eval_*` settings.
denoise: false
# Exponent notation; the value equals 1e-9.
dfd_clip_stft: 1.0e-09
# Absolute filesystem path.
# NOTE(review): machine-specific — confirm it exists wherever this config is used.
dfd_ramdisk_path: /mnt/ramdisk
ema_coeff: 0.99995
emo_embedded_speaker_id: false
emotion_adv: false
enable_eos_bos_chars: true
encoder_type: voice_encoder
eval_crosslang: false
eval_langs: dataset
eval_max_ref_samples: 192
eval_max_repeats: 1
eval_max_runs: 10
eval_max_sentences: 192
# Explicit null: no name is set for this key.
eval_mbnet_name: null
# Relative path-like string; what it is resolved against is not visible here.
eval_models_dir: saved_models
eval_n_plots: 2
eval_n_wavs: 4
eval_reference: train
eval_syn_batch_size: 64
eval_text_source: default
eval_ve_name: universal/ve_v2
eval_voc_max_frames: 2000
# Explicit null, like `eval_mbnet_name`.
eval_voc_name: null
# Keys from `f0_mode` through the `gpt_*` settings.
f0_mode: praat
flatten_lstm_params: true
# fmax is exactly half of `sample_rate` (32000) set elsewhere in this file.
fmax: 16000
fmin: 0
frames_per_framegroup: 10
freeze_mel_head: false
gmvae_ema_lr: 0.0001
gmvae_latent_dim: 16
# Zero components, while `gmvae_latent_dim` and `gmvae_ema_lr` remain set.
gmvae_num_components: 0
gpt_masked_loss: false
gpt_prod_max_text: 200
gpt_speaker_ref_type: same_speaker
gpt_transformer_type: gpt2-medium
# `hifigan_channels` followed by the `hooli_*` / `hooligan_*` settings.
# NOTE(review): the `hooli` prefix is not expanded in this file; confirm the
# component that reads these keys.
hifigan_channels: 256
hooli_enc_dims: 256
hooli_filter_size: 257
hooli_inv_no_uv: false
# Both regularisation weights below are 0.
hooli_inv_pitch_diff_reg_weight: 0
hooli_inv_pitch_shift_reg_weight: 0
hooli_nfft: 16
hooli_osc_freq_cutoff: 0.15
hooli_safe_step: true
hooli_tv_fir: false
hooli_wn_dims: 64
hooligan_discriminators: univnet
hooligan_istft: true
# Keys from `hop_size` through `prosody_embed_size`.
# 320 equals the product of `upsample_factors` (5 * 8 * 8) defined elsewhere
# in this file.
hop_size: 320
input_pos_emb: handled_internally_by_backbone
# `is_lora` is false; the `lora_*` values further down are still present.
is_lora: false
language_embed_size: 16
legacy_gpt_hidden_size: 1024
lfcc_nfilts: 128
llama_config_name: Llama_520M
lora_alpha: 64
lora_dropout: 0.05
lora_r: 32
lossynet_bsize: 25
# Same value (1e-9) as `dfd_clip_stft`.
lossynet_clip_stft: 1.0e-09
lossynet_lr: 0.001
lossynet_n_out_classes: 2
lowest_sr: 8000
# `max_LR` / `min_LR` use an upper-case suffix, unlike the otherwise
# lower-case keys; the exact spelling matters for lookup.
max_LR: 0.001
max_conditioning_inputs: 2
max_decoder_frames: 2000
max_f0_freq: 600
max_speech_tokens: 604
max_text_tokens: 402
max_total_tokens: 8196
mel_pad_difference: 1
mel_power: 1.0
mel_type: db
min_LR: 1.0e-06
min_f0_freq: 75
mpbert_n_freeze: 0
# Explicit null: no tokenizer is named.
mpbert_tokenizer: null
mpbert_type: transformer
mu_law: true
n_cqcc_bins: 96
n_cqt_bins: 84
# `num_freq` (1025) equals n_fft / 2 + 1, and `win_size` is also 2048.
n_fft: 2048
# Same value as `legacy_gpt_hidden_size` (1024).
n_gpt_channels: 1024
n_reverbs: 256
n_spk_cond_samples: 2
n_state_per_symbol: 1
n_transformer_heads: 16
n_transformer_layers: 30
normalize_loudness: false
normalized_mels: true
num_ceps: 29
# NOTE(review): the key is spelled `num_diacritcs` (missing an "i"). Left
# unchanged because consumers look it up by this exact name — confirm before
# renaming.
num_diacritcs: 512
num_freq: 1025
num_heads: 4
num_mels: 256
num_style_tokens: 0
num_tones: 16
onehot_language: false
onehot_speaker: false
pf_word_boundaries: false
phonemizer_backend: espeak
preemphasis: 0.97
preemphasize_voc_target: false
prenet_type: original
project_conditioning: false
prosody_embed_size: 0
# Nested sequence holding one inner pair: [[1, -1]].
# The second inner item must be indented under the first; at column 0 it
# would parse as a second outer element and the value would load as [[1], -1].
# NOTE(review): the meaning of the pair is not defined in this file — confirm
# against the consumer.
r_schedule:
- - 1
  - -1
# Scalar settings sharing the `rvc_` prefix.
# NOTE(review): the prefix is not expanded in this file; confirm the component
# that reads these keys.
rvc_emb_channels: 768
rvc_enc_spk_input: false
rvc_f0_up: 0
rvc_f0_voc: true
# Same value as `rvc_emb_channels` (768).
rvc_filter_channels: 768
rvc_gin_channels: 256
rvc_hidden_channels: 192
rvc_inter_channels: 192
rvc_kernel_size: 3
rvc_mel_bins: 80
rvc_n_heads: 2
rvc_n_layers: 6
rvc_p_dropout: 0
# Quoted, so this loads as the string '1', not the integer 1.
rvc_resblock: '1'
# Three inner dilation lists: [[1, 3, 5], [1, 3, 5], [1, 3, 5]].
# Continuation items of each inner list must be indented under its first
# item; at column 0 they would become elements of the outer list and the value
# would load as [[1], 3, 5, [1], 3, 5, [1], 3, 5].
# Presumably one inner list per entry of `rvc_resblock_kernel_sizes` (which
# has three entries) — confirm against the consumer.
rvc_resblock_dilation_sizes:
- - 1
  - 3
  - 5
- - 1
  - 3
  - 5
- - 1
  - 3
  - 5
# Remaining `rvc_*` settings, starting with three kernel sizes.
rvc_resblock_kernel_sizes:
- 3
- 7
- 11
# Frame/sample pairs are consistent with 320 samples per frame:
# 370 * 320 = 118400 and 40 * 320 = 12800. 320 is also `hop_size` and the
# product of `rvc_upsample_rates`.
rvc_seg_enc_size_frames: 370
rvc_seg_enc_size_samples: 118400
rvc_seg_voc_size_frames: 40
rvc_seg_voc_size_samples: 12800
rvc_speaker_enc: table
rvc_speaker_enc_type: V1
# Explicit null: no value set.
rvc_speaker_pitch: null
# 513 = 1024 / 2 + 1, consistent with `rvc_stft_filter_len` below —
# presumably the one-sided STFT bin count; confirm.
rvc_spec_channels: 513
rvc_spk_embed_dim: 109
rvc_stft_filter_len: 1024
rvc_stft_win_len: 1024
rvc_train_kl_weight: 1.0
rvc_train_mel_weight: 45
rvc_upsample_initial_channel: 512
# Four kernel sizes; each is exactly twice the rate at the same position in
# `rvc_upsample_rates`.
rvc_upsample_kernel_sizes:
- 20
- 16
- 4
- 4
# Four rates whose product is 10 * 8 * 2 * 2 = 320.
rvc_upsample_rates:
- 10
- 8
- 2
- 2
rvc_use_f0: true
# Keys from `sample_rate` through `trim_silence`.
# `fmax` (16000) elsewhere in this file is exactly half of this value.
sample_rate: 32000
scheduler_max_total_steps: 200000
seed: 0
self_conditioning: false
separate_stopnet: false
singing_dim: 4
speaker_embed_size: 256
speech_cond_prompt_len: 250
speech_token_type: tortoise
# `start_speech_token` (6561) and `stop_speech_token` (6562) are the two
# highest ids below this size.
speech_tokens_dict_size: 6563
speed_scale: 0.1
start_speech_token: 6561
start_text_token: 255
stepwise_sigmoid_noise: 2.0
stft_magnitude_min: 0.0001
stop_speech_token: 6562
stop_text_token: 0
stop_threshold: 0.25
style_embed_size: 256
supports_cfg: false
# Relative path-like string; what it is resolved against is not visible here.
symbol_type: tortoise/data/gpt2_medium.json
syn_ar_f0_predict: true
syn_batch_frames: 16000
syn_batch_size: 32
syn_mel_scale: 1
syn_predict_f0: true
syn_sampler: binnedlength
syn_symmetric_mel: false
syn_train_max_frames: 700
syn_train_min_duration: 1
# `taco1_postnet` sorts ahead of the `taco_*` keys because '1' precedes '_'
# in ASCII.
taco1_postnet: true
taco_decoder_att_rnn_dim: 1024
taco_decoder_prenet_dim: 256
taco_decoder_rnn_dim: 1024
taco_disjoint_conditioning: true
taco_encoder_dim: 512
taco_grad_clip: 1
taco_loss_masking: true
taco_lr: 0.0001
taco_weight_decay: 1.0e-06
# Negative integer; units are not stated in this file.
target_loudness: -18
text_loss_weight: 0.1
# `none` is a plain string here, not YAML null (which would be `null` or `~`).
text_preproc: none
text_tokens_dict_size: 50276
ti_vocoder: false
toucan_utt_emb_dim: 704
trim_silence: true
# NOTE(review): `!!python/tuple` is a Python-specific tag. Safe loaders reject
# it, so this file needs a loader that can construct Python tuples. Left as-is
# because consumers may rely on receiving a tuple rather than a list.
# The product 5 * 8 * 8 = 320 equals `hop_size` elsewhere in this file.
upsample_factors: !!python/tuple
- 5
- 8
- 8
# Explicit null: no single upsample rate is set.
upsample_rate: null
upsamplenet_dropout: false
upsamplenet_lr: 1.0e-05
# Boolean `use_*` switches. Only `use_speech_codes_as_input` is true; every
# other flag in this run is false.
use_adv_speaker_classifier: false
use_clap_embeds: false
use_diacritic: false
use_emotion_table: false
use_lamb_optimizer: false
use_language_table: false
use_monotonic_alignment: false
use_mpbert: false
use_one_cycle_lr: false
use_perceiver_resampler: false
use_pf: false
use_ph_durations: false
use_singing_labels: false
use_snr_labels: false
use_speaker_table: false
use_speech_codes_as_input: true
use_sv2tts: false
use_tb: false
use_tone: false
use_tpgst: false
use_wandb: false
# Keys from `vad_algo` through `win_size`: the `vad_*`/`validate_*`, `vc_*`,
# `ve_*`, `voc_*` and `vocoder_*` groups plus a few trailing settings.
vad_algo: webrtc
vad_margin: 0.1
validate_sr: true
validate_wav_len: true
vc_mel2f0: false
vc_soft_gt_pitch: false
vc_soft_units: true
ve_final_relu: false
ve_hidden_size: 768
ve_lr: 0.0001
ve_min_samples: 20
ve_partial_frames: 128
ve_spk_batch_size: 128
ve_utt_batch_size: 10
voc_future_horizon: 11
# `voc_lvc` is false while `voc_lvc_dims` is still set.
voc_lvc: false
voc_lvc_dims: 8
voc_noise_fir: true
voc_subscale: 0
# Integer maximum (30) next to a float minimum (1.5); units are not stated here.
voc_train_max_duration: 30
voc_train_min_duration: 1.5
voc_voiced_logits_scale: 0
vocoder_bsize: 16
vocoder_fc_dims: 512
vocoder_hidden_size: 512
vocoder_input_length: 16000
vocoder_input_pad: 0
vocoder_lr: 0.0001
vocoder_mode: MOL
wandb_watch_model: false
# `vad_algo` above is `webrtc`; presumably this mode applies to that VAD — confirm.
webrtc_mode: 2
weight_init: false
# Same value as `n_fft` (2048) elsewhere in this file.
win_size: 2048