| align_f0: false | |
| align_loss_weight: 1.0 | |
| asc_loss_weight: 0.02 | |
| attention_mechanism: graves | |
| augment_sr: false | |
| base_model: null | |
| bit_depth: 9 | |
| causal_convs: false | |
| causal_decoder: false | |
| clap_dims: 512 | |
| compat_dcnar_f0_std_cond: false | |
| conv_stack_dilation: !!python/tuple | |
| - 1 | |
| - 3 | |
| - 9 | |
| - 27 | |
| convbn_bias: false | |
| cudnn_deterministic: false | |
| dcnar_1d_discrim: false | |
| dcnar_aligner_kernel: 5 | |
| dcnar_aligner_type: null | |
| dcnar_allow_trivial_speaker_table: true | |
| dcnar_batch_size: 24 | |
| dcnar_conformer: false | |
| dcnar_conformer_attn_chunk_size: null | |
| dcnar_conformer_attn_dim_head: 64 | |
| dcnar_conformer_attn_ff_mult: 4 | |
| dcnar_conformer_attn_win_size: null | |
| dcnar_conv_weight_grouping: 1 | |
| dcnar_df0_loss_weight: 0.5 | |
| dcnar_dim_lrg: 512 | |
| dcnar_dim_sml: 256 | |
| dcnar_dim_style: 32 | |
| dcnar_discrim_tanh: false | |
| dcnar_dtw_loss_weight: 1 | |
| dcnar_dur_loss_weight: 0.1 | |
| dcnar_dur_pred_scale: linear | |
| dcnar_f0_cond_mel_decoding: false | |
| dcnar_f0_cond_mel_decoding_teacher_forcing: true | |
| dcnar_f0_loss_weight: 0.5 | |
| dcnar_gan_dims: 64 | |
| dcnar_global_style: true | |
| dcnar_hard_gumbel_tones: false | |
| dcnar_hubert_downsample: 1 | |
| dcnar_inpaint_vae: false | |
| dcnar_inpaint_vae_kld_loss_weight: 0 | |
| dcnar_inpaint_vae_latent_dim: 32 | |
| dcnar_inpaint_vae_warmup_steps: 5000 | |
| dcnar_inpaint_vae_weight_step_size: 0.0002 | |
| dcnar_local_f0: false | |
| dcnar_local_intensity: false | |
| dcnar_local_style: false | |
| dcnar_lr: 0.0001 | |
| dcnar_mel_adv: false | |
| dcnar_mel_loss_weight: 10.0 | |
| dcnar_mixed_sr_loss: false | |
| dcnar_n_terminal_tones: 0 | |
| dcnar_ph_f0_loss_weight: 1.0 | |
| dcnar_ph_hubert_loss_weight: 1.0 | |
| dcnar_ph_intensity_loss_weight: 1.0 | |
| dcnar_pitch_adv: false | |
| dcnar_prosody_adv: false | |
| dcnar_prosody_stats_cond: false | |
| dcnar_pstat_weight_f0_mean: 10 | |
| dcnar_pstat_weight_f0_std: 100 | |
| dcnar_pstat_weight_intensity_mean: 10 | |
| dcnar_pstat_weight_intensity_std: 0 | |
| dcnar_pstat_weight_phdur_mean: 1 | |
| dcnar_pstat_weight_phdur_std: 1 | |
| dcnar_reverb_label: false | |
| dcnar_sampler: default | |
| dcnar_sr_label: false | |
| dcnar_terminal_tone_usl_weight: 0 | |
| dcnar_terminal_tone_weight: 0 | |
| dcnar_upsampling: gaussian | |
| dcnar_use_log_f0_frames: false | |
| dcnar_use_toucan_utt_embs: false | |
| dcnar_usl_mfcc: false | |
| dcnar_usl_mfcc_deltas: false | |
| dcnar_usl_mfcc_dim: 12 | |
| dcnar_usl_mfcc_var_dec: false | |
| dcnar_usl_slim: false | |
| dcnar_usl_slim_dim: 16 | |
| dcnar_usl_with_f0: false | |
| dcnar_utt_dur_loss_weight: 0 | |
| dcnar_vc_local_hubert: false | |
| dcnar_vc_mode: nn | |
| dcnar_vc_text_predict: false | |
| dcnar_vuv_loss_weight: 0.5 | |
| dcvoc_causal: false | |
| dcvoc_causal_lookahead: 3 | |
| dcvoc_channel_downsample_mode: interleave | |
| dcvoc_convs_per_scale: 8 | |
| dcvoc_disc_duplicates: 1 | |
| dcvoc_disc_mpwd: true | |
| dcvoc_disc_mrsd: false | |
| dcvoc_disc_pdd: true | |
| dcvoc_disc_phase_aug: false | |
| dcvoc_discriminator_bound: 1.01 | |
| dcvoc_groups_init: 8 | |
| dcvoc_halfres_conv: true | |
| dcvoc_hidden_init: 1024 | |
| dcvoc_hop: 8 | |
| dcvoc_kernel: 7 | |
| dcvoc_mel_bneck: 256 | |
| dcvoc_smpwd_hidden_max: 1024 | |
| dcvoc_smpwd_periods: | |
| - 2 | |
| - 3 | |
| - 5 | |
| - 7 | |
| - 9 | |
| - 11 | |
| - 13 | |
| dcvoc_upsample_method: linear | |
| denoise: false | |
| dfd_clip_stft: 1.0e-09 | |
| dfd_ramdisk_path: /mnt/ramdisk | |
| ema_coeff: 0.99995 | |
| emo_embedded_speaker_id: false | |
| emotion_adv: false | |
| enable_eos_bos_chars: true | |
| encoder_type: voice_encoder | |
| eval_crosslang: false | |
| eval_langs: dataset | |
| eval_max_ref_samples: 192 | |
| eval_max_repeats: 1 | |
| eval_max_runs: 10 | |
| eval_max_sentences: 192 | |
| eval_mbnet_name: null | |
| eval_models_dir: saved_models | |
| eval_n_plots: 2 | |
| eval_n_wavs: 4 | |
| eval_reference: train | |
| eval_syn_batch_size: 64 | |
| eval_text_source: default | |
| eval_ve_name: universal/ve_v2 | |
| eval_voc_max_frames: 2000 | |
| eval_voc_name: null | |
| f0_mode: praat | |
| flatten_lstm_params: true | |
| fmax: 16000 | |
| fmin: 0 | |
| frames_per_framegroup: 10 | |
| freeze_mel_head: false | |
| gmvae_ema_lr: 0.0001 | |
| gmvae_latent_dim: 16 | |
| gmvae_num_components: 0 | |
| gpt_masked_loss: false | |
| gpt_prod_max_text: 200 | |
| gpt_speaker_ref_type: same_speaker | |
| gpt_transformer_type: gpt2-medium | |
| hifigan_channels: 256 | |
| hooli_enc_dims: 256 | |
| hooli_filter_size: 257 | |
| hooli_inv_no_uv: false | |
| hooli_inv_pitch_diff_reg_weight: 0 | |
| hooli_inv_pitch_shift_reg_weight: 0 | |
| hooli_nfft: 16 | |
| hooli_osc_freq_cutoff: 0.15 | |
| hooli_safe_step: true | |
| hooli_tv_fir: false | |
| hooli_wn_dims: 64 | |
| hooligan_discriminators: univnet | |
| hooligan_istft: true | |
| hop_size: 320 | |
| input_pos_emb: handled_internally_by_backbone | |
| is_lora: false | |
| language_embed_size: 16 | |
| legacy_gpt_hidden_size: 1024 | |
| lfcc_nfilts: 128 | |
| llama_config_name: Llama_520M | |
| lora_alpha: 64 | |
| lora_dropout: 0.05 | |
| lora_r: 32 | |
| lossynet_bsize: 25 | |
| lossynet_clip_stft: 1.0e-09 | |
| lossynet_lr: 0.001 | |
| lossynet_n_out_classes: 2 | |
| lowest_sr: 8000 | |
| max_LR: 0.001 | |
| max_conditioning_inputs: 2 | |
| max_decoder_frames: 2000 | |
| max_f0_freq: 600 | |
| max_speech_tokens: 604 | |
| max_text_tokens: 402 | |
| max_total_tokens: 8196 | |
| mel_pad_difference: 1 | |
| mel_power: 1.0 | |
| mel_type: db | |
| min_LR: 1.0e-06 | |
| min_f0_freq: 75 | |
| mpbert_n_freeze: 0 | |
| mpbert_tokenizer: null | |
| mpbert_type: transformer | |
| mu_law: true | |
| n_cqcc_bins: 96 | |
| n_cqt_bins: 84 | |
| n_fft: 2048 | |
| n_gpt_channels: 1024 | |
| n_reverbs: 256 | |
| n_spk_cond_samples: 2 | |
| n_state_per_symbol: 1 | |
| n_transformer_heads: 16 | |
| n_transformer_layers: 30 | |
| normalize_loudness: false | |
| normalized_mels: true | |
| num_ceps: 29 | |
| num_diacritcs: 512 | |
| num_freq: 1025 | |
| num_heads: 4 | |
| num_mels: 256 | |
| num_style_tokens: 0 | |
| num_tones: 16 | |
| onehot_language: false | |
| onehot_speaker: false | |
| pf_word_boundaries: false | |
| phonemizer_backend: espeak | |
| preemphasis: 0.97 | |
| preemphasize_voc_target: false | |
| prenet_type: original | |
| project_conditioning: false | |
| prosody_embed_size: 0 | |
| r_schedule: | |
| - - 1 | |
| - -1 | |
| rvc_emb_channels: 768 | |
| rvc_enc_spk_input: false | |
| rvc_f0_up: 0 | |
| rvc_f0_voc: true | |
| rvc_filter_channels: 768 | |
| rvc_gin_channels: 256 | |
| rvc_hidden_channels: 192 | |
| rvc_inter_channels: 192 | |
| rvc_kernel_size: 3 | |
| rvc_mel_bins: 80 | |
| rvc_n_heads: 2 | |
| rvc_n_layers: 6 | |
| rvc_p_dropout: 0 | |
| rvc_resblock: '1' | |
| rvc_resblock_dilation_sizes: | |
| - - 1 | |
| - 3 | |
| - 5 | |
| - - 1 | |
| - 3 | |
| - 5 | |
| - - 1 | |
| - 3 | |
| - 5 | |
| rvc_resblock_kernel_sizes: | |
| - 3 | |
| - 7 | |
| - 11 | |
| rvc_seg_enc_size_frames: 370 | |
| rvc_seg_enc_size_samples: 118400 | |
| rvc_seg_voc_size_frames: 40 | |
| rvc_seg_voc_size_samples: 12800 | |
| rvc_speaker_enc: table | |
| rvc_speaker_enc_type: V1 | |
| rvc_speaker_pitch: null | |
| rvc_spec_channels: 513 | |
| rvc_spk_embed_dim: 109 | |
| rvc_stft_filter_len: 1024 | |
| rvc_stft_win_len: 1024 | |
| rvc_train_kl_weight: 1.0 | |
| rvc_train_mel_weight: 45 | |
| rvc_upsample_initial_channel: 512 | |
| rvc_upsample_kernel_sizes: | |
| - 20 | |
| - 16 | |
| - 4 | |
| - 4 | |
| rvc_upsample_rates: | |
| - 10 | |
| - 8 | |
| - 2 | |
| - 2 | |
| rvc_use_f0: true | |
| sample_rate: 32000 | |
| scheduler_max_total_steps: 200000 | |
| seed: 0 | |
| self_conditioning: false | |
| separate_stopnet: false | |
| singing_dim: 4 | |
| speaker_embed_size: 256 | |
| speech_cond_prompt_len: 250 | |
| speech_token_type: tortoise | |
| speech_tokens_dict_size: 6563 | |
| speed_scale: 0.1 | |
| start_speech_token: 6561 | |
| start_text_token: 255 | |
| stepwise_sigmoid_noise: 2.0 | |
| stft_magnitude_min: 0.0001 | |
| stop_speech_token: 6562 | |
| stop_text_token: 0 | |
| stop_threshold: 0.25 | |
| style_embed_size: 256 | |
| supports_cfg: false | |
| symbol_type: tortoise/data/gpt2_medium.json | |
| syn_ar_f0_predict: true | |
| syn_batch_frames: 16000 | |
| syn_batch_size: 32 | |
| syn_mel_scale: 1 | |
| syn_predict_f0: true | |
| syn_sampler: binnedlength | |
| syn_symmetric_mel: false | |
| syn_train_max_frames: 700 | |
| syn_train_min_duration: 1 | |
| taco1_postnet: true | |
| taco_decoder_att_rnn_dim: 1024 | |
| taco_decoder_prenet_dim: 256 | |
| taco_decoder_rnn_dim: 1024 | |
| taco_disjoint_conditioning: true | |
| taco_encoder_dim: 512 | |
| taco_grad_clip: 1 | |
| taco_loss_masking: true | |
| taco_lr: 0.0001 | |
| taco_weight_decay: 1.0e-06 | |
| target_loudness: -18 | |
| text_loss_weight: 0.1 | |
| text_preproc: none | |
| text_tokens_dict_size: 50276 | |
| ti_vocoder: false | |
| toucan_utt_emb_dim: 704 | |
| trim_silence: true | |
| upsample_factors: !!python/tuple | |
| - 5 | |
| - 8 | |
| - 8 | |
| upsample_rate: null | |
| upsamplenet_dropout: false | |
| upsamplenet_lr: 1.0e-05 | |
| use_adv_speaker_classifier: false | |
| use_clap_embeds: false | |
| use_diacritic: false | |
| use_emotion_table: false | |
| use_lamb_optimizer: false | |
| use_language_table: false | |
| use_monotonic_alignment: false | |
| use_mpbert: false | |
| use_one_cycle_lr: false | |
| use_perceiver_resampler: false | |
| use_pf: false | |
| use_ph_durations: false | |
| use_singing_labels: false | |
| use_snr_labels: false | |
| use_speaker_table: false | |
| use_speech_codes_as_input: true | |
| use_sv2tts: false | |
| use_tb: false | |
| use_tone: false | |
| use_tpgst: false | |
| use_wandb: false | |
| vad_algo: webrtc | |
| vad_margin: 0.1 | |
| validate_sr: true | |
| validate_wav_len: true | |
| vc_mel2f0: false | |
| vc_soft_gt_pitch: false | |
| vc_soft_units: true | |
| ve_final_relu: false | |
| ve_hidden_size: 768 | |
| ve_lr: 0.0001 | |
| ve_min_samples: 20 | |
| ve_partial_frames: 128 | |
| ve_spk_batch_size: 128 | |
| ve_utt_batch_size: 10 | |
| voc_future_horizon: 11 | |
| voc_lvc: false | |
| voc_lvc_dims: 8 | |
| voc_noise_fir: true | |
| voc_subscale: 0 | |
| voc_train_max_duration: 30 | |
| voc_train_min_duration: 1.5 | |
| voc_voiced_logits_scale: 0 | |
| vocoder_bsize: 16 | |
| vocoder_fc_dims: 512 | |
| vocoder_hidden_size: 512 | |
| vocoder_input_length: 16000 | |
| vocoder_input_pad: 0 | |
| vocoder_lr: 0.0001 | |
| vocoder_mode: MOL | |
| wandb_watch_model: false | |
| webrtc_mode: 2 | |
| weight_init: false | |
| win_size: 2048 | |