chatterbox-turbo / t3_turbo_v1.yaml

Upload folder using huggingface_hub

3c40542 verified about 1 month ago

8.46 kB

	# General switches, loss weights and sizes. Keys in this block appear in
	# ASCII-sorted order.
	align_f0: false
	align_loss_weight: 1.0
	asc_loss_weight: 0.02
	attention_mechanism: graves
	augment_sr: false
	# null: no value is set for this key.
	base_model: null
	bit_depth: 9
	causal_convs: false
	causal_decoder: false
	# NOTE(review): use_clap_embeds is false elsewhere in this file, so this
	# size is presumably unused — confirm against the consumer.
	clap_dims: 512
	compat_dcnar_f0_std_cond: false
	# Four dilation values, each three times the previous one (1, 3, 9, 27).
	# The !!python/tuple tag is kept so the value still loads as a Python tuple;
	# loaders without Python-tag support (safe loaders) reject this tag.
	conv_stack_dilation: !!python/tuple [1, 3, 9, 27]
	# Two standalone boolean switches, both disabled.
	convbn_bias: false
	cudnn_deterministic: false
	# Start of the `dcnar_` key family (it continues after this block).
	# Keys set to null carry no value.
	dcnar_1d_discrim: false
	dcnar_aligner_kernel: 5
	dcnar_aligner_type: null
	dcnar_allow_trivial_speaker_table: true
	dcnar_batch_size: 24
	# NOTE(review): dcnar_conformer is false, so the dcnar_conformer_attn_*
	# values below are presumably ignored — confirm against the consumer.
	dcnar_conformer: false
	dcnar_conformer_attn_chunk_size: null
	dcnar_conformer_attn_dim_head: 64
	dcnar_conformer_attn_ff_mult: 4
	dcnar_conformer_attn_win_size: null
	dcnar_conv_weight_grouping: 1
	# `dcnar_` keys: `*_loss_weight` coefficients, `dim_*` sizes and
	# f0-conditioning flags.
	dcnar_df0_loss_weight: 0.5
	dcnar_dim_lrg: 512
	dcnar_dim_sml: 256
	dcnar_dim_style: 32
	dcnar_discrim_tanh: false
	# Written as an integer (1), unlike the neighbouring float-valued weights.
	dcnar_dtw_loss_weight: 1
	dcnar_dur_loss_weight: 0.1
	dcnar_dur_pred_scale: linear
	# NOTE(review): teacher forcing is true while dcnar_f0_cond_mel_decoding
	# itself is false; presumably the former has no effect here — confirm.
	dcnar_f0_cond_mel_decoding: false
	dcnar_f0_cond_mel_decoding_teacher_forcing: true
	dcnar_f0_loss_weight: 0.5
	dcnar_gan_dims: 64
	# `dcnar_` style/tone flags, the `dcnar_inpaint_vae*` settings, the
	# `dcnar_local_*` flags (all false) and the learning rate.
	dcnar_global_style: true
	dcnar_hard_gumbel_tones: false
	dcnar_hubert_downsample: 1
	# NOTE(review): dcnar_inpaint_vae is false and its KLD loss weight is 0, so
	# the other dcnar_inpaint_vae_* values are presumably inactive — confirm.
	dcnar_inpaint_vae: false
	dcnar_inpaint_vae_kld_loss_weight: 0
	dcnar_inpaint_vae_latent_dim: 32
	dcnar_inpaint_vae_warmup_steps: 5000
	dcnar_inpaint_vae_weight_step_size: 0.0002
	dcnar_local_f0: false
	dcnar_local_intensity: false
	dcnar_local_style: false
	dcnar_lr: 0.0001
	# `dcnar_` `*_adv` flags (all false), the `dcnar_ph_*_loss_weight` values
	# (all 1.0) and the `dcnar_pstat_weight_*` values.
	dcnar_mel_adv: false
	dcnar_mel_loss_weight: 10.0
	dcnar_mixed_sr_loss: false
	dcnar_n_terminal_tones: 0
	dcnar_ph_f0_loss_weight: 1.0
	dcnar_ph_hubert_loss_weight: 1.0
	dcnar_ph_intensity_loss_weight: 1.0
	dcnar_pitch_adv: false
	dcnar_prosody_adv: false
	# NOTE(review): dcnar_prosody_stats_cond is false; the dcnar_pstat_weight_*
	# values below are presumably only read when it is enabled — confirm.
	dcnar_prosody_stats_cond: false
	dcnar_pstat_weight_f0_mean: 10
	dcnar_pstat_weight_f0_std: 100
	dcnar_pstat_weight_intensity_mean: 10
	dcnar_pstat_weight_intensity_std: 0
	dcnar_pstat_weight_phdur_mean: 1
	dcnar_pstat_weight_phdur_std: 1
	# Remaining `dcnar_` keys: `*_label` flags, terminal-tone weights (both 0),
	# `dcnar_usl_*` settings, `dcnar_vc_*` settings and two more loss weights.
	dcnar_reverb_label: false
	dcnar_sampler: default
	dcnar_sr_label: false
	dcnar_terminal_tone_usl_weight: 0
	dcnar_terminal_tone_weight: 0
	dcnar_upsampling: gaussian
	dcnar_use_log_f0_frames: false
	dcnar_use_toucan_utt_embs: false
	# Every boolean dcnar_usl_* flag below is false; only the two *_dim sizes
	# carry non-boolean values.
	dcnar_usl_mfcc: false
	dcnar_usl_mfcc_deltas: false
	dcnar_usl_mfcc_dim: 12
	dcnar_usl_mfcc_var_dec: false
	dcnar_usl_slim: false
	dcnar_usl_slim_dim: 16
	dcnar_usl_with_f0: false
	dcnar_utt_dur_loss_weight: 0
	dcnar_vc_local_hubert: false
	dcnar_vc_mode: nn
	dcnar_vc_text_predict: false
	dcnar_vuv_loss_weight: 0.5
	# Scalar settings of the `dcvoc_` key family.
	# NOTE(review): dcvoc_causal is false, so dcvoc_causal_lookahead is
	# presumably unused — confirm against the consumer.
	dcvoc_causal: false
	dcvoc_causal_lookahead: 3
	dcvoc_channel_downsample_mode: interleave
	dcvoc_convs_per_scale: 8
	# `dcvoc_disc_*` switches: mpwd and pdd are enabled, mrsd and phase_aug are
	# disabled.
	dcvoc_disc_duplicates: 1
	dcvoc_disc_mpwd: true
	dcvoc_disc_mrsd: false
	dcvoc_disc_pdd: true
	dcvoc_disc_phase_aug: false
	dcvoc_discriminator_bound: 1.01
	dcvoc_groups_init: 8
	dcvoc_halfres_conv: true
	dcvoc_hidden_init: 1024
	dcvoc_hop: 8
	dcvoc_kernel: 7
	dcvoc_mel_bneck: 256
	dcvoc_smpwd_hidden_max: 1024
	# Seven period values, written as a flow sequence.
	dcvoc_smpwd_periods: [2, 3, 5, 7, 9, 11, 13]
	dcvoc_upsample_method: linear
	# Assorted settings from `denoise` through `encoder_type`.
	denoise: false
	# Same numeric value as lossynet_clip_stft elsewhere in this file.
	dfd_clip_stft: 1.0e-09
	# Absolute filesystem path.
	dfd_ramdisk_path: /mnt/ramdisk
	ema_coeff: 0.99995
	emo_embedded_speaker_id: false
	emotion_adv: false
	enable_eos_bos_chars: true
	encoder_type: voice_encoder
	# The `eval_` key family. Keys set to null (eval_mbnet_name, eval_voc_name)
	# carry no value.
	eval_crosslang: false
	eval_langs: dataset
	eval_max_ref_samples: 192
	eval_max_repeats: 1
	eval_max_runs: 10
	eval_max_sentences: 192
	eval_mbnet_name: null
	# Relative path-like value — presumably resolved by the consumer; confirm
	# the base directory there.
	eval_models_dir: saved_models
	eval_n_plots: 2
	eval_n_wavs: 4
	eval_reference: train
	eval_syn_batch_size: 64
	eval_text_source: default
	eval_ve_name: universal/ve_v2
	# Same numeric value as max_decoder_frames elsewhere in this file.
	eval_voc_max_frames: 2000
	eval_voc_name: null
	# Settings from `f0_mode` through the `gpt_` keys.
	f0_mode: praat
	flatten_lstm_params: true
	# fmax equals half of sample_rate (32000) as set elsewhere in this file.
	fmax: 16000
	fmin: 0
	frames_per_framegroup: 10
	freeze_mel_head: false
	gmvae_ema_lr: 0.0001
	gmvae_latent_dim: 16
	# NOTE(review): a component count of 0 presumably disables the gmvae_*
	# settings — confirm against the consumer.
	gmvae_num_components: 0
	gpt_masked_loss: false
	gpt_prod_max_text: 200
	gpt_speaker_ref_type: same_speaker
	gpt_transformer_type: gpt2-medium
	# `hifigan_`, `hooli_` and `hooligan_` keys.
	hifigan_channels: 256
	hooli_enc_dims: 256
	hooli_filter_size: 257
	hooli_inv_no_uv: false
	# Both hooli_inv_pitch_*_reg_weight values are 0.
	hooli_inv_pitch_diff_reg_weight: 0
	hooli_inv_pitch_shift_reg_weight: 0
	hooli_nfft: 16
	hooli_osc_freq_cutoff: 0.15
	hooli_safe_step: true
	hooli_tv_fir: false
	hooli_wn_dims: 64
	hooligan_discriminators: univnet
	hooligan_istft: true
	# Settings from `hop_size` through `lowest_sr`.
	# 320 / 32000 (sample_rate elsewhere in this file) = 0.01, i.e. 10 ms per
	# hop if hop_size is counted in samples — TODO confirm units.
	hop_size: 320
	input_pos_emb: handled_internally_by_backbone
	# NOTE(review): is_lora is false, so the lora_* values below are presumably
	# unused — confirm against the consumer.
	is_lora: false
	language_embed_size: 16
	# Same numeric value as n_gpt_channels elsewhere in this file (1024).
	legacy_gpt_hidden_size: 1024
	lfcc_nfilts: 128
	llama_config_name: Llama_520M
	lora_alpha: 64
	lora_dropout: 0.05
	lora_r: 32
	lossynet_bsize: 25
	lossynet_clip_stft: 1.0e-09
	lossynet_lr: 0.001
	lossynet_n_out_classes: 2
	lowest_sr: 8000
	# `max_*` / `min_*` limits, `mel_*` settings and `mpbert_*` settings.
	# Upper-case "LR" sorts before lower-case letters, which is why max_LR and
	# min_LR lead their groups.
	max_LR: 0.001
	max_conditioning_inputs: 2
	max_decoder_frames: 2000
	max_f0_freq: 600
	max_speech_tokens: 604
	max_text_tokens: 402
	# NOTE(review): 8196 is four more than the power of two 8192 — confirm this
	# value is intentional.
	max_total_tokens: 8196
	mel_pad_difference: 1
	mel_power: 1.0
	mel_type: db
	min_LR: 1.0e-06
	min_f0_freq: 75
	mpbert_n_freeze: 0
	mpbert_tokenizer: null
	mpbert_type: transformer
	# Settings from `mu_law` through the `n_*` and `num_*` counts.
	mu_law: true
	n_cqcc_bins: 96
	n_cqt_bins: 84
	# Same numeric value as win_size elsewhere in this file (2048).
	n_fft: 2048
	n_gpt_channels: 1024
	n_reverbs: 256
	n_spk_cond_samples: 2
	n_state_per_symbol: 1
	n_transformer_heads: 16
	n_transformer_layers: 30
	normalize_loudness: false
	normalized_mels: true
	num_ceps: 29
	# NOTE(review): the key is spelled "diacritcs" (not "diacritics"). It is
	# left unchanged because the consumer presumably looks it up by this exact
	# name — confirm before renaming.
	num_diacritcs: 512
	# 1025 = n_fft / 2 + 1 with n_fft = 2048.
	num_freq: 1025
	num_heads: 4
	num_mels: 256
	num_style_tokens: 0
	num_tones: 16
	# Settings from the `onehot_*` flags (both false) through
	# `prosody_embed_size`.
	onehot_language: false
	onehot_speaker: false
	pf_word_boundaries: false
	phonemizer_backend: espeak
	preemphasis: 0.97
	preemphasize_voc_target: false
	prenet_type: original
	project_conditioning: false
	prosody_embed_size: 0
	# A list holding one two-element entry, [1, -1]. The inner pair is written
	# in flow style so its nesting does not depend on leading whitespace: with
	# both "-" lines at the same indent, the value would parse as [[1], -1].
	# NOTE(review): the pair shape is reconstructed from the usual block-style
	# dump of a nested list — confirm against the consumer's expected schema.
	r_schedule:
	- [1, -1]
	# First run of scalar settings in the `rvc_` key family.
	rvc_emb_channels: 768
	rvc_enc_spk_input: false
	rvc_f0_up: 0
	rvc_f0_voc: true
	rvc_filter_channels: 768
	rvc_gin_channels: 256
	rvc_hidden_channels: 192
	rvc_inter_channels: 192
	rvc_kernel_size: 3
	rvc_mel_bins: 80
	rvc_n_heads: 2
	rvc_n_layers: 6
	rvc_p_dropout: 0
	# Quoted, so this loads as the string "1" rather than the integer 1.
	rvc_resblock: '1'
	# Three [1, 3, 5] dilation triples — the same count as the three entries of
	# rvc_resblock_kernel_sizes, so presumably one triple per kernel size.
	# Each triple is written in flow style so the nesting does not depend on
	# leading whitespace: with every "-" line at the same indent, the value
	# would parse as the flat mix [[1], 3, 5, [1], 3, 5, [1], 3, 5].
	rvc_resblock_dilation_sizes:
	- [1, 3, 5]
	- [1, 3, 5]
	- [1, 3, 5]
	# Three kernel sizes, written as a flow sequence.
	rvc_resblock_kernel_sizes: [3, 7, 11]
	# Second run of scalar settings in the `rvc_` key family.
	# Each *_samples value equals its *_frames value times 320 (the hop_size
	# set elsewhere in this file): 370 * 320 = 118400 and 40 * 320 = 12800.
	rvc_seg_enc_size_frames: 370
	rvc_seg_enc_size_samples: 118400
	rvc_seg_voc_size_frames: 40
	rvc_seg_voc_size_samples: 12800
	rvc_speaker_enc: table
	rvc_speaker_enc_type: V1
	rvc_speaker_pitch: null
	# 513 = 1024 / 2 + 1, with 1024 being rvc_stft_filter_len below.
	rvc_spec_channels: 513
	rvc_spk_embed_dim: 109
	rvc_stft_filter_len: 1024
	rvc_stft_win_len: 1024
	rvc_train_kl_weight: 1.0
	rvc_train_mel_weight: 45
	rvc_upsample_initial_channel: 512
	# Both lists have four entries, presumably paired by position — confirm
	# against the consumer. Written as flow sequences.
	rvc_upsample_kernel_sizes: [20, 16, 4, 4]
	# 10 * 8 * 2 * 2 = 320, the same number as hop_size elsewhere in this file.
	rvc_upsample_rates: [10, 8, 2, 2]
	rvc_use_f0: true
	# Settings from `sample_rate` through `symbol_type`, including the speech
	# and text token ids.
	sample_rate: 32000
	scheduler_max_total_steps: 200000
	seed: 0
	self_conditioning: false
	separate_stopnet: false
	singing_dim: 4
	speaker_embed_size: 256
	speech_cond_prompt_len: 250
	speech_token_type: tortoise
	# start_speech_token (6561) and stop_speech_token (6562) are the two
	# highest ids below this dictionary size of 6563.
	speech_tokens_dict_size: 6563
	speed_scale: 0.1
	start_speech_token: 6561
	start_text_token: 255
	stepwise_sigmoid_noise: 2.0
	stft_magnitude_min: 0.0001
	stop_speech_token: 6562
	stop_text_token: 0
	stop_threshold: 0.25
	style_embed_size: 256
	supports_cfg: false
	# Relative path-like value ending in .json — presumably resolved by the
	# consumer; confirm the base directory there.
	symbol_type: tortoise/data/gpt2_medium.json
	# The `syn_` key family.
	syn_ar_f0_predict: true
	syn_batch_frames: 16000
	syn_batch_size: 32
	syn_mel_scale: 1
	syn_predict_f0: true
	syn_sampler: binnedlength
	syn_symmetric_mel: false
	# NOTE(review): the maximum is named in frames while the minimum is named
	# as a duration; units of the latter are not visible here — confirm.
	syn_train_max_frames: 700
	syn_train_min_duration: 1
	# The `taco1_` / `taco_` keys: layer sizes, training switches and optimizer
	# values.
	taco1_postnet: true
	# Both *_rnn_dim sizes are 1024; the prenet size is 256.
	taco_decoder_att_rnn_dim: 1024
	taco_decoder_prenet_dim: 256
	taco_decoder_rnn_dim: 1024
	taco_disjoint_conditioning: true
	taco_encoder_dim: 512
	taco_grad_clip: 1
	taco_loss_masking: true
	taco_lr: 0.0001
	taco_weight_decay: 1.0e-06
	# Settings from `target_loudness` through `trim_silence`.
	# NOTE(review): normalize_loudness is false elsewhere in this file, so this
	# target is presumably unused; its unit is not visible here — confirm.
	target_loudness: -18
	text_loss_weight: 0.1
	# The plain scalar `none` loads as the string "none", not as YAML null.
	text_preproc: none
	text_tokens_dict_size: 50276
	ti_vocoder: false
	toucan_utt_emb_dim: 704
	trim_silence: true
	# 5 * 8 * 8 = 320, the same number as hop_size elsewhere in this file.
	# The !!python/tuple tag is kept so the value still loads as a Python tuple;
	# loaders without Python-tag support (safe loaders) reject this tag.
	upsample_factors: !!python/tuple [5, 8, 8]
	# null: no value is set for this key.
	upsample_rate: null
	upsamplenet_dropout: false
	upsamplenet_lr: 1.0e-05
	# The `use_*` feature switches. Every switch in this block is false except
	# use_speech_codes_as_input, which is true.
	use_adv_speaker_classifier: false
	use_clap_embeds: false
	use_diacritic: false
	use_emotion_table: false
	use_lamb_optimizer: false
	use_language_table: false
	use_monotonic_alignment: false
	use_mpbert: false
	use_one_cycle_lr: false
	use_perceiver_resampler: false
	use_pf: false
	use_ph_durations: false
	use_singing_labels: false
	use_snr_labels: false
	use_speaker_table: false
	use_speech_codes_as_input: true
	use_sv2tts: false
	use_tb: false
	use_tone: false
	use_tpgst: false
	use_wandb: false
	# `vad_*`, `validate_*`, `vc_*` and `ve_*` keys.
	vad_algo: webrtc
	vad_margin: 0.1
	validate_sr: true
	validate_wav_len: true
	vc_mel2f0: false
	vc_soft_gt_pitch: false
	vc_soft_units: true
	ve_final_relu: false
	ve_hidden_size: 768
	ve_lr: 0.0001
	ve_min_samples: 20
	ve_partial_frames: 128
	# Two separate batch sizes: one named for speakers (128), one for
	# utterances (10).
	ve_spk_batch_size: 128
	ve_utt_batch_size: 10
	# `voc_*` and `vocoder_*` keys, followed by the final standalone settings.
	voc_future_horizon: 11
	# NOTE(review): voc_lvc is false, so voc_lvc_dims is presumably unused —
	# confirm against the consumer.
	voc_lvc: false
	voc_lvc_dims: 8
	voc_noise_fir: true
	voc_subscale: 0
	voc_train_max_duration: 30
	voc_train_min_duration: 1.5
	voc_voiced_logits_scale: 0
	vocoder_bsize: 16
	vocoder_fc_dims: 512
	vocoder_hidden_size: 512
	vocoder_input_length: 16000
	vocoder_input_pad: 0
	vocoder_lr: 0.0001
	vocoder_mode: MOL
	wandb_watch_model: false
	# vad_algo elsewhere in this file is `webrtc`; this mode value presumably
	# configures that algorithm — confirm.
	webrtc_mode: 2
	weight_init: false
	# Same numeric value as n_fft elsewhere in this file (2048).
	win_size: 2048