Upload 2 files

7067950 verified 3 months ago

6.93 kB

	# set random seed, so that you may reproduce your result.
	# Each !apply: entry calls the named seeding function with the argument list [1986]
	# when the yaml is loaded. No other entry visible in this file references the
	# __set_seedN keys; they only hold the calls.
	__set_seed1: !apply:random.seed [1986]
	__set_seed2: !apply:numpy.random.seed [1986]
	__set_seed3: !apply:torch.manual_seed [1986]
	__set_seed4: !apply:torch.cuda.manual_seed_all [1986]

	# fixed params
	# Shared scalars that later entries pull in through !ref <name>.
	# sample_rate is referenced by hift, both mel_spectrogram entries, resample and compute_f0.
	sample_rate: 24000
	# Both sizes are forwarded to CosyVoice3LM.
	llm_input_size: 896
	llm_output_size: 896
	# Forwarded to the flow model as spk_embed_dim.
	spk_embed_dim: 192
	# Referenced by Qwen2Encoder.pretrain_path and get_tokenizer.token_path.
	# Empty here - presumably overridden when the yaml is loaded; confirm against the caller.
	qwen_pretrain_path: ''
	# Forwarded to the flow model as input_frame_rate.
	token_frame_rate: 25
	# Forwarded to the flow model and multiplied with chunk_size for static_chunk_size.
	token_mel_ratio: 2

	# stream related params
	# Both values are consumed by the DiT estimator of the flow decoder:
	# static_chunk_size is <chunk_size> * <token_mel_ratio>, and
	# num_decoding_left_chunks is forwarded unchanged.
	chunk_size: 25 # streaming inference chunk size, in token
	num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks

	# model params
	# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
	# for system/third_party class/function, we do not require this.
	# !new: instantiates CosyVoice3LM with the nested keys as keyword arguments.
	# Nesting matters: on one flat level `llm` would be a duplicate key and
	# `llm_input_size: !ref <llm_input_size>` would reference itself.
	llm: !new:cosyvoice.llm.llm.CosyVoice3LM
	  llm_input_size: !ref <llm_input_size>
	  llm_output_size: !ref <llm_output_size>
	  # Same value as flow.vocab_size.
	  speech_token_size: 6561
	  length_normalized_loss: True
	  lsm_weight: 0
	  mix_ratio: [5, 15]
	  # Nested Qwen2Encoder instance; its checkpoint path comes from the
	  # top-level qwen_pretrain_path.
	  llm: !new:cosyvoice.llm.llm.Qwen2Encoder
	    pretrain_path: !ref <qwen_pretrain_path>
	  # !name: hands over ras_sampling as a callable with these keyword
	  # arguments pre-bound (it is not called at load time).
	  sampling: !name:cosyvoice.utils.common.ras_sampling
	    top_p: 0.8
	    top_k: 25
	    win_size: 10
	    tau_r: 0.1

	# !new: instantiates CausalMaskedDiffWithDiT with the nested keys as keyword
	# arguments. The sub-objects (pre_lookahead_layer, decoder, cfm_params,
	# estimator) each own the keys indented beneath them; flattened, keys such
	# as in_channels and pre_lookahead_len would collide.
	flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT
	  input_size: 80
	  output_size: 80
	  spk_embed_dim: !ref <spk_embed_dim>
	  output_type: 'mel'
	  # Same value as llm.speech_token_size.
	  vocab_size: 6561
	  input_frame_rate: !ref <token_frame_rate>
	  only_mask_loss: True
	  token_mel_ratio: !ref <token_mel_ratio>
	  # Repeated inside pre_lookahead_layer below; keep the two in sync.
	  pre_lookahead_len: 3
	  pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer
	    in_channels: 80
	    channels: 1024
	    pre_lookahead_len: 3
	  decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
	    in_channels: 240
	    n_spks: 1
	    spk_emb_dim: 80
	    # omegaconf.DictConfig is built from the mapping passed as `content`.
	    cfm_params: !new:omegaconf.DictConfig
	      content:
	        sigma_min: 1e-06
	        solver: 'euler'
	        t_scheduler: 'cosine'
	        training_cfg_rate: 0.2
	        inference_cfg_rate: 0.7
	        reg_loss_type: 'l1'
	    estimator: !new:cosyvoice.flow.DiT.dit.DiT
	      dim: 1024
	      depth: 22
	      heads: 16
	      dim_head: 64
	      ff_mult: 2
	      mel_dim: 80
	      mu_dim: 80
	      spk_dim: 80
	      out_channels: 80
	      # Arithmetic on references: 25 * 2 = 50 with the defaults in this file.
	      static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
	      num_decoding_left_chunks: !ref <num_decoding_left_chunks>

	# !new: instantiates CausalHiFTGenerator with the nested keys as keyword
	# arguments; hifigan.generator references this entry as <hift>.
	hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator
	  in_channels: 80
	  base_channels: 512
	  nb_harmonics: 8
	  sampling_rate: !ref <sample_rate>
	  nsf_alpha: 0.1
	  nsf_sigma: 0.003
	  nsf_voiced_threshold: 10
	  # NOTE(review): 8 * 5 * 3 * hop_len (4) = 480, which equals the hop_size
	  # used by the mel extractors - presumably intentional; confirm before
	  # changing either side.
	  upsample_rates: [8, 5, 3]
	  upsample_kernel_sizes: [16, 11, 7]
	  # Plain nested mapping (no tag): passed as a dict.
	  istft_params:
	    n_fft: 16
	    hop_len: 4
	  resblock_kernel_sizes: [3, 7, 11]
	  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
	  source_resblock_kernel_sizes: [7, 7, 11]
	  source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
	  lrelu_slope: 0.1
	  audio_limit: 0.99
	  conv_pre_look_right: 4
	  f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor
	    num_class: 1
	    in_channels: 80
	    cond_channels: 512

	# gan related module
	# !name: binds mel_spectrogram to the keyword arguments below without
	# calling it; hifigan.mel_spec_transform references the result.
	# The arguments are identical to those of feat_extractor.
	mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
	  n_fft: 1920
	  num_mels: 80
	  sampling_rate: !ref <sample_rate>
	  hop_size: 480
	  win_size: 1920
	  fmin: 0
	  fmax: null
	  center: False
	# !new: instantiates HiFiGan from the <hift> generator, a
	# MultipleDiscriminator (itself built from the mpd and mrd instances) and a
	# one-element list of mel-spectrogram callables.
	hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
	  generator: !ref <hift>
	  discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
	    mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
	    mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
	  mel_spec_transform: [
	    !ref <mel_spec_transform1>
	  ]

	# processor functions
	# Each !name: entry is a callable with the indented keyword arguments
	# pre-bound; the data_pipeline lists reference them by key.
	parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
	get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
	  token_path: !ref <qwen_pretrain_path>
	  skip_special_tokens: True
	  version: cosyvoice3
	# Top-level scalar: tokenize reads it through !ref <allowed_special>.
	allowed_special: 'all'
	tokenize: !name:cosyvoice.dataset.processor.tokenize
	  get_tokenizer: !ref <get_tokenizer>
	  allowed_special: !ref <allowed_special>
	# Processor callables bound with !name:; each owns the keys indented
	# beneath it.
	filter: !name:cosyvoice.dataset.processor.filter
	  max_length: 40960
	  min_length: 100
	  token_max_length: 200
	  token_min_length: 1
	resample: !name:cosyvoice.dataset.processor.resample
	  resample_rate: !ref <sample_rate>
	# Only data_pipeline_gan uses truncate.
	truncate: !name:cosyvoice.dataset.processor.truncate
	  truncate_length: 24480 # must be a multiple of hop_size (24480 = 51 * 480)
	# mel_spectrogram bound with the same arguments as mel_spec_transform1;
	# compute_fbank receives it through !ref <feat_extractor>.
	feat_extractor: !name:matcha.utils.audio.mel_spectrogram
	  n_fft: 1920
	  num_mels: 80
	  sampling_rate: !ref <sample_rate>
	  hop_size: 480
	  win_size: 1920
	  fmin: 0
	  fmax: null
	  center: False
	compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
	  feat_extractor: !ref <feat_extractor>
	# Only data_pipeline_gan uses compute_f0; its hop_size equals the
	# feat_extractor hop_size.
	compute_f0: !name:cosyvoice.dataset.processor.compute_f0
	  sample_rate: !ref <sample_rate>
	  hop_size: 480
	# Remaining processor callables bound with !name:; each owns the keys
	# indented beneath it.
	parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
	  normalize: True
	shuffle: !name:cosyvoice.dataset.processor.shuffle
	  shuffle_size: 1000
	sort: !name:cosyvoice.dataset.processor.sort
	  sort_size: 500 # sort_size should be less than shuffle_size
	batch: !name:cosyvoice.dataset.processor.batch
	  batch_type: 'dynamic'
	  max_frames_in_batch: 2000
	padding: !name:cosyvoice.dataset.processor.padding
	  use_spk_embedding: False # change to True during sft


	# dataset processor pipeline
	# Sequence of references to the processor entries defined in this file,
	# in the order listed. Unlike data_pipeline_gan it omits truncate and
	# compute_f0.
	data_pipeline: [
	  !ref <parquet_opener>,
	  !ref <tokenize>,
	  !ref <filter>,
	  !ref <resample>,
	  !ref <compute_fbank>,
	  !ref <parse_embedding>,
	  !ref <shuffle>,
	  !ref <sort>,
	  !ref <batch>,
	  !ref <padding>,
	]
	# Same sequence as data_pipeline, with truncate inserted after resample
	# and compute_f0 inserted after compute_fbank.
	data_pipeline_gan: [
	  !ref <parquet_opener>,
	  !ref <tokenize>,
	  !ref <filter>,
	  !ref <resample>,
	  !ref <truncate>,
	  !ref <compute_fbank>,
	  !ref <compute_f0>,
	  !ref <parse_embedding>,
	  !ref <shuffle>,
	  !ref <sort>,
	  !ref <batch>,
	  !ref <padding>,
	]

	# llm flow train conf
	# Plain nested mapping (no tags). optim_conf and scheduler_conf are
	# sub-mappings; with the nesting flattened they would be empty and
	# lr / warmup_steps would sit beside them.
	train_conf:
	  optim: adam
	  optim_conf:
	    lr: 1e-5 # change to 1e-5 during sft
	  scheduler: constantlr # change to constantlr during sft
	  scheduler_conf:
	    warmup_steps: 2500
	  max_epoch: 200
	  grad_clip: 5
	  accum_grad: 2
	  log_interval: 100
	  save_per_step: -1

	# gan train conf
	# Plain nested mapping (no tags). The un-suffixed and the *_d keys each
	# carry their own optimizer, optimizer config and scheduler; with the
	# nesting flattened the two lr keys would collide.
	# NOTE(review): the _d suffix presumably denotes the discriminator - confirm.
	train_conf_gan:
	  optim: adam
	  optim_conf:
	    lr: 0.0002 # use small lr for gan training
	  scheduler: constantlr
	  optim_d: adam
	  optim_conf_d:
	    lr: 0.0002 # use small lr for gan training
	  scheduler_d: constantlr
	  max_epoch: 200
	  grad_clip: 5
	  accum_grad: 1 # in gan training, accum_grad must be 1
	  log_interval: 100
	  save_per_step: -1