Dasheng-AudioGen / config.json
mie237's picture
Upload folder using huggingface_hub
d8bc777 verified
{
"model_type": "dasheng_audiogen",
"architectures": [
"DashengAudioGenModel"
],
"auto_map": {
"AutoConfig": "configuration_dasheng_audiogen.DashengAudioGenConfig",
"AutoModel": "modeling_dasheng_audiogen.DashengAudioGenModel"
},
"text_encoder_name": "google/flan-t5-large",
"tokenizer_name": "mispeech/dashengtokenizer",
"use_zero_instruction": false,
"instruction_seq_len": 14,
"task_instruction_dim": 1024,
"sample_rate": 16000,
"downsampling_ratio": 640,
"latent_dim": 1280,
"content_dim": 1024,
"frame_resolution": 0.005,
"duration_offset": 1.0,
"tokenizer_max_length": 512,
"dit_img_size": 1000,
"dit_patch_size": 1,
"dit_in_chans": 1280,
"dit_out_chans": 1280,
"dit_input_type": "1d",
"dit_embed_dim": 1536,
"dit_depth": 32,
"dit_num_heads": 24,
"dit_mlp_ratio": 4.0,
"dit_qk_norm": "layernorm",
"dit_norm_layer": "layernorm",
"dit_act_layer": "geglu",
"dit_context_norm": true,
"dit_time_fusion": "ada",
"dit_ada_sola_rank": 32,
"dit_ada_sola_alpha": 32,
"dit_ta_context_dim": 1024,
"dit_ta_context_fusion": "add",
"dit_ta_context_norm": true,
"dit_context_dim": 1024,
"dit_context_fusion": "cross",
"dit_context_pe_method": "none",
"dit_pe_method": "none",
"dit_rope_mode": "shared",
"adapter_num_heads": 16,
"adapter_dropout": 0.2,
"adapter_duration_grad_scale": 0.1,
"duration_predictor_filter_channels": 512,
"duration_predictor_n_layers": 5,
"duration_predictor_kernel_size": 3,
"duration_predictor_p_dropout": 0.5,
"special_tokens": [],
"train_special_tokens": false
}