Text-to-Audio
Transformers
Safetensors
English
dasheng_audiogen
feature-extraction
audio-generation
text-to-speech
text-to-music
sound-effects
diffusion
custom_code
Instructions to use mispeech/Dasheng-AudioGen with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use mispeech/Dasheng-AudioGen with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-to-audio", model="mispeech/Dasheng-AudioGen", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("mispeech/Dasheng-AudioGen", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
```json
{
  "model_type": "dasheng_audiogen",
  "architectures": [
    "DashengAudioGenModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_dasheng_audiogen.DashengAudioGenConfig",
    "AutoModel": "modeling_dasheng_audiogen.DashengAudioGenModel"
  },
  "text_encoder_name": "google/flan-t5-large",
  "tokenizer_name": "mispeech/dashengtokenizer",
  "use_zero_instruction": false,
  "instruction_seq_len": 14,
  "task_instruction_dim": 1024,
  "sample_rate": 16000,
  "downsampling_ratio": 640,
  "latent_dim": 1280,
  "content_dim": 1024,
  "frame_resolution": 0.005,
  "duration_offset": 1.0,
  "tokenizer_max_length": 512,
  "dit_img_size": 1000,
  "dit_patch_size": 1,
  "dit_in_chans": 1280,
  "dit_out_chans": 1280,
  "dit_input_type": "1d",
  "dit_embed_dim": 1536,
  "dit_depth": 32,
  "dit_num_heads": 24,
  "dit_mlp_ratio": 4.0,
  "dit_qk_norm": "layernorm",
  "dit_norm_layer": "layernorm",
  "dit_act_layer": "geglu",
  "dit_context_norm": true,
  "dit_time_fusion": "ada",
  "dit_ada_sola_rank": 32,
  "dit_ada_sola_alpha": 32,
  "dit_ta_context_dim": 1024,
  "dit_ta_context_fusion": "add",
  "dit_ta_context_norm": true,
  "dit_context_dim": 1024,
  "dit_context_fusion": "cross",
  "dit_context_pe_method": "none",
  "dit_pe_method": "none",
  "dit_rope_mode": "shared",
  "adapter_num_heads": 16,
  "adapter_dropout": 0.2,
  "adapter_duration_grad_scale": 0.1,
  "duration_predictor_filter_channels": 512,
  "duration_predictor_n_layers": 5,
  "duration_predictor_kernel_size": 3,
  "duration_predictor_p_dropout": 0.5,
  "special_tokens": [],
  "train_special_tokens": false
}
```