|
|
| |
| model: Emotion2vec |
| model_conf: |
| _name: data2vec_multi |
| activation_dropout: 0.0 |
| adversarial_hidden_dim: 128 |
| adversarial_training: false |
| adversarial_weight: 0.1 |
| attention_dropout: 0.1 |
| average_top_k_layers: 16 |
| batch_norm_target_layer: false |
| clone_batch: 12 |
| cls_loss: 1.0 |
| cls_type: chunk |
| d2v_loss: 1.0 |
| decoder_group: false |
| depth: 8 |
| dropout_input: 0.0 |
| ema_anneal_end_step: 20000 |
| ema_decay: 0.9997 |
| ema_encoder_only: false |
| ema_end_decay: 1.0 |
| ema_same_dtype: true |
| embed_dim: 1024 |
| encoder_dropout: 0.1 |
| end_drop_path_rate: 0.0 |
| end_of_block_targets: false |
| instance_norm_target_layer: true |
| instance_norm_targets: false |
| layer_norm_first: false |
| layer_norm_target_layer: false |
| layer_norm_targets: false |
| layerdrop: 0.0 |
| log_norms: true |
| loss_beta: 0.0 |
| loss_scale: null |
| mae_init: false |
| max_update: 100000 |
| min_pred_var: 0.01 |
| min_target_var: 0.1 |
| mlp_ratio: 4.0 |
| normalize: true |
| modalities: |
| _name: null |
| audio: |
| add_masks: false |
| alibi_max_pos: null |
| alibi_scale: 1.0 |
| conv_pos_depth: 5 |
| conv_pos_groups: 16 |
| conv_pos_pre_ln: false |
| conv_pos_width: 95 |
| decoder: |
| add_positions_all: false |
| add_positions_masked: false |
| decoder_dim: 768 |
| decoder_groups: 16 |
| decoder_kernel: 7 |
| decoder_layers: 4 |
| decoder_residual: true |
| input_dropout: 0.1 |
| projection_layers: 1 |
| projection_ratio: 2.0 |
| ema_local_encoder: false |
| encoder_zero_mask: true |
| end_drop_path_rate: 0.0 |
| extractor_mode: layer_norm |
| feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' |
| init_extra_token_zero: true |
| inverse_mask: false |
| keep_masked_pct: 0.0 |
| learned_alibi: false |
| learned_alibi_scale: true |
| learned_alibi_scale_per_head: true |
| learned_alibi_scale_per_layer: false |
| local_grad_mult: 1.0 |
| mask_channel_length: 64 |
| mask_channel_prob: 0.0 |
| mask_dropout: 0.0 |
| mask_length: 5 |
| mask_noise_std: 0.01 |
| mask_prob: 0.55 |
| mask_prob_adjust: 0.1 |
| mask_prob_min: null |
| model_depth: 8 |
| num_alibi_heads: 16 |
| num_extra_tokens: 10 |
| prenet_depth: 4 |
| prenet_dropout: 0.1 |
| prenet_layerdrop: 0.0 |
| remove_masks: false |
| start_drop_path_rate: 0.0 |
| type: AUDIO |
| use_alibi_encoder: true |
| image: |
| add_masks: false |
| alibi_dims: 2 |
| alibi_distance: manhattan |
| alibi_max_pos: null |
| alibi_scale: 1.0 |
| decoder: |
| add_positions_all: false |
| add_positions_masked: false |
| decoder_dim: 384 |
| decoder_groups: 16 |
| decoder_kernel: 5 |
| decoder_layers: 5 |
| decoder_residual: true |
| input_dropout: 0.1 |
| projection_layers: 1 |
| projection_ratio: 2.0 |
| ema_local_encoder: false |
| embed_dim: 768 |
| enc_dec_transformer: false |
| encoder_zero_mask: true |
| end_drop_path_rate: 0.0 |
| fixed_positions: true |
| in_chans: 3 |
| init_extra_token_zero: true |
| input_size: 224 |
| inverse_mask: false |
| keep_masked_pct: 0.0 |
| learned_alibi: false |
| learned_alibi_scale: false |
| learned_alibi_scale_per_head: false |
| learned_alibi_scale_per_layer: false |
| local_grad_mult: 1.0 |
| mask_channel_length: 64 |
| mask_channel_prob: 0.0 |
| mask_dropout: 0.0 |
| mask_length: 5 |
| mask_noise_std: 0.01 |
| mask_prob: 0.7 |
| mask_prob_adjust: 0.0 |
| mask_prob_min: null |
| model_depth: 8 |
| num_alibi_heads: 16 |
| num_extra_tokens: 0 |
| patch_size: 16 |
| prenet_depth: 4 |
| prenet_dropout: 0.0 |
| prenet_layerdrop: 0.0 |
| remove_masks: false |
| start_drop_path_rate: 0.0 |
| transformer_decoder: false |
| type: IMAGE |
| use_alibi_encoder: false |
| text: |
| add_masks: false |
| alibi_max_pos: null |
| alibi_scale: 1.0 |
| decoder: |
| add_positions_all: false |
| add_positions_masked: false |
| decoder_dim: 384 |
| decoder_groups: 16 |
| decoder_kernel: 5 |
| decoder_layers: 5 |
| decoder_residual: true |
| input_dropout: 0.1 |
| projection_layers: 1 |
| projection_ratio: 2.0 |
| dropout: 0.1 |
| ema_local_encoder: false |
| encoder_zero_mask: true |
| end_drop_path_rate: 0.0 |
| init_extra_token_zero: true |
| inverse_mask: false |
| keep_masked_pct: 0.0 |
| layernorm_embedding: true |
| learned_alibi: false |
| learned_alibi_scale: false |
| learned_alibi_scale_per_head: false |
| learned_alibi_scale_per_layer: false |
| learned_pos: true |
| local_grad_mult: 1.0 |
| mask_channel_length: 64 |
| mask_channel_prob: 0.0 |
| mask_dropout: 0.0 |
| mask_length: 5 |
| mask_noise_std: 0.01 |
| mask_prob: 0.7 |
| mask_prob_adjust: 0.0 |
| mask_prob_min: null |
| max_source_positions: 512 |
| model_depth: 8 |
| no_scale_embedding: true |
| no_token_positional_embeddings: false |
| num_alibi_heads: 16 |
| num_extra_tokens: 0 |
| prenet_depth: 4 |
| prenet_dropout: 0.0 |
| prenet_layerdrop: 0.0 |
| remove_masks: false |
| start_drop_path_rate: 0.0 |
| type: TEXT |
| use_alibi_encoder: false |
| norm_affine: true |
| norm_eps: 1.0e-05 |
| num_heads: 16 |
| post_mlp_drop: 0.1 |
| recon_loss: 0.0 |
| seed: 1 |
| shared_decoder: null |
| skip_ema: false |
| start_drop_path_rate: 0.0 |
| supported_modality: AUDIO |
|
|
| tokenizer: CharTokenizer |
| tokenizer_conf: |
| unk_symbol: <unk> |
| split_with_space: true |
|
|
| scope_map: |
| - 'd2v_model.' |
| - none |
|
|
|
|
|
|