model:
  class_path: linacodec.model.LinaCodecModel
  init_args:
    config:
      local_ssl_layers: [6, 9]
      global_ssl_layers: [1, 2]
      normalize_ssl_features: true

      downsample_factor: 4
      mel_upsample_factor: 8
      use_conv_downsample: true
      mel_interpolation_mode: linear

      sample_rate: 24000
      n_fft: 1024
      hop_length: 256
      n_mels: 100
      padding: center

    ssl_feature_extractor:
      class_path: linacodec.module.ssl_extractor.SSLFeatureExtractor
      init_args:
        model_name: wavlm_base_plus
        output_layer: 2
        sample_rate: 24000

    local_encoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        n_layers: 6
        n_heads: 12
        window_size: 125
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    local_quantizer:
      class_path: linacodec.module.fsq.FiniteScalarQuantizer
      init_args:
        input_dim: 768
        output_dim: 768
        levels: [8, 8, 8, 5, 5]

    feature_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        n_layers: 6
        n_heads: 12
        window_size: 125
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    global_encoder:
      class_path: linacodec.module.global_encoder.GlobalEncoder
      init_args:
        input_channels: 768
        output_channels: 128
        num_layers: 4
        dim: 384
        intermediate_dim: 1152

    mel_prenet:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        output_dim: 512
        n_layers: 6
        n_heads: 12
        window_size: 31
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    mel_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 512
        output_dim: 100
        n_layers: 6
        n_heads: 8
        window_size: 65
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        adanorm_condition_dim: 128
        use_adaln_zero: true
        use_flash_attention: true

    mel_postnet:
      class_path: linacodec.module.postnet.PostNet
      init_args:
        input_channels: 100
        channels: 256
        kernel_size: 7
        num_layers: 4
        use_layer_norm: true