{
  "architectures": [
    "HeartCodec"
  ],
  "attention_head_dim": 64,
  "causal": true,
  "codebook_dim": 32,
  "codebook_size": 8192,
  "commitment_weight": 1.0,
  "decay": 0.9,
  "default_kernel_size": 7,
  "delay_kernel_size": 5,
  "dim": 512,
  "downsample_factors": [
    3,
    4,
    4,
    4,
    5
  ],
  "downsample_kernel_sizes": [
    6,
    8,
    8,
    8,
    10
  ],
  "in_channels": 1024,
  "init_channel": 64,
  "latent_hidden_dim": 128,
  "model_type": "heartcodec",
  "norm_type": "ada_norm_single",
  "num_attention_heads": 24,
  "num_bands": 1,
  "num_layers": 24,
  "num_layers_2": 6,
  "num_quantizers": 8,
  "num_samples": 2,
  "out_channels": 256,
  "res_kernel_size": 7,
  "sample_rate": 48000,
  "threshold_ema_dead_code": 2,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "upsample_factors": [
    5,
    4,
    4,
    4,
    3
  ],
  "upsample_kernel_sizes": [
    10,
    8,
    8,
    8,
    6
  ],
  "use_cosine_sim": false
}
|