niobures committed on
Commit
a9a8ac2
·
verified ·
1 Parent(s): 46d7256

Style TTS (en, es)

Browse files
.gitattributes CHANGED
@@ -58,3 +58,6 @@ vi,en/StyleTTS2-lite-vi/reference_audio/vn_1.wav filter=lfs diff=lfs merge=lfs -
58
  vi,en/StyleTTS2-lite-vi/reference_audio/vn_2.wav filter=lfs diff=lfs merge=lfs -text
59
  vi,en/StyleTTS2-lite-vi/reference_audio/vn_3.wav filter=lfs diff=lfs merge=lfs -text
60
  vi,en/StyleTTS2-lite-vi/reference_audio/vn_4.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
58
  vi,en/StyleTTS2-lite-vi/reference_audio/vn_2.wav filter=lfs diff=lfs merge=lfs -text
59
  vi,en/StyleTTS2-lite-vi/reference_audio/vn_3.wav filter=lfs diff=lfs merge=lfs -text
60
  vi,en/StyleTTS2-lite-vi/reference_audio/vn_4.wav filter=lfs diff=lfs merge=lfs -text
61
+ en/styletts2-models[[:space:]](SC4949)/narrator.wav filter=lfs diff=lfs merge=lfs -text
62
+ en/styletts2-models[[:space:]](SC4949)/women.wav filter=lfs diff=lfs merge=lfs -text
63
+ es/styletts2-spanish-ft/reference_audio.wav filter=lfs diff=lfs merge=lfs -text
en/StyleTTS2-ONNX-Cpp/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
en/StyleTTS2-ONNX-Cpp/bert_encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c859514cbe9b5c12a50a0046fd769a797b66af184036ebb914578a2e69d5e82a
3
+ size 1575207
en/StyleTTS2-ONNX-Cpp/final_simp.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38b2cd2f39302c45085659151581af1c80375f0d3b7d77ca29bf89a5085ad561
3
+ size 304950337
en/StyleTTS2-ONNX-Cpp/plbert_simp.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f1c9af67134670d5fb1ac08e482ce8e0613e6d083bcc6a0990bcad41da33a51
3
+ size 23106930
en/StyleTTS2-ONNX-Cpp/predictor_encoder_simp.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47a270615ae814f73f6227ce0d51f7c24a790bf4e1b22ebd12550b879d79f604
3
+ size 55399267
en/StyleTTS2-ONNX-Cpp/ref_p.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af4f5173706a7a03cd7512c440808369e7897614887069dec08a92eadcbfdca4
3
+ size 512
en/StyleTTS2-ONNX-Cpp/ref_s.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d846acc721a2b3ddeb150675ea63f30416326ddd0e7ac7c0c80f20aca3105de
3
+ size 512
en/StyleTTS2-ONNX-Cpp/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/DDATT/StyleTTS2-ONNX-Cpp
en/StyleTTS2-ONNX-Cpp/style_encoder_simp.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90d7973e3d89ffe00b80603e6caa6c10de2921291bf406a568076fe25a9eb051
3
+ size 55399267
en/styletts2-models (SC4949)/.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ narrator.wav filter=lfs diff=lfs merge=lfs -text
37
+ women.wav filter=lfs diff=lfs merge=lfs -text
en/styletts2-models (SC4949)/anger.wav ADDED
Binary file (96 kB). View file
 
en/styletts2-models (SC4949)/config.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
2
+ PLBERT_dir: Utils/PLBERT/, batch_size: 8, data_params: {OOD_data: Data/OOD_texts.txt,
3
+ min_length: 50, root_path: '', train_data: Data/train_list.txt, val_data: Data/val_list.txt},
4
+ device: cuda, epochs_1st: 40, epochs_2nd: 25, first_stage_path: first_stage.pth,
5
+ load_only_params: false, log_dir: Models/LibriTTS, log_interval: 10, loss_params: {
6
+ TMA_epoch: 4, diff_epoch: 0, joint_epoch: 0, lambda_F0: 1.0, lambda_ce: 20.0,
7
+ lambda_diff: 1.0, lambda_dur: 1.0, lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0,
8
+ lambda_norm: 1.0, lambda_s2s: 1.0, lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 300,
9
+ model_params: {decoder: {resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3,
10
+ 5]], resblock_kernel_sizes: [3, 7, 11], type: hifigan, upsample_initial_channel: 512,
11
+ upsample_kernel_sizes: [20, 10, 6, 4], upsample_rates: [10, 5, 3, 2]}, diffusion: {
12
+ dist: {estimate_sigma_data: true, mean: -3.0, sigma_data: 0.19926648961191362,
13
+ std: 1.0}, embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2,
14
+ num_heads: 8, num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512,
15
+ max_conv_dim: 512, max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178,
16
+ slm: {hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13,
17
+ sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05,
18
+ lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048,
19
+ win_length: 1200}, sr: 24000}, pretrained_model: Models/LibriTTS/epoch_2nd_00002.pth,
20
+ save_freq: 1, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5,
21
+ iter: 20, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}}
en/styletts2-models (SC4949)/epochs_2nd_00020.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1164ffe19a17449d2c722234cecaf2836b35a698fb8ffd42562d2663657dca0a
3
+ size 771390526
en/styletts2-models (SC4949)/narrator.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e49292afb1769d24c753055835b795b0ad225aa7b4d05cc846697826a9935c7b
3
+ size 635084
en/styletts2-models (SC4949)/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/SC4949/styletts2-models
en/styletts2-models (SC4949)/women.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a7d39beddd2c24d864163ce38e799b261ab0bc23cbea492f0ece046feb131f1
3
+ size 145484
es/styletts2-spanish-ft/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ reference_audio.wav filter=lfs diff=lfs merge=lfs -text
es/styletts2-spanish-ft/README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # StyleTTS2 Spanish Fine-tuned Model
2
+
3
+ Modelo StyleTTS2 fine-tuned para síntesis de voz en español con clonación de voz.
4
+
5
+ ## Descripción
6
+
7
+ Este modelo fue entrenado específicamente para generar voz en español con alta calidad y naturalidad. Incluye capacidades de clonación de voz mediante audio de referencia.
8
+
9
+ ## Características
10
+
11
+ - **Idioma**: Español (acento guatemalteco)
12
+ - **Arquitectura**: StyleTTS2 con diffusion-based synthesis
13
+ - **Epoch**: 49 (segunda fase de entrenamiento)
14
+ - **Sample Rate**: 24kHz
15
+ - **Calidad**: Alta fidelidad con clonación de voz
16
+
17
+ ## Archivos Incluidos
18
+
19
+ - `epoch_2nd_00049.pth`: Checkpoint del modelo (2.1GB)
20
+ - `config_spanish_ft.yml`: Configuración del modelo
21
+ - `reference_audio.wav`: Audio de referencia para clonación de voz (916KB)
22
+
23
+ ## Uso
24
+
25
+ ### Instalación
26
+
27
+ ```bash
28
+ pip install -U "huggingface_hub[cli]"
29
+ huggingface-cli download FenixDS/styletts2-spanish-ft --local-dir styletts2-spanish-ft
30
+ ```
31
+
32
+ ### Integración con VoxBridge
33
+
34
+ Este modelo está diseñado para usarse con [VoxBridge](https://github.com/MrBotGT/VoxBridge). Configuración en `config/default.yaml`:
35
+
36
+ ```yaml
37
+ tts:
38
+   provider: styletts2
39
+   config_path: styletts2-spanish-ft/config_spanish_ft.yml
40
+   checkpoint_path: styletts2-spanish-ft/epoch_2nd_00049.pth
41
+   reference_audio: styletts2-spanish-ft/reference_audio.wav
42
+   alpha: 0.3
43
+   beta: 0.5
44
+   diffusion_steps: 4
45
+   embedding_scale: 2
46
+ ```
47
+
48
+ ### Uso Directo con StyleTTS2
49
+
50
+ ```python
51
+ import torch
52
+ from styletts2 import tts
53
+
54
+ # Cargar modelo
55
+ model = tts.StyleTTS2(
56
+ config_path="styletts2-spanish-ft/config_spanish_ft.yml",
57
+ checkpoint_path="styletts2-spanish-ft/epoch_2nd_00049.pth"
58
+ )
59
+
60
+ # Generar voz
61
+ text = "Hola, este es un ejemplo de síntesis de voz en español."
62
+ reference_audio = "styletts2-spanish-ft/reference_audio.wav"
63
+
64
+ audio = model.inference(
65
+ text=text,
66
+ ref_audio=reference_audio,
67
+ alpha=0.3,
68
+ beta=0.5,
69
+ diffusion_steps=4,
70
+ embedding_scale=2
71
+ )
72
+ ```
73
+
74
+ ## Parámetros de Síntesis
75
+
76
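+ - **alpha** (0.0-1.0): Control del timbre. Mayor = el timbre se toma más del estilo predicho a partir del texto y menos del audio de referencia
77
+ - **beta** (0.0-1.0): Control de la prosodia. Mayor = la prosodia se toma más del estilo predicho a partir del texto y menos del audio de referencia
78
+ - **diffusion_steps** (1-10): Pasos de difusión. Más pasos = mejor calidad pero más lento
79
+ - **embedding_scale** (1-3): Escala de guía (classifier-free guidance) del muestreador de difusión. Mayor = habla más expresiva y ajustada al texto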
80
+
81
+ ## Rendimiento
82
+
83
+ - **Latencia** (CPU): ~1.2-1.8 segundos por frase
84
+ - **Latencia** (GPU): ~0.3-0.5 segundos por frase
85
+ - **Calidad**: Muy alta, con clonación de voz precisa
86
+
87
+ ## Licencia
88
+
89
+ MIT License
90
+
91
+ ## Créditos
92
+
93
+ Basado en [StyleTTS2](https://github.com/yl4579/StyleTTS2) por yl4579.
94
+
95
+ Fine-tuning realizado con datos de voz en español guatemalteco.
es/styletts2-spanish-ft/config_spanish_ft.yml ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spanish (Guatemalan) Multi-speaker Fine-tuning Config
2
+ # Based on LibriTTS config with Multilingual PL-BERT
3
+
4
+ ASR_config: Utils/ASR/config.yml
5
+ ASR_path: Utils/ASR/epoch_00080.pth
6
+ F0_path: Utils/JDC/bst.t7
7
+ PLBERT_dir: Utils/PLBERT/
8
+
9
+ # Batch size - reduced for Mac MPS (increase on GPU with more VRAM)
10
+ batch_size: 1
11
+
12
+ data_params:
13
+   OOD_data: Data/OOD_texts.txt
14
+   min_length: 50
15
+   root_path: Data/wavs_gt
16
+   train_data: Data/train_list.txt
17
+   val_data: Data/val_list.txt
18
+
19
+ device: mps # Use 'cuda' for NVIDIA GPU
20
+
21
+ # Training epochs - fine-tuning needs fewer epochs
22
+ epochs: 100
23
+
24
+ load_only_params: true
25
+ log_dir: Models/Spanish
26
+ log_interval: 10
27
+
28
+ loss_params:
29
+   TMA_epoch: 4
30
+   diff_epoch: 20
31
+   joint_epoch: 30
32
+   lambda_F0: 3.0
33
+   lambda_ce: 20.0
34
+   lambda_diff: 1.0
35
+   lambda_dur: 1.0
36
+   lambda_gen: 1.0
37
+   lambda_mel: 2.0
38
+   lambda_mono: 1.0
39
+   lambda_norm: 1.0
40
+   lambda_s2s: 1.0
41
+   lambda_slm: 1.0
42
+   lambda_sty: 1.0
43
+
44
+ max_len: 250
45
+
46
+ model_params:
47
+   decoder:
48
+     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
49
+     resblock_kernel_sizes: [3, 7, 11]
50
+     type: hifigan
51
+     upsample_initial_channel: 512
52
+     upsample_kernel_sizes: [20, 10, 6, 4]
53
+     upsample_rates: [10, 5, 3, 2]
54
+   diffusion:
55
+     dist:
56
+       estimate_sigma_data: true
57
+       mean: -3.0
58
+       sigma_data: 0.19319299498227843
59
+       std: 1.0
60
+     embedding_mask_proba: 0.1
61
+     transformer:
62
+       head_features: 64
63
+       multiplier: 2
64
+       num_heads: 8
65
+       num_layers: 3
66
+   dim_in: 64
67
+   dropout: 0.2
68
+   hidden_dim: 512
69
+   max_conv_dim: 512
70
+   max_dur: 50
71
+   multispeaker: true # Multi-speaker mode enabled
72
+   n_layer: 3
73
+   n_mels: 80
74
+   n_token: 178 # Multilingual PL-BERT vocab size
75
+   slm:
76
+     hidden: 768
77
+     initial_channel: 64
78
+     model: microsoft/wavlm-base-plus
79
+     nlayers: 13
80
+     sr: 16000
81
+   style_dim: 128
82
+
83
+ optimizer_params:
84
+   bert_lr: 1.0e-05
85
+   ft_lr: 0.0001
86
+   lr: 0.0001
87
+
88
+ preprocess_params:
89
+   spect_params:
90
+     hop_length: 300
91
+     n_fft: 2048
92
+     win_length: 1200
93
+   sr: 24000
94
+
95
+ pretrained_model: Models/LibriTTS/epochs_2nd_00020.pth
96
+ save_freq: 10
97
+ second_stage_load_pretrained: true
98
+
99
+ slmadv_params:
100
+   batch_percentage: 0.5
101
+   iter: 10
102
+   max_len: 200
103
+   min_len: 150
104
+   scale: 0.01
105
+   sig: 1.5
106
+   thresh: 5
es/styletts2-spanish-ft/epoch_2nd_00049.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:513cbe2b838f5fe9f3ca7be209eb6f42fe70b5cc9fb6eb7699471b6d2cb760a2
3
+ size 2252234593
es/styletts2-spanish-ft/reference_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c1ffe13b57cc612b483a7299088c8736a14cf34725dd0114bf346bef6bd2c30
3
+ size 937808
es/styletts2-spanish-ft/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/FenixDS/styletts2-spanish-ft