Style TTS (en, es)
Browse files- .gitattributes +3 -0
- en/StyleTTS2-ONNX-Cpp/.gitattributes +35 -0
- en/StyleTTS2-ONNX-Cpp/bert_encoder.onnx +3 -0
- en/StyleTTS2-ONNX-Cpp/final_simp.onnx +3 -0
- en/StyleTTS2-ONNX-Cpp/plbert_simp.onnx +3 -0
- en/StyleTTS2-ONNX-Cpp/predictor_encoder_simp.onnx +3 -0
- en/StyleTTS2-ONNX-Cpp/ref_p.bin +3 -0
- en/StyleTTS2-ONNX-Cpp/ref_s.bin +3 -0
- en/StyleTTS2-ONNX-Cpp/source.txt +1 -0
- en/StyleTTS2-ONNX-Cpp/style_encoder_simp.onnx +3 -0
- en/styletts2-models (SC4949)/.gitattributes +37 -0
- en/styletts2-models (SC4949)/anger.wav +0 -0
- en/styletts2-models (SC4949)/config.yml +21 -0
- en/styletts2-models (SC4949)/epochs_2nd_00020.pth +3 -0
- en/styletts2-models (SC4949)/narrator.wav +3 -0
- en/styletts2-models (SC4949)/source.txt +1 -0
- en/styletts2-models (SC4949)/women.wav +3 -0
- es/styletts2-spanish-ft/.gitattributes +36 -0
- es/styletts2-spanish-ft/README.md +95 -0
- es/styletts2-spanish-ft/config_spanish_ft.yml +106 -0
- es/styletts2-spanish-ft/epoch_2nd_00049.pth +3 -0
- es/styletts2-spanish-ft/reference_audio.wav +3 -0
- es/styletts2-spanish-ft/source.txt +1 -0
.gitattributes
CHANGED
|
@@ -58,3 +58,6 @@ vi,en/StyleTTS2-lite-vi/reference_audio/vn_1.wav filter=lfs diff=lfs merge=lfs -
|
|
| 58 |
vi,en/StyleTTS2-lite-vi/reference_audio/vn_2.wav filter=lfs diff=lfs merge=lfs -text
|
| 59 |
vi,en/StyleTTS2-lite-vi/reference_audio/vn_3.wav filter=lfs diff=lfs merge=lfs -text
|
| 60 |
vi,en/StyleTTS2-lite-vi/reference_audio/vn_4.wav filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
vi,en/StyleTTS2-lite-vi/reference_audio/vn_2.wav filter=lfs diff=lfs merge=lfs -text
|
| 59 |
vi,en/StyleTTS2-lite-vi/reference_audio/vn_3.wav filter=lfs diff=lfs merge=lfs -text
|
| 60 |
vi,en/StyleTTS2-lite-vi/reference_audio/vn_4.wav filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
en/styletts2-models[[:space:]](SC4949)/narrator.wav filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
en/styletts2-models[[:space:]](SC4949)/women.wav filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
es/styletts2-spanish-ft/reference_audio.wav filter=lfs diff=lfs merge=lfs -text
|
en/StyleTTS2-ONNX-Cpp/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
en/StyleTTS2-ONNX-Cpp/bert_encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c859514cbe9b5c12a50a0046fd769a797b66af184036ebb914578a2e69d5e82a
|
| 3 |
+
size 1575207
|
en/StyleTTS2-ONNX-Cpp/final_simp.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38b2cd2f39302c45085659151581af1c80375f0d3b7d77ca29bf89a5085ad561
|
| 3 |
+
size 304950337
|
en/StyleTTS2-ONNX-Cpp/plbert_simp.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f1c9af67134670d5fb1ac08e482ce8e0613e6d083bcc6a0990bcad41da33a51
|
| 3 |
+
size 23106930
|
en/StyleTTS2-ONNX-Cpp/predictor_encoder_simp.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47a270615ae814f73f6227ce0d51f7c24a790bf4e1b22ebd12550b879d79f604
|
| 3 |
+
size 55399267
|
en/StyleTTS2-ONNX-Cpp/ref_p.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af4f5173706a7a03cd7512c440808369e7897614887069dec08a92eadcbfdca4
|
| 3 |
+
size 512
|
en/StyleTTS2-ONNX-Cpp/ref_s.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d846acc721a2b3ddeb150675ea63f30416326ddd0e7ac7c0c80f20aca3105de
|
| 3 |
+
size 512
|
en/StyleTTS2-ONNX-Cpp/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/DDATT/StyleTTS2-ONNX-Cpp
|
en/StyleTTS2-ONNX-Cpp/style_encoder_simp.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90d7973e3d89ffe00b80603e6caa6c10de2921291bf406a568076fe25a9eb051
|
| 3 |
+
size 55399267
|
en/styletts2-models (SC4949)/.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
narrator.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
women.wav filter=lfs diff=lfs merge=lfs -text
|
en/styletts2-models (SC4949)/anger.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
en/styletts2-models (SC4949)/config.yml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
|
| 2 |
+
PLBERT_dir: Utils/PLBERT/, batch_size: 8, data_params: {OOD_data: Data/OOD_texts.txt,
|
| 3 |
+
min_length: 50, root_path: '', train_data: Data/train_list.txt, val_data: Data/val_list.txt},
|
| 4 |
+
device: cuda, epochs_1st: 40, epochs_2nd: 25, first_stage_path: first_stage.pth,
|
| 5 |
+
load_only_params: false, log_dir: Models/LibriTTS, log_interval: 10, loss_params: {
|
| 6 |
+
TMA_epoch: 4, diff_epoch: 0, joint_epoch: 0, lambda_F0: 1.0, lambda_ce: 20.0,
|
| 7 |
+
lambda_diff: 1.0, lambda_dur: 1.0, lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0,
|
| 8 |
+
lambda_norm: 1.0, lambda_s2s: 1.0, lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 300,
|
| 9 |
+
model_params: {decoder: {resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3,
|
| 10 |
+
5]], resblock_kernel_sizes: [3, 7, 11], type: hifigan, upsample_initial_channel: 512,
|
| 11 |
+
upsample_kernel_sizes: [20, 10, 6, 4], upsample_rates: [10, 5, 3, 2]}, diffusion: {
|
| 12 |
+
dist: {estimate_sigma_data: true, mean: -3.0, sigma_data: 0.19926648961191362,
|
| 13 |
+
std: 1.0}, embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2,
|
| 14 |
+
num_heads: 8, num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512,
|
| 15 |
+
max_conv_dim: 512, max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178,
|
| 16 |
+
slm: {hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13,
|
| 17 |
+
sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05,
|
| 18 |
+
lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048,
|
| 19 |
+
win_length: 1200}, sr: 24000}, pretrained_model: Models/LibriTTS/epoch_2nd_00002.pth,
|
| 20 |
+
save_freq: 1, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5,
|
| 21 |
+
iter: 20, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}}
|
en/styletts2-models (SC4949)/epochs_2nd_00020.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1164ffe19a17449d2c722234cecaf2836b35a698fb8ffd42562d2663657dca0a
|
| 3 |
+
size 771390526
|
en/styletts2-models (SC4949)/narrator.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e49292afb1769d24c753055835b795b0ad225aa7b4d05cc846697826a9935c7b
|
| 3 |
+
size 635084
|
en/styletts2-models (SC4949)/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/SC4949/styletts2-models
|
en/styletts2-models (SC4949)/women.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a7d39beddd2c24d864163ce38e799b261ab0bc23cbea492f0ece046feb131f1
|
| 3 |
+
size 145484
|
es/styletts2-spanish-ft/.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
reference_audio.wav filter=lfs diff=lfs merge=lfs -text
|
es/styletts2-spanish-ft/README.md
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# StyleTTS2 Spanish Fine-tuned Model
|
| 2 |
+
|
| 3 |
+
Modelo StyleTTS2 fine-tuned para síntesis de voz en español con clonación de voz.
|
| 4 |
+
|
| 5 |
+
## Descripción
|
| 6 |
+
|
| 7 |
+
Este modelo fue entrenado específicamente para generar voz en español con alta calidad y naturalidad. Incluye capacidades de clonación de voz mediante audio de referencia.
|
| 8 |
+
|
| 9 |
+
## Características
|
| 10 |
+
|
| 11 |
+
- **Idioma**: Español (acento guatemalteco)
|
| 12 |
+
- **Arquitectura**: StyleTTS2 con diffusion-based synthesis
|
| 13 |
+
- **Epoch**: 49 (segunda fase de entrenamiento)
|
| 14 |
+
- **Sample Rate**: 24kHz
|
| 15 |
+
- **Calidad**: Alta fidelidad con clonación de voz
|
| 16 |
+
|
| 17 |
+
## Archivos Incluidos
|
| 18 |
+
|
| 19 |
+
- `epoch_2nd_00049.pth`: Checkpoint del modelo (2.1GB)
|
| 20 |
+
- `config_spanish_ft.yml`: Configuración del modelo
|
| 21 |
+
- `reference_audio.wav`: Audio de referencia para clonación de voz (916KB)
|
| 22 |
+
|
| 23 |
+
## Uso
|
| 24 |
+
|
| 25 |
+
### Instalación
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
pip install -U "huggingface_hub[cli]"
|
| 29 |
+
huggingface-cli download FenixDS/styletts2-spanish-ft --local-dir styletts2-spanish-ft
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### Integración con VoxBridge
|
| 33 |
+
|
| 34 |
+
Este modelo está diseñado para usarse con [VoxBridge](https://github.com/MrBotGT/VoxBridge). Configuración en `config/default.yaml`:
|
| 35 |
+
|
| 36 |
+
```yaml
|
| 37 |
+
tts:
|
| 38 |
+
provider: styletts2
|
| 39 |
+
config_path: styletts2-spanish-ft/config_spanish_ft.yml
|
| 40 |
+
checkpoint_path: styletts2-spanish-ft/epoch_2nd_00049.pth
|
| 41 |
+
reference_audio: styletts2-spanish-ft/reference_audio.wav
|
| 42 |
+
alpha: 0.3
|
| 43 |
+
beta: 0.5
|
| 44 |
+
diffusion_steps: 4
|
| 45 |
+
embedding_scale: 2
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Uso Directo con StyleTTS2
|
| 49 |
+
|
| 50 |
+
```python
|
| 51 |
+
import torch
|
| 52 |
+
from styletts2 import tts
|
| 53 |
+
|
| 54 |
+
# Cargar modelo
|
| 55 |
+
model = tts.StyleTTS2(
|
| 56 |
+
config_path="styletts2-spanish-ft/config_spanish_ft.yml",
|
| 57 |
+
checkpoint_path="styletts2-spanish-ft/epoch_2nd_00049.pth"
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
# Generar voz
|
| 61 |
+
text = "Hola, este es un ejemplo de síntesis de voz en español."
|
| 62 |
+
reference_audio = "styletts2-spanish-ft/reference_audio.wav"
|
| 63 |
+
|
| 64 |
+
audio = model.inference(
|
| 65 |
+
text=text,
|
| 66 |
+
ref_audio=reference_audio,
|
| 67 |
+
alpha=0.3,
|
| 68 |
+
beta=0.5,
|
| 69 |
+
diffusion_steps=4,
|
| 70 |
+
embedding_scale=2
|
| 71 |
+
)
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## Parámetros de Síntesis
|
| 75 |
+
|
| 76 |
+
- **alpha** (0.0-1.0): Control del timbre. Mayor = timbre más guiado por el texto y menos parecido al audio de referencia; menor = más similitud con la referencia
|
| 77 |
+
- **beta** (0.0-1.0): Control de la prosodia. Mayor = prosodia más guiada por el texto y menos parecida al audio de referencia; menor = más similitud con la referencia
|
| 78 |
+
- **diffusion_steps** (1-10): Pasos de difusión. Más pasos = mejor calidad pero más lento
|
| 79 |
+
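- **embedding_scale** (1-3): Escala de guía (classifier-free guidance) del muestreo de estilo por difusión. Mayor = habla más expresiva, con riesgo de artefactos en valores altos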
|
| 80 |
+
|
| 81 |
+
## Rendimiento
|
| 82 |
+
|
| 83 |
+
- **Latencia** (CPU): ~1.2-1.8 segundos por frase
|
| 84 |
+
- **Latencia** (GPU): ~0.3-0.5 segundos por frase
|
| 85 |
+
- **Calidad**: Muy alta, con clonación de voz precisa
|
| 86 |
+
|
| 87 |
+
## Licencia
|
| 88 |
+
|
| 89 |
+
MIT License
|
| 90 |
+
|
| 91 |
+
## Créditos
|
| 92 |
+
|
| 93 |
+
Basado en [StyleTTS2](https://github.com/yl4579/StyleTTS2) por yl4579.
|
| 94 |
+
|
| 95 |
+
Fine-tuning realizado con datos de voz en español guatemalteco.
|
es/styletts2-spanish-ft/config_spanish_ft.yml
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spanish (Guatemalan) Multi-speaker Fine-tuning Config
|
| 2 |
+
# Based on LibriTTS config with Multilingual PL-BERT
|
| 3 |
+
|
| 4 |
+
ASR_config: Utils/ASR/config.yml
|
| 5 |
+
ASR_path: Utils/ASR/epoch_00080.pth
|
| 6 |
+
F0_path: Utils/JDC/bst.t7
|
| 7 |
+
PLBERT_dir: Utils/PLBERT/
|
| 8 |
+
|
| 9 |
+
# Batch size - reduced for Mac MPS (increase on GPU with more VRAM)
|
| 10 |
+
batch_size: 1
|
| 11 |
+
|
| 12 |
+
data_params:
|
| 13 |
+
OOD_data: Data/OOD_texts.txt
|
| 14 |
+
min_length: 50
|
| 15 |
+
root_path: Data/wavs_gt
|
| 16 |
+
train_data: Data/train_list.txt
|
| 17 |
+
val_data: Data/val_list.txt
|
| 18 |
+
|
| 19 |
+
device: mps # Use 'cuda' for NVIDIA GPU
|
| 20 |
+
|
| 21 |
+
# Training epochs - fine-tuning needs fewer epochs
|
| 22 |
+
epochs: 100
|
| 23 |
+
|
| 24 |
+
load_only_params: true
|
| 25 |
+
log_dir: Models/Spanish
|
| 26 |
+
log_interval: 10
|
| 27 |
+
|
| 28 |
+
loss_params:
|
| 29 |
+
TMA_epoch: 4
|
| 30 |
+
diff_epoch: 20
|
| 31 |
+
joint_epoch: 30
|
| 32 |
+
lambda_F0: 3.0
|
| 33 |
+
lambda_ce: 20.0
|
| 34 |
+
lambda_diff: 1.0
|
| 35 |
+
lambda_dur: 1.0
|
| 36 |
+
lambda_gen: 1.0
|
| 37 |
+
lambda_mel: 2.0
|
| 38 |
+
lambda_mono: 1.0
|
| 39 |
+
lambda_norm: 1.0
|
| 40 |
+
lambda_s2s: 1.0
|
| 41 |
+
lambda_slm: 1.0
|
| 42 |
+
lambda_sty: 1.0
|
| 43 |
+
|
| 44 |
+
max_len: 250
|
| 45 |
+
|
| 46 |
+
model_params:
|
| 47 |
+
decoder:
|
| 48 |
+
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
| 49 |
+
resblock_kernel_sizes: [3, 7, 11]
|
| 50 |
+
type: hifigan
|
| 51 |
+
upsample_initial_channel: 512
|
| 52 |
+
upsample_kernel_sizes: [20, 10, 6, 4]
|
| 53 |
+
upsample_rates: [10, 5, 3, 2]
|
| 54 |
+
diffusion:
|
| 55 |
+
dist:
|
| 56 |
+
estimate_sigma_data: true
|
| 57 |
+
mean: -3.0
|
| 58 |
+
sigma_data: 0.19319299498227843
|
| 59 |
+
std: 1.0
|
| 60 |
+
embedding_mask_proba: 0.1
|
| 61 |
+
transformer:
|
| 62 |
+
head_features: 64
|
| 63 |
+
multiplier: 2
|
| 64 |
+
num_heads: 8
|
| 65 |
+
num_layers: 3
|
| 66 |
+
dim_in: 64
|
| 67 |
+
dropout: 0.2
|
| 68 |
+
hidden_dim: 512
|
| 69 |
+
max_conv_dim: 512
|
| 70 |
+
max_dur: 50
|
| 71 |
+
multispeaker: true # Multi-speaker mode enabled
|
| 72 |
+
n_layer: 3
|
| 73 |
+
n_mels: 80
|
| 74 |
+
n_token: 178 # Multilingual PL-BERT vocab size
|
| 75 |
+
slm:
|
| 76 |
+
hidden: 768
|
| 77 |
+
initial_channel: 64
|
| 78 |
+
model: microsoft/wavlm-base-plus
|
| 79 |
+
nlayers: 13
|
| 80 |
+
sr: 16000
|
| 81 |
+
style_dim: 128
|
| 82 |
+
|
| 83 |
+
optimizer_params:
|
| 84 |
+
bert_lr: 1.0e-05
|
| 85 |
+
ft_lr: 0.0001
|
| 86 |
+
lr: 0.0001
|
| 87 |
+
|
| 88 |
+
preprocess_params:
|
| 89 |
+
spect_params:
|
| 90 |
+
hop_length: 300
|
| 91 |
+
n_fft: 2048
|
| 92 |
+
win_length: 1200
|
| 93 |
+
sr: 24000
|
| 94 |
+
|
| 95 |
+
pretrained_model: Models/LibriTTS/epochs_2nd_00020.pth
|
| 96 |
+
save_freq: 10
|
| 97 |
+
second_stage_load_pretrained: true
|
| 98 |
+
|
| 99 |
+
slmadv_params:
|
| 100 |
+
batch_percentage: 0.5
|
| 101 |
+
iter: 10
|
| 102 |
+
max_len: 200
|
| 103 |
+
min_len: 150
|
| 104 |
+
scale: 0.01
|
| 105 |
+
sig: 1.5
|
| 106 |
+
thresh: 5
|
es/styletts2-spanish-ft/epoch_2nd_00049.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:513cbe2b838f5fe9f3ca7be209eb6f42fe70b5cc9fb6eb7699471b6d2cb760a2
|
| 3 |
+
size 2252234593
|
es/styletts2-spanish-ft/reference_audio.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c1ffe13b57cc612b483a7299088c8736a14cf34725dd0114bf346bef6bd2c30
|
| 3 |
+
size 937808
|
es/styletts2-spanish-ft/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/FenixDS/styletts2-spanish-ft
|