oicui committed on
Commit
0f6bb9c
·
verified ·
1 Parent(s): d44400c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -67
app.py CHANGED
@@ -29,23 +29,12 @@ voicelist = ['f-us-1', 'f-us-2', 'f-us-3', 'f-us-4', 'm-us-1', 'm-us-2', 'm-us-3
29
  voices = {}
30
  import phonemizer
31
  global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
32
- # todo: cache computed style, load using pickle
33
- # if os.path.exists('voices.pkl'):
34
- # with open('voices.pkl', 'rb') as f:
35
- # voices = pickle.load(f)
36
- # else:
37
  for v in voicelist:
38
  voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
39
- # def synthesize(text, voice, multispeakersteps):
40
- # if text.strip() == "":
41
- # raise gr.Error("You must enter some text")
42
- # # if len(global_phonemizer.phonemize([text])) > 300:
43
- # if len(text) > 300:
44
- # raise gr.Error("Text must be under 300 characters")
45
- # v = voice.lower()
46
- # # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
47
- # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
48
  if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
 
49
  def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
50
  if text.strip() == "":
51
  raise gr.Error("You must enter some text")
@@ -61,30 +50,8 @@ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
61
  print(t)
62
  audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
63
  return (24000, np.concatenate(audios))
64
- # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
65
- # if password == os.environ['ACCESS_CODE']:
66
- # if text.strip() == "":
67
- # raise gr.Error("You must enter some text")
68
- # if lngsteps > 25:
69
- # raise gr.Error("Max 25 steps")
70
- # if lngsteps < 5:
71
- # raise gr.Error("Min 5 steps")
72
- # texts = split_and_recombine_text(text)
73
- # v = voice.lower()
74
- # audios = []
75
- # for t in progress.tqdm(texts):
76
- # audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
77
- # return (24000, np.concatenate(audios))
78
- # else:
79
- # raise gr.Error('Wrong access code')
80
  def rn_clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
81
- # if text.strip() == "":
82
- # raise gr.Error("You must enter some text")
83
- # # if global_phonemizer.phonemize([text]) > 300:
84
- # if len(text) > 400:
85
- # raise gr.Error("Text must be under 400 characters")
86
- # # return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=20, embedding_scale=1))
87
- # return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
88
  if text.strip() == "":
89
  raise gr.Error("You must enter some text")
90
  if len(text) > 50000:
@@ -96,21 +63,13 @@ def rn_clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Pro
96
  print("*** end ***")
97
  texts = txtsplit(text)
98
  audios = []
99
- # vs = styletts2importable.compute_style(voice)
100
  vs = styletts2importable.compute_style(voice)
101
- # print(vs)
102
  for t in progress.tqdm(texts):
103
  audios.append(styletts2importable.inference(t, vs, alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
104
- # audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
105
  return (24000, np.concatenate(audios))
 
106
  def rn_ljsynthesize(text, steps, progress=gr.Progress()):
107
- # if text.strip() == "":
108
- # raise gr.Error("You must enter some text")
109
- # # if global_phonemizer.phonemize([text]) > 300:
110
- # if len(text) > 400:
111
- # raise gr.Error("Text must be under 400 characters")
112
  noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
113
- # return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=7, embedding_scale=1))
114
  if text.strip() == "":
115
  raise gr.Error("You must enter some text")
116
  if len(text) > 150000:
@@ -124,58 +83,58 @@ def rn_ljsynthesize(text, steps, progress=gr.Progress()):
124
  audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
125
  return (24000, np.concatenate(audios))
126
 
127
-
128
  with gr.Blocks() as vctk:
129
  with gr.Row():
130
  with gr.Column(scale=1):
131
  inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
132
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
133
  multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
134
- # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
135
  with gr.Column(scale=1):
136
  btn = gr.Button("Synthesize", variant="primary")
137
  audio = gr.Audio(interactive=False, label="Synthesized Audio", show_download_button=True, waveform_options={'waveform_progress_color': '#3C82F6'})
138
  btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
 
139
  with gr.Blocks() as clone:
140
  with gr.Row():
141
  with gr.Column(scale=1):
142
- clinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
143
- clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
144
- vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
145
- embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
 
 
 
 
 
 
 
 
 
 
 
146
  alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
147
  beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
148
  with gr.Column(scale=1):
149
  clbtn = gr.Button("Synthesize", variant="primary")
150
  claudio = gr.Audio(interactive=False, label="Synthesized Audio", show_download_button=True, waveform_options={'waveform_progress_color': '#3C82F6'})
151
  clbtn.click(rn_clsynthesize, inputs=[clinp, clvoice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
152
- # with gr.Blocks() as longText:
153
- # with gr.Row():
154
- # with gr.Column(scale=1):
155
- # lnginp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
156
- # lngvoice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
157
- # lngsteps = gr.Slider(minimum=5, maximum=25, value=10, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
158
- # lngpwd = gr.Textbox(label="Access code", info="This feature is in beta. You need an access code to use it as it uses more resources and we would like to prevent abuse")
159
- # with gr.Column(scale=1):
160
- # lngbtn = gr.Button("Synthesize", variant="primary")
161
- # lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
162
- # lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
163
  with gr.Blocks() as lj:
164
  with gr.Row():
165
  with gr.Column(scale=1):
166
- ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
167
- ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
168
  with gr.Column(scale=1):
169
  ljbtn = gr.Button("Synthesize", variant="primary")
170
  ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
171
  ljbtn.click(rn_ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)
 
172
  with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
173
  gr.Markdown(INTROTXT)
174
  gr.DuplicateButton("Duplicate Space")
175
- # gr.TabbedInterface([vctk, clone, lj, longText], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
176
  gr.TabbedInterface([vctk, clone, lj], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
177
  gr.Markdown("""
178
- Demo by [mrfakename](https://twitter.com/realmrfakename). I am not affiliated with the StyleTTS 2 authors.
179
 
180
  Run this demo locally using Docker:
181
 
 
29
  voices = {}
30
  import phonemizer
31
  global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
32
+
 
 
 
 
33
  for v in voicelist:
34
  voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
35
+
 
 
 
 
 
 
 
 
36
  if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
37
+
38
  def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
39
  if text.strip() == "":
40
  raise gr.Error("You must enter some text")
 
50
  print(t)
51
  audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
52
  return (24000, np.concatenate(audios))
53
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def rn_clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
 
 
 
 
 
 
 
55
  if text.strip() == "":
56
  raise gr.Error("You must enter some text")
57
  if len(text) > 50000:
 
63
  print("*** end ***")
64
  texts = txtsplit(text)
65
  audios = []
 
66
  vs = styletts2importable.compute_style(voice)
 
67
  for t in progress.tqdm(texts):
68
  audios.append(styletts2importable.inference(t, vs, alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
 
69
  return (24000, np.concatenate(audios))
70
+
71
  def rn_ljsynthesize(text, steps, progress=gr.Progress()):
 
 
 
 
 
72
  noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
 
73
  if text.strip() == "":
74
  raise gr.Error("You must enter some text")
75
  if len(text) > 150000:
 
83
  audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
84
  return (24000, np.concatenate(audios))
85
 
 
86
  with gr.Blocks() as vctk:
87
  with gr.Row():
88
  with gr.Column(scale=1):
89
  inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
90
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
91
  multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
 
92
  with gr.Column(scale=1):
93
  btn = gr.Button("Synthesize", variant="primary")
94
  audio = gr.Audio(interactive=False, label="Synthesized Audio", show_download_button=True, waveform_options={'waveform_progress_color': '#3C82F6'})
95
  btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
96
+
97
  with gr.Blocks() as clone:
98
  with gr.Row():
99
  with gr.Column(scale=1):
100
+ clinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read?", interactive=True)
101
+
102
+ # ORIGINAL:
103
+ # clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={...})
104
+
105
+ # ✅ FIXED: removed the max_length limit
106
+ clvoice = gr.Audio(
107
+ label="Voice",
108
+ interactive=True,
109
+ type='filepath',
110
+ waveform_options={'waveform_progress_color': '#3C82F6'}
111
+ )
112
+
113
+ vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Higher = better but slower", interactive=True)
114
+ embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale", info="Default 1", interactive=True)
115
  alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
116
  beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
117
  with gr.Column(scale=1):
118
  clbtn = gr.Button("Synthesize", variant="primary")
119
  claudio = gr.Audio(interactive=False, label="Synthesized Audio", show_download_button=True, waveform_options={'waveform_progress_color': '#3C82F6'})
120
  clbtn.click(rn_clsynthesize, inputs=[clinp, clvoice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
121
+
 
 
 
 
 
 
 
 
 
 
122
  with gr.Blocks() as lj:
123
  with gr.Row():
124
  with gr.Column(scale=1):
125
+ ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read?", interactive=True)
126
+ ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", interactive=True)
127
  with gr.Column(scale=1):
128
  ljbtn = gr.Button("Synthesize", variant="primary")
129
  ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
130
  ljbtn.click(rn_ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)
131
+
132
  with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
133
  gr.Markdown(INTROTXT)
134
  gr.DuplicateButton("Duplicate Space")
 
135
  gr.TabbedInterface([vctk, clone, lj], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
136
  gr.Markdown("""
137
+ Demo by mrfakename.
138
 
139
  Run this demo locally using Docker:
140