Update app.py

app.py CHANGED

@@ -1,136 +1,136 @@
+import gradio as gr
+import subprocess
+import os
+import sys
+import soundfile as sf
+import numpy as np
+import torch
+import traceback
+import spaces
+
+repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
+repo_dir = "StyleTTS2-lite-vi"
+if not os.path.exists(repo_dir):
+    subprocess.run(["git", "clone", repo_url, repo_dir])
+sys.path.append(os.path.abspath(repo_dir))
+from inference import StyleTTS2
+
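Note: cloning the model repo with git at startup assumes git-lfs is available in the image; without it, the .pth checkpoint arrives as a pointer file. A hedged alternative sketch using huggingface_hub's snapshot_download (swapping this in for the clone step is my suggestion, not part of this commit):

from huggingface_hub import snapshot_download

# Hypothetical replacement for the git clone above: downloads the same
# repo as repo_url, LFS files included, into the directory app.py expects.
repo_dir = snapshot_download(
    repo_id="dangtr0408/StyleTTS2-lite-vi",
    local_dir="StyleTTS2-lite-vi",
)

Passing check=True to the subprocess.run call above would also surface clone failures immediately instead of later at the import.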
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+config_path = os.path.join(repo_dir, "Models", "config.yaml")
+models_path = os.path.join(repo_dir, "Models", "inference", "model.pth")
+model = StyleTTS2(config_path, models_path).eval().to(device)
+voice_path = os.path.join(repo_dir, "reference_audio")
+eg_voices = [os.path.join(voice_path, "vn_1.wav"), os.path.join(voice_path, "vn_2.wav")]
+eg_texts = [
+    "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
+    "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
+]
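Note: in English, the two example prompts read roughly as "With only about 90 million parameters, StyleTTS2-lite can easily generate speech at high speed." and "[id_1] With StyleTTS2-lite you can use a language tag so the model reliably reads in English, [id_2] as well as a speaker tag to switch quickly between voices." The bracket markup does two jobs: [en-us]{...} routes the enclosed words through the en-us phonemizer, and [id_N] switches to the Nth reference speaker, matching the id_{i} keys that main() builds below. A hypothetical prompt combining both (illustrative only; "Xin chào" is just "hello"):

text_prompt = "[id_1] Xin chào! [en-us]{Good morning.} [id_2] Xin chào!"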
+
+
+# Core inference function
+@spaces.GPU
+def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
+    try:
+        speakers = {}
+        for i, path in enumerate(reference_paths, 1):
+            speaker_id = f"id_{i}"
+            speakers[speaker_id] = {
+                "path": path,
+                "lang": "vi",
+                "speed": 1.0
+            }
+
+        with torch.no_grad():
-            styles = model.get_styles(speakers, denoise, avg_style)
+            styles = model.get_styles(speakers, denoise, avg_style=False)
+            r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")
+            r = r / np.abs(r).max()
+
+        sf.write("output.wav", r, samplerate=24000)
+        return "output.wav", "Audio generated successfully!"
+
+    except Exception as e:
+        error_message = traceback.format_exc()
+        return None, error_message
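Note: r / np.abs(r).max() is peak normalization, and it divides by zero if the model ever returns silence. A minimal guarded sketch (the eps floor is my addition, not in the commit):

import numpy as np

def peak_normalize(wav: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    # Scale so the loudest sample sits at +/-1.0; the eps floor
    # avoids a zero division on an all-zero waveform.
    return wav / max(float(np.abs(wav).max()), eps)

The "[id_1]" argument to model.generate appears to set the default speaker, so untagged text is always read by the first reference voice.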
+
+def on_file_upload(file_list):
+    if not file_list:
+        return None, "No file uploaded yet."
+
+    unique_files = {}
+    for file_path in file_list:
+        file_name = os.path.basename(file_path)
+        unique_files[file_name] = file_path  # update, removing duplicates
+
+    uploaded_infos = []
+    uploaded_file_names = list(unique_files.keys())
+    for i in range(len(uploaded_file_names)):
+        uploaded_infos.append(f"[id_{i+1}]: {uploaded_file_names[i]}")
+
+    summary = "\n".join(uploaded_infos)
+    return list(unique_files.values()), f"Current reference audios:\n{summary}"
+
+def gen_example(reference_paths, text_prompt):
-    output, status = main(reference_paths, text_prompt, 0.
+    output, status = main(reference_paths, text_prompt, 0.3, False, False)
+    return output, reference_paths, status
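Note: on_file_upload deduplicates by base filename through a dict, and because dicts preserve insertion order (Python 3.7+), the [id_N] numbering in the status box follows upload order, with a later upload of the same filename replacing the earlier path. A standalone sketch of that behavior:

import os

uploads = ["/tmp/a/vn_1.wav", "/tmp/b/vn_2.wav", "/tmp/c/vn_1.wav"]
unique_files = {}
for file_path in uploads:
    unique_files[os.path.basename(file_path)] = file_path  # later upload wins

print(unique_files)
# {'vn_1.wav': '/tmp/c/vn_1.wav', 'vn_2.wav': '/tmp/b/vn_2.wav'}

In gen_example, the fixed arguments map to denoise=0.3, avg_style=False, stabilize=False, so clicked examples ignore the checkbox and slider settings.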
+
+
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.HTML("<h1 style='text-align: center;'>StyleTTS2‑Lite Demo</h1>")
+    gr.Markdown(
+        "Download the local inference package from Hugging Face: "
+        "[StyleTTS2‑Lite (Vietnamese)]"
+        "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
+    )
+    gr.Markdown(
+        "Annotate any non‑Vietnamese words with the appropriate language tag, e.g., [en-us]{ } for English. For more information, see the "
+        "[eSpeakNG docs]"
+        "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)."
+    )
+
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=1):
+            text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
+        with gr.Column(scale=1):
+            avg_style = gr.Checkbox(label="Use Average Styles", value=True)
+            stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
+            denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")
+
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=1):
+            reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
+            gen_button = gr.Button("Generate")
+        with gr.Column(scale=1):
+            synthesized_audio = gr.Audio(label="Generated Audio", type="filepath")
+
+    status = gr.Textbox(label="Status", interactive=False, lines=3)
+
+    reference_audios.change(
+        on_file_upload,
+        inputs=[reference_audios],
+        outputs=[reference_audios, status]
+    )
+
+    gen_button.click(
+        fn=main,
+        inputs=[
+            reference_audios,
+            text_prompt,
+            denoise,
+            avg_style,
+            stabilize
+        ],
+        outputs=[synthesized_audio, status]
+    )
+
+    gr.Examples(
+        examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
+        inputs=[reference_audios, text_prompt],
+        outputs=[synthesized_audio, reference_audios, status],
+        fn=gen_example,
+        cache_examples=False,
+        label="Examples",
+        run_on_click=True
+    )
+
+demo.launch()
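Note: since main() is a plain function, the pipeline can be smoke-tested without the UI. A minimal sketch, assuming the bundled reference audio exists after the clone and that the spaces.GPU decorator passes calls through outside a ZeroGPU Space:

# Mirrors the positional wiring of gen_button.click above.
audio_path, message = main(
    [os.path.join(repo_dir, "reference_audio", "vn_1.wav")],  # reference_paths
    eg_texts[0],  # text_prompt
    0.6,          # denoise (slider default)
    True,         # avg_style (checkbox default)
    True,         # stabilize (checkbox default)
)
print(audio_path, message)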