Update app.py

app.py CHANGED

@@ -1,136 +1,136 @@
+import gradio as gr
+import subprocess
+import os
+import sys
+import soundfile as sf
+import numpy as np
+import torch
+import traceback
+import spaces
+
+repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
+repo_dir = "StyleTTS2-lite-vi"
+if not os.path.exists(repo_dir):
+    subprocess.run(["git", "clone", repo_url, repo_dir])
+sys.path.append(os.path.abspath(repo_dir))
+from inference import StyleTTS2
+
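Note: cloning the model repo with git at startup assumes git-lfs is available in the image; without it, the .pth checkpoint arrives as a pointer file. A hedged alternative sketch using huggingface_hub's snapshot_download (swapping this in for the clone step is my suggestion, not part of this commit):

from huggingface_hub import snapshot_download

# Hypothetical replacement for the git clone above: downloads the same
# repo as repo_url, LFS files included, into the directory app.py expects.
repo_dir = snapshot_download(
    repo_id="dangtr0408/StyleTTS2-lite-vi",
    local_dir="StyleTTS2-lite-vi",
)

Passing check=True to the subprocess.run call above would also surface clone failures immediately instead of later at the import.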
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+config_path = os.path.join(repo_dir, "Models", "config.yaml")
+models_path = os.path.join(repo_dir, "Models", "inference", "model.pth")
+model = StyleTTS2(config_path, models_path).eval().to(device)
+voice_path = os.path.join(repo_dir, "reference_audio")
+eg_voices = [os.path.join(voice_path, "vn_1.wav"), os.path.join(voice_path, "vn_2.wav")]
+eg_texts = [
+    "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
+    "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
+]
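Note: in English, the two example prompts read roughly as "With only about 90 million parameters, StyleTTS2-lite can easily generate speech at high speed." and "[id_1] With StyleTTS2-lite you can use a language tag so the model reliably reads in English, [id_2] as well as a speaker tag to switch quickly between voices." The bracket markup does two jobs: [en-us]{...} routes the enclosed words through the en-us phonemizer, and [id_N] switches to the Nth reference speaker, matching the id_{i} keys that main() builds below. A hypothetical prompt combining both (illustrative only; "Xin chào" is just "hello"):

text_prompt = "[id_1] Xin chào! [en-us]{Good morning.} [id_2] Xin chào!"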
+
+
+# Core inference function
+@spaces.GPU
+def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
+    try:
+        speakers = {}
+        for i, path in enumerate(reference_paths, 1):
+            speaker_id = f"id_{i}"
+            speakers[speaker_id] = {
+                "path": path,
+                "lang": "vi",
+                "speed": 1.0
+            }
+
+        with torch.no_grad():
-            styles = model.get_styles(speakers, denoise, avg_style)
+            styles = model.get_styles(speakers, denoise, avg_style=False)
+            r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")
+            r = r / np.abs(r).max()
+
+        sf.write("output.wav", r, samplerate=24000)
+        return "output.wav", "Audio generated successfully!"
+
+    except Exception as e:
+        error_message = traceback.format_exc()
+        return None, error_message
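Note: r / np.abs(r).max() is peak normalization, and it divides by zero if the model ever returns silence. A minimal guarded sketch (the eps floor is my addition, not in the commit):

import numpy as np

def peak_normalize(wav: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    # Scale so the loudest sample sits at +/-1.0; the eps floor
    # avoids a zero division on an all-zero waveform.
    return wav / max(float(np.abs(wav).max()), eps)

The "[id_1]" argument to model.generate appears to set the default speaker, so untagged text is always read by the first reference voice.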
+
+def on_file_upload(file_list):
+    if not file_list:
+        return None, "No file uploaded yet."
+
+    unique_files = {}
+    for file_path in file_list:
+        file_name = os.path.basename(file_path)
+        unique_files[file_name] = file_path  # update, removing duplicates
+
+    uploaded_infos = []
+    uploaded_file_names = list(unique_files.keys())
+    for i in range(len(uploaded_file_names)):
+        uploaded_infos.append(f"[id_{i+1}]: {uploaded_file_names[i]}")
+
+    summary = "\n".join(uploaded_infos)
+    return list(unique_files.values()), f"Current reference audios:\n{summary}"
+
+def gen_example(reference_paths, text_prompt):
-    output, status = main(reference_paths, text_prompt, 0.
+    output, status = main(reference_paths, text_prompt, 0.3, False, False)
+    return output, reference_paths, status
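Note: on_file_upload deduplicates by base filename through a dict, and because dicts preserve insertion order (Python 3.7+), the [id_N] numbering in the status box follows upload order, with a later upload of the same filename replacing the earlier path. A standalone sketch of that behavior:

import os

uploads = ["/tmp/a/vn_1.wav", "/tmp/b/vn_2.wav", "/tmp/c/vn_1.wav"]
unique_files = {}
for file_path in uploads:
    unique_files[os.path.basename(file_path)] = file_path  # later upload wins

print(unique_files)
# {'vn_1.wav': '/tmp/c/vn_1.wav', 'vn_2.wav': '/tmp/b/vn_2.wav'}

In gen_example, the fixed arguments map to denoise=0.3, avg_style=False, stabilize=False, so clicked examples ignore the checkbox and slider settings.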
+
+
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.HTML("<h1 style='text-align: center;'>StyleTTS2‑Lite Demo</h1>")
+    gr.Markdown(
+        "Download the local inference package from Hugging Face: "
+        "[StyleTTS2‑Lite (Vietnamese)]"
+        "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
+    )
+    gr.Markdown(
+        "Annotate any non‑Vietnamese words with the appropriate language tag, e.g., [en-us]{ } for English. For more information, see the "
+        "[eSpeakNG docs]"
+        "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)."
+    )
+
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=1):
+            text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
+        with gr.Column(scale=1):
+            avg_style = gr.Checkbox(label="Use Average Styles", value=True)
+            stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
+            denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")
+
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=1):
+            reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
+            gen_button = gr.Button("Generate")
+        with gr.Column(scale=1):
+            synthesized_audio = gr.Audio(label="Generated Audio", type="filepath")
+
+    status = gr.Textbox(label="Status", interactive=False, lines=3)
+
+    reference_audios.change(
+        on_file_upload,
+        inputs=[reference_audios],
+        outputs=[reference_audios, status]
+    )
+
+    gen_button.click(
+        fn=main,
+        inputs=[
+            reference_audios,
+            text_prompt,
+            denoise,
+            avg_style,
+            stabilize
+        ],
+        outputs=[synthesized_audio, status]
+    )
+
+    gr.Examples(
+        examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
+        inputs=[reference_audios, text_prompt],
+        outputs=[synthesized_audio, reference_audios, status],
+        fn=gen_example,
+        cache_examples=False,
+        label="Examples",
+        run_on_click=True
+    )
+
+demo.launch()
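Note: since main() is a plain function, the pipeline can be smoke-tested without the UI. A minimal sketch, assuming the bundled reference audio exists after the clone and that the spaces.GPU decorator passes calls through outside a ZeroGPU Space:

# Mirrors the positional wiring of gen_button.click above.
audio_path, message = main(
    [os.path.join(repo_dir, "reference_audio", "vn_1.wav")],  # reference_paths
    eg_texts[0],  # text_prompt
    0.6,          # denoise (slider default)
    True,         # avg_style (checkbox default)
    True,         # stabilize (checkbox default)
)
print(audio_path, message)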