Quocnd174 committed
Commit 36d34f2 · verified · 1 Parent(s): 5c17ac5

Update app.py

Files changed (1)
  1. app.py +135 -135
app.py CHANGED
@@ -1,136 +1,136 @@
- import gradio as gr
- import subprocess
- import os
- import sys
- import soundfile as sf
- import numpy as np
- import torch
- import traceback
- import spaces
-
- repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
- repo_dir = "StyleTTS2-lite-vi"
- if not os.path.exists(repo_dir):
-     subprocess.run(["git", "clone", repo_url, repo_dir])
- sys.path.append(os.path.abspath(repo_dir))
- from inference import StyleTTS2
-
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
- config_path = os.path.join(repo_dir, "Models", "config.yaml")
- models_path = os.path.join(repo_dir, "Models", "inference", "model.pth")
- model = StyleTTS2(config_path, models_path).eval().to(device)
- voice_path = os.path.join(repo_dir, "reference_audio")
- eg_voices = [os.path.join(voice_path, "vn_1.wav"), os.path.join(voice_path, "vn_2.wav")]
- eg_texts = [
-     "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
-     "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
- ]
-
-
- # Core inference function
- @spaces.GPU
- def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
-     try:
-         speakers = {}
-         for i, path in enumerate(reference_paths, 1):
-             speaker_id = f"id_{i}"
-             speakers[speaker_id] = {
-                 "path": path,
-                 "lang": "vi",
-                 "speed": 1.0
-             }
-
-         with torch.no_grad():
-             styles = model.get_styles(speakers, denoise, avg_style)
-             r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")
-             r = r / np.abs(r).max()
-
-         sf.write("output.wav", r, samplerate=24000)
-         return "output.wav", "Audio generated successfully!"
-
-     except Exception as e:
-         error_message = traceback.format_exc()
-         return None, error_message
-
- def on_file_upload(file_list):
-     if not file_list:
-         return None, "No file uploaded yet."
-
-     unique_files = {}
-     for file_path in file_list:
-         file_name = os.path.basename(file_path)
-         unique_files[file_name] = file_path  # update and remove duplicates
-
-     uploaded_infos = []
-     uploaded_file_names = list(unique_files.keys())
-     for i in range(len(uploaded_file_names)):
-         uploaded_infos.append(f"[id_{i+1}]: {uploaded_file_names[i]}")
-
-     summary = "\n".join(uploaded_infos)
-     return list(unique_files.values()), f"Current reference audios:\n{summary}"
-
- def gen_example(reference_paths, text_prompt):
-     output, status = main(reference_paths, text_prompt, 0.6, True, True)
-     return output, reference_paths, status
-
-
- # Gradio UI
- with gr.Blocks() as demo:
-     gr.HTML("<h1 style='text-align: center;'>StyleTTS2‑Lite Demo</h1>")
-     gr.Markdown(
-         "Download the local inference package from Hugging Face: "
-         "[StyleTTS2‑Lite (Vietnamese)]"
-         "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
-     )
-     gr.Markdown(
-         "Annotate any non‑Vietnamese words with the appropriate language tag, e.g., [en-us]{ } for English. For more information, see "
-         "[eSpeakNG docs]"
-         "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)"
-     )
-
-     with gr.Row(equal_height=True):
-         with gr.Column(scale=1):
-             text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
-         with gr.Column(scale=1):
-             avg_style = gr.Checkbox(label="Use Average Styles", value=True)
-             stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
-             denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")
-
-     with gr.Row(equal_height=True):
-         with gr.Column(scale=1):
-             reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
-             gen_button = gr.Button("Generate")
-         with gr.Column(scale=1):
-             synthesized_audio = gr.Audio(label="Generated Audio", type="filepath")
-
-     status = gr.Textbox(label="Status", interactive=False, lines=3)
-
-     reference_audios.change(
-         on_file_upload,
-         inputs=[reference_audios],
-         outputs=[reference_audios, status]
-     )
-
-     gen_button.click(
-         fn=main,
-         inputs=[
-             reference_audios,
-             text_prompt,
-             denoise,
-             avg_style,
-             stabilize
-         ],
-         outputs=[synthesized_audio, status]
-     )
-
-     gr.Examples(
-         examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
-         inputs=[reference_audios, text_prompt],
-         outputs=[synthesized_audio, reference_audios, status],
-         fn=gen_example,
-         cache_examples=False,
-         label="Examples",
-         run_on_click=True
-     )
-
+ import gradio as gr
+ import subprocess
+ import os
+ import sys
+ import soundfile as sf
+ import numpy as np
+ import torch
+ import traceback
+ import spaces
+
+ repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
+ repo_dir = "StyleTTS2-lite-vi"
+ if not os.path.exists(repo_dir):
+     subprocess.run(["git", "clone", repo_url, repo_dir])
+ sys.path.append(os.path.abspath(repo_dir))
+ from inference import StyleTTS2
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ config_path = os.path.join(repo_dir, "Models", "config.yaml")
+ models_path = os.path.join(repo_dir, "Models", "inference", "model.pth")
+ model = StyleTTS2(config_path, models_path).eval().to(device)
+ voice_path = os.path.join(repo_dir, "reference_audio")
+ eg_voices = [os.path.join(voice_path, "vn_1.wav"), os.path.join(voice_path, "vn_2.wav")]
+ eg_texts = [
+     "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
+     "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
+ ]
+
+
+ # Core inference function
+ @spaces.GPU
+ def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
+     try:
+         speakers = {}
+         for i, path in enumerate(reference_paths, 1):
+             speaker_id = f"id_{i}"
+             speakers[speaker_id] = {
+                 "path": path,
+                 "lang": "vi",
+                 "speed": 1.0
+             }
+
+         with torch.no_grad():
+             styles = model.get_styles(speakers, denoise, avg_style=False)
+             r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")
+             r = r / np.abs(r).max()
+
+         sf.write("output.wav", r, samplerate=24000)
+         return "output.wav", "Audio generated successfully!"
+
+     except Exception as e:
+         error_message = traceback.format_exc()
+         return None, error_message
+
+ def on_file_upload(file_list):
+     if not file_list:
+         return None, "No file uploaded yet."
+
+     unique_files = {}
+     for file_path in file_list:
+         file_name = os.path.basename(file_path)
+         unique_files[file_name] = file_path  # update and remove duplicates
+
+     uploaded_infos = []
+     uploaded_file_names = list(unique_files.keys())
+     for i in range(len(uploaded_file_names)):
+         uploaded_infos.append(f"[id_{i+1}]: {uploaded_file_names[i]}")
+
+     summary = "\n".join(uploaded_infos)
+     return list(unique_files.values()), f"Current reference audios:\n{summary}"
+
+ def gen_example(reference_paths, text_prompt):
+     output, status = main(reference_paths, text_prompt, 0.3, False, False)
+     return output, reference_paths, status
+
+
+ # Gradio UI
+ with gr.Blocks() as demo:
+     gr.HTML("<h1 style='text-align: center;'>StyleTTS2‑Lite Demo</h1>")
+     gr.Markdown(
+         "Download the local inference package from Hugging Face: "
+         "[StyleTTS2‑Lite (Vietnamese)]"
+         "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
+     )
+     gr.Markdown(
+         "Annotate any non‑Vietnamese words with the appropriate language tag, e.g., [en-us]{ } for English. For more information, see "
+         "[eSpeakNG docs]"
+         "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)"
+     )
+
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=1):
+             text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
+         with gr.Column(scale=1):
+             avg_style = gr.Checkbox(label="Use Average Styles", value=True)
+             stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
+             denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")
+
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=1):
+             reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
+             gen_button = gr.Button("Generate")
+         with gr.Column(scale=1):
+             synthesized_audio = gr.Audio(label="Generated Audio", type="filepath")
+
+     status = gr.Textbox(label="Status", interactive=False, lines=3)
+
+     reference_audios.change(
+         on_file_upload,
+         inputs=[reference_audios],
+         outputs=[reference_audios, status]
+     )
+
+     gen_button.click(
+         fn=main,
+         inputs=[
+             reference_audios,
+             text_prompt,
+             denoise,
+             avg_style,
+             stabilize
+         ],
+         outputs=[synthesized_audio, status]
+     )
+
+     gr.Examples(
+         examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
+         inputs=[reference_audios, text_prompt],
+         outputs=[synthesized_audio, reference_audios, status],
+         fn=gen_example,
+         cache_examples=False,
+         label="Examples",
+         run_on_click=True
+     )
+
  demo.launch()
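
For anyone running the model outside this Space, here is a minimal sketch of the same inference path the UI wires together. It reuses only calls that appear in app.py above (StyleTTS2, get_styles, generate, plus the peak-normalize and write steps); the 0.3 / avg_style=False / stabilize=False settings mirror the values this commit uses in gen_example, and vn_1.wav is one of the reference clips shipped in the repo. The positional signatures are assumptions carried over from app.py, not documented API.

import os
import sys

import numpy as np
import soundfile as sf
import torch

repo_dir = "StyleTTS2-lite-vi"  # clone of https://huggingface.co/dangtr0408/StyleTTS2-lite-vi
sys.path.append(os.path.abspath(repo_dir))
from inference import StyleTTS2

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = StyleTTS2(
    os.path.join(repo_dir, "Models", "config.yaml"),
    os.path.join(repo_dir, "Models", "inference", "model.pth"),
).eval().to(device)

# One reference clip per speaker tag; "[id_1]" is the default speaker.
speakers = {
    "id_1": {"path": os.path.join(repo_dir, "reference_audio", "vn_1.wav"),
             "lang": "vi", "speed": 1.0},
}

with torch.no_grad():
    # Same positional arguments as in app.py: (speakers, denoise, avg_style)
    # and (text, styles, stabilize, 18, default_speaker).
    styles = model.get_styles(speakers, 0.3, avg_style=False)
    r = model.generate("Xin chào, [en-us]{hello world}.", styles, False, 18, "[id_1]")

sf.write("output.wav", r / np.abs(r).max(), samplerate=24000)  # peak-normalized, 24 kHz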