Spaces:
Runtime error
Runtime error
Commit
·
1417ec9
1
Parent(s):
07a092c
Add script for tortoise
Browse files- app.py +199 -4
- requirements.txt +4 -0
app.py
CHANGED
|
@@ -1,10 +1,205 @@
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
| 6 |
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Gradio Space wrapping the TorToiSe text-to-speech model.

Module import has side effects: it clones and installs tortoise-tts,
then instantiates the model once for the whole app.
"""
import os, sys
import tempfile
import gradio as gr
import numpy as np
from typing import Tuple, List

# Setup and installation
# NOTE(review): this clones and pip-installs tortoise-tts at import time,
# so every cold start needs network access and pays the install cost.
os.system("git clone https://github.com/neonbjb/tortoise-tts.git")
sys.path.append("./tortoise-tts/")
os.system("pip install -r ./tortoise-tts/requirements.txt")
os.system("python ./tortoise-tts/setup.py install")

# torch/torchaudio and the tortoise imports must stay BELOW the install
# block above — they are only importable once the os.system calls ran.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice

# Download and instantiate model
# Single shared instance; all request handlers below call into it.
tts = TextToSpeech()

# Display parameters
# Preset voice names offered in the UI; presumably these match the voice
# folders shipped with tortoise-tts (resolved by load_voice) — TODO confirm.
VOICES = [
    "random",
    "train_atkins",
    "train_daws",
    "train_dotrice",
    "train_dreams",
    "train_empire",
    "train_grace",
    "train_kennard",
    "train_lescault",
    "train_mouse",
    "angie",
    "applejack",
    "daniel",
    "deniro",
    "emma",
    "freeman",
    "geralt",
    "halle",
    "jlaw",
    "lj",
    "mol",
    "myself",
    "pat",
    "pat2",
    "rainbow",
    "snakes",
    "tim_reynolds",
    "tom",
    "weaver",
    "william",
]
DEFAULT_VOICE = "random"
# Quality/speed presets accepted by TextToSpeech.tts_with_preset.
PRESETS = ["ultra_fast", "fast", "standard", "high_quality"]
DEFAULT_PRESET = "fast"
DEFAULT_TEXT = "Hello, world!"

# Markdown shown at the top of the demo page.
README = """# TorToiSe
Tortoise is a text-to-speech model developed by James Betker. It is capable of zero-shot voice cloning from a small set of voice samples. GitHub repo: [neonbjb/tortoise-tts](https://github.com/neonbjb/tortoise-tts).
## Usage
1. Select a model preset and type the text to speak.
2. Load a voice - either by choosing a preset, uploading audio files, or recording via microphone. Select the option to split audio into chunks if the clips are much longer than 10 seconds each. Follow the guidelines in the [voice customization guide](https://github.com/neonbjb/tortoise-tts#voice-customization-guide).
3. Click **Generate**, and wait - it's called *tortoise* for a reason!
"""

# Conditioning clips are loaded/resampled at 22050 Hz and generated audio is
# saved at 24000 Hz — presumably the rates tortoise expects/produces; confirm
# against the tortoise-tts API docs.
TORTOISE_SR_IN = 22050
TORTOISE_SR_OUT = 24000
def chunk_audio(
    t: torch.Tensor, sample_rate: int, chunk_duration_sec: int
) -> List[torch.Tensor]:
    """Split a (channels, samples) tensor into consecutive chunks.

    Each chunk covers at most ``chunk_duration_sec`` seconds; the final
    chunk may be shorter, and zero-width chunks are never returned.
    """
    samples_per_chunk = sample_rate * chunk_duration_sec
    pieces = torch.split(t, samples_per_chunk, dim=1)
    # Guard against a zero-width remainder (e.g. empty input tensor).
    return [piece for piece in pieces if piece.shape[1] > 0]
| 91 |
+
|
| 92 |
+
|
def tts_main(voice_samples: List[torch.Tensor], text: str, model_preset: str) -> str:
    """Synthesize ``text`` conditioned on ``voice_samples`` and return the wav path.

    Runs the shared ``tts`` model with the chosen preset, writes the result
    to ``generated.wav`` in the working directory, and returns that filename.
    """
    out_path = "generated.wav"
    generated = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=None,
        preset=model_preset,
    )
    # Model output carries an extra leading dim; save as TORTOISE_SR_OUT audio.
    torchaudio.save(out_path, generated.squeeze(0).cpu(), TORTOISE_SR_OUT)
    return out_path
| 102 |
+
|
| 103 |
+
|
def tts_from_preset(voice: str, text, model_preset):
    """Generate speech using one of the bundled preset voices."""
    samples, _latents = load_voice(voice)
    return tts_main(samples, text, model_preset)
| 107 |
+
|
| 108 |
+
|
def tts_from_files(
    files: List[tempfile._TemporaryFileWrapper], do_chunk, text, model_preset
):
    """Generate speech conditioned on user-uploaded audio files.

    When ``do_chunk`` is set, every clip is exploded into <=10 s pieces
    before conditioning.
    """
    clips = [load_audio(f.name, TORTOISE_SR_IN) for f in files]
    if do_chunk:
        chunked = []
        for clip in clips:
            chunked.extend(chunk_audio(clip, TORTOISE_SR_IN, 10))
        clips = chunked
    return tts_main(clips, text, model_preset)
| 118 |
+
|
| 119 |
+
|
def tts_from_recording(recording: Tuple[int, np.ndarray], do_chunk, text, model_preset):
    """Generate speech conditioned on a microphone recording.

    ``recording`` is the gradio Audio tuple ``(sample_rate, samples)`` where
    ``samples`` is int16/int32/float PCM, mono (1-D) or multi-channel
    (2-D, samples x channels).
    """
    sample_rate, audio = recording
    # normalize- https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/audio.py#L16
    norm_fix = 1
    if audio.dtype == np.int32:
        norm_fix = 2**31
    elif audio.dtype == np.int16:
        norm_fix = 2**15
    audio = torch.FloatTensor(audio.T) / norm_fix
    if len(audio.shape) > 1:
        # convert to mono
        audio = torch.mean(audio, dim=0)
    # BUG FIX: mono recordings arrive 1-D and previously skipped the channel
    # dim entirely, crashing later in chunk_audio (t.shape[1]). Always shape
    # the signal as (1, samples).
    audio = audio.unsqueeze(0)
    audio = torchaudio.transforms.Resample(sample_rate, TORTOISE_SR_IN)(audio)
    if do_chunk:
        voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
    else:
        voice_samples = [audio]
    return tts_main(voice_samples, text, model_preset)
| 138 |
+
|
| 139 |
+
|
def tts_from_url(audio_url, start_time, end_time, do_chunk, text, model_preset):
    """Generate speech conditioned on audio downloaded from ``audio_url``.

    Downloads the audio with yt-dlp, trims it to [start_time, end_time]
    seconds, optionally chunks it, and synthesizes ``text``.
    """
    # Local import keeps this optional code path self-contained.
    import subprocess

    # SECURITY FIX: the URL is untrusted user input; pass it as an argv
    # element with shell=False instead of interpolating it into an
    # os.system() shell string, which allowed shell injection.
    subprocess.run(
        [
            "yt-dlp",
            "-x",
            "--audio-format",
            "mp3",
            "--force-overwrites",
            audio_url,
            "-o",
            "audio.mp3",
        ],
        check=True,  # fail loudly instead of silently ignoring download errors
    )
    audio = load_audio("audio.mp3", TORTOISE_SR_IN)
    # gr.Number may deliver floats; slice bounds must be ints.
    audio = audio[:, int(start_time * TORTOISE_SR_IN) : int(end_time * TORTOISE_SR_IN)]
    if do_chunk:
        voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
    else:
        voice_samples = [audio]
    return tts_main(voice_samples, text, model_preset)
| 151 |
+
|
| 152 |
+
|
with gr.Blocks() as demo:
    gr.Markdown(README)

    # Controls shared by every tab.
    preset = gr.Dropdown(PRESETS, label="Model preset", value=DEFAULT_PRESET)
    text = gr.Textbox(label="Text to speak", value=DEFAULT_TEXT)
    do_chunk_label = "Split audio into chunks? (for audio much longer than 10 seconds.)"
    do_chunk_default = True

    # One tab per way of supplying a conditioning voice.
    with gr.Tab("Choose preset voice"):
        preset_voice = gr.Dropdown(VOICES, value=DEFAULT_VOICE, label="Preset voice")
        generate_from_preset = gr.Button("Generate")

    with gr.Tab("Upload audio"):
        uploaded_files = gr.File(file_count="multiple")
        chunk_uploads = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        generate_from_files = gr.Button("Generate")

    with gr.Tab("Record audio"):
        mic_recording = gr.Audio(source="microphone")
        chunk_recording = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        generate_from_recording = gr.Button("Generate")

    # YouTube tab kept disabled for now; wire tts_from_url when enabling.
    # with gr.Tab("From YouTube"):
    #     url_box = gr.Textbox(label="URL")
    #     chunk_url = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
    #     start_time = gr.Number(label="Start time (seconds)", precision=0)
    #     end_time = gr.Number(label="End time (seconds)", precision=0)
    #     generate_from_url = gr.Button("Generate")

    audio_out = gr.Audio()

    # Wire each Generate button to its handler; all write to audio_out.
    generate_from_preset.click(
        tts_from_preset,
        [preset_voice, text, preset],
        [audio_out],
    )
    generate_from_files.click(
        tts_from_files,
        [uploaded_files, chunk_uploads, text, preset],
        [audio_out],
    )
    generate_from_recording.click(
        tts_from_recording,
        [mic_recording, chunk_recording, text, preset],
        [audio_out],
    )
    # generate_from_url.click(
    #     tts_from_url,
    #     [url_box, start_time, end_time, chunk_url, text, preset],
    #     [audio_out],
    # )

demo.launch()
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
torchaudio
|
| 3 |
+
numpy==1.24.1
|
| 4 |
+
yt-dlp
|