Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Lojban v2 voice model
Browse files
- README.md +3 -1
- app.py +9 -3
- gr_client.py +36 -5
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: xVASynth TTS
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: gray
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
|
@@ -9,6 +9,7 @@ sdk_version: 4.20.0
|
|
| 9 |
models:
|
| 10 |
- Pendrokar/xvapitch_nvidia
|
| 11 |
- Pendrokar/TorchMoji
|
|
|
|
| 12 |
app_file: app.py
|
| 13 |
app_port: 7860
|
| 14 |
tags:
|
|
@@ -20,6 +21,7 @@ pinned: false
|
|
| 20 |
preload_from_hub:
|
| 21 |
- Pendrokar/xvapitch_nvidia
|
| 22 |
- Pendrokar/TorchMoji
|
|
|
|
| 23 |
license: gpl-3.0
|
| 24 |
thumbnail: >-
|
| 25 |
https://raw.githubusercontent.com/DanRuta/xVA-Synth/master/assets/x-icon.png
|
|
|
|
| 1 |
---
|
| 2 |
title: xVASynth TTS
|
| 3 |
+
emoji: 🧝♀️🧛♂️🧚♀️
|
| 4 |
colorFrom: gray
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
|
|
|
| 9 |
models:
|
| 10 |
- Pendrokar/xvapitch_nvidia
|
| 11 |
- Pendrokar/TorchMoji
|
| 12 |
+
- Pendrokar/xvasynth_lojban
|
| 13 |
app_file: app.py
|
| 14 |
app_port: 7860
|
| 15 |
tags:
|
|
|
|
| 21 |
preload_from_hub:
|
| 22 |
- Pendrokar/xvapitch_nvidia
|
| 23 |
- Pendrokar/TorchMoji
|
| 24 |
+
- Pendrokar/xvasynth_lojban
|
| 25 |
license: gpl-3.0
|
| 26 |
thumbnail: >-
|
| 27 |
https://raw.githubusercontent.com/DanRuta/xVA-Synth/master/assets/x-icon.png
|
app.py
CHANGED
|
@@ -15,16 +15,22 @@ model_repo = HfApi()
|
|
| 15 |
commits = model_repo.list_repo_commits(repo_id=hf_model_name)
|
| 16 |
latest_commit_sha = commits[0].commit_id
|
| 17 |
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
|
|
|
|
| 18 |
models_path = hf_cache_models_path
|
| 19 |
|
| 20 |
current_voice_model = None
|
| 21 |
base_speaker_emb = ''
|
| 22 |
|
| 23 |
def load_model(voice_model_name):
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
language = 'en'
|
| 28 |
|
| 29 |
data = {
|
| 30 |
'outputs': None,
|
|
|
|
| 15 |
commits = model_repo.list_repo_commits(repo_id=hf_model_name)
|
| 16 |
latest_commit_sha = commits[0].commit_id
|
| 17 |
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
|
| 18 |
+
hf_cache_lojban_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvasynth_lojban/snapshots/{latest_commit_sha}/'
|
| 19 |
models_path = hf_cache_models_path
|
| 20 |
|
| 21 |
current_voice_model = None
|
| 22 |
base_speaker_emb = ''
|
| 23 |
|
| 24 |
def load_model(voice_model_name):
|
| 25 |
+
if voice_model_name == 'x_selpahi':
|
| 26 |
+
# Lojban
|
| 27 |
+
model_path = hf_cache_lojban_models_path + voice_model_name
|
| 28 |
+
model_type = 'FastPitch1.1'
|
| 29 |
+
else:
|
| 30 |
+
model_path = models_path + voice_model_name
|
| 31 |
+
model_type = 'xVAPitch'
|
| 32 |
|
| 33 |
+
language = 'en' # seems to have no effect if generated text is from a different language
|
|
|
|
| 34 |
|
| 35 |
data = {
|
| 36 |
'outputs': None,
|
gr_client.py
CHANGED
|
@@ -9,13 +9,14 @@ voice_models = [
|
|
| 9 |
]
|
| 10 |
voice_models_more = [
|
| 11 |
("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
|
| 12 |
-
("
|
| 13 |
-
("
|
| 14 |
("👩🦱 #12787", "ccby_nvidia_hifi_12787_F"),
|
| 15 |
("👵 #11614", "ccby_nv_hifi_11614_F"),
|
| 16 |
-
("
|
| 17 |
("👩🦳 #11697", "ccby_nvidia_hifi_11697_F"),
|
| 18 |
-
("
|
|
|
|
| 19 |
]
|
| 20 |
|
| 21 |
# Ordered by similarity to English, because xVASynth uses ARPAbet instead of IPA
|
|
@@ -52,6 +53,11 @@ languages_more = [
|
|
| 52 |
("Wolof", "wo"),
|
| 53 |
]
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# Translated from English by DeepMind's Gemini Pro
|
| 56 |
default_text = {
|
| 57 |
"ar": "هذا هو صوتي.",
|
|
@@ -66,6 +72,7 @@ default_text = {
|
|
| 66 |
"hi": "यह मेरी आवाज़ कैसी लगती है।",
|
| 67 |
"hu": "Így hangzik a hangom.",
|
| 68 |
"it": "Così suona la mia voce.",
|
|
|
|
| 69 |
"jp": "これが私の声です。",
|
| 70 |
"ko": "여기 제 목소리가 어떤지 들어보세요.",
|
| 71 |
"la": "Haec est vox mea sonans.",
|
|
@@ -285,6 +292,19 @@ language_radio_init = {
|
|
| 285 |
'info': "Will be more monotone and have an English accent."
|
| 286 |
}
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
_DESCRIPTION = '''
|
| 289 |
<div>
|
| 290 |
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
|
|
@@ -475,10 +495,21 @@ class BlocksDemo:
|
|
| 475 |
queue=False,
|
| 476 |
)
|
| 477 |
|
|
|
|
| 478 |
voice_radio.change(
|
| 479 |
self.set_default_audio,
|
| 480 |
inputs=voice_radio,
|
| 481 |
-
outputs=output_wav
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
)
|
| 483 |
|
| 484 |
return demo
|
|
|
|
| 9 |
]
|
| 10 |
voice_models_more = [
|
| 11 |
("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
|
| 12 |
+
("👨🦲 #9017", "ccby_nvidia_hifi_9017_M"),
|
| 13 |
+
("🧑 #6097", "ccby_nvidia_hifi_6097_M"),
|
| 14 |
("👩🦱 #12787", "ccby_nvidia_hifi_12787_F"),
|
| 15 |
("👵 #11614", "ccby_nv_hifi_11614_F"),
|
| 16 |
+
("👩🦰 #8051", "ccby_nvidia_hifi_8051_F"),
|
| 17 |
("👩🦳 #11697", "ccby_nvidia_hifi_11697_F"),
|
| 18 |
+
("👩🦲 #9136", "ccby_nvidia_hifi_9136_F"),
|
| 19 |
+
("♟ Lojban", "x_selpahi"), # v2 model for Lojban, pre-multilingual capabilities of xVASynth
|
| 20 |
]
|
| 21 |
|
| 22 |
# Ordered by similarity to English, because xVASynth uses ARPAbet instead of IPA
|
|
|
|
| 53 |
("Wolof", "wo"),
|
| 54 |
]
|
| 55 |
|
| 56 |
+
lojban_lang = [
|
| 57 |
+
# Lojban has no ISO 639-1 code; 'jb' is used here instead
|
| 58 |
+
('♟ Lojban', 'jb')
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
# Translated from English by DeepMind's Gemini Pro
|
| 62 |
default_text = {
|
| 63 |
"ar": "هذا هو صوتي.",
|
|
|
|
| 72 |
"hi": "यह मेरी आवाज़ कैसी लगती है।",
|
| 73 |
"hu": "Így hangzik a hangom.",
|
| 74 |
"it": "Così suona la mia voce.",
|
| 75 |
+
"jb": ".i ",
|
| 76 |
"jp": "これが私の声です。",
|
| 77 |
"ko": "여기 제 목소리가 어떤지 들어보세요.",
|
| 78 |
"la": "Haec est vox mea sonans.",
|
|
|
|
| 292 |
'info': "Will be more monotone and have an English accent."
|
| 293 |
}
|
| 294 |
|
| 295 |
+
def set_lojban_language(voice, lang):
|
| 296 |
+
if voice != 'x_selpahi':
|
| 297 |
+
return lang
|
| 298 |
+
|
| 299 |
+
radio_init = {**language_radio_init}
|
| 300 |
+
radio_init['choices'] = [
|
| 301 |
+
*lojban_lang,
|
| 302 |
+
*languages,
|
| 303 |
+
*languages_more,
|
| 304 |
+
]
|
| 305 |
+
radio_init['value'] = lojban_lang[0][1]
|
| 306 |
+
return gr.Radio(**radio_init)
|
| 307 |
+
|
| 308 |
_DESCRIPTION = '''
|
| 309 |
<div>
|
| 310 |
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
|
|
|
|
| 495 |
queue=False,
|
| 496 |
)
|
| 497 |
|
| 498 |
+
# Replace output with voice audio sample
|
| 499 |
voice_radio.change(
|
| 500 |
self.set_default_audio,
|
| 501 |
inputs=voice_radio,
|
| 502 |
+
outputs=output_wav,
|
| 503 |
+
queue=True,
|
| 504 |
+
)
|
| 505 |
+
|
| 506 |
+
# When the Lojban voice is selected, switch the language selector to Lojban
|
| 507 |
+
voice_radio.change(
|
| 508 |
+
set_lojban_language,
|
| 509 |
+
inputs=[voice_radio, language_radio],
|
| 510 |
+
outputs=[language_radio],
|
| 511 |
+
trigger_mode='once',
|
| 512 |
+
queue=True,
|
| 513 |
)
|
| 514 |
|
| 515 |
return demo
|