|
|
import torch |
|
|
from transformers import AutoTokenizer, VitsModel |
|
|
import gradio as gr |
|
|
import soundfile as sf |
|
|
import tempfile |
|
|
|
|
|
|
|
|
# Hugging Face checkpoint shared by the acoustic model and its tokenizer;
# kept in one place so the two can never drift apart.
MODEL_ID = "facebook/mms-tts-nan"

# Loaded once at import time so every synthesis request reuses the same
# model and tokenizer instances instead of reloading them per call.
model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
|
|
|
|
|
|
|
def synthesize(text):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    The text is tokenized with the module-level ``tokenizer``, run through
    the module-level ``model`` with gradient tracking disabled, and the
    resulting waveform is written to a fresh temporary ``.wav`` file.

    Args:
        text: The text to speak. The UI asks for Min Nan written in POJ
            romanization; no validation is performed here.

    Returns:
        The filesystem path (``str``) of the temporary WAV file. The file is
        created with ``delete=False``, so it stays on disk after this
        function returns; removing it is left to the caller/host.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**inputs)

    # Drop singleton dimensions and hand soundfile a NumPy array on the CPU.
    audio = output.waveform.squeeze().cpu().numpy()

    # Reserve a unique path, then close the handle immediately. The original
    # kept the NamedTemporaryFile object open forever, leaking one file
    # descriptor per request; soundfile opens the path itself when writing.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    sf.write(wav_path, audio, samplerate=model.config.sampling_rate)
    return wav_path
|
|
|
|
|
|
|
|
# Gradio UI: one multi-line text box in, one audio player out.
demo = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(
        lines=3,
        placeholder="Enter Min Nan (Taiwanese Hokkien) text in POJ format.\nExample: Lí hó! Góa sī lâng Tâi-oân.",
        label="Min Nan Text (POJ format)",
    ),
    # ``type="filepath"`` matches ``synthesize``, which returns a WAV path.
    outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
    title="Text-to-Speech (TTS) for Min Nan / Taiwanese Hokkien using Meta’s MMS-TTS Model (facebook/mms-tts-nan)",
    # Adjacent string literals are concatenated into a single description.
    description=(
        "🗣️ This application uses Meta's multilingual speech model (MMS-TTS) to generate natural speech "
        "from text written in Min Nan Chinese (zh-nan), also known as Taiwanese Hokkien. "
        "Input text should be written using the Pe̍h-ōe-jī (POJ) romanization system. \n\n"
        "Example input: Lí hó! Góa sī lâng Tâi-oân.\n\n"
        "The output is a 16kHz WAV audio file synthesized using the VITS-based neural TTS model."
    ),
    # Flagging is turned off for this demo.
    allow_flagging="never",
)
|
|
|
|
|
# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()