# File size: 1,807 Bytes
# 22a95f5
import torch
from transformers import AutoTokenizer, VitsModel
import gradio as gr
import soundfile as sf
import tempfile
# Meta's MMS-TTS checkpoint for Min Nan (zh-nan). The same identifier is used
# to fetch both the VITS network and its matching tokenizer.
MODEL_ID = "facebook/mms-tts-nan"
model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
def synthesize(text):
    """Synthesize speech from Min Nan (POJ) text and return a WAV file path.

    Args:
        text: Text to speak. The UI that feeds this function asks for
            Pe̍h-ōe-jī (POJ) romanization.

    Returns:
        Path of a newly created temporary ``.wav`` file containing the
        waveform, written at ``model.config.sampling_rate``. Returns ``None``
        when ``text`` is ``None``, empty, or whitespace-only; in that case
        neither the tokenizer nor the model is invoked.
    """
    # Nothing to say: skip inference rather than pushing an empty sequence
    # through the tokenizer and model.
    if not text or not text.strip():
        return None

    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradient bookkeeping needed
        output = model(**inputs)
    audio = output.waveform.squeeze().cpu().numpy()

    # Use model-defined sampling rate for writing audio
    sampling_rate = model.config.sampling_rate

    # Reserve a unique path, then close our handle *before* soundfile opens
    # the path itself. This avoids leaking one file descriptor per request,
    # and reopening a still-open NamedTemporaryFile by name is not portable
    # (it fails on Windows). delete=False keeps the file on disk for the
    # caller after the handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name
    sf.write(wav_path, audio, samplerate=sampling_rate)
    return wav_path
# Gradio UI: a single text box in, a single audio clip out, wired to synthesize().
text_input = gr.Textbox(
    label="Min Nan Text (POJ format)",
    placeholder="Enter Min Nan (Taiwanese Hokkien) text in POJ format.\nExample: Lí hó! Góa sī lâng Tâi-oân.",
    lines=3,
)

# type="filepath" matches synthesize(), which hands back a path to a WAV file.
audio_output = gr.Audio(label="Synthesized Speech", type="filepath")

# Adjacent string literals are concatenated into one description string.
app_description = (
    "🗣️ This application uses Meta's multilingual speech model (MMS-TTS) to generate natural speech "
    "from text written in Min Nan Chinese (zh-nan), also known as Taiwanese Hokkien. "
    "Input text should be written using the Pe̍h-ōe-jī (POJ) romanization system. \n\n"
    "Example input: Lí hó! Góa sī lâng Tâi-oân.\n\n"
    "The output is a 16kHz WAV audio file synthesized using the VITS-based neural TTS model."
)

demo = gr.Interface(
    fn=synthesize,
    inputs=text_input,
    outputs=audio_output,
    title="Text-to-Speech (TTS) for Min Nan / Taiwanese Hokkien using Meta’s MMS-TTS Model (facebook/mms-tts-nan)",
    description=app_description,
    allow_flagging="never",
)
# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()