"""Gradio demo: text-to-speech for Min Nan (Taiwanese Hokkien) with MMS-TTS.

Loads the ``facebook/mms-tts-nan`` VITS checkpoint once at import time and
exposes a small web UI that turns POJ-romanized text into a WAV file.
"""

import tempfile

import gradio as gr
import soundfile as sf
import torch
from transformers import AutoTokenizer, VitsModel

# Load Meta's MMS-TTS model for Min Nan (zh-nan). Done once at module level so
# every request reuses the same weights and tokenizer.
model = VitsModel.from_pretrained("facebook/mms-tts-nan")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-nan")


def synthesize(text: str) -> str:
    """Synthesize speech from Min Nan (POJ) text and return a WAV file path.

    Args:
        text: Input text, expected to be written in POJ romanization.

    Returns:
        Path to a temporary ``.wav`` file containing the generated audio,
        written at the sampling rate declared in the model config.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject blank input up front with a user-visible message instead of
    # handing an empty sequence to the tokenizer/model.
    if not text or not text.strip():
        raise gr.Error("Please enter some Min Nan (POJ) text to synthesize.")

    inputs = tokenizer(text, return_tensors="pt")

    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(**inputs)

    # Drop singleton dimensions and convert to a NumPy array for soundfile.
    audio = output.waveform.squeeze().cpu().numpy()

    # Use the model-defined sampling rate for writing audio.
    sampling_rate = model.config.sampling_rate

    # Reserve a unique path, then close the handle before writing so no file
    # descriptor is leaked and soundfile can reopen the path on any platform.
    # delete=False keeps the file on disk so it can be read from the returned
    # path after this function exits.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name
    sf.write(wav_path, audio, samplerate=sampling_rate)

    return wav_path


# Gradio app interface
demo = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(
        lines=3,
        placeholder="Enter Min Nan (Taiwanese Hokkien) text in POJ format.\nExample: Lí hó! Góa sī lâng Tâi-oân.",
        label="Min Nan Text (POJ format)",
    ),
    outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
    title="Text-to-Speech (TTS) for Min Nan / Taiwanese Hokkien using Meta’s MMS-TTS Model (facebook/mms-tts-nan)",
    description=(
        "🗣️ This application uses Meta's multilingual speech model (MMS-TTS) to generate natural speech "
        "from text written in Min Nan Chinese (zh-nan), also known as Taiwanese Hokkien. "
        "Input text should be written using the Pe̍h-ōe-jī (POJ) romanization system. \n\n"
        "Example input: Lí hó! Góa sī lâng Tâi-oân.\n\n"
        "The output is a 16kHz WAV audio file synthesized using the VITS-based neural TTS model."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()