import torch
from transformers import AutoTokenizer, VitsModel
import gradio as gr
import soundfile as sf
import tempfile

# Meta's MMS-TTS checkpoint for Min Nan (zh-nan); the model weights and the
# tokenizer are both loaded from this one Hugging Face repository id.
_CHECKPOINT = "facebook/mms-tts-nan"
model = VitsModel.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Synthesize speech from Min Nan (POJ) text input
def synthesize(text):
    """Synthesize speech for ``text`` and return the path to a WAV file.

    The text is tokenized, run through the module-level ``model`` with
    gradient tracking disabled, and the resulting waveform is written to a
    newly created temporary ``.wav`` file.

    Args:
        text: The text to speak (the UI asks for Min Nan in POJ romanization).

    Returns:
        Path of the temporary WAV file. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Nothing to synthesize: report a clear message in the UI instead of
    # handing an empty prompt to the tokenizer and model.
    if not text or not text.strip():
        raise gr.Error("Please enter some Min Nan (POJ) text to synthesize.")

    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)
    audio = output.waveform.squeeze().cpu().numpy()

    # Reserve a unique path, then close the handle *before* soundfile opens
    # the file by name. Leaving it open leaks a descriptor on every call and
    # reopening a still-open NamedTemporaryFile is not portable.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    # Use model-defined sampling rate for writing audio
    sf.write(wav_path, audio, samplerate=model.config.sampling_rate)
    return wav_path

# Gradio app interface.
# User-facing copy is named up front so the component wiring below stays short.
_TITLE = "Text-to-Speech (TTS) for Min Nan / Taiwanese Hokkien using Meta’s MMS-TTS Model (facebook/mms-tts-nan)"
_DESCRIPTION = (
    "🗣️ This application uses Meta's multilingual speech model (MMS-TTS) to generate natural speech "
    "from text written in Min Nan Chinese (zh-nan), also known as Taiwanese Hokkien. "
    "Input text should be written using the Pe̍h-ōe-jī (POJ) romanization system. \n\n"
    "Example input: Lí hó! Góa sī lâng Tâi-oân.\n\n"
    "The output is a 16kHz WAV audio file synthesized using the VITS-based neural TTS model."
)
_PLACEHOLDER = "Enter Min Nan (Taiwanese Hokkien) text in POJ format.\nExample: Lí hó! Góa sī lâng Tâi-oân."

# Input component first, then output, matching the order they are passed below.
_text_input = gr.Textbox(
    label="Min Nan Text (POJ format)",
    placeholder=_PLACEHOLDER,
    lines=3,
)
# type="filepath" because synthesize() returns the path of the WAV it wrote.
_audio_output = gr.Audio(label="Synthesized Speech", type="filepath")

demo = gr.Interface(
    fn=synthesize,
    inputs=_text_input,
    outputs=_audio_output,
    title=_TITLE,
    description=_DESCRIPTION,
    allow_flagging="never",
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()