Spaces:

Guunk
/

Ttsminnan

Sleeping

App Files Files Community

Ttsminnan / app.py

Guunk

Create app.py

22a95f5 verified 5 months ago

raw

history blame

1.81 kB

	import torch
	from transformers import AutoTokenizer, VitsModel
	import gradio as gr
	import soundfile as sf
	import tempfile

	# Hub checkpoint for Meta's MMS-TTS Min Nan (zh-nan) voice. The acoustic model
	# and its tokenizer are both pulled from this one repository id.
	MODEL_ID = "facebook/mms-tts-nan"
	model = VitsModel.from_pretrained(MODEL_ID)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

	# Synthesize speech from Min Nan (POJ) text input
	def synthesize(text):
	    """Convert Min Nan (POJ) text into speech and return a WAV file path.

	    Args:
	        text: The string entered in the Gradio textbox.

	    Returns:
	        Path of a temporary ``.wav`` file containing the generated waveform,
	        written at ``model.config.sampling_rate``. The file is created with
	        ``delete=False`` so it still exists after this function returns.

	    Raises:
	        gr.Error: If ``text`` is empty or contains only whitespace.
	    """
	    # Reject blank input up front instead of running the model on nothing.
	    if not text or not text.strip():
	        raise gr.Error("Please enter some Min Nan (POJ) text to synthesize.")

	    inputs = tokenizer(text, return_tensors="pt")
	    # Inference only: skip autograd bookkeeping.
	    with torch.no_grad():
	        output = model(**inputs)
	    # One text in, one waveform out: take the first batch entry explicitly
	    # rather than squeezing every size-1 dimension.
	    audio = output.waveform[0].cpu().numpy()

	    # Use model-defined sampling rate for writing audio
	    sampling_rate = model.config.sampling_rate

	    # Reserve a unique file name, then close the handle before writing so no
	    # file descriptor is leaked and the path can be reopened on any platform.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
	        wav_path = tmp_wav.name
	    sf.write(wav_path, audio, samplerate=sampling_rate)
	    return wav_path

	# Gradio app interface
	# Input component: a three-line textbox for the POJ text.
	text_input = gr.Textbox(
	    label="Min Nan Text (POJ format)",
	    placeholder="Enter Min Nan (Taiwanese Hokkien) text in POJ format.\nExample: Lí hó! Góa sī lâng Tâi-oân.",
	    lines=3,
	)

	# Output component: configured with type="filepath" to match the file path
	# that the synthesis callback returns.
	audio_output = gr.Audio(label="Synthesized Speech", type="filepath")

	# Long-form description shown in the UI.
	app_description = (
	    "🗣️ This application uses Meta's multilingual speech model (MMS-TTS) to generate natural speech "
	    "from text written in Min Nan Chinese (zh-nan), also known as Taiwanese Hokkien. "
	    "Input text should be written using the Pe̍h-ōe-jī (POJ) romanization system. \n\n"
	    "Example input: Lí hó! Góa sī lâng Tâi-oân.\n\n"
	    "The output is a 16kHz WAV audio file synthesized using the VITS-based neural TTS model."
	)

	demo = gr.Interface(
	    fn=synthesize,
	    inputs=text_input,
	    outputs=audio_output,
	    title="Text-to-Speech (TTS) for Min Nan / Taiwanese Hokkien using Meta’s MMS-TTS Model (facebook/mms-tts-nan)",
	    description=app_description,
	    allow_flagging="never",
	)

	# Launch the interface only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()