AJ50 committed on
Commit
ccd13e3
·
1 Parent(s): 2973f11

Switch from XTTS to Facebook MMS for Hindi synthesis

Browse files

BENEFITS:
No TOS/License required (open model)
200MB instead of 1.8GB (9x smaller)
Fast inference
Good quality synthesis
No email/licensing hassles

TECHNICAL CHANGES:
- Model: tts_models/multilingual/multi-dataset/xtts_v2 → tts_models/hin/facebook/mms-tts-hin
- Removed stdin suppression (not needed, no TOS)
- Simplified Hindi synthesis (MMS is language-specific)
- No speaker_wav parameter (MMS doesn't support speaker adaptation)

COMPARISON:
XTTS: 1.8GB, High quality, TOS+License required
MMS: 200MB, Good quality, No requirements (RECOMMENDED)

Files changed (1) hide show
  1. backend/app/multilingual_tts.py +23 -36
backend/app/multilingual_tts.py CHANGED
@@ -110,26 +110,23 @@ class MultilingualTTSService:
110
  print("[MultilingualTTSService] ✓ English vocoder loaded")
111
 
112
  def _load_hindi_models(self):
113
- """Load Hindi XTTS model - uses local cached version or downloads on first run."""
114
  if self._xtts_model is None:
115
- print("[MultilingualTTSService] Loading Hindi XTTS model...")
116
  try:
117
  from TTS.api import TTS
118
- import io
119
 
120
- # Model will be cached in /app/backend/models/tts/ after first download
121
- # Suppress stdin to prevent interactive TOS prompts
122
- old_stdin = sys.stdin
123
- sys.stdin = io.StringIO("y\n") # Auto-answer "y" to TOS
124
-
125
- try:
126
- self._xtts_model = TTS(
127
- model_name="tts_models/multilingual/multi-dataset/xtts_v2",
128
- gpu=False
129
- )
130
- print("[MultilingualTTSService] Hindi XTTS loaded successfully")
131
- finally:
132
- sys.stdin = old_stdin
133
 
134
  except ImportError:
135
  raise ImportError(
@@ -137,8 +134,9 @@ class MultilingualTTSService:
137
  "Install with: pip install TTS>=0.21.0"
138
  )
139
  except Exception as e:
140
- print(f"[MultilingualTTSService] Error loading XTTS model: {e}")
141
- raise RuntimeError(f"Failed to load Hindi XTTS model: {e}")
 
142
 
143
  def synthesize(self, text: str, voice_sample_path: Union[str, Path],
144
  language: str = "english") -> np.ndarray:
@@ -197,32 +195,21 @@ class MultilingualTTSService:
197
  return np.clip(synthesized, -1.0, 1.0)
198
 
199
  def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
200
- """Synthesize Hindi speech using XTTS model."""
201
  self._load_hindi_models()
202
 
203
  print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
204
 
205
- # XTTS language support check
206
- # Try different language code formats
207
  try:
208
  audio = self._xtts_model.tts(
209
  text=text,
210
- speaker_wav=str(voice_sample_path),
211
- language="hi" # Try ISO 639-1 code
212
  )
213
- except NotImplementedError:
214
- print("[MultilingualTTSService] Language code 'hi' not supported, trying 'hindi'...")
215
- try:
216
- audio = self._xtts_model.tts(
217
- text=text,
218
- speaker_wav=str(voice_sample_path),
219
- language="hindi" # Try full language name
220
- )
221
- except NotImplementedError:
222
- raise RuntimeError(
223
- "Hindi language not supported in this XTTS version. "
224
- "XTTS-v2 may only support: en, es, fr, de, it, pt, pl, tr, ru, nl, zh-cn, zh-tw, ar, cs, el, hu, ko, ja"
225
- )
226
 
227
  # Convert to float32 if needed
228
  audio = np.asarray(audio, dtype=np.float32)
 
110
  print("[MultilingualTTSService] ✓ English vocoder loaded")
111
 
112
  def _load_hindi_models(self):
113
+ """Load Hindi Facebook MMS model - no TOS required, lightweight."""
114
  if self._xtts_model is None:
115
+ print("[MultilingualTTSService] Loading Hindi Facebook MMS model...")
116
  try:
117
  from TTS.api import TTS
 
118
 
119
+ # Facebook MMS: No TOS required, lightweight (200MB vs XTTS 1.8GB)
120
+ # Downloads once and caches locally
121
+ self._xtts_model = TTS(
122
+ model_name="tts_models/hin/facebook/mms-tts-hin",
123
+ gpu=False,
124
+ progress_bar=False
125
+ )
126
+ print("[MultilingualTTSService] ✓ Hindi Facebook MMS loaded successfully")
127
+ print("[MultilingualTTSService] Model: Facebook Massively Multilingual Speech (MMS)")
128
+ print("[MultilingualTTSService] Language: Hindi (hin)")
129
+ print("[MultilingualTTSService] TOS: No (Open model)")
 
 
130
 
131
  except ImportError:
132
  raise ImportError(
 
134
  "Install with: pip install TTS>=0.21.0"
135
  )
136
  except Exception as e:
137
+ print(f"[MultilingualTTSService] Error loading Hindi MMS model: {e}")
138
+ print(f"[MultilingualTTSService] Make sure TTS library is properly installed")
139
+ raise RuntimeError(f"Failed to load Hindi MMS model: {e}")
140
 
141
  def synthesize(self, text: str, voice_sample_path: Union[str, Path],
142
  language: str = "english") -> np.ndarray:
 
195
  return np.clip(synthesized, -1.0, 1.0)
196
 
197
  def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
198
+ """Synthesize Hindi speech using Facebook MMS model."""
199
  self._load_hindi_models()
200
 
201
  print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
202
 
203
+ # Facebook MMS uses simple TTS interface (no language parameter needed)
204
+ # MMS model is language-specific, already tuned for Hindi
205
  try:
206
  audio = self._xtts_model.tts(
207
  text=text,
208
+ speaker_wav=None # MMS doesn't use speaker adaptation
 
209
  )
210
+ except Exception as e:
211
+ print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
212
+ raise RuntimeError(f"Hindi synthesis failed: {e}")
 
 
 
 
 
 
 
 
 
 
213
 
214
  # Convert to float32 if needed
215
  audio = np.asarray(audio, dtype=np.float32)