Spaces:
Sleeping
Sleeping
Switch from XTTS to Facebook MMS for Hindi synthesis
Browse files
BENEFITS:
No TOS/License required (open model)
200MB instead of 1.8GB (9x smaller)
Fast inference
Good quality synthesis
No email/licensing hassles
TECHNICAL CHANGES:
- Model: tts_models/multilingual/multi-dataset/xtts_v2 → tts_models/hin/facebook/mms-tts-hin
- Removed stdin suppression (not needed, no TOS)
- Simplified Hindi synthesis (MMS is language-specific)
- No speaker_wav parameter (MMS doesn't support speaker adaptation)
COMPARISON:
XTTS: 1.8GB, High quality, TOS+License required
MMS: 200MB, Good quality, No requirements (RECOMMENDED)
- backend/app/multilingual_tts.py +23 -36
backend/app/multilingual_tts.py
CHANGED
|
@@ -110,26 +110,23 @@ class MultilingualTTSService:
|
|
| 110 |
print("[MultilingualTTSService] ✓ English vocoder loaded")
|
| 111 |
|
| 112 |
def _load_hindi_models(self):
|
| 113 |
-
"""Load Hindi
|
| 114 |
if self._xtts_model is None:
|
| 115 |
-
print("[MultilingualTTSService] Loading Hindi
|
| 116 |
try:
|
| 117 |
from TTS.api import TTS
|
| 118 |
-
import io
|
| 119 |
|
| 120 |
-
#
|
| 121 |
-
#
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
finally:
|
| 132 |
-
sys.stdin = old_stdin
|
| 133 |
|
| 134 |
except ImportError:
|
| 135 |
raise ImportError(
|
|
@@ -137,8 +134,9 @@ class MultilingualTTSService:
|
|
| 137 |
"Install with: pip install TTS>=0.21.0"
|
| 138 |
)
|
| 139 |
except Exception as e:
|
| 140 |
-
print(f"[MultilingualTTSService] Error loading
|
| 141 |
-
|
|
|
|
| 142 |
|
| 143 |
def synthesize(self, text: str, voice_sample_path: Union[str, Path],
|
| 144 |
language: str = "english") -> np.ndarray:
|
|
@@ -197,32 +195,21 @@ class MultilingualTTSService:
|
|
| 197 |
return np.clip(synthesized, -1.0, 1.0)
|
| 198 |
|
| 199 |
def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
|
| 200 |
-
"""Synthesize Hindi speech using
|
| 201 |
self._load_hindi_models()
|
| 202 |
|
| 203 |
print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
|
| 204 |
|
| 205 |
-
#
|
| 206 |
-
#
|
| 207 |
try:
|
| 208 |
audio = self._xtts_model.tts(
|
| 209 |
text=text,
|
| 210 |
-
speaker_wav=
|
| 211 |
-
language="hi" # Try ISO 639-1 code
|
| 212 |
)
|
| 213 |
-
except
|
| 214 |
-
print("[MultilingualTTSService]
|
| 215 |
-
|
| 216 |
-
audio = self._xtts_model.tts(
|
| 217 |
-
text=text,
|
| 218 |
-
speaker_wav=str(voice_sample_path),
|
| 219 |
-
language="hindi" # Try full language name
|
| 220 |
-
)
|
| 221 |
-
except NotImplementedError:
|
| 222 |
-
raise RuntimeError(
|
| 223 |
-
"Hindi language not supported in this XTTS version. "
|
| 224 |
-
"XTTS-v2 may only support: en, es, fr, de, it, pt, pl, tr, ru, nl, zh-cn, zh-tw, ar, cs, el, hu, ko, ja"
|
| 225 |
-
)
|
| 226 |
|
| 227 |
# Convert to float32 if needed
|
| 228 |
audio = np.asarray(audio, dtype=np.float32)
|
|
|
|
| 110 |
print("[MultilingualTTSService] ✓ English vocoder loaded")
|
| 111 |
|
| 112 |
def _load_hindi_models(self):
|
| 113 |
+
"""Load Hindi Facebook MMS model - no TOS required, lightweight."""
|
| 114 |
if self._xtts_model is None:
|
| 115 |
+
print("[MultilingualTTSService] Loading Hindi Facebook MMS model...")
|
| 116 |
try:
|
| 117 |
from TTS.api import TTS
|
|
|
|
| 118 |
|
| 119 |
+
# Facebook MMS: No TOS required, lightweight (200MB vs XTTS 1.8GB)
|
| 120 |
+
# Downloads once and caches locally
|
| 121 |
+
self._xtts_model = TTS(
|
| 122 |
+
model_name="tts_models/hin/facebook/mms-tts-hin",
|
| 123 |
+
gpu=False,
|
| 124 |
+
progress_bar=False
|
| 125 |
+
)
|
| 126 |
+
print("[MultilingualTTSService] ✓ Hindi Facebook MMS loaded successfully")
|
| 127 |
+
print("[MultilingualTTSService] Model: Facebook Massively Multilingual Speech (MMS)")
|
| 128 |
+
print("[MultilingualTTSService] Language: Hindi (hin)")
|
| 129 |
+
print("[MultilingualTTSService] TOS: No (Open model)")
|
|
|
|
|
|
|
| 130 |
|
| 131 |
except ImportError:
|
| 132 |
raise ImportError(
|
|
|
|
| 134 |
"Install with: pip install TTS>=0.21.0"
|
| 135 |
)
|
| 136 |
except Exception as e:
|
| 137 |
+
print(f"[MultilingualTTSService] Error loading Hindi MMS model: {e}")
|
| 138 |
+
print(f"[MultilingualTTSService] Make sure TTS library is properly installed")
|
| 139 |
+
raise RuntimeError(f"Failed to load Hindi MMS model: {e}")
|
| 140 |
|
| 141 |
def synthesize(self, text: str, voice_sample_path: Union[str, Path],
|
| 142 |
language: str = "english") -> np.ndarray:
|
|
|
|
| 195 |
return np.clip(synthesized, -1.0, 1.0)
|
| 196 |
|
| 197 |
def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
|
| 198 |
+
"""Synthesize Hindi speech using Facebook MMS model."""
|
| 199 |
self._load_hindi_models()
|
| 200 |
|
| 201 |
print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
|
| 202 |
|
| 203 |
+
# Facebook MMS uses simple TTS interface (no language parameter needed)
|
| 204 |
+
# MMS model is language-specific, already tuned for Hindi
|
| 205 |
try:
|
| 206 |
audio = self._xtts_model.tts(
|
| 207 |
text=text,
|
| 208 |
+
speaker_wav=None # MMS doesn't use speaker adaptation
|
|
|
|
| 209 |
)
|
| 210 |
+
except Exception as e:
|
| 211 |
+
print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
|
| 212 |
+
raise RuntimeError(f"Hindi synthesis failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
# Convert to float32 if needed
|
| 215 |
audio = np.asarray(audio, dtype=np.float32)
|