Spaces: Running on Zero
刘鑫 committed
Commit · c1e01fa
1 Parent(s): 5360805
change some discription
app.py
CHANGED
@@ -24,6 +24,13 @@ logging.basicConfig(
         logging.FileHandler('app.log', mode='a', encoding='utf-8')
     ]
 )
+
+# 控制第三方库的日志级别,避免HTTP请求日志刷屏
+logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
+logging.getLogger("uvicorn").setLevel(logging.WARNING)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("httpcore").setLevel(logging.WARNING)
+
 logger = logging.getLogger(__name__)
 
 # 启动日志
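The block added here quiets per-request HTTP logging from uvicorn/httpx/httpcore by raising those loggers to WARNING, while the app's own logger keeps writing to app.log. A minimal standalone sketch of the same pattern, with the logger names taken from the diff and the basicConfig level assumed:

import logging

# App logging: console plus an append-mode file, as in app.py
logging.basicConfig(
    level=logging.INFO,  # assumed level; the diff does not show it
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("app.log", mode="a", encoding="utf-8"),
    ],
)

# Raise noisy third-party loggers to WARNING so per-request HTTP logs
# no longer flood app.log
for name in ("uvicorn.access", "uvicorn", "httpx", "httpcore"):
    logging.getLogger(name).setLevel(logging.WARNING)

logger = logging.getLogger(__name__)
logger.info("app logging configured")  # still emitted at INFO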
@@ -138,15 +145,11 @@ class RayServeVoxCPMClient:
 
         try:
             start_time = time.time()
-            logger.info(f"📁 处理音频文件: {prompt_wav}")
 
             # 将音频文件转换为base64
             convert_start = time.time()
             audio_base64 = self._audio_file_to_base64(prompt_wav)
             convert_time = time.time() - convert_start
-            logger.info(f"🔄 音频转base64耗时: {convert_time:.3f}秒")
-
-            logger.info("📡 调用Ray Serve ASR API...")
 
             # 构建ASR请求 - 匹配 voxcpm_api.py 格式
             asr_request = {
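Both the ASR path and the generate path ship the prompt audio as base64 via self._audio_file_to_base64(), which is not part of this diff. A plausible sketch of such a helper, assuming it simply base64-encodes the raw file bytes (hypothetical standalone name):

import base64
from pathlib import Path


def audio_file_to_base64(path: str) -> str:
    """Base64-encode the raw bytes of an audio file.

    Hypothetical stand-in for RayServeVoxCPMClient._audio_file_to_base64,
    which is not shown in this diff.
    """
    return base64.b64encode(Path(path).read_bytes()).decode("ascii")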
@@ -173,8 +176,6 @@ class RayServeVoxCPMClient:
             logger.info(f"⏱️ ASR API请求耗时: {api_time:.3f}秒")
             logger.info(f"⏱️ ASR总耗时: {total_time:.3f}秒")
             logger.info(f"🔍 完整的ASR响应: {result_data}")
-            logger.info(f"📊 ASR响应类型: {type(result_data)}")
-            logger.info(f"📊 ASR响应字段: {list(result_data.keys()) if isinstance(result_data, dict) else 'N/A'}")
 
             # 检查响应状态 - 基于实际响应格式,ASR有多种成功标识
             if isinstance(result_data, dict) and "text" in result_data and (
@@ -185,7 +186,6 @@ class RayServeVoxCPMClient:
                 return recognized_text
             else:
                 logger.warning(f"⚠️ ASR响应验证失败:")
-                logger.warning(f"  - 是否为字典: {isinstance(result_data, dict)}")
                 if isinstance(result_data, dict):
                     logger.warning(f"  - code字段: {result_data.get('code')}")
                     logger.warning(f"  - 是否有text字段: {'text' in result_data}")
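The retained validation accepts a dict response that carries a "text" field (the rest of the success condition is cut off in this view) and logs the code/text fields when validation fails. A small sketch of the visible check as a standalone helper (hypothetical name; the additional success flags are omitted):

from typing import Any, Optional


def extract_asr_text(result_data: Any) -> Optional[str]:
    """Return the recognized text if the ASR response looks valid, else None.

    Keeps only the check visible in the diff: the response must be a dict
    carrying a "text" field; the extra success flags are not reproduced here.
    """
    if isinstance(result_data, dict) and "text" in result_data:
        return result_data["text"]
    return None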
@@ -210,9 +210,7 @@ class RayServeVoxCPMClient:
     ) -> Tuple[int, np.ndarray]:
         """
         Call Ray Serve /generate API and return (sample_rate, waveform).
-        """
-        logger.info(f"🔥 调用Ray Serve生成API,文本: '{text[:60]}...'")
-
+        """
         try:
             start_time = time.time()
 
@@ -233,7 +231,6 @@ class RayServeVoxCPMClient:
                 convert_start = time.time()
                 audio_base64 = self._audio_file_to_base64(prompt_wav_path)
                 convert_time = time.time() - convert_start
-                logger.info(f"🔄 参考音频转base64耗时: {convert_time:.3f}秒")
 
                 request_data.update({
                     "prompt_wav": audio_base64,
@@ -242,10 +239,6 @@ class RayServeVoxCPMClient:
             else:
                 logger.info("🎤 使用默认语音模式")
             prepare_time = time.time() - prepare_start
-            logger.info(f"⏱️ 请求数据准备耗时: {prepare_time:.3f}秒")
-
-            logger.info(f"📡 发送请求到Ray Serve: {self.api_url}/generate")
-            logger.info(f"📊 请求参数: CFG={cfg_value}, 推理步数={inference_timesteps}, 文本长度={len(text)}")
 
             # 调用生成接口
             api_start = time.time()
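After preparing request_data (including the base64 prompt audio when voice cloning is used), the client posts to {self.api_url}/generate. A hedged sketch of that call with requests; apart from "prompt_wav", the exact JSON field names are not visible in the diff and are assumptions:

from typing import Optional

import requests


def call_generate(api_url: str, text: str, cfg_value: float,
                  inference_timesteps: int,
                  prompt_wav_b64: Optional[str] = None) -> dict:
    """Post a generation request to a Ray Serve /generate endpoint.

    Only the "prompt_wav" key is confirmed by the diff; the other field
    names are illustrative assumptions.
    """
    request_data = {
        "text": text,
        "cfg_value": cfg_value,
        "inference_timesteps": inference_timesteps,
    }
    if prompt_wav_b64:
        # base64-encoded reference audio for voice cloning
        request_data["prompt_wav"] = prompt_wav_b64
    response = requests.post(f"{api_url}/generate", json=request_data, timeout=300)
    response.raise_for_status()
    return response.json()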
@@ -259,16 +252,11 @@ class RayServeVoxCPMClient:
             api_time = time.time() - api_start
 
             result_data = response.json()
-            logger.info(f"⏱️ TTS API请求耗时: {api_time:.3f}秒")
-            logger.info(f"🔍 完整的Ray Serve响应: {result_data}")
-            logger.info(f"📊 响应类型: {type(result_data)}")
-            logger.info(f"📊 响应字段: {list(result_data.keys()) if isinstance(result_data, dict) else 'N/A'}")
 
             # 检查响应状态 - 基于实际响应格式,TTS响应没有code字段,只检查data
             if isinstance(result_data, dict) and "data" in result_data and isinstance(result_data["data"], str) and result_data["data"]:
                 # 成功生成音频
                 audio_base64 = result_data["data"]
-                logger.info(f"✅ 找到音频数据,base64长度: {len(audio_base64)}")
 
                 # 将base64音频转换为numpy数组
                 decode_start = time.time()
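On success the response's "data" field holds base64-encoded audio, which is then decoded into a numpy waveform before the remaining timing log. The decoder itself is not in this diff; one common approach, assuming the payload is a WAV-like container that soundfile can read:

import base64
import io
from typing import Tuple

import numpy as np
import soundfile as sf


def decode_base64_audio(audio_base64: str) -> Tuple[int, np.ndarray]:
    """Decode base64-encoded audio into (sample_rate, waveform).

    Assumes the payload is a container soundfile understands (e.g. WAV);
    the decoder actually used by app.py is not part of this diff.
    """
    audio_bytes = base64.b64decode(audio_base64)
    waveform, sample_rate = sf.read(io.BytesIO(audio_bytes), dtype="float32")
    return sample_rate, waveform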
@@ -276,10 +264,6 @@ class RayServeVoxCPMClient:
                 decode_time = time.time() - decode_start
                 total_time = time.time() - start_time
 
-                duration_ms = result_data.get('addition', {}).get('duration', 'unknown')
-                logger.info(f"🔄 音频解码耗时: {decode_time:.3f}秒")
-                logger.info(f"⏱️ TTS总耗时: {total_time:.3f}秒")
-                logger.info(f"🎵 音频生成成功,采样率: {sample_rate}, 时长: {duration_ms}ms")
                 logger.info(f"📈 性能指标: API={api_time:.3f}s, 解码={decode_time:.3f}s, 总计={total_time:.3f}s")
 
                 return sample_rate, audio_array
@@ -315,14 +299,12 @@ class RayServeVoxCPMClient:
         denoise: bool = True,
     ) -> Tuple[int, np.ndarray]:
         logger.info("🎤 开始TTS音频生成...")
-        logger.info(f"📝 输入文本: '{text_input
-        logger.info(f"
-        logger.info(f"📄 参考文本: '{prompt_text_input[:30]}{'...' if prompt_text_input and len(prompt_text_input) > 30 else ''}' " if prompt_text_input else "无")
+        logger.info(f"📝 输入文本: '{text_input}'")
+        logger.info(f"📄 参考文本: '{prompt_text_input}' " if prompt_text_input else "无")
         logger.info(f"⚙️ CFG值: {cfg_value_input}, 推理步数: {inference_timesteps_input}")
-        logger.info(f"🔧
+        logger.info(f"🔧 文本正则: {do_normalize}, 音频降噪: {denoise}")
 
         try:
-            full_start_time = time.time()
 
             text = (text_input or "").strip()
             if len(text) == 0:
@@ -334,8 +316,6 @@ class RayServeVoxCPMClient:
             cfg_value = cfg_value_input if cfg_value_input is not None else 2.0
             inference_timesteps = inference_timesteps_input if inference_timesteps_input is not None else 10
 
-            logger.info("🚀 调用Ray Serve TTS生成引擎...")
-            generate_start = time.time()
             sr, wav_np = self._call_ray_serve_generate(
                 text=text,
                 prompt_wav_path=prompt_wav_path,
@@ -345,11 +325,7 @@ class RayServeVoxCPMClient:
                 do_normalize=do_normalize,
                 denoise=denoise,
             )
-            generate_time = time.time() - generate_start
-            full_time = time.time() - full_start_time
 
-            logger.info(f"✅ TTS生成完成,采样率: {sr}, 音频长度: {len(wav_np) if hasattr(wav_np, '__len__') else 'unknown'}")
-            logger.info(f"🏁 完整TTS流程耗时: {full_time:.3f}秒 (生成={generate_time:.3f}s)")
             return (sr, wav_np)
 
         except Exception as e:
@@ -403,7 +379,7 @@ def create_demo_interface(client: RayServeVoxCPMClient):
         gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/voxcpm-logo.png" alt="VoxCPM Logo"></div>')
 
         # Quick Start
-        with gr.Accordion("📋 Quick Start Guide
+        with gr.Accordion("📋 Quick Start Guide |快速入门", open=False, elem_id="acc_quick"):
            gr.Markdown("""
            ### How to Use |使用说明
            1. **(Optional) Provide a Voice Prompt** - Upload or record an audio clip to provide the desired voice characteristics for synthesis.
@@ -417,7 +393,7 @@ def create_demo_interface(client: RayServeVoxCPMClient):
            """)
 
         # Pro Tips
-        with gr.Accordion("💡 Pro Tips |使用建议", open=False):
+        with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
            gr.Markdown("""
            ### Prompt Speech Enhancement|参考语音降噪
            - **Enable** to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component.
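Both guide sections now carry explicit elem_id values (acc_quick, acc_tips) and stay collapsed by default. A minimal sketch of that collapsed-accordion pattern; the elem_ids are presumably referenced by CSS or JS elsewhere in app.py, which is not shown here:

import gradio as gr

with gr.Blocks() as demo:
    # Collapsed help sections with stable element ids, as in the diff
    with gr.Accordion("📋 Quick Start Guide |快速入门", open=False, elem_id="acc_quick"):
        gr.Markdown("### How to Use |使用说明")
    with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
        gr.Markdown("### Prompt Speech Enhancement|参考语音降噪")

if __name__ == "__main__":
    demo.launch()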
@@ -442,19 +418,16 @@ def create_demo_interface(client: RayServeVoxCPMClient):
            **调低**:合成速度更快。
            - **Higher** for better synthesis quality.
            **调高**:合成质量更佳。
-
-            ### Long Text (e.g., >5 min speech)|长文本 (如 >5分钟的合成语音)
-            While VoxCPM can handle long texts directly, we recommend using empty lines to break very long content into paragraphs; the model will then synthesize each paragraph individually.
-            虽然 VoxCPM 支持直接生成长文本,但如果目标文本过长,我们建议使用换行符将内容分段;模型将对每个段落分别合成。
            """)
 
+        # Main controls
         with gr.Row():
             with gr.Column():
                 prompt_wav = gr.Audio(
                     sources=["upload", 'microphone'],
                     type="filepath",
-                    label="Prompt Speech",
-                    value="examples/example.wav"
+                    label="Prompt Speech (Optional, or let VoxCPM improvise)",
+                    value="./examples/example.wav",
                 )
                 DoDenoisePromptAudio = gr.Checkbox(
                     value=False,
@@ -489,16 +462,15 @@ def create_demo_interface(client: RayServeVoxCPMClient):
                 )
         with gr.Row():
             text = gr.Textbox(
-                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly
+                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.",
                 label="Target Text",
-                info="Default processing splits text on \\n into paragraphs; each is synthesized as a chunk and then concatenated into the final audio."
             )
         with gr.Row():
             DoNormalizeText = gr.Checkbox(
                 value=False,
                 label="Text Normalization",
                 elem_id="chk_normalize",
-                info="We use
+                info="We use wetext library to normalize the input text."
             )
         audio_output = gr.Audio(label="Output Audio")
 