刘鑫 committed on
Commit
c1e01fa
·
1 Parent(s): 5360805

change some description

Browse files
Files changed (1) hide show
  1. app.py +18 -46
app.py CHANGED
@@ -24,6 +24,13 @@ logging.basicConfig(
24
  logging.FileHandler('app.log', mode='a', encoding='utf-8')
25
  ]
26
  )
 
 
 
 
 
 
 
27
  logger = logging.getLogger(__name__)
28
 
29
  # 启动日志
@@ -138,15 +145,11 @@ class RayServeVoxCPMClient:
138
 
139
  try:
140
  start_time = time.time()
141
- logger.info(f"📁 处理音频文件: {prompt_wav}")
142
 
143
  # 将音频文件转换为base64
144
  convert_start = time.time()
145
  audio_base64 = self._audio_file_to_base64(prompt_wav)
146
  convert_time = time.time() - convert_start
147
- logger.info(f"🔄 音频转base64耗时: {convert_time:.3f}秒")
148
-
149
- logger.info("📡 调用Ray Serve ASR API...")
150
 
151
  # 构建ASR请求 - 匹配 voxcpm_api.py 格式
152
  asr_request = {
@@ -173,8 +176,6 @@ class RayServeVoxCPMClient:
173
  logger.info(f"⏱️ ASR API请求耗时: {api_time:.3f}秒")
174
  logger.info(f"⏱️ ASR总耗时: {total_time:.3f}秒")
175
  logger.info(f"🔍 完整的ASR响应: {result_data}")
176
- logger.info(f"📊 ASR响应类型: {type(result_data)}")
177
- logger.info(f"📊 ASR响应字段: {list(result_data.keys()) if isinstance(result_data, dict) else 'N/A'}")
178
 
179
  # 检查响应状态 - 基于实际响应格式,ASR有多种成功标识
180
  if isinstance(result_data, dict) and "text" in result_data and (
@@ -185,7 +186,6 @@ class RayServeVoxCPMClient:
185
  return recognized_text
186
  else:
187
  logger.warning(f"⚠️ ASR响应验证失败:")
188
- logger.warning(f" - 是否为字典: {isinstance(result_data, dict)}")
189
  if isinstance(result_data, dict):
190
  logger.warning(f" - code字段: {result_data.get('code')}")
191
  logger.warning(f" - 是否有text字段: {'text' in result_data}")
@@ -210,9 +210,7 @@ class RayServeVoxCPMClient:
210
  ) -> Tuple[int, np.ndarray]:
211
  """
212
  Call Ray Serve /generate API and return (sample_rate, waveform).
213
- """
214
- logger.info(f"🔥 调用Ray Serve生成API,文本: '{text[:60]}...'")
215
-
216
  try:
217
  start_time = time.time()
218
 
@@ -233,7 +231,6 @@ class RayServeVoxCPMClient:
233
  convert_start = time.time()
234
  audio_base64 = self._audio_file_to_base64(prompt_wav_path)
235
  convert_time = time.time() - convert_start
236
- logger.info(f"🔄 参考音频转base64耗时: {convert_time:.3f}秒")
237
 
238
  request_data.update({
239
  "prompt_wav": audio_base64,
@@ -242,10 +239,6 @@ class RayServeVoxCPMClient:
242
  else:
243
  logger.info("🎤 使用默认语音模式")
244
  prepare_time = time.time() - prepare_start
245
- logger.info(f"⏱️ 请求数据准备耗时: {prepare_time:.3f}秒")
246
-
247
- logger.info(f"📡 发送请求到Ray Serve: {self.api_url}/generate")
248
- logger.info(f"📊 请求参数: CFG={cfg_value}, 推理步数={inference_timesteps}, 文本长度={len(text)}")
249
 
250
  # 调用生成接口
251
  api_start = time.time()
@@ -259,16 +252,11 @@ class RayServeVoxCPMClient:
259
  api_time = time.time() - api_start
260
 
261
  result_data = response.json()
262
- logger.info(f"⏱️ TTS API请求耗时: {api_time:.3f}秒")
263
- logger.info(f"🔍 完整的Ray Serve响应: {result_data}")
264
- logger.info(f"📊 响应类型: {type(result_data)}")
265
- logger.info(f"📊 响应字段: {list(result_data.keys()) if isinstance(result_data, dict) else 'N/A'}")
266
 
267
  # 检查响应状态 - 基于实际响应格式,TTS响应没有code字段,只检查data
268
  if isinstance(result_data, dict) and "data" in result_data and isinstance(result_data["data"], str) and result_data["data"]:
269
  # 成功生成音频
270
  audio_base64 = result_data["data"]
271
- logger.info(f"✅ 找到音频数据,base64长度: {len(audio_base64)}")
272
 
273
  # 将base64音频转换为numpy数组
274
  decode_start = time.time()
@@ -276,10 +264,6 @@ class RayServeVoxCPMClient:
276
  decode_time = time.time() - decode_start
277
  total_time = time.time() - start_time
278
 
279
- duration_ms = result_data.get('addition', {}).get('duration', 'unknown')
280
- logger.info(f"🔄 音频解码耗时: {decode_time:.3f}秒")
281
- logger.info(f"⏱️ TTS总耗时: {total_time:.3f}秒")
282
- logger.info(f"🎵 音频生成成功,采样率: {sample_rate}, 时长: {duration_ms}ms")
283
  logger.info(f"📈 性能指标: API={api_time:.3f}s, 解码={decode_time:.3f}s, 总计={total_time:.3f}s")
284
 
285
  return sample_rate, audio_array
@@ -315,14 +299,12 @@ class RayServeVoxCPMClient:
315
  denoise: bool = True,
316
  ) -> Tuple[int, np.ndarray]:
317
  logger.info("🎤 开始TTS音频生成...")
318
- logger.info(f"📝 输入文本: '{text_input[:60]}{'...' if len(text_input) > 60 else ''}'")
319
- logger.info(f"🎵 参考音频: {prompt_wav_path_input or ''}")
320
- logger.info(f"📄 参考文本: '{prompt_text_input[:30]}{'...' if prompt_text_input and len(prompt_text_input) > 30 else ''}' " if prompt_text_input else "无")
321
  logger.info(f"⚙️ CFG值: {cfg_value_input}, 推理步数: {inference_timesteps_input}")
322
- logger.info(f"🔧 文本正规化: {do_normalize}, 音频降噪: {denoise}")
323
 
324
  try:
325
- full_start_time = time.time()
326
 
327
  text = (text_input or "").strip()
328
  if len(text) == 0:
@@ -334,8 +316,6 @@ class RayServeVoxCPMClient:
334
  cfg_value = cfg_value_input if cfg_value_input is not None else 2.0
335
  inference_timesteps = inference_timesteps_input if inference_timesteps_input is not None else 10
336
 
337
- logger.info("🚀 调用Ray Serve TTS生成引擎...")
338
- generate_start = time.time()
339
  sr, wav_np = self._call_ray_serve_generate(
340
  text=text,
341
  prompt_wav_path=prompt_wav_path,
@@ -345,11 +325,7 @@ class RayServeVoxCPMClient:
345
  do_normalize=do_normalize,
346
  denoise=denoise,
347
  )
348
- generate_time = time.time() - generate_start
349
- full_time = time.time() - full_start_time
350
 
351
- logger.info(f"✅ TTS生成完成,采样率: {sr}, 音频长度: {len(wav_np) if hasattr(wav_np, '__len__') else 'unknown'}")
352
- logger.info(f"🏁 完整TTS流程耗时: {full_time:.3f}秒 (生成={generate_time:.3f}s)")
353
  return (sr, wav_np)
354
 
355
  except Exception as e:
@@ -403,7 +379,7 @@ def create_demo_interface(client: RayServeVoxCPMClient):
403
  gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/voxcpm-logo.png" alt="VoxCPM Logo"></div>')
404
 
405
  # Quick Start
406
- with gr.Accordion("📋 Quick Start Guide | 快速入门", open=False):
407
  gr.Markdown("""
408
  ### How to Use |使用说明
409
  1. **(Optional) Provide a Voice Prompt** - Upload or record an audio clip to provide the desired voice characteristics for synthesis.
@@ -417,7 +393,7 @@ def create_demo_interface(client: RayServeVoxCPMClient):
417
  """)
418
 
419
  # Pro Tips
420
- with gr.Accordion("💡 Pro Tips |使用建议", open=False):
421
  gr.Markdown("""
422
  ### Prompt Speech Enhancement|参考语音降噪
423
  - **Enable** to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component.
@@ -442,19 +418,16 @@ def create_demo_interface(client: RayServeVoxCPMClient):
442
  **调低**:合成速度更快。
443
  - **Higher** for better synthesis quality.
444
  **调高**:合成质量更佳。
445
-
446
- ### Long Text (e.g., >5 min speech)|长文本 (如 >5分钟的合成语音)
447
- While VoxCPM can handle long texts directly, we recommend using empty lines to break very long content into paragraphs; the model will then synthesize each paragraph individually.
448
- 虽然 VoxCPM 支持直接生成长文本,但如果目标文本过长,我们建议使用换行符将内容分段;模型将对每个段落分别合成。
449
  """)
450
 
 
451
  with gr.Row():
452
  with gr.Column():
453
  prompt_wav = gr.Audio(
454
  sources=["upload", 'microphone'],
455
  type="filepath",
456
- label="Prompt Speech",
457
- value="examples/example.wav"
458
  )
459
  DoDenoisePromptAudio = gr.Checkbox(
460
  value=False,
@@ -489,16 +462,15 @@ def create_demo_interface(client: RayServeVoxCPMClient):
489
  )
490
  with gr.Row():
491
  text = gr.Textbox(
492
- value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
493
  label="Target Text",
494
- info="Default processing splits text on \\n into paragraphs; each is synthesized as a chunk and then concatenated into the final audio."
495
  )
496
  with gr.Row():
497
  DoNormalizeText = gr.Checkbox(
498
  value=False,
499
  label="Text Normalization",
500
  elem_id="chk_normalize",
501
- info="We use WeTextPorcessing library to normalize the input text."
502
  )
503
  audio_output = gr.Audio(label="Output Audio")
504
 
 
24
  logging.FileHandler('app.log', mode='a', encoding='utf-8')
25
  ]
26
  )
27
+
28
+ # 控制第三方库的日志级别,避免HTTP请求日志刷屏
29
+ logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
30
+ logging.getLogger("uvicorn").setLevel(logging.WARNING)
31
+ logging.getLogger("httpx").setLevel(logging.WARNING)
32
+ logging.getLogger("httpcore").setLevel(logging.WARNING)
33
+
34
  logger = logging.getLogger(__name__)
35
 
36
  # 启动日志
 
145
 
146
  try:
147
  start_time = time.time()
 
148
 
149
  # 将音频文件转换为base64
150
  convert_start = time.time()
151
  audio_base64 = self._audio_file_to_base64(prompt_wav)
152
  convert_time = time.time() - convert_start
 
 
 
153
 
154
  # 构建ASR请求 - 匹配 voxcpm_api.py 格式
155
  asr_request = {
 
176
  logger.info(f"⏱️ ASR API请求耗时: {api_time:.3f}秒")
177
  logger.info(f"⏱️ ASR总耗时: {total_time:.3f}秒")
178
  logger.info(f"🔍 完整的ASR响应: {result_data}")
 
 
179
 
180
  # 检查响应状态 - 基于实际响应格式,ASR有多种成功标识
181
  if isinstance(result_data, dict) and "text" in result_data and (
 
186
  return recognized_text
187
  else:
188
  logger.warning(f"⚠️ ASR响应验证失败:")
 
189
  if isinstance(result_data, dict):
190
  logger.warning(f" - code字段: {result_data.get('code')}")
191
  logger.warning(f" - 是否有text字段: {'text' in result_data}")
 
210
  ) -> Tuple[int, np.ndarray]:
211
  """
212
  Call Ray Serve /generate API and return (sample_rate, waveform).
213
+ """
 
 
214
  try:
215
  start_time = time.time()
216
 
 
231
  convert_start = time.time()
232
  audio_base64 = self._audio_file_to_base64(prompt_wav_path)
233
  convert_time = time.time() - convert_start
 
234
 
235
  request_data.update({
236
  "prompt_wav": audio_base64,
 
239
  else:
240
  logger.info("🎤 使用默认语音模式")
241
  prepare_time = time.time() - prepare_start
 
 
 
 
242
 
243
  # 调用生成接口
244
  api_start = time.time()
 
252
  api_time = time.time() - api_start
253
 
254
  result_data = response.json()
 
 
 
 
255
 
256
  # 检查响应状态 - 基于实际响应格式,TTS响应没有code字段,只检查data
257
  if isinstance(result_data, dict) and "data" in result_data and isinstance(result_data["data"], str) and result_data["data"]:
258
  # 成功生成音频
259
  audio_base64 = result_data["data"]
 
260
 
261
  # 将base64音频转换为numpy数组
262
  decode_start = time.time()
 
264
  decode_time = time.time() - decode_start
265
  total_time = time.time() - start_time
266
 
 
 
 
 
267
  logger.info(f"📈 性能指标: API={api_time:.3f}s, 解码={decode_time:.3f}s, 总计={total_time:.3f}s")
268
 
269
  return sample_rate, audio_array
 
299
  denoise: bool = True,
300
  ) -> Tuple[int, np.ndarray]:
301
  logger.info("🎤 开始TTS音频生成...")
302
+ logger.info(f"📝 输入文本: '{text_input}'")
303
+ logger.info(f"📄 参考文本: '{prompt_text_input}' " if prompt_text_input else "无")
 
304
  logger.info(f"⚙️ CFG值: {cfg_value_input}, 推理步数: {inference_timesteps_input}")
305
+ logger.info(f"🔧 文本正则: {do_normalize}, 音频降噪: {denoise}")
306
 
307
  try:
 
308
 
309
  text = (text_input or "").strip()
310
  if len(text) == 0:
 
316
  cfg_value = cfg_value_input if cfg_value_input is not None else 2.0
317
  inference_timesteps = inference_timesteps_input if inference_timesteps_input is not None else 10
318
 
 
 
319
  sr, wav_np = self._call_ray_serve_generate(
320
  text=text,
321
  prompt_wav_path=prompt_wav_path,
 
325
  do_normalize=do_normalize,
326
  denoise=denoise,
327
  )
 
 
328
 
 
 
329
  return (sr, wav_np)
330
 
331
  except Exception as e:
 
379
  gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/voxcpm-logo.png" alt="VoxCPM Logo"></div>')
380
 
381
  # Quick Start
382
+ with gr.Accordion("📋 Quick Start Guide |快速入门", open=False, elem_id="acc_quick"):
383
  gr.Markdown("""
384
  ### How to Use |使用说明
385
  1. **(Optional) Provide a Voice Prompt** - Upload or record an audio clip to provide the desired voice characteristics for synthesis.
 
393
  """)
394
 
395
  # Pro Tips
396
+ with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
397
  gr.Markdown("""
398
  ### Prompt Speech Enhancement|参考语音降噪
399
  - **Enable** to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component.
 
418
  **调低**:合成速度更快。
419
  - **Higher** for better synthesis quality.
420
  **调高**:合成质量更佳。
 
 
 
 
421
  """)
422
 
423
+ # Main controls
424
  with gr.Row():
425
  with gr.Column():
426
  prompt_wav = gr.Audio(
427
  sources=["upload", 'microphone'],
428
  type="filepath",
429
+ label="Prompt Speech (Optional, or let VoxCPM improvise)",
430
+ value="./examples/example.wav",
431
  )
432
  DoDenoisePromptAudio = gr.Checkbox(
433
  value=False,
 
462
  )
463
  with gr.Row():
464
  text = gr.Textbox(
465
+ value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.",
466
  label="Target Text",
 
467
  )
468
  with gr.Row():
469
  DoNormalizeText = gr.Checkbox(
470
  value=False,
471
  label="Text Normalization",
472
  elem_id="chk_normalize",
473
+ info="We use wetext library to normalize the input text."
474
  )
475
  audio_output = gr.Audio(label="Output Audio")
476