Spaces:
Sleeping
Sleeping
Add song generation backend: Demucs vocal separation + voice synthesis + audio mixing
Browse files
backend/app/routes.py
CHANGED
|
@@ -407,3 +407,146 @@ def get_waveform(audio_filename):
|
|
| 407 |
err_msg = f'Failed to generate waveform: {str(e)}'
|
| 408 |
return jsonify({'error': err_msg}), 500
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
err_msg = f'Failed to generate waveform: {str(e)}'
|
| 408 |
return jsonify({'error': err_msg}), 500
|
| 409 |
|
| 410 |
+
|
| 411 |
+
# ============================================================================
|
| 412 |
+
# SONG GENERATION ENDPOINTS
|
| 413 |
+
# ============================================================================
|
| 414 |
+
|
| 415 |
+
@bp.route('/convert_song', methods=['POST'])
def convert_song():
    """
    Convert an uploaded song to an enrolled user's voice.

    Form data:
    - song: audio file (mp3, wav, etc.)
    - voice_id: ID of enrolled voice to use
    - language: 'english' or 'hindi' (defaults to 'english')
    - add_effects: 'true' or 'false' to add reverb/compression (defaults to 'true')

    Returns: JSON with 'audio_url' and 'filename' of the generated song (200),
    or a JSON {'error': ...} body with status 400 (bad input), 404 (unknown
    voice / missing voice file) or 500 (processing failure).
    """
    # Local imports keep this handler self-contained regardless of what the
    # top of the module already imports.
    import os
    import traceback

    # Path of the temporary upload; stays None until the file is written so
    # the `finally` clause knows whether there is anything to remove.
    song_path = None
    try:
        print("\n[API] POST /api/convert_song")

        # Validate input
        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400

        if 'voice_id' not in request.form:
            return jsonify({'error': 'No voice_id provided'}), 400

        song_file = request.files['song']
        voice_id = request.form.get('voice_id')
        language = request.form.get('language', 'english')
        add_effects = request.form.get('add_effects', 'true').lower() == 'true'

        # An empty filename means the form field was submitted without a file.
        if not song_file.filename or not allowed_file(song_file.filename):
            return jsonify({'error': f'File type not allowed. Allowed: {ALLOWED_EXTENSIONS}'}), 400

        # Load voices database
        voices_db = load_voices_db()
        voice_data = next((v for v in voices_db if v['id'] == voice_id), None)

        if not voice_data:
            return jsonify({'error': f'Voice {voice_id} not found'}), 404

        # Resolve the voice sample BEFORE writing the upload to disk, so a
        # 404 here does not leave an orphaned song file behind.
        voice_filepath = UPLOAD_FOLDER / voice_data['filename']
        if not voice_filepath.exists():
            return jsonify({'error': 'Voice file not found'}), 404

        # Save uploaded song under a random name, keeping its real extension
        # instead of labelling every upload ".wav".
        # NOTE(review): assumes allowed_file() has validated the extension.
        extension = os.path.splitext(song_file.filename)[1].lower() or '.wav'
        song_filename = f"song_{uuid.uuid4().hex}{extension}"
        song_path = OUTPUT_FOLDER / song_filename
        song_file.save(song_path)
        print(f"[API] Song saved: {song_path}")

        # Output path
        output_filename = f"converted_song_{uuid.uuid4().hex}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        print(f"[API] Starting song conversion...")
        print(f"[API] Language: {language}")
        print(f"[API] Add effects: {add_effects}")

        # Imported lazily so the heavy song-conversion stack is only loaded
        # when this endpoint is actually used.
        from app.song_conversion.song_processor import SongProcessor

        processor = SongProcessor(MODELS_DIR)
        result_path = processor.convert_song(
            song_path=song_path,
            voice_path=voice_filepath,
            output_path=output_path,
            language=language,
            add_effects=add_effects,
            models_dir=MODELS_DIR
        )

        print(f"[API] Song conversion complete: {result_path}")

        # Return download URL
        return jsonify({
            'success': True,
            'message': 'Song converted successfully',
            'audio_url': f'/api/audio/{output_filename}',
            'filename': output_filename
        }), 200

    except Exception as e:
        print(f"[API] ✗ Error in convert_song: {e}")
        traceback.print_exc()
        # NOTE(review): str(e) is returned to the client and may expose
        # internal paths; kept because clients may display this message.
        return jsonify({'error': str(e)}), 500

    finally:
        # The uploaded song is only an intermediate input; remove it on both
        # success and failure so OUTPUT_FOLDER does not grow per request.
        if song_path is not None and os.path.exists(song_path):
            try:
                os.remove(song_path)
            except OSError as cleanup_error:
                print(f"[API] Warning: could not remove {song_path}: {cleanup_error}")
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
@bp.route('/separate_vocals', methods=['POST'])
def separate_vocals():
    """
    Separate vocals from a song file.

    Form data:
    - song: audio file

    Returns: JSON with vocal and instrumental file URLs and file names (200),
    or a JSON {'error': ...} body with status 400 (bad input) or 500
    (processing failure).
    """
    # Local imports keep this handler self-contained regardless of what the
    # top of the module already imports.
    import os
    import traceback

    # Path of the temporary upload; stays None until the file is written so
    # the `finally` clause knows whether there is anything to remove.
    song_path = None
    try:
        print("\n[API] POST /api/separate_vocals")

        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400

        song_file = request.files['song']

        # An empty filename means the form field was submitted without a file.
        if not song_file.filename or not allowed_file(song_file.filename):
            return jsonify({'error': 'File type not allowed'}), 400

        # Save uploaded song under a random name, keeping its real extension
        # instead of labelling every upload ".wav".
        # NOTE(review): assumes allowed_file() has validated the extension.
        extension = os.path.splitext(song_file.filename)[1].lower() or '.wav'
        song_filename = f"song_{uuid.uuid4().hex}{extension}"
        song_path = OUTPUT_FOLDER / song_filename
        song_file.save(song_path)

        print(f"[API] Song saved: {song_path}")
        print(f"[API] Separating vocals...")

        # Imported lazily so Demucs is only loaded when this endpoint is used.
        from app.song_conversion.vocal_separator import VocalSeparator

        separator = VocalSeparator()
        vocals_path, instrumental_path = separator.separate_and_save(
            song_path,
            OUTPUT_FOLDER,
            sr=16000
        )

        return jsonify({
            'success': True,
            'vocals_url': f'/api/audio/{vocals_path.name}',
            'instrumental_url': f'/api/audio/{instrumental_path.name}',
            'vocals_file': vocals_path.name,
            'instrumental_file': instrumental_path.name
        }), 200

    except Exception as e:
        print(f"[API] ✗ Error in separate_vocals: {e}")
        traceback.print_exc()
        # NOTE(review): str(e) is returned to the client and may expose
        # internal paths; kept because clients may display this message.
        return jsonify({'error': str(e)}), 500

    finally:
        # The uploaded song is only an intermediate input; remove it on both
        # success and failure so OUTPUT_FOLDER does not grow per request.
        if song_path is not None and os.path.exists(song_path):
            try:
                os.remove(song_path)
            except OSError as cleanup_error:
                print(f"[API] Warning: could not remove {song_path}: {cleanup_error}")
|
backend/app/song_conversion/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Song conversion module for voice-to-song transformation."""
|
| 2 |
+
|
| 3 |
+
from .vocal_separator import VocalSeparator
|
| 4 |
+
from .audio_mixer import AudioMixer
|
| 5 |
+
from .song_processor import SongProcessor
|
| 6 |
+
|
| 7 |
+
__all__ = ['VocalSeparator', 'AudioMixer', 'SongProcessor']
|
backend/app/song_conversion/audio_mixer.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Audio mixing and effects for song generation."""
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import soundfile as sf
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Tuple, Optional
|
| 7 |
+
import subprocess
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AudioMixer:
    """Mixes vocals with instrumental and applies effects.

    All methods are static and operate on NumPy sample arrays. None of the
    processing methods modify the arrays passed in; they return new arrays.
    """

    @staticmethod
    def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
        """
        Peak-normalize audio to a target dB level.

        Args:
            audio: Audio array
            target_db: Target peak level in dB relative to full scale (1.0)

        Returns:
            A new array scaled so its absolute peak equals the target level,
            clipped to [-1.0, 1.0]. Silent (all-zero) and empty input is
            returned unscaled.
        """
        audio = np.asarray(audio)

        # np.max raises on an empty array; there is nothing to scale anyway.
        if audio.size == 0:
            return audio.copy()

        # Convert dB to linear
        target_linear = 10 ** (target_db / 20.0)

        # Find current peak
        current_peak = np.max(np.abs(audio))

        if current_peak > 0:
            # Scale to target
            audio = audio * (target_linear / current_peak)

        # Clip to prevent distortion (only has an effect for target_db > 0)
        return np.clip(audio, -1.0, 1.0)

    @staticmethod
    def add_reverb(audio: np.ndarray, sr: int = 16000, room_scale: float = 0.3,
                   delay_ms: float = 50) -> np.ndarray:
        """
        Add a simple reverb effect (a single delayed, attenuated echo).

        Args:
            audio: Input audio
            sr: Sample rate
            room_scale: Reverb amount (0-1); gain applied to the echo
            delay_ms: Delay in milliseconds

        Returns:
            Audio with reverb, same length as the input. If the delay is at
            least as long as the audio, the input is returned unchanged in
            value (the echo falls entirely outside the signal).
        """
        audio = np.asarray(audio)
        total = len(audio)

        # Negative delays make no physical sense; treat them as "no delay".
        delay_samples = max(0, int((delay_ms / 1000.0) * sr))

        # Create delayed version
        delayed = np.zeros_like(audio)
        if delay_samples < total:
            # Slice by explicit end index: `audio[:-delay_samples]` is EMPTY
            # when delay_samples == 0 and would fail to broadcast.
            delayed[delay_samples:] = audio[:total - delay_samples]

        # Mix original with delayed
        return audio + room_scale * delayed

    @staticmethod
    def compress_audio(audio: np.ndarray, threshold: float = 0.6, ratio: float = 4.0) -> np.ndarray:
        """
        Apply dynamic range compression.

        Samples whose magnitude exceeds ``threshold`` have the excess divided
        by ``ratio``; quieter samples are untouched.

        Args:
            audio: Input audio (not modified)
            threshold: Compression threshold (0-1)
            ratio: Compression ratio, must be > 0

        Returns:
            Compressed audio as a new array

        Raises:
            ValueError: If ``ratio`` is not positive.
        """
        if ratio <= 0:
            raise ValueError(f"Compression ratio must be positive, got {ratio}")

        # Work on a copy so the caller's buffer is never altered. Integer
        # input is promoted to float so the fractional result is not truncated.
        compressed = np.array(audio, copy=True)
        if not np.issubdtype(compressed.dtype, np.floating):
            compressed = compressed.astype(np.float64)

        # Simple peak compression
        magnitude = np.abs(compressed)

        # Find samples above threshold
        loud = magnitude > threshold

        # Apply compression to loud parts, preserving each sample's sign
        compressed[loud] = np.sign(compressed[loud]) * (
            threshold + (magnitude[loud] - threshold) / ratio
        )

        return compressed

    @staticmethod
    def mix_audio(vocal: np.ndarray, instrumental: np.ndarray,
                  vocal_level: float = 0.7, instrumental_level: float = 0.3,
                  add_reverb: bool = True, add_compression: bool = True,
                  sr: int = 16000) -> np.ndarray:
        """
        Mix vocals and instrumental with effects.

        Both tracks are peak-normalized to -6 dB, the vocal optionally gets
        reverb and compression, the tracks are truncated to the shorter
        length, summed with the given levels and the mix is normalized to
        -3 dB.

        Args:
            vocal: Vocal audio
            instrumental: Instrumental audio
            vocal_level: Vocal volume level (0-1)
            instrumental_level: Instrumental volume level (0-1)
            add_reverb: Whether to add reverb to vocals
            add_compression: Whether to add compression to vocals
            sr: Sample rate

        Returns:
            Mixed audio, as long as the shorter of the two inputs
        """
        print("[AudioMixer] Normalizing tracks...")

        # Normalize individual tracks to the same headroom before mixing
        vocal = AudioMixer.normalize_audio(vocal, -6.0)
        instrumental = AudioMixer.normalize_audio(instrumental, -6.0)

        print("[AudioMixer] Adding effects...")

        # Add reverb to vocals
        if add_reverb:
            vocal = AudioMixer.add_reverb(vocal, sr, room_scale=0.2, delay_ms=40)

        # Apply compression
        if add_compression:
            vocal = AudioMixer.compress_audio(vocal, threshold=0.5, ratio=3.0)

        print("[AudioMixer] Mixing tracks...")

        # Ensure same length (the longer track is cut, not padded)
        min_len = min(len(vocal), len(instrumental))
        vocal = vocal[:min_len]
        instrumental = instrumental[:min_len]

        # Mix with specified levels
        mixed = vocal_level * vocal + instrumental_level * instrumental

        # Normalize final mix
        mixed = AudioMixer.normalize_audio(mixed, -3.0)

        # Guard the peak computation: np.max raises on an empty mix.
        peak = float(np.max(np.abs(mixed))) if mixed.size else 0.0
        print(f"[AudioMixer] Mix complete - Peak: {peak:.4f}")

        return mixed

    @staticmethod
    def save_audio(audio: np.ndarray, output_path: Path, sr: int = 16000) -> None:
        """
        Save audio to file, creating the parent directory if needed.

        Args:
            audio: Audio array
            output_path: Output file path
            sr: Sample rate
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        print(f"[AudioMixer] Saving to {output_path}")
        sf.write(output_path, audio, sr)
        print(f"[AudioMixer] Saved successfully")

    @staticmethod
    def mix_and_save(vocal: np.ndarray, instrumental: np.ndarray,
                     output_path: Path, sr: int = 16000,
                     add_effects: bool = True) -> Path:
        """
        Mix audio and save to file.

        Args:
            vocal: Vocal audio
            instrumental: Instrumental audio
            output_path: Output file path
            sr: Sample rate
            add_effects: Whether to add effects (toggles both reverb and
                compression)

        Returns:
            Output file path
        """
        mixed = AudioMixer.mix_audio(
            vocal, instrumental,
            add_reverb=add_effects,
            add_compression=add_effects,
            sr=sr
        )

        AudioMixer.save_audio(mixed, output_path, sr)

        return Path(output_path)
|
backend/app/song_conversion/song_processor.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Main song processing orchestrator."""
|
| 2 |
+
|
| 3 |
+
import gc
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Optional
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
from app.song_conversion.vocal_separator import VocalSeparator
|
| 11 |
+
from app.song_conversion.audio_mixer import AudioMixer
|
| 12 |
+
from encoder import inference as encoder_infer
|
| 13 |
+
from synthesizer import inference as synthesizer_infer
|
| 14 |
+
from app.vocoder import inference as vocoder_infer
|
| 15 |
+
from synthesizer.hparams import hparams as syn_hp
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class SongProcessor:
    """Orchestrates the complete song voice conversion process."""

    def __init__(self, models_dir: Path):
        """
        Initialize song processor.

        Args:
            models_dir: Directory containing pre-trained models
        """
        self.models_dir = Path(models_dir)
        self.separator = None  # created on first use by _ensure_separator()
        self.sr = 16000  # sample rate used for separation and the final mix

    def _ensure_separator(self) -> "VocalSeparator":
        """Lazy load vocal separator and cache it on the instance."""
        if self.separator is None:
            print("[SongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator

    def _load_voice_models(self, models_dir: Path, language: str = 'english') -> "synthesizer_infer.Synthesizer":
        """
        Load voice cloning models.

        The encoder and vocoder are loaded into their modules; the
        synthesizer is returned as an object.

        Args:
            models_dir: Directory containing a "default" sub-directory with
                encoder.pt, synthesizer.pt and vocoder.pt
            language: Only used for logging here; the same "default" models
                are loaded for every language

        Returns:
            The synthesizer instance

        Raises:
            RuntimeError: If any of the three model files is missing.
        """
        print(f"[SongProcessor] Loading {language} voice models...")

        # Accept plain strings as well as Path objects.
        default_dir = Path(models_dir) / "default"
        enc_path = default_dir / "encoder.pt"
        syn_path = default_dir / "synthesizer.pt"
        voc_path = default_dir / "vocoder.pt"

        for path in [enc_path, syn_path, voc_path]:
            if not path.exists():
                raise RuntimeError(f"Model missing: {path}")

        encoder_infer.load_model(enc_path)
        print("[SongProcessor] Encoder loaded")

        synthesizer = synthesizer_infer.Synthesizer(syn_path)
        print("[SongProcessor] Synthesizer loaded")

        vocoder_infer.load_model(voc_path)
        print("[SongProcessor] Vocoder loaded")

        return synthesizer

    def _extract_lyrics_from_audio(self, audio_path: Path, voice_sample_path: Path) -> str:
        """
        Simple lyrics extraction (placeholder - returns generic text).
        In production, would use speech-to-text.

        Args:
            audio_path: Path to vocal audio (currently ignored)
            voice_sample_path: Path to reference voice (currently ignored)

        Returns:
            A fixed placeholder sentence
        """
        print("[SongProcessor] Extracting lyrics from audio...")

        # Placeholder: return generic phonetically rich text
        # In production, use Whisper or other STT model
        lyrics = "The music is playing so well with this song today"

        print(f"[SongProcessor] Using default lyrics: {lyrics}")
        return lyrics

    def _synthesize_vocal(self, synthesizer, lyrics: str, voice_path: Path) -> np.ndarray:
        """Synthesize ``lyrics`` in the voice of ``voice_path`` as float32, peak-normalized to 0.707."""
        wav = encoder_infer.preprocess_wav(voice_path)
        embed = encoder_infer.embed_utterance(wav)

        mels = synthesizer.synthesize_spectrograms([lyrics], [embed])
        mel = mels[0]

        print("[SongProcessor] Vocoding...")
        try:
            synthesized_vocal = vocoder_infer.infer_waveform(
                mel, normalize=True, batched=False, target=8000, overlap=800
            ).astype(np.float32)
        except Exception as e:
            # Deliberate best-effort: fall back to Griffin-Lim if the neural
            # vocoder fails.
            print(f"[SongProcessor] Vocoder failed: {e}, using Griffin-Lim fallback")
            synthesized_vocal = synthesizer.griffin_lim(mel).astype(np.float32)

        # Normalize synthesized vocal
        max_val = np.max(np.abs(synthesized_vocal))
        if max_val > 0:
            target_level = 0.707
            synthesized_vocal = synthesized_vocal * (target_level / max_val)
        synthesized_vocal = np.clip(synthesized_vocal, -1.0, 1.0)

        print(f"[SongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")
        return synthesized_vocal

    @staticmethod
    def _release_models() -> None:
        """Drop module-level model references and free cached GPU memory (best effort)."""
        print(f"\n[SongProcessor] Cleaning up models...")
        try:
            encoder_infer._model = None
            synthesizer_infer._model = None
            vocoder_infer._model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            # Cleanup must never mask the real outcome of the conversion.
            print(f"[SongProcessor] Warning during cleanup: {e}")

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True,
                     models_dir: Optional[Path] = None) -> Path:
        """
        Convert song to user's voice.

        Complete pipeline:
        1. Separate vocals from instrumental
        2. Extract lyrics from vocals (or use placeholder)
        3. Synthesize vocals using user's voice
        4. Mix synthesized vocals with instrumental
        5. Add audio effects

        Models are released afterwards whether the conversion succeeds or
        fails.

        Args:
            song_path: Path to input song
            voice_path: Path to reference voice sample
            output_path: Path for output song
            language: 'english' or 'hindi'
            add_effects: Whether to add reverb/compression
            models_dir: Directory with models (uses self.models_dir if None)

        Returns:
            Path to output song

        Raises:
            Exception: Any error from the pipeline is logged and re-raised.
        """
        if models_dir is None:
            models_dir = self.models_dir

        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)

        # Bound before the try block so `finally` can always drop it.
        synthesizer = None
        try:
            print(f"\n[SongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[SongProcessor] Song: {song_path}")
            print(f"[SongProcessor] Voice: {voice_path}")
            print(f"[SongProcessor] Language: {language}")
            print(f"[SongProcessor] Output: {output_path}")

            # Step 1: Separate vocals
            print(f"\n[SongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            # The original vocal stem is not used by the placeholder lyrics
            # step; only the instrumental is needed for the mix.
            _original_vocals, instrumental = separator.separate(song_path, sr=self.sr)

            # Step 2: Extract/prepare lyrics (using placeholder for now)
            print(f"\n[SongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path, voice_path)

            # Step 3: Load voice models
            print(f"\n[SongProcessor] STEP 3: Loading voice models...")
            synthesizer = self._load_voice_models(models_dir, language)

            # Step 4: Synthesize voice with your voice
            print(f"\n[SongProcessor] STEP 4: Synthesizing vocals with your voice...")
            # NOTE(review): assumes the synthesizer/vocoder output rate equals
            # self.sr (16000) - confirm against synthesizer.hparams.
            synthesized_vocal = self._synthesize_vocal(synthesizer, lyrics, voice_path)

            # Step 5: Mix with instrumental
            print(f"\n[SongProcessor] STEP 5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )

        except Exception as e:
            print(f"\n[SongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise

        finally:
            # Release models on failure as well as success, so a failed
            # request does not keep them resident in (GPU) memory.
            synthesizer = None
            self._release_models()

        print(f"\n[SongProcessor] ========== SONG CONVERSION COMPLETE ==========")
        print(f"[SongProcessor] Output saved to: {final_audio}")

        return final_audio
|
backend/app/song_conversion/vocal_separator.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Vocal separation using Demucs model."""
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
import librosa
|
| 6 |
+
import soundfile as sf
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Tuple
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
from demucs.pretrained import get_model
|
| 13 |
+
DEMUCS_AVAILABLE = True
|
| 14 |
+
except ImportError:
|
| 15 |
+
DEMUCS_AVAILABLE = False
|
| 16 |
+
print("[Warning] Demucs not available. Song conversion will not work.")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class VocalSeparator:
    """Separates vocals from instrumental music using Demucs."""

    def __init__(self, model_name: str = "htdemucs", device: str = None):
        """
        Initialize vocal separator.

        Args:
            model_name: Demucs model to use ('htdemucs', 'mdx_extra', etc.)
            device: 'cuda' or 'cpu'. Auto-detects if None.

        Raises:
            RuntimeError: If the demucs package could not be imported.
        """
        if not DEMUCS_AVAILABLE:
            raise RuntimeError("Demucs not installed. Install with: pip install demucs")

        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"[VocalSeparator] Loading {model_name} on {self.device}...")

        self.model = get_model(model_name)
        self.model = self.model.to(self.device)
        self.model.eval()

        print(f"[VocalSeparator] Model loaded successfully")

    @staticmethod
    def _fit_channels(wav: np.ndarray, channels: int) -> np.ndarray:
        """Return ``wav`` as a (channels, samples) array, duplicating, averaging or dropping channels."""
        wav = np.atleast_2d(wav)  # a mono file loads as 1-D -> (1, samples)
        available = wav.shape[0]

        if available == channels:
            return wav
        if channels == 1:
            # Downmix to mono by averaging all channels.
            return wav.mean(axis=0, keepdims=True)
        if available == 1:
            # Mono source: replicate it on every requested channel.
            return np.repeat(wav, channels, axis=0)
        if available > channels:
            # Surplus channels: keep the first ones.
            return wav[:channels]
        raise ValueError(
            f"Cannot convert audio with {available} channels to {channels} channels"
        )

    def separate(self, audio_path: Path, sr: int = 16000) -> Tuple[np.ndarray, np.ndarray]:
        """
        Separate vocals and instrumental from audio file.

        The audio is loaded at the model's own sample rate and channel count
        (read from ``model.samplerate`` / ``model.audio_channels``), run
        through the model, and the resulting stems are downmixed to mono and
        resampled to ``sr``.

        Args:
            audio_path: Path to audio file
            sr: Sample rate of the returned arrays (default 16000)

        Returns:
            Tuple of (vocals, instrumental) as 1-D numpy arrays at ``sr``.
            The instrumental is the sum of every non-vocal stem; vocals are
            zeros if the model has no 'vocals' source.

        Raises:
            ValueError: If the audio file contains no samples.
        """
        # Demucs models expose no `.separate()` method; inference goes
        # through demucs.apply.apply_model. Imported here because the module
        # top only imports `get_model`.
        from demucs.apply import apply_model

        audio_path = Path(audio_path)
        print(f"[VocalSeparator] Loading audio: {audio_path}")

        # Pretrained Demucs models expect their training format (stereo
        # 44.1 kHz for htdemucs), not mono 16 kHz.
        model_sr = int(getattr(self.model, "samplerate", 44100))
        model_channels = int(getattr(self.model, "audio_channels", 2))

        # Use librosa to load and resample to the model's rate
        wav, _ = librosa.load(str(audio_path), sr=model_sr, mono=False)
        wav = self._fit_channels(np.asarray(wav, dtype=np.float32), model_channels)
        wav = np.ascontiguousarray(wav)

        print(f"[VocalSeparator] Audio loaded: {wav.shape[-1]} samples at {model_sr}Hz")

        if wav.shape[-1] == 0:
            raise ValueError(f"Audio file contains no samples: {audio_path}")

        mix = torch.from_numpy(wav)

        # Standardize the input using the statistics of the mono reference
        # and undo it on the output, as the Demucs command-line tool does.
        reference = mix.mean(0)
        mean = float(reference.mean())
        std = float(reference.std())
        if not std > 0:  # silence or a single sample (std is 0 or NaN)
            std = 1.0
        mix = (mix - mean) / std

        print(f"[VocalSeparator] Separating vocals and instrumental...")
        sys.stdout.flush()

        # Perform separation; result is (batch, sources, channels, samples)
        with torch.no_grad():
            estimates = apply_model(self.model, mix[None], device=self.device)[0]
        estimates = estimates * std + mean

        # Downmix every stem to mono: (sources, samples)
        stems = estimates.mean(dim=1).cpu().numpy()

        # Stem order follows model.sources, e.g. drums, bass, other, vocals
        vocals = np.zeros(stems.shape[-1], dtype=np.float32)
        instrumental = np.zeros(stems.shape[-1], dtype=np.float32)
        for source_name, stem in zip(self.model.sources, stems):
            if source_name == 'vocals':
                vocals = stem
            else:
                # Combine other sources as instrumental
                instrumental = instrumental + stem

        # Resample to the rate the caller asked for
        if model_sr != sr:
            vocals = librosa.resample(vocals, orig_sr=model_sr, target_sr=sr)
            instrumental = librosa.resample(instrumental, orig_sr=model_sr, target_sr=sr)

        print(f"[VocalSeparator] Separation complete")
        print(f"[VocalSeparator] Vocals shape: {vocals.shape}")
        print(f"[VocalSeparator] Instrumental shape: {instrumental.shape}")

        return vocals, instrumental

    def separate_and_save(self, audio_path: Path, output_dir: Path, sr: int = 16000) -> Tuple[Path, Path]:
        """
        Separate vocals and save to files.

        Output files are named after the input file
        ("<input stem>_vocals.wav" / "<input stem>_instrumental.wav") so that
        separating different songs into the same directory does not
        overwrite earlier results.

        Args:
            audio_path: Input audio file
            output_dir: Directory to save separated audio (created if missing)
            sr: Sample rate

        Returns:
            Tuple of (vocals_path, instrumental_path)
        """
        audio_path = Path(audio_path)
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        vocals, instrumental = self.separate(audio_path, sr)

        vocals_path = output_dir / f"{audio_path.stem}_vocals.wav"
        instrumental_path = output_dir / f"{audio_path.stem}_instrumental.wav"

        print(f"[VocalSeparator] Saving vocals to {vocals_path}")
        sf.write(vocals_path, vocals, sr)

        print(f"[VocalSeparator] Saving instrumental to {instrumental_path}")
        sf.write(instrumental_path, instrumental, sr)

        return vocals_path, instrumental_path
|
backend/requirements.txt
CHANGED
|
@@ -12,3 +12,5 @@ scipy>=1.6.0
|
|
| 12 |
scikit-learn>=1.1.0
|
| 13 |
unidecode>=1.2.0
|
| 14 |
inflect>=6.0.0
|
|
|
|
|
|
|
|
|
| 12 |
scikit-learn>=1.1.0
|
| 13 |
unidecode>=1.2.0
|
| 14 |
inflect>=6.0.0
|
| 15 |
+
demucs>=4.0.0
|
| 16 |
+
pydub>=0.25.1
|