AJ50 committed on
Commit e049981 · 1 Parent(s): c222fbc

Add song generation backend: Demucs vocal separation + voice synthesis + audio mixing
backend/app/routes.py CHANGED
@@ -407,3 +407,148 @@ def get_waveform(audio_filename):
         err_msg = f'Failed to generate waveform: {str(e)}'
         return jsonify({'error': err_msg}), 500
 
+
+# ============================================================================
+# SONG GENERATION ENDPOINTS
+# ============================================================================
+
+@bp.route('/convert_song', methods=['POST'])
+def convert_song():
+    """
+    Convert a song to the user's voice.
+
+    Form data:
+    - song: audio file (mp3, wav, etc.)
+    - voice_id: ID of the enrolled voice to use
+    - language: 'english' or 'hindi'
+    - add_effects: 'true' or 'false' to add reverb/compression
+
+    Returns: JSON with a download URL for the generated song
+    """
+    try:
+        print("\n[API] POST /api/convert_song")
+
+        # Validate input
+        if 'song' not in request.files:
+            return jsonify({'error': 'No song file provided'}), 400
+
+        if 'voice_id' not in request.form:
+            return jsonify({'error': 'No voice_id provided'}), 400
+
+        song_file = request.files['song']
+        voice_id = request.form.get('voice_id')
+        language = request.form.get('language', 'english')
+        add_effects = request.form.get('add_effects', 'true').lower() == 'true'
+
+        if not allowed_file(song_file.filename):
+            return jsonify({'error': f'File type not allowed. Allowed: {ALLOWED_EXTENSIONS}'}), 400
+
+        # Load voices database
+        voices_db = load_voices_db()
+        voice_data = next((v for v in voices_db if v['id'] == voice_id), None)
+
+        if not voice_data:
+            return jsonify({'error': f'Voice {voice_id} not found'}), 404
+
+        # Save the uploaded song, keeping its original extension
+        ext = song_file.filename.rsplit('.', 1)[-1].lower()
+        song_filename = f"song_{uuid.uuid4().hex}.{ext}"
+        song_path = OUTPUT_FOLDER / song_filename
+        song_file.save(song_path)
+        print(f"[API] Song saved: {song_path}")
+
+        # Get voice file path
+        voice_filepath = UPLOAD_FOLDER / voice_data['filename']
+        if not voice_filepath.exists():
+            return jsonify({'error': 'Voice file not found'}), 404
+
+        # Output path
+        output_filename = f"converted_song_{uuid.uuid4().hex}.wav"
+        output_path = OUTPUT_FOLDER / output_filename
+
+        print("[API] Starting song conversion...")
+        print(f"[API] Language: {language}")
+        print(f"[API] Add effects: {add_effects}")
+
+        # Import the song processor lazily so the heavy models load only when needed
+        from app.song_conversion.song_processor import SongProcessor
+
+        processor = SongProcessor(MODELS_DIR)
+        result_path = processor.convert_song(
+            song_path=song_path,
+            voice_path=voice_filepath,
+            output_path=output_path,
+            language=language,
+            add_effects=add_effects,
+            models_dir=MODELS_DIR
+        )
+
+        print(f"[API] Song conversion complete: {result_path}")
+
+        # Return download URL
+        return jsonify({
+            'success': True,
+            'message': 'Song converted successfully',
+            'audio_url': f'/api/audio/{output_filename}',
+            'filename': output_filename
+        }), 200
+
+    except Exception as e:
+        print(f"[API] ✗ Error in convert_song: {e}")
+        import traceback
+        traceback.print_exc()
+        return jsonify({'error': str(e)}), 500
+
+
+@bp.route('/separate_vocals', methods=['POST'])
+def separate_vocals():
+    """
+    Separate vocals from a song file.
+
+    Form data:
+    - song: audio file
+
+    Returns: JSON with vocal and instrumental file URLs
+    """
+    try:
+        print("\n[API] POST /api/separate_vocals")
+
+        if 'song' not in request.files:
+            return jsonify({'error': 'No song file provided'}), 400
+
+        song_file = request.files['song']
+
+        if not allowed_file(song_file.filename):
+            return jsonify({'error': 'File type not allowed'}), 400
+
+        # Save the uploaded song, keeping its original extension
+        ext = song_file.filename.rsplit('.', 1)[-1].lower()
+        song_filename = f"song_{uuid.uuid4().hex}.{ext}"
+        song_path = OUTPUT_FOLDER / song_filename
+        song_file.save(song_path)
+
+        print(f"[API] Song saved: {song_path}")
+        print("[API] Separating vocals...")
+
+        from app.song_conversion.vocal_separator import VocalSeparator
+
+        separator = VocalSeparator()
+        vocals_path, instrumental_path = separator.separate_and_save(
+            song_path,
+            OUTPUT_FOLDER,
+            sr=16000
+        )
+
+        return jsonify({
+            'success': True,
+            'vocals_url': f'/api/audio/{vocals_path.name}',
+            'instrumental_url': f'/api/audio/{instrumental_path.name}',
+            'vocals_file': vocals_path.name,
+            'instrumental_file': instrumental_path.name
+        }), 200
+
+    except Exception as e:
+        print(f"[API] ✗ Error in separate_vocals: {e}")
+        import traceback
+        traceback.print_exc()
+        return jsonify({'error': str(e)}), 500
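
A minimal client sketch for the two new endpoints. This is illustrative only: the localhost base URL, the input file name, and the voice_id value are assumptions; the routes and form fields come from the diff above.

    import requests

    BASE = "http://localhost:5000/api"  # assumed dev-server address

    # Convert a song to an enrolled voice
    with open("song.mp3", "rb") as f:  # placeholder input file
        resp = requests.post(
            f"{BASE}/convert_song",
            files={"song": f},
            data={"voice_id": "YOUR_VOICE_ID",  # placeholder id
                  "language": "english",
                  "add_effects": "true"},
        )
    resp.raise_for_status()
    print(resp.json()["audio_url"])  # e.g. /api/audio/converted_song_<hex>.wav

    # Or just split a song into stems
    with open("song.mp3", "rb") as f:
        resp = requests.post(f"{BASE}/separate_vocals", files={"song": f})
    print(resp.json()["vocals_url"], resp.json()["instrumental_url"])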
backend/app/song_conversion/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""Song conversion module for voice-to-song transformation."""
+
+from .vocal_separator import VocalSeparator
+from .audio_mixer import AudioMixer
+from .song_processor import SongProcessor
+
+__all__ = ['VocalSeparator', 'AudioMixer', 'SongProcessor']
backend/app/song_conversion/audio_mixer.py ADDED
@@ -0,0 +1,186 @@
+"""Audio mixing and effects for song generation."""
+
+import numpy as np
+import soundfile as sf
+from pathlib import Path
+
+
+class AudioMixer:
+    """Mixes vocals with instrumental and applies effects."""
+
+    @staticmethod
+    def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
+        """
+        Normalize audio to a target peak level.
+
+        Args:
+            audio: Audio array
+            target_db: Target peak level in dB (the -3 dB default leaves headroom)
+
+        Returns:
+            Normalized audio
+        """
+        # Convert dB to linear gain: 10^(dB/20), e.g. -3 dB -> ~0.708
+        target_linear = 10 ** (target_db / 20.0)
+
+        # Find current peak
+        current_peak = np.max(np.abs(audio))
+
+        if current_peak > 0:
+            # Scale to target
+            audio = audio * (target_linear / current_peak)
+
+        # Clip to prevent distortion
+        audio = np.clip(audio, -1.0, 1.0)
+
+        return audio
+
+    @staticmethod
+    def add_reverb(audio: np.ndarray, sr: int = 16000, room_scale: float = 0.3,
+                   delay_ms: float = 50) -> np.ndarray:
+        """
+        Add a simple delay-based reverb effect.
+
+        Args:
+            audio: Input audio
+            sr: Sample rate
+            room_scale: Reverb amount (0-1)
+            delay_ms: Delay in milliseconds
+
+        Returns:
+            Audio with reverb
+        """
+        delay_samples = int((delay_ms / 1000.0) * sr)
+
+        # Create delayed version
+        delayed = np.zeros_like(audio)
+        if delay_samples < len(audio):
+            delayed[delay_samples:] = audio[:-delay_samples]
+
+        # Mix original with the delayed copy
+        reverb = audio + room_scale * delayed
+
+        return reverb
+
+    @staticmethod
+    def compress_audio(audio: np.ndarray, threshold: float = 0.6, ratio: float = 4.0) -> np.ndarray:
+        """
+        Apply simple dynamic range compression.
+
+        Args:
+            audio: Input audio
+            threshold: Compression threshold (0-1)
+            ratio: Compression ratio
+
+        Returns:
+            Compressed audio
+        """
+        # Work on a copy so the caller's array is not modified in place
+        audio = audio.copy()
+        abs_audio = np.abs(audio)
+
+        # Find samples above the threshold
+        mask = abs_audio > threshold
+
+        # Attenuate loud parts: keep the threshold, divide the excess by the ratio
+        audio[mask] = np.sign(audio[mask]) * (threshold + (abs_audio[mask] - threshold) / ratio)
+
+        return audio
+
+    @staticmethod
+    def mix_audio(vocal: np.ndarray, instrumental: np.ndarray,
+                  vocal_level: float = 0.7, instrumental_level: float = 0.3,
+                  add_reverb: bool = True, add_compression: bool = True,
+                  sr: int = 16000) -> np.ndarray:
+        """
+        Mix vocals and instrumental with effects.
+
+        Args:
+            vocal: Vocal audio
+            instrumental: Instrumental audio
+            vocal_level: Vocal volume level (0-1)
+            instrumental_level: Instrumental volume level (0-1)
+            add_reverb: Whether to add reverb to the vocals
+            add_compression: Whether to add compression
+            sr: Sample rate
+
+        Returns:
+            Mixed audio
+        """
+        print("[AudioMixer] Normalizing tracks...")
+
+        # Normalize individual tracks
+        vocal = AudioMixer.normalize_audio(vocal, -6.0)  # Keep both a bit quiet before mixing
+        instrumental = AudioMixer.normalize_audio(instrumental, -6.0)
+
+        print("[AudioMixer] Adding effects...")
+
+        # Add reverb to the vocals
+        if add_reverb:
+            vocal = AudioMixer.add_reverb(vocal, sr, room_scale=0.2, delay_ms=40)
+
+        # Apply compression
+        if add_compression:
+            vocal = AudioMixer.compress_audio(vocal, threshold=0.5, ratio=3.0)
+
+        print("[AudioMixer] Mixing tracks...")
+
+        # Trim both tracks to the same length
+        min_len = min(len(vocal), len(instrumental))
+        vocal = vocal[:min_len]
+        instrumental = instrumental[:min_len]
+
+        # Mix with the specified levels
+        mixed = vocal_level * vocal + instrumental_level * instrumental
+
+        # Normalize the final mix
+        mixed = AudioMixer.normalize_audio(mixed, -3.0)
+
+        print(f"[AudioMixer] Mix complete - Peak: {np.max(np.abs(mixed)):.4f}")
+
+        return mixed
+
+    @staticmethod
+    def save_audio(audio: np.ndarray, output_path: Path, sr: int = 16000) -> None:
+        """
+        Save audio to a file.
+
+        Args:
+            audio: Audio array
+            output_path: Output file path
+            sr: Sample rate
+        """
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        print(f"[AudioMixer] Saving to {output_path}")
+        sf.write(output_path, audio, sr)
+        print("[AudioMixer] Saved successfully")
+
+    @staticmethod
+    def mix_and_save(vocal: np.ndarray, instrumental: np.ndarray,
+                     output_path: Path, sr: int = 16000,
+                     add_effects: bool = True) -> Path:
+        """
+        Mix audio and save it to a file.
+
+        Args:
+            vocal: Vocal audio
+            instrumental: Instrumental audio
+            output_path: Output file path
+            sr: Sample rate
+            add_effects: Whether to add effects
+
+        Returns:
+            Output file path
+        """
+        mixed = AudioMixer.mix_audio(
+            vocal, instrumental,
+            add_reverb=add_effects,
+            add_compression=add_effects,
+            sr=sr
+        )
+
+        AudioMixer.save_audio(mixed, output_path, sr)
+
+        return Path(output_path)
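
For reference on the constants: normalize_audio converts decibels to linear gain with 10^(dB/20), so the -3.0 dB default maps to a peak of about 0.708, the same value that appears as the hard-coded 0.707 target in song_processor.py below. A self-contained sketch of the mixer on synthetic signals (all inputs here are made up; no files required):

    import numpy as np
    from app.song_conversion.audio_mixer import AudioMixer

    sr = 16000
    t = np.linspace(0, 2.0, 2 * sr, endpoint=False)
    vocal = 0.9 * np.sin(2 * np.pi * 220 * t).astype(np.float32)     # stand-in "vocal"
    instrumental = 0.2 * np.random.randn(2 * sr).astype(np.float32)  # stand-in backing track

    mixed = AudioMixer.mix_audio(vocal, instrumental, sr=sr)
    print(np.max(np.abs(mixed)))  # ~0.708, the -3 dB peak target

    AudioMixer.save_audio(mixed, "mix_demo.wav", sr)  # writes a playable WAV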
backend/app/song_conversion/song_processor.py ADDED
@@ -0,0 +1,190 @@
+"""Main song processing orchestrator."""
+
+import gc
+import torch
+import numpy as np
+from pathlib import Path
+from typing import Optional
+import sys
+
+from app.song_conversion.vocal_separator import VocalSeparator
+from app.song_conversion.audio_mixer import AudioMixer
+from encoder import inference as encoder_infer
+from synthesizer import inference as synthesizer_infer
+from app.vocoder import inference as vocoder_infer
+
+
+class SongProcessor:
+    """Orchestrates the complete song voice conversion process."""
+
+    def __init__(self, models_dir: Path):
+        """
+        Initialize the song processor.
+
+        Args:
+            models_dir: Directory containing pre-trained models
+        """
+        self.models_dir = Path(models_dir)
+        self.separator = None
+        self.sr = 16000
+
+    def _ensure_separator(self) -> VocalSeparator:
+        """Lazily load the vocal separator."""
+        if self.separator is None:
+            print("[SongProcessor] Initializing vocal separator...")
+            self.separator = VocalSeparator(model_name="htdemucs")
+        return self.separator
+
+    def _load_voice_models(self, models_dir: Path, language: str = 'english'):
+        """Load the voice cloning models; returns the synthesizer instance."""
+        print(f"[SongProcessor] Loading {language} voice models...")
+
+        # NOTE: both languages currently load the models in the "default" directory
+        enc_path = models_dir / "default" / "encoder.pt"
+        syn_path = models_dir / "default" / "synthesizer.pt"
+        voc_path = models_dir / "default" / "vocoder.pt"
+
+        for path in [enc_path, syn_path, voc_path]:
+            if not path.exists():
+                raise RuntimeError(f"Model missing: {path}")
+
+        encoder_infer.load_model(enc_path)
+        print("[SongProcessor] Encoder loaded")
+
+        synthesizer = synthesizer_infer.Synthesizer(syn_path)
+        print("[SongProcessor] Synthesizer loaded")
+
+        vocoder_infer.load_model(voc_path)
+        print("[SongProcessor] Vocoder loaded")
+
+        return synthesizer
+
+    def _extract_lyrics_from_audio(self, audio_path: Path, voice_sample_path: Path) -> str:
+        """
+        Placeholder lyrics extraction; returns generic text.
+        In production this would use a speech-to-text model.
+
+        Args:
+            audio_path: Path to vocal audio
+            voice_sample_path: Path to reference voice
+
+        Returns:
+            Extracted lyrics text
+        """
+        print("[SongProcessor] Extracting lyrics from audio...")
+
+        # Placeholder: return generic, phonetically rich text
+        # In production, use Whisper or another STT model
+        lyrics = "The music is playing so well with this song today"
+
+        print(f"[SongProcessor] Using default lyrics: {lyrics}")
+        return lyrics
+
+    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
+                     language: str = 'english', add_effects: bool = True,
+                     models_dir: Optional[Path] = None) -> Path:
+        """
+        Convert a song to the user's voice.
+
+        Complete pipeline:
+        1. Separate vocals from the instrumental
+        2. Extract lyrics from the vocals (or use a placeholder)
+        3. Synthesize vocals in the user's voice
+        4. Mix the synthesized vocals with the instrumental
+        5. Add audio effects
+
+        Args:
+            song_path: Path to the input song
+            voice_path: Path to the reference voice sample
+            output_path: Path for the output song
+            language: 'english' or 'hindi'
+            add_effects: Whether to add reverb/compression
+            models_dir: Directory with models (uses self.models_dir if None)
+
+        Returns:
+            Path to the output song
+        """
+        if models_dir is None:
+            models_dir = self.models_dir
+
+        song_path = Path(song_path)
+        voice_path = Path(voice_path)
+        output_path = Path(output_path)
+
+        try:
+            print("\n[SongProcessor] ========== SONG CONVERSION START ==========")
+            print(f"[SongProcessor] Song: {song_path}")
+            print(f"[SongProcessor] Voice: {voice_path}")
+            print(f"[SongProcessor] Language: {language}")
+            print(f"[SongProcessor] Output: {output_path}")
+
+            # Step 1: Separate vocals
+            print("\n[SongProcessor] STEP 1: Separating vocals...")
+            separator = self._ensure_separator()
+            vocals, instrumental = separator.separate(song_path, sr=self.sr)
+
+            # Step 2: Extract/prepare lyrics (placeholder for now)
+            print("\n[SongProcessor] STEP 2: Preparing lyrics...")
+            lyrics = self._extract_lyrics_from_audio(song_path, voice_path)
+
+            # Step 3: Load voice models
+            print("\n[SongProcessor] STEP 3: Loading voice models...")
+            synthesizer = self._load_voice_models(models_dir, language)
+
+            # Step 4: Synthesize vocals in the user's voice
+            print("\n[SongProcessor] STEP 4: Synthesizing vocals with your voice...")
+            wav = encoder_infer.preprocess_wav(voice_path)
+            embed = encoder_infer.embed_utterance(wav)
+
+            mels = synthesizer.synthesize_spectrograms([lyrics], [embed])
+            mel = mels[0]
+
+            print("[SongProcessor] Vocoding...")
+            try:
+                synthesized_vocal = vocoder_infer.infer_waveform(
+                    mel, normalize=True, batched=False, target=8000, overlap=800
+                ).astype(np.float32)
+            except Exception as e:
+                print(f"[SongProcessor] Vocoder failed: {e}, using Griffin-Lim fallback")
+                synthesized_vocal = synthesizer.griffin_lim(mel).astype(np.float32)
+
+            # Normalize the synthesized vocal to a ~-3 dB peak (0.707 linear)
+            max_val = np.max(np.abs(synthesized_vocal))
+            if max_val > 0:
+                target_level = 0.707
+                synthesized_vocal = synthesized_vocal * (target_level / max_val)
+            synthesized_vocal = np.clip(synthesized_vocal, -1.0, 1.0)
+
+            print(f"[SongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")
+
+            # Step 5: Mix with the instrumental
+            print("\n[SongProcessor] STEP 5: Mixing vocals with instrumental...")
+            final_path = AudioMixer.mix_and_save(
+                synthesized_vocal, instrumental,
+                output_path, sr=self.sr,
+                add_effects=add_effects
+            )
+
+            # Cleanup: drop model references and free GPU memory
+            print("\n[SongProcessor] Cleaning up models...")
+            try:
+                encoder_infer._model = None
+                synthesizer_infer._model = None
+                vocoder_infer._model = None
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+            except Exception as e:
+                print(f"[SongProcessor] Warning during cleanup: {e}")
+
+            print("\n[SongProcessor] ========== SONG CONVERSION COMPLETE ==========")
+            print(f"[SongProcessor] Output saved to: {final_path}")
+
+            return final_path
+
+        except Exception as e:
+            print(f"\n[SongProcessor] ✗ ERROR: {e}")
+            import traceback
+            traceback.print_exc()
+            sys.stdout.flush()
+            raise
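
A hedged end-to-end sketch of driving the processor outside Flask. All paths below are placeholders; the models directory must contain default/encoder.pt, default/synthesizer.pt, and default/vocoder.pt, which _load_voice_models checks for.

    from pathlib import Path
    from app.song_conversion.song_processor import SongProcessor

    MODELS_DIR = Path("backend/models")  # placeholder location

    processor = SongProcessor(MODELS_DIR)
    out_path = processor.convert_song(
        song_path=Path("input_song.mp3"),       # placeholder input
        voice_path=Path("enrolled_voice.wav"),  # placeholder reference voice
        output_path=Path("converted_song.wav"),
        language="english",
        add_effects=True,
    )
    print(f"Wrote {out_path}")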
backend/app/song_conversion/vocal_separator.py ADDED
@@ -0,0 +1,126 @@
+"""Vocal separation using the Demucs model."""
+
+import torch
+import numpy as np
+import librosa
+import soundfile as sf
+from pathlib import Path
+from typing import Tuple
+import sys
+import uuid
+
+try:
+    from demucs.pretrained import get_model
+    from demucs.apply import apply_model
+    DEMUCS_AVAILABLE = True
+except ImportError:
+    DEMUCS_AVAILABLE = False
+    print("[Warning] Demucs not available. Song conversion will not work.")
+
+
+class VocalSeparator:
+    """Separates vocals from instrumental music using Demucs."""
+
+    def __init__(self, model_name: str = "htdemucs", device: str = None):
+        """
+        Initialize the vocal separator.
+
+        Args:
+            model_name: Demucs model to use ('htdemucs', 'mdx_extra', etc.)
+            device: 'cuda' or 'cpu'. Auto-detects if None.
+        """
+        if not DEMUCS_AVAILABLE:
+            raise RuntimeError("Demucs not installed. Install with: pip install demucs")
+
+        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
+        print(f"[VocalSeparator] Loading {model_name} on {self.device}...")
+
+        self.model = get_model(model_name)
+        self.model = self.model.to(self.device)
+        self.model.eval()
+
+        print("[VocalSeparator] Model loaded successfully")
+
+    def separate(self, audio_path: Path, sr: int = 16000) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Separate vocals and instrumental from an audio file.
+
+        Args:
+            audio_path: Path to the audio file
+            sr: Output sample rate (default 16000)
+
+        Returns:
+            Tuple of (vocals, instrumental) as mono numpy arrays at `sr`
+        """
+        print(f"[VocalSeparator] Loading audio: {audio_path}")
+
+        audio_path = Path(audio_path)
+
+        # Demucs is trained on stereo audio at model.samplerate (44100 Hz for
+        # htdemucs), so load at that rate and duplicate mono input to 2 channels
+        wav, _ = librosa.load(str(audio_path), sr=self.model.samplerate, mono=False)
+        if wav.ndim == 1:
+            wav = np.stack([wav, wav])
+
+        print(f"[VocalSeparator] Audio loaded: {wav.shape[-1]} samples at {self.model.samplerate}Hz")
+
+        # Demucs expects shape [batch, channels, samples]
+        wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).to(self.device)
+
+        print("[VocalSeparator] Separating vocals and instrumental...")
+        sys.stdout.flush()
+
+        # apply_model returns [batch, num_sources, channels, samples]; the
+        # source names ('drums', 'bass', 'other', 'vocals') are in model.sources
+        with torch.no_grad():
+            out = apply_model(self.model, wav_tensor, device=self.device)[0]
+
+        # Downmix each stem to mono and resample to the requested rate
+        sources = {}
+        for name, stem in zip(self.model.sources, out):
+            mono = stem.mean(dim=0).cpu().numpy()
+            sources[name] = librosa.resample(mono, orig_sr=self.model.samplerate, target_sr=sr)
+
+        vocals = sources.get('vocals', np.zeros_like(next(iter(sources.values()))))
+
+        # Sum the remaining stems into the instrumental track
+        instrumental = np.zeros_like(vocals)
+        for key, stem in sources.items():
+            if key != 'vocals':
+                instrumental += stem
+
+        print("[VocalSeparator] Separation complete")
+        print(f"[VocalSeparator] Vocals shape: {vocals.shape}")
+        print(f"[VocalSeparator] Instrumental shape: {instrumental.shape}")
+
+        return vocals, instrumental
+
+    def separate_and_save(self, audio_path: Path, output_dir: Path, sr: int = 16000) -> Tuple[Path, Path]:
+        """
+        Separate vocals and save the stems to files.
+
+        Args:
+            audio_path: Input audio file
+            output_dir: Directory to save the separated audio
+            sr: Sample rate
+
+        Returns:
+            Tuple of (vocals_path, instrumental_path)
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        vocals, instrumental = self.separate(audio_path, sr)
+
+        # Unique filenames so concurrent requests do not overwrite each other
+        run_id = uuid.uuid4().hex
+        vocals_path = output_dir / f"vocals_{run_id}.wav"
+        instrumental_path = output_dir / f"instrumental_{run_id}.wav"
+
+        print(f"[VocalSeparator] Saving vocals to {vocals_path}")
+        sf.write(vocals_path, vocals, sr)
+
+        print(f"[VocalSeparator] Saving instrumental to {instrumental_path}")
+        sf.write(instrumental_path, instrumental, sr)
+
+        return vocals_path, instrumental_path
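
A standalone sketch of the separator (file paths are placeholders; on first use Demucs downloads the htdemucs weights if they are not already cached):

    from pathlib import Path
    from app.song_conversion.vocal_separator import VocalSeparator

    separator = VocalSeparator(model_name="htdemucs")  # picks cuda if available
    vocals_path, instrumental_path = separator.separate_and_save(
        Path("song.mp3"),    # placeholder input file
        Path("separated"),   # output directory, created if missing
        sr=16000,
    )
    print(vocals_path, instrumental_path)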
backend/requirements.txt CHANGED
@@ -12,3 +12,5 @@ scipy>=1.6.0
 scikit-learn>=1.1.0
 unidecode>=1.2.0
 inflect>=6.0.0
+demucs>=4.0.0
+pydub>=0.25.1