"""Vocal separation using Demucs model.""" import torch import numpy as np import librosa import soundfile as sf from pathlib import Path from typing import Tuple import sys try: from demucs.pretrained import get_model DEMUCS_AVAILABLE = True except ImportError: DEMUCS_AVAILABLE = False print("[Warning] Demucs not available. Song conversion will not work.") class VocalSeparator: """Separates vocals from instrumental music using Demucs.""" def __init__(self, model_name: str = "htdemucs", device: str = None): """ Initialize vocal separator. Args: model_name: Demucs model to use ('htdemucs', 'mdx_extra', etc.) device: 'cuda' or 'cpu'. Auto-detects if None. """ if not DEMUCS_AVAILABLE: raise RuntimeError("Demucs not installed. Install with: pip install demucs") self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu') print(f"[VocalSeparator] Loading {model_name} on {self.device}...") self.model = get_model(model_name) self.model = self.model.to(self.device) self.model.eval() print(f"[VocalSeparator] Model loaded successfully") def separate(self, audio_path: Path, sr: int = 16000) -> Tuple[np.ndarray, np.ndarray]: """ Separate vocals and instrumental from audio file. Args: audio_path: Path to audio file sr: Sample rate (default 16000) Returns: Tuple of (vocals, instrumental) as numpy arrays """ print(f"[VocalSeparator] Loading audio: {audio_path}") # Load audio if isinstance(audio_path, str): audio_path = Path(audio_path) # Use librosa to load and resample wav, original_sr = librosa.load(str(audio_path), sr=None, mono=True) # Resample if needed if original_sr != sr: wav = librosa.resample(wav, orig_sr=original_sr, target_sr=sr) print(f"[VocalSeparator] Audio loaded: {len(wav)} samples at {sr}Hz") # Convert to tensor (Demucs expects shape: [1, channels, samples]) wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0) wav_tensor = wav_tensor.to(self.device) print(f"[VocalSeparator] Separating vocals and instrumental...") sys.stdout.flush() # Perform separation with torch.no_grad(): sources = self.model.separate(wav_tensor) # Extract vocals and other sources # sources dict typically has: 'drums', 'bass', 'other', 'vocals' sources = {k: v.cpu().numpy().squeeze() for k, v in sources.items()} vocals = sources.get('vocals', np.zeros_like(wav)) # Combine other sources as instrumental instrumental = np.zeros_like(wav) for key in sources: if key != 'vocals': instrumental += sources[key] print(f"[VocalSeparator] Separation complete") print(f"[VocalSeparator] Vocals shape: {vocals.shape}") print(f"[VocalSeparator] Instrumental shape: {instrumental.shape}") return vocals, instrumental def separate_and_save(self, audio_path: Path, output_dir: Path, sr: int = 16000) -> Tuple[Path, Path]: """ Separate vocals and save to files. Args: audio_path: Input audio file output_dir: Directory to save separated audio sr: Sample rate Returns: Tuple of (vocals_path, instrumental_path) """ output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) vocals, instrumental = self.separate(audio_path, sr) vocals_path = output_dir / "vocals.wav" instrumental_path = output_dir / "instrumental.wav" print(f"[VocalSeparator] Saving vocals to {vocals_path}") sf.write(vocals_path, vocals, sr) print(f"[VocalSeparator] Saving instrumental to {instrumental_path}") sf.write(instrumental_path, instrumental, sr) return vocals_path, instrumental_path