Spaces:

AJ50
/

voice-cloning-backend

Sleeping

App Files Files Community

AJ50 committed 23 days ago

Commit

e049981

1 Parent(s): c222fbc

Add song generation backend: Demucs vocal separation + voice synthesis + audio mixing

Browse files

Files changed (6) hide show

backend/app/routes.py +143 -0
backend/app/song_conversion/__init__.py +7 -0
backend/app/song_conversion/audio_mixer.py +188 -0
backend/app/song_conversion/song_processor.py +190 -0
backend/app/song_conversion/vocal_separator.py +123 -0
backend/requirements.txt +2 -0

backend/app/routes.py CHANGED Viewed

@@ -407,3 +407,146 @@ def get_waveform(audio_filename):
         err_msg = f'Failed to generate waveform: {str(e)}'
         return jsonify({'error': err_msg}), 500

         err_msg = f'Failed to generate waveform: {str(e)}'
         return jsonify({'error': err_msg}), 500
+# ============================================================================
+# SONG GENERATION ENDPOINTS
+# ============================================================================
@bp.route('/convert_song', methods=['POST'])
def convert_song():
    """
    Convert a song to the user's enrolled voice.

    Form data:
    - song: audio file (mp3, wav, etc.)
    - voice_id: ID of enrolled voice to use
    - language: 'english' or 'hindi' (default 'english')
    - add_effects: 'true' or 'false' to add reverb/compression (default 'true')

    Returns:
        JSON with ``audio_url`` / ``filename`` of the generated song and
        HTTP 200 on success; a JSON ``error`` with 400 (bad input),
        404 (unknown voice / missing voice file) or 500 (processing failure).
    """
    # Set only once the upload has actually been written, so the ``finally``
    # clause knows whether there is a temporary file to remove.
    song_path = None
    try:
        print("\n[API] POST /api/convert_song")

        # Validate input
        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400
        if 'voice_id' not in request.form:
            return jsonify({'error': 'No voice_id provided'}), 400

        song_file = request.files['song']
        voice_id = request.form.get('voice_id')
        language = request.form.get('language', 'english')
        add_effects = request.form.get('add_effects', 'true').lower() == 'true'

        if not allowed_file(song_file.filename):
            return jsonify({'error': f'File type not allowed. Allowed: {ALLOWED_EXTENSIONS}'}), 400

        # Resolve the voice completely BEFORE touching the disk, so a bad
        # voice_id or a missing reference file never leaves an orphan upload.
        voices_db = load_voices_db()
        voice_data = next((v for v in voices_db if v['id'] == voice_id), None)
        if not voice_data:
            return jsonify({'error': f'Voice {voice_id} not found'}), 404

        voice_filepath = UPLOAD_FOLDER / voice_data['filename']
        if not voice_filepath.exists():
            return jsonify({'error': 'Voice file not found'}), 404

        # Save uploaded song under a random name
        song_path = OUTPUT_FOLDER / f"song_{uuid.uuid4().hex}.wav"
        song_file.save(song_path)
        print(f"[API] Song saved: {song_path}")

        # Output path
        output_filename = f"converted_song_{uuid.uuid4().hex}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        print("[API] Starting song conversion...")
        print(f"[API] Language: {language}")
        print(f"[API] Add effects: {add_effects}")

        # Imported lazily: the song pipeline pulls in heavy ML dependencies.
        from app.song_conversion.song_processor import SongProcessor
        processor = SongProcessor(MODELS_DIR)
        result_path = processor.convert_song(
            song_path=song_path,
            voice_path=voice_filepath,
            output_path=output_path,
            language=language,
            add_effects=add_effects,
            models_dir=MODELS_DIR
        )
        print(f"[API] Song conversion complete: {result_path}")

        # Return download URL
        return jsonify({
            'success': True,
            'message': 'Song converted successfully',
            'audio_url': f'/api/audio/{output_filename}',
            'filename': output_filename
        }), 200
    except Exception as e:
        print(f"[API] ✗ Error in convert_song: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
    finally:
        # The uploaded song is only an intermediate input and its random name
        # is never returned to the client, so remove it on every exit path.
        if song_path is not None and song_path.exists():
            try:
                song_path.unlink()
            except OSError as cleanup_error:
                print(f"[API] Warning: could not remove {song_path}: {cleanup_error}")
@bp.route('/separate_vocals', methods=['POST'])
def separate_vocals():
    """
    Split an uploaded song into a vocal stem and an instrumental stem.

    Form data:
    - song: audio file

    Returns:
        JSON carrying the URL and file name of both stems with HTTP 200,
        or a JSON ``error`` with 400 (bad input) / 500 (processing failure).
    """
    try:
        print("\n[API] POST /api/separate_vocals")

        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400

        upload = request.files['song']
        if not allowed_file(upload.filename):
            return jsonify({'error': f'File type not allowed'}), 400

        # Persist the upload under a random name before processing it
        stored_name = f"song_{uuid.uuid4().hex}.wav"
        stored_path = OUTPUT_FOLDER / stored_name
        upload.save(stored_path)
        print(f"[API] Song saved: {stored_path}")
        print(f"[API] Separating vocals...")

        # Imported lazily: the separator pulls in heavy ML dependencies.
        from app.song_conversion.vocal_separator import VocalSeparator
        stems = VocalSeparator().separate_and_save(stored_path, OUTPUT_FOLDER, sr=16000)
        vocal_file, backing_file = stems

        payload = {
            'success': True,
            'vocals_url': f'/api/audio/{vocal_file.name}',
            'instrumental_url': f'/api/audio/{backing_file.name}',
            'vocals_file': vocal_file.name,
            'instrumental_file': backing_file.name
        }
        return jsonify(payload), 200
    except Exception as e:
        print(f"[API] ✗ Error in separate_vocals: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500

backend/app/song_conversion/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""Song conversion module for voice-to-song transformation."""
+from .vocal_separator import VocalSeparator
+from .audio_mixer import AudioMixer
+from .song_processor import SongProcessor
+__all__ = ['VocalSeparator', 'AudioMixer', 'SongProcessor']

backend/app/song_conversion/audio_mixer.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""Audio mixing and effects for song generation."""
+import numpy as np
+import soundfile as sf
+from pathlib import Path
+from typing import Tuple, Optional
+import subprocess
+import sys
class AudioMixer:
    """Mixes vocals with instrumental and applies effects.

    All methods are static and none of them modifies the arrays passed in;
    each returns a new array.
    """

    @staticmethod
    def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
        """
        Peak-normalize audio to a target dB level.

        Args:
            audio: Audio array
            target_db: Target peak level in dB (default -3dB)

        Returns:
            Normalized audio, clipped to [-1, 1]. Empty or all-zero input is
            returned without scaling.
        """
        audio = np.asarray(audio)
        # np.max() raises on an empty array; there is nothing to scale anyway.
        if audio.size == 0:
            return audio

        # Convert dB to linear
        target_linear = 10 ** (target_db / 20.0)
        current_peak = np.max(np.abs(audio))
        if current_peak > 0:
            audio = audio * (target_linear / current_peak)

        # Clip to prevent distortion
        return np.clip(audio, -1.0, 1.0)

    @staticmethod
    def add_reverb(audio: np.ndarray, sr: int = 16000, room_scale: float = 0.3,
                   delay_ms: float = 50) -> np.ndarray:
        """
        Add a simple single-echo reverb effect.

        Args:
            audio: Input audio
            sr: Sample rate
            room_scale: Reverb amount (0-1)
            delay_ms: Delay in milliseconds

        Returns:
            ``audio + room_scale * delayed_copy``. When the delay is as long
            as (or longer than) the signal, the echo is silent.

        Raises:
            ValueError: If ``delay_ms`` is negative.
        """
        if delay_ms < 0:
            raise ValueError(f"delay_ms must be non-negative, got {delay_ms}")

        audio = np.asarray(audio)
        delay_samples = int((delay_ms / 1000.0) * sr)

        delayed = np.zeros_like(audio)
        if delay_samples == 0:
            # A delay shorter than one sample: ``audio[:-0]`` would be an
            # empty slice, so copy the signal directly instead.
            delayed[...] = audio
        elif delay_samples < len(audio):
            delayed[delay_samples:] = audio[:-delay_samples]

        # Mix original with delayed
        return audio + room_scale * delayed

    @staticmethod
    def compress_audio(audio: np.ndarray, threshold: float = 0.6, ratio: float = 4.0) -> np.ndarray:
        """
        Apply simple peak dynamic-range compression.

        Samples whose magnitude exceeds ``threshold`` have the excess divided
        by ``ratio``; quieter samples pass through unchanged.

        Args:
            audio: Input audio (left unmodified)
            threshold: Compression threshold (0-1)
            ratio: Compression ratio

        Returns:
            Compressed copy of the audio

        Raises:
            ValueError: If ``ratio`` is not positive.
        """
        if ratio <= 0:
            raise ValueError(f"ratio must be positive, got {ratio}")

        audio = np.asarray(audio)
        magnitude = np.abs(audio)
        squashed = np.sign(audio) * (threshold + (magnitude - threshold) / ratio)
        # np.where builds a new array, so the caller's buffer is never touched.
        return np.where(magnitude > threshold, squashed, audio)

    @staticmethod
    def mix_audio(vocal: np.ndarray, instrumental: np.ndarray,
                  vocal_level: float = 0.7, instrumental_level: float = 0.3,
                  add_reverb: bool = True, add_compression: bool = True,
                  sr: int = 16000) -> np.ndarray:
        """
        Mix vocals and instrumental with optional effects.

        Both tracks are peak-normalized to -6 dB, effects are applied to the
        vocal only, both tracks are truncated to the shorter length, summed
        with the given levels and the result is peak-normalized to -3 dB.

        Args:
            vocal: Vocal audio
            instrumental: Instrumental audio
            vocal_level: Vocal volume level (0-1)
            instrumental_level: Instrumental volume level (0-1)
            add_reverb: Whether to add reverb to vocals
            add_compression: Whether to compress vocals
            sr: Sample rate

        Returns:
            Mixed audio
        """
        print("[AudioMixer] Normalizing tracks...")
        vocal = AudioMixer.normalize_audio(vocal, -6.0)
        instrumental = AudioMixer.normalize_audio(instrumental, -6.0)

        print("[AudioMixer] Adding effects...")
        if add_reverb:
            vocal = AudioMixer.add_reverb(vocal, sr, room_scale=0.2, delay_ms=40)
        if add_compression:
            vocal = AudioMixer.compress_audio(vocal, threshold=0.5, ratio=3.0)

        print("[AudioMixer] Mixing tracks...")
        # Ensure same length
        min_len = min(len(vocal), len(instrumental))
        vocal = vocal[:min_len]
        instrumental = instrumental[:min_len]

        # Mix with specified levels, then normalize the final mix
        mixed = vocal_level * vocal + instrumental_level * instrumental
        mixed = AudioMixer.normalize_audio(mixed, -3.0)

        peak = float(np.max(np.abs(mixed))) if mixed.size else 0.0
        print(f"[AudioMixer] Mix complete - Peak: {peak:.4f}")
        return mixed

    @staticmethod
    def save_audio(audio: np.ndarray, output_path: Path, sr: int = 16000) -> None:
        """
        Save audio to file, creating the parent directory if needed.

        Args:
            audio: Audio array
            output_path: Output file path
            sr: Sample rate
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        print(f"[AudioMixer] Saving to {output_path}")
        sf.write(output_path, audio, sr)
        print(f"[AudioMixer] Saved successfully")

    @staticmethod
    def mix_and_save(vocal: np.ndarray, instrumental: np.ndarray,
                     output_path: Path, sr: int = 16000,
                     add_effects: bool = True) -> Path:
        """
        Mix audio and save to file.

        Args:
            vocal: Vocal audio
            instrumental: Instrumental audio
            output_path: Output file path
            sr: Sample rate
            add_effects: Whether to add reverb and compression to the vocal

        Returns:
            Output file path
        """
        mixed = AudioMixer.mix_audio(
            vocal, instrumental,
            add_reverb=add_effects,
            add_compression=add_effects,
            sr=sr
        )
        AudioMixer.save_audio(mixed, output_path, sr)
        return Path(output_path)

backend/app/song_conversion/song_processor.py ADDED Viewed

	@@ -0,0 +1,190 @@

+"""Main song processing orchestrator."""
+import gc
+import torch
+import numpy as np
+from pathlib import Path
+from typing import Optional
+import sys
+from app.song_conversion.vocal_separator import VocalSeparator
+from app.song_conversion.audio_mixer import AudioMixer
+from encoder import inference as encoder_infer
+from synthesizer import inference as synthesizer_infer
+from app.vocoder import inference as vocoder_infer
+from synthesizer.hparams import hparams as syn_hp
class SongProcessor:
    """Orchestrates the complete song voice conversion process."""

    def __init__(self, models_dir: Path):
        """
        Initialize song processor.

        Args:
            models_dir: Directory containing pre-trained models
        """
        self.models_dir = Path(models_dir)
        self.separator = None  # created on first use by _ensure_separator()
        self.sr = 16000

    def _ensure_separator(self) -> "VocalSeparator":
        """Return the vocal separator, creating it on first use."""
        if self.separator is None:
            print("[SongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator

    def _load_voice_models(self, models_dir: Path, language: str = 'english') -> "synthesizer_infer.Synthesizer":
        """
        Load the encoder, synthesizer and vocoder from ``models_dir/default``.

        Args:
            models_dir: Directory containing the ``default`` model folder
            language: Only used in the log message; the same ``default``
                models are loaded for every language.

        Returns:
            The loaded synthesizer (encoder and vocoder are loaded into their
            modules as a side effect).

        Raises:
            RuntimeError: If any of the three model files is missing.
        """
        print(f"[SongProcessor] Loading {language} voice models...")
        # Accept plain strings as well as Path objects.
        model_root = Path(models_dir) / "default"
        enc_path = model_root / "encoder.pt"
        syn_path = model_root / "synthesizer.pt"
        voc_path = model_root / "vocoder.pt"
        for path in (enc_path, syn_path, voc_path):
            if not path.exists():
                raise RuntimeError(f"Model missing: {path}")

        encoder_infer.load_model(enc_path)
        print("[SongProcessor] Encoder loaded")
        synthesizer = synthesizer_infer.Synthesizer(syn_path)
        print("[SongProcessor] Synthesizer loaded")
        vocoder_infer.load_model(voc_path)
        print("[SongProcessor] Vocoder loaded")
        return synthesizer

    def _extract_lyrics_from_audio(self, audio_path: Path, voice_sample_path: Path) -> str:
        """
        Simple lyrics extraction (placeholder - returns fixed text).

        Neither argument is read yet; a speech-to-text model would be needed
        to produce real lyrics.

        Args:
            audio_path: Path to vocal audio
            voice_sample_path: Path to reference voice

        Returns:
            Lyrics text
        """
        print("[SongProcessor] Extracting lyrics from audio...")
        # Placeholder: fixed sentence instead of real transcription
        lyrics = "The music is playing so well with this song today"
        print(f"[SongProcessor] Using default lyrics: {lyrics}")
        return lyrics

    @staticmethod
    def _release_models() -> None:
        """Drop module-level model references and free cached GPU memory (best effort)."""
        print(f"\n[SongProcessor] Cleaning up models...")
        try:
            encoder_infer._model = None
            synthesizer_infer._model = None
            vocoder_infer._model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            # Cleanup must never mask the real result or error.
            print(f"[SongProcessor] Warning during cleanup: {e}")

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                    language: str = 'english', add_effects: bool = True,
                    models_dir: Optional[Path] = None) -> Path:
        """
        Convert song to user's voice.

        Pipeline:
        1. Separate vocals from instrumental
        2. Prepare lyrics (placeholder text)
        3. Load voice models
        4. Synthesize vocals using the user's voice
        5. Mix synthesized vocals with instrumental (optionally with effects)

        Models are released when the method exits, whether it succeeds or fails.

        Args:
            song_path: Path to input song
            voice_path: Path to reference voice sample
            output_path: Path for output song
            language: 'english' or 'hindi'
            add_effects: Whether to add reverb/compression
            models_dir: Directory with models (uses self.models_dir if None)

        Returns:
            Path to output song

        Raises:
            Exception: Any error from the pipeline is logged and re-raised.
        """
        models_dir = self.models_dir if models_dir is None else Path(models_dir)
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)
        try:
            print(f"\n[SongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[SongProcessor] Song: {song_path}")
            print(f"[SongProcessor] Voice: {voice_path}")
            print(f"[SongProcessor] Language: {language}")
            print(f"[SongProcessor] Output: {output_path}")

            # Step 1: Separate vocals (only the instrumental is reused below)
            print(f"\n[SongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            _vocals, instrumental = separator.separate(song_path, sr=self.sr)

            # Step 2: Extract/prepare lyrics (using placeholder for now)
            print(f"\n[SongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path, voice_path)

            # Step 3: Load voice models
            print(f"\n[SongProcessor] STEP 3: Loading voice models...")
            synthesizer = self._load_voice_models(models_dir, language)

            # Step 4: Synthesize the lyrics in the reference voice
            print(f"\n[SongProcessor] STEP 4: Synthesizing vocals with your voice...")
            wav = encoder_infer.preprocess_wav(voice_path)
            embed = encoder_infer.embed_utterance(wav)
            mels = synthesizer.synthesize_spectrograms([lyrics], [embed])
            mel = mels[0]

            print("[SongProcessor] Vocoding...")
            try:
                synthesized_vocal = vocoder_infer.infer_waveform(
                    mel, normalize=True, batched=False, target=8000, overlap=800
                ).astype(np.float32)
            except Exception as e:
                print(f"[SongProcessor] Vocoder failed: {e}, using Griffin-Lim fallback")
                synthesized_vocal = synthesizer.griffin_lim(mel).astype(np.float32)

            # Peak-normalize synthesized vocal to 0.707
            max_val = np.max(np.abs(synthesized_vocal))
            if max_val > 0:
                target_level = 0.707
                synthesized_vocal = synthesized_vocal * (target_level / max_val)
            synthesized_vocal = np.clip(synthesized_vocal, -1.0, 1.0)
            print(f"[SongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")

            # Step 5: Mix with instrumental
            print(f"\n[SongProcessor] STEP 5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )

            print(f"\n[SongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[SongProcessor] Output saved to: {final_audio}")
            return final_audio
        except Exception as e:
            print(f"\n[SongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise
        finally:
            # Runs on success AND failure so a crashed conversion does not
            # keep the models resident in memory.
            self._release_models()

backend/app/song_conversion/vocal_separator.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""Vocal separation using Demucs model."""
+import torch
+import numpy as np
+import librosa
+import soundfile as sf
+from pathlib import Path
+from typing import Tuple
+import sys
+try:
+    from demucs.pretrained import get_model
+    DEMUCS_AVAILABLE = True
+except ImportError:
+    DEMUCS_AVAILABLE = False
+    print("[Warning] Demucs not available. Song conversion will not work.")
class VocalSeparator:
    """Separates vocals from instrumental music using Demucs."""

    def __init__(self, model_name: str = "htdemucs", device: str = None):
        """
        Initialize vocal separator.

        Args:
            model_name: Demucs model to use ('htdemucs', 'mdx_extra', etc.)
            device: 'cuda' or 'cpu'. Auto-detects if None.

        Raises:
            RuntimeError: If the demucs package could not be imported.
        """
        if not DEMUCS_AVAILABLE:
            raise RuntimeError("Demucs not installed. Install with: pip install demucs")
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"[VocalSeparator] Loading {model_name} on {self.device}...")
        self.model = get_model(model_name)
        self.model = self.model.to(self.device)
        self.model.eval()
        print(f"[VocalSeparator] Model loaded successfully")

    @staticmethod
    def _match_channels(wav: np.ndarray, channels: int) -> np.ndarray:
        """Return ``wav`` as a (channels, samples) array with exactly ``channels`` channels."""
        wav = np.atleast_2d(wav)
        have = wav.shape[0]
        if have == channels:
            return wav
        if channels == 1:
            # Downmix by averaging all input channels
            return wav.mean(axis=0, keepdims=True)
        if have == 1:
            # Duplicate the mono signal onto every requested channel
            return np.repeat(wav, channels, axis=0)
        if have > channels:
            return wav[:channels]
        raise ValueError(f"Cannot convert audio with {have} channels to {channels} channels")

    @staticmethod
    def _to_mono_at(stem: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Downmix a (channels, samples) stem to mono and resample it to ``target_sr``."""
        mono = np.atleast_2d(stem).mean(axis=0).astype(np.float32)
        if orig_sr != target_sr:
            mono = librosa.resample(mono, orig_sr=orig_sr, target_sr=target_sr)
        return mono

    @staticmethod
    def _output_paths(audio_path: Path, output_dir: Path) -> Tuple[Path, Path]:
        """Return (vocals_path, instrumental_path) named after the input file's stem."""
        stem = Path(audio_path).stem
        output_dir = Path(output_dir)
        return output_dir / f"{stem}_vocals.wav", output_dir / f"{stem}_instrumental.wav"

    def separate(self, audio_path: Path, sr: int = 16000) -> Tuple[np.ndarray, np.ndarray]:
        """
        Separate vocals and instrumental from audio file.

        The audio is loaded at the sample rate and channel count the model
        declares (``model.samplerate`` / ``model.audio_channels``), run
        through ``demucs.apply.apply_model`` and the resulting stems are
        downmixed to mono and resampled to ``sr``.

        Args:
            audio_path: Path to audio file
            sr: Sample rate of the returned arrays (default 16000)

        Returns:
            Tuple of (vocals, instrumental) as mono numpy arrays at ``sr``.
            The instrumental is the sum of every non-vocal stem.

        Raises:
            ValueError: If the audio file contains no samples.
        """
        # Demucs models expose no ``separate`` method; inference goes through
        # apply_model. Imported here because the module-level demucs import
        # is optional (guarded by DEMUCS_AVAILABLE).
        from demucs.apply import apply_model

        audio_path = Path(audio_path)
        print(f"[VocalSeparator] Loading audio: {audio_path}")

        model_sr = int(self.model.samplerate)
        model_channels = int(self.model.audio_channels)

        # Load at the model's own sample rate, keeping all channels
        wav, _ = librosa.load(str(audio_path), sr=model_sr, mono=False)
        wav = self._match_channels(np.asarray(wav, dtype=np.float32), model_channels)
        if wav.shape[-1] == 0:
            raise ValueError(f"Audio file contains no samples: {audio_path}")
        print(f"[VocalSeparator] Audio loaded: {wav.shape[-1]} samples at {model_sr}Hz")

        mix = torch.from_numpy(np.ascontiguousarray(wav))
        # Standardize the mixture before inference and undo it afterwards
        # (same scheme as the Demucs command-line tool).
        ref = mix.mean(dim=0)
        mean = ref.mean()
        std = ref.std() + 1e-8  # epsilon guards against all-silent input

        print(f"[VocalSeparator] Separating vocals and instrumental...")
        sys.stdout.flush()
        with torch.no_grad():
            # apply_model takes (batch, channels, samples) and returns
            # (batch, sources, channels, samples)
            estimates = apply_model(self.model, ((mix - mean) / std)[None], device=self.device)[0]
        estimates = (estimates * std + mean).cpu().numpy()

        # Stem order follows model.sources (e.g. drums, bass, other, vocals)
        stems = {
            name: self._to_mono_at(estimates[index], model_sr, sr)
            for index, name in enumerate(self.model.sources)
        }

        length = min((len(track) for track in stems.values()), default=0)
        vocals = stems.get('vocals')
        vocals = np.zeros(length, dtype=np.float32) if vocals is None else vocals[:length]

        # Combine other sources as instrumental
        instrumental = np.zeros(length, dtype=np.float32)
        for name, track in stems.items():
            if name != 'vocals':
                instrumental += track[:length]

        print(f"[VocalSeparator] Separation complete")
        print(f"[VocalSeparator] Vocals shape: {vocals.shape}")
        print(f"[VocalSeparator] Instrumental shape: {instrumental.shape}")
        return vocals, instrumental

    def separate_and_save(self, audio_path: Path, output_dir: Path, sr: int = 16000) -> Tuple[Path, Path]:
        """
        Separate vocals and save both stems as WAV files.

        Output files are named ``<input stem>_vocals.wav`` and
        ``<input stem>_instrumental.wav`` so that results for different input
        files in the same directory do not overwrite each other.

        Args:
            audio_path: Input audio file
            output_dir: Directory to save separated audio
            sr: Sample rate

        Returns:
            Tuple of (vocals_path, instrumental_path)
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        vocals, instrumental = self.separate(audio_path, sr)
        vocals_path, instrumental_path = self._output_paths(audio_path, output_dir)
        print(f"[VocalSeparator] Saving vocals to {vocals_path}")
        sf.write(vocals_path, vocals, sr)
        print(f"[VocalSeparator] Saving instrumental to {instrumental_path}")
        sf.write(instrumental_path, instrumental, sr)
        return vocals_path, instrumental_path

backend/requirements.txt CHANGED Viewed

@@ -12,3 +12,5 @@ scipy>=1.6.0
 scikit-learn>=1.1.0
 unidecode>=1.2.0
 inflect>=6.0.0

 scikit-learn>=1.1.0
 unidecode>=1.2.0
 inflect>=6.0.0
+demucs>=4.0.0
+pydub>=0.25.1