import torch
import torchaudio
import gradio as gr
import os
import tempfile
import numpy as np
import struct

# Define the model ID for the 0.16 kbps codec config
MODEL_CONFIG = "lucadellalib/focalcodec_12_5hz"

# Load the model globally using torch.hub
codec = None
try:
    print("Loading FocalCodec model...")
    codec = torch.hub.load(
        repo_or_dir="lucadellalib/focalcodec",
        model="focalcodec",
        config=MODEL_CONFIG,
        force_reload=False,
        trust_repo=True,
    )
    codec.eval()
    for param in codec.parameters():
        param.requires_grad = False
    if torch.cuda.is_available():
        codec = codec.cuda()
        print("Model loaded successfully on GPU!")
    else:
        print("Model loaded successfully on CPU!")
except Exception as e:
    print(f"ERROR loading model via torch.hub: {e}")
    print("\nTrying alternative installation method...")
    try:
        import subprocess
        subprocess.check_call([
            "pip", "install",
            "focalcodec@git+https://github.com/lucadellalib/focalcodec.git@main",
        ])
        import focalcodec
        codec = focalcodec.FocalCodec.from_pretrained(MODEL_CONFIG)
        codec.eval()
        for param in codec.parameters():
            param.requires_grad = False
        if torch.cuda.is_available():
            codec = codec.cuda()
        print("Model loaded via pip installation!")
    except Exception as e2:
        print(f"ERROR with alternative method: {e2}")
        codec = None


def save_compressed_codes_optimal(toks, codes, fc_file_path, codec):
    """Save codes with optimal bit packing to achieve true 160 bps."""
    codes_cpu = codes.cpu().numpy()
    toks_cpu = toks.cpu().numpy()

    print(f"\n=== Optimal Compression ===")
    print(f"Codes shape: {codes.shape}")
    print(f"Codes dtype: {codes.dtype}")

    # Determine actual bits needed based on token range:
    # the smallest bit width that can represent max_token, capped at 16 bits
    max_token = int(toks_cpu.max())
    bits_needed = max(1, min(16, max_token.bit_length()))

    print(f"Token range: 0 to {max_token}")
    print(f"Bits needed per token: {bits_needed}")

    # If codes are already binary (batch, time, bits), use them directly
    if len(codes.shape) == 3 and codes.dtype in [torch.bool, torch.uint8]:
        print(f"Using binary codes directly: {codes.shape[2]} bits per token")
        # Pack the binary codes
        codes_flat = codes_cpu.flatten()
        packed_bits = np.packbits(codes_flat)
        bits_per_token = codes.shape[2]
        num_tokens = codes.shape[1]
    else:
        # Pack tokens manually using exact bit width
        print(f"Packing tokens with {bits_needed} bits each")
        toks_flat = toks_cpu.flatten().astype(np.uint32)
        num_tokens = len(toks_flat)

        # Convert each token to a fixed-width binary string and pack
        total_bits = num_tokens * bits_needed

        # Create bit array
        bit_array = []
        for tok in toks_flat:
            bits = format(int(tok), f'0{bits_needed}b')
            bit_array.extend([int(b) for b in bits])

        # Pad to byte boundary
        while len(bit_array) % 8 != 0:
            bit_array.append(0)

        # Pack into bytes
        packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
        bits_per_token = bits_needed

    # Write to file
    with open(fc_file_path, 'wb') as f:
        # Magic number
        f.write(b'FC01')
        # Metadata (assumed layout: token count and bits per token, little-endian)
        f.write(struct.pack('<IB', int(num_tokens), int(bits_per_token)))
        # Packed token payload
        f.write(packed_bits.tobytes())