import torch
import torchaudio
import gradio as gr
import os
import tempfile
import numpy as np
import struct

# Define the model ID for the 0.16 kbps codec config
MODEL_CONFIG = "lucadellalib/focalcodec_12_5hz"

# Load the model globally using torch.hub
codec = None
try:
    print("Loading FocalCodec model...")
    codec = torch.hub.load(
        repo_or_dir="lucadellalib/focalcodec",
        model="focalcodec",
        config=MODEL_CONFIG,
        force_reload=False,
        trust_repo=True,
    )
    codec.eval()
    for param in codec.parameters():
        param.requires_grad = False
    if torch.cuda.is_available():
        codec = codec.cuda()
        print("Model loaded successfully on GPU!")
    else:
        print("Model loaded successfully on CPU!")
except Exception as e:
    print(f"ERROR loading model via torch.hub: {e}")
    print("\nTrying alternative installation method...")
    try:
        import subprocess
        subprocess.check_call([
            "pip", "install",
            "focalcodec@git+https://github.com/lucadellalib/focalcodec.git@main",
        ])
        import focalcodec
        codec = focalcodec.FocalCodec.from_pretrained(MODEL_CONFIG)
        codec.eval()
        for param in codec.parameters():
            param.requires_grad = False
        if torch.cuda.is_available():
            codec = codec.cuda()
        print("Model loaded via pip installation!")
    except Exception as e2:
        print(f"ERROR with alternative method: {e2}")
        codec = None


def save_compressed_codes_optimal(toks, codes, fc_file_path, codec):
    """Save codes with optimal bit packing to achieve true 160 bps."""
    codes_cpu = codes.cpu().numpy()
    toks_cpu = toks.cpu().numpy()

    print(f"\n=== Optimal Compression ===")
    print(f"Codes shape: {codes.shape}")
    print(f"Codes dtype: {codes.dtype}")

    # Determine actual bits needed based on token range:
    # the smallest bit width that can represent max_token, capped at 16 bits
    max_token = int(toks_cpu.max())
    bits_needed = max(1, min(16, max_token.bit_length()))

    print(f"Token range: 0 to {max_token}")
    print(f"Bits needed per token: {bits_needed}")

    # If codes are already binary (batch, time, bits), use them directly
    if len(codes.shape) == 3 and codes.dtype in [torch.bool, torch.uint8]:
        print(f"Using binary codes directly: {codes.shape[2]} bits per token")
        # Pack the binary codes
        codes_flat = codes_cpu.flatten()
        packed_bits = np.packbits(codes_flat)
        bits_per_token = codes.shape[2]
        num_tokens = codes.shape[1]
    else:
        # Pack tokens manually using exact bit width
        print(f"Packing tokens with {bits_needed} bits each")
        toks_flat = toks_cpu.flatten().astype(np.uint32)
        num_tokens = len(toks_flat)

        # Convert each token to a fixed-width binary string and pack
        total_bits = num_tokens * bits_needed

        # Create bit array
        bit_array = []
        for tok in toks_flat:
            bits = format(int(tok), f'0{bits_needed}b')
            bit_array.extend([int(b) for b in bits])

        # Pad to byte boundary
        while len(bit_array) % 8 != 0:
            bit_array.append(0)

        # Pack into bytes
        packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
        bits_per_token = bits_needed

    # Write to file
    with open(fc_file_path, 'wb') as f:
        # Magic number
        f.write(b'FC01')
        # Metadata (assumed layout: token count and bits per token, little-endian)
        f.write(struct.pack('<IB', int(num_tokens), int(bits_per_token)))
        # Packed token payload
        f.write(packed_bits.tobytes())