Uploaded model
- Developed by: Cosmobillian
- License: apache-2.0
- Finetuned from model: unsloth/orpheus-3b-0.1-ft
This Llama-based model was trained 2x faster with Unsloth and Hugging Face's TRL library.
model_name = "Cosmobillian/orpheust-tts-base-fine-tune"
model_name = "canopylabs/orpheus-3b-0.1-ft"
Restart the kernel if needed
```python
from snac import SNAC
import torch
import numpy as np
import soundfile as sf
import librosa
import torchaudio.transforms as T
import ipywidgets as widgets
import IPython.display as ipd
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import snapshot_download
from ipywebrtc import AudioRecorder
from IPython.display import Audio, display
```
device = "cuda" if torch.cuda.is_available() else "mps" #or cpu if you aren't on an M type mac print(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
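The SNAC codec loaded above turns 24 kHz audio into three codebook layers of increasing temporal resolution; the interleaving code further down assumes one coarse code, two mid codes, and four fine codes per frame. A minimal sketch of that assumption (not part of the original notebook):

```python
# Sketch only: check the 1:2:4 length ratio of the three SNAC code layers
dummy = torch.zeros(1, 1, 24000)  # one second of silence at 24 kHz
with torch.inference_mode():
    c1, c2, c3 = snac_model.encode(dummy)
print(c1.shape[1], c2.shape[1], c3.shape[1])  # expect roughly N, 2N, 4N
```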
Download only the model config and safetensors:
```python
model_path = snapshot_download(
    repo_id=model_name,
    allow_patterns=[
        "config.json",
        "*.safetensors",
        "model.safetensors.index.json",
    ],
    ignore_patterns=[
        "optimizer.pt",
        "pytorch_model.bin",
        "training_args.bin",
        "scheduler.pt",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.json",
        "merges.txt",
        "tokenizer.*",
    ],
)
```
```python
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model.to(device)
```
CHANGE THIS TO YOUR OWN FILE AND TEXT
```python
my_wav_file_is = "/content/drive/MyDrive/Colab Notebooks/short15s_sezen_aksu.wav"
and_the_transcript_is = "Ayşeciğin filmi var zeynep değirmencioğlunun hafızasını kaybediyor yolda birileri buluyorlar "

the_model_should_say = [
    "Hayat, her gün karşımıza yeni fırsatlar ve zorluklar çıkarır. Önemli olan, bu anları nasıl değerlendirdiğimizdir. Bazen küçük bir adım bile büyük değişimlerin başlangıcı olabilir. Her sabah yeni bir başlangıçtır; dünü geride bırakıp bugünü en iyi şekilde değerlendirmek elimizde. İnsan, hedeflerine ulaşmak için kararlılıkla ilerlemeli ve karşılaştığı engellerden yılmadan yoluna devam etmelidir.",
]
```

Tokenising your input for the prompt

Here we tokenise the reference audio and transcript you provided, as well as the prompts you want the model to say.

The template is:

start_of_human, text, end_of_text, end_of_human, start_of_ai, start_of_speech, speech, end_of_speech, end_of_ai, then again start_of_human, text, end_of_text, end_of_human, start_of_ai, start_of_speech, and the model generates speech tokens from there.
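For reference, these are the hard-coded special-token IDs the snippets below rely on. The IDs come straight from the code; the names are the usual Orpheus conventions and should be read as descriptive labels rather than guaranteed tokenizer strings:

```python
# Descriptive map of the hard-coded IDs used below (labels are assumptions, IDs are from the code)
SPECIAL_TOKENS = {
    128259: "start_of_human",
    128009: "end_of_text",
    128260: "end_of_human",
    128261: "start_of_ai",
    128257: "start_of_speech",
    128258: "end_of_speech",
    128262: "end_of_ai",
    128263: "padding",
    128266: "first audio (SNAC) token",
}
```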
```python
filename = my_wav_file_is
audio_array, sample_rate = librosa.load(filename, sr=24000)

def tokenise_audio(waveform):
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    waveform = waveform.unsqueeze(0)

    with torch.inference_mode():
        codes = snac_model.encode(waveform)

    # Interleave the three SNAC codebooks into a flat stream of 7 tokens per frame,
    # offsetting each position into its own 4096-wide slice above the text vocabulary (128266)
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2 * i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4 * i].item() + 128266 + (2 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 1].item() + 128266 + (3 * 4096))
        all_codes.append(codes[1][0][(2 * i) + 1].item() + 128266 + (4 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 2].item() + 128266 + (5 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 3].item() + 128266 + (6 * 4096))

    return all_codes
```
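As a quick optional check (not in the original notebook), the interleaving should always emit a multiple of seven tokens, all shifted into the audio-token range:

```python
# Optional sanity check: 7 interleaved tokens per coarse SNAC frame,
# every token offset into the audio range that starts at 128266
_example = tokenise_audio(audio_array)
assert len(_example) % 7 == 0
assert all(t >= 128266 for t in _example)
```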
```python
myts = tokenise_audio(audio_array)

start_tokens = torch.tensor([[128259]], dtype=torch.int64)                         # start_of_human
end_tokens = torch.tensor([[128009, 128260, 128261, 128257]], dtype=torch.int64)   # end_of_text, end_of_human, start_of_ai, start_of_speech
final_tokens = torch.tensor([[128258, 128262]], dtype=torch.int64)                 # end_of_speech, end_of_ai

voice_prompt = and_the_transcript_is
prompt_tokked = tokenizer(voice_prompt, return_tensors="pt")
input_ids = prompt_tokked["input_ids"]

# Reference example: SOH | transcript | EOT EOH SOAI SOS | audio tokens | EOS EOAI
zeroprompt_input_ids = torch.cat([start_tokens, input_ids, end_tokens, torch.tensor([myts]), final_tokens], dim=1)
```
```python
prompts = the_model_should_say

all_modified_input_ids = []
for prompt in prompts:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    # Reference example followed by: SOH | target text | EOT EOH SOAI SOS -> generate speech from here
    second_input_ids = torch.cat([zeroprompt_input_ids, start_tokens, input_ids, end_tokens], dim=1)
    all_modified_input_ids.append(second_input_ids)

all_padded_tensors = []
all_attention_masks = []

max_length = max([modified_input_ids.shape[1] for modified_input_ids in all_modified_input_ids])

for modified_input_ids in all_modified_input_ids:
    padding = max_length - modified_input_ids.shape[1]
    # Left-pad with the padding token (128263) and mask the padded positions out
    padded_tensor = torch.cat([torch.full((1, padding), 128263, dtype=torch.int64), modified_input_ids], dim=1)
    attention_mask = torch.cat(
        [torch.zeros((1, padding), dtype=torch.int64), torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64)],
        dim=1,
    )
    all_padded_tensors.append(padded_tensor)
    all_attention_masks.append(attention_mask)

all_padded_tensors = torch.cat(all_padded_tensors, dim=0)
all_attention_masks = torch.cat(all_attention_masks, dim=0)

input_ids = all_padded_tensors.to(device)
attention_mask = all_attention_masks.to(device)
```
Run inference
```python
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        # attention_mask=attention_mask,
        max_new_tokens=1500,
        do_sample=True,
        temperature=0.5,
        # top_k=40,
        top_p=0.9,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=128258,  # stop at end_of_speech
        # end_token_id=128009
    )

# Append the end_of_ai token (128262) so the output matches the full template
generated_ids = torch.cat([generated_ids, torch.tensor([[128262]]).to(device)], dim=1)
```
Convert output to speech

```python
token_to_find = 128257    # start_of_speech
token_to_remove = 128258  # end_of_speech

# Check whether start_of_speech exists in the tensor and crop to everything after its last occurrence
token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)

if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
else:
    cropped_tensor = generated_ids

# Drop the end_of_speech tokens from each row
processed_rows = []
for row in cropped_tensor:
    masked_row = row[row != token_to_remove]
    processed_rows.append(masked_row)

code_lists = []
for row in processed_rows:
    # row is a 1D tensor with its own length
    row_length = row.size(0)
    new_length = (row_length // 7) * 7  # largest multiple of 7 that fits in this row
    trimmed_row = row[:new_length]
    trimmed_row = [t - 128266 for t in trimmed_row]  # shift back into the raw SNAC code range
    code_lists.append(trimmed_row)
```
```python
def redistribute_codes(code_list):
    # Undo the 7-token interleaving: split the flat stream back into SNAC's three codebook layers
    layer_1 = []
    layer_2 = []
    layer_3 = []
    for i in range((len(code_list) + 1) // 7):
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1] - 4096)
        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6] - (6 * 4096))

    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)
    return audio_hat
```
```python
my_samples = []
for code_list in code_lists:
    samples = redistribute_codes(code_list)
    my_samples.append(samples)
```
```python
# Run this if soundfile is not already installed:
!pip install soundfile
```
```python
import soundfile as sf
from IPython.display import Audio, display
from google.colab import files

for idx, samples in enumerate(my_samples):
    # Convert the tensor to a NumPy array
    audio = samples.detach().squeeze().cpu().numpy()
    filename = f'audio_{idx}.wav'

    # Save as a WAV file
    sf.write(filename, audio, 24000)

    # Show the inline audio player
    display(Audio(audio, rate=24000))

    # Trigger the browser download
    files.download(filename)
```