Spaces:
Sleeping
Sleeping
| # import torch | |
| # from trl import SFTTrainer | |
| from datasets import load_dataset | |
| # from transformers import TrainingArguments, TextStreamer | |
| from unsloth.chat_templates import get_chat_template | |
| # from unsloth import FastLanguageModel, is_bfloat16_supported | |
def load_data(dataset, tokenizer, samples=None):
    """Load a ShareGPT-style finetuning dataset and render it to ChatML text.

    Args:
        dataset: Dataset name or path, passed straight to ``load_dataset``.
        tokenizer: Tokenizer to wrap with a ChatML chat template via unsloth's
            ``get_chat_template``.
        samples: Optional cap on training rows; when given, only the first
            ``int(samples)`` rows of the ``train`` split are loaded to reduce
            the training load. ``None`` (default) loads the full split.

    Returns:
        The ``train`` split with an added ``"text"`` column holding the
        ChatML-formatted conversation for each example.
    """
    print("Loading finetuning dataset.")
    # Base models don't ship with a chat template, so we can pick any —
    # ChatML is a popular choice. The mapping translates ShareGPT field
    # names ("from"/"value"/"human"/"gpt") to the template's expected keys.
    tokenizer = get_chat_template(
        tokenizer,
        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        chat_template="chatml",
    )

    def apply_template(examples):
        # Batched map: render each ShareGPT "conversations" entry into a
        # single ChatML string under the new "text" column.
        messages = examples["conversations"]
        text = [
            tokenizer.apply_chat_template(
                message, tokenize=False, add_generation_prompt=False
            )
            for message in messages
        ]
        return {"text": text}

    # Single load_dataset call; the split string optionally slices a prefix
    # of the train split (HF datasets slicing syntax: "train[:N]").
    split = f"train[:{int(samples)}]" if samples is not None else "train"
    dataset = load_dataset(dataset, split=split)
    return dataset.map(apply_template, batched=True)