Spaces:
Sleeping
Sleeping
| # import torch | |
| # from trl import SFTTrainer | |
| from datasets import load_dataset | |
| # from transformers import TrainingArguments, TextStreamer | |
| from unsloth.chat_templates import get_chat_template | |
| # from unsloth import FastLanguageModel, is_bfloat16_supported | |
def load_data(dataset, tokenizer, samples=None):
    """Load a ShareGPT-style finetuning dataset and render it to ChatML text.

    Args:
        dataset: Dataset name or path, passed straight to ``load_dataset``.
        tokenizer: Tokenizer to wrap with a ChatML chat template via unsloth's
            ``get_chat_template``.
        samples: Optional cap on training rows; when given, only the first
            ``int(samples)`` rows of the ``train`` split are loaded to reduce
            the training load. ``None`` (default) loads the full split.

    Returns:
        The ``train`` split with an added ``"text"`` column holding the
        ChatML-formatted conversation for each example.
    """
    print("Loading finetuning dataset.")
    # Base models don't ship with a chat template, so we can pick any —
    # ChatML is a popular choice. The mapping translates ShareGPT field
    # names ("from"/"value"/"human"/"gpt") to the template's expected keys.
    tokenizer = get_chat_template(
        tokenizer,
        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        chat_template="chatml",
    )

    def apply_template(examples):
        # Batched map: render each ShareGPT "conversations" entry into a
        # single ChatML string under the new "text" column.
        messages = examples["conversations"]
        text = [
            tokenizer.apply_chat_template(
                message, tokenize=False, add_generation_prompt=False
            )
            for message in messages
        ]
        return {"text": text}

    # Single load_dataset call; the split string optionally slices a prefix
    # of the train split (HF datasets slicing syntax: "train[:N]").
    split = f"train[:{int(samples)}]" if samples is not None else "train"
    dataset = load_dataset(dataset, split=split)
    return dataset.map(apply_template, batched=True)