Spaces:

mojtaba-nafez
/

persian-poem-recommender-based-on-text

Runtime error

App Files Files Community

persian-poem-recommender-based-on-text / datasets.py

mojtaba-nafez

add initial files to deploy

2fa2727 almost 3 years ago

raw

history blame contribute delete

7.69 kB

	import os
	import cv2
	import torch
	import albumentations as A
	import config as CFG


	class PoemTextDataset(torch.utils.data.Dataset):
	    """
	    torch Dataset serving poem-text pairs to PoemTextModel.
	    ...
	    Attributes:
	    -----------
	    dataset_dict : list of dict
	        the records handed to the constructor (each has "beyt", "text" and "id" keys)
	    encoded_poems : dict-like
	        poem tokenizer output for every "beyt" in dataset_dict, produced with
	        padding=True, truncation=True and max_length=CFG.poems_max_length.
	    encoded_texts : dict-like
	        text tokenizer output for every "text" in dataset_dict, produced with
	        padding=True, truncation=True and max_length=CFG.text_max_length.

	    Methods:
	    --------
	    __getitem__(idx)
	        returns the item stored at index idx.
	    __len__()
	        number of records in the dataset
	    """

	    def __init__(self, dataset_dict):
	        """
	        Store the records and tokenize all poems and texts up front.
	        Tokenizer classes and checkpoints are looked up in the configs (CFG).

	        Parameters:
	        -----------
	        dataset_dict: list of dict
	            a list containing dictionaries which have "beyt", "text" and "id" keys.
	        """
	        self.dataset_dict = dataset_dict

	        # Tokenizers are loaded in the same order as before: poem first, then text.
	        poem_tokenizer_cls = CFG.tokenizers[CFG.poem_encoder_model]
	        tokenize_poems = poem_tokenizer_cls.from_pretrained(CFG.poem_tokenizer)
	        text_tokenizer_cls = CFG.tokenizers[CFG.text_encoder_model]
	        tokenize_texts = text_tokenizer_cls.from_pretrained(CFG.text_tokenizer)

	        beyts = [record['beyt'] for record in dataset_dict]
	        self.encoded_poems = tokenize_poems(
	            beyts,
	            padding=True,
	            truncation=True,
	            max_length=CFG.poems_max_length,
	        )

	        texts = [record['text'] for record in dataset_dict]
	        self.encoded_texts = tokenize_texts(
	            texts,
	            padding=True,
	            truncation=True,
	            max_length=CFG.text_max_length,
	        )

	    def __getitem__(self, idx):
	        """
	        Build the model input for the record at position idx.

	        Parameters:
	        -----------
	        idx: int
	            index of the data to get

	        Returns:
	        --------
	        item: dict
	            "beyt" and "text" map each tokenizer output field to a tensor built from
	            that field's entry at idx; "id" is the id of the record at idx.
	        """
	        poem_features = {}
	        for field, column in self.encoded_poems.items():
	            poem_features[field] = torch.tensor(column[idx])

	        text_features = {}
	        for field, column in self.encoded_texts.items():
	            text_features[field] = torch.tensor(column[idx])

	        return {
	            "beyt": poem_features,
	            "text": text_features,
	            'id': self.dataset_dict[idx]['id'],
	        }

	    def __len__(self):
	        """
	        Number of records in the dataset.

	        Returns:
	        --------
	        length: int
	            len() of the dataset_dict stored at construction time
	        """
	        record_count = len(self.dataset_dict)
	        return record_count


	class CLIPDataset(torch.utils.data.Dataset):
	    """
	    torch Dataset for CLIPModel.
	    ...
	    Attributes:
	    -----------
	    dataset_dict : list of dict
	        dataset containing poem-image or text-image pairs
	    encoded : dict-like
	        tokenizer output for the beyts (poem-image pairs) or texts (text-image pairs)
	        found in dataset_dict, produced with padding=True, truncation=True and the
	        matching max_length from the configs.
	    transforms: albumentations.BasicTransform
	        transforms to apply to the images

	    Methods:
	    --------
	    __getitem__(idx)
	        returns item with index idx.
	    __len__()
	        represents length of dataset
	    """

	    def __init__(self, dataset_dict, transforms, is_image_poem_pair=True):
	        """
	        Init class, save dataset_dict and transforms and tokenize either the poems or the texts.
	        The tokenizers are chosen based on configs.

	        Parameters:
	        -----------
	        dataset_dict: list of dict
	            a list of dictionaries. Each needs an "image" key (read in __getitem__) and a
	            "beyt" key (when is_image_poem_pair is True) or a "text" key (otherwise).
	        transforms: albumentations.BasicTransform
	            transforms to apply to the images
	        is_image_poem_pair: Bool, optional
	            if set to False, dataset has text-image pairs and must use the corresponding text tokenizer.
	            else has poem-images pairs and uses the poem tokenizer.
	        """
	        self.dataset_dict = dataset_dict
	        # using the poem tokenizer to encode poems or text tokenizer to encode text (based on configs).
	        if is_image_poem_pair:
	            poem_tokenizer = CFG.tokenizers[CFG.poem_encoder_model].from_pretrained(CFG.poem_tokenizer)
	            self.encoded = poem_tokenizer(
	                [item['beyt'] for item in dataset_dict],
	                padding=True,
	                truncation=True,
	                max_length=CFG.poems_max_length,
	            )
	        else:
	            text_tokenizer = CFG.tokenizers[CFG.text_encoder_model].from_pretrained(CFG.text_tokenizer)
	            self.encoded = text_tokenizer(
	                [item['text'] for item in dataset_dict],
	                padding=True,
	                truncation=True,
	                max_length=CFG.text_max_length,
	            )
	        self.transforms = transforms

	    def __getitem__(self, idx):
	        """
	        returns a dict having data with index idx. the dict is used as an input to the CLIPModel.

	        Parameters:
	        -----------
	        idx: int
	            index of the data to get

	        Returns:
	        --------
	        item: dict
	            "text": tokenizer output fields for index idx, each converted to a tensor
	            (the key is "text" for both poem-image and text-image datasets).
	            "image": the transformed image as a float tensor with dims permuted (2, 0, 1).

	        Raises:
	        -------
	        FileNotFoundError
	            if the image file is missing or cannot be decoded by OpenCV.
	        """
	        item = {}
	        # getting text from encoded texts
	        item["text"] = {
	            key: torch.tensor(values[idx])
	            for key, values in self.encoded.items()
	        }

	        # opening the image (path is CFG.image_path directly followed by the file name)
	        image_file = f"{CFG.image_path}{self.dataset_dict[idx]['image']}"
	        image = cv2.imread(image_file)
	        # cv2.imread signals failure by returning None instead of raising; fail here with
	        # the offending path rather than with an opaque assertion inside cvtColor.
	        if image is None:
	            raise FileNotFoundError(f"Could not read image file: {image_file}")
	        # converting BGR to RGB for transforms
	        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	        # apply transforms
	        image = self.transforms(image=image)['image']
	        # move the last axis to the front
	        # (assumes the transform output is height x width x channels - TODO confirm)
	        item['image'] = torch.tensor(image).permute(2, 0, 1).float()

	        return item

	    def __len__(self):
	        """
	        returns the length of the dataset

	        Returns:
	        --------
	        length: int
	            length using the length of dataset_dict we saved in class
	        """
	        return len(self.dataset_dict)



	def get_transforms(mode="train"):
	    """
	    returns transforms to use on image based on mode

	    Parameters:
	    -----------
	    mode: str, optional
	        to distinguish between train and val/test transforms. Both currently use the
	        same pipeline, so the value does not change the result.

	    Returns:
	    --------
	    transforms: albumentations.Compose
	        a pipeline that resizes the image to CFG.size x CFG.size and then normalizes it
	        with max_pixel_value=255.0.
	    """
	    # The train and val/test pipelines were duplicated verbatim in two branches; a single
	    # definition keeps them from drifting apart. Branch on `mode` here once train-only
	    # augmentations are introduced.
	    return A.Compose(
	        [
	            A.Resize(CFG.size, CFG.size, always_apply=True),  # resizing image to CFG.size
	            A.Normalize(max_pixel_value=255.0, always_apply=True),  # normalizing image values
	        ]
	    )