In [None]:
!pip install transformers
!pip install datasets
!pip install numpy
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-

In [None]:
import os
import random
from getpass import getpass

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_metric, Dataset, metric, concatenate_datasets
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, RobertaConfig

In [None]:
def tokenize(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the global ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` key holding the texts to encode.

    Returns:
        Whatever the tokenizer returns when asked for ``max_length`` padding,
        truncation, a 256-token limit and PyTorch tensors.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
        "return_tensors": "pt",
    }
    return tokenizer(batch['text'], **tokenizer_options)


def compute_metrics(eval_pred):
    """Score a batch of model outputs with the global ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of ``metric.compute`` for the arg-max class of every row of
        ``logits`` (taken over the last axis) against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)


def random_swap(sentence, n=5, rng=None):
    """Return a copy of ``sentence`` with random pairs of tokens swapped.

    Args:
        sentence: Sequence of tokens (for example a list of words). It is never
            modified; all swaps are applied to a copy, so immutable sequences
            such as tuples are accepted as well.
        n: Maximum number of swaps to perform. It is capped at
            ``len(sentence) - 1``; a value of zero or less performs no swap.
        rng: Optional object exposing ``sample`` (e.g. ``random.Random(seed)``)
            used to draw the positions. When omitted, the module-level
            ``random`` generator is used, so ``random.seed`` still controls
            the outcome.

    Returns:
        A new list holding the same tokens as ``sentence`` after the swaps.
        Sequences with fewer than two tokens are returned unchanged (as a list).
    """
    # Work on a copy so the caller's sequence is left untouched.
    tokens = list(sentence)
    length = len(tokens)
    if length <= 1:
        return tokens

    sampler = random if rng is None else rng
    for _ in range(min(n, length - 1)):
        # sample() without replacement guarantees two distinct positions.
        idx1, idx2 = sampler.sample(range(length), 2)
        tokens[idx1], tokens[idx2] = tokens[idx2], tokens[idx1]
    return tokens

# One seed for the augmentation and the shuffle, so that a fresh
# "Restart Kernel -> Run All" rebuilds the same dataset.
SEED = 42
random.seed(SEED)

# Accuracy metric used by compute_metrics(). `trust_remote_code=True` is passed
# explicitly because `datasets` warns that it will become mandatory for this metric.
metric = load_metric("accuracy", trust_remote_code=True)

# Load the dataset
dataset = load_dataset("vedantgaur/GPTOutputs-MWP", trust_remote_code=True)

# Read each column once instead of re-materialising it for every use.
prompt_texts = dataset['train']['Prompts']
output_texts = dataset['train']['Outputs']

# Stack the 'Prompts' (label 0) and 'Outputs' (label 1) columns into a single
# text/labels dataset.
prompts = Dataset.from_dict({'text': prompt_texts, 'labels': [0] * len(prompt_texts)})
outputs = Dataset.from_dict({'text': output_texts, 'labels': [1] * len(output_texts)})
combined_dataset = concatenate_datasets([prompts, outputs])


def _swap_words(example):
    """Return ``example`` with its text rebuilt after a random word-level swap."""
    text = example['text']
    # A plain string has to be split on whitespace directly: ' '.join(text) on
    # a str puts a space between every character, which would turn the word
    # swap into a character swap and rewrite the text as single letters.
    # Non-string values keep the previous join-then-split handling.
    words = text.split() if isinstance(text, str) else ' '.join(text).split()
    return {'text': ' '.join(random_swap(words)), 'labels': example['labels']}


# Apply data augmentation
# NOTE(review): the swap is applied to every example before the train/test
# split, so the evaluation split is perturbed as well - confirm this is intended.
combined_dataset = combined_dataset.map(_swap_words)
# Shuffle the data
shuffled_dataset = combined_dataset.shuffle(seed=SEED)

# Define the configuration for the model: two output labels and a hidden
# dropout of 0.3.
config = RobertaConfig.from_pretrained("roberta-base", num_labels=2, hidden_dropout_prob=0.3)

# Tokenize the 'text' column
tokenizer = AutoTokenizer.from_pretrained("roberta-base", config=config)
tokenizedDataset = shuffled_dataset.map(tokenize, batched=True)

# Split the tokenized dataset into a training set and a validation set.
# The seed is fixed so the same examples land in each split on every run.
SPLIT_SEED = 42
tokenizedDataset = tokenizedDataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Model: roberta-base
# The config object is handed to the model so the dropout configured above is
# actually used; previously it was only given to the tokenizer and the model was
# built with the checkpoint's default dropout. `num_labels=2` is carried by
# the config.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", config=config)

# Hyper-parameters for the fine-tuning run. Evaluation and checkpointing both
# follow the "steps" schedule, and the best checkpoint is reloaded when
# training finishes.
BertTraining_args = TrainingArguments(
    output_dir="test_trainer",
    # optimisation
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpointing
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
)

# Early-stopping callback: patience of one evaluation, threshold of 0.01.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=1,
    early_stopping_threshold=0.01,
)

# Trainer wiring the model to the train/test splits of the tokenized dataset
trainer = Trainer(
    model=model,
    args=BertTraining_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenizedDataset['train'],
    eval_dataset=tokenizedDataset['test'],
    callbacks=[early_stopping_callback],
)

# Fine-tune the model (kept as the last expression so the cell displays the result)
trainer.train()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Map:   0%|          | 0/6096 [00:00<?, ? examples/s]

Map:   0%|          | 0/6096 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy
500,0.0612,0.02911,0.993443


TrainOutput(global_step=915, training_loss=0.0392882383586279, metrics={'train_runtime': 632.8636, 'train_samples_per_second': 23.114, 'train_steps_per_second': 1.446, 'total_flos': 1924394258903040.0, 'train_loss': 0.0392882383586279, 'epoch': 3.0})

In [None]:
# Predictions on the held-out split, scored with the same metric used during training
predictions = trainer.predict(tokenizedDataset['test'])
classPredictions = np.argmax(predictions.predictions, axis=-1)
evalMetrics = compute_metrics((predictions.predictions, predictions.label_ids))
print(evalMetrics)

# Never keep the Hugging Face token in the notebook: read it from the HF_TOKEN
# environment variable, and fall back to a hidden interactive prompt if unset.
apiToken = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

# Single definition of the target repository so the model and tokenizer cannot
# be pushed to different places by accident.
HUB_REPO_ID = "SkwarczynskiP/roberta-base-finetuned-vedantgaur-AI-and-human-generated"

# Push the fine-tuned model to Huggingface
model.push_to_hub(HUB_REPO_ID, token=apiToken)

# Push the tokenizer to Huggingface
tokenizer.push_to_hub(HUB_REPO_ID, token=apiToken)

{'accuracy': 0.9934426229508196}


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SkwarczynskiP/roberta-base-uncased-finetuned-vedantgaur-AI-and-human-generated/commit/2a2b5c95844131ccbee9f4cb271d1485db8b01e7', commit_message='Upload tokenizer', commit_description='', oid='2a2b5c95844131ccbee9f4cb271d1485db8b01e7', pr_url=None, pr_revision=None, pr_num=None)