Introduction

This is a sentiment classification model fine-tuned from bert-base-chinese, reaching roughly 94.6% accuracy on the sentiment-classification dataset. It is a learning project, aimed at picking up NLP fundamentals and getting familiar with the Hugging Face ecosystem.

The code is available in Sentiment-Classification; for the full details, see my blog.

Usage
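
For quick one-off predictions, the pipeline API is the shortest route. A minimal sketch (the example sentence is made up, and the label names depend on the model config's id2label mapping, which may be the generic LABEL_0/LABEL_1):

from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="left0ver/bert-base-chinese-finetune-sentiment-classification",
    tokenizer="bert-base-chinese",  # the base tokenizer, loaded separately as in the script below
)
print(classifier("这部电影真好看!"))  # e.g. [{'label': 'LABEL_1', 'score': 0.99}]

To reproduce the reported evaluation instead, the full script below uses the Trainer API.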


from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import EvalPrediction
from datasets import load_dataset


# Load the fine-tuned model and the matching base tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    "left0ver/bert-base-chinese-finetune-sentiment-classification",
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")


# Tokenize without padding here; DataCollatorWithPadding pads each batch dynamically
dataset = load_dataset("left0ver/sentiment-classification")
tokenized_dataset = dataset.map(
    lambda examples: tokenizer(examples["text"], truncation=True, max_length=512),
    batched=True,
    remove_columns=["text"],
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
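
# Optional sanity check, not part of the original script. Assuming the dataset's
# label column is named "label", the collator pads the batch to its longest
# sequence and renames "label" to "labels" for the Trainer.
batch = data_collator([tokenized_dataset["train"][i] for i in range(2)])
print(batch["input_ids"].shape, batch["labels"])
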
training_args = TrainingArguments(
    output_dir="./char_based_bert_finetune",
    num_train_epochs=10,
    eval_strategy="epoch",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=1e-6,
    lr_scheduler_type="cosine",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="epoch",
    save_total_limit=4,
    seed=42,
    data_seed=42,
    load_best_model_at_end=True,
    # Name of the label field the Trainer should look for
    label_names=["labels"],
    run_name="char_based_bert_finetune",
    report_to="wandb",  # requires wandb; set to "none" to disable logging
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    optim="adamw_torch",
    # eval_on_start=True,  # just for testing the evaluation loop
)

def compute_metrics(eval_pred: EvalPrediction):
    # predictions are already class ids, thanks to preprocess_logits_for_metrics
    predictions, labels = eval_pred
    accuracy = (predictions == labels).mean()
    return {
        'accuracy': accuracy,
    }

def preprocess_logits_for_metrics(logits, labels):
    # Reduce logits to class ids before they are gathered, to save memory
    predictions = logits.argmax(dim=-1)
    return predictions

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    # processing_class replaces the deprecated tokenizer argument
    processing_class=tokenizer,
    data_collator=data_collator,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
trainer.evaluate()
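
trainer.evaluate() returns a metrics dict; its eval_accuracy entry is the value that metric_for_best_model above refers to. To rerun the fine-tuning itself rather than only the evaluation, start from the base checkpoint instead of the released weights. A sketch, assuming the hyperparameters above are the ones behind the reported 94.6%:

# Re-run the fine-tuning from the base checkpoint with the same setup
base_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-chinese", num_labels=2
)
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    data_collator=data_collator,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
trainer.train()
trainer.evaluate()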