Dataset preparation and preprocessing
Load the dataset with the datasets package
Define the preprocessing function to be used by Dataset.map
Define a DataCollator to build the training batches
Load the pretrained model
The ForSequenceClassification head is randomly initialized (it is not part of the pretrained checkpoint)
Fine-tuning
Trainer is a high-level API in the Hugging Face transformers library that lets us set up a training pipeline quickly.
By default, Trainer and TrainingArguments use:
batch size = 8
epochs = 3
the AdamW optimizer
We can also provide a compute_metrics function to report the metrics we care about. These defaults can be overridden through TrainingArguments, as sketched right below.
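As a rough sketch (not part of the original notes; the values shown simply restate the defaults listed above), the corresponding TrainingArguments parameters look like this:

from transformers import TrainingArguments

# illustrative only: these keyword arguments restate the defaults mentioned above
args_sketch = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=8,  # default batch size
    num_train_epochs=3,             # default number of epochs
    learning_rate=5e-5,             # passed to the default AdamW optimizer
)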
import os
import torch
import numpy as np
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification  # note: transformers' AdamW is deprecated in newer releases; torch.optim.AdamW is the usual replacement
# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # the classification head is newly (randomly) initialized, hence the warning printed when loading
sequences = [
"I've been waiting for a HuggingFace course my whole life.",
"This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
batch['labels'] = torch.tensor([1, 1])  # the tokenizer output is a dictionary, so we can simply add a new key-value pair
optimizer = AdamW(model.parameters())
loss = model(**batch).loss  # the loss is computed from the labels provided in the batch; recall from the earlier chapter that the model output includes a loss field
loss.backward()
optimizer.step()
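The snippet above performs a single optimization step. A real loop (not shown in the original notes) would iterate over many batches and clear the gradients at every step, otherwise they accumulate; a minimal sketch, reusing the same batch purely for illustration:

for _ in range(2):
    optimizer.zero_grad()   # clear gradients left over from the previous step
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()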
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")
raw_train_dataset = raw_datasets['train']
tokenized_sentences_1 = tokenizer(raw_train_dataset['sentence1'])
tokenized_sentences_2 = tokenizer(raw_train_dataset['sentence2'])
from pprint import pprint as print
inputs = tokenizer("first sentence", "second one")
print(inputs)
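To check how the tokenizer encodes a sentence pair (the special tokens and the token_type_ids segment markers), the ids can be converted back to tokens; a quick sanity check, assuming the same bert-base-uncased tokenizer:

print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
# expected: ['[CLS]', 'first', 'sentence', '[SEP]', 'second', 'one', '[SEP]']
print(inputs["token_type_ids"])  # 0 for tokens of the first sentence, 1 for the second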
def tokenize_function(sample):
    # any kind of preprocessing can go here, not just tokenization
    # the function receives examples from the Dataset; the fields listed in its
    # features are used to pick out the columns to process
    return tokenizer(sample['sentence1'], sample['sentence2'], truncation=True)
# Dataset.map is memory-efficient (results are stored in the Arrow-backed dataset rather than as plain Python objects)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)
# padding is postponed until the batches are built (dynamic padding)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
samples = tokenized_datasets['train'][:5]
print(samples.keys())
# drop the columns the collator does not need
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
print(samples.keys())
# print the length of each sequence:
print([len(x) for x in samples["input_ids"]])
# now let the data_collator handle the padding
batch = data_collator(samples)  # samples must contain the input_ids field, because that is what the collator pads
print(batch.keys())
# print the lengths again:
print([len(x) for x in batch['input_ids']])
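When writing a manual training loop instead of using Trainer (the Trainer below handles all of this itself), the collator is typically plugged into a PyTorch DataLoader. A sketch following the same column clean-up as above; renaming label to labels is an assumption made here so the model can compute the loss:

from torch.utils.data import DataLoader

train_ds = tokenized_datasets["train"].remove_columns(["idx", "sentence1", "sentence2"])
train_ds = train_ds.rename_column("label", "labels")
train_ds.set_format("torch")
train_dataloader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=data_collator)
for dl_batch in train_dataloader:
    print({k: v.shape for k, v in dl_batch.items()})  # all tensors padded to the same length within the batch
    break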
from transformers import Trainer, TrainingArguments
from datasets import load_metric
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    # the line above can be shortened to:
    # logits, labels = eval_preds   # EvalPrediction unpacks like a tuple
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
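Depending on the installed datasets version, load_metric may emit a deprecation warning or be missing entirely; the evaluate library exposes the same interface, so a drop-in alternative (assuming evaluate is installed) is:

import evaluate
metric = evaluate.load("glue", "mrpc")  # metric.compute(...) works exactly as above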
training_args = TrainingArguments(output_dir='test_trainer')  # output directory; created automatically if it does not exist
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,  # once tokenizer is passed, this can be omitted; a padding collator is created from the tokenizer automatically
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# start training
trainer.train()
# trainer.save_model()
# or pick up from an earlier checkpoint
from transformers.trainer_utils import EvalPrediction, get_last_checkpoint
#last_checkpoint = get_last_checkpoint('test_trainer')
#trainer.train(resume_from_checkpoint=last_checkpoint)
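To see compute_metrics in action, the evaluation loop can also be run on its own after (or instead of) training; for GLUE MRPC it reports accuracy and F1 alongside the loss:

metrics = trainer.evaluate()  # runs the validation set through compute_metrics
print(metrics)                # keys such as eval_loss, eval_accuracy, eval_f1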