0x01 TLDR;
Following the previous post, where we walked through pretraining a CLM model, this post is a hands-on exercise in fine-tuning.
Fine-tuning = an existing model + training on new data
So, process-wise, we first need a domain dataset.
From it we take the training and test splits.
Next, the data is tokenized.
Then we load an existing model, configure the training arguments, and run a second round of training (the fine-tuning).
The output is the same as in the CLM post: mainly the tokenizer and model files.
0x02 The Code
- This run downloads the model and dataset directly online, with a China-mainland mirror site configured.
import os

# Set the mirror endpoint before importing datasets/transformers so it is
# picked up when those libraries are loaded
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["TRANSFORMERS_OFFLINE"] = "0"

import numpy as np
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
from sklearn.metrics import accuracy_score
def poc_fine_tuning():
    # =============== STEP 1: load the dataset
    # Load the IMDB dataset; with the env vars above it is downloaded online
    # and cached under ~/.cache/huggingface/hub/datasets--imdb
    dataset = load_dataset("imdb")
    print(f'Dataset info: {dataset}')

    # Grab the train and test splits
    train_dataset = dataset['train']
    test_dataset = dataset['test']
    # =============== STEP 2: tokenizer
    # Downloads the tokenizer files from the HF hub
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    print(f'Pretrained tokenizer: {tokenizer}')

    def tokenize_function(example):
        return tokenizer(
            example["text"],
            truncation=True,
            padding="max_length",
            max_length=512,
        )

    # Tokenize the train and test splits of the raw dataset
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_test = test_dataset.map(tokenize_function, batched=True)

    # Set the torch format
    tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
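    # (Optional sketch, not in the original post) Padding every review to 512 tokens
    # keeps things simple but wastes compute; an alternative is dynamic per-batch
    # padding with a data collator:
    #   from transformers import DataCollatorWithPadding
    #   data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # then drop padding="max_length" above and pass data_collator=data_collator to the Trainer.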
    # =============== STEP 3: load the model
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    # =============== STEP 4: set up the Trainer
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        return {"accuracy": accuracy_score(labels, predictions)}

    training_args = TrainingArguments(
        output_dir="./bert-imdb",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="steps",              # explicitly save by steps
        save_steps=500,                     # save every 500 steps (adjust as needed)
        save_total_limit=2,
        eval_strategy="steps",              # named evaluation_strategy in older transformers releases
        eval_steps=500,                     # match save_steps
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",  # use validation loss to select the best checkpoint
        greater_is_better=False,            # lower eval_loss is better
        logging_dir="./logs",
        logging_steps=50,
        fp16=True,                          # mixed precision for efficiency (requires a GPU)
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,
    )
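    # (Optional sketch, not in the original post) Here the test set doubles as the
    # validation set that drives load_best_model_at_end. For a stricter setup you
    # could carve a validation split out of the training data and keep the test
    # set untouched until STEP 6, e.g.:
    #   split = tokenized_train.train_test_split(test_size=0.1, seed=42)
    #   trainer = Trainer(..., train_dataset=split["train"], eval_dataset=split["test"], ...)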
    # ============== STEP 5: start training
    trainer.train()

    # Save the fine-tuned model and tokenizer
    model.save_pretrained("./model_output/final_model")
    tokenizer.save_pretrained("./model_output/final_model")
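    # (Optional, not in the original post) The saved files can be reloaded later
    # for inference without retraining:
    #   model = BertForSequenceClassification.from_pretrained("./model_output/final_model")
    #   tokenizer = BertTokenizer.from_pretrained("./model_output/final_model")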
    # =============== STEP 6: evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation result: {eval_result}")
    # =============== STEP 7: inference test
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    result = classifier("This movie was incredibly boring and too long.")
    print(f"Inference result: {result}")
Output of the final print statements:
Evaluation result: {'eval_loss': 0.23269198834896088, 'eval_accuracy': 0.93304, 'eval_runtime': 31.4731, 'eval_samples_per_second': 794.328, 'eval_steps_per_second': 99.291, 'epoch': 3.0}
Device set to use cuda:0
Inference result: [{'label': 'LABEL_0', 'score': 0.9989494681358337}]
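Here LABEL_0 is class id 0, which in the IMDB dataset is the negative class, so the clearly negative test sentence is classified correctly. If you want the pipeline to report readable class names instead of LABEL_0/LABEL_1, one small addition (a sketch, not part of the original run) is to attach the label names to the model config before saving:
# Sketch (not in the original run): give the class ids readable names so the
# sentiment-analysis pipeline reports "NEGATIVE"/"POSITIVE" (IMDB: 0 = negative, 1 = positive)
model.config.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
model.config.label2id = {"NEGATIVE": 0, "POSITIVE": 1}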