Local code path: E:\CODE\pythonProject\other\Test_Learn\HuggingFace
HuggingFace usage tutorial (the comments section has a workaround for datasets that fail to load)
Environment setup
- transformers
  The library for the pretrained models published on HuggingFace
- datasets
  The library for the datasets published on HuggingFace (all loaded through its API)
- Dataset-loading failures (load a local dataset instead)
  from datasets import load_from_disk
  dataset = load_from_disk("path")
- Saving a dataset to disk
  dataset.save_to_disk(dataset_dict_path="path")
- Reading and writing local CSV files (see the sketch after this list)
- Reading and writing local JSON files (see the sketch after this list)
- Saving model parameters (see the sketch after this list)
- Loading data and running a test
- Dataset-loading failures (loading a local dataset, as shown above)
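A minimal sketch of these steps with the datasets library and torch.save; the file paths and the model variable below are illustrative placeholders, not part of the original notes:

from datasets import load_dataset
import torch

# Read local CSV / JSON files as datasets (paths are placeholders)
csv_dataset = load_dataset('csv', data_files='./data/ChnSentiCorp.csv', split='train')
json_dataset = load_dataset('json', data_files='./data/ChnSentiCorp.json', split='train')

# Write a dataset back to disk in CSV / JSON form
csv_dataset.to_csv('./data/ChnSentiCorp_copy.csv')
json_dataset.to_json('./data/ChnSentiCorp_copy.json')

# Save / reload the parameters of a trained model (model is whatever nn.Module was trained)
torch.save(model.state_dict(), './model/params.pt')
model.load_state_dict(torch.load('./model/params.pt'))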
Text classification example (BERT) (Jupyter notebook format)
- Define the dataset
import torch
from datasets import load_dataset

# Define the dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label

dataset = Dataset('train')
len(dataset), dataset[0]
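If seamew/ChnSentiCorp fails to download (the issue mentioned at the top of these notes), the same class can read a local copy instead. A sketch assuming the dataset was previously saved with save_to_disk() under the placeholder path './data/ChnSentiCorp':

import torch
from datasets import load_from_disk

class LocalDataset(torch.utils.data.Dataset):
    def __init__(self, split):
        # './data/ChnSentiCorp' is a placeholder for a DatasetDict saved with save_to_disk()
        self.dataset = load_from_disk('./data/ChnSentiCorp')[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        return self.dataset[i]['text'], self.dataset[i]['label']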
- Load the vocabulary and tokenizer
from transformers import BertTokenizer

# Load the vocabulary and tokenizer
token = BertTokenizer.from_pretrained('bert-base-chinese')
token
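To see what the tokenizer produces, a quick single-sentence check (a sketch; the example sentence is arbitrary):

out = token.encode_plus(text='这家酒店的位置很方便',
                        truncation=True,
                        padding='max_length',
                        max_length=20,
                        return_tensors='pt')
print(token.decode(out['input_ids'][0]))  # [CLS] + the characters + [SEP] + [PAD] padding
out['input_ids'].shape, out['attention_mask'].shape, out['token_type_ids'].shape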
- Encoding
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    # Encode the batch
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt',
                                   return_length=True)

    # input_ids: the token ids after encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    #print(data['length'], data['length'].max())
    return input_ids, attention_mask, token_type_ids, labels

# Data loader
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels
- Load the Chinese BERT model
from transformers import BertModel

# Load the pretrained model
pretrained = BertModel.from_pretrained('bert-base-chinese')

# The pretrained model itself is not trained here, so it does not need gradients (freeze its parameters)
for param in pretrained.parameters():
    param.requires_grad_(False)

# Trial forward pass through the model
out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)

out.last_hidden_state.shape  # [batch_size, tokenized sequence length, hidden size per token]
- Define the downstream task model
# Define the downstream task model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():  # context manager that disables gradient calculation
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        # By BERT's design, sentence-level sentiment classification only needs the feature of
        # the 0th token ([CLS]) as input to the classifier
        out = self.fc(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out

model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape
- Training
from transformers import AdamW

# Training
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()  # loss function
model.train()

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 300:
        break
- Testing
# Testing
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):
        if i == 5:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)

test()
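To run the trained classifier on a single new sentence, a minimal sketch (the example review is arbitrary, and the assumption that label 0 means negative and 1 means positive should be checked against your copy of ChnSentiCorp):

sentence = '房间很干净,服务也很周到'  # arbitrary example review, not from the dataset
encoded = token.encode_plus(text=sentence,
                            truncation=True,
                            padding='max_length',
                            max_length=500,
                            return_tensors='pt')

model.eval()
with torch.no_grad():
    out = model(input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                token_type_ids=encoded['token_type_ids'])

print(out.argmax(dim=1).item())  # predicted class index (assumed: 0 = negative, 1 = positive)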
Chinese fill-in-the-blank (BERT) (Jupyter notebook format)
- Define the dataset
import torch
from datasets import load_dataset

# Define the dataset
class Dataset(torch.utils.data.Dataset):  # defined directly as a subclass of torch.utils.data.Dataset
    def __init__(self, split):
        dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)  # split selects which part of the data to load (e.g. train)

        def f(data):
            return len(data['text']) > 30

        self.dataset = dataset.filter(f)  # pass in the filter function

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        return text

dataset = Dataset('train')
len(dataset), dataset[0]
- Load the vocabulary and tokenizer
from transformers import BertTokenizer

# Load the vocabulary and tokenizer
token = BertTokenizer.from_pretrained('bert-base-chinese')
token
- Define the batch-processing (collate) function
# Define the batch-processing (collate) function
def collate_fn(data):
    # Encode the batch
    data = token.batch_encode_plus(batch_text_or_text_pairs=data,
                                   truncation=True,       # truncate sentences longer than max_length
                                   padding='max_length',  # pad shorter sentences to max_length
                                   max_length=30,
                                   return_tensors='pt',   # return PyTorch tensors
                                   return_length=True)

    # input_ids: the token ids after encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']

    # Replace the 15th token of every sentence (the middle one) with [MASK],
    # and keep the original token as the label
    labels = input_ids[:, 15].reshape(-1).clone()
    input_ids[:, 15] = token.get_vocab()[token.mask_token]

    #print(data['length'], data['length'].max())
    return input_ids, attention_mask, token_type_ids, labels

# Data loader
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    break

print(len(loader))
print(token.decode(input_ids[0]))
print(token.decode(labels[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape
- Load the Chinese BERT model
from transformers import BertModel

# Load the pretrained model
pretrained = BertModel.from_pretrained('bert-base-chinese')

# The pretrained model is not trained here, so it does not need gradients (freeze its parameters)
for param in pretrained.parameters():
    param.requires_grad_(False)

# Trial forward pass through the model
out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)

out.last_hidden_state.shape
- Define the downstream task model
# Define the downstream task model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Fill-in-the-blank can predict any token in the vocabulary, so the number of output classes is vocab_size
        self.decoder = torch.nn.Linear(768, token.vocab_size, bias=False)
        self.bias = torch.nn.Parameter(torch.zeros(token.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        out = self.decoder(out.last_hidden_state[:, 15])
        return out

model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape
- Training
from transformers import AdamW

# Training
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()
model.train()

for epoch in range(5):
    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader):
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 50 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)
            print(epoch, i, loss.item(), accuracy)
- Testing
# Testing
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):
        if i == 15:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

        print(token.decode(input_ids[0]))
        print(token.decode(labels[0]), token.decode(out[0]))  # true token vs. predicted token

    print(correct / total)

test()
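To fill the blank in a sentence of your own, a minimal sketch that mirrors the training setup (encode to length 30 and mask position 15); the example sentence is illustrative and should be longer than 15 characters so that position 15 is a real character:

text = '这家酒店的房间非常干净,地理位置也很好,下次还会再来'  # arbitrary example sentence
encoded = token.encode_plus(text=text,
                            truncation=True,
                            padding='max_length',
                            max_length=30,
                            return_tensors='pt')

input_ids = encoded['input_ids']
input_ids[:, 15] = token.get_vocab()[token.mask_token]  # mask position 15, as during training

model.eval()
with torch.no_grad():
    out = model(input_ids=input_ids,
                attention_mask=encoded['attention_mask'],
                token_type_ids=encoded['token_type_ids'])

print(token.decode(input_ids[0]))          # the sentence with [MASK] at position 15
print(token.decode(out.argmax(dim=1)[0]))  # the character predicted for the masked position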
Chinese sentence-pair relation inference (BERT) (Jupyter notebook format)
- Define the dataset
import torch
from datasets import load_dataset
import random

# Define the dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

        def f(data):
            return len(data['text']) > 40

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']

        # Split one sentence into a first half and a second half
        sentence1 = text[:20]
        sentence2 = text[20:40]
        label = 0

        # With probability 1/2, replace the second half with an unrelated sentence
        if random.randint(0, 1) == 0:
            j = random.randint(0, len(self.dataset) - 1)
            sentence2 = self.dataset[j]['text'][20:40]
            label = 1

        return sentence1, sentence2, label

dataset = Dataset('train')
sentence1, sentence2, label = dataset[0]
len(dataset), sentence1, sentence2, label
- Load the vocabulary and tokenizer
from transformers import BertTokenizer

# Load the vocabulary and tokenizer
token = BertTokenizer.from_pretrained('bert-base-chinese')
token
- Define the batch-processing (collate) function
# Define the batch-processing (collate) function
def collate_fn(data):
    sents = [i[:2] for i in data]
    labels = [i[2] for i in data]

    # Encode the batch of sentence pairs
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=45,
                                   return_tensors='pt',
                                   return_length=True,
                                   add_special_tokens=True)

    # input_ids: the token ids after encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    # token_type_ids: 0 for the first sentence and the special tokens, 1 for the second sentence
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    #print(data['length'], data['length'].max())
    return input_ids, attention_mask, token_type_ids, labels

# Data loader
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
print(token.decode(input_ids[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels
- Load the Chinese BERT model
from transformers import BertModel

# Load the pretrained model
pretrained = BertModel.from_pretrained('bert-base-chinese')

# The pretrained model is not trained here, so it does not need gradients (freeze its parameters)
for param in pretrained.parameters():
    param.requires_grad_(False)

# Trial forward pass through the model
out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)

out.last_hidden_state.shape
- Define the downstream task model
# Define the downstream task model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        # Classify from the feature of the 0th token ([CLS])
        out = self.fc(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out

model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape
- Training
from transformers import AdamW

# Training
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()
model.train()

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 300:
        break
- Testing
# Testing
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):
        if i == 5:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        pred = out.argmax(dim=1)
        correct += (pred == labels).sum().item()
        total += len(labels)

    print(correct / total)

test()
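To score a new sentence pair with the trained model, a minimal sketch (the two half-sentences are illustrative; by the dataset construction above, class 0 means the second half follows the first and class 1 means it was swapped for an unrelated one):

s1 = '这家酒店的地理位置很好,出门'  # arbitrary first half
s2 = '就是地铁站,周边吃饭也很方便'  # arbitrary second half
encoded = token.encode_plus(text=s1,
                            text_pair=s2,
                            truncation=True,
                            padding='max_length',
                            max_length=45,
                            return_tensors='pt')

model.eval()
with torch.no_grad():
    out = model(input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                token_type_ids=encoded['token_type_ids'])

print(out.argmax(dim=1).item())  # 0: consecutive halves, 1: unrelated second half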