After last time's tokenization, the next step is encoding with the embedding layer. Details like these only become second nature after you've typed them out by hand again and again. Getting old, nothing for it; just have to practice more.
The overall flow is sketched in the figure:
[figure: token IDs from the dataloader → token embedding layer + positional embedding layer → input embeddings]
Pay attention to what a few of the parameters mean, such as the 4 * 4 * 256 shape, and the exact meanings of max_length, batch_size, stride, and so on. Actually, once you relate it to our own experience of learning skills, taking classes, working, getting an education, even arguing, the weight updates of a large model, like the neural connections between brain cells, stop seeming so mysterious.
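To make max_length and stride concrete before the real code, here is a minimal sketch (toy token IDs, made-up values) of how a sliding window cuts a token stream into (input, target) pairs. With stride smaller than max_length the windows overlap; the script below uses stride = max_length, so there is no overlap.

# Minimal sketch: how max_length and stride carve a token stream
# into (input, target) pairs. Toy IDs, for illustration only.
token_ids = [10, 20, 30, 40, 50, 60, 70, 80]
max_length, stride = 4, 2

for i in range(0, len(token_ids) - max_length, stride):
    print(token_ids[i:i + max_length], '->',
          token_ids[i + 1:i + max_length + 1])

# Prints:
# [10, 20, 30, 40] -> [20, 30, 40, 50]
# [30, 40, 50, 60] -> [40, 50, 60, 70]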
Code
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the full text once
        token_ids = tokenizer.encode(txt)

        # Slide a window of max_length tokens over the stream;
        # the target chunk is the input chunk shifted right by one token
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4,
                         max_length=256, stride=128,
                         shuffle=True, drop_last=True,
                         num_workers=0):
    tokenizer = tiktoken.get_encoding('gpt2')
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,   # drop the last incomplete batch
        num_workers=num_workers
    )
    return dataloader


with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

output_dim = 256       # embedding dimension
vocab_size = 50257     # GPT-2 vocabulary size
max_length = 4         # tokens per training example

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,   # stride == max_length: no overlap between windows
    shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print('Token IDs: \n', inputs)
print('\nInputs shape:\n', inputs.shape)

# Look up a 256-dim vector for each of the 8 x 4 token IDs
token_embeddings = token_embedding_layer(inputs)
print('token_embeddings shape:', token_embeddings.shape)

# One learnable vector per position 0 .. context_length-1
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print('pos_embeddings shape:', pos_embeddings.shape)

# Token and positional embeddings are summed (broadcast over the batch)
input_embeddings = token_embeddings + pos_embeddings
print('input_embeddings.shape:', input_embeddings.shape)
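Before looking at the output, one aside: torch.nn.Embedding is nothing more than a learnable lookup table, and indexing it with token IDs selects rows of its weight matrix. A minimal sketch with small made-up sizes to verify:

import torch

# nn.Embedding is a learnable lookup table: row i of .weight
# is the vector for token ID i. Tiny sizes for illustration.
emb = torch.nn.Embedding(num_embeddings=6, embedding_dim=3)
ids = torch.tensor([4, 0, 4])

out = emb(ids)                  # shape [3, 3]
rows = emb.weight[ids]          # plain row indexing, same values
print(torch.equal(out, rows))   # True

During training, backpropagation updates exactly those rows that were looked up, which is the sense in which the "weights" of the embedding get adjusted.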
Output
D:\Python\Python310\python.exe D:\test\Llm_Scratch\pt_data_set_loader.py
Token IDs: 
tensor([[ 40, 367, 2885, 1464],
[ 1807, 3619, 402, 271],
[10899, 2138, 257, 7026],
[15632, 438, 2016, 257],
[ 922, 5891, 1576, 438],
[ 568, 340, 373, 645],
[ 1049, 5975, 284, 502],
[ 284, 3285, 326, 11]])
Inputs shape:
torch.Size([8, 4])
token_embeddings shape: torch.Size([8, 4, 256])
pos_embeddings shape: torch.Size([4, 256])
input_embeddings.shape: torch.Size([8, 4, 256])
Process finished with exit code 0
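The final addition works because of broadcasting: token_embeddings has shape [8, 4, 256], pos_embeddings has shape [4, 256], and PyTorch implicitly expands the latter across the batch dimension, so every sequence in the batch gets the same positional vector added at each position. A small sketch with made-up sizes:

import torch

# Broadcasting sketch: [batch, seq, dim] plus [seq, dim] adds the
# same positional row to every element of the batch.
tok = torch.zeros(2, 4, 3)               # stand-in for token embeddings
pos = torch.arange(12.0).reshape(4, 3)   # stand-in for positional embeddings

out = tok + pos                          # pos broadcast over the batch dim
print(out.shape)                         # torch.Size([2, 4, 3])
print(torch.equal(out[0], out[1]))       # True: same offsets per position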