安装transformers
pip install transformers
常用import
from transformers import BertTokenizer, BertModel, BertForMaskedLM
定义下载bert模型
下载中文 bert-wwm 模型的地址
将config文件、vocab文件、bin文件放在/model/(bert)的下面
- bert_config.json 改名为 config.json
- chinese_wwm_pytorch.bin 改名为 pytorch_model.bin
# Load the local Chinese bert-wwm checkpoint (whole-word-masking variant of
# BERT, optimized for Chinese). The directory must contain config.json,
# vocab.txt and pytorch_model.bin, renamed as described in the notes above.
# Fixes vs. original: BertConfig was used without being imported (the import
# at the top of the notes only brings in BertTokenizer/BertModel/
# BertForMaskedLM), and the bert_path assignment was duplicated.
from transformers import BertConfig, BertModel

bert_path = './model/bert-wwm/'
model_config = BertConfig.from_pretrained(bert_path)
bert = BertModel.from_pretrained(bert_path, config=model_config)
使用bert的分词器
from transformers import BertTokenizer
# Load the WordPiece vocabulary shipped with the local checkpoint
# (bert_path is defined earlier in these notes).
tokenizer = BertTokenizer.from_pretrained(bert_path)
分词
# Tokenize a Chinese sentence; for the Chinese vocab this typically yields
# one token per character.
s1="我要去看北京太难梦"
print(tokenizer.tokenize(s1))
句子编码向量化
# encode() maps the sentence to vocabulary ids and adds the special
# [CLS]/[SEP] tokens around it.
print(tokenizer.encode('吾儿莫慌'))
句子处理编码,加入分割
# encode_plus() on a sentence pair joins the two segments with a separator
# and returns a dict of encodings (input_ids, token_type_ids, ...).
sen_code = tokenizer.encode_plus('这个故事没有终点', "正如星空没有彼岸")
编码后的句子反过来
# Map the ids back to token strings to inspect what was actually encoded.
print(tokenizer.convert_ids_to_tokens(sen_code['input_ids']))
# Encode one sentence into the inputs BERT expects. `text` and `max_length`
# are supplied by the surrounding code.
# Bug fix: the original passed `ad_to_max_length=True` — a typo for
# `pad_to_max_length=True` — so the output was never padded to max_length.
text_dict = tokenizer.encode_plus(
    text,                        # Sentence to encode.
    add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
    max_length=max_length,       # Pad & truncate all sentences.
    pad_to_max_length=True,      # Fixed typo: was `ad_to_max_length=True`.
    return_attention_mask=True,  # Construct attn. masks.
    # return_tensors='pt',       # Return pytorch tensors.
)
# NOTE: newer transformers versions deprecate `pad_to_max_length=True` in
# favour of `padding='max_length', truncation=True`.
encode_plus产生字典的数据索引
# Unpack the fields produced by encode_plus:
# input_ids — token ids; attention_mask — 1 for real tokens, 0 for padding;
# token_type_ids — segment ids (0 = first sentence, 1 = second).
input_ids, attention_mask, token_type_ids = text_dict['input_ids'], text_dict['attention_mask'], text_dict['token_type_ids']
编码输出
# Run the encoded ids through BERT; `ids` is presumably the input_ids tensor
# built above — confirm against the caller.
# NOTE(review): `output_all_encoded_layers` is the keyword from the old
# pytorch-pretrained-bert package; in the `transformers` library the
# equivalent switch is `output_hidden_states=True`, and the return structure
# differs — confirm which package version these notes target.
result = bert(ids, output_all_encoded_layers=True)
bert模型返回
result = (
[encoder_0_output, encoder_1_output, ..., encoder_11_output],
pool_output
)
返回bert的12层的transformer的输出
output_all_encoded_layers参数设置为False,那么result中的第一个元素就不是列表了,只是encoder_11_output,大小为
[batch_size, sequence_length, hidden_size]的张量,可以看作bert对于这句话的表示
transformers有任务pipeline
情感分类
from transformers import pipeline
# With no model argument, pipeline() fetches a default sentiment-analysis
# checkpoint and returns a ready-to-call classifier.
nlp = pipeline("sentiment-analysis")
print(nlp("I hate you"))  # expected: NEGATIVE label
print(nlp("I love you"))  # expected: POSITIVE label
抽取式问答
from transformers import pipeline
# Extractive question answering: the answer is a span copied verbatim from
# `context`, not freshly generated text.
nlp = pipeline("question-answering")
context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the `run_squad.py`.
"""
# Each call returns the predicted answer span for the given question.
print(nlp(question="What is extractive question answering?", context=context))
print(nlp(question="What is a good example of a question answering dataset?", context=context))