import sys
import time

import numpy as np


class Progbar(object):
    """Progbar class copied from keras (https://github.com/fchollet/keras/)
    Displays a progress bar.
    Small edit : added strict arg to update
    # Arguments
        target: Total number of steps expected.
        interval: Minimum visual progress update interval (in seconds).
    """

    def __init__(self, target, width=30, verbose=1):
        self.width = width
        self.target = target
        self.sum_values = {}
        self.unique_values = []
        self.start = time.time()
        self.total_width = 0
        self.seen_so_far = 0
        self.verbose = verbose
    def update(self, current, values=[], exact=[], strict=[]):
        """
        Updates the progress bar.
        # Arguments
            current: Index of current step.
            values: List of tuples (name, value_for_last_step).
                The progress bar will display averages for these values.
            exact: List of tuples (name, value_for_last_step).
                The progress bar will display these values directly.
        """
        for k, v in values:
            if k not in self.sum_values:
                self.sum_values[k] = [v * (current - self.seen_so_far),
                                      current - self.seen_so_far]
                self.unique_values.append(k)
            else:
                self.sum_values[k][0] += v * (current - self.seen_so_far)
                self.sum_values[k][1] += (current - self.seen_so_far)
        for k, v in exact:
            if k not in self.sum_values:
                self.unique_values.append(k)
            self.sum_values[k] = [v, 1]
        for k, v in strict:
            if k not in self.sum_values:
                self.unique_values.append(k)
            self.sum_values[k] = v

        self.seen_so_far = current

        now = time.time()
        if self.verbose == 1:
            prev_total_width = self.total_width
            sys.stdout.write("\b" * prev_total_width)
            sys.stdout.write("\r")

            numdigits = int(np.floor(np.log10(self.target))) + 1
            barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
            bar = barstr % (current, self.target)
            prog = float(current)/self.target
            prog_width = int(self.width*prog)
            if prog_width > 0:
                bar += ('='*(prog_width-1))
                if current < self.target:
                    bar += '>'
                else:
                    bar += '='
            bar += ('.'*(self.width-prog_width))
            bar += ']'
            sys.stdout.write(bar)
            self.total_width = len(bar)

            if current:
                time_per_unit = (now - self.start) / current
            else:
                time_per_unit = 0
            eta = time_per_unit*(self.target - current)
            info = ''
            if current < self.target:
                info += ' - ETA: %ds' % eta
            else:
                info += ' - %ds' % (now - self.start)
            for k in self.unique_values:
                if type(self.sum_values[k]) is list:
                    info += ' - %s: %.4f' % (k,
                        self.sum_values[k][0] / max(1, self.sum_values[k][1]))
                else:
                    info += ' - %s: %s' % (k, self.sum_values[k])

            self.total_width += len(info)
            if prev_total_width > self.total_width:
                info += ((prev_total_width-self.total_width) * " ")

            sys.stdout.write(info)
            sys.stdout.flush()

            if current >= self.target:
                sys.stdout.write("\n")

        if self.verbose == 2:
            if current >= self.target:
                info = '%ds' % (now - self.start)
                for k in self.unique_values:
                    info += ' - %s: %.4f' % (k,
                        self.sum_values[k][0] / max(1, self.sum_values[k][1]))
                sys.stdout.write(info + "\n")

    def add(self, n, values=[]):
        self.update(self.seen_so_far+n, values)
Looking at the docstring first: this is a class borrowed from Keras, and it simply displays a progress bar. Good to know; I'll copy it down for now and dissect it properly in a separate, dedicated section later.
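Before moving on, here is a minimal sketch of how Progbar is driven; the loop and the metric names below are my own invention, not from the repo:

import time

prog = Progbar(target=5)                 # five steps in total
for step in range(1, 6):
    time.sleep(0.1)                      # stand-in for real work
    # "loss" is averaged across steps; "lr" is displayed as-is
    prog.update(step, values=[("loss", 1.0 / step)], exact=[("lr", 0.001)])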
Alright, let's continue with run_epoch().
    def run_epoch(self, train, dev, epoch):
        """Performs one complete pass over the train set and evaluate on dev
        Args:
            train: dataset that yields tuple of sentences, tags
            dev: dataset
            epoch: (int) index of the current epoch
        Returns:
            f1: (python float), score to select model on, higher is better
        """
        # progbar stuff for logging
        batch_size = self.config.batch_size
        nbatches = (len(train) + batch_size - 1) // batch_size
        prog = Progbar(target=nbatches)

        # iterate over dataset
        for i, (words, labels) in enumerate(minibatches(train, batch_size)):
            fd, _ = self.get_feed_dict(words, labels, self.config.lr,
                    self.config.dropout)

            _, train_loss, summary = self.sess.run(
                    [self.train_op, self.loss, self.merged], feed_dict=fd)

            prog.update(i + 1, [("train loss", train_loss)])

            # tensorboard
            if i % 10 == 0:
                self.file_writer.add_summary(summary, epoch*nbatches + i)

        metrics = self.run_evaluate(dev)
        msg = " - ".join(["{} {:04.2f}".format(k, v)
                for k, v in metrics.items()])
        self.logger.info(msg)

        return metrics["f1"]
The first thing that catches the eye is the for loop over an enumerate object.

enumerate() wraps an iterable (such as a list, tuple, or string) into an indexed sequence, yielding both the index and the item at once; it is typically used inside for loops.
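A tiny illustration (my own toy example, in the same spirit as the map demo further down):

>>> for i, item in enumerate(["a", "b", "c"]):
...     print(i, item)
...
0 a
1 b
2 c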
Here the index goes into i, and each yielded batch is unpacked into the tuple (words, labels). The iterable being enumerated is minibatches(train, batch_size); minibatches() is a function in data_utils.py, so let's take a look.
def minibatches(data, minibatch_size):
    """
    Args:
        data: generator of (sentence, tags) tuples
        minibatch_size: (int)
    Yields:
        list of tuples
    """
    x_batch, y_batch = [], []
    for (x, y) in data:
        if len(x_batch) == minibatch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []

        if type(x[0]) == tuple:
            x = zip(*x)
        x_batch += [x]
        y_batch += [y]

    if len(x_batch) != 0:
        yield x_batch, y_batch
The arguments are data and minibatch_size, i.e. the mini_batch from the text (= 20); the function is a generator that keeps handing out fresh batches.

It starts by building two empty lists to hold the data and the labels (x and y). For each element it first checks whether x_batch has already reached minibatch_size; if so, one mini-batch is complete, so it yields and clears both lists.

Then comes a type check: if x[0] is a tuple, x is unzipped with zip(*x). (Why isn't y unzipped? Because when char embeddings are in play, each word in x is a (char_ids, word_id) pair that has to be split into two parallel sequences, whereas y is just a flat list of tag ids.) x and y are then appended to x_batch and y_batch in turn, and at the end, if x_batch is non-empty, the remaining partial batch is yielded as well. A toy run is shown below.
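(My own made-up ids; sentences are lists of word ids, tags are lists of tag ids.)

data = [([1, 2, 3], [0, 1, 0]),
        ([4, 5],    [1, 1]),
        ([6],       [0])]
for x_batch, y_batch in minibatches(data, 2):
    print(x_batch, y_batch)
# [[1, 2, 3], [4, 5]] [[0, 1, 0], [1, 1]]   <- one full mini-batch
# [[6]] [[0]]                               <- the leftover partial batch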
That wraps up the run_epoch() line for i, (words, labels) in enumerate(minibatches(train, batch_size)):. On to the next one: inside the for loop, now that we have i and the (words, labels) tuple, it's time to feed the data in, then run the sess.run operation.
The next lines:
for i, (words, labels) in enumerate(minibatches(train, batch_size)):
    fd, _ = self.get_feed_dict(words, labels, self.config.lr,
            self.config.dropout)
get_feed_dict() does literally what its name says: it packs the fetched data into a dict, the only form in which it can be fed to the graph. The call chain is train.py -> model.train() -> base_model.py -> run_epoch() -> ner_model.py -> run_epoch() -> get_feed_dict().
    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
        """Given some data, pad it and build a feed dictionary
        Args:
            words: list of sentences. A sentence is a list of ids of a list of
                words. A word is a list of ids
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob
        Returns:
            dict {placeholder: value}
        """
        # perform padding of the given data
        if self.config.use_chars:
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
                nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if self.config.use_chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
The arguments are self, words, labels, learning_rate, and dropout. Per the docstring: given some data, pad it and build the feed dictionary. Here words is a list of sentences (20 of them, i.e. one batch?), each sentence being a collection of word ids, and the return value is the dict-typed data.

If we used char_embedding, that is, the earlier operation that concatenates char_embedding with word_embedding, then the pairs first have to be unzipped here via zip(*words); after that, a pad_sequences operation is run on word_ids and char_ids separately, as the following toy example shows.
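(My own toy ids; in the real pipeline each sentence arrives from minibatches() as a lazy zip object, materialized here as plain tuples for readability.)

# a batch of two sentences, each already split by minibatches() into
# (char_ids_per_word, word_ids)
words = [([[7, 8], [9]], [1, 2]),   # sentence 1: two words
         ([[7]],         [3])]      # sentence 2: one word
char_ids, word_ids = zip(*words)
print(char_ids)  # ([[7, 8], [9]], [[7]])
print(word_ids)  # ([1, 2], [3])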
data_utils.py -> pad_sequences()
def pad_sequences(sequences, pad_tok, nlevels=1):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
        nlevels: "depth" of padding, for the case where we have characters ids
    Returns:
        a list of list where each sublist has same length
    """
    if nlevels == 1:
        max_length = max(map(lambda x : len(x), sequences))
        sequence_padded, sequence_length = _pad_sequences(sequences,
                                            pad_tok, max_length)

    elif nlevels == 2:
        max_length_word = max([max(map(lambda x: len(x), seq))
                               for seq in sequences])
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            # all words are same length now
            sp, sl = _pad_sequences(seq, pad_tok, max_length_word)
            sequence_padded += [sp]
            sequence_length += [sl]

        max_length_sentence = max(map(lambda x : len(x), sequences))
        sequence_padded, _ = _pad_sequences(sequence_padded,
                [pad_tok]*max_length_word, max_length_sentence)
        sequence_length, _ = _pad_sequences(sequence_length, 0,
                max_length_sentence)

    return sequence_padded, sequence_length
Into the function. Three parameters:

- sequences: a generator of lists or tuples.
- pad_tok: the token to pad with.
- nlevels: the padding depth. It only matters when we use char_embeddings, so it defaults to 1 and is set to 2 only when char ids are involved.

Return value:

- a list of lists in which every sublist has the same length.

First, the nlevels == 1 case.

It starts by computing the maximum length: max_length = max(map(lambda x : len(x), sequences)).

This line looks complicated but isn't: outermost is a max() picking the largest value, in the middle sits a map(), and innermost is an anonymous lambda. So what is map? map() applies the given function to every element of the given sequence; the first argument, function, is called on each element in turn, and map returns the sequence of results (a list in Python 2, a lazy iterator in Python 3).
Talk is cheap, show me the code.

The following examples show how map is used (wrapped in list() because, under Python 3, map returns a lazy iterator rather than a list):

>>> def square(x):            # compute the square of a number
...     return x ** 2
...
>>> list(map(square, [1, 2, 3, 4, 5]))            # square each element of the list
[1, 4, 9, 16, 25]
>>> list(map(lambda x: x ** 2, [1, 2, 3, 4, 5]))  # same thing with a lambda
[1, 4, 9, 16, 25]
>>> # given two lists, add the entries at matching positions
>>> list(map(lambda x, y: x + y, [1, 3, 5, 7, 9], [2, 4, 6, 8, 10]))
[3, 7, 11, 15, 19]
So here,

max_length = max(map(lambda x : len(x), sequences))

pipes every element of sequences through the lambda (taking its length), lets map do the batch work, and finally lets max() sort out which one is longest.

With max_length, the maximum length, in hand, the next step is the PAD-filling itself:
sequence_padded, sequence_length = _pad_sequences(sequences,
                                    pad_tok, max_length)
Reading it literally, this returns the padded sequences along with their lengths, sequence_length. And here a new function shows up, _pad_sequences(). Exciting; let's look at it.
data_utils.py -> _pad_sequences()
def _pad_sequences(sequences, pad_tok, max_length):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
    Returns:
        a list of list where each sublist has same length
    """
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
        sequence_padded += [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length
Judging from the body, this pads content into the sequences object. Three parameters are passed in:

- sequences: the sequence data, a generator of lists or tuples.
- pad_tok: the content to pad into sequences.
- max_length: the maximum length of the sequences; each sequence is compared against it later to decide how much padding is needed.

Returns:

- a list of sublists that all share the same length.

First it builds two empty lists, sequence_padded and sequence_length, to store the padded sequences and the sequence lengths respectively. (A doubt crossed my mind here: shouldn't every padded sequence end up with length max_length anyway? Read on.)

It then iterates over sequences, first converting each seq to a list. (Wasn't it a list already? Not always: after the zip(*...) unpacking upstream, a sequence can arrive as a tuple, so list() normalizes it before the slicing and concatenation.)

Next comes the actual filling of seq. First, seq[:max_length] takes the first max_length elements; we know that no seq holds more than max_length elements, so this returns the first max_length items, or everything if seq is shorter. Then max(max_length - len(seq), 0) copies of pad_tok are appended at the tail; the caller passes pad_tok = 0, so what gets filled in is 0, as far as I can tell. The example below should make it all click.

Finally seq_ is appended to sequence_padded, while a min() records the true length of seq into sequence_length. That settles the earlier doubt: what is stored is the original length, not max_length. Which immediately raises the next question: what are the original lengths for? (Hold that thought; the answer comes at the very end.) OK, back to pad_sequences: nlevels == 1 is done, and right after the example comes nlevels == 2.
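A minimal check of _pad_sequences (my own numbers):

seqs = [[1, 2, 3, 4], [5, 6]]
padded, lengths = _pad_sequences(seqs, pad_tok=0, max_length=4)
print(padded)   # [[1, 2, 3, 4], [5, 6, 0, 0]]
print(lengths)  # [4, 2]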
elif nlevels == 2:
    max_length_word = max([max(map(lambda x: len(x), seq))
                           for seq in sequences])
    sequence_padded, sequence_length = [], []
    for seq in sequences:
        # all words are same length now
        sp, sl = _pad_sequences(seq, pad_tok, max_length_word)
        sequence_padded += [sp]
        sequence_length += [sl]

    max_length_sentence = max(map(lambda x : len(x), sequences))
    sequence_padded, _ = _pad_sequences(sequence_padded,
            [pad_tok]*max_length_word, max_length_sentence)
    sequence_length, _ = _pad_sequences(sequence_length, 0,
            max_length_sentence)

return sequence_padded, sequence_length
nlevels == 2 is the case where the char_embedding technique is used.

max_length_word = max([max(map(lambda x: len(x), seq)) for seq in sequences])

This line is easy to follow as well; it merely nests one more for loop than before. It walks through every word of every seq in sequences, hunting for the longest word length, much the same as the longest-sequence hunt we just did. For instance:
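(A toy example of mine, where each inner list is one word's char ids.)

seqs = [[[1, 2], [3, 4, 5]],   # sentence 1: words of length 2 and 3
        [[6]]]                 # sentence 2: one word of length 1
print(max([max(map(len, seq)) for seq in seqs]))  # 3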
Then two new empty lists, sequence_padded and sequence_length, are created to hold the padded form and the length of every word in every sentence. Put plainly: each seq inside the current sequences plays the role that sequences itself played in the nlevels == 1 case, and each word inside a seq plays the role a seq played. I trust that clicks; it certainly did for me.

Then a for loop, just like the previous one, runs _pad_sequences() and assigns the results to sp (sequence_padded) and sl (sequence_length), which are appended to the two lists.
max_length_sentence = max(map(lambda x : len(x), sequences))
sequence_padded, _ = _pad_sequences(sequence_padded,
        [pad_tok]*max_length_word, max_length_sentence)
sequence_length, _ = _pad_sequences(sequence_length, 0,
        max_length_sentence)
The closing three lines are interesting. We have just made all the words within each sentence the same length, but the sentences themselves still differ in length, so, exactly as in the nlevels == 1 case, every sentence has to be padded too. Only this time the filler is not a bare pad_tok but a whole blank word, [pad_tok]*max_length_word, which keeps the matrix rectangular; the result is assigned back to sequence_padded. Finally sequence_length gets the same treatment, padded out with 0. A full toy run follows below.
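Putting the whole nlevels == 2 path together on toy data (my own example):

# two sentences of char ids: lengths differ both per word and per sentence
seqs = [[[1, 2], [3, 4, 5]],
        [[6]]]
padded, lengths = pad_sequences(seqs, pad_tok=0, nlevels=2)
print(padded)
# [[[1, 2, 0], [3, 4, 5]], [[6, 0, 0], [0, 0, 0]]]
print(lengths)
# [[2, 3], [1, 0]]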
As for the question that kept nagging me, what sequence_length is even for: the padded tensors all pretend to be max-length, so the model needs the true lengths on the side. They are fed into the graph as the sequence_lengths placeholder so that ops like the dynamic RNN (and the CRF on top of it) stop at each sentence's real end instead of chewing on padding. A sketch of that pattern closes this section.
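This is only my illustration of the idea, assuming TF 1.x (the version this repo targets); it is not a verbatim quote of ner_model.py, and the variable names are mine:

import tensorflow as tf

batch, max_len, dim = 2, 4, 5
word_embeddings = tf.random_normal([batch, max_len, dim])  # padded batch
sequence_lengths = tf.constant([4, 2])  # true lengths from pad_sequences

cell_fw = tf.contrib.rnn.LSTMCell(3)
cell_bw = tf.contrib.rnn.LSTMCell(3)
# with sequence_length set, outputs past each true length are zeroed and
# the final states are read at the last real timestep, not the padded end
(out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, word_embeddings,
        sequence_length=sequence_lengths, dtype=tf.float32)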