Bert

import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate, Conv1D, Activation
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.optimizers import Adam
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler

# crawl-300d-2M.vec--> https://fasttext.cc/docs/en/english-vectors.html
# When are pre-trained embeddings helpful? https://www.aclweb.org/anthology/N18-2084
# There are many pretrained word embedding models:
# fasttext, GloVe, Word2Vec, etc
# crawl-300d-2M.vec is trained on Common Crawl (a huge crawl of the public web)
# it has 2 million words. Each word is represented by a vector of 300 dimensions.

# https://nlp.stanford.edu/projects/glove/
# GloVe is similar to crawl-300d-2M.vec but is trained with a different algorithm.
# glove.840B.300d.zip: Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download)
# tokens mean words here: 2.2M distinct words, and 840B (mostly repeated) words in total

# note that these two pre-trained models give 300d vectors.
EMBEDDING_FILES = [
    # '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec',
    './data/glove.840B.300d.txt'
]

NUM_MODELS = 2
# the maximum number of distinct words to keep from the original texts
# 40_000 is a common choice
# 100_000 also works well
MAX_FEATURES = 100000

# this is the number of training samples fed to the model at each step
BATCH_SIZE = 512

# the `units` parameter in keras.layers.LSTM / CuDNNLSTM
# it is the dimension of the output vector of each LSTM cell
LSTM_UNITS = 128
# 4 * LSTM_UNITS = 512: the size of the concatenated max- and average-pooled outputs
# of a bidirectional LSTM (2 directions x 2 pooling ops) in the commented-out architecture below
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS

EPOCHS = 4

# we will convert each word in a comment_text to a number,
# so a comment_text becomes a list of numbers. How long should this list be?
# we want the length of this list to be a constant -> MAX_LEN
MAX_LEN = 220


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
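
# For illustration (made-up numbers): get_coefs turns one already-split embedding-file
# line into a (word, vector) pair, e.g.
# get_coefs('apple', '0.3', '0.4', '0.5')
# -> ('apple', array([0.3, 0.4, 0.5], dtype=float32))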


def load_embeddings(path):
    # each line in the file looks like
    # apple 0.3 0.4 0.5 0.6 ...
    # that is a word followed by 300 float numbers

    with open(path) as f:
        # return dict(get_coefs(*line.strip().split(' ')) for line in f)
        return dict(get_coefs(*o.strip().split(" ")) for o in tqdm(f))


def build_matrix(word_index, path):
    # path: path to a pre-trained embedding file
    # word_index is a dict of the form {'apple': 123, 'banana': 349, ...}
    # that means word_index[word] gives the index of the word
    # word_index was built from all comment_texts

    # we will construct an embedding_matrix for the words in word_index
    # using pre-trained embedding word vectors from 'path'

    embedding_index = load_embeddings(path)

    # embedding_matrix is a matrix of len(word_index)+1  x 300
    embedding_matrix = np.zeros((len(word_index) + 1, 300))

    # word_index is a dict. Each element is (word:i) where i is the index
    # of the word
    for word, i in word_index.items():
        try:
            # RHS is a vector of 300d
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix
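

# Optional helper (not used below, added for illustration): words missing from the
# pre-trained file keep the all-zeros row created by np.zeros above, so it can be
# useful to know what fraction of the vocabulary is actually covered.
def embedding_coverage(word_index, embedding_index):
    # fraction of words in word_index that have a pre-trained vector
    hits = sum(1 for word in word_index if word in embedding_index)
    return hits / len(word_index)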


def build_model(embedding_matrix, num_aux_targets):
    # a simpler version can be found here
    # https://www.tensorflow.org/tutorials/keras/basic_text_classification

    # Trainable params of the original bidirectional-LSTM version (commented out below): 1,671,687
    # Recall that the number of samples in train.csv is 1_804_874

    # words is a placeholder for an integer sequence of length MAX_LEN (one index per word)
    words = Input(shape=(MAX_LEN,))

    # Embedding is a Keras layer. We use the pre-trained embedding_matrix
    # https://keras.io/layers/embeddings/
    # note that the parameters in this layer are not trainable
    # x is a sequence of MAX_LEN vectors, each of 600 dimensions
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)

    # *embedding_matrix.shape is a short way for
    # input_dim = embedding_matrix.shape[0], output_dim  = embedding_matrix.shape[1]

    # here the author uses a pre-trained embedding matrix
    # instead of training the embeddings from scratch as in the TensorFlow example

    # https://stackoverflow.com/questions/50393666/how-to-understand-spatialdropout1d-and-when-to-use-it
    x = SpatialDropout1D(0.25)(x)
    # 1D convolution over the word-vector sequence, then global max pooling
    x = Conv1D(200, 3, padding='valid', activation='relu', strides=1)(x)  # (MAX_LEN - 2, 200)
    x = GlobalMaxPooling1D()(x)                                           # (200,)
    x = Dropout(0.5)(x)
    hidden = Activation('relu')(x)
    # x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
    #
    # x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
    #
    # hidden = concatenate([
    #     GlobalMaxPooling1D()(x),
    #     GlobalAveragePooling1D()(x),
    # ])
    #
    # hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='tanh')(hidden)])
    # hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid', name='main_output')(hidden)

    # num_aux_targets = 6 since y_aux_train has 6 columns
    aux_result = Dense(num_aux_targets, activation='sigmoid', name='aux_output')(hidden)

    model = Model(inputs=words, outputs=[result, aux_result])

    # model.summary() gives a good view of the model structure

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(clipnorm=0.1),
        metrics=['accuracy'])

    return model
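

# A quick way to inspect the architecture (illustrative only; dummy_matrix is a
# stand-in for the real embedding matrix built below):
# dummy_matrix = np.random.rand(1000, 600).astype('float32')
# build_model(dummy_matrix, num_aux_targets=6).summary()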


train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Take the column 'comment_text' from train,
# then fill all NaN values with the empty string '' (likely redundant)
x_train = train['comment_text'].fillna('').values

# y_train[i] = 1 if train['target'][i] >= 0.5, otherwise 0
y_train = np.where(train['target'] >= 0.5, 1, 0)
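# for illustration: np.where(np.array([0.2, 0.7, 0.5]) >= 0.5, 1, 0) -> array([0, 1, 1])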

y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]

# Take the column 'comment_text' from test,
# then fill all NaN values with the empty string '' (likely redundant)
x_test = test['comment_text'].fillna('').values

# https://keras.io/preprocessing/text/
# tokenizer is an instance of the Tokenizer class with several helper methods
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)

# we apply the fit_on_texts method of tokenizer to x_train and x_test
# it initializes several attributes inside tokenizer (word counts, word index, ...)
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L139
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L210

tokenizer.fit_on_texts(list(x_train) + list(x_test))
# for example, after fit_on_texts, we can inspect
# tokenizer.word_counts      # an OrderedDict of word -> count
# tokenizer.document_counts  # an int
# tokenizer.word_index is a dict of words with corresponding indices
# there are 410_046 different words across all 'comment_text' values
# len(tokenizer.word_index) == 410_046


# these words come from all 'comment_text' in train.csv and test.csv

# tokenizer.index_word: the inverse of tokenizer.word_index


# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L267
# we will convert each word in a comment_text to a number,
# so a comment_text becomes a list of numbers.


x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
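# for illustration (indices are hypothetical, they depend on the fitted tokenizer):
# tokenizer.texts_to_sequences(['you are great'])
# -> [[10, 31, 254]]  # each word replaced by its index; unknown/out-of-range words are dropped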

# https://keras.io/preprocessing/sequence/
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/sequence.py
# each comment_text is now a list of word indices
# we want the length of this list to be a constant -> MAX_LEN
# if the list is longer, we truncate it
# if shorter, we pad it with 0's at the beginning
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)
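# for illustration: padding/truncation happens at the beginning ('pre') by default, e.g.
# sequence.pad_sequences([[5, 8, 9]], maxlen=5) -> array([[0, 0, 5, 8, 9]], dtype=int32)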

# create an embedding_matrix
# after this, embedding_matrix is a matrix of size
# (len(tokenizer.word_index) + 1) x 600
# we concatenate two matrices, 600 = 300 + 300
# (with only the GloVe file active in EMBEDDING_FILES, the width is 300 instead)
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)
# embedding_matrix.shape
# == (410047, 600) when both embedding files are used

# embedding_matrix[i] is a 600d vector representation of the word whose index is i
# embedding_matrix[10]
# tokenizer.index_word[10] == 'you'


checkpoint_predictions = []
weights = []

# https://keras.io/callbacks/#learningratescheduler

for model_idx in range(NUM_MODELS):
    # build a fresh model with the same architecture for each run
    model = build_model(embedding_matrix, y_aux_train.shape[-1])
    # we train each model for EPOCHS epochs
    # at the start of every epoch, the learning rate is reset by a LearningRateScheduler
    # (we are using the Adam optimizer)
    # https://towardsdatascience.com/learning-rate-scheduler-d8a55747dd90

    # https://github.com/keras-team/keras/blob/master/keras/callbacks.py#L921
    # the learning rate here is the 'lr' attribute of the Adam optimizer
    # see https://github.com/keras-team/keras/blob/master/keras/optimizers.py#L460
    # Adam additionally adapts the effective step size of each parameter during every batch
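    # with EPOCHS = 4, the scheduler below sets:
    #   global_epoch 0 -> lr = 1.0e-3
    #   global_epoch 1 -> lr = 6.0e-4
    #   global_epoch 2 -> lr = 3.6e-4
    #   global_epoch 3 -> lr = 2.16e-4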
    for global_epoch in range(EPOCHS):
        model.fit(
            x_train,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=1,
            callbacks=[
                LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch), verbose=1)
            ]
        )
        # model.predict will give two outputs: main_output (target) and aux_output
        # we only take main_output
        checkpoint_predictions.append(model.predict(x_test, batch_size=2048)[0].flatten())
        weights.append(2 ** global_epoch)

# take a weighted average of the checkpoint predictions from both models;
# later epochs get larger weights (2 ** global_epoch). predictions is an np.array
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)

submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': predictions
})

submission.to_csv('./result/submission.csv', index=False)