import jieba
import numpy as np
import xml.dom.minidom
import random
from gensim.models import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten,Activation,Dropout,Merge
from keras.layers import Conv1D, MaxPooling1D, Embedding,GlobalMaxPooling1D
from keras.models import Model
from keras.models import Sequential
from gensim.models.keyedvectors import KeyedVectors
from keras.callbacks import EarlyStopping,ModelCheckpoint,Callback
np.random.seed(1337)
MAX_SEQUENCE_LENGTH = 70
MAX_NB_WORDS = 600000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2
bwfile = open("badwordlist.txt",encoding='utf8')
bwlist = [line.strip('\n') for line in bwfile]
bwfile.close()
for bw in bwlist:
jieba.add_word(bw)
emo = ['like','fear','disgust','anger','surprise','sadness','happiness','none']
smallemo= ['angry/disgusted','happy/like','sad','afraid/surprised','other']
'''
stop_words = ["的", "一", "不", "在", "人", "有", "是", "为", "以", "于", "上", "他", "而",
"后", "之", "来", "及", "了", "因", "下", "可", "到", "由", "这", "与", "也",
"此", "但", "并", "个", "其", "已", "无", "小", "我", "们", "起", "最", "再",
"今", "去", "好", "只", "又", "或", "很", "亦", "某", "把", "那", "你", "乃",
"它","要", "将", "应", "位", "新", "两", "中", "更", "我们", "自己", "没有", "“", "”",
",", "(", ")", " ",'[',']',' ','~','。','!',':','、','/','…']
'''
stop_words = []
sw = open('stopwords.txt')
for line in sw:
line = line.strip("\n")
stop_words.append(line)
sw.close()
def segmentWord(cont):
c = []
for i in cont:
a = list(jieba.cut(i,cut_all=True))
b = " ".join(a)
c.append(b)
return c
def text_to_index_array(p_new_dic, p_sen): # 文本转为索引数字模式
new_sentences = []
for sen in p_sen:
new_sen = []
for word in sen:
try:
new_sen.append(p_new_dic[word]) # 单词转索引数字
except:
new_sen.append(0) # 索引字典里没有的词转为数字0
new_sentences.append(new_sen)
return new_sentences
def readTrain(filename):
DOMTree = xml.dom.minidom.parse(filename)
collection = DOMTree.documentElement
weibos = collection.getElementsByTagName("weibo")
traindata = []
trainlabel = []
data = [[],[],[],[],[],[],[],[]]
for weibo in weibos:
sentence = weibo.getElementsByTagName('sentence')
for e in sentence:
sen = e.childNodes[0].data
if e.getAttribute('opinionated')=='Y':
emotion1 = e.getAttribute('emotion-1-type')
emotion2 = e.getAttribute('emotion-2-type')
data[emo.index(emotion1)].append(sen)
if emotion2 != 'none':
data[emo.index(emotion2)].append(sen)
else:
data[7].append(sen)
#smalldata = [[],[],[],[],[]]
smalldata = [[],[],[],[]]
smalldata[0]=data[2]+data[3]
smalldata[1]=data[0]+data[6]
smalldata[2]=data[5]
smalldata[3]=data[1]+data[4]
smalldata[3] *= 2
#smalldata[4]=data[7]
for i,d in enumerate(smalldata):
for ele in d:
traindata.append(ele)
trainlabel.append(i)
return traindata,trainlabel
traindata,label = readTrain("Training data for Emotion Classification.xml")
#traindata,label = adjustData(traindata,label)
traindata = segmentWord(traindata)
testdata = []
tfile = open('withoutother.txt',encoding='utf8')
for line in tfile:
testdata.append(line)
tfile.close()
testdata = segmentWord(testdata)
testlabel = []
labelfile = open('testlabelnoother.txt')
for l in labelfile:
line = l.strip('\n')
testlabel.append(smallemo.index(line))
labelfile.close()
traintexts = [[word for word in document.split() if word not in stop_words] for document in traindata]
testtexts = [[word for word in document.split() if word not in stop_words] for document in testdata]
word_vectors = KeyedVectors.load_word2vec_format('zhwiki_2017_03.sg_50d.word2vec', binary=False)
#word_vectors = Word2Vec(traintexts+testtexts, size=EMBEDDING_DIM, window=5, min_count=1)
#word_vectors.wv.save_word2vec_format('smallwv.txt',binary=False)
#word_vectors = KeyedVectors.load_word2vec_format('smallwv.txt', binary=False)
gensim_dict = Dictionary()
gensim_dict.doc2bow(word_vectors.vocab.keys(), allow_update=True)
w2indx = {v: k + 1 for k, v in gensim_dict.items()} # 词语的索引,从1开始编号
w2vec = {word: word_vectors[word] for word in w2indx.keys()}
trainseq = text_to_index_array(w2indx, traintexts)
testseq = text_to_index_array(w2indx, testtexts)
traindata = pad_sequences(trainseq, maxlen=MAX_SEQUENCE_LENGTH)
testdata = pad_sequences(testseq, maxlen=MAX_SEQUENCE_LENGTH)
word_index = w2indx
print('Found %s unique tokens.' % len(word_index))
labels = to_categorical(np.asarray(label))
testlabels = to_categorical(np.asarray(testlabel))
indices = np.arange(traindata.shape[0])
np.random.shuffle(indices)
traindata = traindata[indices]
labels = labels[indices]
x_train = traindata[:] # 训练集
y_train = labels[:]# 训练集的标签
x_val = testdata[:]# 测试集,英文原意是验证集
y_val = testlabels[:]
print('Preparing embedding matrix.')
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
if i > MAX_NB_WORDS:
continue
embedding_vector = w2vec.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector # word_index to word_embedding_vector ,<20000(nb_words)
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(nb_words + 1,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=False)
print('Training model.')
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(256,4,padding='valid',activation='relu',strides=1))
model.add(GlobalMaxPooling1D())
#model.add(Dropout(0.2))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(4))
model.add(Activation('softmax'))
# 优化器我这里用了adadelta,也可以使用其他方法
model.compile(loss='categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
# =下面开始训练,nb_epoch是迭代次数,可以高一些,训练效果会更好,但是训练会变慢
#early_stopping =EarlyStopping(monitor='val_loss', patience=2)
#checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True)
model.fit(x_train, y_train,batch_size=64,epochs=4,validation_data=(x_val,y_val))
pre = model.predict_classes(x_val,verbose=0,batch_size=64)
prelabel = []
predlabelfile = open("predfile.txt",'w+')
for p in pre:
predlabelfile.write(str(smallemo[p])+'\n')
predlabelfile.close()
中文微博badword分类
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...