最近,中国计算机学会与西门子在DataFountain平台上举办了“离散制造过程中典型工件的质量符合率预测”比赛,我就尝试了一下。
1.数据清洗
发现数据集中每个特征的最大值和最小值相差非常大,而且数值很大的样本不在少数。尝试用正态分布异常点检测法初步对异常值进行判断,并用均值进行替换。然而,替换完后,还是存在不少的异常点。
2.构建模型
其实,还可以利用加减乘除构建人工特征……并用多个模型融合以提高准确率。这里只是做了一个Baseline:在没有调参数的情况下,分别用SVM、MLP、CNN、LightGBM、XGBoost跑了一遍,发现最后一个(XGBoost)的准确率在50%左右,其他的都在41%-45%之间。
2.1 SVM (这个是用MATLAB跑的)
% Stratified hold-out split.
% The last column of the CSV is the class label; every other column is a
% feature. For each class, the last floor(BiLi*count) rows (at least one)
% go to the test set and the remaining rows go to the training set.
Data = csvread('Train_AfterQinXi.csv');
BiLi = 0.1;  % fraction of each class held out for testing
[m, n] = size(Data);
if m == 0
    error('Train_AfterQinXi.csv contains no rows.');
end

% Group rows by their label value. This no longer requires the file to be
% sorted by label or the labels to be exactly 0..K-1; for a file that does
% satisfy those conditions the resulting split is the same as before.
[~, ~, classIdx] = unique(Data(:, n), 'stable');
numClasses = max(classIdx);

% Number of rows per class, and number of test rows per class (never zero).
Speicesnum = accumarray(classIdx, 1, [numClasses, 1])';
Testnum = max(floor(BiLi * Speicesnum), 1);

% Mark the trailing Testnum(k) rows of every class as test rows.
% NOTE: a class with a single row ends up entirely in the test set.
isTest = false(m, 1);
for k = 1:numClasses
    rows = find(classIdx == k);
    isTest(rows(end - Testnum(k) + 1:end)) = true;
end

% Slice once with the mask instead of growing the matrices inside a loop.
Train_Feature = Data(~isTest, 1:n-1);
Train_Label   = Data(~isTest, n);
Test_Feature  = Data(isTest, 1:n-1);
Test_Label    = Data(isTest, n);
% Min-max scale every feature to [0,1].
% The scaling parameters are fitted on the training set only and then
% applied to the test set, so no test-set statistics leak into the model
% evaluation. Test values outside the training range can therefore fall
% slightly outside [0,1].
% mapminmax normalises row-wise, hence the transposes (features -> rows).
[Train_Feature, ps] = mapminmax(Train_Feature', 0, 1);
Train_Feature = Train_Feature';
Test_Feature = mapminmax('apply', Test_Feature', ps)';
% Fit the multiclass model (fitcecoc, used by the post as its SVM baseline)
% on the training split and predict labels for the held-out split.
model = fitcecoc(Train_Feature, Train_Label);
predict_label = predict(model, Test_Feature);

% Accuracy = fraction of test rows whose predicted label equals the true one.
numCorrect = sum(Test_Label == predict_label);
accuracy = numCorrect / length(Test_Label)  % no semicolon: echo the result
2.2 LightGBM
import lightgbm as lgb
import numpy as np
from pandas import read_csv
from sklearn import datasets
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# Load the labelled training table and the table to predict on.
dataset = read_csv('ZeroOne_Train.csv')
XXX = read_csv('ZeroOne_Test.csv')
values = dataset.values
XY = values

# Column 10 is read as the class label, columns 0-9 as the features.
Y = XY[:, 10]

# The first n_train_hours1 rows are used for training; the remaining rows
# are used as the validation split.
n_train_hours1 = 5398
x_train = XY[:n_train_hours1, 0:10]
trainY = Y[:n_train_hours1]
x_test = XY[n_train_hours1:, 0:10]
testY = Y[n_train_hours1:]

# np.float / np.int were deprecated aliases of the builtins and were removed
# in NumPy 1.24 (AttributeError); the builtins give the same dtypes.
X_train = np.array(x_train, dtype=float)
X_test = np.array(x_test, dtype=float)
y_train = np.array(trainY, dtype=int)
y_test = np.array(testY, dtype=int)
XXX = np.array(XXX, dtype=float)
# LightGBM training parameters for the 4-class baseline.
# The original dict also set 'reg_alpha': 1 and 'reg_lambda': 2. LightGBM
# documents those as aliases of lambda_l1 / lambda_l2; when both an alias and
# the primary name are given, the alias is ignored with a warning. Only the
# primary names are kept, so the effective regularisation is unchanged.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclassova',
    'num_class': 4,
    'metric': 'multi_error',
    'num_leaves': 63,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_seed': 0,
    'bagging_freq': 1,
    'verbose': -1,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 8,
}
# Wrap the training and validation arrays for LightGBM.
train_data = lgb.Dataset(X_train, label=y_train)
validation_data = lgb.Dataset(X_test, label=y_test)

# Single source of truth for the number of boosting rounds (used for both
# training and prediction).
num_boost_round = 1300

# `verbose_eval` was removed from lgb.train() in LightGBM 4.0; the
# log_evaluation callback is its replacement. Fall back to the old keyword
# on releases that predate the callback.
if hasattr(lgb, 'log_evaluation'):
    clf = lgb.train(
        params,
        train_data,
        valid_sets=[validation_data],
        num_boost_round=num_boost_round,
        callbacks=[lgb.log_evaluation(period=100)],
    )
else:
    clf = lgb.train(
        params,
        train_data,
        valid_sets=[validation_data],
        num_boost_round=num_boost_round,
        verbose_eval=100,
    )

# NOTE(review): for multiclass objectives LightGBM's predict() is documented
# to return one score per class per row, not a class index - confirm before
# using y_pred as labels.
y_pred = clf.predict(XXX, num_iteration=num_boost_round)
2.3 XGBoost
import xgboost as xgb
import numpy as np
from pandas import read_csv
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# Load the labelled training table and the table to predict on.
dataset = read_csv('ZeroOne_Train.csv')
XXX = read_csv('ZeroOne_Test.csv')
values = dataset.values
XY = values

# Column 10 is read as the class label, columns 0-9 as the features.
Y = XY[:, 10]

# The first n_train_hours1 rows form the training split; the remaining rows
# form a hold-out split.
n_train_hours1 = 5398
x_train = XY[:n_train_hours1, 0:10]
trainY = Y[:n_train_hours1]
x_test = XY[n_train_hours1:, 0:10]
testY = Y[n_train_hours1:]

# np.float / np.int were deprecated aliases of the builtins and were removed
# in NumPy 1.24 (AttributeError); the builtins give the same dtypes.
X_train = np.array(x_train, dtype=float)
X_test = np.array(x_test, dtype=float)
y_train = np.array(trainY, dtype=int)
y_test = np.array(testY, dtype=int)
XXX = np.array(XXX, dtype=float)
# XGBoost training parameters for the 4-class baseline.
# 'silent' was deprecated in favour of 'verbosity' and is no longer a
# recognised parameter in XGBoost >= 1.0 (it is ignored with a warning).
# 'verbosity': 0 is the documented equivalent of the old 'silent': 1.
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 4,
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'verbosity': 0,
    'eta': 0.1,
    'seed': 1000,
    'nthread': 4,
}
# On Python 3 `params.items()` is a dict view, not a list. Current XGBoost
# releases call `.copy()` on the params object, which a dict view lacks, so
# training failed. The dict itself is the documented form for xgb.train().
# `plst` is kept (as a real list) only for anything that still refers to it.
plst = list(params.items())

# Training matrix: features plus integer class labels.
dtrain = xgb.DMatrix(X_train, y_train)
num_rounds = 500
model = xgb.train(params, dtrain, num_rounds)

# Predict on the test table.
dtest = xgb.DMatrix(XXX)
ans = model.predict(dtest)
2.4 MLP
from __future__ import print_function
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from pandas import read_csv
# Training hyper-parameters for the MLP baseline.
batch_size = 100
num_classes = 4
epochs = 200

# Labelled training table and the table to predict on.
dataset = read_csv('ZeroOne_Train.csv')
XXX = read_csv('ZeroOne_Test.csv')
XY = values = dataset.values

# Column 10 is read as the class label, columns 0-9 as the features.
Y = XY[:, 10]

# The first n_train_hours1 rows are the training split; the remaining rows
# are the validation split.
n_train_hours1 = 5398
train_part, holdout_part = XY[:n_train_hours1], XY[n_train_hours1:]
x_train, trainY = train_part[:, 0:10], train_part[:, 10]
x_test, testY = holdout_part[:, 0:10], holdout_part[:, 10]

# One-hot encode the labels for categorical cross-entropy.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (trainY, testY)
)
# Fully connected classifier: three 128-unit ReLU layers over the 10 input
# features, dropout, then a softmax layer with one unit per class.
model = Sequential([
    Dense(128, input_dim=10, kernel_initializer='normal', activation='relu'),
    Dense(128, kernel_initializer='normal', activation='relu'),
    Dense(128, kernel_initializer='normal', activation='relu'),
    Dropout(0.25),
    Dense(num_classes, activation='softmax'),
])
model.summary()

# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    optimizer=keras.optimizers.Adadelta(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)
# Train the MLP, reporting loss/accuracy on the validation split every epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    validation_data=(x_test, y_test),
)

# Sequential.predict_classes() was removed in TensorFlow/Keras 2.6. For a
# softmax output it was equivalent to taking the argmax over the class axis
# of predict(), which works on every Keras version.
prediction = model.predict(XXX).argmax(axis=-1)
2.5 CNN
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from pandas import read_csv
# Training hyper-parameters for the CNN baseline.
batch_size = 32
num_classes = 4
epochs = 200

# Each sample's img_rows * img_cols features are laid out as a single-channel
# "image" so that Conv2D layers can be applied to them.
img_rows, img_cols = 4, 4
input_shape = (img_rows, img_cols, 1)

# Load the training table. Rows are used in file order (nothing is shuffled
# here) and split by position into a training part and a validation part.
dataset = read_csv('ZeroOne_Train_CNN.csv')
values = dataset.values
XY = values

# The first Featurenumber columns are features; the next column is the label.
Featurenumber = img_rows * img_cols
Y = XY[:, Featurenumber]
n_train_hours1 = 5398
x_train = XY[:n_train_hours1, 0:Featurenumber]
trainY = Y[:n_train_hours1]
x_test = XY[n_train_hours1:, 0:Featurenumber]
testY = Y[n_train_hours1:]

# Reshape with img_rows / img_cols rather than a hard-coded 4x4 so the data
# always agrees with input_shape if the image dimensions are changed.
x_train = x_train.reshape(-1, img_rows, img_cols, 1)
x_test = x_test.reshape(-1, img_rows, img_cols, 1)

# One-hot encode the labels for categorical cross-entropy.
y_train = keras.utils.to_categorical(trainY, num_classes)
y_test = keras.utils.to_categorical(testY, num_classes)
# Small CNN: two 3x3 'same'-padded conv layers, each followed by 2x2 max
# pooling, then dropout, a 16-unit dense layer, dropout and a softmax layer
# with one unit per class.
model = Sequential([
    Conv2D(
        16,
        kernel_size=(3, 3),
        activation='relu',
        padding='same',
        input_shape=input_shape,
    ),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model.summary()

# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    optimizer=keras.optimizers.Adadelta(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)
# Train the CNN, reporting loss/accuracy on the validation split every epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    validation_data=(x_test, y_test),
)

# Per-epoch training (a) and validation (b) accuracy.
# Keras < 2.3 stores them under 'acc' / 'val_acc'; newer releases use
# 'accuracy' / 'val_accuracy', so the old keys alone raise KeyError there.
metrics_log = history.history
a = metrics_log['acc'] if 'acc' in metrics_log else metrics_log['accuracy']
b = metrics_log['val_acc'] if 'val_acc' in metrics_log else metrics_log['val_accuracy']