二元逻辑回归实现鸢尾花数据分类（python）

说明：

本文利用python实现二元逻辑回归，没有加正则项。挑选iris数据前100个样本作为训练集，他们分属于两个类别，样本特征选择第1列（花萼长度x1）和第2列（花萼宽度x2）。
程序以函数形式实现，附录中给出封装式程序实现。代码均以矩阵计算的思路编写。
四部分程序是连续的，拷贝在一起就能完整运行。
没考虑过拟合、欠拟合等问题。

1.查看鸢尾花数据集

from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np

#%matplotlib

iris = load_iris() #导入鸢尾花数据
data = iris.data #numpy.ndarray
target = iris.target #numpy.ndarray

#print(data.shape) #查看维度（150,4），即150个样本，每个样本4个属性
#print(target.shape) #（150,）每个样本的类别0,1,2值，三个类型，前100个样本属于0和1类，后50个样本属于2类
#print(data) #查看完整数据
#print(target) #查看类型数据

#根据不同类型0,1,2和第0列（x1），第1列（x2）绘图。
plt.figure(1)
index_0 = np.where(target==0) #找出0所在位置
plt.scatter(data[index_0,0],data[index_0,1],marker='x',color = 'b',label = '0',s = 15)
index_1 =np.where(target==1) #找出1所在位置
plt.scatter(data[index_1,0],data[index_1,1],marker='o',color = 'r',label = '1',s = 15)
index_2 =np.where(target==2) #找出2所在位置
plt.scatter(data[index_2,0],data[index_2,1],marker='s',color = 'k',label = '2',s = 15) 
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc = 'upper left')

上述程序的结果如下：

鸢尾花.png

2.二元逻辑回归程序实现

#定义二元逻辑回归需要的各个函数
def h_sigmod(W,X): #输出1×m
    g = np.dot(W,X.T) #1×m 计算h(X)
    return sigmod(g)

def sigmod(X):
    return 1/(1+np.exp(-X))

def train(X,y,learn_rate=0.001):
    #利用已知样本训练模型，得到系数
    W = None #存放系数
    Loss = [] #存放每次迭代的损失函数值

    num_train,num_feature = X.shape
    W = 0.001*np.random.randn(num_feature,1).reshape(1,-1) #初始化系数,并转化为1个行向量
    #print('W={}'.format(W))
    
    for i in range(5000):
        h = h_sigmod(W,X_train) #1×m
    
        #注意loss是一个标量，但是一个ndarray型数据，而非纯粹的int
        loss = -(np.log(h).dot(y) + np.log(1-h).dot(1-y)) 
    #    print(loss.shape)
        loss = loss/num_train
    
        dW = (y.T-h).dot(X_train) #计算(y-h(X))*X，实际上就是J(W)的一阶导数
    #    print('dW={}'.format(dW))
        W += learn_rate*dW #更新系数W=W+a(y-h(X))*X
    
        Loss.append(loss[0])
    #    print ('i={},Loss={}'.format(i,Loss[i]))
    
    return W,Loss

#利用训练得到的W系数对结果进行预测
def predict(W,X_test):
        h = h_sigmod(W,X_test) #利用训练得到的系数W计算h(X)结果就是预测值
        y_pred = np.where(h>=0.5,1,0)
        return y_pred

3.利用鸢尾花数据对模型进行训练得到系数

##利用鸢尾花部分数据训练模型并查看结果##############################################################
#从原始数据中抽取部分数据作为训练数据
# 前100个样本分别属于0和1类型，本例中用逻辑回归做二分类，不考虑花型数据为2类的数据。
X = data[0:100,[0,1]] #抽出第1和2个特征（x1和x2），分别是花萼的长和宽
y = target[0:100].reshape(-1,1) #每个样本对应的类型，0或1。转化成一个列向量。
# print(X[:5])
# print(y[-5:])

##增加一列“1”，表示x0列，这列只是为了与系数w的第一列w0相匹配（该列不需要与实际特征x1或x2相乘）
#该步很重要
one = np.ones((X.shape[0],1))
X_train = np.hstack((one,X))

W,Loss = train(X_train,y) #利用已知样本训练模型，得到系数和损失函数结果，（）
print('预测的系数：[w0,w1,w2]={}'.format(W)) #输出训练得到的系数

plt.figure(2)
#print('W={}'.format(W))
plt.plot(Loss)
plt.xlabel('number of iteration')
plt.ylabel('Loss')
plt.show()

plt.figure(3)
label = np.array(y)
index_0 = np.where(label==0)
plt.scatter(X[index_0,0],X[index_0,1],marker='x',color = 'b',label = '0',s = 15)
index_1 =np.where(label==1)
plt.scatter(X[index_1,0],X[index_1,1],marker='o',color = 'r',label = '1',s = 15)
 
#show the decision boundary
x1 = np.arange(4,7.5,0.5)
x2 = (- W[0][0] - W[0][1]*x1) / W[0][2]
plt.plot(x1,x2,color = 'black')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc = 'upper left')

##利用鸢尾花部分数据训练模型并查看结果

输出结果：

预测的系数：[w0,w1,w2]=[[-2.01401491  5.05730268 -8.19245827]]

损失函数.png

分类边界.png

4.利用前面得到的模型数据预测对其他数据进行分类

#预测测试一
#取X_train的前20个样本，让分类器对其进行预测，如果结果都是0，那么说明分类很准确。
X_test= X_train[:20,[0,1,2]]
result = predict(W,X_test) #根据预测得到的系数W对新的样本进行分类
print("测试一结果:{}".format(result)) #看输出是否为0
    
#预测测试二
#暂时抽取第2类数据的x1和x2特征来作为测试数据。
#从第一幅途中可以看出，仅从x1和x2两个特征看，第2类鸢尾花是属于第一1类的，即输出应该都是1
Test_data = data[100:,[0,1]] #暂时抽取第2类数据的x1和x2特征来作为测试数据
one = np.ones((Test_data.shape[0],1)) 
X_test = np.hstack((one,Test_data))
result = predict(W,X_test) #根据预测得到的系数W对新的样本进行分类
print("测试二结果:{}".format(result)) #看输出是否为1

输出分类结果：

测试一结果:[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] #全正确
测试二结果:[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1]] #全正确

附录：上述过程的封装式实现如下

# -*- coding: utf-8 -*-
"""
Created on Thu Mar  7 19:50:46 2019

@author: sean
"""

from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np


# 多cell输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#%matplotlib

iris = load_iris()
data = iris.data #numpy.ndarray
target = iris.target #numpy.ndarray

#print(data.shape) #查看维度（150,4），即150个样本，每个样本4个属性
#print(target.shape) #（150,）每个样本的类别0,1,2值，三个类型，前100个样本属于0和1类，后50个样本属于2类
#print(data) #查看完整数据
#print(target) #查看类型数据

#根据不同类型0,1,2和第0列（x1），第1列（x2）绘图。
plt.figure(1)
index_0 = np.where(target==0) #找出0所在位置
plt.scatter(data[index_0,0],data[index_0,1],marker='x',color = 'b',label = '0',s = 15)
index_1 =np.where(target==1) #找出1所在位置
plt.scatter(data[index_1,0],data[index_1,1],marker='o',color = 'r',label = '1',s = 15)
index_2 =np.where(target==2) #找出2所在位置
plt.scatter(data[index_2,0],data[index_2,1],marker='s',color = 'k',label = '2',s = 15) 
plt.xlabel('sepal length (x1)')
plt.ylabel('sepal width (x2)')
plt.legend(loc = 'upper left')

#准备数据############################################################################################
# 前100个样本分别属于0和1类型，本例中用逻辑回归做二分类，不考虑花型数据为2类的数据。
X = data[0:100,[0,1]] #抽出第1和2个特征（x1和x2），分别是花萼的长和宽
y = target[0:100].reshape(-1,1) #每个样本对应的类型，0或1。转化成一个列向量。
# print(X[:5])
# print(y[-5:])
##增加一列“1”，表示x0列，这列只是为了与系数w的第一列w0相匹配（该列不需要与实际特征x1或x2相乘）
one = np.ones((X.shape[0],1))
X_train = np.hstack((one,X))


#构造一个二元逻辑回归分类器########################################################################
class logistic(object):
    def __init__(self):
        self.W = None
        self.Loss = []
    def train(self,X,y,learn_rate = 0.001,num_iters = 5000):
        num_train,num_feature = X.shape
        #init the weight
        self.W = 0.001*np.random.randn(num_feature,1).reshape((1,-1))#初始化系数,并转化为1个行向量
        
        for i in range(num_iters):
            loss,dW = self.compute_loss(X,y)
#            print('i={},loss={}'.format(i,loss))
            
            self.W += learn_rate*dW #更新系数W=W+a(y-h(X))*X
            #print(self.W.shape)
            self.Loss.append(loss[0][0])
            
    def compute_loss(self,X,y):
        num_train = X.shape[0]
        h = self.h_sigmod(X) #1×m

        #注意loss是一个标量，但是一个ndarray型数据，而非纯粹的int
        loss = -(np.log(h).dot(y) + np.log(1-h).dot(1-y))
        #print(loss.shape)
        loss = loss/num_train

        dW = (y.T-h).dot(X_train) #计算(y-h(X))*X，实际上就是J(W)的一阶导数
        #    print('dW={}'.format(dW))
    
        return loss,dW

    def h_sigmod(self,X): #输出1×m
        g = np.dot(self.W,X.T) #1×m，计算h(X)
        return self.sigmod(g)
    
    def sigmod(self,X):
        return 1/(1+np.exp(-X))

    def predict(self,X): #利用训练得到的系数W计算h(X)结果就是预测值
        h = self.h_sigmod(X)
        y_pred = np.where(h>=0.5,1,0)
        return y_pred

#产生一个二元逻辑回归分类器对象######################################################################
my_logistic = logistic() #构造一个二元逻辑回归分类器
my_logistic.train(X_train,y) #利用数据X_train 和y对模型进行训练，确定W和Loss值
Loss = my_logistic.Loss
W = my_logistic.W
#print( 'W={}'.format(my_logistic.W) )
#print( 'Loss={}'.format(my_logistic.Loss) )

#查看损失函数
plt.figure(2)
plt.plot(Loss)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()

##绘制根据二元逻辑回归得到的分界线
#将鸢尾花的前100个样本，根据x1和x2特征绘图
plt.figure(3)
label = np.array(y)
index_0 = np.where(label==0)
plt.scatter(X[index_0,0],X[index_0,1],marker='x',color = 'b',label = '0',s = 15)
index_1 =np.where(label==1)
plt.scatter(X[index_1,0],X[index_1,1],marker='o',color = 'r',label = '1',s = 15)
#根据训练得到的系数画分界线
x1 = np.arange(4,7.5,0.5)
x2 = (- W[0][0] - W[0][1]*x1) / W[0][2] 
plt.plot(x1,x2,color = 'black')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc = 'upper left')

#利用训练得到的参数对新的样本进行分类
#预测测试一
#取X_train的前20个样本，让分类器对其进行预测，如果结果都是0，那么说明分类很准确。
X_test= X_train[:20,[0,1,2]]
result = my_logistic.predict(X_test) #对测试数据进行预测
print("测试一结果:{}".format(result)) #看结果有多少是0
    
#预测测试二
#暂时抽取第2类数据的x1和x2特征来作为测试数据。
#从第一幅途中可以看出，仅从x1和x2两个特征看，第2类鸢尾花是属于第一1类的，即输出应该都是1
Test_data = data[100:,[0,1]] 
one = np.ones((Test_data.shape[0],1)) 
X_test = np.hstack((one,Test_data))

result = my_logistic.predict(X_test)
print("测试二结果:{}".format(result)) #看输出是否为1

二元逻辑回归实现鸢尾花数据分类（python）

说明：

1.查看鸢尾花数据集

2.二元逻辑回归程序实现

3.利用鸢尾花数据对模型进行训练得到系数

4.利用前面得到的模型数据预测对其他数据进行分类

附录：上述过程的封装式实现如下

推荐阅读更多精彩内容