"""Logistic regression: a classification algorithm that uses a regression
model to classify. It handles both binary and multi-class problems.
It differs from linear regression mainly in two respects:

1. the sigmoid function
2. the loss function
"""
import copy
import numpy as np
import matplotlib.pyplot as plt
def loadata(filename):
    """Load a tab-separated numeric dataset.

    Each row holds feature columns followed by a label in the last column.

    Returns:
        data: (m, n) float32 array of the raw file contents.
        x:    (m, n) matrix — features plus a trailing column of ones,
              so the last theta component acts as the intercept.
        y:    (m, 1) matrix of labels.
    """
    data = []
    # Context manager guarantees the file is closed (the original leaked the handle).
    with open(filename) as fr:
        for line in fr:
            data.append(line.strip().split('\t'))
    data = np.array(data, dtype=np.float32)
    m, n = data.shape
    # Append the bias column of ones to the feature columns.
    x = np.mat(np.c_[data[:, 0:n-1], np.ones((m, 1))])
    y = np.mat(data[:, n-1]).reshape((m, 1))
    return data, x, y
def sigmoid(x, theta):
    """Logistic function applied element-wise to the linear score x * theta."""
    z = x * theta
    return 1.0 / (1.0 + np.exp(-z))
def J(theta, x, y):
    """Mean cross-entropy (log-loss) of the logistic model.

    Args:
        theta: (n, 1) parameter vector.
        x:     (m, n) design matrix.
        y:     (m, 1) label vector with entries in {0, 1}.

    Returns:
        A 1x1 matrix holding the average loss over the m samples.
    """
    m = x.shape[0]
    h = sigmoid(x, theta)
    # cost = -(1/m) * [ y^T log(h) + (1-y)^T log(1-h) ]
    cost = -(np.log(h).T * y + np.log(1.0 - h).T * (1 - y)) / m
    return cost
def gradient(x, y, maxloop, rate, eplsion):
    """Batch gradient descent for logistic regression.

    Args:
        x:       (m, n) design matrix (bias column included).
        y:       (m, 1) label vector.
        maxloop: maximum number of iterations.
        rate:    learning rate.
        eplsion: stop once the loss drops below this threshold.

    Returns:
        thetas: list of parameter snapshots (taken before each update).
        errors: list of scalar loss values, one per iteration.
        count:  number of iterations actually performed.
    """
    m, n = x.shape
    error = float('inf')
    errors = []
    theta = np.ones((n, 1))
    thetas = []
    count = 0
    while count < maxloop and error > eplsion:
        count += 1
        h = sigmoid(x, theta)
        thetas.append(copy.deepcopy(theta))
        # Full-batch update: theta <- theta - rate * (1/m) * X^T (h - y)
        theta -= rate * (1.0 / m) * x.T * (h - y)
        # float() makes the loss a scalar: the original kept 1x1 matrices, which
        # made the `error > eplsion` comparison and later formatting fragile.
        error = float(J(theta, x, y))
        errors.append(error)
        # (Redundant `if error < eplsion: break` removed — the while condition
        # already re-checks the threshold before the next iteration.)
    return thetas, errors, count
def sgd(x, y, maxloop, rate, eplsion):
    """Stochastic gradient descent for logistic regression.

    Cycles through the samples in file order (not shuffled), one sample
    per update, evaluating the full-dataset loss after every step.

    Args:
        x:       (m, n) design matrix (bias column included).
        y:       (m, 1) label vector.
        maxloop: maximum number of updates.
        rate:    learning rate.
        eplsion: stop once the full-dataset loss drops below this threshold.

    Returns:
        thetas: list of parameter snapshots (taken after each update).
        errors: list of scalar loss values, one per update.
        count:  number of updates actually performed.
    """
    m, n = x.shape
    errors = []
    theta = np.ones((n, 1))
    thetas = []
    count = 0
    for i in range(maxloop):
        count = i
        # Wrap around the dataset with a separate index instead of mutating
        # the loop variable (the original reassigned `i`, which is fragile).
        j = i % m
        h = sigmoid(x[j], theta)
        # NOTE(review): keeps the original 1/m scaling even though plain SGD
        # usually omits it — changing it would alter the training trajectory.
        theta -= rate * (1.0 / m) * x[j].T * (h - y[j])
        thetas.append(copy.deepcopy(theta))
        # Scalar loss (original stored 1x1 matrices).
        error = float(J(theta, x, y))
        errors.append(error)
        if error < eplsion:
            break
    return thetas, errors, count + 1
def paint(data, thetas, errors, count, rate):
    """Visualize the trained classifier: scatter of the two classes with the
    decision boundary, the loss curve, and the trajectory of each theta
    component.  Assumes `data` has exactly two feature columns plus a 0/1
    label in column 2, and that len(errors) == len(thetas) == count.
    """
    m, n = data.shape
    # Split samples by label (column 2) into the two classes.
    data1 = []
    data2 = []
    for i in range(m):
        if data[i, 2] == 1.0:
            data1.append(data[i, 0:2])
        else:
            data2.append(data[i, 0:2])
    # NOTE(review): if either class is empty, the [:, 0] indexing below raises —
    # TODO confirm the datasets used always contain both labels.
    data1 = np.array(data1)
    data2 = np.array(data2)
    plt.figure()
    # NOTE(review): both classes use c='b'; only the marker differs ('o' vs '*').
    plt.scatter(np.array(data1)[:, 0], np.array(data1)[:, 1], marker='o', c='b')
    plt.scatter(np.array(data2)[:, 0], np.array(data2)[:, 1], marker='*', c='b')
    title = 'rate=%2f, itercount=%d, error=%2f \n' % (rate, count, errors[-1])
    # Decision boundary: theta0*x1 + theta1*x2 + theta2 = 0, solved for x2
    # (theta[2] is the intercept because loadata appends the ones column last).
    x = np.arange(-5, 5, 0.01)
    plt.plot(x, -(thetas[-1][0]*x + thetas[-1][2])/thetas[-1][1], c='r')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title(title)
    plt.show()
    plt.figure()
    # NOTE(review): the loss is plotted on the x-axis and the iteration index on
    # the y-axis (arguments appear swapped relative to the axis labels) — kept
    # as-is to preserve the original output; confirm intent before swapping.
    plt.plot(np.array(errors).flatten(), range(count), c='b')
    plt.xlabel('count')
    plt.ylabel('error')
    plt.title('error')
    plt.show()
    # One subplot per theta component, again with value on x and index on y.
    plt.subplot(3,1,1)
    plt.plot(np.array(thetas).reshape(count, 3)[:, 0], range(count))
    plt.subplot(3,1,2)
    plt.plot(np.array(thetas).reshape(count, 3)[:, 1], range(count))
    plt.subplot(3,1,3)
    plt.plot(np.array(thetas).reshape(count, 3)[:, 2], range(count))
    plt.show()
if __name__ == "__main__":
    # Train via SGD on the tab-separated dataset and visualize the result.
    data, x, y = loadata('linear.txt')
    thetas, errors, count = sgd(x, y, 10000, 0.01, 0.01)
    # Python 3 print() call — the original used the Python 2 print statement,
    # which is a SyntaxError on Python 3.
    print(thetas, errors, count)
    paint(data, thetas, errors, count, 0.01)