1.应用场景、二分类问题
1.预测一个用户是否点击特定的商品
2.判断用户的性别
3.预测用户是否会购买给定的品类
4.判断一条评论是正面的还是负面的
2.sigmoid 函数
Logistic回归实质上还是线性回归模型,只是在回归的连续值结果上加上了一层函数映射:先将特征线性求和得到 $z=\theta^T x$,然后使用sigmoid函数 $g(z)=\frac{1}{1+e^{-z}}$ 作映射,将结果映射到区间(0,1)内,再按阈值(通常取0.5)判定为离散值0/1
(1)首先是映射,将线性模型经过g(z)进行映射,得到预测函数:$h_\theta(x)=g(\theta^T x)=\frac{1}{1+e^{-\theta^T x}}$
(2)构造损失函数
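由最大似然估计推导出单个样本被正确预测的概率:$P(y\mid x;\theta)=h_\theta(x)^{y}\,(1-h_\theta(x))^{1-y}$,其中 $h_\theta(x)=g(\theta^T x)$,$y\in\{0,1\}$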
取似然函数
对于训练数据集,数据特征X={x1,x2,…,xm},对应的类别标签Y={y1,y2,…,ym};极大似然函数即整个样本集的联合概率:$L(\theta)=\prod_{i=1}^{m}P(y^{(i)}\mid x^{(i)};\theta)$
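对数似然函数,将连乘转换为连加求和:$\ell(\theta)=\log L(\theta)=\sum_{i=1}^{m}\left[y^{(i)}\log h_\theta(x^{(i)})+(1-y^{(i)})\log(1-h_\theta(x^{(i)}))\right]$,其中 $h_\theta(x)=g(\theta^T x)$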
最大似然估计就是寻找使得对数似然函数取最大值的参数,这里可以使用梯度上升法求解,求得的 $\theta$ 就是要求的最佳参数。
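(3)使用梯度下降法求解:取损失函数 $J(\theta)=-\frac{1}{m}\ell(\theta)$(最小化 $J$ 等价于最大化对数似然 $\ell$),参数更新为 $\theta_j:=\theta_j-\alpha\frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)x_j^{(i)}$,其中 $h_\theta(x)=g(\theta^T x)$,$\alpha$ 为学习率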
然后是向量化,向量化是使用矩阵计算来代替for循环,以简化计算过程,提高效率:$\theta:=\theta+\alpha X^{T}\left(y-g(X\theta)\right)$(常数因子 $\frac{1}{m}$ 并入学习率 $\alpha$)。
# -*- coding: utf-8 -*-
from numpy import *
from matplotlib import pyplot as plt
def plot_best_fit(wei, data_set, label):
    """Scatter both classes and draw the fitted decision boundary.

    Args:
        wei: Three fitted weights (bias, x1 coefficient, x2 coefficient).
            Accepts a 1-D array as well as the (3, 1) matrix that
            ``train`` returns.
        data_set: Rows of ``[1.0, x1, x2]``.
        label: Class label per row; entries equal to 1 are drawn as red
            squares, everything else as green dots.
    """
    # Flatten first: with an (n, 1) matrix, weights[k] * x would broadcast
    # to a (1, len(x)) matrix that ax.plot cannot pair with the 1-D x.
    weights = asarray(wei, dtype=float).ravel()
    data_set = array(data_set)

    pos_x, pos_y = [], []
    neg_x, neg_y = [], []
    for row, target in zip(data_set, label):
        if int(target) == 1:
            pos_x.append(row[1])
            pos_y.append(row[2])
        else:
            neg_x.append(row[1])
            neg_y.append(row[2])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(pos_x, pos_y, s=30, c='red', marker='s')
    ax.scatter(neg_x, neg_y, s=30, c='green')

    # The boundary is where w0 + w1*x1 + w2*x2 == 0, solved for x2.
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def load_data():
    """Read the training samples from ``./text.txt``.

    Each non-blank line must hold three whitespace-separated fields:
    two feature values followed by an integer class label.

    Returns:
        A ``(data_set, label)`` tuple. ``data_set`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the bias term) and
        ``label`` is the list of integer labels in the same order.
    """
    data_set = []
    label = []
    # The context manager closes the file even if a line fails to parse.
    with open('./text.txt') as fr:
        for line in fr:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            data_set.append([1.0, float(fields[0]), float(fields[1])])
            label.append(int(fields[2]))
    return data_set, label
def sigmoid(x):
    """Return the logistic function 1 / (1 + e**-x), element-wise.

    Computed through the identity ``sigmoid(x) == 0.5 * (1 + tanh(x / 2))``.
    tanh saturates at +/-1 instead of overflowing, so large-magnitude
    inputs give exactly 0.0 or 1.0 without a floating-point overflow
    warning from ``exp(-x)``.

    Args:
        x: A scalar, NumPy array or NumPy matrix.

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    return 0.5 * (1.0 + tanh(0.5 * x))
def train(data_set, label, alpha=0.001, max_cycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Every cycle uses all samples at once:
    ``weights += alpha * X.T * (y - sigmoid(X * weights))``.

    Args:
        data_set: 2-D sequence/array of samples, one row per sample.
        label: Flat sequence of class labels, one per row.
        alpha: Step size of each update.
        max_cycles: Number of full-batch updates to perform.

    Returns:
        The fitted weights as a column of shape ``(n_features, 1)``.
    """
    # asmatrix is what the ``mat`` alias pointed to; the alias itself was
    # dropped from the NumPy 2.0 namespace.
    data_matrix = asmatrix(data_set)
    label = asmatrix(label).transpose()  # (1, m) row -> (m, 1) column
    n = shape(data_matrix)[1]
    weights = ones((n, 1))
    for _ in range(max_cycles):
        h = sigmoid(data_matrix * weights)  # matrix product, shape (m, 1)
        error = label - h
        weights = weights + alpha * data_matrix.transpose() * error
    return weights
# Online learning: update the weights one sample at a time.
def stoc_grad_ascent(data_set, label):
    """Fit logistic-regression weights with one stochastic pass.

    Walks the samples once, in order, and nudges the weights after each
    sample with a fixed step size of 0.01.

    Args:
        data_set: 2-D array of samples, one row per sample.
        label: Sequence of class labels, one per row.

    Returns:
        1-D array with one weight per feature column.
    """
    num_samples, num_features = shape(data_set)
    step = 0.01
    weights = ones(num_features)
    for idx in range(num_samples):
        sample = data_set[idx]
        prediction = sigmoid(sum(sample * weights))
        residual = label[idx] - prediction
        weights = weights + step * residual * sample
    return weights
# Improved online learning: shuffled passes with a decaying step size.
def prove_grad_ascent(data_set, label, num_iter=450):
    """Fit logistic-regression weights with shuffled stochastic ascent.

    Makes ``num_iter`` passes over the data. Within a pass every sample
    is used exactly once, in a freshly drawn random order, and the step
    size shrinks as the pass and step counters grow.

    Args:
        data_set: 2-D array of samples, one row per sample.
        label: Sequence of class labels, one per row.
        num_iter: Number of passes over the whole data set.

    Returns:
        1-D array with one weight per feature column.
    """
    m, n = shape(data_set)
    weights = ones(n)
    for j in range(num_iter):
        # A new permutation per pass samples without replacement and
        # avoids the periodic swings a fixed visiting order produces.
        for i, sample_index in enumerate(random.permutation(m)):
            # Decaying step damps oscillation; the +0.01 floor keeps
            # late samples from being ignored entirely.
            alpha = 4 / (1.0 + j + i) + 0.01
            sample = data_set[sample_index]
            h = sigmoid(sum(sample * weights))
            error = label[sample_index] - h
            weights = weights + alpha * error * sample
    return weights
if __name__ == "__main__":
    # Load the samples, fit the weights, then plot the decision boundary.
    data_set, label = load_data()
    samples = array(data_set)
    # Alternative trainers that can be swapped in for the call below:
    #   weights = train(samples, label)
    #   weights = stoc_grad_ascent(samples, label)
    weights = prove_grad_ascent(samples, label)
    plot_best_fit(weights, data_set, label)