逻辑回归函数定义
逻辑回归
**g(z) 就是传说中的 sigmoid 函数**
sigmoid求导
**因为是二分类问题,所以我们假设:**
假设
**这里我们可以将其写成如下公式**
公式整合
**似然函数(这里表示为 θ 的似然):**
似然函数
**对数似然**
对数似然
求导
**神奇的事情出现了,这里的公式和梯度下降的公式何其相似啊。我们称之为梯度上升。批梯度上升和随机梯度上升与我的上篇文章 http://www.jianshu.com/p/52f5ea825f7f 提到的批梯度下降和随机梯度下降是一样的逻辑。不过你需要仔细想想这两个公式有什么不同**
梯度上升
%matplotlib inline
from numpy import *
# Load the whitespace-delimited dataset and split it into features and labels.
def loadDataSet(fileName):
    """Read a dataset where each line is "x1 x2 label".

    A constant 1.0 is prepended to every feature row to act as the
    intercept term, so each sample becomes [1.0, x1, x2].

    Args:
        fileName: path to a whitespace-delimited text file.

    Returns:
        (dataMat, labelMat): list of [1.0, x1, x2] feature rows and the
        matching list of float class labels.
    """
    dataMat = []
    labelMat = []
    # "with" guarantees the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if len(lineArr) < 3:
                continue  # skip blank / malformed lines instead of raising IndexError
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(float(lineArr[2]))
    return dataMat, labelMat
# Logistic (sigmoid) function.
def sigmoid(inX):
    """Return 1 / (1 + e^{-inX}); works element-wise on numpy arrays."""
    z = exp(-inX)
    return 1.0 / (1.0 + z)
# Plot the two classes and the fitted decision boundary.
def plotBestFit(weights, fileName='testSet.txt'):
    """Scatter-plot both classes and draw the line w0 + w1*x + w2*y = 0.

    Args:
        weights: 1-D array-like of three fitted coefficients [w0, w1, w2].
        fileName: dataset to plot; defaults to 'testSet.txt' so existing
            callers keep working (the path used to be hard-coded).
    """
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet(fileName)
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    # Columns 1 and 2 are the real features (column 0 is the intercept 1.0).
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # The decision boundary is where the sigmoid argument is 0:
    # w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2.
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
# Batch gradient ascent on the log-likelihood of logistic regression.
def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights by full-batch gradient ascent.

    Args:
        dataMatIn: 2-D sequence of samples, one row per sample.
        classLabels: sequence of 0/1 labels, one per row.
        alpha: learning rate.
        maxCycles: number of ascent iterations.

    Returns:
        numpy matrix of shape (n, 1) with the fitted weights.
    """
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    m, n = shape(dataMatrix)
    weights = ones((n, 1))
    for _ in range(maxCycles):
        # error = y - h drives the ascent step on every feature at once.
        error = labelMat - sigmoid(dataMatrix * weights)
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights
# Demo: fit with batch gradient ascent and visualise the boundary.
trainData, trainLabels = loadDataSet('testSet.txt')
fittedWeights = gradAscent(trainData, trainLabels, 0.001, 500)
plotBestFit(fittedWeights.getA())
批梯度上升
# Stochastic gradient ascent with a decaying step size and
# without-replacement sampling within each pass.
def stocGradAscent(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights by stochastic gradient ascent.

    Args:
        dataMatrix: 2-D numpy array, one sample per row (first column is
            the constant 1.0 intercept term).
        classLabels: sequence of 0/1 labels, one per row.
        numIter: number of full passes over the data.

    Returns:
        1-D numpy array with the fitted weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # Fix: range() is not deletable in Python 3 -- materialise a list.
        dataIndex = list(range(m))
        for i in range(m):
            # Step size decays over updates but never reaches 0.
            alpha = 4 / (1.0 + j + i) + 0.01
            randPos = int(random.uniform(0, len(dataIndex)))
            # Fix: go through dataIndex so every sample is used at most
            # once per pass; the original indexed the full matrix with
            # randPos, silently sampling with replacement.
            sampleIdx = dataIndex[randPos]
            h = sigmoid(sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            del dataIndex[randPos]
    return weights
# Demo: fit with stochastic gradient ascent and visualise the boundary.
sgdData, sgdLabels = loadDataSet('testSet.txt')
sgdWeights = stocGradAscent(array(sgdData), sgdLabels, 500)
plotBestFit(sgdWeights)
随机梯度上升
spark代码示例
/**
 * Spark ML demo: train a binary logistic-regression model with
 * elastic-net regularisation on a libsvm-format dataset, then inspect
 * the training summary (loss history, ROC curve, best F-measure threshold).
 */
public class LogisticWithElasticNet {
    /** Quiet the chattier third-party loggers down to WARN level. */
    static {
        LogSetting.setWarningLogLevel("org");
        LogSetting.setWarningLogLevel("akka");
        LogSetting.setWarningLogLevel("io");
        LogSetting.setWarningLogLevel("httpclient.wire");
    }

    public static void main(String[] args) {
        // Resolve the dataset relative to the classpath root.
        String resources = Thread.currentThread().getContextClassLoader().getResource("").getPath();
        String path = resources + "libsvm_data.txt";

        SparkConf conf = new SparkConf().setAppName("Logistic Regression with Elastic Net Example").setMaster("local[2]");
        SparkContext sc = new SparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame training = sqlContext.read().format("libsvm").load(path);

        // Elastic-net param 0.8 mixes the L1 and L2 penalties.
        LogisticRegression lr = new LogisticRegression()
                .setMaxIter(10)
                .setRegParam(0.3)
                .setElasticNetParam(0.8);

        // Fit the model and report the learned parameters.
        LogisticRegressionModel lrModel = lr.fit(training);
        System.out.println("Coefficients: "
                + lrModel.coefficients() + " Intercept: " + lrModel.intercept());

        // Training summary: one objective (loss) value per iteration.
        LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
        for (double loss : trainingSummary.objectiveHistory()) {
            System.out.println(loss);
        }

        // Binary-classification metrics: ROC curve and area under it.
        BinaryLogisticRegressionSummary binarySummary =
                (BinaryLogisticRegressionSummary) trainingSummary;
        DataFrame roc = binarySummary.roc();
        roc.show();
        roc.select("FPR").show();
        System.out.println(binarySummary.areaUnderROC());

        // Pick the threshold that maximises the F-measure and apply it
        // to the fitted model.
        DataFrame fMeasure = binarySummary.fMeasureByThreshold();
        double maxFMeasure = fMeasure.select(functions.max("F-Measure")).head().getDouble(0);
        double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure))
                .select("threshold").head().getDouble(0);
        lrModel.setThreshold(bestThreshold);
    }
}