import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
path = r'F:\机器学习入门\黄海广-机器学习\Coursera-ML-AndrewNg-Notes-master\code\ex1-linear regression\ex1data1.txt'  # search GitHub for the Coursera-ML-AndrewNg-Notes repo to get this file
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
data.head()
# Inspect how the data is distributed
data.plot(kind='scatter', x='Population', y='Profit', figsize=(10,6))
plt.show()
# Cost function for linear regression (the original deferred the formula; it is):
# J(theta) = (1/(2m)) * sum_i (h_theta(x_i) - y_i)^2, with h_theta(x) = theta_0 + theta_1 * x
# Gradient descent update: theta_j := theta_j - (alpha/m) * sum_i (h_theta(x_i) - y_i) * x_ij
def computeCost(X, y, theta):
    inner = np.power((X * theta.T - y), 2)  # squared residuals, element-wise
    return np.sum(inner) / (2 * len(X))     # J(theta): sum over m examples, halved
# computeCost evaluates J(theta); X is the m x 2 design matrix
# The data is loaded; prepend a column of ones (x0 = 1) so theta_0 updates like
# any other parameter, initialize theta to zeros, set the learning rate to 0.01,
# and iterate 1500 times
data.insert(0, "Ones", 1)  # bias column of ones at position 0
# Initialize X and y
cols = data.shape[1]
X = data.iloc[:, :-1]           # X is every column but the last (Ones, Population)
y = data.iloc[:, cols-1:cols]   # y is the last column of data (Profit)
X.head()
y.head()
X = np.matrix(X.values)
y = np.matrix(y.values)
theta = np.matrix(np.array([0,0]))
X.shape, theta.shape, y.shape
((97, 2), (1, 2), (97, 1))
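# Sanity check before iterating: with theta = [0, 0] the assignment's expected
# cost on ex1data1 is about 32.07, so this call should return roughly that value.
computeCost(X, y, theta)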
a = np.array([[1,2],[2,2]])
print(a.shape)
(2, 2)
temp = np.matrix(np.zeros(theta.shape))
parameters = int(theta.ravel().shape[1])
parameters
2
alpha = 0.01
iters = 1500
def gradientDescent(X, y, theta, alpha, iters):
    temp = np.matrix(np.zeros(theta.shape))   # buffer so both parameters update simultaneously
    parameters = int(theta.ravel().shape[1])  # number of parameters (2 here)
    cost = np.zeros(iters)                    # cost history, one entry per iteration
    for i in range(iters):
        error = (X * theta.T) - y             # residuals h_theta(x) - y under the current theta
        for j in range(parameters):           # update each parameter in turn
            term = np.multiply(error, X[:, j])  # element-wise product: residual times feature j
            temp[0, j] = theta[0, j] - ((alpha / len(X)) * np.sum(term))
        theta = temp.copy()                   # copy so theta and temp do not alias across iterations
        cost[i] = computeCost(X, y, theta)    # record the cost after this update
    return theta, cost
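# The inner loop over parameters can be removed entirely; here is a minimal
# vectorized sketch of the same update rule, using gradient = (1/m) * error^T * X
# (gradientDescentVectorized is a name introduced for illustration):
def gradientDescentVectorized(X, y, theta, alpha, iters):
    m = len(X)
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * theta.T) - y                    # (m, 1) residuals
        theta = theta - (alpha / m) * (error.T * X)  # (1, 2) simultaneous update
        cost[i] = computeCost(X, y, theta)
    return theta, cost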
g, cost = gradientDescent(X, y, theta, alpha, iters)
g
matrix([[-3.63029144, 1.16636235]])
cost
array([6.73719046, 5.93159357, 5.90115471, ..., 4.48343473, 4.48341145,
4.48338826])
cost.shape
(1500,)
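# Plotting the recorded cost against the iteration number is a quick convergence
# check (a small sketch reusing the matplotlib import from above):
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Cost vs. Iteration')
plt.show()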
predict1 = [1, 3.5] * g.T   # population is in units of 10,000, so 3.5 means 35,000 people
print("predict1:", predict1)
predict2 = [1, 7] * g.T
print("predict2:", predict2)
# Predict the food-stand profit for city populations of 35,000 and 70,000
predict1: [[0.45197679]]
predict2: [[4.53424501]]
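# Cross-check against the closed-form normal equation, theta = (X^T X)^{-1} X^T y.
# This is the exact least-squares fit, so gradient descent run long enough should
# converge toward it (theta_ne is a name introduced for illustration):
theta_ne = np.linalg.inv(X.T * X) * X.T * y  # (2, 1) column vector
print(theta_ne.T)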
x = np.linspace(data.Population.min(), data.Population.max(), 100)  # 100 evenly spaced population values spanning the data
f = g[0, 0] + (g[0, 1] * x)  # fitted line: h_theta(x) = theta_0 + theta_1 * x
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()
# The raw data together with the fitted regression line
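# For comparison, the same fit via scikit-learn (assuming scikit-learn is
# installed; LinearRegression solves the least-squares problem directly, so its
# intercept and slope should be comparable to g):
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(data[['Population']], data['Profit'])
print(model.intercept_, model.coef_)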