5.20 Linear regression

import matplotlib.pyplot as plt
import numpy as np

x = [0, 1, 2, 3, 4, 5]
# Going by our formula, every y value at
# a position is the same as the x value
# in the same position.
# We could write y = x, but let's write
# them all out to make this clearer.
y = [0, 1, 2, 3, 4, 5]

# As you can see, this is a straight line
# that passes through the points
# (0,0), (1,1), (2,2), and so on.
plt.plot(x, y)
plt.show()

# Let's try a slightly more ambitious line.
# What if we did y = x + 1?
# We'll make x an array now, so we 
# can add 1 to every element more easily.
x = np.asarray([0, 1, 2, 3, 4, 5])
y = x + 1

# y is the same as x, but every 
# element has 1 added to it.
print(y)

# This plot passes through (0,1), (1,2), and so on.
# It's the same line as before, 
# but shifted up 1 on the y-axis.
plt.plot(x, y)
plt.show()

# By adding 1 to the line, we moved what's
# called the y-intercept -- where
# the line intersects the y-axis.
# Moving the intercept shifts
# the whole line up (or down when we subtract).
y = x - 1
plt.plot(x, y)
plt.show()

y = x + 10
plt.plot(x, y)
plt.show()

Key terms: slope and intercept.

import matplotlib.pyplot as plt
import numpy as np

x = np.asarray([0, 1, 2, 3, 4, 5])
# Let's set the slope of the line to 2.
y = 2 * x

# See how this line is "steeper" than before?
# The larger the slope, the steeper the line becomes.
# On the flip side, fractional slopes
# create a "shallower" line.
# Negative slopes create a line
# where y values decrease as x values increase.
plt.plot(x, y)
plt.show()

y = 4 * x
plt.plot(x, y)
plt.show()

y = .5 * x
plt.plot(x, y)
plt.show()

y = -2 * x
plt.plot(x, y)
plt.show()

y = mx + b

This equation says that the predicted value of the second variable (y) equals the value of the first variable (x) times the slope (m), plus the intercept (b).
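
To see the slope and intercept working together, here's a quick sketch; the values m = 2 and b = 3 are just illustrative, not taken from any dataset.

import matplotlib.pyplot as plt
import numpy as np

x = np.asarray([0, 1, 2, 3, 4, 5])
m = 2  # slope (illustrative value)
b = 3  # intercept (illustrative value)
y = m * x + b

# The line rises 2 units in y for each unit of x,
# and crosses the y-axis at (0, 3).
plt.plot(x, y)
plt.show()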

We'll calculate the slope first -- the formula is cov(x, y) / σ_x², which is just the covariance of x and y divided by the variance of x.

We can use NumPy's cov function to calculate the covariance, and the .var() method on a pandas Series to calculate the variance. Note that cov returns a 2x2 covariance matrix, so we take the off-diagonal element [0, 1] to get the covariance of x with y.

from numpy import cov

# cov returns a 2x2 covariance matrix; element [0, 1] is
# the covariance of density with quality.
slope_density = cov(wine_quality['density'],
                    wine_quality['quality'])[0, 1] / wine_quality['density'].var()

We can compute the intercept by taking the slope we calculated and doing this: mean(y) - m * mean(x). So we just take the mean of the y values, and then subtract the slope times the mean of the x values from that.

Remember that we can calculate the mean by using the .mean() method.

from numpy import cov

def calc_slope(x, y):
    # Covariance of x and y divided by the variance of x.
    return cov(x, y)[0, 1] / x.var()

# intercept = mean(y) - slope * mean(x)
intercept_density = (
    wine_quality['quality'].mean()
    - calc_slope(wine_quality['density'], wine_quality['quality'])
    * wine_quality['density'].mean()
)

from numpy import cov

def calc_slope(x, y):
    return cov(x, y)[0, 1] / x.var()

def calc_intercept(x, y, slope):
    return y.mean() - slope * x.mean()

def compute_predicted_y(x):
    # Uses the slope and intercept computed below.
    return x * slope + intercept

slope = calc_slope(wine_quality['density'], wine_quality['quality'])
intercept = calc_intercept(wine_quality['density'], wine_quality['quality'], slope)

predicted_quality = wine_quality['density'].apply(compute_predicted_y)

print(predicted_quality)

We can do the same thing more easily with the linregress function from scipy.stats.

from scipy.stats import linregress


slope, intercept, r_value, p_value, stderr_slope = linregress(
    wine_quality["density"], wine_quality["quality"])

# As you can see, these are the same values
# we calculated (except for slight rounding differences)
print(slope)
print(intercept)

import numpy

predicted_y = numpy.asarray([slope * x + intercept
                             for x in wine_quality["density"]])
# Squared differences between the actual and predicted quality values.
residuals = (wine_quality["quality"] - predicted_y) ** 2
rss = sum(residuals)

From the sum of squared residuals, we can find the standard error. The standard error is similar to the standard deviation, but it tries to make an estimate for the whole population of y-values -- even the ones we haven't seen yet that we may want to predict in the future.

The standard error lets us quickly determine how good or bad a linear model is at prediction.

You take the sum of squared residuals, divide by the number of y-points minus two (call that number n), and then take the square root:

(rss / (n - 2)) ** 0.5
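
As a quick toy check with made-up numbers (not from the wine data):

rss_toy = 32   # hypothetical sum of squared residuals
n_toy = 10     # hypothetical number of points
print((rss_toy / (n_toy - 2)) ** 0.5)  # 2.0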



from scipy.stats import linregress
import numpy as np

slope, intercept, r_value, p_value, stderr_slope = linregress(
    wine_quality['density'],
    wine_quality['quality']
)

predicted_y = np.array(
    [slope * x + intercept
     for x in wine_quality['density']]
)

residuals = (wine_quality['quality'] - predicted_y) ** 2

rss = sum(residuals)

stderr = (rss / (len(wine_quality['density']) - 2)) ** 0.5
# Assume that "within" means "up to and including",
# so be sure to count values that are exactly 1, 2,
# or 3 standard errors away.

def within_percentage(y, predicted_y, stderr, error_count):
    # Proportion of actual values that fall within
    # error_count standard errors of the predicted values.
    within = stderr * error_count
    differences = abs(y - predicted_y)
    lower_differences = [d for d in differences if d <= within]
    within_count = len(lower_differences)
    return within_count / len(y)


within_one = within_percentage(
    wine_quality['quality'], predicted_y, stderr, 1)
within_two = within_percentage(
    wine_quality['quality'], predicted_y, stderr, 2)
within_three = within_percentage(
    wine_quality['quality'], predicted_y, stderr, 3)
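
To check the results, we can print the three proportions. As a rough rule of thumb (assuming the residuals are approximately normally distributed), we'd expect values near 68%, 95%, and 99.7%; the exact numbers depend on the wine_quality data.

print(within_one)
print(within_two)
print(within_three)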
