import matplotlib.pyplot as plt
import numpy as np
x = list(range(6))
# Our formula says every y value equals the x value in the
# same position -- i.e. y = x -- so the two lists are identical.
# We spell y out anyway to make the relationship clear.
y = list(range(6))
# Plotting these points gives a straight line passing through
# (0,0), (1,1), (2,2), and so on.
plt.plot(x, y)
plt.show()
# A slightly more ambitious line: what about y = x + 1?
# Switching x to an ndarray lets us add 1 to every element at once.
x = np.asarray(range(6))
y = x + 1
# y matches x, but with 1 added to each element.
print(y)
# This line passes through (0,1), (1,2), and so on -- the same
# line as before, shifted up 1 on the y-axis.
plt.plot(x, y)
plt.show()
# Adding a constant to the line moves what's called the
# y-intercept -- the point where the line crosses the y-axis.
# Changing the intercept shifts the whole line up (or down,
# when we subtract).
for offset in (-1, 10):
    y = x + offset
    plt.plot(x, y)
    plt.show()
Key vocabulary: slope (斜率) and intercept (截距).
import matplotlib.pyplot as plt
import numpy as np
x = np.asarray([0, 1, 2, 3, 4, 5])
# A slope of 2 makes the line "steeper" than y = x: the larger the
# slope, the steeper the line becomes.  On the flipside, fractional
# slopes create a "shallower" line, and a negative slope makes the
# y values decrease as the x values increase.
for m in (2, 4, 0.5, -2):
    y = m * x
    plt.plot(x, y)
    plt.show()
$$y = mx + b$$
This equation says "the predicted value of the second variable (y) equals the value of the first variable (x) times the slope (m), plus the intercept (b)".
We'll calculate the slope first -- the formula is $m = \frac{\mathrm{cov}(x, y)}{\sigma_x^2}$, which is just the covariance of x and y divided by the variance of x.
We can use the `cov` function to calculate covariance, and the `.var()` method on a pandas Series to calculate variance.
from numpy import cov

# Slope of the least-squares line: m = cov(x, y) / var(x).
# numpy's cov() returns the full 2x2 covariance matrix, so take the
# off-diagonal [0, 1] entry -- the covariance of density with quality.
# Without that index the whole 2x2 matrix is divided by the variance
# and slope_density becomes a 2x2 array instead of a scalar slope
# (the calc_slope helper defined below indexes [0, 1] for this reason).
slope_density = cov(wine_quality['density'],
                    wine_quality['quality']
                    )[0, 1] / wine_quality['density'].var()
Once we have the slope m, we can compute the intercept as $b = \bar{y} - m\bar{x}$: take the mean of the y values, then subtract the slope times the mean of the x values.
Remember that we can calculate a mean with the `.mean()` method.
from numpy import cov
def calc_slope(x, y):
    # Least-squares slope: covariance of x and y over the variance of x.
    # cov() returns the covariance matrix, so [0, 1] is cov(x, y) itself.
    covariance_xy = cov(x, y)[0, 1]
    return covariance_xy / x.var()
# Intercept: b = mean(y) - slope * mean(x).
intercept_density = (
    wine_quality["quality"].mean()
    - calc_slope(wine_quality["density"], wine_quality["quality"])
    * wine_quality["density"].mean()
)
from numpy import cov
def calc_slope(x, y):
    """Least-squares slope: cov(x, y) divided by the variance of x."""
    numerator = cov(x, y)[0, 1]
    denominator = x.var()
    return numerator / denominator
def calc_intercept(x, y, slope):
    """Least-squares intercept: mean(y) minus slope times mean(x)."""
    return y.mean() - x.mean() * slope
def compute_predicted_y(x):
    # Predict y = m*x + b using the module-level slope and intercept
    # fitted below.
    prediction = slope * x
    return prediction + intercept
# Fit the line, then predict quality from density for every wine.
slope = calc_slope(wine_quality["density"], wine_quality["quality"])
intercept = calc_intercept(
    wine_quality["density"], wine_quality["quality"], slope
)
predicted_quality = wine_quality["density"].apply(compute_predicted_y)
print(predicted_quality)
Rather than computing the slope and intercept by hand, we can use the `linregress` function from `scipy.stats`, which fits the line in a single call.
from scipy.stats import linregress

# linregress fits the line in one call; the values it returns match
# the ones we calculated by hand (up to slight rounding differences).
fit = linregress(wine_quality["density"], wine_quality["quality"])
slope, intercept, r_value, p_value, stderr_slope = fit
print(slope)
print(intercept)
import numpy

# Squared residuals: (actual - predicted) ** 2 for every wine.
predictions = [slope * density + intercept
               for density in wine_quality["density"]]
predicted_y = numpy.asarray(predictions)
residuals = (wine_quality["quality"] - predicted_y) ** 2
rss = sum(residuals)
From the sum of squared residuals, we can find the standard error. The standard error is similar to the standard deviation, but it tries to make an estimate for the whole population of y-values -- even the ones we haven't seen yet that we may want to predict in the future.
The standard error lets us quickly determine how good or bad a linear model is at prediction.
Take the sum of squared residuals (RSS), divide by the number of y-points minus two, and then take the square root:
$$\text{stderr} = \sqrt{\frac{RSS}{n - 2}}$$
from scipy.stats import linregress
import numpy as np

# Fit the line, predict quality for every wine, and compute the
# residual sum of squares.
fit = linregress(wine_quality["density"], wine_quality["quality"])
slope, intercept, r_value, p_value, stderr_slope = fit

predicted_y = np.array([slope * x + intercept
                        for x in wine_quality["density"]])
residuals = (wine_quality["quality"] - predicted_y) ** 2
rss = sum(residuals)

# Standard error: sqrt(RSS / (n - 2)).
stderr = (rss / (len(wine_quality["density"]) - 2)) ** 0.5
# "Within" means "up to and including", so distances exactly equal
# to 1, 2, or 3 standard errors still count.
def within_percentage(y, predicted_y, stderr, error_count):
    """Fraction of actual y values whose absolute prediction error
    is at most error_count standard errors."""
    threshold = stderr * error_count
    deviations = abs(y - predicted_y)
    close = [dev for dev in deviations if dev <= threshold]
    return len(close) / len(y)
# Share of wines whose actual quality lies within one, two, and
# three standard errors of the predicted quality.
within_one = within_percentage(wine_quality["quality"], predicted_y, stderr, 1)
within_two = within_percentage(wine_quality["quality"], predicted_y, stderr, 2)
within_three = within_percentage(wine_quality["quality"], predicted_y, stderr, 3)