import scipy.stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
# Plot the standard normal probability density on a grid over [-4, 4)
# with step 0.01.
standard_norm = scipy.stats.norm
x = np.arange(-4, 4, 0.01)
plt.plot(x, standard_norm.pdf(x))
plt.show()
# Overlay the standard normal density and the t distribution density (df=1)
# on the same axes for comparison.
t_dist = scipy.stats.t
# Define the grid before its first use so this cell does not depend on an
# `x` left over from an earlier cell.
x = np.arange(-4, 4, 0.01)
plt.plot(x, standard_norm.pdf(x), label='standard normal')
plt.plot(x, t_dist.pdf(x, df=1), label='t distribution')
plt.legend()
plt.show()
# Load the house-size sample; the CSV is read without a header row, so the
# first row is treated as data.
house = pd.read_csv('house_size.csv', header=None)
# Keep only the first column as the sample of house sizes.
house_size = house.iloc[:,0]
print(list(house_size))
[314, 119, 217, 326, 342, 318, 130, 465, 383, 396, 507, 283, 250, 326, 279, 363, 229, 303, 367, 246, 247, 262, 209, 294, 112, 249, 354, 355, 272, 277, 377, 411, 223, 232, 445, 333, 336, 349, 611, 516, 233, 275, 395, 241, 127, 228, 305, 321, 235, 226, 288, 503, 305, 280, 318, 281, 227, 279, 171, 290, 336, 284, 380, 314, 316, 476, 309, 293, 160, 300, 319, 396, 275, 212, 344, 305, 280, 331, 359, 283, 136, 322, 359, 202, 188, 187, 457, 340, 262, 288, 318, 381, 289, 205, 373, 200, 320, 213, 261, 357]
# Manual 95% z-interval for the mean house size. Recorded cell outputs are
# kept as "# Out:" comments so they are not executed as statements.

# Population standard deviation used in the margin-of-error formula below.
pop_std = 86
sample_mean = house_size.mean()
sample_mean
# Out: 300.85
sample_size = len(house_size)
sample_size
# Out: 100
# 95% confidence level: isf(0.025) is the critical value that leaves 2.5%
# in the upper tail.
z_score = scipy.stats.norm.isf(0.025)
z_score
# Out: 1.9599639845400545
# Margin of error = z * sigma / sqrt(n).
margin_error = z_score * pop_std / np.sqrt(sample_size)
margin_error
# Out: 16.855690267044469
lower_limit = sample_mean - margin_error
upper_limit = sample_mean + margin_error
print('95%% Confidence Interval: ( %.1f, %.1f)' % (lower_limit, upper_limit))
# Out: 95% Confidence Interval: ( 284.0, 317.7)
def ci_z(data, pop_std, confidence):
    """Return a z-based confidence interval for the mean of *data*.

    The interval is ``mean(data) +/- z * pop_std / sqrt(len(data))``, where
    ``z`` is the standard normal critical value that leaves
    ``(1 - confidence) / 2`` in the upper tail.

    Parameters
    ----------
    data : sequence of numbers
        The sample; must contain at least one observation.
    pop_std : float
        Population standard deviation used in the margin of error.
    confidence : float
        Confidence level in the closed interval [0, 1]. A value of 1 yields
        the unbounded interval ``(-inf, inf)``.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)``.

    Raises
    ------
    ValueError
        If *confidence* is outside [0, 1] (including NaN) or *data* is empty.
    """
    # Outside [0, 1] the tail probability is invalid and the critical value
    # would be NaN; fail loudly instead of returning NaN limits.
    if not 0 <= confidence <= 1:
        raise ValueError("confidence must be between 0 and 1, got %r" % (confidence,))
    sample_size = len(data)
    if sample_size == 0:
        raise ValueError("data must contain at least one observation")

    sample_mean = np.mean(data)
    alpha = (1 - confidence) / 2  # probability mass in each tail
    z_score = scipy.stats.norm.isf(alpha)
    margin = z_score * pop_std / np.sqrt(sample_size)
    return (sample_mean - margin, sample_mean + margin)
# z-intervals at increasing confidence levels. The recorded outputs (kept as
# comments) show the interval widening with the confidence level and becoming
# unbounded at a confidence of 1.
ci_z(house_size, pop_std, 0.90)
# Out: (286.70425880821733, 314.99574119178271)
ci_z(house_size, pop_std, 0.95)
# Out: (283.99430973295557, 317.70569026704447)
ci_z(house_size, pop_std, 0.99)
# Out: (278.69786798947951, 323.00213201052054)
ci_z(house_size, pop_std, 1)
# Out: (-inf, inf)
# Draw 10 values at random from house_size; sampling is with replacement,
# so the same value may be drawn more than once.
np.random.choice(house_size, size=10)
# Out: array([112, 294, 228, 283, 318, 355, 233, 277, 300, 395], dtype=int64)
def bootstrap_mean(data):
    """Return the mean of one bootstrap resample of *data*.

    The resample is drawn with replacement and has the same number of
    observations as *data*.
    """
    n_obs = len(data)
    resample = np.random.choice(data, size=n_obs, replace=True)
    return np.mean(resample)
def draw_bootstrap(data, times=1):
    """Return an array holding *times* bootstrap sample means of *data*.

    Each entry is the result of one independent call to ``bootstrap_mean``.
    """
    # Allocate the result array first, then fill it with one bootstrap
    # mean per requested repetition.
    means = np.empty(times)
    means[:] = [bootstrap_mean(data) for _ in range(times)]
    return means
# Draw 10,000 bootstrap sample means and plot their distribution.
bs_mean = draw_bootstrap(house_size, 10000)
# `density=True` replaces the `normed` keyword, which Matplotlib removed
# in version 3.1.
plt.hist(bs_mean, bins=50, density=True, rwidth=0.9)
plt.show()
# The 2.5th and 97.5th percentiles of the bootstrap means bound the
# central 95% of the bootstrap distribution.
np.percentile(bs_mean, [2.5, 97.5])
# Out: array([ 283.62 , 318.49025])
# Note: I really do not understand the assignment and need to ask for help.