【1.7】 相关性分析
# Pearson correlation coefficient, computed step by step.

# Sample data: two uniform random samples of 100 points, each sorted
# ascending and then paired positionally (``.values`` drops the shuffled
# index left behind by ``sort_values``).
data1 = pd.Series(np.random.rand(100) * 100).sort_values()
data2 = pd.Series(np.random.rand(100) * 50).sort_values()
data = pd.DataFrame({'value1': data1.values, 'value2': data2.values})
print(data.head())
print('------')

# Mean and standard deviation of each column.
u1 = data['value1'].mean()
u2 = data['value2'].mean()
std1 = data['value1'].std()
std2 = data['value2'].std()

# Normality check: Kolmogorov-Smirnov test of each column against a normal
# distribution parameterised by that column's own mean and std.
# Rule of thumb used by this tutorial: p-value > 0.05.
print('value1正态性检验:\n', stats.kstest(data['value1'], 'norm', args=(u1, std1)))
print('value2正态性检验:\n', stats.kstest(data['value2'], 'norm', args=(u2, std2)))
print('------')

# Worksheet columns for the Pearson formula: the cross product of the
# deviations from the mean and the two squared deviations.
dev1 = data['value1'] - u1
dev2 = data['value2'] - u2
data['(x-u1)*(y-u2)'] = dev1 * dev2
data['(x-u1)**2'] = dev1 ** 2
data['(y-u2)**2'] = dev2 ** 2
print(data.head())
print('------')

# r = sum((x-u1)*(y-u2)) / sqrt(sum((x-u1)**2) * sum((y-u2)**2))
# Tutorial's reading of the result: |r| > 0.8 -> highly linearly correlated.
cross_sum = data['(x-u1)*(y-u2)'].sum()
sq_sum1 = data['(x-u1)**2'].sum()
sq_sum2 = data['(y-u2)**2'].sum()
r = cross_sum / np.sqrt(sq_sum1 * sq_sum2)
print('Pearson相关系数为:%.4f' % r)
value1 value2
0 0.438122 1.055646
1 1.505379 1.515092
2 1.508023 2.323125
3 1.832305 3.552254
4 3.406128 4.155919
------
value1正态性检验:
KstestResult(statistic=0.095884626585008847, pvalue=0.29839852339800688)
value2正态性检验:
KstestResult(statistic=0.080469682048596169, pvalue=0.51965015851411267)
------
value1 value2 (x-u1)*(y-u2) (x-u1)**2 (y-u2)**2
0 0.438122 1.055646 1292.819837 2814.467243 593.854178
1 1.505379 1.515092 1242.927702 2702.366975 571.672643
2 1.508023 2.323125 1200.861611 2702.092121 533.685953
3 1.832305 3.552254 1129.876614 2668.483878 478.406924
4 3.406128 4.155919 1065.219453 2508.361644 452.363990
------
Pearson相关系数为:0.9968
# Correlation matrix via pandas - here the Spearman rank correlation.

# Sample data: IQ score and weekly hours of television for ten people.
iq_scores = [106, 86, 100, 101, 99, 103, 97, 113, 112, 110]
tv_hours = [7, 0, 27, 50, 28, 29, 20, 12, 6, 17]
data = pd.DataFrame({'智商': iq_scores, '每周看电视小时数': tv_hours})
print(data)
print('------')

# DataFrame.corr(method='pearson', min_periods=1) returns the pairwise
# correlation matrix of the columns. ``method`` defaults to 'pearson';
# 'spearman' is passed explicitly here, so the matrix holds rank correlations.
# The bare expression is kept last so a notebook cell displays the matrix.
data.corr(method='spearman')
智商 每周看电视小时数
0 106 7
1 86 0
2 100 27
3 101 50
4 99 28
5 103 29
6 97 20
7 113 12
8 112 6
9 110 17
------
智商 每周看电视小时数
智商 1.000000 -0.175758
每周看电视小时数 -0.175758 1.000000