因为工作的原因,我编写了许多用于差异检验的统计分析工具,使分析流程更加高效。鉴于差异检验的原理较为基础,本文不对其进行介绍,只展示编写好的相应分析代码。运行这些代码需要预先安装 numpy、pandas、scipy 和 statsmodels。
1. 多变量均值与理论均值差异的单样本t检验
def multiVar_onesamp_ttest(df, varlist):
    """Run a one-sample t-test for each listed column against its midpoint.

    For every column in ``varlist`` the observed mean is compared with a
    "theoretical mean" defined as the midpoint of the observed range,
    ``(max + min) / 2``.  The mean, sample standard deviation, theoretical
    mean, t statistic, p-value and Cohen's d are printed; nothing is
    returned.

    Args:
        df: pandas DataFrame holding the data.
        varlist: list of column labels of ``df`` to test.
    """
    # Imported locally so the function does not rely on a module-level
    # ``stats`` name being present.
    from scipy import stats

    try:
        # The original looped over an undefined name (``orderlist``), so the
        # bare ``except`` always fired; iterate the actual parameter instead.
        for col in varlist:
            scores = df.loc[:, col]
            d_mean = scores.mean()
            d_theory_mean = (scores.max() + scores.min()) / 2
            sd = scores.std()  # pandas default: sample SD (ddof=1)
            t, p = stats.ttest_1samp(scores, d_theory_mean)
            cohen_d = abs(d_mean - d_theory_mean) / sd
            print('题目名称:{}'.format(scores.name))
            print('均值为{0:.3f},标准差为:{4:.3f},理论均值为{1:.3f},均值与理论中值的差异检验,t={2:.3f},p={3:.3f}'
                  .format(d_mean, d_theory_mean, t, p, sd))
            print('效应量为{:.3f}'.format(cohen_d))
    except (KeyError, TypeError, ValueError):
        # Unknown column label, non-iterable ``varlist`` or non-numeric data.
        print('参数输入错误,请重新输入。')
2. 方差分析
def anova(df, classes, varia):
    """One-way ANOVA with Tukey HSD post-hoc comparisons.

    Prints, in order: per-group descriptives (n, mean, sample standard
    deviation), the ANOVA table and the Tukey HSD pairwise comparison table
    (alpha = 0.05).  Nothing is returned.

    Args:
        df: pandas DataFrame holding the data.
        classes: label of the categorical (grouping) column in ``df``.
        varia: label of the continuous (dependent) column in ``df``.

    Raises:
        ImportError: if statsmodels is not installed.  This is deliberately
            not reported as a parameter error.
    """
    try:
        # Work on the two relevant columns only, under fixed names, so the
        # model formula is valid whatever the original labels are and cannot
        # collide with other columns.  Rows with a missing group or value
        # are dropped so they do not form a bogus empty group.
        data = df[[classes, varia]].dropna()
        data.columns = ['fenlei', 'lianxu']

        for label in data['fenlei'].unique():
            group = data.loc[data['fenlei'] == label, 'lianxu']
            # The original printed a population SD under the label "variance";
            # report the sample SD (ddof=1) and label it as such.
            print('组别为“{0}”,n={3},该组均值为{1:.3f},该组标准差为{2:.3f}'.format(
                label, group.mean(), group.std(), len(group)))

        # Imported only once the inputs are known to be usable.
        from statsmodels.formula.api import ols
        from statsmodels.stats.anova import anova_lm
        from statsmodels.stats.multicomp import pairwise_tukeyhsd

        data_anova = anova_lm(ols('lianxu ~ C(fenlei)', data).fit())
        print('方差分析结果如下:')
        print(data_anova)
        tukey = pairwise_tukeyhsd(data['lianxu'], data['fenlei'], alpha=0.05)
        print('事后比较结果如下:')
        print(tukey)
    except (KeyError, TypeError, ValueError):
        # Unknown column label or data that cannot be analysed numerically.
        print('参数输入错误,请重新输入。')
3. 独立样本t检验
def ind_ttest(df, cate, iv):
    """Independent-samples t-test between the two levels of a grouping column.

    Prints Levene's test for equality of variances, per-group n / mean /
    sample standard deviation, the Student t-test result (equal variances
    assumed, scipy's default), the absolute mean difference and Cohen's d.
    If the grouping column does not have exactly two non-missing levels a
    message is printed instead.  Nothing is returned.

    Args:
        df: pandas DataFrame holding the data.
        cate: label of the grouping column in ``df``.
        iv: label of the dependent (continuous) column in ``df``.
    """
    from scipy import stats

    # Missing group labels are ignored; otherwise NaN would count as a third
    # category and block an otherwise valid two-group comparison.
    categories = df[cate].dropna().unique()
    if len(categories) != 2:
        print('分类变量不是二分类,无法执行独立样本t检验。')
        return

    x1, x2 = categories
    class_1 = df.loc[df[cate] == x1, iv].dropna()
    class_2 = df.loc[df[cate] == x2, iv].dropna()

    stat, p_lev = stats.levene(class_1, class_2)
    t, p = stats.ttest_ind(class_1, class_2)

    # Keep full precision for the computations; rounding happens only when
    # formatting.  SDs are sample SDs (ddof=1), matching the t-test.
    mean_1, mean_2 = class_1.mean(), class_2.mean()
    std_1, std_2 = class_1.std(), class_2.std()
    DV = abs(mean_1 - mean_2)
    # Effect size: mean difference over the root mean square of the two SDs.
    co_std = ((std_1 ** 2 + std_2 ** 2) / 2) ** 0.5
    cohen_d = DV / co_std

    print('方差齐性检验的统计量为{0:.3f},p = {1:.2f}'.format(stat, p_lev))
    print('类别为“{0}”的个案数n = {1}, 平均数M = {2:.3f}, 标准差std = {3:.3f}.'.format(x1, len(class_1), mean_1, std_1))
    print('类别为“{0}”的个案数n = {1}, 平均数M = {2:.3f}, 标准差std = {3:.3f}.'.format(x2, len(class_2), mean_2, std_2))
    print('独立样本t检验的结果为 t = {0:.3f}, p = {1:.3f}, 两组平均值的差值 = {2:.3f}'.format(t, p, DV))
    print('效应量 d = {0:.3f}'.format(cohen_d))
4. 卡方检验
def chi(df, v1, v2):
    """Pearson chi-square test of independence between two categorical columns.

    Builds the contingency table of ``v1`` by ``v2`` and prints the
    chi-square statistic (no Yates continuity correction), the p-value and
    the degrees of freedom.  Nothing is returned.

    Args:
        df: pandas DataFrame holding the data.
        v1: label of the first categorical column in ``df``.
        v2: label of the second categorical column in ``df``.
    """
    import pandas as pd
    from scipy.stats import chi2_contingency

    try:
        # crosstab yields an integer table and skips rows with a missing
        # value; the hand-built table turned NaN/None into an all-zero
        # row, which makes chi2_contingency reject the input.
        table = pd.crosstab(df[v1], df[v2])
        chi2, p, dof, _expected = chi2_contingency(table, correction=False)
    except (KeyError, TypeError, ValueError):
        # Unknown column label, or a table the test cannot be computed on
        # (e.g. no data).
        print('参数输入有误,请重新写入。')
        return

    print('卡方计算结果为 = {0:.3f}, p = {1:.3f}'.format(chi2, p))
    print('自由度为 {}'.format(dof))
5. 前后测单项李克特五点计分题目比较
def _print_score_distribution(scores, label):
    """Print count and percentage of each distinct value in ``scores``."""
    total = len(scores)
    for value, count in scores.value_counts().items():
        ratio = round(count / total * 100, 3)
        print('{0}:{1}; 总数:{2}; 占比:{3:.3f}%'.format(label, value, count, ratio))


def m_ttest_percent_compare(df, before, after):
    """Compare a pre-test and a post-test item (e.g. 5-point Likert scores).

    Prints both column names, both means, the absolute mean difference, the
    relative change (absolute difference divided by the pre-test mean, in
    percent), the paired-samples t-test result, and the frequency
    distribution of each column.  Nothing is returned.

    Args:
        df: pandas DataFrame holding the data.
        before: label of the pre-test column in ``df``.
        after: label of the post-test column in ``df``.
    """
    # Imported locally so the function does not rely on a module-level
    # ``stats`` name being present.
    from scipy import stats

    try:
        before_score = df.loc[:, before]
        after_score = df.loc[:, after]
        print('输入的前测变量名为:{}'.format(before_score.name))
        print('输入的后测变量名为:{}'.format(after_score.name))

        # Compute from unrounded means; rounding first distorts the
        # difference and the percentage in the third decimal.
        mean_b = before_score.mean()
        mean_a = after_score.mean()
        diff = abs(mean_a - mean_b)
        # The relative change is undefined for a zero baseline.
        percent = round(diff / mean_b * 100, 3) if mean_b != 0 else float('nan')
        t, p = stats.ttest_rel(before_score, after_score)

        print('前测平均分为:{}'.format(round(mean_b, 3)))
        print('后测平均分为:{}'.format(round(mean_a, 3)))
        print('前后测分数差:{:.3f}'.format(diff))
        print('前后测分数变化比率(变化绝对值除以前测分数):{}%'.format(percent))
        print('前后测相关样本t检验的结果,t={0:.3f},p={1:.3f}'.format(t, p))
        print('前测得分统计:')
        _print_score_distribution(before_score, '分类名称')
        print('后测得分统计:')
        _print_score_distribution(after_score, '得分')
    except (KeyError, TypeError, ValueError):
        # Unknown column label or non-numeric data.
        print('输入的参数不正确,请重新输入。')
目前本人常用到的工具就是这些,后续将继续补充常见参数检验与非参数检验方法的 Python 实现。