之前做的分析，今天整理成笔记，以便后续继续学习查阅。
内容是双因素方差分析和单因素方差分析。
所用数据是迪士尼桶装爆米花的销售量数据。
```
def level_avg(data, x_name, y_name):
    """Return the mean of ``y_name`` for every level of ``x_name``.

    Args:
        data: pandas DataFrame containing the columns ``x_name`` and
            ``y_name`` (other columns are ignored).
        x_name: name of the factor (grouping) column.
        y_name: name of the numeric response column.

    Returns:
        dict mapping each distinct value of ``x_name`` to the mean of
        ``y_name`` over the rows that carry that value.
    """
    # Pick the response column *before* aggregating.  Averaging every column
    # of the frame fails when it also holds non-numeric columns (for example
    # the other factor of a two-way design).
    level_means = data.groupby(x_name)[y_name].mean()
    return dict(level_means)
def SST(Y):
    """Total sum of squares of the observations ``Y``.

    Args:
        Y: numeric observations, e.g. the response column of a DataFrame.

    Returns:
        The sum over all observations of the squared deviation from the
        mean of ``Y``.
    """
    grand_mean = np.mean(Y)
    squared_deviations = np.power(Y - grand_mean, 2)
    return sum(squared_deviations)
def SSA(data, x_name, y_name):
    """Between-level sum of squares of ``y_name`` for the factor ``x_name``.

    For every level of the factor, the squared distance between the level
    mean and the grand mean is weighted by the number of observations at
    that level; the weighted terms are then summed.

    Args:
        data: pandas DataFrame containing the columns ``x_name`` and
            ``y_name`` (other columns are ignored).
        x_name: name of the factor (grouping) column.
        y_name: name of the numeric response column.

    Returns:
        The sum over levels of ``count * (level_mean - grand_mean) ** 2``.
    """
    total_avg = np.mean(data[y_name])
    # Aggregate only the response column.  Aggregating the whole frame
    # fails when it also contains non-numeric columns such as another factor.
    level_stats = data.groupby(x_name)[y_name].agg(['mean', 'count'])
    weighted_sq_dev = level_stats['count'] * np.power(level_stats['mean'] - total_avg, 2)
    return sum(weighted_sq_dev)
def SSE(data, y_name):
    """Residual (error) sum of squares of the additive main-effects model.

    Every column of ``data`` other than ``y_name`` is treated as a factor.
    With ``m`` factors the residual of one observation is::

        y - (sum of the observation's m level means) + (m - 1) * grand_mean

    which for two factors is the usual two-way form
    ``y - row_mean - col_mean + grand_mean`` and for one factor is the
    one-way within-group deviation ``y - level_mean``.

    Args:
        data: pandas DataFrame holding the response column and the factor
            columns only; pass a column subset if the frame has extras.
        y_name: name of the numeric response column.

    Returns:
        The sum of the squared residuals.
    """
    # Keep the DataFrame's column order so the result is reproducible
    # (a set of column names iterates in arbitrary order).
    factors = [col for col in data.columns if col != y_name]
    response = data[y_name]
    total_avg = np.mean(response)

    residual = response
    for factor in factors:
        # transform('mean') broadcasts each level mean back onto its rows,
        # so no helper columns or frame copy are needed.
        residual = residual - data.groupby(factor)[y_name].transform('mean')
    residual = residual + (len(factors) - 1) * total_avg

    return sum(np.power(residual, 2))
def two_way_anova(data, row_name, col_name, y_name, alpha=0.05):
    """Two-way analysis of variance without an interaction term.

    Args:
        data: pandas DataFrame with one row per observation.
        row_name: name of the column holding the first (row) factor.
        col_name: name of the column holding the second (column) factor.
        y_name: name of the numeric response column.
        alpha: significance level used for the critical F values.

    Returns:
        pandas DataFrame with the rows ``row_name``, ``col_name``,
        ``'error'`` and ``'total'`` and the columns ``source_of_variance``,
        ``square_sum``, ``degree_of_freedom``, ``meansquare``, ``F_value``,
        ``P_value``, ``F_crit`` and ``R^2``.  Cells that do not apply to a
        row hold the placeholder ``'_'``.

    Raises:
        ValueError: if a factor has fewer than two levels, or if there are
            too few observations to leave any error degrees of freedom.
    """
    n = len(data)
    k = len(data[row_name].unique())
    r = len(data[col_name].unique())
    if k < 2 or r < 2:
        raise ValueError("each factor needs at least two levels")

    # Degrees of freedom of the additive model, derived from the number of
    # observations.  With exactly one observation per cell (n == k * r) these
    # equal the textbook (k-1)*(r-1) and k*r-1.
    df_row = k - 1
    df_col = r - 1
    df_error = n - k - r + 1
    df_total = n - 1
    if df_error < 1:
        raise ValueError("not enough observations to estimate the error variance")

    sst = SST(data[y_name])
    ssr = SSA(data, row_name, y_name)
    ssc = SSA(data, col_name, y_name)
    # SSE treats every non-response column as a factor, so hand it only the
    # three columns that belong to this analysis.
    sse = SSE(data[[row_name, col_name, y_name]], y_name)

    msr = ssr / df_row
    msc = ssc / df_col
    mse = sse / df_error

    Fr = msr / mse
    Fc = msc / mse

    # Upper-tail probabilities of the observed F statistics.
    pfr = scipy.stats.f.sf(Fr, df_row, df_error)
    pfc = scipy.stats.f.sf(Fc, df_col, df_error)
    # Critical F values at significance level alpha.
    Far = scipy.stats.f.isf(alpha, dfn=df_row, dfd=df_error)
    Fac = scipy.stats.f.isf(alpha, dfn=df_col, dfd=df_error)

    # Share of the total variation attributed to the two factors.
    r_square = (ssr + ssc) / sst

    table = pd.DataFrame({
        'source_of_variance': [row_name, col_name, 'error', 'total'],
        'square_sum': [ssr, ssc, sse, sst],
        'degree_of_freedom': [df_row, df_col, df_error, df_total],
        'meansquare': [msr, msc, mse, '_'],
        'F_value': [Fr, Fc, '_', '_'],
        'P_value': [pfr, pfc, '_', '_'],
        'F_crit': [Far, Fac, '_', '_'],
        'R^2': [r_square, '_', '_', '_'],
    })
    return table
# Input: the DataFrame to analyse.  ``data`` is not defined in this snippet
# and must already be loaded before this runs.
data_test = data
# Output: two-way ANOVA table of Q1 sales by location and bucket.
two_way_anova(
    data=data_test,
    row_name='location',
    col_name='bucket',
    y_name='Q1_sales_USD',
    alpha=0.05,
)
```