# 1、确定最优k值:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: pick the number of clusters k with the elbow method.
#
# Preprocess the selected columns; a log transform is sometimes needed.
# log1p computes log(1 + x), so zero values stay finite.
# NOTE(review): `data` is defined outside this section and is assumed to be a
# DataFrame containing these columns - confirm.
data1 = data[['fans', 'money_num', 'uv', 'danmu_cnt', 'gap_days']].apply(np.log1p)

# Choose k using SSE: fit one KMeans model per candidate k and record its
# inertia_ (within-cluster sum of squared distances).
k_values = range(1, 11)
SSE = []  # one SSE value per candidate k, in the same order as k_values
for k in k_values:
    # A fixed random_state makes the curve reproducible from run to run.
    estimator = KMeans(n_clusters=k, random_state=0)
    estimator.fit(data1)
    SSE.append(estimator.inertia_)

# Plot SSE against k; the "elbow" where the curve flattens suggests a good k.
X = k_values
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X, SSE, 'o-')
plt.show()
# 2、聚类过程:
# Step 2: run the clustering.
# NOTE(review): `data` and `upid` are defined outside this section; `upid` is
# assumed to be aligned row-for-row with `data` - confirm.
data1 = data[['room_dau', 'revenue', 'live_days_in_30d', 'fans']]

# Standardize the features before KMeans so that columns with large numeric
# ranges do not dominate the distance computation.
scaler = StandardScaler()
# A fixed random_state keeps the cluster numbering stable between runs, which
# matters because later analysis refers to clusters by their label number.
kmeans = KMeans(n_clusters=3, random_state=0)
pipeline = make_pipeline(scaler, kmeans)

# Fit the pipeline and get the cluster label of every row in one call.
labels = pipeline.fit_predict(data1)

df = pd.DataFrame({'labels': labels, 'upid': upid})
# To attach the labels to the full table instead:
#   pd.concat([data, pd.Series(labels, index=data.index)], axis=1)

# Number of rows assigned to each cluster.
df['labels'].value_counts()
# 3、查看特征分布:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="white", palette="muted", color_codes=True)
#开播天数分布
sns.kdeplot(data2['live_days_in_30d'][data2['labels']==0],color='r',label='0')
sns.kdeplot(data2['live_days_in_30d'][data2['labels']==1],color='b',label='1')
sns.kdeplot(data2['live_days_in_30d'][data2['labels']==2],color='g',label='2')
sns.kdeplot(data2['live_days_in_30d'][data2['labels']==3],color='y',label='3')
plt.legend()
#各特征平均数
data_result1[['fans','money_num','uv','danmu_cnt','gap_days','live_days_in_30d']].groupby(data_result1['labels']).mean()