1、利用爬虫爬取http://aligulac.com/网站上的职业选手数据
爬虫与数据集在我的github上https://github.com/wuchangsheng951/kaggle
1)网站概况
2)选手数据
3)利用bs4爬取数据,代码和获取到的数据集都放在了我的github里。
https://github.com/wuchangsheng951/kaggle
csv数据如下
2、数据清洗
# Load the scraped match data set.
df = pd.read_csv('/home/kesci/input/temp4946/星际争霸数据集.csv')

# Keep only the matches whose player_2 also occurs in the player_1 column,
# so both players of every remaining match are among the listed players.
df = df[df['player_2'].isin(df['player_1'].unique())]

# Encode the textual flags as integers: winner -> 1, loser -> 0 and
# offline -> 1, online -> 0.  Values missing from a mapping become NaN.
df['player_1_match_status'] = df['player_1_match_status'].map(
    {"['winner']": 1, "['loser']": 0}
)
df['tournament_type'] = df['tournament_type'].map({'offline': 1, 'online': 0})

# Drop the addon and player_2_match_status columns.
df = df.drop(columns=['addon', 'player_2_match_status'])
类型转换以及分割字符串
# Normalise the score text: replace the en dash separator with a space and
# trim surrounding whitespace.
df['score'] = df['score'].str.replace('–', ' ').str.strip()

# Extract both game counts with a regular expression instead of fixed
# character positions, so that scores with two or more digits are parsed
# correctly (fixed slicing would read only the first digit of "10 8").
_games = df['score'].str.extract(r'(\d+)\D+(\d+)', expand=True)
df['player_1_win'] = _games[0].astype(int)
df['player_2_win'] = _games[1].astype(int)

# match_date is left as text here; it is converted to datetime later, right
# before the time-window analysis.

# After the rename, `win` is the 0/1 flag telling whether player_1 won the match.
df = df.rename(columns={'player_1_match_status': 'win'})

# Per-race game counters and the total number of games, filled in row by row
# by k().
df['P_win'] = 0
df['Z_win'] = 0
df['T_win'] = 0
df['total'] = 0
def k(row):
    """Credit the games won in one match to the race of each player.

    Reads ``player_1_race``, ``player_2_race``, ``player_1_win`` and
    ``player_2_win`` from ``row`` and writes ``P_win``, ``T_win`` or
    ``Z_win`` for the races involved, plus ``total`` (games played).
    Races other than P, T and Z are not credited to any column.

    In a mirror match-up (both players on the same race) the race column
    receives the games of both players; previously the second player's
    count silently overwrote the first player's.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) supporting item
            access and assignment.

    Returns:
        The same ``row`` object, updated in place.
    """
    win_column = {'P': 'P_win', 'T': 'T_win', 'Z': 'Z_win'}
    first = win_column.get(row['player_1_race'])
    second = win_column.get(row['player_2_race'])

    if first is not None:
        row[first] = row['player_1_win']
    if second is not None:
        if second == first:
            # Mirror match: both players contribute to the same race column.
            row[second] = row['player_1_win'] + row['player_2_win']
        else:
            row[second] = row['player_2_win']

    row['total'] = row['player_1_win'] + row['player_2_win']
    return row
# Run k over every row (axis=1) to fill the per-race and total columns,
# then discard the raw score text.
df = df.apply(k, axis=1)
df = df.drop(columns='score')
然后就可以计算啦
# Pie chart of the number of professional players per race.
# The labels must be an ordered sequence that lines up with `sizes`
# (P, Z, T); a set has no defined order, so wedges could be mislabelled.
labels = ['P', 'IMBAZ', 'IMBAT']
# Use the SimHei font for sans-serif text.
plt.rcParams['font.sans-serif'] = ['SimHei']
# NOTE(review): `race` is not defined in this part of the file; presumably a
# list (or string) with one race letter per player - confirm.
sizes = [race.count('P'), race.count('Z'), race.count('T')]
# No wedge is pulled out of the pie; colours are matplotlib's defaults.
explode = [0, 0, 0]
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%')
# Equal axis scaling so the pie is drawn as a circle.
plt.axis('equal')
# Discard every match in which either player is listed with race 'R'
# (random).
df = df[(df['player_1_race'] != 'R') & (df['player_2_race'] != 'R')]

# Thirteen edges spaced three months apart, delimiting twelve windows.
# '3M' is anchored on month ends, so the first edge is 2015-01-31.
date = pd.date_range('1/1/2015', periods=13, freq='3M')

# Convert match_date to datetime so it can be compared with the edges.
df['match_date'] = pd.to_datetime(df['match_date'])
计算PvT、PvZ、ZvT各种族对抗的胜率
def _mean_share(rate, first_row, second_row, win_col, total_col=9):
    """Average ``win_col / total_col`` over two positional rows of ``rate``."""
    first = rate.iloc[first_row, win_col] / rate.iloc[first_row, total_col]
    second = rate.iloc[second_row, win_col] / rate.iloc[second_row, total_col]
    return (first + second) / 2


# One value per time window for each cross-race match-up.
PVT = []
PVZ = []
ZVT = []
for window_start, window_end in zip(date[:-1], date[1:]):
    # Half-open window [start, end): a match dated exactly on an edge belongs
    # to the window that starts there instead of being dropped from all of them.
    df_temp = df[(df.match_date >= window_start) & (df.match_date < window_end)]
    # numeric_only=True keeps the sum restricted to numeric columns; without it
    # recent pandas versions refuse to sum the datetime column.
    rate = (
        df_temp.groupby(['player_1_race', 'player_2_race'])
        .sum(numeric_only=True)
        .reset_index()
    )
    # NOTE(review): the lookups below are positional.  They assume every one of
    # the nine race pairings occurs in each window (rows sorted P/T/Z by
    # player_1_race, then player_2_race) and that columns 6, 7 and 9 are
    # presumably P_win, Z_win and total - confirm against the CSV column order.
    PVT.append(_mean_share(rate, 1, 3, 6))
    PVZ.append(_mean_share(rate, 2, 6, 6))
    ZVT.append(_mean_share(rate, 7, 5, 7))
种族胜率排名
# Keep only the matches dated after '2015' (presumably parsed by pandas as
# 2015-01-01 - confirm).
df = df[df['match_date'] > '2015']

# Sum the remaining columns per (player, own race, opponent race) and turn
# the grouping keys back into ordinary columns.
rate = df.pivot_table(
    index=['player_1', 'player_1_race', 'player_2_race'],
    aggfunc=np.sum,
).reset_index()
# Race win-rate ranking
def mm(row):
    """Store player_1's share of games won in ``row['win']`` and return ``row``.

    The share is ``row.player_1_win`` divided by ``row.total``; both are read
    through attribute access.
    """
    games_won = row.player_1_win
    games_played = row.total
    row['win'] = games_won / games_played
    return row
# Only rows whose summed `total` exceeds 100 are ranked; mm() replaces their
# `win` value with player_1_win / total.
rating = rate[rate.total > 100].apply(mm, axis=1)

# Average per (player, own race, opponent race), order by ascending win rate
# and restore the keys as columns.
lala = rating.groupby(['player_1', 'player_1_race', 'player_2_race']).mean()
lala = lala.sort_values('win')
lala = lala.reset_index()

# Change the two race letters here to look at a different match-up.
# The last 30 rows of the ascending order are the 30 highest win rates.
is_selected_matchup = (lala['player_1_race'] == 'T') & (lala['player_2_race'] == 'P')
lala[is_selected_matchup][-30:].plot('player_1', 'win', kind='barh')
选手胜率排名
# Player win-rate ranking: the player whose results are analysed below.
player = 'ByuN'
def func(row):
    """Store player_1's share of games won in ``row['win']`` and return ``row``.

    The share is ``row['player_1_win']`` divided by ``row['total']``.
    """
    games_won = row['player_1_win']
    games_played = row['total']
    row['win'] = games_won / games_played
    return row
# The start date in the filter below can be changed.
# Sum the selected player's numeric columns per opponent.  numeric_only=True
# keeps the aggregation to numeric columns; without it recent pandas versions
# refuse to sum the datetime match_date column.
temp1 = (
    df[(df.player_1 == player) & (df.match_date >= '2017-4')]
    .groupby('player_2')
    .sum(numeric_only=True)
    .reset_index()
    .apply(func, axis=1)  # win = player_1_win / total for each opponent
    .sort_values('win')
)
# Keep only opponents against whom `total` exceeds 10.
temp1 = temp1[temp1.total > 10].reset_index()
temp1.plot('player_2', 'win', kind='barh', title=player, xticks=[x * 0.1 for x in range(10)])
线上线下胜率
# Offline / online win rates
# Keep only the matches dated after '2017'.
df = df.loc[df['match_date'] > '2017']
# Select the rows whose tournament_type is 0; the cleaning step earlier in
# this file maps 'online' to 0 and 'offline' to 1.
if_on_line = df.loc[df['tournament_type'] == 0]
def lala(row):
    """Store player_1's share of games won in ``row['win']`` and return ``row``.

    The share is ``row.player_1_win`` divided by ``row.total``; both are read
    through attribute access.
    """
    games_won = row.player_1_win
    games_played = row.total
    row['win'] = games_won / games_played
    return row
# Sum the numeric columns per player over the rows selected in `if_on_line`.
# numeric_only=True keeps the aggregation to numeric columns; without it
# recent pandas versions refuse to sum the datetime match_date column.
# NOTE(review): despite its name, `offline` is built from `if_on_line`, i.e.
# from the rows with tournament_type == 0 - confirm which type is intended.
offline = if_on_line.groupby(['player_1']).sum(numeric_only=True).reset_index()
# win = player_1_win / total; after the ascending sort the last 30 rows are
# the 30 highest win rates.
offline = offline.apply(lala, axis=1).sort_values('win')[-30:]
offline.plot('player_1', 'win', kind='barh')