[ml-100k电影数据] 3 按性别统计各星评价数

在 Hive 命令行中输入以下查询代码:

-- Export one row per gender with the number of ratings at each star level
-- (1 to 5), written as comma-delimited text under the local directory below.
-- Each rating row contributes 1 to the column matching its star value and 0
-- to the others, so every starN column is a conditional count.
insert overwrite local directory '/home/badou/hiveout/udata_gender_rating'
row format delimited fields terminated by ','
select
    gender,
    sum(case when rating = 1 then 1 else 0 end) as star1,
    sum(case when rating = 2 then 1 else 0 end) as star2,
    sum(case when rating = 3 then 1 else 0 end) as star3,
    sum(case when rating = 4 then 1 else 0 end) as star4,
    sum(case when rating = 5 then 1 else 0 end) as star5
from u_data d
join u_user u
    on d.userid = u.userid
group by gender;

使用 Python 读取导出的数据并画图:

'''Plot the per-gender rating counts for each star level (udata_gender_rating).

Reads the comma-separated export (gender, star1..star5), then draws two
stacked bar charts: one with each gender's star distribution as proportions
and one with the raw counts.
'''
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


base_dir = r'C:\Users\yyy\Desktop\share_folder'
file = base_dir + '\\udata_gender_rating'

# Names of the five count columns, in file order.
star_cols = ['star1','star2','star3','star4','star5']

# The file has no header row; the first field is used as the row index.
data = pd.read_csv(file,header=None,sep=',',index_col=0)
data.columns = star_cols
# Name the index on `data` itself: the original set it on `data_pro` before
# that variable existed, which raised NameError. The copy below inherits it.
data.index.name = 'gender'

# Convert each row's counts into fractions of that row's total.
data_pro = data.copy()
data_pro['col_sum'] = data_pro[star_cols].sum(axis=1)
data_pro[star_cols] = data_pro[star_cols].div(data_pro['col_sum'],axis='rows')

# Proportion chart
data_pro.drop(['col_sum'],axis=1).plot(kind='bar', stacked=True,alpha=0.5)
# Each stacked bar sums to 1; leave a little headroom above it.
plt.ylim(0,1.1)

# Raw-count chart
data.plot.bar(stacked=True,alpha=0.7)
比例图

原始数据图
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容