一、思维导图
二、准备工作
2.1使用工具
由于hive安装复杂,本项目使用金融数据分析案例第四篇《Hive初步学习》提供的【数据蛙环境(linux服务地址:106.13.128.83、用户名:froghd、密码:暂不透露)】进行操作。
操作工具主要为Xshell.6.0.0121.7z【优点:更便于使用linux服务】。
2.2数据下载
数据下载链接
下载完成后,我放在桌面。
2.3数据描述
三份数据如下(均为 .dat 文件):
1、users.dat
数据格式为: 2::M::56::16::70072,共有6040条数据
对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
对应字段中文解释:用户id,性别,年龄,职业,邮政编码
2、movies.dat
数据格式为: 2::Jumanji (1995)::Adventure|Children's|Fantasy,共有3883条数据
对应字段为:MovieID BigInt, Title String, Genres String
对应字段中文解释:电影ID,电影名字,电影类型
3、ratings.dat
数据格式为: 1::1193::5::978300760,共有1000209条数据
对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
对应字段中文解释:用户ID,电影ID,评分,评分时间戳
2.4数据上传至linux服务器
cd /home/mike -- 然后输入左侧代码,回车后显示如下图
sudo rz
-- 如果输入rz报错,说明你没有权限;此时需要通过输入sudo rz来
-- sudo允许一个已授权用户以超级用户角色来访问
三、十题操作
1、正确建表,导入数据(三张表,三份数据),并验证是否正确
创建一个数据库wt,在wt数据库中创建3张表,t_user,t_movie,t_rating
t_user : userid bigint,sex string,age int,occupation string,zipcode string
t_movie : movieid bigint,moviename string,movietype string
t_rating : userid bigint,movieid bigint,rate double,times string
原始数据是以::进行切分的,所以需要使用能解析多字节分隔符的Serde即可
使用RegexSerde
需要两个参数:
input.regex = "(.)::(.)::(.*)"
output.format.string = "%1s %3$s"
-- 创建数据库
drop database if exists wt;
create database if not exists wt;
use wt; -- 此步不可少!
-- 创建表t_user
create table t_user(
userid bigint,
sex string,
age int,
occupation string,
zipcode string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;
-- 创建表t_movie
create table t_movie(
movieid bigint,
moviename string,
movietype string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;
-- 创建表t_rating
create table t_rating(
userid bigint,
movieid bigint,
rate double,
times string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;
-- 导入数据
load data local inpath "/home/mike/users.dat" into table t_user;
-- 验证
select t.* from t_user t;
2、求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
select b.moviename as moviename,count(a.rate) as total from t_rating a
join t_movie b
on a.movieid=b.movieid
group by moviename --【此处是注释】或者b.moviename
order by total desc
limit 10;
3、分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
select a.sex as sex,c.moviename as moviename,avg(b.rate) as avgrate,count(c.moviename) as total from t_user a
join t_rating b on a.userid=b.userid
join t_movie c on c.movieid=b.movieid
group by sex,moviename
having sex="F" -- having sex="M"即为男性
order by avgrate desc
limit 100;
4、求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)
select a.age as age,avg(b.rate) as avgrate
from t_user a
join t_rating b on a.userid=b.userid
where b.movieid=2116
group by age
order by avgrate desc;
5、求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
select a.userid ,count(a.rate) as ratecount
from t_rating a
join t_user b on a.userid=b.userid
where b.sex="F"
group by a.userid
order by ratecount desc
limit 5;
create table answer_B as
select a.movieid as movieid, a.rate as rate
from t_rating a
where a.userid=1150
order by rate desc;
select * from answer_B limit 10;
select b.moviename as moviename,avg(c.rate) as avgrate
from answer_B a
join t_movie b on a.movieid=b.movieid
join t_rating c on b.movieid=c.movieid
group by moviename;
6、求好片(评分>=4.0)最多的那个年份的最好看的10部电影
create table answer6_A as
select a.movieid as movieid,a.moviename as moviename,substr(a.moviename,-5,4) as year,avg(b.rate) as avgrate
from t_movie a
join t_rating b on a.movieid=b.movieid
group by a.movieid,a.moviename;
select * from answer6_A limit 10;
select year,count(avgrate>=4) as count
from answer6_A
group by year
order by count desc
limit 5;
select movieid,moviename,avgrate
from answer6_A
where year=1998
order by avgrate desc
limit 10;
7、求1997年上映的电影中,评分最高的10部Comedy类电影
create table answer7_A as
select a.movieid as id,a.moviename as name,a.year as year,a.avgrate as avgrate,b.movietype as type
from answer6_A a
join t_movie b on a.movieid=b.movieid;
select * from answer7_A limit 10;
select id,name,avgrate,type
from answer7_A
where year=1997 and instr(lcase(type),"comedy")>0
order by avgrate desc
limit 10;
8、该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
create table answer8_A as
select a.id as id ,a.name as name,a.year as year,a.avgrate as avgrate,tv.typesplit as typesplit
from answer7_A a
lateral view explode(split(type,"\\|")) tv as typesplit;
create table answer8_B as
select id,year,lcase(typesplit) as type,name,avgrate,row_number() over(partition by lcase(typesplit) order by avgrate desc) as num
from answer8_A;
select * from answer8_B
limit 10;
select type,name,avgrate from answer8_B
where num<=5;
9、各年评分最高的电影类型(年份,类型,影评分)
create table answer9_A as
select year,lcase(typesplit) as type,avg(avgrate) as rate
from answer8_A
group by year,lcase(typesplit) ;
select * from answer9_A limit 10;
create table answer9_B as
select year,type,rate,row_number() over (partition by year order by rate) as num
from answer9_A;
select * from answer9_B where num=1;
10、每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)
create table answer10_A as
select a.zipcode as city,c.moviename as name, avg(b.rate) as avgrate
from t_user a
join t_rating b on a.userid=b.userid
join t_movie c on b.movieid=c.movieid
group by a.zipcode, c.moviename;
create table answer10_B as
select city,name,avgrate,row_number() over(partition by city order by avgrate desc) as num
from answer10_A;
select * from answer10_B limit 10;
insert overwrite directory "/wt/answer10"
row format delimited fields terminated by "\t"
select * from answer10_B where num=1;
参考资料:
(1)Xshell 怎么上传文件到Linux
(2)Hive学习之路 (十二)Hive SQL练习之影评案例
(3)金融数据分析案例第四篇《Hive初步学习》