1. 有以下数据,利用动态分区导入到分区表中(按照年、月进行多级分区)
10001 100 2019-03-01
10002 200 2019-03-02
10003 300 2019-03-03
10004 400 2019-04-01
10005 500 2019-04-02
10006 600 2019-04-03
10007 700 2019-05-01
10008 800 2019-05-02
10009 900 2019-05-03
10010 910 2019-06-01
10011 920 2019-06-02
10012 930 2019-06-03
把以上数据利用动态分区导入到分区表中(按照年、月进行多级分区)
//创建普通表
create table t_order(
order_number string,
order_price double,
order_time string
)row format delimited fields terminated by '\t';
//向普通表t_order加载数据
load data local inpath '/kkb/Alfred/hive/20191105/order.txt' overwrite into table t_order;
//创建目标动态分区表
create table t_order_dynamic_partition(
order_number string,
order_price double,
order_time string
)partitioned BY(year_month string)
row format delimited fields terminated by '\t';
//要想进行动态分区,需要设置参数
//开启动态分区功能
set hive.exec.dynamic.partition=true;
//设置hive为非严格模式
set hive.exec.dynamic.partition.mode=nonstrict;
//把原始的普通表加载到动态分区表中
insert into table t_order_dynamic_partition partition(year_month)
select
order_number,
order_price,
order_time,
concat(year(order_time),'-',month(order_time)) as year_month
from t_order;
//结果
hive (myhive)> show partitions t_order_dynamic_partition;
OK
partition
year_month=2019-3
year_month=2019-4
year_month=2019-5
year_month=2019-6
Time taken: 0.097 seconds, Fetched: 4 row(s)
2. 根据user_trade(用户购买明细)数据创建出合理的表结构,导入数据
3.1 查出2019年一月到四月,每个品类有多人购买,累计金额是多少?
3.2 查出2019年4月,支付金额超过五万元的用户
3.3 查出2019年4月,支付金额最多的top5用户
//创建出合理的表结构
create table if not exists user_trade (
user_name string,
piece int,
price double,
pay_amount double,
goods_category string,
pay_time bigint)
partitioned by (dt string)
row format delimited fields terminated by '\t';
//先设置动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=10000;
set hive.exec.max.dynamic.partitions.pernode=10000;
//上传数据到hdfs;
//hive中的存放地址WEB页面中查 http://192.168.52.100:50070/dfshealth.html#tab-overview
hdfs dfs -put /kkb/Alfred/hive/20191105/user_trade/* /user/hive/warehouse/myhive.db/user_trade
//进行表修复
msck repair table user_trade;
3.1 查出2019年一月到四月,每个品类有多人购买,累计金额是多少?
select
goods_category,
count(1) as buy_people_count,
round(sum(pay_amount),2) as total_pay_amount
from user_trade
where dt between '2019-01-01' and '2019-04-30'
group by goods_category;
//结果
+-----------------+-------------------+-------------------+--+
| goods_category | buy_people_count | total_pay_amount |
+-----------------+-------------------+-------------------+--+
| book | 17 | 8944.5 |
| clothes | 19 | 165400.0 |
| computer | 16 | 5766090.0 |
| electronics | 16 | 2175338.0 |
| food | 18 | 1036.2 |
| shoes | 11 | 356109.6 |
+-----------------+-------------------+-------------------+--+
6 rows selected (13.39 seconds)
3.2 查出2019年4月,支付金额超过五万元的用户
select
user_name,
t1.total_pay_amount as 201904_pay_amount
from
(select
user_name,
round(sum(pay_amount),2) as total_pay_amount
from user_trade
where dt between '2019-04-01' and '2019-04-30'
group by user_name) t1
where t1.total_pay_amount > 50000;
//结果
+------------+--------------------+--+
| user_name | 201904_pay_amount |
+------------+--------------------+--+
| Carroll | 58596.0 |
| Cheryl | 123144.0 |
| Iris | 188870.0 |
| JUNE | 519948.0 |
| Mitchell | 193314.0 |
| Morris | 166984.0 |
| Scott | 95546.0 |
+------------+--------------------+--+
7 rows selected (19.071 seconds)
3.3 查出2019年4月,支付金额最多的top5用户
select
user_name as TOP5_201904,
t1.total_pay_amount as pay_amount_201904
from
(select
user_name,
round(sum(pay_amount),2) as total_pay_amount
from user_trade
where dt between '2019-04-01' and '2019-04-30'
group by user_name ) t1
order by pay_amount_201904 desc
limit 5;
//结果
+--------------+--------------------+--+
| top5_201904 | pay_amount_201904 |
+--------------+--------------------+--+
| JUNE | 519948.0 |
| Mitchell | 193314.0 |
| Iris | 188870.0 |
| Morris | 166984.0 |
| Cheryl | 123144.0 |
+--------------+--------------------+--+
Time taken: 17.028 seconds, Fetched: 5 row(s)
3. 根据user_info(用户信息)数据创建出合理的表结构,导入数据
4.1 用户的首次激活时间,与2019年5月1日的日期间隔
4.2 统计一下四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数
4.3 统计每个性别用户等级高低的分布情况(level大于5为高级)
4.4 统计每个月新激活的用户数
4.5 统计不同手机品牌的用户数
// 创建出合理的表结构
create table if not exists user_info (
user_id string,
user_name string,
sex string,
age int,
city string,
firstactivetime string,
level int,
extra1 string,
extra2 map<string,string>)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile;
//先设置动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=10000;
set hive.exec.max.dynamic.partitions.pernode=10000;
//上传数据到hdfs;
//hive中的存放地址WEB页面中查 http://192.168.52.100:50070/dfshealth.html#tab-overview
hdfs dfs -put /kkb/Alfred/hive/20191105/user_info.txt /user/hive/warehouse/myhive.db/user_info
4.1 用户的首次激活时间,与2019年5月1日的日期间隔
select user_id, datediff(firstactivetime,'2019-05-01') as to_20190501 from user_info;
+----------+--------------+--+
| user_id | to_20190501 |
+----------+--------------+--+
| 10001 | -383 |
| 10002 | -332 |
| 10003 | -148 |
| 10004 | -45 |
| 10005 | -440 |
| 10006 | -2 |
| 10007 | -440 |
| 10008 | -523 |
| 10009 | -498 |
| 10010 | -420 |
+----------+--------------+--+
10 rows selected (0.193 seconds)
4.2 统计一下四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数
select case
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end,
count(user_id) user_num
from user_info
group by case
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end;
+---------+-----------+--+
| _c0 | user_num |
+---------+-----------+--+
| 20-30岁 | 48 |
| 20岁以下 | 37 |
| 30-40岁 | 62 |
| 40岁以上 | 180 |
+---------+-----------+--+
4 rows selected (13.812 seconds)
4.3 统计每个性别用户等级高低的分布情况(level大于5为高级)
select sex, sum(level_advance) as advance_nums, count(1) as total_nums
from(select sex, if(level>5,1,0) as level_advance from user_info) t1
group by sex;
+---------+---------------+-------------+--+
| sex | advance_nums | total_nums |
+---------+---------------+-------------+--+
| female | 97 | 177 |
| male | 67 | 150 |
+---------+---------------+-------------+--+
2 rows selected (9.753 seconds)
4.4 统计每个月新激活的用户数
select year_month, count(1) as total_new_user from
(select user_id, substr(firstactivetime, 1,7) as Year_month from user_info) t1
group by year_month;
+-------------+-----------------+--+
| year_month | total_new_user |
+-------------+-----------------+--+
| 2017-01 | 7 |
| 2017-02 | 14 |
| 2017-03 | 11 |
| 2017-04 | 15 |
| 2017-05 | 14 |
| 2017-06 | 10 |
| 2017-07 | 12 |
| 2017-08 | 20 |
| 2017-09 | 12 |
| 2017-10 | 11 |
| 2017-11 | 17 |
| 2017-12 | 7 |
| 2018-01 | 12 |
| 2018-02 | 5 |
| 2018-03 | 15 |
| 2018-04 | 10 |
| 2018-05 | 13 |
| 2018-06 | 14 |
| 2018-07 | 15 |
| 2018-08 | 11 |
| 2018-09 | 9 |
| 2018-10 | 9 |
| 2018-11 | 10 |
| 2018-12 | 12 |
| 2019-01 | 7 |
| 2019-02 | 12 |
| 2019-03 | 14 |
| 2019-04 | 8 |
| 2019-05 | 1 |
+-------------+-----------------+--+
29 rows selected (14.062 seconds)
4.5 统计不同手机品牌的用户数
select t1.phonebrand as phonebrand,count(1) as total from
(select extra2["phonebrand"] as phonebrand from user_info ) t1
group by t1.phonebrand;
+-------------+--------+--+
| phonebrand | total |
+-------------+--------+--+
| CHUIZI | 35 |
| HUAWEI | 28 |
| MI | 35 |
| VIVO | 36 |
| YIJIA | 33 |
| iphone4 | 20 |
| iphone5 | 24 |
| iphone6 | 27 |
| iphone6s | 24 |
| iphone7 | 22 |
| iphoneX | 19 |
| iphoneXS | 24 |
+-------------+--------+--+
12 rows selected (10.132 seconds)
4. 现在我们已经有了两张表,我们再根据user_refund(退款信息)创建出合理的表结构,导入数据
5.1 在2019年购买后又退款的用户
5.2 在2019年购买,但是在2019没有退款的用户
5.3 在2017年和2018年都购买的用户
5.4 在2019年购买用户的学历分布
5.5 2019年每个用户的支付和退款金额汇总
5.6 首次激活时间在2017年,但是一直没有支付的用户年龄段分布
// 创建出合理的表结构
create table if not exists user_refund(
user_name string,
refund_piece int,
refund_amount double,
refund_time string)
partitioned by (dt string)
row format delimited fields terminated by '\t';
//上传数据到hdfs;
//hive中的存放地址WEB页面中查 http://192.168.52.100:50070/dfshealth.html#tab-overview
hdfs dfs -put /kkb/Alfred/hive/20191105/user_refund/* /user/hive/warehouse/myhive.db/user_refund
//进行表修复
msck repair table user_refund;
5.1 在2019年购买后又退款的用户
select t1.user_name as user_name from
(select user_name from user_trade where dt between '2019-01-01'and'2019-12-31' group by user_name)t1
join
(select user_name,count(1) refund_count from user_refund where dt between '2019-01-01'and'2019-12-31' group by user_name)t2
on t1.user_name = t2.user_name;
+-------------+--+
| user_name |
+-------------+--+
| Angelia |
| April |
| Brooklyn |
| Cameron |
| Catherine |
| Cathy |
| Cheryl |
| Christy |
| DEBBIE |
| Ethan |
| Fiona |
| Francis |
| Frank |
| Frieda |
| Harris |
| Ingrid |
| JUNE |
| James |
| Janet |
| Keith |
| Knight |
| Maxwell |
| Morris |
| Parker |
| Pearson |
| Phillips |
| Ray |
| Scott |
| Warren |
| Washington |
| Wheeler |
+-------------+--+
31 rows selected (47.534 seconds)
//t1: 在2019年购买的用户
(select user_name from user_trade where dt between '2019-01-01'and'2019-12-31' group by user_name) t1
//t2: 在2019年有退款的用户
(select user_name,count(1) refund_count from user_refund where dt between '2019-01-01'and'2019-12-31' group by user_name) t2
//在2019年购买后又退款的用户 t1 join t2
select t1.user_name as user_name from
t1
join
t2
on t1.user_name = t2.user_name;
5.2 在2019年购买,并且在2019年没有退款的用户
select user_name
from
(select t1.user_name,t2.refund_count
from
(select user_name from user_trade where dt between '2019-01-01'and'2019-12-31' group by user_name) t1
left outer join
(select user_name,count(1) refund_count from user_refund where dt between '2019-01-01'and'2019-12-31' group by user_name) t2
on t1.user_name = t2.user_name)t3
where refund_count is null;
+------------+--+
| user_name |
+------------+--+
| Allen |
| Amanda |
| Baker |
| Bonnie |
| Campbell |
| Carl |
| Carroll |
| Cherry |
| Cloris |
| DAISY |
| Elliott |
| Emily |
| Emma |
| FANNY |
| Gloria |
| HELLEN |
| Henry |
| Iris |
| Jill |
| KAREN |
| King |
| Marshall |
| Mitchell |
| Moore |
| Nolan |
| Oliver |
| Payne |
| Peterson |
| Regan |
| Rupert |
| Ward |
+------------+--+
31 rows selected (46.349 seconds)
//t1: 在2019年购买的用户
(select user_name from user_trade where dt between '2019-01-01'and'2019-12-31' group by user_name) t1
//t2: 在2019年有退款的用户
(select user_name,count(1) refund_count from user_refund where dt between '2019-01-01'and'2019-12-31' group by user_name) t2
//t3: t1 left join t2 在2019年购买和退款的用户总表
(select t1.user_name,t2.refund_count
from t1 left join t2
on t1.user_name = t2.user_name) t3
//在2019年购买,并且在2019年没有退款的用户
select user_name
from t3
where refund_count is null;
5.3 在2017年和2018年都购买的用户
select t1.user_name as user_name
from (select user_name from user_trade where dt between '2017-01-01'and'2017-12-31' group by user_name ) t1
join (select user_name from user_trade where dt between '2018-01-01'and'2018-12-31' group by user_name ) t2
on t1.user_name = t2.user_name;
+-------------+--+
| user_name |
+-------------+--+
| Abby |
| Ailsa |
| Albert |
| Alexander |
| Alice |
| Alina |
| Allen |
| Amber |
| Andy |
| Angela |
| Angelia |
| Anita |
.....
// t1: 在2017年购买的用户
(select user_name from user_trade where dt between '2017-01-01'and'2017-12-31' group by user_name ) t1
// t2: 在2018年购买的用户
(select user_name from user_trade where dt between '2018-01-01'and'2018-12-31' group by user_name ) t2
//在2017年和2018年都购买的用户 t1 join t2
select t1.user_name
from t1
join t2
on t1.user_name = t2.user_name;
5.4 在2019年购买用户的学历分布在
select education, count(1) people_nums_2019_buying from
( select t1.user_name,t2.education
from
(select user_name from user_trade where dt between '2019-01-01' and '2019-12-31'
group by user_name) t1
left outer join
(select user_name, extra2['education'] as education from user_info ) t2
on t1.user_name = t2.user_name )t3
group by education;
+------------+--------------------------+--+
| education | people_nums_2019_buying |
+------------+--------------------------+--+
| bachelor | 18 |
| doctor | 19 |
| master | 25 |
+------------+--------------------------+--+
3 rows selected (34.367 seconds)
//t1:在2019年购买用户 - t1
(select user_name from user_trade where dt between '2019-01-01' and '2019-12-31'
group by user_name) t1
//t2:学历分布 t2
(select user_name, extra2['education'] as education from user_info ) t2
//t3: 在2019年购买用户的学历。-用user_id来join更合理,但原始数据user_info,user_trade缺user_id
( select t1.user_name,t2.education
from t1
left outer join t2
on t1.user_name = t2.user_name ) t3
//在2019年购买用户的学历分布在
select education, count(1) people_nums_2019_buying from t3
group by education;
5.5 2019年每个用户的支付和退款金额汇总
select t2.user_name as user_name, t2.pay_total as pay_total_2019, t1.refund_total as refund_total_2019
from
(select user_name, sum(refund_amount) refund_total from user_refund
where dt between '2019-01-01' and '2019-12-31'
group by user_name ) t1
full outer join
(select user_name, sum(pay_amount) pay_total from user_trade
where dt between '2019-01-01' and '2019-12-31'
group by user_name ) t2
on t1.user_name = t2.user_name;
+-------------+-----------------+--------------------+--+
| user_name | pay_total_2019 | refund_total_2019 |
+-------------+-----------------+--------------------+--+
| Allen | 489.5 | NULL |
| Amanda | 827.7 | NULL |
| Angelia | 677710.0 | 586608.0 |
| April | 284.8 | 284.8 |
| Baker | 66.0 | NULL |
| Bonnie | 33751.2 | NULL |
| Brooklyn | 7649.2 | 7400.0 |
| Cameron | 1219.3 | 418.3 |
| Campbell | 12.1 | NULL |
| Carl | 5600.0 | NULL |
| Carroll | 58596.0 | NULL |
| Catherine | 1320.9 | 600.0 |
| Cathy | 1000.0 | 1000.0 |
| Cherry | 359964.0 | NULL |
| Cheryl | 123144.0 | 115544.0 |
| Christy | 368868.5 | 217756.0 |
| Cloris | 418.3 | NULL |
| DAISY | 11000.0 | NULL |
| DEBBIE | 979.0 | 453.9 |
| Elliott | 94.6 | NULL |
| Emily | 97.9 | NULL |
| Emma | 486618.0 | NULL |
| Ethan | 124300.0 | 111100.0 |
| FANNY | 320.4 | NULL |
| Fiona | 160094.0 | 110.0 |
| Francis | 213358.2 | 46.2 |
| Frank | 22041.6 | 22041.6 |
| Frieda | 192670.0 | 192670.0 |
| Gloria | 195536.0 | NULL |
| HELLEN | 14406.6 | NULL |
| Harris | 16.5 | 16.5 |
| Henry | 62.7 | NULL |
| Ingrid | 465.2 | 82.5 |
| Iris | 188870.0 | NULL |
| JUNE | 519948.0 | 379962.0 |
| James | 286638.0 | 286638.0 |
| Janet | 122652.6 | 66124.8 |
| Jill | 14822.0 | NULL |
| KAREN | 99.0 | NULL |
| Keith | 1268762.0 | 579942.0 |
| King | 553856.5 | NULL |
| Knight | 77.0 | 77.0 |
| Marshall | 144430.0 | NULL |
| Maxwell | 246.2 | 46.2 |
| Mitchell | 193785.7 | NULL |
| Moore | 564144.0 | NULL |
| Morris | 175384.0 | 8400.0 |
| Nolan | 30996.0 | NULL |
| Oliver | 202202.0 | NULL |
| Parker | 631.9 | 631.9 |
| Payne | 12247.3 | NULL |
| Pearson | 13800.0 | 13800.0 |
| Peterson | 2066.4 | NULL |
| Phillips | 626604.0 | 626604.0 |
| Ray | 827.7 | 827.7 |
| Regan | 23419.2 | NULL |
| Rupert | 62680.8 | NULL |
| Scott | 95546.0 | 95546.0 |
| Ward | 26174.4 | NULL |
| Warren | 98310.9 | 97768.0 |
| Washington | 32460.5 | 32460.5 |
| Wheeler | 148254.0 | 126654.0 |
+-------------+-----------------+--------------------+--+
62 rows selected (28.218 seconds)
5.6 首次激活时间在2017年,但是一直没有支付的用户年龄段分布
select case
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end,
count(t3.user_id) user_nums
from (
select t1.user_name,t1.user_id,t1.age, t2.pay_count
from (select user_name,user_ID,age,firstactivetime from user_info
where substr(firstactivetime,1,7) between '2017-01-01' and '2017-12-31') t1
left outer join
(select user_name, count(1) as pay_count from user_trade group by user_name) t2
on t1.user_name = t2.user_name) t3
where pay_count is NULL
group by case
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end;
+---------+-----------+--+
| _c0 | user_nums |
+---------+-----------+--+
| 20-30岁 | 5 |
| 20岁以下 | 3 |
| 30-40岁 | 7 |
| 40岁以上 | 34 |
+---------+-----------+--+
4 rows selected (44.513 seconds)
//t1: 首次激活时间在2017年
(select user_name,user_ID,age,firstactivetime from user_info
where substr(firstactivetime,1,7) between '2017-01-01' and '2017-12-31') t1
//t2: 有支付的用户
(select user_name, count(1) as pay_count from user_trade group by user_name) t2
//t3: 首次激活时间在2017年, pay_count标识是否有支付
(select t1.user_name,t1.user_id, t1.age, t2.pay_count
from t1
left outer join t2
on t1.user_name = t2.user_name) t3
//统计四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数,where pay_count is NULL
select case
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end,
count(user_id) user_num
from t3
where t3.pay_count is NULL
group by case
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end;