HiveQL 常用操作练手

1. 有以下数据,利用动态分区导入到分区表中(按照年、月进行多级分区)

10001 100 2019-03-01
10002 200 2019-03-02
10003 300 2019-03-03
10004 400 2019-04-01
10005 500 2019-04-02
10006 600 2019-04-03
10007 700 2019-05-01
10008 800 2019-05-02
10009 900 2019-05-03
10010 910 2019-06-01
10011 920 2019-06-02
10012 930 2019-06-03

把以上数据利用动态分区导入到分区表中(按照年、月进行多级分区)

//创建普通表
create table t_order(
    order_number string,
    order_price  double, 
    order_time   string
)row format delimited fields terminated by '\t';

//向普通表t_order加载数据
load data local inpath '/kkb/Alfred/hive/20191105/order.txt' overwrite into table t_order;

//创建目标动态分区表
create table t_order_dynamic_partition(
    order_number string,
    order_price  double, 
    order_time   string
)partitioned BY(year_month string)
row format delimited fields terminated by '\t';

//要想进行动态分区,需要设置参数
//开启动态分区功能
set hive.exec.dynamic.partition=true; 
//设置hive为非严格模式
set hive.exec.dynamic.partition.mode=nonstrict; 

//把原始的普通表加载到动态分区表中
insert into table t_order_dynamic_partition partition(year_month) 
select 
order_number,
order_price,
order_time, 
concat(year(order_time),'-',month(order_time)) as year_month
from t_order;

//结果
hive (myhive)> show partitions t_order_dynamic_partition;
OK
partition
year_month=2019-3
year_month=2019-4
year_month=2019-5
year_month=2019-6
Time taken: 0.097 seconds, Fetched: 4 row(s)

2. 根据user_trade(用户购买明细)数据创建出合理的表结构,导入数据

3.1 查出2019年一月到四月,每个品类有多人购买,累计金额是多少?

3.2 查出2019年4月,支付金额超过五万元的用户

3.3 查出2019年4月,支付金额最多的top5用户

//创建出合理的表结构
create table if not exists user_trade (
user_name  string,
piece  int,
price  double, 
pay_amount  double,
goods_category  string, 
pay_time  bigint)  
partitioned by (dt string) 
row format delimited fields terminated by '\t';

//先设置动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;    
set hive.exec.max.dynamic.partitions=10000;
set hive.exec.max.dynamic.partitions.pernode=10000;

//上传数据到hdfs; 
//hive中的存放地址WEB页面中查 http://192.168.52.100:50070/dfshealth.html#tab-overview 
hdfs dfs -put /kkb/Alfred/hive/20191105/user_trade/*  /user/hive/warehouse/myhive.db/user_trade

//进行表修复
msck repair table user_trade;

3.1 查出2019年一月到四月,每个品类有多人购买,累计金额是多少?

select 
goods_category, 
count(1) as buy_people_count, 
round(sum(pay_amount),2) as total_pay_amount
from user_trade
where dt between '2019-01-01' and '2019-04-30'
group by goods_category;

//结果
+-----------------+-------------------+-------------------+--+
| goods_category  | buy_people_count  | total_pay_amount  |
+-----------------+-------------------+-------------------+--+
| book            | 17                | 8944.5            |
| clothes         | 19                | 165400.0          |
| computer        | 16                | 5766090.0         |
| electronics     | 16                | 2175338.0         |
| food            | 18                | 1036.2            |
| shoes           | 11                | 356109.6          |
+-----------------+-------------------+-------------------+--+
6 rows selected (13.39 seconds)

3.2 查出2019年4月,支付金额超过五万元的用户

select 
user_name,
t1.total_pay_amount as 201904_pay_amount
from 
(select
user_name,
round(sum(pay_amount),2) as total_pay_amount
from user_trade
where dt between '2019-04-01' and '2019-04-30'
group by user_name) t1
where t1.total_pay_amount > 50000;

//结果
+------------+--------------------+--+
| user_name  | 201904_pay_amount  |
+------------+--------------------+--+
| Carroll    | 58596.0            |
| Cheryl     | 123144.0           |
| Iris       | 188870.0           |
| JUNE       | 519948.0           |
| Mitchell   | 193314.0           |
| Morris     | 166984.0           |
| Scott      | 95546.0            |
+------------+--------------------+--+
7 rows selected (19.071 seconds)

3.3 查出2019年4月,支付金额最多的top5用户

select 
user_name as TOP5_201904,
t1.total_pay_amount as pay_amount_201904
from 
(select
user_name,
round(sum(pay_amount),2) as total_pay_amount
from user_trade
where dt between '2019-04-01' and '2019-04-30'
group by user_name ) t1
order by pay_amount_201904 desc
limit 5;

//结果
+--------------+--------------------+--+
| top5_201904  | pay_amount_201904  |
+--------------+--------------------+--+
| JUNE         | 519948.0           |
| Mitchell     | 193314.0           |
| Iris         | 188870.0           |
| Morris       | 166984.0           |
| Cheryl       | 123144.0           |
+--------------+--------------------+--+
Time taken: 17.028 seconds, Fetched: 5 row(s)

3. 根据user_info(用户信息)数据创建出合理的表结构,导入数据

4.1 用户的首次激活时间,与2019年5月1日的日期间隔

4.2 统计一下四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数

4.3 统计每个性别用户等级高低的分布情况(level大于5为高级)

4.4 统计每个月新激活的用户数

4.5 统计不同手机品牌的用户数

// 创建出合理的表结构
create table if not exists user_info (
user_id  string, 
user_name  string, 
sex  string, 
age  int, 
city  string, 
firstactivetime  string, 
level  int,
extra1  string, 
extra2  map<string,string>) 
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile;

//先设置动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;    
set hive.exec.max.dynamic.partitions=10000;
set hive.exec.max.dynamic.partitions.pernode=10000;

//上传数据到hdfs; 
//hive中的存放地址WEB页面中查 http://192.168.52.100:50070/dfshealth.html#tab-overview 
hdfs dfs -put /kkb/Alfred/hive/20191105/user_info.txt  /user/hive/warehouse/myhive.db/user_info

4.1 用户的首次激活时间,与2019年5月1日的日期间隔

select user_id, datediff(firstactivetime,'2019-05-01') as to_20190501 from user_info;
+----------+--------------+--+
| user_id  | to_20190501  |
+----------+--------------+--+
| 10001    | -383         |
| 10002    | -332         |
| 10003    | -148         |
| 10004    | -45          |
| 10005    | -440         |
| 10006    | -2           |
| 10007    | -440         |
| 10008    | -523         |
| 10009    | -498         |
| 10010    | -420         |
+----------+--------------+--+
10 rows selected (0.193 seconds)

4.2 统计一下四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数

select case 
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end,
count(user_id) user_num
from user_info
group by case 
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end;

+---------+-----------+--+
|   _c0   | user_num  |
+---------+-----------+--+
| 20-30岁  | 48        |
| 20岁以下   | 37        |
| 30-40岁  | 62        |
| 40岁以上   | 180       |
+---------+-----------+--+
4 rows selected (13.812 seconds)

4.3 统计每个性别用户等级高低的分布情况(level大于5为高级)

select sex, sum(level_advance) as advance_nums, count(1) as total_nums
from(select sex, if(level>5,1,0) as level_advance from user_info) t1
group by sex;

+---------+---------------+-------------+--+
|   sex   | advance_nums  | total_nums  |
+---------+---------------+-------------+--+
| female  | 97            | 177         |
| male    | 67            | 150         |
+---------+---------------+-------------+--+
2 rows selected (9.753 seconds)

4.4 统计每个月新激活的用户数

select year_month, count(1) as total_new_user from 
(select user_id, substr(firstactivetime, 1,7) as Year_month from user_info) t1 
group by year_month;
 +-------------+-----------------+--+
| year_month  | total_new_user  |
+-------------+-----------------+--+
| 2017-01     | 7               |
| 2017-02     | 14              |
| 2017-03     | 11              |
| 2017-04     | 15              |
| 2017-05     | 14              |
| 2017-06     | 10              |
| 2017-07     | 12              |
| 2017-08     | 20              |
| 2017-09     | 12              |
| 2017-10     | 11              |
| 2017-11     | 17              |
| 2017-12     | 7               |
| 2018-01     | 12              |
| 2018-02     | 5               |
| 2018-03     | 15              |
| 2018-04     | 10              |
| 2018-05     | 13              |
| 2018-06     | 14              |
| 2018-07     | 15              |
| 2018-08     | 11              |
| 2018-09     | 9               |
| 2018-10     | 9               |
| 2018-11     | 10              |
| 2018-12     | 12              |
| 2019-01     | 7               |
| 2019-02     | 12              |
| 2019-03     | 14              |
| 2019-04     | 8               |
| 2019-05     | 1               |
+-------------+-----------------+--+
29 rows selected (14.062 seconds)

4.5 统计不同手机品牌的用户数

select t1.phonebrand as phonebrand,count(1) as total from 
(select extra2["phonebrand"] as phonebrand from user_info ) t1
group by t1.phonebrand;
+-------------+--------+--+
| phonebrand  | total  |
+-------------+--------+--+
| CHUIZI      | 35     |
| HUAWEI      | 28     |
| MI          | 35     |
| VIVO        | 36     |
| YIJIA       | 33     |
| iphone4     | 20     |
| iphone5     | 24     |
| iphone6     | 27     |
| iphone6s    | 24     |
| iphone7     | 22     |
| iphoneX     | 19     |
| iphoneXS    | 24     |
+-------------+--------+--+
12 rows selected (10.132 seconds)

4. 现在我们已经有了两张表,我们再根据user_refund(退款信息)创建出合理的表结构,导入数据

5.1 在2019年购买后又退款的用户

5.2 在2019年购买,但是在2019没有退款的用户

5.3 在2017年和2018年都购买的用户

5.4 在2019年购买用户的学历分布

5.5 2019年每个用户的支付和退款金额汇总

5.6 首次激活时间在2017年,但是一直没有支付的用户年龄段分布

// 创建出合理的表结构
create table if not exists user_refund(
user_name string,
refund_piece int, 
refund_amount double, 
refund_time string) 
partitioned by (dt string) 
row format delimited fields terminated by  '\t';

//上传数据到hdfs; 
//hive中的存放地址WEB页面中查 http://192.168.52.100:50070/dfshealth.html#tab-overview 
hdfs dfs -put /kkb/Alfred/hive/20191105/user_refund/*  /user/hive/warehouse/myhive.db/user_refund

//进行表修复
msck repair table user_refund;

5.1 在2019年购买后又退款的用户

select t1.user_name as user_name from
(select user_name from user_trade where dt between '2019-01-01'and'2019-12-31' group by user_name)t1 
join
(select user_name,count(1) refund_count from user_refund where dt between '2019-01-01'and'2019-12-31' group by user_name)t2
on t1.user_name = t2.user_name;

+-------------+--+
|  user_name  |
+-------------+--+
| Angelia     |
| April       |
| Brooklyn    |
| Cameron     |
| Catherine   |
| Cathy       |
| Cheryl      |
| Christy     |
| DEBBIE      |
| Ethan       |
| Fiona       |
| Francis     |
| Frank       |
| Frieda      |
| Harris      |
| Ingrid      |
| JUNE        |
| James       |
| Janet       |
| Keith       |
| Knight      |
| Maxwell     |
| Morris      |
| Parker      |
| Pearson     |
| Phillips    |
| Ray         |
| Scott       |
| Warren      |
| Washington  |
| Wheeler     |
+-------------+--+
31 rows selected (47.534 seconds)

//t1: 在2019年购买的用户
(select user_name from user_trade where dt between '2019-01-01'and'2019-12-31' group by user_name) t1
//t2: 在2019年有退款的用户
(select user_name,count(1) refund_count from user_refund where dt between '2019-01-01'and'2019-12-31' group by user_name) t2
//在2019年购买后又退款的用户 t1 join t2
select t1.user_name as user_name from
t1 
join
t2
on t1.user_name = t2.user_name;

5.2 在2019年购买,并且在2019年没有退款的用户

select user_name
from 
(select t1.user_name,t2.refund_count 
from 
(select user_name from user_trade where dt between '2019-01-01'and'2019-12-31' group by user_name) t1 
left outer join 
(select user_name,count(1) refund_count from user_refund where dt between '2019-01-01'and'2019-12-31' group by user_name) t2
on t1.user_name = t2.user_name)t3
where refund_count is null;

+------------+--+
| user_name  |
+------------+--+
| Allen      |
| Amanda     |
| Baker      |
| Bonnie     |
| Campbell   |
| Carl       |
| Carroll    |
| Cherry     |
| Cloris     |
| DAISY      |
| Elliott    |
| Emily      |
| Emma       |
| FANNY      |
| Gloria     |
| HELLEN     |
| Henry      |
| Iris       |
| Jill       |
| KAREN      |
| King       |
| Marshall   |
| Mitchell   |
| Moore      |
| Nolan      |
| Oliver     |
| Payne      |
| Peterson   |
| Regan      |
| Rupert     |
| Ward       |
+------------+--+
31 rows selected (46.349 seconds)


//t1: 在2019年购买的用户
(select user_name from user_trade where dt between '2019-01-01'and'2019-12-31' group by user_name) t1
//t2: 在2019年有退款的用户
(select user_name,count(1) refund_count from user_refund where dt between '2019-01-01'and'2019-12-31' group by user_name) t2
//t3: t1 left join t2 在2019年购买和退款的用户总表
(select t1.user_name,t2.refund_count 
from t1 left join t2
on t1.user_name = t2.user_name) t3
//在2019年购买,并且在2019年没有退款的用户
select user_name
from t3
where refund_count is null;

5.3 在2017年和2018年都购买的用户

select t1.user_name as user_name
from (select user_name from user_trade where dt between '2017-01-01'and'2017-12-31' group by user_name ) t1
join (select user_name from user_trade where dt between '2018-01-01'and'2018-12-31' group by user_name ) t2
on t1.user_name = t2.user_name;

+-------------+--+
|  user_name  |
+-------------+--+
| Abby        |
| Ailsa       |
| Albert      |
| Alexander   |
| Alice       |
| Alina       |
| Allen       |
| Amber       |
| Andy        |
| Angela      |
| Angelia     |
| Anita       |
.....


// t1: 在2017年购买的用户
(select user_name from user_trade where dt between '2017-01-01'and'2017-12-31' group by user_name ) t1
// t2: 在2018年购买的用户
(select user_name from user_trade where dt between '2018-01-01'and'2018-12-31' group by user_name ) t2
//在2017年和2018年都购买的用户 t1 join t2
select t1.user_name
from t1
join t2
on t1.user_name = t2.user_name;

5.4 在2019年购买用户的学历分布在

select education, count(1) people_nums_2019_buying from 
( select t1.user_name,t2.education
from 
(select user_name from user_trade where dt between '2019-01-01' and '2019-12-31'
group by user_name) t1
left outer join 
(select user_name, extra2['education'] as education from user_info ) t2 
on t1.user_name = t2.user_name )t3
group by education;

+------------+--------------------------+--+
| education  | people_nums_2019_buying  |
+------------+--------------------------+--+
| bachelor   | 18                       |
| doctor     | 19                       |
| master     | 25                       |
+------------+--------------------------+--+
3 rows selected (34.367 seconds)


//t1:在2019年购买用户 - t1
(select user_name from user_trade where dt between '2019-01-01' and '2019-12-31'
group by user_name) t1
//t2:学历分布 t2
(select user_name, extra2['education'] as education from user_info ) t2
//t3: 在2019年购买用户的学历。-用user_id来join更合理,但原始数据user_info,user_trade缺user_id
( select t1.user_name,t2.education
from t1
left outer join t2 
on t1.user_name = t2.user_name ) t3
//在2019年购买用户的学历分布在
select education, count(1) people_nums_2019_buying from t3
group by education;

5.5 2019年每个用户的支付和退款金额汇总

select t2.user_name as user_name, t2.pay_total as pay_total_2019, t1.refund_total as refund_total_2019
from 
(select user_name, sum(refund_amount) refund_total from user_refund
where dt between '2019-01-01' and '2019-12-31'
group by user_name ) t1
full outer join
(select user_name, sum(pay_amount) pay_total from user_trade
where dt between '2019-01-01' and '2019-12-31'
group by user_name ) t2
on t1.user_name = t2.user_name;

+-------------+-----------------+--------------------+--+
|  user_name  | pay_total_2019  | refund_total_2019  |
+-------------+-----------------+--------------------+--+
| Allen       | 489.5           | NULL               |
| Amanda      | 827.7           | NULL               |
| Angelia     | 677710.0        | 586608.0           |
| April       | 284.8           | 284.8              |
| Baker       | 66.0            | NULL               |
| Bonnie      | 33751.2         | NULL               |
| Brooklyn    | 7649.2          | 7400.0             |
| Cameron     | 1219.3          | 418.3              |
| Campbell    | 12.1            | NULL               |
| Carl        | 5600.0          | NULL               |
| Carroll     | 58596.0         | NULL               |
| Catherine   | 1320.9          | 600.0              |
| Cathy       | 1000.0          | 1000.0             |
| Cherry      | 359964.0        | NULL               |
| Cheryl      | 123144.0        | 115544.0           |
| Christy     | 368868.5        | 217756.0           |
| Cloris      | 418.3           | NULL               |
| DAISY       | 11000.0         | NULL               |
| DEBBIE      | 979.0           | 453.9              |
| Elliott     | 94.6            | NULL               |
| Emily       | 97.9            | NULL               |
| Emma        | 486618.0        | NULL               |
| Ethan       | 124300.0        | 111100.0           |
| FANNY       | 320.4           | NULL               |
| Fiona       | 160094.0        | 110.0              |
| Francis     | 213358.2        | 46.2               |
| Frank       | 22041.6         | 22041.6            |
| Frieda      | 192670.0        | 192670.0           |
| Gloria      | 195536.0        | NULL               |
| HELLEN      | 14406.6         | NULL               |
| Harris      | 16.5            | 16.5               |
| Henry       | 62.7            | NULL               |
| Ingrid      | 465.2           | 82.5               |
| Iris        | 188870.0        | NULL               |
| JUNE        | 519948.0        | 379962.0           |
| James       | 286638.0        | 286638.0           |
| Janet       | 122652.6        | 66124.8            |
| Jill        | 14822.0         | NULL               |
| KAREN       | 99.0            | NULL               |
| Keith       | 1268762.0       | 579942.0           |
| King        | 553856.5        | NULL               |
| Knight      | 77.0            | 77.0               |
| Marshall    | 144430.0        | NULL               |
| Maxwell     | 246.2           | 46.2               |
| Mitchell    | 193785.7        | NULL               |
| Moore       | 564144.0        | NULL               |
| Morris      | 175384.0        | 8400.0             |
| Nolan       | 30996.0         | NULL               |
| Oliver      | 202202.0        | NULL               |
| Parker      | 631.9           | 631.9              |
| Payne       | 12247.3         | NULL               |
| Pearson     | 13800.0         | 13800.0            |
| Peterson    | 2066.4          | NULL               |
| Phillips    | 626604.0        | 626604.0           |
| Ray         | 827.7           | 827.7              |
| Regan       | 23419.2         | NULL               |
| Rupert      | 62680.8         | NULL               |
| Scott       | 95546.0         | 95546.0            |
| Ward        | 26174.4         | NULL               |
| Warren      | 98310.9         | 97768.0            |
| Washington  | 32460.5         | 32460.5            |
| Wheeler     | 148254.0        | 126654.0           |
+-------------+-----------------+--------------------+--+
62 rows selected (28.218 seconds)

5.6 首次激活时间在2017年,但是一直没有支付的用户年龄段分布

select case 
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end,
count(t3.user_id) user_nums
from (
select t1.user_name,t1.user_id,t1.age, t2.pay_count
from (select user_name,user_ID,age,firstactivetime from user_info
where substr(firstactivetime,1,7) between '2017-01-01' and '2017-12-31') t1
left outer join
(select user_name, count(1) as pay_count from user_trade group by user_name) t2
on t1.user_name = t2.user_name) t3
where pay_count is NULL
group by case 
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end;

+---------+-----------+--+
|   _c0   | user_nums  |
+---------+-----------+--+
| 20-30岁  | 5         |
| 20岁以下   | 3         |
| 30-40岁  | 7         |
| 40岁以上   | 34        |
+---------+-----------+--+
4 rows selected (44.513 seconds)


//t1: 首次激活时间在2017年
(select user_name,user_ID,age,firstactivetime from user_info
 where substr(firstactivetime,1,7) between '2017-01-01' and '2017-12-31') t1
//t2: 有支付的用户
(select user_name, count(1) as pay_count from user_trade group by user_name) t2
//t3: 首次激活时间在2017年, pay_count标识是否有支付
(select t1.user_name,t1.user_id, t1.age, t2.pay_count
from t1
left outer join t2
on t1.user_name = t2.user_name) t3
//统计四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数,where pay_count is NULL
select case 
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end,
count(user_id) user_num
from t3
where t3.pay_count is NULL
group by case 
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else'40岁以上' end;
运行结果
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。