操作手册:https://cwiki.apache.org/confluence/display/Hive/LanguageManual
一、示例数据
- 表
员工表:emp
字段
empno :员工编号
ename :员工姓名
job :职位
mgr :上级
hiredate :入职日期
sal :工资
comm :奖金
deptno :部门编号
-- Employee table; loaded from tab-separated text files (see "load data" below).
-- Column comments mirror the field descriptions in the notes above, so the
-- schema is self-documenting in the metastore (DESCRIBE emp shows them).
CREATE TABLE IF NOT EXISTS emp(
    empno    int     COMMENT 'employee number',
    ename    string  COMMENT 'employee name',
    job      string  COMMENT 'job title',
    mgr      int     COMMENT 'manager empno; NULL for the president (see sample data)',
    hiredate string  COMMENT 'hire date as yyyy-MM-dd text',
    sal      double  COMMENT 'salary',
    comm     double  COMMENT 'commission/bonus',
    deptno   int     COMMENT 'department number, joins to dept.deptno'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
部门表:
字段
deptno :部门编号
dname :部门名称
loc :地址
-- Department lookup table (explicitly in the default database).
-- Column comments mirror the field descriptions in the notes above.
CREATE TABLE IF NOT EXISTS default.dept(
    deptno int    COMMENT 'department number',
    dname  string COMMENT 'department name',
    loc    string COMMENT 'department location'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;  -- explicit for consistency with emp (textfile is also the default)
- 数据
员工数据:
7499 ALLEN SALESMAN 7698 1981-02-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-02-22 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-04-02 2975.0 0.0 20
7654 MARTIN SALESMAN 7698 1981-09-28 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-05-01 2850.0 0.0 30
7782 CLARK MANAGER 7839 1981-06-09 2450.0 0.0 10
7788 SCOTT ANALYST 7566 1987-07-13 3000.0 0.0 20
7839 KING PRESIDENT NULL 1981-11-07 5000.0 0.0 10
7844 TURNER SALESMAN 7698 1981-09-08 1500.0 0.0 30
7876 ADAMS CLERK 7788 1987-07-13 1100.0 0.0 20
7900 JAMES CLERK 7698 1981-12-03 950.0 0.0 30
7902 FORD ANALYST 7566 1981-12-03 3000.0 0.0 20
7934 MILLER CLERK 7782 1982-01-23 1300.0 0.0 10
部门数据:
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
- 加载数据
# 加载员工数据
load data local inpath '/root/empdata' overwrite into table emp;
# 加载部门数据
load data local inpath '/root/deptdata' overwrite into table dept ;
二、简单查询
SELECT语句用来从表中检索的数据。 WHERE子句中的工作原理类似于一个条件。
它使用这个条件过滤数据,并返回给出一个有限的结果。
内置运算符和函数产生一个表达式,满足以下条件。
语法:
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[HAVING having_condition]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list]]
[LIMIT number];
1. Where查询
查询检索薪水超过3000的员工详细信息。
0: jdbc:hive2://192.168.71.130:10000> SELECT * FROM emp WHERE sal>3000;
OK
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
2. IN
0: jdbc:hive2://192.168.71.130:10000> select * from emp where deptno in (10,20);
OK
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
3. EXISTS
0: jdbc:hive2://192.168.71.130:10000> select * from emp e where exists (select dname from dept d where e.deptno=d.deptno);
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| e.empno | e.ename | e.job | e.mgr | e.hiredate | e.sal | e.comm | e.deptno |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
注意:
经测试,exists/not exists 子查询只支持相关子查询,如果子查询不是相关条件,则报错:
Invalid SubQuery expression 'deptno': For Exists/Not Exists operator SubQuery must be Correlated.
4. IS 与 IS NOT
0: jdbc:hive2://192.168.71.130:10000> select * from emp where mgr is null;
OK
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
1 row selected (0.258 seconds)
0: jdbc:hive2://192.168.71.130:10000> select * from emp where mgr is not null;
OK
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
5. DISTINCT
0: jdbc:hive2://192.168.71.130:10000> select distinct deptno from emp;
+---------+
| deptno |
+---------+
| 10 |
| 40 |
| 30 |
| 20 |
| 50 |
+---------+
6. BETWEEN AND
包头包尾。
0: jdbc:hive2://192.168.71.130:10000> select * from emp where sal between 2000 and 3000;
OK
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
三、排序
ORDER BY子句用于检索基于一列的细节并设置排序结果按升序或降序排列。
语法:
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[HAVING having_condition]
[ORDER BY col_list]
[LIMIT number];
1. order by
order by 全局排序
0: jdbc:hive2://192.168.71.130:10000> select * from emp order by empno;
OK
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
Order By ,在strict 模式下(hive.mapred.mode=strict),order by 语句必须跟着limit语句,但是在非strict下就不是必须的,这样做的理由是必须有一个reduce对最终的结果进行排序,如果最后输出的行数过多,一个reduce需要花费很长的时间。
0: jdbc:hive2://192.168.71.130:10000> select * from emp order by empno desc limit 0,5;
OK
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
Hive有一个严格模式,在严格模式下会对可能产生较大查询结果的语句做限制,禁止其提交执行。
查看当前的模式:
0: jdbc:hive2://192.168.71.130:10000> set hive.mapred.mode;
+--------------------------------+
| set |
+--------------------------------+
| hive.mapred.mode is undefined |
未定义即取默认值 nonstrict,即非严格模式。
开启严格模式:
set hive.mapred.mode=strict;
关闭严格模式:
set hive.mapred.mode=nonstrict;
order by 会对数据进行全局排序,和oracle和mysql等数据库中的order by 效果一样,它只在一个reduce中进行所以数据量特别大的时候效率非常低。
而且当设置 :set hive.mapred.mode=strict的时候不指定limit,执行select会报错,如下:
LIMIT must also be specified。
2. sort by
Sort By,它通常发生在每一个reduce里,“order by”和“sort by”的区别在于,前者能保证输出都是有顺序的,而后者如果有多个reduce的时候只是保证了输出的部分有序。
sort by 只有局部排序,保证每一个reduce 都是有序的,但是所有的reduce并不是有序的,(如果只有一个reduce那么就是有序的)。
set mapred.reduce.tasks=<number>
在sort by可以指定,在用sort by的时候,如果没有指定列,它会随机的分配到不同的reduce里去。
# 设置reduce数量
0: jdbc:hive2://192.168.71.130:10000> set mapred.reduce.tasks = 3;
0: jdbc:hive2://192.168.71.130:10000> select * from emp sort by empno;
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
sort by 是单独在各自的reduce中进行排序,所以并不能保证全局有序,一般和distribute by 一起执行,而且distribute by 要写在sort by前面。
如果mapred.reduce.tasks=1和order by效果一样,如果大于1会分成几个文件输出每个文件会按照指定的字段排序,而不保证全局有序。
sort by 不受 hive.mapred.mode 是否为strict ,nostrict 的影响。
3. distribute by
DISTRIBUTE BY 控制map 中的输出在 reducer 中是如何进行划分的。使用DISTRIBUTE BY 可以保证相同KEY的记录被划分到一个Reduce 中。
类似MR中partition,进行分区,结合sort by使用每个reduce内部进行排序,不是全局排序, distribute by 一定是放在sort by 前面,
且必须要指定mapreduce.job.reduces数量,否则导出结果还是在一个文件中 。
# 设置reduce数量
set mapreduce.job.reduces=3;
# 进行分区
0: jdbc:hive2://192.168.71.130:10000> select * from emp distribute by deptno sort by empno;
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
我们所有的deptno相同的数据会被送到同一个reducer去处理,这就是因为指定了distribute by deptno。
4. cluster by
cluster by的功能就是distribute by和sort by相结合,如下2个语句是等价的:
0: jdbc:hive2://192.168.71.130:10000> select empno, ename from emp distribute by empno sort by empno
0: jdbc:hive2://192.168.71.130:10000> select empno,ename from emp cluster by empno;
Number of reduce tasks not specified. Defaulting to jobconf value of: 3
OK
+--------+---------+
| empno | ename |
+--------+---------+
| 7654 | MARTIN |
| 7876 | ADAMS |
| 7900 | JAMES |
| 7521 | WARD |
| 7566 | JONES |
| 7698 | BLAKE |
| 7782 | CLARK |
| 7788 | SCOTT |
| 7839 | KING |
| 7902 | FORD |
| 7499 | ALLEN |
| 7844 | TURNER |
| 7934 | MILLER |
+--------+---------+
distribute by 和 sort by 合用就相当于cluster by,但是cluster by 不能指定排序为asc或 desc 的规则,只能是升序排列。
四、Group By
GROUP BY子句用于分类所有记录结果的特定集合列。它被用来查询一组记录。
GROUP BY子句的语法如下:
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[HAVING having_condition]
[ORDER BY col_list]
[LIMIT number];
产生一个查询以检索每个部门的员工数量。
0: jdbc:hive2://192.168.71.130:10000> SELECT Deptno,count(*) FROM emp GROUP BY deptno;
+---------+------+
| deptno | _c1 |
+---------+------+
| 10 | 3 |
| 30 | 6 |
| 20 | 4 |
+---------+------+
五、连接
JOIN是子句用于通过使用共同值组合来自两个表特定字段。它是用来从数据库中的两个或更多的表组合的记录。它或多或少类似于SQL JOIN。
语法
join_table:
table_reference JOIN table_factor [join_condition]
| table_reference {LEFT|RIGHT|FULL} [OUTER] JOIN table_reference
join_condition
| table_reference LEFT SEMI JOIN table_reference join_condition
| table_reference CROSS JOIN table_reference [join_condition]
连接是将两个表中在共同数据项上相互匹配的那些行合并起来,HiveQL的连接分为内连接、左向外连接、右向外连接、全外连接和半连接5种。
- 笛卡尔积
笛卡尔积会针对表1和表2的每条数据做连接。
0: jdbc:hive2://192.168.71.130:10000> select e.ename,d.dname from emp e cross join dept d;
+----------+-------------+
| e.ename | d.dname |
+----------+-------------+
| ALLEN | ACCOUNTING |
| ALLEN | RESEARCH |
| ALLEN | SALES |
| ALLEN | OPERATIONS |
| WARD | ACCOUNTING |
| WARD | RESEARCH |
| WARD | SALES |
| WARD | OPERATIONS |
| JONES | ACCOUNTING |
| JONES | RESEARCH |
| JONES | SALES |
| JONES | OPERATIONS |
| MARTIN | ACCOUNTING |
| MARTIN | RESEARCH |
| MARTIN | SALES |
| MARTIN | OPERATIONS |
| BLAKE | ACCOUNTING |
| BLAKE | RESEARCH |
| BLAKE | SALES |
| BLAKE | OPERATIONS |
| CLARK | ACCOUNTING |
| CLARK | RESEARCH |
| CLARK | SALES |
| CLARK | OPERATIONS |
| SCOTT | ACCOUNTING |
| SCOTT | RESEARCH |
| SCOTT | SALES |
| SCOTT | OPERATIONS |
| KING | ACCOUNTING |
| KING | RESEARCH |
| KING | SALES |
| KING | OPERATIONS |
| TURNER | ACCOUNTING |
| TURNER | RESEARCH |
| TURNER | SALES |
| TURNER | OPERATIONS |
| ADAMS | ACCOUNTING |
| ADAMS | RESEARCH |
| ADAMS | SALES |
| ADAMS | OPERATIONS |
| JAMES | ACCOUNTING |
| JAMES | RESEARCH |
| JAMES | SALES |
| JAMES | OPERATIONS |
| FORD | ACCOUNTING |
| FORD | RESEARCH |
| FORD | SALES |
| FORD | OPERATIONS |
| MILLER | ACCOUNTING |
| MILLER | RESEARCH |
| MILLER | SALES |
| MILLER | OPERATIONS |
+----------+-------------+
52 rows selected (16.283 seconds)
- JOIN
等值连接。
0: jdbc:hive2://192.168.71.130:10000> select e.empno,e.ename,e.sal,d.dname from emp e join dept d on (e.deptno=d.deptno);
+----------+----------+---------+-------------+
| e.empno | e.ename | e.sal | d.dname |
+----------+----------+---------+-------------+
| 7499 | ALLEN | 1600.0 | SALES |
| 7521 | WARD | 1250.0 | SALES |
| 7566 | JONES | 2975.0 | RESEARCH |
| 7654 | MARTIN | 1250.0 | SALES |
| 7698 | BLAKE | 2850.0 | SALES |
| 7782 | CLARK | 2450.0 | ACCOUNTING |
| 7788 | SCOTT | 3000.0 | RESEARCH |
| 7839 | KING | 5000.0 | ACCOUNTING |
| 7844 | TURNER | 1500.0 | SALES |
| 7876 | ADAMS | 1100.0 | RESEARCH |
| 7900 | JAMES | 950.0 | SALES |
| 7902 | FORD | 3000.0 | RESEARCH |
| 7934 | MILLER | 1300.0 | ACCOUNTING |
+----------+----------+---------+-------------+
- LEFT OUTER JOIN
左向外连接的结果集包括“LEFT OUTER”子句中指定的左表的所有行,而不仅仅是连接列所匹配的行。如果左表的某行在右表中没有匹配行,则在相关联的结果集中右表的所有选择列均为空值。
LEFT JOIN返回左表中的所有的值,加上右表,或JOIN子句没有匹配的情况下返回NULL。
0: jdbc:hive2://192.168.71.130:10000> insert into emp values(7935,"程超","bigdata",7902,'2019-12-19',5000,0,40);
0: jdbc:hive2://192.168.71.130:10000> insert into emp values(7937,"牛文娟","bigdata",7902,'2019-12-19',4000,0,50);
0: jdbc:hive2://192.168.71.130:10000> select e.empno,e.ename,e.sal,d.dname from emp e left outer join dept d on (e.deptno=d.deptno);
+----------+----------+---------+-------------+
| e.empno | e.ename | e.sal | d.dname |
+----------+----------+---------+-------------+
| 7499 | ALLEN | 1600.0 | SALES |
| 7521 | WARD | 1250.0 | SALES |
| 7566 | JONES | 2975.0 | RESEARCH |
| 7654 | MARTIN | 1250.0 | SALES |
| 7698 | BLAKE | 2850.0 | SALES |
| 7782 | CLARK | 2450.0 | ACCOUNTING |
| 7788 | SCOTT | 3000.0 | RESEARCH |
| 7839 | KING | 5000.0 | ACCOUNTING |
| 7844 | TURNER | 1500.0 | SALES |
| 7876 | ADAMS | 1100.0 | RESEARCH |
| 7900 | JAMES | 950.0 | SALES |
| 7902 | FORD | 3000.0 | RESEARCH |
| 7934 | MILLER | 1300.0 | ACCOUNTING |
+----------+----------+---------+-------------+
- RIGHT OUTER JOIN
右向外连接
右向外连接是左向外连接的反向连接,将返回右表的所有行。如果右表的某行在左表中没有匹配行,则将为左表返回空值。
JOIN是子句用于通过使用共同值组合来自两个表特定字段。它是用来从数据库中的两个或更多的表组合的记录。它或多或少类似于SQL JOIN。
0: jdbc:hive2://192.168.71.130:10000> select e.empno,e.ename,e.sal,d.dname from emp e right outer join dept d on (e.deptno=d.deptno);
+----------+----------+---------+-------------+
| e.empno | e.ename | e.sal | d.dname |
+----------+----------+---------+-------------+
| 7782 | CLARK | 2450.0 | ACCOUNTING |
| 7839 | KING | 5000.0 | ACCOUNTING |
| 7934 | MILLER | 1300.0 | ACCOUNTING |
| 7566 | JONES | 2975.0 | RESEARCH |
| 7788 | SCOTT | 3000.0 | RESEARCH |
| 7876 | ADAMS | 1100.0 | RESEARCH |
| 7902 | FORD | 3000.0 | RESEARCH |
| 7499 | ALLEN | 1600.0 | SALES |
| 7521 | WARD | 1250.0 | SALES |
| 7654 | MARTIN | 1250.0 | SALES |
| 7698 | BLAKE | 2850.0 | SALES |
| 7844 | TURNER | 1500.0 | SALES |
| 7900 | JAMES | 950.0 | SALES |
| NULL | NULL | NULL | OPERATIONS |
+----------+----------+---------+-------------+
- FULL OUTER JOIN
全外连接
全外连接返回左表和右表中的所有行。当某行在另一表中没有匹配行时,则另一个表的选择列表包含空值。如果表之间有匹配行,则整个结果集包含基表的数据值。
HiveQL FULL OUTER JOIN结合了左边,并且满足JOIN条件合适外部表的记录。连接表包含两个表的所有记录,或两侧缺少匹配结果那么使用NULL值填补。
0: jdbc:hive2://192.168.71.130:10000> select e.empno,e.ename,e.sal,d.dname from emp e full outer join dept d on (e.deptno=d.deptno);
+----------+----------+---------+-------------+
| e.empno | e.ename | e.sal | d.dname |
+----------+----------+---------+-------------+
| 7934 | MILLER | 1300.0 | ACCOUNTING |
| 7839 | KING | 5000.0 | ACCOUNTING |
| 7782 | CLARK | 2450.0 | ACCOUNTING |
| NULL | NULL | NULL | OPERATIONS |
| 7499 | ALLEN | 1600.0 | SALES |
| 7900 | JAMES | 950.0 | SALES |
| 7698 | BLAKE | 2850.0 | SALES |
| 7654 | MARTIN | 1250.0 | SALES |
| 7844 | TURNER | 1500.0 | SALES |
| 7521 | WARD | 1250.0 | SALES |
| 7876 | ADAMS | 1100.0 | RESEARCH |
| 7566 | JONES | 2975.0 | RESEARCH |
| 7902 | FORD | 3000.0 | RESEARCH |
| 7788 | SCOTT | 3000.0 | RESEARCH |
+----------+----------+---------+-------------+
- LEFT SEMI JOIN半连接(可不用)
LEFT SEMI JOIN 是 IN/EXISTS 子查询的一种更高效的实现。
早期版本的 Hive 没有实现 IN/EXISTS 子查询,可以用 LEFT SEMI JOIN 重写子查询语句(新版本已支持 WHERE 中的 IN 与相关 EXISTS,见前文示例)。
LEFT SEMI JOIN 的限制是, JOIN 子句中右边的表只能在 ON 子句中设置过滤条件,在 WHERE 子句、SELECT 子句或其他地方过滤都不行。
子查询:
0: jdbc:hive2://192.168.71.130:10000> select * from emp e where e.deptno in (select deptno from dept where deptno>20);
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| e.empno | e.ename | e.job | e.mgr | e.hiredate | e.sal | e.comm | e.deptno |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
可重写为:
0: jdbc:hive2://192.168.71.130:10000> select * from emp e left semi join dept d on (e.deptno=d.deptno);
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| e.empno | e.ename | e.job | e.mgr | e.hiredate | e.sal | e.comm | e.deptno |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
- Map JOIN
若所有表中只有一张小表,那可在最大的表通过Mapper的时候将小表完全放到内存中,Hive可以在map端执行连接过程,称为map-side join,这是因为Hive可以和内存的小表逐一匹配,从而省略掉常规连接所需的reduce过程。即使对于很小的数据集,这个优化也明显地要快于常规的连接操作。其不仅减少了reduce过程,而且有时还可以同时减少Map过程的执行步骤。
使用MAPJOIN可解决以下实际问题:
A 有一个极小的表<1000行(a是小表)
B 需要做不等值join操作(a.x<b.y或者a.x like b.y等)。普通join语法不支持不等于操作,Hive语法解析会直接抛出错误。
有两种方法实现:
A. 使用连接提示(hint):/*+ MAPJOIN(customers) */
在Hive0.11前,必须使用MAPJOIN提示来显式地启动该优化操作,由于其需要将小表加载进内存所以要注意小表的大小。
0: jdbc:hive2://192.168.71.130:10000> SELECT /*+ MAPJOIN(emp) */ e.* FROM emp e JOIN dept d WHERE e.deptno <> d.deptno;
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| e.empno | e.ename | e.job | e.mgr | e.hiredate | e.sal | e.comm | e.deptno |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
B. 通过设置自动map端连接转换,实现map连接
# 设置自动转换
0: jdbc:hive2://192.168.71.130:10000> set hive.auto.convert.join=true;
No rows affected (0.005 seconds)
0: jdbc:hive2://192.168.71.130:10000> select e.* from emp e join dept d where e.deptno<>d.deptno;
+----------+----------+------------+--------+-------------+---------+---------+-----------+
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 | 1600.0 | 300.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 | 1250.0 | 500.0 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 | 1250.0 | 1400.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-07-13 | 3000.0 | 0.0 | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-07 | 5000.0 | 0.0 | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 | 1500.0 | 0.0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7876 | ADAMS | CLERK | 7788 | 1987-07-13 | 1100.0 | 0.0 | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 | 950.0 | 0.0 | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 | 3000.0 | 0.0 | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 | 1300.0 | 0.0 | 10 |
+----------+----------+------------+--------+-------------+---------+---------+-----------+
相关参数
小表自动选择Mapjoin
set hive.auto.convert.join=true;
默认值:false。该参数为true时,Hive自动对左边的表统计量,若是小表就加入内存,即对小表使用Map join
小表阀值
set hive.mapjoin.smalltable.filesize=25000000;
默认值:25M
map join做group by操作时,可使用多大的内存来存储数据。若数据太大则不会保存在内存里
set hive.mapjoin.followby.gby.localtask.max.memory.usage;
默认值:0.55
本地任务可以使用内存的百分比
set hive.mapjoin.localtask.max.memory.usage;
默认值:0.90
六、合并查询
UNION ALL
union用于联合多个select语句的结果集,合并为一个独立的结果集。早期版本只支持UNION ALL(bag union),不能消除重复行;新版本中UNION会去重,UNION ALL不去重(见下例)。每个select语句返回的列的数量和名字必须一样,否则会抛出语法错误。
语法:
select_statement UNION ALL select_statement UNION ALL select_statement....
示例:
# 去重
0: jdbc:hive2://192.168.71.130:10000> select * from emp e union select * from emp e1;
# 不去重
0: jdbc:hive2://192.168.71.130:10000> select * from emp e union all select * from emp e1;
七、子查询
早期版本的hive本身支持的子查询非常有限,不支持where子句中的子查询,只允许子查询在from中出现;新版本已支持where子句中的in/exists等子查询(见前文示例)。
子查询语法
select .... from (subquery) name ...
1.Where子查询
0: jdbc:hive2://192.168.71.130:10000> select * from emp where mgr=(select empno from emp where trim(ename)='KING');
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 | 2975.0 | 0.0 | 20 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 | 2850.0 | 0.0 | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 | 2450.0 | 0.0 | 10 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
注意:
早期版本的where子查询只支持in、exists、is,不支持等值运算;新版本(如上例所示)已支持等值子查询。exists只支持相关子查询。
2. FROM子查询
select gz from (select sal+comm as gz from emp) e1;
八、将查询数据输出至目录
HDFS:
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='<DATE>';
将查询结果输出至本地目录:
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;
将查询结果插入表或输出至目录的更多示例:
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a;
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a WHERE a.key < 100;
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/reg_3' SELECT a.* FROM events a;
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_4' select a.invites, a.pokes FROM profiles a;
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT COUNT(1) FROM invites a WHERE a.ds='<DATE>';
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT a.foo, a.bar FROM invites a;
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/sum' SELECT SUM(a.pc) FROM pc1 a;
九、将一个表的统计结果插入另一个表中
// 必须先创建表
0: jdbc:hive2://192.168.71.130:10000> create table report (name string,gz double);
OK
0: jdbc:hive2://192.168.71.130:10000> FROM emp e INSERT OVERWRITE TABLE report SELECT e.ename as name, e.sal+e.comm as gz WHERE e.deptno = 30;
0: jdbc:hive2://192.168.71.130:10000> select * from report;
OK
+--------------+------------+
| report.name | report.gz |
+--------------+------------+
| ALLEN | 1900.0 |
| WARD | 1750.0 |
| MARTIN | 2650.0 |
| BLAKE | 2850.0 |
| TURNER | 1500.0 |
| JAMES | 950.0 |
+--------------+------------+
// 必须先建表
0: jdbc:hive2://192.168.71.130:10000> create table report1 (dept int,num int);
OK
0: jdbc:hive2://192.168.71.130:10000> INSERT OVERWRITE TABLE report1 SELECT e.deptno as dept, count(1) as num FROM emp e WHERE e.sal > 2000 group by deptno;
0: jdbc:hive2://192.168.71.130:10000> select * from report1;
OK
+---------------+--------------+
| report1.dept | report1.num |
+---------------+--------------+
| 10 | 2 |
| 20 | 3 |
| 30 | 1 |
+---------------+--------------+
JOIN
# 先创建表
0: jdbc:hive2://192.168.71.130:10000> create table report2 (name string,deptname string,address string);
OK
0: jdbc:hive2://192.168.71.130:10000> FROM emp e JOIN dept d ON (e.deptno = d.deptno) INSERT OVERWRITE TABLE report2 SELECT e.ename as name, d.dname, d.loc;
# 结果
0: jdbc:hive2://192.168.71.130:10000> select * from report2;
OK
+---------------+-------------------+------------------+
| report2.name | report2.deptname | report2.address |
+---------------+-------------------+------------------+
| ALLEN | SALES | CHICAGO |
| WARD | SALES | CHICAGO |
| JONES | RESEARCH | DALLAS |
| MARTIN | SALES | CHICAGO |
| BLAKE | SALES | CHICAGO |
| CLARK | ACCOUNTING | NEW YORK |
| SCOTT | RESEARCH | DALLAS |
| KING | ACCOUNTING | NEW YORK |
| TURNER | SALES | CHICAGO |
| ADAMS | RESEARCH | DALLAS |
| JAMES | SALES | CHICAGO |
| FORD | RESEARCH | DALLAS |
| MILLER | ACCOUNTING | NEW YORK |
+---------------+-------------------+------------------+
十、分号字符
分号是SQL语句结束标记,在HiveQL中也是结束标记,但是在HiveQL中,对分号的识别没有那么智能,例如:
0: jdbc:hive2://192.168.71.130:10000> select concat("a",concat(';',"b")) from dual;
OK
+------+
| _c0 |
+------+
| a;b |
+------+
但HiveQL在解析语句时提示:
FAILED: Parse Error: line 0:-1 mismatched input '<EOF>' expecting ) in function specification
解决的办法是,使用分号的八进制的ASCII码进行转义,那么上述语句应写成:
0: jdbc:hive2://192.168.71.130:10000> select concat('a',concat('\073','b')) from dual;
OK
+------+
| _c0 |
+------+
| a;b |
+------+
1 row selected (0.287 seconds)
附:
IS [NOT] NULL