基本内置聚集函数通常需要和GROUP BY子句一起使用。如果没有使用GROUP BY子句,聚集函数会缺省按照整行所有列来进行聚集。
- 无GROUP BY子句的聚集
> SELECT
> count(*) as rowcnt1,
> count(1) as rowcnt2 -- same to count(*)
> FROM employee;
+---------+---------+
| rowcnt1 | rowcnt2 |
+---------+---------+
| 4 | 4 |
+---------+---------+
1 row selected (0.184 seconds)
- 有GROUP BY子句的聚集
> SELECT
> gender_age.gender, count(*) as row_cnt
> FROM employee
> GROUP BY gender_age.gender;
+--------------------+----------+
| gender_age.gender | row_cnt |
+--------------------+----------+
| Female | 2 |
| Male | 3 |
+--------------------+----------+
2 rows selected (100.565 seconds)
-- The column name selected is not a group by columns causes error
> SELECT
> name, gender_age.gender, count(*) as row_cnt
> FROM employee GROUP BY gender_age.gender;
Error: Error while compiling statement: FAILED: SemanticException
[Error 10025]: Line 2:1 Expression not in GROUP BY key 'name' (state=42000,code=10025)
- 在同一个SELECT语句中有多个聚集函数
> SELECT
> gender_age.gender, avg(gender_age.age) as avg_age,
> count(*) as row_cnt
> FROM employee GROUP BY gender_age.gender;
+--------------------+---------------------+----------+
| gender_age.gender | avg_age | row_cnt |
+--------------------+---------------------+----------+
| Female | 42.0 | 2 |
| Male | 31.666666666666668 | 3 |
+--------------------+---------------------+----------+
2 rows selected (98.857 seconds)
- 聚集函数可以和条件函数等嵌套使用
> SELECT
> sum(CASE WHEN gender_age.gender = 'Male'
> THEN gender_age.age ELSE 0 END)/
> count(CASE WHEN gender_age.gender = 'Male' THEN 1
> ELSE NULL END) as male_age_avg
> FROM employee;
+---------------------+
| male_age_avg |
+---------------------+
| 31.666666666666668 |
+---------------------+
1 row selected (38.415 seconds)
> SELECT
> sum(coalesce(gender_age.age,0)) as age_sum,
> sum(if(gender_age.gender = 'Female',gender_age.age,0)) as female_age_sum
> FROM employee;
+----------+----------------+
| age_sum | female_age_sum |
+----------+----------------+
| 179 | 84 |
+----------+----------------+
1 row selected (42.137 seconds)
- GROUP BY子句可以使用表达式
> SELECT
> if(name = 'Will', 1, 0) as name_group,
> count(name) as name_cnt
> FROM employee
> GROUP BY if(name = 'Will', 1, 0);
+------------+----------+
| name_group | name_cnt |
+------------+----------+
| 0 | 3 |
| 1 | 1 |
+------------+----------+
2 rows selected (23.749 seconds)
- 聚集函数不可以嵌套使用
> SELECT avg(count(*)) as row_cnt FROM employee;
Error: Error while compiling statement: FAILED: SemanticException
[Error 10128]: Line 1:11 Not yet
supported place for UDAF 'count' (state=42000,code=10128)
- 聚集函数对NULL值的处理差异
> SELECT max(null), min(null), count(null);
+------+------+-----+
| _c0 | _c1 | _c2 |
+------+------+-----+
| NULL | NULL | 0 |
+------+------+-----+
1 row selected (23.54 seconds)
> SELECT sum(null), avg(null);
Error: Error while compiling statement: FAILED:
UDFArgumentTypeException Only numeric or string type
arguments are accepted but void is passed.
(state=42000,code=40000)
-- Create a table t for testing
> CREATE TABLE t (val1 int, val2 int);
> INSERT INTO TABLE t VALUES (1, 2),(null,2),(2,3);
No rows affected (0.138 seconds)
-- Check the rows in the table created
> SELECT * FROM t;
+---------+---------+
| t.val1 | t.val2 |
+---------+---------+
| 1 | 2 |
| NULL | 2 |
| 2 | 3 |
+---------+---------+
3 rows selected (0.069 seconds)
-- The 2nd row (NULL, 2) is ignored when doing sum(val1 + val2)
> SELECT sum(val1), sum(val1 + val2) FROM t;
+------+------+
| _c0 | _c1 |
+------+------+
| 3 | 8 |
+------+------+
1 row selected (57.775 seconds)
> SELECT
> sum(coalesce(val1,0)),
> sum(coalesce(val1,0) + val2)
> FROM t;
+------+------+
| _c0 | _c1 |
+------+------+
| 3 | 10 |
+------+------+
1 row selected (69.967 seconds)
- 聚集函数和DISTINCT关键字组合使用
> SELECT
> count(DISTINCT gender_age.gender) as gender_uni_cnt,
> count(DISTINCT name) as name_uni_cnt
> FROM employee;
+-----------------+---------------+
| gender_uni_cnt | name_uni_cnt |
+-----------------+---------------+
| 2 | 5 |
+-----------------+---------------+
1 row selected (35.935 seconds)
> SELECT gender_age.gender,
> max(struct(gender_age.age, name)).col1 as age,
> max(struct(gender_age.age, name)).col2 as name
> FROM employee
> GROUP BY gender_age.gender;
+-------------------+-----+------+
| gender_age.gender | age | name |
+-------------------+-----+------+
| Female | 57 | Lucy |
| Male | 35 | Will |
+-------------------+-----+------+
2 rows selected (26.896 seconds)