本篇以实例介绍如何优化 Hive 大表关联大表时产生的小笛卡尔积。
如果是没有关联条件的大笛卡尔积,是没法做优化的,如果是N:N join的小笛卡尔积,Hive有SMB Join来优化,下面上代码:
-- Original problem query: self-join on a non-unique key (comm_id),
-- which produces a small Cartesian product within each comm_id group.
SELECT
    a.*,
    b.*
FROM aBigTable AS a
INNER JOIN aBigTable AS b
    ON a.comm_id = b.comm_id
WHERE a.pro_id > b.pro_id
  AND a.hash != b.hash
comm_id 不唯一,会产生小笛卡尔积。根据数据量(约 2000 万行、14GB)粗略预估,分为 250 个桶。
-- Bucketed + sorted staging table: bucketing and sorting on the join key
-- (comm_id) is the precondition for Hive's SMB (sort-merge-bucket) join.
CREATE TABLE fentongtable (
    comm_id  INT,
    pro_id   BIGINT,
    city_id  INT,
    hash     STRING,
    feature  STRING
)
CLUSTERED BY (comm_id)
SORTED BY (comm_id ASC)
INTO 250 BUCKETS;
-- Make Hive enforce bucketing and sorting on INSERT into the bucketed table.
-- BUG FIX: the original wrote "hive.enforce.sortng" (missing the "i"); Hive
-- silently ignores unknown properties, so rows were NOT guaranteed to be
-- sorted inside each bucket and the SMB join precondition could be violated.
set hive.enforce.bucketing = true;
set hive.enforce.sorting = true;
-- One reducer per bucket so each reducer writes exactly one bucket file.
set mapred.reduce.tasks = 250;
-- Populate the bucketed table. CLUSTER BY comm_id is shorthand for
-- DISTRIBUTE BY comm_id SORT BY comm_id: rows with the same comm_id go to
-- the same reducer and arrive sorted, matching the table's bucket layout.
INSERT OVERWRITE TABLE fentongtable
SELECT
    comm_id,
    pro_id,
    city_id,
    hash,
    feature
FROM aBigTable
CLUSTER BY comm_id;
-- Enable automatic conversion of common joins to map-side joins where possible.
SET hive.auto.convert.join=true;
-- Let the optimizer auto-convert eligible joins to sort-merge-bucket (SMB) joins.
set hive.auto.convert.sortmerge.join=true;
-- Convert without generating a conditional (backup) task.
set hive.auto.convert.sortmerge.join.noconditionaltask=true;
-- Use bucket map join: join matching bucket files pairwise instead of full-table shuffle.
set hive.optimize.bucketmapjoin = true;
-- Additionally exploit the per-bucket sort order to do a streaming merge join.
set hive.optimize.bucketmapjoin.sortedmerge = true;
-- Policy for picking the "big" (streamed) side of the SMB join: choose by table size.
set hive.auto.convert.sortmerge.join.bigtable.selection.policy = org.apache.hadoop.hive.ql.optimizer.TableSizeBasedBigTableSelectorForAutoSMJ;
-- Self-join on the bucketed/sorted table; with the settings above Hive can
-- execute this as an SMB join, merging matching bucket files pairwise.
-- CONSISTENCY FIX: the original declared the alias as uppercase "A" but
-- referenced it as lowercase "a" — this only works because Hive case-folds
-- identifiers; use one consistent lowercase alias throughout.
CREATE TABLE youhuatest_SMBjoin AS
SELECT
    a.pro_id,
    b.pro_id  AS pro_id_s,
    a.city_id,
    a.comm_id,
    a.hash,
    a.feature,
    b.feature AS feature_s
FROM fentongtable AS a
INNER JOIN fentongtable AS b
    ON a.comm_id = b.comm_id
-- pro_id > pro_id keeps one ordered pair per combination; hash != hash drops exact duplicates.
WHERE a.pro_id > b.pro_id
  AND a.hash != b.hash;
任务跑了很久,发现卡在某一个 map 上——数据倾斜了?
该如何同时优化笛卡尔积与数据倾斜?请看下集。