hive
hive相关笔记记录
- 注释部分:
  - 描述任务和表;
  - 结果表定义字段类型和字段含义;
  - 任务上游依赖事件或者应用了某些源表;
  - 开发人、工号、开发时间等;
--@Name:sospdm.sosp_ml_jxgoods_shop_rec_d
--@Description:精选好物店铺偏好活动信息
--@Author:18051165
--@CreateDate:2019-08-21
--@Source:
--@PreJob:
- 任务名设置:
  - 生成表名和任务名关系对应;
  - 设置分区;
  - 时间参数
${hivevar:statis_date}
--Set the MapReduce job name so the run is identifiable in the scheduler/YARN UI
set mapred.job.name = sosp_ml_jxgoods_shop_rec_actinfo_d(${hivevar:statis_date});
参数名 | 参数值 |
---|---|
${v_date} | @@{yyyyMMdd-7d} |
${hivevar:statis_date} | @@{yyyyMMdd} |
- hive参数设置,调优
--Hive tuning parameters.
--If the map phase is slow: decrease split.maxsize so more mappers are spawned.
--If the reduce phase is slow: increase the reducer count (has no effect when
--the cluster is already out of resources).
set mapreduce.reduce.shuffle.memory.limit.percent=0.10;
set mapreduce.input.fileinputformat.split.maxsize=300000000;
set mapreduce.input.fileinputformat.split.minsize=150000000;
set mapreduce.input.fileinputformat.split.minsize.per.node=30000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=30000000;
--Use CombineHiveInputFormat to pack many small files into a single input
--split, reducing the number of map tasks.
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
--When joining a big table with a small one, auto-load the small table into
--memory (equivalent to writing an explicit map join).
--FIX: the original set this to false, contradicting its own note that the
--flag must be true for the map-join conversion to happen.
set hive.auto.convert.join=true;
--Merge small output files at the end of a map-only job.
set hive.merge.mapfiles=true;
--Default false; when true, merge small files at the end of a MapReduce job.
set hive.merge.mapredfiles=false;
--Upper bound on the number of reducers.
set hive.exec.reducers.max=300;
---------------------------------- Data processing ------------------------------------
use sospdm; --switch to the working database
- 导出数据
hive -e "set hive.cli.print.header=true;select * from sospdm.sosp_ml_pptv_yzt_noshopping_base_info;"
> tmp_wuyu/sosp_ml_pptv_yzt_noshopping_base_info.txt --export including the header row
hive -e "set hive.cli.print.header=false;select cust_num,shop_gds_cd from sospdm.sosp_ml_jxgoods_shop_rec_results_sit;"
> /home/sospdm/net_disk/data/sosp_ml_jxgoods_shop_rec_results_sit.txt --jupyter env
- 建分区表导入分区数据
--DDL (managed via the metadata tool): idempotently recreate the partitioned table
--NOTE(review): Hive folds unquoted identifiers to lower case, so these
--camelCase column names are stored as serialid/syscode/terminalcode — confirm
--downstream consumers expect that.
drop table if exists sospdm.tssa_imp_mcps_send_detail_topic_d;
create table sospdm.tssa_imp_mcps_send_detail_topic_d (
serialId string comment '流水号,时间戳',
sysCode string comment '系统编码,如FHT,JBP,DSM',
terminalCode string comment '终端编码,如SNYG,SNXDGK'
) comment '云钻领任务关联规则推荐结果表'
partitioned by (statis_date string comment '数据日期')
stored as rcfile;
--Load data into one partition of the target table
--NOTE(review): the SELECT feeding this INSERT is not recorded in these notes
insert overwrite table sospdm.sosp_ml_yz_gds_mysql_to_hive partition (statis_date = '${hivevar:statis_date}')
--Drop old partitions to enforce a retention window (365 days here)
${statis_date_365} @@{yyyyMMdd-365d}
alter table sospdm.supermarket_transfer_pred_test drop if exists partition (statis_date = ${statis_date_365});
- 导入本地数据
--Create the target table (tab-delimited text format)
create table sospdm.sosp_ml_yunzuan_rec_test(
`cust_num` string comment '会员编码',
`gds_info` string comment '商品编码',
`rating` double comment '商品偏好打分'
) row format delimited fields terminated by '\t';
--Load a local file into the table, replacing any existing data (overwrite)
load data local inpath '/data/home/sospdm/tmp_wuyu/sosp_ml_yunzuan_rec_mem_reverse.txt'
overwrite into table sospdm.sosp_ml_yunzuan_rec_test;
- 批量显示建表时间
hdfs dfs -ls hdfs://suninghadoop2/user/sospdm/hive/warehouse/sospdm.db/sosp_ml_yunzuan_rec*
- MD5加密
--Register a custom MD5 UDF from a local jar for this session
add jar /home/sospdm/yf/jar/GetMd5.jar;
create temporary function getmd5 as 'com.md5.GetMd5';
hivesql函数
常用函数
--Date arithmetic: equivalent ways to shift/reformat a yyyyMMdd date
select from_unixtime(unix_timestamp('20180808','yyyyMMdd') - 86400 * 7,'yyyyMMdd'); --20180801
select regexp_replace(date_sub(from_unixtime(to_unix_timestamp('20180808', 'yyyyMMdd'), 'yyyy-MM-dd'), 7), '-', ''); --20180801
select regexp_replace(from_unixtime(to_unix_timestamp('2018-08-08 23:41:33', 'yyyy-MM-dd HH:mm:ss'), 'yyyy-MM-dd'), '-', ''); --20180808 (original note said 20180801, but no 7-day shift is applied here)
select regexp_replace(substr('2018-08-08 23:41:33', 1,10), '-', ''); --20180808
--Percentage formatting: round to 2 decimals and append '%'
select concat(cast(cast(coverage_all/transforms * 100 as decimal(8,2)) as string),'%') as cov_all_rate, --overall predicted coverage rate (fragment: trailing comma)
--Fuzzy-match table names
show tables like '*sosp_ml_user_rfm*';
--Fuzzy string matching (fragments)
where label like 'abc%'
where gds_nm like concat('%','段奶粉','%') --matching a Chinese keyword
--Ranking rows within a group
select row_number() over (partition by cust_num order by item_id desc) as rn
--Median / quantiles (0.25, 0.5, 0.75) via approximate percentile
select percentile_approx(cast(col as double),array(0.25,0.5,0.75),1000)
--String reverse function: reverse, syntax:
reverse(string '00001') --10000
--Substring (1-based start position, length)
select substr('abcde',3,2) --cd
--Strip leading zeros from a code such as 00000000000001
select regexp_extract('0000000102', '^(0{0,9})?(\\w+)$', 2); --strip leading zeros
select lpad('asabc',18,'0'); --left-pad with '0' to length 18
--Count occurrences of a short string inside a longer one
select size(split('abcabdav','a'))-1 from dual;
--Split a string on a separator into an array
select split('abtcdtef','t') from lxw_dual; --["ab","cd","ef"]
--Rows to a delimited column, de-duplicated (collect_set)
select cust_num,concat_ws(',',collect_set(gds_cd)) as gds_cd from sosp_ml_yunzuan_rec_test_01 group by cust_num;
--Same, keeping duplicates (collect_list)
select cust_num,concat_ws(',',collect_list(l4_grp_cd)) as pre_groups
from(
select '0001' as cust_num,'R0001' as l4_grp_cd from dual
union all
select '0001' as cust_num,'R0001' as l4_grp_cd from dual) t group by cust_num;
--Rows to column with sorting: 138 -> 12,123,1234
--(sort is lexicographic because the values are strings)
select t.a, concat_ws(',',sort_array(collect_set(t.b)))
from(
select 138 as a, '1234' as b
union all
select 138 as a, '12' as b
union all
select 138 as a, '123' as b
)t
group by t.a
;
正则化
--Regex: extract Chinese characters (CJK unified ideographs \u4E00-\u9FFF)
select regexp_extract('收件人为小明的15622150839','收件人为([\\u4E00-\\u9FFF]+)的',1 )
from sospdm.dual limit 1;
regexp_extract(gds_title,'([\\u4E00-\\u9FFF]+)') not in ('','-',' ')
--num_rank: numeric rank sort; beyond 10 items ranks >= 10 sort first
--(lexicographic); afterwards strip the "<digits>#!#" prefix with regexp_replace
select regexp_replace(concat_ws('#,#',sort_array(collect_list(concat_ws('#!#',cast(num_rank as string),
shop_gds_cd)))),'\\d+\#!#','') as shop_gds_cd