hive
hive相关笔记记录
- 注释部分:
  - 描述任务和表;
  - 结果表定义字段类型和字段含义;
  - 任务上游依赖事件或者应用了某些源表;
  - 开发人、工号、开发时间等;
--@Name:sospdm.sosp_ml_jxgoods_shop_rec_d
--@Description:精选好物店铺偏好活动信息
--@Author:18051165
--@CreateDate:2019-08-21
--@Source:
--@PreJob:
- 任务名设置:
  - 生成表名和任务名关系对应;
  - 设置分区;
  - 时间参数
${hivevar:statis_date}
--Set the MapReduce job name so the run is identifiable in the scheduler/YARN UI
set mapred.job.name = sosp_ml_jxgoods_shop_rec_actinfo_d(${hivevar:statis_date});
参数名 | 参数值 |
---|---|
${v_date} | @@{yyyyMMdd-7d} |
${hivevar:statis_date} | @@{yyyyMMdd} |
- hive参数设置,调优
--Hive tuning parameters.
--If the map phase is slow: decrease split.maxsize so more mappers are spawned.
--If the reduce phase is slow: increase the reducer count (has no effect when
--the cluster is already out of resources).
set mapreduce.reduce.shuffle.memory.limit.percent=0.10;
set mapreduce.input.fileinputformat.split.maxsize=300000000;
set mapreduce.input.fileinputformat.split.minsize=150000000;
set mapreduce.input.fileinputformat.split.minsize.per.node=30000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=30000000;
--Use CombineHiveInputFormat to pack many small files into a single input
--split, reducing the number of map tasks.
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
--When joining a big table with a small one, auto-load the small table into
--memory (equivalent to writing an explicit map join).
--FIX: the original set this to false, contradicting its own note that the
--flag must be true for the map-join conversion to happen.
set hive.auto.convert.join=true;
--Merge small output files at the end of a map-only job.
set hive.merge.mapfiles=true;
--Default false; when true, merge small files at the end of a MapReduce job.
set hive.merge.mapredfiles=false;
--Upper bound on the number of reducers.
set hive.exec.reducers.max=300;
---------------------------------- Data processing ------------------------------------
use sospdm; --switch to the working database
- 导出数据
hive -e "set hive.cli.print.header=true;select * from sospdm.sosp_ml_pptv_yzt_noshopping_base_info;"
> tmp_wuyu/sosp_ml_pptv_yzt_noshopping_base_info.txt --export including the header row
hive -e "set hive.cli.print.header=false;select cust_num,shop_gds_cd from sospdm.sosp_ml_jxgoods_shop_rec_results_sit;"
> /home/sospdm/net_disk/data/sosp_ml_jxgoods_shop_rec_results_sit.txt --jupyter env
- 建分区表导入分区数据
--DDL (managed via the metadata tool): idempotently recreate the partitioned table
--NOTE(review): Hive folds unquoted identifiers to lower case, so these
--camelCase column names are stored as serialid/syscode/terminalcode — confirm
--downstream consumers expect that.
drop table if exists sospdm.tssa_imp_mcps_send_detail_topic_d;
create table sospdm.tssa_imp_mcps_send_detail_topic_d (
serialId string comment '流水号,时间戳',
sysCode string comment '系统编码,如FHT,JBP,DSM',
terminalCode string comment '终端编码,如SNYG,SNXDGK'
) comment '云钻领任务关联规则推荐结果表'
partitioned by (statis_date string comment '数据日期')
stored as rcfile;
--Load data into one partition of the target table
--NOTE(review): the SELECT feeding this INSERT is not recorded in these notes
insert overwrite table sospdm.sosp_ml_yz_gds_mysql_to_hive partition (statis_date = '${hivevar:statis_date}')
--Drop old partitions to enforce a retention window (365 days here)
${statis_date_365} @@{yyyyMMdd-365d}
alter table sospdm.supermarket_transfer_pred_test drop if exists partition (statis_date = ${statis_date_365});
- 导入本地数据
--Create the target table (tab-delimited text format)
create table sospdm.sosp_ml_yunzuan_rec_test(
`cust_num` string comment '会员编码',
`gds_info` string comment '商品编码',
`rating` double comment '商品偏好打分'
) row format delimited fields terminated by '\t';
--Load a local file into the table, replacing any existing data (overwrite)
load data local inpath '/data/home/sospdm/tmp_wuyu/sosp_ml_yunzuan_rec_mem_reverse.txt'
overwrite into table sospdm.sosp_ml_yunzuan_rec_test;
- 批量显示建表时间
hdfs dfs -ls hdfs://suninghadoop2/user/sospdm/hive/warehouse/sospdm.db/sosp_ml_yunzuan_rec*
- MD5加密
--Register a custom MD5 UDF from a local jar for this session
add jar /home/sospdm/yf/jar/GetMd5.jar;
create temporary function getmd5 as 'com.md5.GetMd5';
hivesql函数
常用函数
--Date arithmetic: equivalent ways to shift/reformat a yyyyMMdd date
select from_unixtime(unix_timestamp('20180808','yyyyMMdd') - 86400 * 7,'yyyyMMdd'); --20180801
select regexp_replace(date_sub(from_unixtime(to_unix_timestamp('20180808', 'yyyyMMdd'), 'yyyy-MM-dd'), 7), '-', ''); --20180801
select regexp_replace(from_unixtime(to_unix_timestamp('2018-08-08 23:41:33', 'yyyy-MM-dd HH:mm:ss'), 'yyyy-MM-dd'), '-', ''); --20180808 (original note said 20180801, but no 7-day shift is applied here)
select regexp_replace(substr('2018-08-08 23:41:33', 1,10), '-', ''); --20180808
--Percentage formatting: round to 2 decimals and append '%'
select concat(cast(cast(coverage_all/transforms * 100 as decimal(8,2)) as string),'%') as cov_all_rate, --overall predicted coverage rate (fragment: trailing comma)
--Fuzzy-match table names
show tables like '*sosp_ml_user_rfm*';
--Fuzzy string matching (fragments)
where label like 'abc%'
where gds_nm like concat('%','段奶粉','%') --matching a Chinese keyword
--Ranking rows within a group
select row_number() over (partition by cust_num order by item_id desc) as rn
--Median / quantiles (0.25, 0.5, 0.75) via approximate percentile
select percentile_approx(cast(col as double),array(0.25,0.5,0.75),1000)
--String reverse function: reverse, syntax:
reverse(string '00001') --10000
--Substring (1-based start position, length)
select substr('abcde',3,2) --cd
--Strip leading zeros from a code such as 00000000000001
select regexp_extract('0000000102', '^(0{0,9})?(\\w+)$', 2); --strip leading zeros
select lpad('asabc',18,'0'); --left-pad with '0' to length 18
--Count occurrences of a short string inside a longer one
select size(split('abcabdav','a'))-1 from dual;
--Split a string on a separator into an array
select split('abtcdtef','t') from lxw_dual; --["ab","cd","ef"]
--Rows to a delimited column, de-duplicated (collect_set)
select cust_num,concat_ws(',',collect_set(gds_cd)) as gds_cd from sosp_ml_yunzuan_rec_test_01 group by cust_num;
--Same, keeping duplicates (collect_list)
select cust_num,concat_ws(',',collect_list(l4_grp_cd)) as pre_groups
from(
select '0001' as cust_num,'R0001' as l4_grp_cd from dual
union all
select '0001' as cust_num,'R0001' as l4_grp_cd from dual) t group by cust_num;
--Rows to column with sorting: 138 -> 12,123,1234
--(sort is lexicographic because the values are strings)
select t.a, concat_ws(',',sort_array(collect_set(t.b)))
from(
select 138 as a, '1234' as b
union all
select 138 as a, '12' as b
union all
select 138 as a, '123' as b
)t
group by t.a
;
正则化
--Regex: extract Chinese characters (CJK unified ideographs \u4E00-\u9FFF)
select regexp_extract('收件人为小明的15622150839','收件人为([\\u4E00-\\u9FFF]+)的',1 )
from sospdm.dual limit 1;
regexp_extract(gds_title,'([\\u4E00-\\u9FFF]+)') not in ('','-',' ')
--num_rank: numeric rank sort; beyond 10 items ranks >= 10 sort first
--(lexicographic); afterwards strip the "<digits>#!#" prefix with regexp_replace
select regexp_replace(concat_ws('#,#',sort_array(collect_list(concat_ws('#!#',cast(num_rank as string),
shop_gds_cd)))),'\\d+\#!#','') as shop_gds_cd