参考资料
用python实现Hive中的UDF函数
1.创建表 ——hive窗口或者命令行
create external table zhengyuan.person(
name string,
idcard string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED as TEXTFILE
location '/user/zhengyuan/t0418';
2.加载数据
- 文件路径为服务器上的路径
- 只能在hive命令行运行,客户端不能使用
load data local inpath '/data/pyspark/program/auto_report/zhengyuan/test/person.txt' into table zhengyuan.person;
3.加载 udf 函数
- 文件路径为服务器上的路径
- 只能在hive命令行运行,客户端不能使用
add file /data/pyspark/program/auto_report/zhengyuan/test/person.py
4.使用hive 实现相应功能
select idcard,
case when length(idcard) = 18 then
case when substring(idcard,-2,1) % 2 = 1 then '男'
when substring(idcard,-2,1) % 2 = 0 then '女'
else 'unknown' end
when length(idcard) = 15 then
case when substring(idcard,-1,1) % 2 = 1 then '男'
when substring(idcard,-1,1) % 2 = 0 then '女'
else 'unknown' end
else '不合法' end
from zhengyuan.person;
5. UDF编写
# -*- coding: utf-8 -*-
import sys
for line in sys.stdin:
detail = line.strip().split("\t")
if len(detail) != 2:
continue
else:
name = detail[0]
idcard = detail[1]
if len(idcard) == 15:
if int(idcard[-1]) % 2 == 0:
print("\t".join([name,idcard,"女"]))
else:
print("\t".join([name,idcard,"男"]))
elif len(idcard) == 18:
if int(idcard[-2]) % 2 == 0:
print("\t".join([name,idcard,"女"]))
else:
print("\t".join([name,idcard,"男"]))
else:
print("\t".join([name,idcard,"身份信息不合法!"]))
6.测试指令
cat person.txt|python person.py
7.使用udf——目前也只能用hive命令行执行
SELECT TRANSFORM (<columns>)
USING 'python <python_script>'
AS (<columns>)
FROM <table>;
select transform(name,idcard) USING 'python person.py' AS (name,idcard,gender) from person;