本文为工作中的一些小积累。
一,udf含义&作用
UDF指用户自定义函数(User Defined Function),即在Hive中进行定制化开发的函数。在实际应用中,我们需要根据实际的业务需求自定义开发这种函数,自定义函数的例子参考文档: Hive Plugins
UDF函数一共分为三种类型:
- UDF-一进一出,给定一个参数,输出一个处理后的数据
- UDAF-多进一出,属于聚合函数,类似于count、sum等函数
- UDTF-一进多出,给定一个参数,输出多行数据作为结果(类似explode函数)
#在sql中可以show functions;展示目前系统已有的udf
>show functions;
+------------------------------+--+
| tab_name |
+------------------------------+--+
| ! |
| != |
| % |
| & |
| * |
| + |
| - |
| / |
| < |
| <= |
| <=> |
| <> |
| = |
| == |
| > |
| >= |
二,udf制作流程
1,pom中添加对应的依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.xxx</groupId>
    <artifactId>xxx</artifactId>
    <version>1.0.0-RELEASE</version>
    <properties>
        <java.version>1.8</java.version>
        <lombok.version>1.16.18</lombok.version>
        <fastjson.version>1.2.4</fastjson.version>
        <commons.version>3.5</commons.version>
        <slf4j.version>1.7.7</slf4j.version>
        <log4j.version>1.2.17</log4j.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.7</scala.version>
        <!-- Single source of truth for dependency versions below.
             NOTE: hadoop.version was declared 2.7.0 but the dependencies hard-coded
             2.6.0; the property now matches the version actually used. -->
        <hadoop.version>2.6.0</hadoop.version>
        <hive.version>2.3.1</hive.version>
        <spark.version>2.2.0</spark.version>
    </properties>
    <dependencies>
        <!-- NOTE(review): for a UDF jar these hadoop/hive artifacts are usually
             scope "provided" (the cluster supplies them at runtime), which also keeps
             the assembly jar small — confirm before changing packaging behavior. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>1.5.2</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <configuration>
                    <compilerVersion>${java.version}</compilerVersion>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>UTF-8</encoding>
                    <!-- prevents endPosTable exception for maven compile -->
                    <useIncrementalCompilation>false</useIncrementalCompilation>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <!-- Pin the plugin version so builds are reproducible. -->
                <version>3.3.0</version>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <!-- "single" is the supported per-project goal; the old
                                 "assembly" goal is a deprecated aggregator. -->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.dazhen.udf.encryption.EncrHive</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <!-- Produces xxx-1.0.0-RELEASE-jar-with-dependencies.jar -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <outputDirectory>${session.executionRootDirectory}/target/</outputDirectory>
                    <!--<destName>${project.name}</destName>-->
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
2,实现hive中的udf
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import utils.AESUtil;
/**
 * Hive UDF that AES-encrypts a string column with a caller-supplied password.
 * <p>
 * Usage in SQL (after registration): {@code select AESEncr(password, content)}.
 *
 * @author water
 */
@Description(name = "AESEncr", value = "AESEncr context")
public class AESEncr extends UDF {
    /**
     * Encrypts {@code content} with {@code password} using AES (delegates to
     * {@code AESUtil.aesEncrypt}).
     *
     * @param password the AES password/key used for encryption
     * @param content  the plaintext to encrypt; blank or null input is passed through as null
     * @return the encrypted value, or {@code null} when {@code content} is blank
     * @throws Exception propagated unchanged from the underlying AES utility
     */
    public String evaluate(String password, String content) throws Exception {
        // Hive hands NULL column values to the UDF as null; map blank/NULL input
        // to NULL output instead of attempting to encrypt it.
        if (StringUtils.isBlank(content)) {
            return null;
        }
        // Fixed: the original wrapped this call in a try/catch that only rethrew
        // the exception — a no-op. Let the exception propagate directly.
        return AESUtil.aesEncrypt(password, content);
    }
}
3,打包
mvn clean package -DskipTests  #assembly插件已绑定到package阶段,会自动生成带依赖的jar
#打包后在target文件夹会出现 xxx-1.0.0-RELEASE-jar-with-dependencies.jar
4,部署jar
#1将上面打包的jar放到hdfs上某路径下
# hdfs dfs -put -f(强制更新) xxx-1.0.0-RELEASE-jar-with-dependencies.jar(jar位置) /user/xxx/(hdfs路径)
> hdfs dfs -put -f xxx-1.0.0-RELEASE-jar-with-dependencies.jar /user/xxx/
#2 进入hive或者sparkSQL,将函数注册
#注册有分为永久函数,和临时函数
##永久方法,需要指定一个数据库名
##CREATE FUNCTION [db_name.]function_name AS class_name
## [USING JAR|FILE|ARCHIVE 'file_uri' [, JAR|FILE|ARCHIVE 'file_uri'] ];
create function udf.aesEncrypt as 'com.xxx.xxx.xxx.AESEncr' using jar 'hdfs:///user/xxx/xxx-1.0.0-RELEASE-jar-with-dependencies.jar';
##临时方法,作用域为当前session
create temporary function encr as 'com.dazhen.udf.encryption.EncrHive';  -- 类名需使用全限定名(含包名)
三,效果展示
> select udf.aesEncrypt('zmbd','4646');
+-----------------------------------+--+
| _c0 |
+-----------------------------------+--+
| B91F918A6FA7C191BD8C465A888EE828 |
+-----------------------------------+--+