前言:
本章主要讲解如何使用Hive的一对多函数"UDTF"解析Json字符串,并根据业务需求返回对应的值。
正文:
开发工具:IDEA+JDK1.8+MAVEN
1.创建maven项目并导入UDTF函数的依赖jar包
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.1.1</version>
</dependency>
2.编写UDTF(User-Defined Table-Generating Functions)需要继承GenericUDTF类,类中部分代码如下:
/**
* A Generic User-defined Table Generating Function (UDTF)
*
* Generates a variable number of output rows for a single input row. Useful for
* explode(array)...
*
* NOTE: only part of the class is reproduced here; members that are referenced
* but not declared below are outside this excerpt.
*/
public abstract class GenericUDTF {
/**
* Copies the object inspector of every field of {@code argOIs} into an array, in
* field order, and delegates to the {@code initialize(ObjectInspector[])} overload
* (that overload is not shown in this excerpt).
*
* @param argOIs
* struct inspector whose fields are read as the UDTF input inspectors
* @return the value returned by the array-based overload
* @throws UDFArgumentException
* declared by this method; presumably raised by the delegate for
* unsupported arguments (not visible in this excerpt)
*/
public StructObjectInspector initialize(StructObjectInspector argOIs)
throws UDFArgumentException {
List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
ObjectInspector[] udtfInputOIs = new ObjectInspector[inputFields.size()];
// One inspector per input field, preserving the order of the struct's fields.
for (int i = 0; i < inputFields.size(); i++) {
udtfInputOIs[i] = inputFields.get(i).getFieldObjectInspector();
}
return initialize(udtfInputOIs);
}
/**
* Give a set of arguments for the UDTF to process.
*
* @param args
* object array of arguments
*/
public abstract void process(Object[] args) throws HiveException;
/**
* Called to notify the UDTF that there are no more rows to process.
* Clean up code or additional forward() calls can be made here.
*/
public abstract void close() throws HiveException;
}
继承GenericUDTF需要实现以上方法,其中initialize方法和UDF中类似,主要是判断输入类型并确定返回的字段类型。process方法对UDTF函数输入的每一行进行操作,通过调用forward方法返回一行或多行数据。close方法在process调用结束后调用,用于进行其它一些额外操作,只执行一次。
import com.google.common.collect.Lists;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.util.Iterator;
import java.util.List;
/**
* @author yangxuan
* @date 2019/08/19
*/
public class JsonUDTFextends GenericUDTF {
private String[]obj =new String[3];
/**
* process()方法
* 主要业务逻辑方法:
* @param objects
* objects接收的参数
*/
public void process(Object[] objects)throws HiveException {
//判断objects[0]是否为null,是直接return;
if(objects[0] ==null){
return;
}
//获取参数并转为String并用JSONObject解析字符串
String json = objects[0].toString();
try {
JSONObject jsonObject =new JSONObject(json);
Iterator keys = jsonObject.keys();
while (keys.hasNext()){
JSONArray o = (JSONArray)jsonObject.get(String.valueOf(keys.next()));
for(int i =0; i
JSONObject j2 = (JSONObject)o.get(i);
Iterator keys1 = j2.keys();
while (keys1.hasNext()){
JSONArray o2 = (JSONArray)j2.get(String.valueOf(keys1.next()));
for(int j =0;j
JSONObject j3 = (JSONObject)o2.get(j);
obj[0]= toStr(j3.get("equipParamNameId"));
obj[1]= toStr(j3.get("deviceParamNameId"));
obj[2]= toStr(j3.get("finalValue"));
//通过调用forward方法返回一行或多行数据
forward(obj);
}
}
}
}
}catch (JSONException e) {
e.printStackTrace();
}
}
private String toStr(Object o){
if(o==null){
return "";
}
return String.valueOf(o);
}
//close方法在process调用结束后调用,用于进行其它一些额外操作,只执行一次。
public void close()throws HiveException {
}
/**
*返回类型String,String,String
*init方法,主要定义输出的字段名及字段类型
*/
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs)throws UDFArgumentException {
//设置列名
List colName = Lists.newLinkedList();
colName.add("equipParamNameId");
colName.add("deviceValue");
colName.add("finalValue");
//设置对应每列的类型
List resType = Lists.newLinkedList();
resType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
resType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
resType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
//返回分别为列名 和列类型
return ObjectInspectorFactory.getStandardStructObjectInspector(colName,resType);
}
}