1、Hive支持
创建表时指定parquet格式即可:
create table tmp.orc_test(id bigint, name string, age int) stored as parquet TBLPROPERTIES('parquet.compression'='SNAPPY')
压缩格式有"SNAPPY"和 "GZIP"两种,需要哪种格式指定即可。
2、SPARK支持
Spark读:
df = spark.read.parquet("/tmp/test/orc_data") # 读出来的数据是一个dataframe
Spark写:
df.write.format("parquet").save("/tmp/test/orc_data2")
3、Hadoop Streaming支持
# Hadoop Streaming job that reads /tmp/test/parquet_test and writes to
# /tmp/test/streaming_parquet_test through the net.iponweb ParquetAsText
# input/output formats, with parquet.compression set to gzip.
# -libjars ships the parquet 1.8.1 jars plus the two local jars to the tasks.
# iow.streaming.output.schema declares the Parquet message type of the output.
# Mapper and reducer are both /bin/cat, i.e. records pass through unchanged.
hadoop jar /usr/local/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar \
-libjars parquet_test.jar,hadoop2-iow-lib.jar,/usr/local/spark-2.1.0-bin-hadoop2.7/jars/parquet-column-1.8.1.jar,/usr/local/spark-2.1.0-bin-hadoop2.7/jars/parquet-common-1.8.1.jar,/usr/local/spark-2.1.0-bin-hadoop2.7/jars/parquet-encoding-1.8.1.jar,/usr/local/spark-2.1.0-bin-hadoop2.7/jars/parquet-hadoop-1.8.1.jar,/usr/local/spark-2.1.0-bin-hadoop2.7/jars/parquet-format-2.3.0-incubating.jar \
-D mapred.job.name="test_streaming" \
-D iow.streaming.output.schema="message example {required binary age;required binary name;required binary desc;}" \
-D mapreduce.output.fileoutputformat.compress=true \
-D parquet.compression=gzip \
-D parquet.read.support.class=net.iponweb.hadoop.streaming.parquet.GroupReadSupport \
-D parquet.write.support.class=net.iponweb.hadoop.streaming.parquet.GroupWriteSupport \
-inputformat net.iponweb.hadoop.streaming.parquet.ParquetAsTextInputFormat \
-outputformat net.iponweb.hadoop.streaming.parquet.ParquetAsTextOutputFormat \
-input "/tmp/test/parquet_test" \
-output "/tmp/test/streaming_parquet_test" \
-mapper /bin/cat -reducer /bin/cat
外部包:https://github.com/whale2/iow-hadoop-streaming
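原本想直接使用 parquet 1.8 自带的 InputFormat/OutputFormat,后来发现 parquet 1.8 的读写类是基于 mapreduce 包(新 API)实现的,而 hadoop streaming 只能使用 mapred 包(旧 API)下的 InputFormat/OutputFormat,因此改用上面的外部包。直接使用 parquet 1.8 的类会报如下错误:
class org.apache.parquet.hadoop.ParquetInputFormat not org.apache.hadoop.mapred.InputFormat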
4、MapReduce支持
pom.xml
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.parquet/parquet-common -->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-common</artifactId>
<version>1.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-encoding</artifactId>
<version>1.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-column</artifactId>
<version>1.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.7.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.0</version>
</dependency>
</dependencies>
package is.parquet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import java.io.IOException;
import java.util.StringTokenizer;
/**
 * MapReduce job that reads Parquet records as {@link Group} values and writes
 * them back out as Parquet using the example Group read/write support classes.
 *
 * <p>The mapper keys each record by its first field; the reducer emits one
 * output record per key, built from the first value seen for that key.
 */
public class ParquetRWMR extends Configured implements Tool {

    /** Input path used when no input path is given on the command line. */
    private static final String DEFAULT_INPUT = "/tmp/test/parquet_test";

    /** Output path used when no output path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "/tmp/test/parquet_test_mr";

    /** Parquet message type of the records written by the reducer. */
    private static final String WRITE_SCHEMA = "message example {\n" +
            "required binary id;\n" +
            "required binary name;\n" +
            "required binary des;\n" +
            "}";

    /**
     * Configures and runs the job, blocking until it completes.
     *
     * @param strings optional overrides: {@code strings[0]} is the input path and
     *                {@code strings[1]} the output path; missing entries fall back
     *                to the built-in defaults
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        // The schema must be in the configuration before the Job copies it;
        // the reducer reads it back through GroupWriteSupport.getSchema.
        conf.set("parquet.example.schema", WRITE_SCHEMA);

        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetRWMR.class);
        job.setJobName("parquet");

        String in = (strings != null && strings.length >= 1) ? strings[0] : DEFAULT_INPUT;
        String out = (strings != null && strings.length >= 2) ? strings[1] : DEFAULT_OUTPUT;

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputValueClass(Group.class);

        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

        job.setInputFormatClass(ParquetInputFormat.class);
        job.setOutputFormatClass(ParquetOutputFormat.class);

        ParquetInputFormat.setInputPaths(job, new Path(in));
        ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
        ParquetOutputFormat.setOutputPath(job, new Path(out));
        ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Emits, for every input record, the first field (as text) as the key and
     * the second and third fields joined by a tab as the value.
     */
    public static class WordCountMap extends
            Mapper<Void, Group, Text, Text> {
        private final Text word = new Text();
        private final Text payload = new Text();

        @Override
        public void map(Void key, Group value, Context context)
                throws IOException, InterruptedException {
            // The first argument of the getters is the field name. A lookup by
            // name yields a list of values, so the second argument (0) selects
            // the first entry of that list.
            long first = value.getLong("0", 0);
            String sec = value.getString("1", 0);
            String third = value.getString("2", 0);
            word.set(Long.toString(first));
            payload.set(sec + "\t" + third);
            context.write(word, payload);
        }
    }

    /**
     * Writes one Parquet record per key, built from the first value received
     * for that key; any further values for the same key are ignored.
     */
    public static class WordCountReduce extends
            Reducer<Text, Text, Void, Group> {
        private SimpleGroupFactory factory;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(context.getConfiguration()));
        }

        @Override
        public void reduce(Text key, Iterable<Text> values,
                           Context context) throws IOException, InterruptedException {
            for (Text val : values) {
                // Limit 2 splits at the first tab only and keeps a trailing empty
                // field. A plain split("\t") drops trailing empty strings, which
                // made an empty "des" fail with ArrayIndexOutOfBoundsException.
                String[] fields = val.toString().split("\t", 2);
                if (fields.length < 2) {
                    throw new IOException("Expected \"name<TAB>des\" for key " + key
                            + " but got: " + val);
                }
                Group group = factory.newGroup()
                        .append("id", key.toString())
                        .append("name", fields[0])
                        .append("des", fields[1]);
                context.write(null, group);
                // Only the first value of each key is written.
                return;
            }
        }
    }

    /**
     * Command-line entry point. Exits with the status returned by
     * {@link #run(String[])} so that a failed job is visible to the caller.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int retnum = ToolRunner.run(conf, new ParquetRWMR(), args);
        System.exit(retnum);
    }
}