Today I spent some time looking into compression in Hive as a way to save storage space.
Before getting into compression, it's worth going over Hive's common file formats: TextFile, SequenceFile, RCFile, and ORCFile.
[ ] textfile
Hive's default format; it takes up the most space, and once compressed the files can no longer be split.
[ ] sequencefile
A binary storage format.
[ ] rcfile
Data is split into row groups, and each group is stored column by column.
[ ] orcfile
Same row-group, columnar layout; ORC (Optimized RCFile) is the improved version of RCFile and supports LZO and Snappy compression (Snappy is today's main topic).
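As a quick aside, if you want to confirm which format an existing table actually uses, DESCRIBE FORMATTED shows the InputFormat/OutputFormat and the SerDe. A minimal sketch, using the table name that appears later in this post:
-- Check the storage format of an existing table
DESCRIBE FORMATTED t_user_active;
-- The "InputFormat" / "OutputFormat" rows in the output tell you
-- whether the table is stored as plain text or as ORC.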
Main steps (only the key points are covered here; I'm assuming you already know how to use DataX and have basic Hadoop knowledge).
Previously we stored the table as TEXTFILE:
CREATE TABLE `t_user_active`(
`id` string COMMENT 'primary key ID',
`user_id` string COMMENT 'user ID',
`user_name` string COMMENT 'user name',
`channel_id` string COMMENT 'channel ID',
`use_count` int COMMENT 'usage count',
`create_by` string COMMENT 'creator ID',
`create_date` string COMMENT 'creation time',
`update_by` string COMMENT 'updater ID',
`update_date` timestamp COMMENT 'update time'
)
PARTITIONED BY (
`create_data` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
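For comparison, TEXTFILE output can also be compressed at write time with session settings like the ones below, but the resulting gzip files are not splittable, which is exactly the limitation mentioned above. This is only a sketch of the relevant Hive/Hadoop settings, not something used in this setup:
-- Compress TEXTFILE output at write time (session-level settings)
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
-- A large .gz file can only be read by a single mapper, since gzip is not splittable.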
Switching the storage format looks like this: store the table as ORC and enable Snappy compression.
CREATE TABLE `t_user_active`(
`id` string COMMENT 'primary key ID',
`user_id` string COMMENT 'user ID',
`user_name` string COMMENT 'user name',
`channel_id` string COMMENT 'channel ID',
`use_count` int COMMENT 'usage count',
`create_by` string COMMENT 'creator ID',
`create_date` string COMMENT 'creation time',
`update_by` string COMMENT 'updater ID',
`update_date` timestamp COMMENT 'update time'
)
PARTITIONED BY ( `create_data` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
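If the old TEXTFILE table already holds data, a common way to move it into the new ORC table is an INSERT ... SELECT with dynamic partitioning. This is only a sketch: it assumes the old table has been renamed to t_user_active_text (a hypothetical name) and the ORC table above is the target.
-- Allow partition values to come from the SELECT
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

INSERT OVERWRITE TABLE t_user_active PARTITION (create_data)
SELECT id, user_id, user_name, channel_id, use_count,
       create_by, create_date, update_by, update_date,
       create_data   -- the partition column must come last
FROM t_user_active_text;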
The original DataX job looked like this:
{
"job": {
"setting": {
"speed": {
"channel": 2
},
"errorLimit": {
"record": 0,
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"column": [
"id",
"user_id",
"user_name",
"channel_id",
"use_count",
"create_by",
"create_date",
"update_by",
"update_date"
],
"connection": [
{
"jdbcUrl": [
"jdbc:mysql://{$ip}/sms?useUnicode=true&characterEncoding=UTF-8&?autoReconnect=true&zeroDateTimeBehavior=convertToNull"
],
"table": [
"t_user_active"
]
}
],
"where": "create_date >='${startDate}' AND create_date<='${endDate}' ",
"password": "${mysqlpwd}",
"username": "${mysqlun}"
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"column": [
{
"name": "id",
"type": "string"
},
{
"name": "user_id",
"type": "string"
},
{
"name": "user_name",
"type": "string"
},
{
"name": "channel_id",
"type": "string"
},
{
"name": "use_count",
"type": "int"
},
{
"name": "create_by",
"type": "string"
},
{
"name": "create_date",
"type": "string"
},
{
"name": "update_by",
"type": "string"
},
{
"name": "update_date",
"type": "string"
}
],
"writeMode": "append",
"defaultFS": "hdfs://${hdfsIp}:8020",
"fieldDelimiter": "\t",
"fileName": "datax",
"encoding": "UTF-8",
"fileType": "text",
"path": "/hadoop/smart.db/t_user_active/create_time=${cDT}"
}
}
}
]
}
}
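One detail worth remembering with hdfswriter: it only drops files into the target directory, so Hive does not automatically know about the new partition. Registering it explicitly makes the data queryable. A sketch, where '2019-01-01' and the LOCATION are placeholders standing in for ${cDT} and the actual path written by the job:
-- Register the directory written by DataX as a partition of the Hive table
ALTER TABLE t_user_active ADD IF NOT EXISTS PARTITION (create_data='2019-01-01')
LOCATION '/hadoop/smart.db/t_user_active/create_time=2019-01-01';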
The modified job looks like this. Note that for the compress setting to work, the SnappyCodec has to be installed on the Hadoop/HDFS side.
{
"job": {
"setting": {
"speed": {
"channel": 2
},
"errorLimit": {
"record": 0,
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"column": [
"id",
"user_id",
"user_name",
"channel_id",
"use_count",
"create_by",
"create_date",
"update_by",
"update_date"
],
"connection": [
{
"jdbcUrl": [
"jdbc:mysql://{$ip}/sms?useUnicode=true&characterEncoding=UTF-8&?autoReconnect=true&zeroDateTimeBehavior=convertToNull"
],
"table": [
"t_user_active"
]
}
],
"where": "create_date >='${startDate}' AND create_date<='${endDate}' ",
"password": "${mysqlpwd}",
"username": "${mysqlun}"
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"column": [
{
"name": "id",
"type": "string"
},
{
"name": "user_id",
"type": "string"
},
{
"name": "user_name",
"type": "string"
},
{
"name": "channel_id",
"type": "string"
},
{
"name": "use_count",
"type": "int"
},
{
"name": "create_by",
"type": "string"
},
{
"name": "create_date",
"type": "string"
},
{
"name": "update_by",
"type": "string"
},
{
"name": "update_date",
"type": "string"
}
],
"writeMode": "append",
"defaultFS": "hdfs://${hdfsIp}:8020",
"fieldDelimiter": "\t",
"fileName": "datax",
"encoding": "UTF-8",
"fileType": "orc",
"compress": "SNAPPY",
"path": "/hadoop/smart.db/t_user_active/create_time=${cDT}"
}
}
}
]
}
}
Now the results (the test data and numbers weren't saved, so I'll fill them in later):
1 Create two tables in Hive: one in the original TEXTFILE format and one in ORC.
2 Generate 500k rows in t_user_active and use DataX to sync them into each of the two tables.
3 Compare the sizes of the files the two tables produce on HDFS (see the sketch after this list).
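One way to read the sizes without leaving Hive is to compute statistics and then look at totalSize in the partition properties. A sketch, with a placeholder partition value:
-- Gather statistics for the loaded partition, then inspect totalSize
ANALYZE TABLE t_user_active PARTITION (create_data='2019-01-01') COMPUTE STATISTICS;
DESCRIBE FORMATTED t_user_active PARTITION (create_data='2019-01-01');
-- The "totalSize" entry shows the bytes on HDFS;
-- run the same two statements against the TEXTFILE table to compare.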
Observed result: the ORC table came out roughly 7-9x smaller than the TEXTFILE one.