参考:https://www.cnblogs.com/Leo_wl/p/3145108.html
1. pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.learn</groupId>
<artifactId>HdfsCompress</artifactId>
<version>1.0</version>
<dependencies>
<!-- Hadoop jars are marked "provided": the `hadoop jar` launcher puts the
     cluster's own Hadoop classpath on at runtime, so they must not be
     bundled into the shaded jar. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.4.0</version>
<scope>provided</scope>
<exclusions>
<!-- Exclude httpclient to avoid a version conflict with the cluster classpath. -->
<exclusion>
<artifactId>httpclient</artifactId>
<groupId>org.apache.httpcomponents</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.4.0</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Build an executable "fat" jar at `package` time with the Driver main class. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.0.0</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<!-- Keep logging bindings out of the shaded jar; the cluster supplies them. -->
<artifactSet>
<excludes>
<exclude>org.slf4j:*</exclude>
<exclude>log4j:*</exclude>
</excludes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<!-- Strip jar signature files; stale signatures in a shaded jar cause
     "Invalid signature file digest" SecurityExceptions at runtime. -->
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<!-- Write Main-Class into MANIFEST.MF so `hadoop jar` can find the entry point. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.learn.Driver</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
2. 入口程序
/**
 * Command-line entry point: dispatches to {@link Compress} or {@link UnCompress}.
 *
 * <p>Expected arguments: {@code <compress|uncompress> <inputDir> <outputDir>}.
 */
public class Driver {
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            // Fail fast with a usage hint instead of an obscure index error later.
            throw new IllegalArgumentException(
                    "expected 3 parameters: <compress|uncompress> <inputDir> <outputDir>");
        }
        String command = args[0];
        String inputDir = args[1];
        String outputDir = args[2];
        if ("compress".equalsIgnoreCase(command)) {
            Compress compress = new Compress();
            compress.exe(inputDir, outputDir);
        } else if ("uncompress".equalsIgnoreCase(command)) {
            UnCompress unCompress = new UnCompress();
            unCompress.exe(inputDir, outputDir);
        } else {
            // Previously an unknown command was silently ignored; report it instead.
            throw new IllegalArgumentException(
                    "unknown command: " + command + " (expected compress or uncompress)");
        }
    }
}
3. 压缩
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.File;
import java.io.IOException;
/**
 * Gzip-compresses every {@code .txt} file found directly under an input
 * directory, writing {@code <name>.txt.gz} files into an output directory.
 * Subdirectories and non-.txt files are skipped (no recursion).
 */
public class Compress {
    /**
     * Compresses the eligible files of {@code inputDir} into {@code outputDir}.
     *
     * @param inputDir  HDFS directory to scan (non-recursive)
     * @param outputDir HDFS directory receiving the {@code .gz} outputs
     * @throws ClassNotFoundException if the Gzip codec class is absent
     * @throws IOException            on any HDFS read/write failure
     */
    public void exe(String inputDir, String outputDir) throws ClassNotFoundException, IOException {
        Configuration conf = new Configuration();
        Class<?> codecClass = Class.forName("org.apache.hadoop.io.compress.GzipCodec");
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        FileSystem fs = FileSystem.get(conf);
        for (FileStatus fileStatus : fs.listStatus(new Path(inputDir))) {
            // Skip directories: only plain files are compressed.
            if (fileStatus.isDirectory()) {
                continue;
            }
            Path srcPath = fileStatus.getPath();
            // Path.getName() yields the final component; no manual "/" splitting needed.
            String fileName = srcPath.getName();
            if (!fileName.endsWith(".txt")) {
                continue;
            }
            FSDataInputStream inputStream = null;
            CompressionOutputStream compressionOutputStream = null;
            try {
                // Source file to be compressed.
                inputStream = fs.open(srcPath);
                // Wrap the HDFS output in the codec's compressing stream.
                compressionOutputStream =
                        codec.createOutputStream(fs.create(new Path(outputDir + "/" + fileName + ".gz")));
                IOUtils.copyBytes(inputStream, compressionOutputStream, conf);
            } finally {
                // Close in finally so a copy failure cannot leak the streams
                // (mirrors the cleanup style already used by UnCompress).
                IOUtils.closeStream(compressionOutputStream);
                IOUtils.closeStream(inputStream);
            }
        }
    }
}
4. 解压
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
/**
 * Decompresses every {@code .txt.gz} file found directly under an input
 * directory, writing the restored {@code .txt} files into an output
 * directory. Subdirectories and other files are skipped (no recursion).
 */
public class UnCompress {
    /**
     * Decompresses the eligible files of {@code inputDir} into {@code outputDir}.
     *
     * @param inputDir  HDFS directory to scan (non-recursive)
     * @param outputDir HDFS directory receiving the decompressed files
     * @throws Exception if the codec class is absent or HDFS I/O fails
     */
    public void exe(String inputDir, String outputDir) throws Exception {
        Configuration conf = new Configuration();
        Class<?> codecClass = Class.forName("org.apache.hadoop.io.compress.GzipCodec");
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        FileSystem fs = FileSystem.get(conf);
        for (FileStatus fileStatus : fs.listStatus(new Path(inputDir))) {
            // Skip directories: only plain files are decompressed.
            if (fileStatus.isDirectory()) {
                continue;
            }
            Path srcPath = fileStatus.getPath();
            // Path.getName() yields the final component; no manual "/" splitting needed.
            String gzFileName = srcPath.getName();
            if (!gzFileName.endsWith(".txt.gz")) {
                continue;
            }
            // Drop the codec's default extension (".gz") to recover the original name.
            String fileName =
                    CompressionCodecFactory.removeSuffix(gzFileName, codec.getDefaultExtension());
            InputStream in = null;
            OutputStream out = null;
            try {
                in = codec.createInputStream(fs.open(srcPath));
                out = fs.create(new Path(outputDir + "/" + fileName));
                IOUtils.copyBytes(in, out, conf);
            } finally {
                // Close in finally so a copy failure cannot leak the streams.
                IOUtils.closeStream(out);
                IOUtils.closeStream(in);
            }
        }
    }
}
5. 命令行
#!/bin/sh
# Usage: hadoop jar HdfsCompress-1.0.jar <compress|uncompress> <inputDir> <outputDir>
# [compress] [inputDir] [outputDir]
hadoop jar HdfsCompress-1.0.jar compress /home/test /home/test/compress
# [uncompress] [inputDir] [outputDir] — uncomment to run the reverse direction
#hadoop jar HdfsCompress-1.0.jar uncompress /home/test/compress /home/test
6. 测试结果
hadoop fs -ls /home/test
drwxr-xr-x - aps supergroup 0 2020-01-16 16:35 /home/test/compress
-rw-r--r-- 3 aps supergroup 12242 2020-01-16 16:38 /home/test/test1.txt
-rw-r--r-- 3 aps supergroup 12242 2020-01-16 16:38 /home/test/test2.txt
-rw-r--r-- 3 aps supergroup 12242 2020-01-16 16:38 /home/test/test3.txt
hadoop fs -ls /home/test/compress
-rw-r--r-- 3 aps supergroup 1158 2020-01-16 16:35 /home/test/compress/test1.txt.gz
-rw-r--r-- 3 aps supergroup 1158 2020-01-16 16:35 /home/test/compress/test2.txt.gz
-rw-r--r-- 3 aps supergroup 1158 2020-01-16 16:35 /home/test/compress/test3.txt.gz
注:HDFS清理文件且不放入回收站命令:
hadoop fs -rm -skipTrash /home/test/test1.txt