原理
通过比较文件的 md5 值判断文件内容是否相同。使用 org.apache.commons.io 中的 DirectoryWalker 遍历文件,
并使用 org.apache.commons.codec 中的 DigestUtils 计算 md5 值。
源码
ListFileWalker.scala
import java.io.{File, FileInputStream}
import java.util.Collection
import org.apache.commons.codec.digest.DigestUtils
import org.apache.commons.io.filefilter.IOFileFilter
import org.apache.commons.io.{DirectoryWalker, IOUtils}
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
/**
 * A `DirectoryWalker` that collects every file it visits into a list.
 *
 * The constructor is private; instances are obtained through the companion
 * object's `apply`.
 *
 * @param directoryFilter filter handed to `DirectoryWalker` for directories
 * @param fileFilter      filter handed to `DirectoryWalker` for files
 * @param depthLimit      depth limit handed to `DirectoryWalker`
 */
class ListFileWalker private (directoryFilter: IOFileFilter, fileFilter: IOFileFilter, depthLimit: Int)
  extends DirectoryWalker[File](directoryFilter, fileFilter, depthLimit) {

  /** Adds each visited file to the result collection supplied by `walk`. */
  override def handleFile(file: File, depth: Int, results: Collection[File]): Unit = {
    // `add` returns a Boolean; discard it explicitly so the override is clearly Unit.
    results.add(file)
    ()
  }

  /**
   * Walks `startDirectory` and returns the files passed to `handleFile`.
   *
   * @param startDirectory directory at which the walk starts
   * @return the collected files, in the order they were visited
   */
  def list(startDirectory: File): ArrayBuffer[File] = {
    // Hand `walk` a real Java collection instead of relying on the implicit
    // (and deprecated) JavaConversions wrapping of a Scala buffer.
    val collected = new java.util.ArrayList[File]
    walk(startDirectory, collected)

    val files = new ArrayBuffer[File]
    val it = collected.iterator()
    while (it.hasNext) files += it.next()
    files
  }
}
/** Factory and MD5 helper for [[ListFileWalker]]. */
object ListFileWalker {

  /**
   * Creates a walker with the given filters and a depth limit of -1
   * (presumably "no limit" — see the DirectoryWalker documentation).
   *
   * @param directoryFilter filter applied to directories
   * @param fileFilter      filter applied to files
   */
  def apply(directoryFilter: IOFileFilter, fileFilter: IOFileFilter): ListFileWalker =
    new ListFileWalker(directoryFilter, fileFilter, -1)

  /**
   * Computes the MD5 digest of a file as a hex string.
   *
   * The file is streamed through the digest rather than read fully into memory,
   * and the stream is always closed, even if hashing fails.
   *
   * @param path path of the file to hash
   * @return the MD5 digest as a hex string
   * @throws java.io.FileNotFoundException if the file cannot be opened for reading
   */
  def getMd5(path: String): String = {
    val in = new FileInputStream(path)
    try DigestUtils.md5Hex(in)
    finally in.close()
  }
}
main函数
import java.io.File
import org.apache.commons.io.filefilter.{FileFilterUtils, HiddenFileFilter}
import scala.collection.mutable.{HashMap, ArrayBuffer}
/**
 * Entry point: groups the visible `.txt` files found under `/tmp` by MD5 digest
 * and prints every group that contains more than one path.
 */
object Oven {
  def main(args: Array[String]): Unit = {
    val walker = ListFileWalker(HiddenFileFilter.VISIBLE, FileFilterUtils.suffixFileFilter(".txt"))

    // digest -> absolute paths of all files that produced that digest
    val pathsByMd5 = new HashMap[String, ArrayBuffer[String]]
    for (file <- walker.list(new File("/tmp"))) {
      val path = file.getAbsolutePath
      pathsByMd5.getOrElseUpdate(ListFileWalker.getMd5(path), new ArrayBuffer[String]) += path
    }

    // Only digests shared by at least two files are duplicates.
    pathsByMd5
      .filter { case (_, paths) => paths.size > 1 }
      .foreach(entry => println(entry))
  }
}
输出结果
这段代码递归查找 /tmp 目录下所有的 txt 文件,
并按 md5 值分组,得到的结果为
(6c37124d096045a3637da652099f92b1,ArrayBuffer(/tmp/LICENSE.txt, /tmp/yarn/LICENSE.txt))
(0c580d17b0408a0c19772da288971eb4,ArrayBuffer(/tmp/hadoop/common/NOTICE.txt, /tmp/hadoop/share/doc/hadoop/hdfs/NOTICE.txt, /tmp/hadoop/share/doc/hadoop/mapreduce/NOTICE.txt))
列出所有相同md5值的文件。