Test data
doc1.txt:
hello spark
hello hadoop
doc2.txt:
hello hive
hello hbase
hello spark
doc3.txt:
hadoop hbase
hive scala
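For reference, this is the inverted index the program below should produce for these three files (my own derivation from the input, not output copied from a run; the line order and the order of document names within each line may differ between runs, since RDD output order is not deterministic):

(scala,doc3)
(hive,doc2,doc3)
(hadoop,doc1,doc3)
(hbase,doc2,doc3)
(spark,doc1,doc2)
(hello,doc1,doc2)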
Implementation
package SparkLearning.SparkDay01

import org.apache.spark.sql.SparkSession

object ReadDirectort {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val spark = SparkSession.builder().appName("hello").master("local").getOrCreate()
    // Read every file under the directory D:/ml/test as (path, content) pairs
    val data = spark.sparkContext.wholeTextFiles("D:/ml/test")
    val r1 = data.flatMap { x =>
      // Split the full path on "/" and strip the extension to get the
      // document name, e.g. "doc1"
      val doc = x._1.split("/").last.split("\\.").head
      // Split the content into lines first, then split each line on spaces,
      // emitting one (word, document) pair per word; "\r?\n" handles both
      // Windows and Unix line endings
      x._2.split("\r?\n").flatMap(_.split(" ").map { y => (y, doc) })
    }
      // Group the pairs by word and deduplicate the document names
      .groupByKey.map { case (x, y) => (x, y.toSet.mkString(",")) }
      // Alternative that keeps duplicate document names:
      // .groupByKey().map(y => (y._1, y._2.toList.mkString(",")))
    r1.foreach(println)
    spark.stop()
  }
}
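As a side note, groupByKey ships every single (word, doc) pair across the shuffle. A common Spark optimization is to pre-aggregate with reduceByKey so duplicates are merged within each partition before the shuffle. The sketch below is my own illustrative variant of the program above, not part of the original code; it assumes the same data RDD from wholeTextFiles:

    // Hypothetical variant: map each word to a one-element Set and union the
    // sets per word with reduceByKey, so deduplication happens map-side
    val r2 = data.flatMap { x =>
      val doc = x._1.split("/").last.split("\\.").head
      x._2.split("\r?\n").flatMap(_.split(" ").map(word => (word, Set(doc))))
    }
      .reduceByKey(_ ++ _)                                   // union the per-word document sets
      .map { case (word, docs) => (word, docs.mkString(",")) }
    r2.foreach(println)

The output should match the groupByKey version; the difference is only in how much data crosses the network, which matters once the input grows beyond toy files like these.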