randomSplit:根据weight(权重值)将一个RDD划分成多个RDD;权重越高,对应划分得到的元素较多的概率就越大
sample:按照指定的比例(比如0.1或0.9)从RDD中随机抽取相应比例(10%或90%)的数据
代码
/**
 * Demonstrates typed Dataset operations: `randomSplit` and `sample`.
 *
 *  - `randomSplit(weights)` splits the Dataset into multiple Datasets; a higher
 *    weight gives the corresponding split a larger expected share of the rows.
 *  - `sample(fraction)` randomly draws approximately `fraction` of the rows.
 */
object TypedOperation {

  /** Record schema expected in employee.json. */
  case class Employee(name: String, age: Long, depId: Long, gender: String, salary: Long)

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession
      .builder()
      .appName("BasicOperation")
      .master("local")
      .getOrCreate()

    try {
      // Required for the typed conversion `.as[Employee]` below.
      import sparkSession.implicits._

      val employeePath = this.getClass.getClassLoader.getResource("employee.json").getPath
      val employeeDF = sparkSession.read.json(employeePath)
      val employeeDS = employeeDF.as[Employee]

      // Weights are normalized internally: 1:2:2:3 means the last split is
      // expected to receive ~3/8 of the rows. Int literals widen to Double
      // against the Array[Double] parameter.
      employeeDS.randomSplit(Array(1, 2, 2, 3)).foreach(ds => ds.show())

      // Sample roughly 40% of the rows (without replacement by default).
      employeeDS.sample(0.4).show()
    } finally {
      // Always release the local Spark context so the JVM can exit cleanly,
      // even if reading or sampling throws.
      sparkSession.stop()
    }
  }
}