在spark-shell状态下查看sql内置函数:
spark.sql("show functions").show(1000)
比如:SUBSTR(columnName,0,1)='B'(注意:Spark SQL 中 SUBSTR 的位置从 1 开始计数,起始位置写 0 时效果与 1 相同)
show,take,first,head
df.show(30,false)
df.take(10)
df.first()
df.head(3)
- 选择某列显示
df.select("column").show(30,false)
- 按条件过滤
df.filter("name='' OR name='NULL'").show
- 按列升序或降序排序
df.sort(df("name").desc).show
df.sort(df("name").asc, df("id").desc).show
别名
df.select(df("name").as("student_name")).show
join
df1.join(df2, df1.col("id") === df2.col("id")).show
-
源码:
import org.apache.spark.sql.SparkSession
/**
 * Walk-through of common DataFrame operations (show/take/first/head, select,
 * filter, sort, alias and inner join) on a pipe-delimited student file.
 *
 * Usage: `DataFrameCase [inputPath]`. When no path is given the original
 * hard-coded sample file location is used.
 */
object DataFrameCase {

  /** Input file used when no path is passed on the command line. */
  private val DefaultInputPath =
    "C:\\Users\\Administrator\\IdeaProjects\\SparkSQLProject\\spark-warehouse\\student.data"

  /** One record of the student file: `id|name|phone|email`. */
  case class Student(id: Int, name: String, phone: String, email: String)

  /**
   * Parses one `id|name|phone|email` line into a [[Student]].
   *
   * The split uses limit `-1` so that trailing empty fields (for example an
   * empty email in `1|Bob|123|`) are kept instead of being dropped, which
   * previously caused an `ArrayIndexOutOfBoundsException`. Extra fields beyond
   * the fourth are ignored, as before.
   *
   * @param line raw text line from the input file
   * @return the parsed student record
   * @throws IllegalArgumentException if the line has fewer than four fields
   * @throws NumberFormatException    if the id field is not an integer
   */
  private def parseStudent(line: String): Student =
    line.split("\\|", -1) match {
      case Array(id, name, phone, email, _*) =>
        Student(id.toInt, name, phone, email)
      case _ =>
        throw new IllegalArgumentException(
          s"Expected at least 4 '|'-separated fields but got: $line")
    }

  /**
   * Runs the demo against a local Spark session.
   *
   * @param args optional; `args(0)` overrides the input file path
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("DataFrameCase")
      .master("local[2]")
      .getOrCreate()

    // Stop the session even when one of the steps below fails.
    try {
      import spark.implicits._

      val inputPath = args.headOption.getOrElse(DefaultInputPath)
      val students = spark.sparkContext.textFile(inputPath).map(parseStudent)
      val studentDF = students.toDF()

      // show / take / first / head
      studentDF.show
      studentDF.show(30, truncate = false)
      // take/first/head return rows to the driver; print them so the result is visible.
      studentDF.take(10).foreach(println)
      println(studentDF.first())
      studentDF.head(3).foreach(println)

      // Select a single column.
      studentDF.select("email").show(30, truncate = false)

      // Rows whose name is empty or the literal string 'NULL'.
      studentDF.filter("name='' OR name='NULL'").show

      // People whose name starts with 'B'.
      // NOTE(review): SUBSTR positions are 1-based in Spark SQL; a start of 0
      // appears to behave like 1 here - confirm before porting to another engine.
      studentDF.filter("SUBSTR(name,0,1)='B'").show

      // Sorting: ascending by default, explicit direction per column.
      studentDF.sort(studentDF("name")).show
      studentDF.sort(studentDF("name").desc).show
      studentDF.sort("name", "id").show
      studentDF.sort(studentDF("name").asc, studentDF("id").desc).show

      // Column alias.
      studentDF.select(studentDF("name").as("student_name")).show

      // Second DataFrame built from the same parsed records, used as the join's right side.
      val studentDF2 = students.toDF()

      // Inner join; column equality is expressed with ===.
      studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id")).show
    } finally {
      spark.stop()
    }
  }
}