1. Start the shell
1. cd <your Spark installation directory>/bin
2. ./spark-shell  (a quick sanity check of the resulting shell is sketched below)
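When the shell comes up it creates a SparkContext for you, bound to the name sc. As a small, optional sketch (the exact values depend on your installation), you can inspect it directly:
// sc is created automatically by spark-shell
sc.version   // the Spark version string
sc.master    // the master URL the shell is connected to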
2. Load the data to analyze
//this gives us an RDD
val textFile = sc.textFile("file:///xuzhang/home/Cloud/spark/README.md");
//number of items (lines) in this RDD
textFile.count()
//the first item (line) in the RDD
textFile.first()
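Besides first(), take(n) is handy for peeking at several items at once (a small sketch; the argument 5 is arbitrary):
//return the first 5 lines as a local Array[String]
textFile.take(5)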
//filter the data; in Spark this is called an RDD transformation
val lineWithSpark = textFile.filter(line=>line.contains("Spark"));
//or, equivalently, using placeholder syntax
val lineWithSpark = textFile.filter(_.contains("Spark"))
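A transformation like filter only defines a new RDD; nothing is computed until an action runs. A minimal sketch using the RDD defined above:
//the filter is actually evaluated only when an action such as count() is called
lineWithSpark.count()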
//map and reduce: find the largest number of words in a single line
textFile.map(_.split(" ").size).reduce((a,b)=>if (a > b) a else b)
//the same computation, calling Java's Math.max
import java.lang.Math
textFile.map(_.split(" ").size).reduce((a,b)=>Math.max(a,b))
//word count in Spark
textFile.flatMap(_.split(" ")).map(word=>(word,1)).reduceByKey((a,b)=>a+b)
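The reduceByKey call above only defines the computation; to bring the per-word counts back to the driver, call the collect() action. A sketch (the val name wordCounts is just for illustration):
val wordCounts = textFile.flatMap(_.split(" ")).map(word=>(word,1)).reduceByKey(_ + _)
//materialize the (word, count) pairs as a local array
wordCounts.collect()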
3. Standalone program: Scala version
/* SimpleApp.scala */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SimpleApp {
  def main(args: Array[String]) {
    val logFile = "YOUR_SPARK_HOME/README.md" // Should be some file on your system
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    val logData = sc.textFile(logFile, 2).cache()
    val numAs = logData.filter(line => line.contains("a")).count()
    val numBs = logData.filter(line => line.contains("b")).count()
    println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
  }
}
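To run this as a standalone application you typically package it with sbt and hand the jar to spark-submit. Below is a minimal build.sbt sketch; the Scala and Spark version numbers are placeholders you would adjust to your own installation:
// build.sbt -- minimal sketch; versions here are assumptions, match them to your cluster
name := "Simple Application"
version := "1.0"
scalaVersion := "2.12.18"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.5.0"
After `sbt package`, the resulting jar can be submitted with `bin/spark-submit --class SimpleApp <path-to-jar>`.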
4. Standalone program: Python version
"""SimpleApp.py"""
from pyspark import SparkContext
logFile = "YOUR_SPARK_HOME/README.md" # Should be some file on your system
sc = SparkContext("local", "Simple App")
logData = sc.textFile(logFile).cache()
numAs = logData.filter(lambda s: 'a' in s).count()
numBs = logData.filter(lambda s: 'b' in s).count()
print("Lines with a: %i, lines with b: %i" % (numAs, numBs))