# Developing with PySpark against a single-machine Spark installation on Windows

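'''

When writing an interactive script, import the findspark package first and then
call findspark.init(). These two lines must come before any pyspark import,
because findspark.init() is what makes the pyspark package importable.

'''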

import findspark

findspark.init()

from pyspark import SparkContext

'''

from py4j.java_gateway import JavaGateway

gateway = JavaGateway() # connect to the JVM

random = gateway.jvm.java.util.Random() # create a java.util.Random instance

number1 = random.nextInt(10) # call the Random.nextInt method

number2 = random.nextInt(10)

print(number1, number2)

'''

# Demo of basic RDD transformations (flatMap -> filter -> distinct).
# 'local' runs Spark inside this machine rather than on a cluster.
sc = SparkContext('local')

try:
    old = sc.parallelize([1, 2, 3, 4, 5])

    # flatMap (not map): every element x is expanded into the three values
    # (x, x+1, x*2), and the results are flattened into a single RDD.
    newFlatPartitions = old.flatMap(lambda x: (x, x+1, x*2))

    # Filter: keep only the elements that are less than 6.
    newFilterPartitions = newFlatPartitions.filter(lambda x: x < 6)

    # Remove duplicates.
    # NOTE: the variable name keeps its original spelling ("Discinct") so any
    # code that refers to it keeps working.
    newDiscinctPartitions = newFilterPartitions.distinct()

    # collect() brings each RDD's elements back to the driver as a Python list.
    print(newFlatPartitions.collect())

    print(newFilterPartitions.collect())

    print(newDiscinctPartitions.collect())
finally:
    # Always shut the context down, even if a job above fails, so the
    # underlying Spark resources are released and a new SparkContext can be
    # created later in the same process.
    sc.stop()

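'''

local[*] runs Spark locally, using as many worker threads as there are logical
cores on the machine. (Plain 'local', which is what this script passes to
SparkContext, runs Spark locally with a single worker thread.)

'''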