Method 1: single-machine version
pandas_df = spark_df.toPandas()  # collects the entire DataFrame onto the driver
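This one-liner pulls every row back to the driver, so it only works when the whole DataFrame fits in driver memory. A minimal usage sketch follows, assuming a local SparkSession named spark and a tiny sample DataFrame (both are illustrative and not part of the original snippet):

from pyspark.sql import SparkSession

# Hypothetical setup for illustration only
spark = SparkSession.builder.master("local[*]").appName("toPandas-demo").getOrCreate()
spark_df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

pandas_df = spark_df.toPandas()   # whole DataFrame materialized on the driver
print(pandas_df.head())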
Method 2: distributed version
import pandas as pd

def _map_to_pandas(rdds):
    # Turn the iterator over one partition's Rows into a single-element
    # list holding a pandas DataFrame for that partition
    return [pd.DataFrame(list(rdds))]

def topands_df(df, n_partitions=None):
    # Optionally repartition so each partition stays small enough to convert
    if n_partitions is not None:
        df = df.repartition(n_partitions)
    # Build one pandas DataFrame per partition in parallel, collect the
    # pieces on the driver, then concatenate them into a single DataFrame
    df_pand = df.rdd.mapPartitions(_map_to_pandas).collect()
    df_pand = pd.concat(df_pand)
    # Restore the original column names (lost when going through the RDD)
    df_pand.columns = df.columns
    return df_pand
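A usage sketch under the same assumptions as above (a SparkSession named spark; the sample DataFrame and partition count are illustrative, not part of the original):

# Hypothetical example DataFrame
spark_df = spark.range(0, 100000).withColumnRenamed("id", "value")

# Convert with an explicit partition count so each partition's pandas
# DataFrame stays small before being collected and concatenated
result = topands_df(spark_df, n_partitions=8)
print(result.shape)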