Pandas Series/DataFrame创建/操作/理解

Pandas Series创建/操作

  • Pandas Series创建
import numpy as np
import pandas as pd

s1 = pd.Series([1, 2, 3, 4, 5, 6])   # 通过列表,创建Series对象
s1   # 可以看到包含两部分,index 和 数据
    0    1
    1    2
    2    3
    3    4
    4    5
    5    6
    dtype: int64

s1.values   #  查看数据部分
    array([1, 2, 3, 4, 5, 6])

s1.index   # 查看index部分属性;开始点、结束点、步长信息
    RangeIndex(start=0, stop=6, step=1)

s2 =  pd.Series(np.arange(10))  # 通过numpy的数组创建pandas.Series

s2  # 10个元素的Series
    0    0
    1    1
    2    2
    3    3
    4    4
    5    5
    6    6
    7    7
    8    8
    9    9
    dtype: int64

s3 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4})   # 通过python dict创建Series对象

s3
    a    1
    b    2
    c    3
    d    4
    dtype: int64

s3.values
    array([1, 2, 3, 4])
s3.index   #  s3的索引是字符串类型的
    Index(['a', 'b', 'c', 'd'], dtype='object')

s4 = pd.Series([10, 20, 30, 40], index = ['A', 'B', 'C', 'D'])   # 指定index的形式创建Series

s4
    A    10
    B    20
    C    30
    D    40
    dtype: int64
  • Pandas Series操作
s4['A']   # 类似dict['keyName']

    10

s4[s4>20]   # 取出Value大于20的元素

    C    30
    D    40
    dtype: int64

s4.to_dict()   # Pandas.Series转化为字典

    {'A': 10, 'B': 20, 'C': 30, 'D': 40}

s5 = pd.Series(s4.to_dict())

s5
    A    10
    B    20
    C    30
    D    40
    dtype: int64

index_1 = ['A', 'B', 'C', 'D', 'E']

s6 = pd.Series(s5, index=index_1)    # 按新的索引重建Series;s5中不存在的索引'E'对应的值为NaN

s6
    A    10.0
    B    20.0
    C    30.0
    D    40.0
    E     NaN
    dtype: float64

pd.isnull(s6)  # 判断值是NaN的元素
    A    False
    B    False
    C    False
    D    False
    E     True
    dtype: bool

pd.notnull(s6)

    A     True
    B     True
    C     True
    D     True
    E    False
    dtype: bool

s6.name = 'demo'  #  给Series取名字
s6.index.name = 'demo_index'  # 给Series索引取名字

s6
    demo_index
    A    10.0
    B    20.0
    C    30.0
    D    40.0
    E     NaN
    Name: demo, dtype: float64

s6.index   # 查看Series索引信息,看到索引名字为:demo_index
    Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='demo_index')

Pandas Dataframe

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser

link = 'https://www.tiobe.com/tiobe-index'
webbrowser.open(link)   # 打开一个网页; 选择table内容,复制
    True

df = pd.read_clipboard()   # 从粘贴板读取数据,生成一个Pandas的DataFrame
df
image.png
type(df)  
    pandas.core.frame.DataFrame
  • DataFrame操作
df.columns   # 查看列名
    Index(['Mar 2019', 'Mar 2018', 'Change', 'Programming Language', 'Ratings',
           'Change.1'],
          dtype='object')

df.Ratings  # 返回一列的值

    0    14.880%
    1    13.305%
    2     8.262%
    3     8.126%
    4     6.429%
    5     3.267%
    6     2.426%
    7     2.420%
    8     1.926%
    9     1.681%
    Name: Ratings, dtype: object

df_new = DataFrame(df, columns = ['Change', 'Programming Language', 'Ratings'])    # 基于现有的DataFrame对象,生成只包含指定列的新对象

df_new
image.png
df['Mar 2019']    # 类似dict的形式,返回的是Series对象
    0     1
    1     2
    2     3
    3     4
    4     5
    5     6
    6     7
    7     8
    8     9
    9    10
    Name: Mar 2019, dtype: int64


df_new = DataFrame(df, columns = ['Change', 'Programming Language', 'Ratings', 'Mar 2020'])    # 生成新DataFrame时指定了原DataFrame对象不存在的列

df_new
image.png
df_new['Mar 2020'] = range(0, 10)   # 给dataframe新列赋值
df_new['Mar 2020'] = np.arange(0, 10)  # 和上面效果一样
df_new['Mar 2020'] = pd.Series(np.arange(0, 10))  # 和上面效果一样

df_new
image.png
df_new['Mar 2020'] = pd.Series([100, 200], index=[1, 2])   # 按index对齐赋值:只给index为1、2的行赋值,其余行为NaN

df_new
image.png

深入理解Series和DataFrame

import numpy as np
import pandas as pd
from pandas import Series, DataFrame


data = {'Country': ['Belgium', 'India', 'Brazil'],
           'Capital' : ['Brussels', 'New Delhi', 'Brasilia'],
           'Population': [11190846, 1303171035, 207847528]
       }
  • Series
s1 = pd.Series(data['Country'], index=['A', 'B', 'C'])  #  指定索引创建Series。不指定时索引默认从0开始

s1.values
    array(['Belgium', 'India', 'Brazil'], dtype=object)

s1.index
    Index(['A', 'B', 'C'], dtype='object')
  • DataFrame
df1 = pd.DataFrame(data)
df1
image.png
countrys = df1['Country']   #  访问某一列,每一列也是Series对象
type(countrys)
    pandas.core.series.Series
df1.iterrows()
    <generator object DataFrame.iterrows at 0x119711318>
for row in df1.iterrows():
    print(row[0], row[1], type(row[0]), type(row[1]))    # row[1] 是Series对象
    break

    0 Country        Belgium
    Capital       Brussels
    Population    11190846
    Name: 0, dtype: object <class 'int'> <class 'pandas.core.series.Series'>
  • 通过Series创建DataFrame
s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Population'])

df2 = pd.DataFrame([s1, s2, s3], index=['Country', 'Capital', 'Population'])  # 多个Series创建DataFrame
df2  # 发现行和列有颠倒
image.png
df2 = df2.T

df2
image.png
image.png
关系:
1. Series 是一维的数据结构(只有一级索引时)
2. DataFrame 是二维的数据结构

Pandas DataFrame IO操作

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser
url = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'   #  pandas IO操作手册
webbrowser.open(url)   # 打开网页,复制pandas DataFrame IO操作方法表格
    True
df1 = pd.read_clipboard()

df1.to_csv('pandas_dataframe_oi.csv', index=False)  # 不加index=False, 写入的csv会多一列索引列0~N

!ls pandas_dataframe_oi.csv
    pandas_dataframe_oi.csv

!cat pandas_dataframe_oi.csv

    Format Type,Data Description,Reader,Writer
    text,CSV,read_csv,to_csv
    text,JSON,read_json,to_json
    text,HTML,read_html,to_html
    text,Local clipboard,read_clipboard,to_clipboard
    binary,MS Excel,read_excel,to_excel
    binary,HDF5 Format,read_hdf,to_hdf
    binary,Feather Format,read_feather,to_feather
    binary,Msgpack,read_msgpack,to_msgpack
    binary,Stata,read_stata,to_stata
    binary,SAS,read_sas, 
    binary,Python Pickle Format,read_pickle,to_pickle
    SQL,SQL,read_sql,to_sql
    SQL,Google Big Query,read_gbq,to_gbq

df2 = pd.read_csv('pandas_dataframe_oi.csv')

type(df2)
    pandas.core.frame.DataFrame

df1.to_json()    # dataframe-> json

    '{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},"Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},"Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":" ","10":"to_pickle","11":"to_sql","12":"to_gbq"}}'

pd.read_json(df1.to_json())    #  json--> dataframe
image.png
df1.to_html('df1.html')   # dataframe --> html

!ls df1.html
    df1.html

df1.to_excel('df1.xlsx')   # dataframe--> excel
!ls df1.xlsx

    df1.xlsx
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容