Pandas Series创建/操作
- Pandas Series创建
import numpy as np
import pandas as pd
s1 = pd.Series([1, 2, 3, 4, 5, 6]) # 通过列表,创建Series对象
s1 # 可以看到包含两部分,index 和 数据
0 1
1 2
2 3
3 4
4 5
5 6
dtype: int64
s1.values # 查看数据部分
array([1, 2, 3, 4, 5, 6])
s1.index # 查看index部分属性;开始点、结束点、步长信息
RangeIndex(start=0, stop=6, step=1)
s2 = pd.Series(np.arange(10)) # 通过numpy的数组创建pandas.Series
s2 # 10个元素的Series
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4}) # 通过python dict创建Series对象
s3
a 1
b 2
c 3
d 4
dtype: int64
s3.values
array([1, 2, 3, 4])
s3.index # s3的索引是字符串类型的
Index(['a', 'b', 'c', 'd'], dtype='object')
s4 = pd.Series([10, 20, 30, 40], index = ['A', 'B', 'C', 'D']) # 指定index的形式创建Series
s4
A 10
B 20
C 30
D 40
dtype: int64
- Pandas Series操作
s4['A'] # 类似dict['keyName']
10
s4[s4>20] # 取出Value大于20的元素
C 30
D 40
dtype: int64
s4.to_dict() # Pandas.Series转化为字典
{'A': 10, 'B': 20, 'C': 30, 'D': 40}
s5 = pd.Series(s4.to_dict())
s5
A 10
B 20
C 30
D 40
dtype: int64
index_1 = ['A', 'B', 'C', 'D', 'E']
s6 = pd.Series(s5, index=index_1) # 按新索引重建Series;原索引中不存在的标签(如'E')对应值为NaN,dtype随之变为float64
s6
A 10.0
B 20.0
C 30.0
D 40.0
E NaN
dtype: float64
pd.isnull(s6) # 判断值是NaN的元素
A False
B False
C False
D False
E True
dtype: bool
pd.notnull(s6)
A True
B True
C True
D True
E False
dtype: bool
s6.name = 'demo' # 给Series取名字
s6.index.name = 'demo_index' # 给Series索引取名字
s6
demo_index
A 10.0
B 20.0
C 30.0
D 40.0
E NaN
Name: demo, dtype: float64
s6.index # 查看Series索引信息,看到索引名字为:demo_index
Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='demo_index')
Pandas DataFrame
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index'
webbrowser.open(link) # 打开一个网页; 选择table内容,复制
True
df = pd.read_clipboard() # 从粘贴板读取数据,生成一个Pandas的DataFrame
df
image.png
type(df)
pandas.core.frame.DataFrame
- DataFrame操作
df.columns # 查看列名
Index(['Mar 2019', 'Mar 2018', 'Change', 'Programming Language', 'Ratings',
'Change.1'],
dtype='object')
df.Ratings # 返回一列的值
0 14.880%
1 13.305%
2 8.262%
3 8.126%
4 6.429%
5 3.267%
6 2.426%
7 2.420%
8 1.926%
9 1.681%
Name: Ratings, dtype: object
df_new = DataFrame(df, columns = ['Change', 'Programming Language', 'Ratings']) # 从现有的DataFrame对象中选取指定的列,生成新的DataFrame对象
df_new
image.png
df['Mar 2019'] # 类似dict的形式,返回的是Series对象
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
Name: Mar 2019, dtype: int64
df_new = DataFrame(df, columns = ['Change', 'Programming Language', 'Ratings', 'Mar 2020']) # 生成新DataFrame时指定了原DataFrame对象不存在的列
df_new
image.png
df_new['Mar 2020'] = range(0, 10) # 给dataframe新列赋值
df_new['Mar 2020'] = np.arange(0, 10) # 和上面效果一样
df_new['Mar 2020'] = pd.Series(np.arange(0, 10)) # 和上面效果一样
df_new
image.png
df_new['Mar 2020'] = pd.Series([100, 200], index=[1, 2]) # 按index对齐赋值:只有index为1、2的行被赋值,其余行变为NaN
df_new
image.png
深入理解Series和DataFrame
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
data = {'Country': ['Belgium', 'India', 'Brazil'],
'Capital' : ['Brussels', 'New Delhi', 'Brasilia'],
'Population': [11190846, 1303171035, 207847528]
}
- Series
s1 = pd.Series(data['Country'], index=['A', 'B', 'C']) # 指定索引创建Series;不指定时默认使用从0开始的整数索引
s1.values
array(['Belgium', 'India', 'Brazil'], dtype=object)
s1.index
Index(['A', 'B', 'C'], dtype='object')
- DataFrame
df1 = pd.DataFrame(data)
df1
image.png
countrys = df1['Country'] # 访问某一列,每一列也是Series对象
type(countrys)
pandas.core.series.Series
df1.iterrows()
<generator object DataFrame.iterrows at 0x119711318>
for row in df1.iterrows():
    print(row[0], row[1], type(row[0]), type(row[1])) # row[0] 是行索引,row[1] 是Series对象
    break
0 Country Belgium
Capital Brussels
Population 11190846
Name: 0, dtype: object <class 'int'> <class 'pandas.core.series.Series'>
- 通过Series创建DataFrame
s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Population'])
df2 = pd.DataFrame([s1, s2, s3], index=['Country', 'Capital', 'Population']) # 多个Series创建DataFrame
df2 # 发现行和列有颠倒
image.png
df2 = df2.T # 转置,使行列恢复为预期的方向
df2
image.png
image.png
关系:
1. Series 一维的数据结构(只有一级索引时)
2. DataFrame 二维的数据结构
Pandas DataFrame IO操作
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser
url = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html' # pandas IO操作手册
webbrowser.open(url) # 打开网页 ,复制pandas dataframe io操作方法表格
True
df1 = pd.read_clipboard()
df1.to_csv('pandas_dataframe_oi.csv', index=False) # 不加index=False, 写入的csv会多一列索引列0~N
!ls pandas_dataframe_oi.csv
pandas_dataframe_oi.csv
!cat pandas_dataframe_oi.csv
Format Type,Data Description,Reader,Writer
text,CSV,read_csv,to_csv
text,JSON,read_json,to_json
text,HTML,read_html,to_html
text,Local clipboard,read_clipboard,to_clipboard
binary,MS Excel,read_excel,to_excel
binary,HDF5 Format,read_hdf,to_hdf
binary,Feather Format,read_feather,to_feather
binary,Msgpack,read_msgpack,to_msgpack
binary,Stata,read_stata,to_stata
binary,SAS,read_sas,
binary,Python Pickle Format,read_pickle,to_pickle
SQL,SQL,read_sql,to_sql
SQL,Google Big Query,read_gbq,to_gbq
df2 = pd.read_csv('pandas_dataframe_oi.csv')
type(df2)
pandas.core.frame.DataFrame
df1.to_json() # dataframe-> json
'{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},"Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},"Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":" ","10":"to_pickle","11":"to_sql","12":"to_gbq"}}'
pd.read_json(df1.to_json()) # json--> dataframe
image.png
df1.to_html('df1.html') # dataframe --> html
!ls df1.html
df1.html
df1.to_excel('df1.xlsx') # dataframe--> excel
!ls df1.xlsx
df1.xlsx