导入所需要的package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
基本操作
In [4]: s = pd.Series([1,3,5,np.nan,6,8],dtype=float)
# Series表示一个一维的表结构,类似一个定长的有序字典
In [5]: s
Out[5]:
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
In [9]: dates = pd.date_range('20130101', periods=6)
#生成一个日期索引
In [10]: dates
Out[10]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
In [11]: df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
#DataFrame表示一个二维的表结构
In [12]: df
Out[12]:
A B C D
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
2013-01-05 -1.545638 -1.858394 0.681397 0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
In [13]: df2 = pd.DataFrame({ 'A' : 1,
...: 'B' : pd.Timestamp('20170102'),
...: 'C' : pd.Series([1,2,3,4],index=list(range(4)),dtype='int32'),
...: 'D' : np.array([3] * 4,dtype='int32'),
...: 'E' : pd.Categorical(["test","train","test","train"]),
...: 'F' : 'foo' })
#手动生成,index为默认值
In [14]: df2
Out[14]:
A B C D E F
0 1 2017-01-02 1 3 test foo
1 1 2017-01-02 2 3 train foo
2 1 2017-01-02 3 3 test foo
3 1 2017-01-02 4 3 train foo
In [15]: df2.dtypes
Out[15]:
A int64
B datetime64[ns]
C int32
D int32
E category
F object
dtype: object
#得到DateFrame对象的前n个或者后n个
In [33]: df
Out[33]:
A B C D
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
2013-01-05 -1.545638 -1.858394 0.681397 0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
#注意,如果没有参数,那么返回除第一个或者最后一个的所有列表
In [32]: df.head()
Out[32]:
A B C D
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
2013-01-05 -1.545638 -1.858394 0.681397 0.827652
In [16]: df.head(2)
Out[16]:
A B C D
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 -0.570013
In [17]: df.tail(3)
Out[17]:
A B C D
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
2013-01-05 -1.545638 -1.858394 0.681397 0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
#如上df
In [37]: df.index
Out[37]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
In [38]: df.columns
Out[38]: Index(['A', 'B', 'C', 'D'], dtype='object')
In [39]: df.values
Out[39]:
array([[ 1.52305048, -0.15352237, 0.73880011, 0.11866569],
[ 0.38120739, 1.20624856, 0.97237919, -0.57001282],
[ 1.21861889, 0.68241767, -1.38723555, -0.91492475],
[-1.07726844, -0.69615296, 2.42468084, 0.43790878],
[-1.5456376 , -1.8583936 , 0.68139653, 0.82765189],
[-2.13237432, -0.72172305, -0.02113291, -0.53177846]])
#对数据快速统计
In [40]: df.describe()
Out[40]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.272067 -0.256854 0.568148 -0.105415
std 1.523314 1.096138 1.250827 0.673642
min -2.132374 -1.858394 -1.387236 -0.914925
25% -1.428545 -0.715331 0.154499 -0.560454
50% -0.348031 -0.424838 0.710098 -0.206556
75% 1.009266 0.473433 0.913984 0.358098
max 1.523050 1.206249 2.424681 0.827652
In [41]: df.T#转置操作
Out[41]:
2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A 1.523050 0.381207 1.218619 -1.077268 -1.545638 -2.132374
B -0.153522 1.206249 0.682418 -0.696153 -1.858394 -0.721723
C 0.738800 0.972379 -1.387236 2.424681 0.681397 -0.021133
D 0.118666 -0.570013 -0.914925 0.437909 0.827652 -0.531778
In [42]: df.sort_index(axis=1, ascending=False)#按索引大小排序,默认为从大到小
#如果axis为0表示按列索引排序,axis为1表示按行索引排序
#ascending为True,表示由小到大排序,否则,由大到小排序
Out[42]:
D C B A
2013-01-01 0.118666 0.738800 -0.153522 1.523050
2013-01-02 -0.570013 0.972379 1.206249 0.381207
2013-01-03 -0.914925 -1.387236 0.682418 1.218619
2013-01-04 0.437909 2.424681 -0.696153 -1.077268
2013-01-05 0.827652 0.681397 -1.858394 -1.545638
2013-01-06 -0.531778 -0.021133 -0.721723 -2.132374
In [43]: df.sort_values(by='B')#按值升序排序
Out[43]:
A B C D
2013-01-05 -1.545638 -1.858394 0.681397 0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
2013-01-02 0.381207 1.206249 0.972379 -0.570013
In [44]: df.sort_values(by='B',ascending=False)#按值降序排列
Out[44]:
A B C D
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
2013-01-05 -1.545638 -1.858394 0.681397 0.827652
数据选择
In [45]: df.A#得到单独一列,相当于一个Series
Out[45]:
2013-01-01 1.523050
2013-01-02 0.381207
2013-01-03 1.218619
2013-01-04 -1.077268
2013-01-05 -1.545638
2013-01-06 -2.132374
Freq: D, Name: A, dtype: float64
In [46]: df[0:3]#前三列
Out[46]:
A B C D
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
In [47]: df['20130102':'20130104']#也可以这样操作
Out[47]:
A B C D
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
#交叉区域
In [48]: dates
Out[48]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
In [49]: dates[0]
Out[49]: Timestamp('2013-01-01 00:00:00', offset='D')
In [55]: df
Out[55]:
A B C D
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
2013-01-05 -1.545638 -1.858394 0.681397 0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
In [51]: df.loc[dates[0]]#去某一行
Out[51]:
A 1.523050
B -0.153522
C 0.738800
D 0.118666
Name: 2013-01-01 00:00:00, dtype: float64
In [54]: df.loc[:,['A','B']]#获取列
Out[54]:
A B
2013-01-01 1.523050 -0.153522
2013-01-02 0.381207 1.206249
2013-01-03 1.218619 0.682418
2013-01-04 -1.077268 -0.696153
2013-01-05 -1.545638 -1.858394
2013-01-06 -2.132374 -0.721723
In [56]: df.loc['20130102':'20130104',['A','B']]#指定行列
Out[56]:
A B
2013-01-02 0.381207 1.206249
2013-01-03 1.218619 0.682418
2013-01-04 -1.077268 -0.696153
In [57]: df.loc['20130102',['A','B']]#对于返回的对象进行维度缩减
Out[57]:
A 0.381207
B 1.206249
Name: 2013-01-02 00:00:00, dtype: float64
In [58]: df.loc[dates[0],'A']#得到一个具体值
Out[58]: 1.5230504820975281
In [59]: df.at[dates[0],'A']#得到一个具体值,与上边的方法相同
Out[59]: 1.5230504820975281
In [60]: df.iloc[3]#通过传递数值进行选择,第四行
Out[60]:
A -1.077268
B -0.696153
C 2.424681
D 0.437909
Name: 2013-01-04 00:00:00, dtype: float64
In [61]: df.iloc[3:5,0:2]#用数字进行切片
Out[61]:
A B
2013-01-04 -1.077268 -0.696153
2013-01-05 -1.545638 -1.858394
In [62]: df.iloc[[1,2,4],[0,2]]#返回指定的行列
Out[62]:
A C
2013-01-02 0.381207 0.972379
2013-01-03 1.218619 -1.387236
2013-01-05 -1.545638 0.681397
In [63]: df.iloc[1:3,:]#第二到三行
Out[63]:
A B C D
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
In [64]: df.iloc[:,1:3]#第二到三列
Out[64]:
B C
2013-01-01 -0.153522 0.738800
2013-01-02 1.206249 0.972379
2013-01-03 0.682418 -1.387236
2013-01-04 -0.696153 2.424681
2013-01-05 -1.858394 0.681397
2013-01-06 -0.721723 -0.021133
#第二行,第二列的值
In [65]: df.iloc[1,1]
Out[65]: 1.2062485635529525
In [66]: df.iat[1,1]
Out[66]: 1.2062485635529525
In [68]: df
Out[68]:
A B C D
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153 2.424681 0.437909
2013-01-05 -1.545638 -1.858394 0.681397 0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
In [67]: df[df.A > 0]
Out[67]:
A B C D
2013-01-01 1.523050 -0.153522 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 -0.570013
2013-01-03 1.218619 0.682418 -1.387236 -0.914925
In [69]: df[df > 0]
Out[69]:
A B C D
2013-01-01 1.523050 NaN 0.738800 0.118666
2013-01-02 0.381207 1.206249 0.972379 NaN
2013-01-03 1.218619 0.682418 NaN NaN
2013-01-04 NaN NaN 2.424681 0.437909
2013-01-05 NaN NaN 0.681397 0.827652
2013-01-06 NaN NaN NaN NaN
df[df.iloc[:,0]>0]#得到第一列大于0的行
In [70]: df2 = df.copy()
In [71]: df2['E'] = ['one', 'one','two','three','four','three']
In [72]: df2
Out[72]:
A B C D E
2013-01-01 1.523050 -0.153522 0.738800 0.118666 one
2013-01-02 0.381207 1.206249 0.972379 -0.570013 one
2013-01-03 1.218619 0.682418 -1.387236 -0.914925 two
2013-01-04 -1.077268 -0.696153 2.424681 0.437909 three
2013-01-05 -1.545638 -1.858394 0.681397 0.827652 four
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778 three
In [73]: df2[df2['E'].isin(['two','one'])]#找到特定行索引(E)包含某对象(two,one)的行
Out[73]:
A B C D E
2013-01-01 1.523050 -0.153522 0.738800 0.118666 one
2013-01-02 0.381207 1.206249 0.972379 -0.570013 one
2013-01-03 1.218619 0.682418 -1.387236 -0.914925 two
In [74]: s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
In [75]: s1
Out[75]:
2013-01-02 1
2013-01-03 2
2013-01-04 3
2013-01-05 4
2013-01-06 5
2013-01-07 6
Freq: D, dtype: int64
In [76]: df['F'] = s1
In [77]: df
Out[77]:
A B C D F
2013-01-01 1.523050 -0.153522 0.738800 0.118666 NaN
2013-01-02 0.381207 1.206249 0.972379 -0.570013 1.0
2013-01-03 1.218619 0.682418 -1.387236 -0.914925 2.0
2013-01-04 -1.077268 -0.696153 2.424681 0.437909 3.0
2013-01-05 -1.545638 -1.858394 0.681397 0.827652 4.0
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778 5.0
In [78]: df.at[dates[0],'A'] = 0
In [79]: df.iat[0,1] = 0
In [80]: df.loc[:,'D'] = np.array([5] * len(df))
In [81]: df
Out[81]:
A B C D F
2013-01-01 0.000000 0.000000 0.738800 5 NaN
2013-01-02 0.381207 1.206249 0.972379 5 1.0
2013-01-03 1.218619 0.682418 -1.387236 5 2.0
2013-01-04 -1.077268 -0.696153 2.424681 5 3.0
2013-01-05 -1.545638 -1.858394 0.681397 5 4.0
2013-01-06 -2.132374 -0.721723 -0.021133 5 5.0
In [82]: df2 = df.copy()
In [83]: df2[df2 > 0] = -df2
In [84]: df2
Out[84]:
A B C D F
2013-01-01 0.000000 0.000000 -0.738800 -5 NaN
2013-01-02 -0.381207 -1.206249 -0.972379 -5 -1.0
2013-01-03 -1.218619 -0.682418 -1.387236 -5 -2.0
2013-01-04 -1.077268 -0.696153 -2.424681 -5 -3.0
2013-01-05 -1.545638 -1.858394 -0.681397 -5 -4.0
2013-01-06 -2.132374 -0.721723 -0.021133 -5 -5.0
In [85]: df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
In [86]: df1.loc[dates[0]:dates[1],'E'] = 1
In [87]: df1
Out[87]:
A B C D F E
2013-01-01 0.000000 0.000000 0.738800 5 NaN 1.0
2013-01-02 0.381207 1.206249 0.972379 5 1.0 1.0
2013-01-03 1.218619 0.682418 -1.387236 5 2.0 NaN
2013-01-04 -1.077268 -0.696153 2.424681 5 3.0 NaN
#去掉包含缺失值得行
In [88]: df1.dropna(how='any')#how='all'#如果为all,所有值都为NA时去掉这个label,any为任意NA去掉
Out[88]:
A B C D F E
2013-01-02 0.381207 1.206249 0.972379 5 1.0 1.0
#使用8来填充
In [90]: df1.fillna(value=8)
Out[90]:
A B C D F E
2013-01-01 0.000000 0.000000 0.738800 5 8.0 1.0
2013-01-02 0.381207 1.206249 0.972379 5 1.0 1.0
2013-01-03 1.218619 0.682418 -1.387236 5 2.0 8.0
2013-01-04 -1.077268 -0.696153 2.424681 5 3.0 8.0
In [91]: pd.isnull(df1)#如果是NaN,返回Ture
Out[91]:
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True
In [92]: df
Out[92]:
A B C D F
2013-01-01 0.000000 0.000000 0.738800 5 NaN
2013-01-02 0.381207 1.206249 0.972379 5 1.0
2013-01-03 1.218619 0.682418 -1.387236 5 2.0
2013-01-04 -1.077268 -0.696153 2.424681 5 3.0
2013-01-05 -1.545638 -1.858394 0.681397 5 4.0
2013-01-06 -2.132374 -0.721723 -0.021133 5 5.0
In [93]: df.mean()#列均值,忽略NaN
Out[93]:
A -0.525909
B -0.231267
C 0.568148
D 5.000000
F 3.000000
dtype: float64
In [94]: df.mean(1)#行均值,忽略NaN
Out[94]:
2013-01-01 1.434700
2013-01-02 1.711967
2013-01-03 1.502760
2013-01-04 1.730252
2013-01-05 1.255473
2013-01-06 1.424954
Freq: D, dtype: float64
In [97]: dates
Out[97]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
In [95]: s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
#对于拥有不同维度,需要对齐的对象进行操作。Pandas会自动的沿着指定的维度进行广播,向后飘两个单位
In [96]: s
Out[96]:
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 5.0
2013-01-06 NaN
Freq: D, dtype: float64
In [98]: df.sub(s, axis='index')#s和df按index相加
Out[98]:
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 0.218619 -0.317582 -2.387236 4.0 1.0
2013-01-04 -4.077268 -3.696153 -0.575319 2.0 0.0
2013-01-05 -6.545638 -6.858394 -4.318603 0.0 -1.0
2013-01-06 NaN NaN NaN NaN NaN
In [99]: df
Out[99]:
A B C D F
2013-01-01 0.000000 0.000000 0.738800 5 NaN
2013-01-02 0.381207 1.206249 0.972379 5 1.0
2013-01-03 1.218619 0.682418 -1.387236 5 2.0
2013-01-04 -1.077268 -0.696153 2.424681 5 3.0
2013-01-05 -1.545638 -1.858394 0.681397 5 4.0
2013-01-06 -2.132374 -0.721723 -0.021133 5 5.0
#将一个函数应用于该对象
In [100]: df.apply(np.cumsum)
Out[100]:
A B C D F
2013-01-01 0.000000 0.000000 0.738800 5 NaN
2013-01-02 0.381207 1.206249 1.711179 10 1.0
2013-01-03 1.599826 1.888666 0.323944 15 3.0
2013-01-04 0.522558 1.192513 2.748625 20 6.0
2013-01-05 -1.023080 -0.665880 3.430021 25 10.0
2013-01-06 -3.155454 -1.387603 3.408888 30 15.0
In [101]: df.apply(lambda x: x.max() - x.min())
Out[101]:
A 3.350993
B 3.064642
C 3.811916
D 0.000000
F 4.000000
dtype: float64
In [102]: s = pd.Series(np.random.randint(0, 7, size=10))
In [103]: s
Out[103]:
0 0
1 2
2 4
3 2
4 2
5 4
6 1
7 2
8 6
9 5
dtype: int32
In [104]: s.value_counts()
Out[104]:
2 4
4 2
6 1
5 1
1 1
0 1
dtype: int64
String Methods*
In [105]: s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
In [106]: s.str.lower()
Out[106]:
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
concat:连接
In [107]: df = pd.DataFrame(np.random.randn(10, 4))
In [108]: df
Out[108]:
0 1 2 3
0 -0.350383 0.066715 -0.340783 0.676047
1 1.246707 1.074448 -0.306961 -0.870164
2 0.978723 2.676164 0.810125 1.742859
3 -1.881706 1.900046 0.446396 -1.850192
4 -1.869060 0.194756 -3.999157 0.357495
5 0.821017 -0.131944 -0.172258 -0.550951
6 1.065902 0.788957 -0.155128 0.954346
7 0.331280 0.257569 0.005222 2.232761
8 -1.046927 0.723164 -0.311852 -2.101644
9 -0.518778 -1.058908 0.241587 0.112990
In [111]: pieces = [df[:2], df[:3], df[:5]]
In [112]: pd.concat(pieces)
Out[112]:
0 1 2 3
0 -0.350383 0.066715 -0.340783 0.676047
1 1.246707 1.074448 -0.306961 -0.870164
0 -0.350383 0.066715 -0.340783 0.676047
1 1.246707 1.074448 -0.306961 -0.870164
2 0.978723 2.676164 0.810125 1.742859
0 -0.350383 0.066715 -0.340783 0.676047
1 1.246707 1.074448 -0.306961 -0.870164
2 0.978723 2.676164 0.810125 1.742859
3 -1.881706 1.900046 0.446396 -1.850192
4 -1.869060 0.194756 -3.999157 0.357495
Join
In [113]: left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
In [114]: right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
In [115]: left
Out[115]:
key lval
0 foo 1
1 foo 2
In [116]: right
Out[116]:
key rval
0 foo 4
1 foo 5
In [117]: pd.merge(left, right, on='key')
Out[117]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
#对比
In [119]: left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
In [120]: right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
In [121]: left
Out[121]:
key lval
0 foo 1
1 bar 2
In [122]: right
Out[122]:
key rval
0 foo 4
1 bar 5
In [123]: pd.merge(left, right, on='key')
Out[123]:
key lval rval
0 foo 1 4
1 bar 2 5
append
In [124]: df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
In [125]: df
Out[125]:
A B C D
0 -0.313586 0.010213 0.521081 -1.286954
1 0.079310 1.344337 -0.206647 1.471418
2 0.439421 -0.908587 0.382107 0.765690
3 -0.473688 -0.607198 -0.028644 0.541583
4 -0.778374 -0.743020 -1.231464 0.226309
5 0.335211 -1.464544 0.792436 -0.678236
6 -1.690446 0.315661 1.148733 -1.110375
7 -1.965615 0.341851 0.604430 -0.418681
In [126]: s = df.iloc[3]
In [127]: df.append(s, ignore_index=True)
Out[127]:
A B C D
0 -0.313586 0.010213 0.521081 -1.286954
1 0.079310 1.344337 -0.206647 1.471418
2 0.439421 -0.908587 0.382107 0.765690
3 -0.473688 -0.607198 -0.028644 0.541583
4 -0.778374 -0.743020 -1.231464 0.226309
5 0.335211 -1.464544 0.792436 -0.678236
6 -1.690446 0.315661 1.148733 -1.110375
7 -1.965615 0.341851 0.604430 -0.418681
8 -0.473688 -0.607198 -0.028644 0.541583
Grouping
对于”group by”操作,我们通常是指以下一个或多个操作步骤:
(Splitting)按照一些规则将数据分为不同的组;
(Applying)对于每组数据分别执行一个函数;
(Combining)将结果组合到一个数据结构中;
In [128]: df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
...: 'foo', 'bar', 'foo', 'foo'],
...: 'B' : ['one', 'one', 'two', 'three',
...: 'two', 'two', 'one', 'three'],
...: 'C' : np.random.randn(8),
...: 'D' : np.random.randn(8)})
In [129]: df
Out[129]:
A B C D
0 foo one 1.089882 -0.893373
1 bar one 0.799028 0.237772
2 foo two -0.937741 -1.471799
3 bar three 1.195312 0.692406
4 foo two 1.005337 -0.249233
5 bar two -1.306802 -0.627874
6 foo one 1.619168 -0.539545
7 foo three -0.902081 1.303749
In [130]: df.groupby('A').sum()
Out[130]:
C D
A
bar 0.687537 0.302305
foo 1.874566 -1.850200
In [131]: df.groupby(['A','B']).sum()
Out[131]:
C D
A B
bar one 0.799028 0.237772
three 1.195312 0.692406
two -1.306802 -0.627874
foo one 2.709051 -1.432917
three -0.902081 1.303749
two 0.067596 -1.721032
Reshaping
In [132]: tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
...: 'foo', 'foo', 'qux', 'qux'],
...: ['one', 'two', 'one', 'two',
...: 'one', 'two', 'one', 'two']]))
In [133]: tuples
Out[133]:
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
In [134]: index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
In [135]: df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
In [136]: df
Out[136]:
A B
first second
bar one 0.535103 0.335592
two -0.007379 -0.271691
baz one -1.927102 -0.348317
two -0.392650 0.178679
foo one 0.620067 0.365730
two -0.675787 -1.806861
qux one -0.160767 0.376871
two -0.667896 -0.137858
In [137]: df2 = df[:4]
In [138]: df2
Out[138]:
A B
first second
bar one 0.535103 0.335592
two -0.007379 -0.271691
baz one -1.927102 -0.348317
two -0.392650 0.178679
In [139]: stacked = df2.stack()
In [140]: stacked
Out[140]:
first second
bar one A 0.535103
B 0.335592
two A -0.007379
B -0.271691
baz one A -1.927102
B -0.348317
two A -0.392650
B 0.178679
dtype: float64
In [141]: stacked.unstack()
Out[141]:
A B
first second
bar one 0.535103 0.335592
two -0.007379 -0.271691
baz one -1.927102 -0.348317
two -0.392650 0.178679
In [142]: stacked.unstack(1)
Out[142]:
second one two
first
bar A 0.535103 -0.007379
B 0.335592 -0.271691
baz A -1.927102 -0.392650
B -0.348317 0.178679
In [143]: stacked.unstack(0)
Out[143]:
first bar baz
second
one A 0.535103 -1.927102
B 0.335592 -0.348317
two A -0.007379 -0.392650
B -0.271691 0.178679
Pivot Tables
In [144]: df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
...: 'B' : ['A', 'B', 'C'] * 4,
...: 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
...: 'D' : np.random.randn(12),
...: 'E' : np.random.randn(12)})
In [145]: df
Out[145]:
A B C D E
0 one A foo -0.029439 -1.855289
1 one B foo -0.648452 1.242427
2 two C foo -0.120858 -0.085237
3 three A bar -0.215119 2.244871
4 one B bar 1.412866 -0.886582
5 one C bar 0.818426 1.053411
6 two A foo 0.594699 -0.461450
7 three B foo 0.861705 0.687342
8 one C foo -1.429140 1.337407
9 one A bar -0.872577 0.641517
10 two B bar 0.524023 -1.575005
11 three C bar 0.394977 0.075784
In [146]: pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[146]:
C bar foo
A B
one A -0.872577 -0.029439
B 1.412866 -0.648452
C 0.818426 -1.429140
three A -0.215119 NaN
B NaN 0.861705
C 0.394977 NaN
two A NaN 0.594699
B 0.524023 NaN
C NaN -0.120858
Time Series*
In [158]: rng = pd.date_range('1/1/2017', periods=10, freq='S')
#从指定时刻开始产生时刻
In [159]: rng
Out[159]:
DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 00:00:01',
'2017-01-01 00:00:02', '2017-01-01 00:00:03',
'2017-01-01 00:00:04', '2017-01-01 00:00:05',
'2017-01-01 00:00:06', '2017-01-01 00:00:07',
'2017-01-01 00:00:08', '2017-01-01 00:00:09'],
dtype='datetime64[ns]', freq='S')
In [160]: ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
#赋予指定时刻随机值
In [161]: ts
Out[161]:
2017-01-01 00:00:00 3
2017-01-01 00:00:01 235
2017-01-01 00:00:02 97
2017-01-01 00:00:03 10
2017-01-01 00:00:04 411
2017-01-01 00:00:05 153
2017-01-01 00:00:06 188
2017-01-01 00:00:07 195
2017-01-01 00:00:08 418
2017-01-01 00:00:09 226
Freq: S, dtype: int32
ts.resample('5Min').sum()#指定时间间隔求和
Out[163]:
2017-01-01 1936
Freq: 5T, dtype: int32
In [166]: rng = pd.date_range('1/1/2017', periods=100, freq='S')
In [167]: ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
In [168]: ts.resample('1Min').sum()
Out[168]:
2017-01-01 00:00:00 15906
2017-01-01 00:01:00 9008
Freq: T, dtype: int32
In [169]: sum(ts.values[:60])
Out[169]: 15906
In [170]: sum(ts.values[60:])
Out[170]: 9008
In [171]: rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
In [172]: rng
Out[172]:
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
'2012-03-10'],
dtype='datetime64[ns]', freq='D')
In [173]: ts = pd.Series(np.random.randn(len(rng)), rng)
In [175]: ts
Out[175]:
2012-03-06 1.421425
2012-03-07 -0.096355
2012-03-08 -0.648838
2012-03-09 0.688946
2012-03-10 -1.051607
Freq: D, dtype: float64
In [176]: ts_utc = ts.tz_localize('UTC')
In [177]: ts_utc
Out[177]:
2012-03-06 00:00:00+00:00 1.421425
2012-03-07 00:00:00+00:00 -0.096355
2012-03-08 00:00:00+00:00 -0.648838
2012-03-09 00:00:00+00:00 0.688946
2012-03-10 00:00:00+00:00 -1.051607
Freq: D, dtype: float64
In [178]: ts_utc.tz_convert('US/Eastern')#时区转化
Out[178]:
2012-03-05 19:00:00-05:00 1.421425
2012-03-06 19:00:00-05:00 -0.096355
2012-03-07 19:00:00-05:00 -0.648838
2012-03-08 19:00:00-05:00 0.688946
2012-03-09 19:00:00-05:00 -1.051607
Freq: D, dtype: float64
In [179]: rng = pd.date_range('1/1/2012', periods=5, freq='M')
In [180]: rng
Out[180]:
DatetimeIndex(['2012-01-31', '2012-02-29', '2012-03-31', '2012-04-30',
'2012-05-31'],
dtype='datetime64[ns]', freq='M')
In [181]: ts = pd.Series(np.random.randn(len(rng)), index=rng)
In [182]: ts
Out[182]:
2012-01-31 -0.155942
2012-02-29 -0.767510
2012-03-31 -0.155900
2012-04-30 -0.098059
2012-05-31 0.607614
Freq: M, dtype: float64
In [183]: ps = ts.to_period()
In [184]: ps
Out[184]:
2012-01 -0.155942
2012-02 -0.767510
2012-03 -0.155900
2012-04 -0.098059
2012-05 0.607614
Freq: M, dtype: float64
In [185]: ps.to_timestamp()
Out[185]:
2012-01-01 -0.155942
2012-02-01 -0.767510
2012-03-01 -0.155900
2012-04-01 -0.098059
2012-05-01 0.607614
Freq: MS, dtype: float64
In [186]: prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
In [187]: prng
Out[187]:
PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
'1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
'1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
'1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
'1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
'1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
'1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
'2000Q3', '2000Q4'],
dtype='int64', freq='Q-NOV')
In [188]: ts = pd.Series(np.random.randn(len(prng)), prng)
In [189]: ts
Out[189]:
1990Q1 0.297797
1990Q2 0.283513
1990Q3 -0.322105
1990Q4 -1.541848
1991Q1 0.539559
1991Q2 0.161805
1991Q3 0.966456
1991Q4 0.228009
1992Q1 0.033140
1992Q2 -1.803392
1992Q3 0.795412
1992Q4 0.670945
1993Q1 1.443656
1993Q2 -1.191861
1993Q3 -0.747472
1993Q4 0.919639
1994Q1 1.414509
1994Q2 -0.087450
1994Q3 -1.556399
1994Q4 -1.028431
1995Q1 0.516376
1995Q2 -0.798248
1995Q3 -0.032770
1995Q4 0.127403
1996Q1 0.118407
1996Q2 0.983731
1996Q3 0.444699
1996Q4 1.305806
1997Q1 -0.445013
1997Q2 -0.130233
1997Q3 -1.544834
1997Q4 -0.737617
1998Q1 -0.481101
1998Q2 -0.694068
1998Q3 -1.945882
1998Q4 0.091352
1999Q1 0.097229
1999Q2 -0.422791
1999Q3 1.093277
1999Q4 -0.551056
2000Q1 -1.201843
2000Q2 0.270037
2000Q3 -0.256666
2000Q4 1.233440
Freq: Q-NOV, dtype: float64
In [190]: ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
In [191]: ts.head()
Out[191]:
1990-03-01 09:00 0.297797
1990-06-01 09:00 0.283513
1990-09-01 09:00 -0.322105
1990-12-01 09:00 -1.541848
1991-03-01 09:00 0.539559
Freq: H, dtype: float64
Categoricals
Plotting
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure(); df.plot(); plt.legend(loc='best')