1.DataFrame入门
数据集的创建
import numpy as np
import pandas as pd

# numpy must be imported here: the frames below are filled with
# np.random.randn samples, and the later import in section 2 comes too late
# when the file is executed top to bottom.

# Six consecutive daily timestamps starting at 2014-07-29; used as row labels.
dates = pd.date_range('20140729', periods=6)
dates

# 6 x 4 frame of standard-normal samples, rows labelled by date, columns A-D.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

# A frame can also be built from a dict mapping column name -> values.
df2 = pd.DataFrame({'A': np.random.randn(6)})
df2

# Dict values may mix a scalar with a Series; the scalar Timestamp is
# broadcast to the length of the Series (one row here).
df3 = pd.DataFrame({'A': pd.Timestamp('20140729'), 'B': pd.Series(1)})
df3

# Basic inspection of the first frame.
df.dtypes      # dtype of each column
df.head(3)     # first three rows
df.tail(2)     # last two rows
df.index       # row labels
DatetimeIndex(['2014-07-29', '2014-07-30', '2014-07-31', '2014-08-01',
'2014-08-02', '2014-08-03'],
dtype='datetime64[ns]', freq='D')
# Column labels of the frame.
df.columns
Index([u'A', u'B', u'C', u'D'], dtype='object')
# The frame's data as a plain NumPy array (row and column labels dropped).
df.values
array([[ 1.74785641, 1.49122288, -0.19078369, 0.00449092],
[ 0.03249352, 1.39113781, -1.13572906, 1.46444988],
[-1.12609255, 1.3929994 , 2.53144664, 0.98242057],
[-0.79119008, -1.04549646, 1.00945668, 0.68014356],
[-1.23060363, -0.22338443, 1.14097936, 0.12460565],
[-0.09212438, -0.33446273, 1.43101278, -1.38077278]])
# Per-column summary statistics.
df.describe()
# Transpose: rows become columns and vice versa.
df.T
# Rows reordered by the values in column C, smallest first; df itself is not modified.
df.sort_values(['C'], ascending=True)
2.DataFrame切片操作
import numpy as np
import pandas as pd
# Row labels: six consecutive calendar days beginning 2014-07-29.
dates = pd.date_range(start='20140729', periods=6)

# Random 6 x 4 frame used by every slicing example that follows.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

# Plain [] with a column label returns that column as a Series.
df['A']
2014-07-29 -0.405424
2014-07-30 -0.122791
2014-07-31 -0.590013
2014-08-01 -0.587685
2014-08-02 1.134898
2014-08-03 -0.292489
Freq: D, Name: A, dtype: float64
# Plain [] with an integer slice selects rows by position (2nd and 3rd rows).
df[1:3]
# Plain [] with a date-string slice selects rows by label.
df['2014-07-30':'2014-08-02']
# loc with a single row label returns that row as a Series indexed by column.
df.loc[dates[0]]
A -0.405424
B 0.129631
C 0.664167
D -0.958164
Name: 2014-07-29 00:00:00, dtype: float64
# Use loc to select all rows, columns A and B
df.loc[:, ['A', 'B']]
# Date range of rows (label slice), columns A and B
df.loc['2014-07-31':'2014-08-03', ['A', 'B']]
# Single value: row 2014-07-31, column B
df.loc['2014-07-31', 'B']
-0.2958600899474903
# Use at to fetch the value in the first row, first column (by label)
df.at[dates[0], 'A']
-0.4054243882888962
# Extract the fourth row (position 3, zero-based)
df.iloc[3]
A -0.587685
B 0.082466
C -0.153058
D 1.259797
Name: 2014-08-01 00:00:00, dtype: float64
# Rows 4-5, columns 1-2 (counting from 1; iloc slices are zero-based and end-exclusive)
df.iloc[3:5, 0:2]
# Rows 2, 3 and 5; columns 1 and 3
df.iloc[[1,2,4], [0, 2]]
# All rows, columns 2 and 3
df.iloc[:, 1:3]
# Extract the single value at row 2, column 2
df.iloc[1, 1]
-2.2539911876511227
# iat extracts a single element by position and is more efficient for scalar access
df.iat[1, 1]
-2.2539911876511227