一、基础
0.导入pandas
import pandas as pd
import numpy as np
1.查看版本
print(pd.__version__,end='\n')
0.25.1
series操作
2.不同方法创建series
从列表创建series
arr=[1,2,3,4]
s1=pd.Series(arr)
print(s1,end='\n')
0 1
1 2
2 3
3 4
dtype: int64
从ndarray创建series
arr1=np.array(arr)
s2=pd.Series(arr1)
print(s2,end='\n')
0 1
1 2
2 3
3 4
dtype: int32
从字典创建series
dict={'a':1,'b':2}
s3=pd.Series(dict)
print(s3)
a 1
b 2
dtype: int64
3.修改series索引
s3.index=['c','d']
print(s3)
c 1
d 2
dtype: int64
4.series纵向拼接
series无横向拼接
s4=s3.append(s1)
print(s4)
c 1
d 2
0 1
1 2
2 3
3 4
dtype: int64
5.series按指定索引删除元素
s4=s4.drop('c')
print(s4)
d 2
0 1
1 2
2 3
3 4
dtype: int64
6.修改指定索引元素
s4.index=['A','B','C','D','E']
s4['A']=6
s4
A 6
B 1
C 2
D 3
E 4
dtype: int64
7.按指定索引查找元素
s4['B']
1
8.切片操作
s4[:3]
A 6
B 1
C 2
dtype: int64
9.运算操作
加add减sub乘mul除div 索引不同以nan填充
s4.add(s4)
s4.sub(2)
s4.mul(s4)
s4.div(4)
A 1.50
B 0.25
C 0.50
D 0.75
E 1.00
dtype: float64
10.统计值:中位数、求和、最值、方差、标准差、均值
s4.median(axis=0)
s4.sum(0)
s4.max()
s4.min()
s4.var()
s4.std()
s4.mean()
3.2
dataframe操作
11.创建dataframe数据
以时间序列为索引通过numpy创建
index=pd.date_range('today',periods=10,freq='M')
num_arr=np.random.randn(len(index),4)
column=['A','B','C','D']
df=pd.DataFrame(num_arr,index=index,columns=column)
print(df)
A B C D
2019-12-31 16:52:16.149736 0.581617 -0.021979 -0.515586 1.465126
2020-01-31 16:52:16.149736 0.097726 0.638343 1.460869 -1.210854
2020-02-29 16:52:16.149736 -0.382305 1.126188 1.106719 -0.026886
2020-03-31 16:52:16.149736 0.161220 0.630511 -1.115112 0.843560
2020-04-30 16:52:16.149736 -0.601466 -2.132495 -0.128893 0.004075
2020-05-31 16:52:16.149736 0.179003 -0.601442 1.521162 -0.209871
2020-06-30 16:52:16.149736 -1.113282 2.251862 -1.231143 -0.543544
2020-07-31 16:52:16.149736 -1.465847 0.174940 0.396115 -1.602252
2020-08-31 16:52:16.149736 1.331310 0.355311 0.085278 1.010926
2020-09-30 16:52:16.149736 -0.093573 -0.036117 0.736916 0.596624
通过字典创建
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2 = pd.DataFrame(data, index=labels)
print(df2)
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
12.查看数据
head/tail/describe/info/index/columns/values/.T/sort_values(by=)
df2.head()
df.head(10)
df2.tail()
df2.tail(3)
df2.describe()
df2.info()
df2.index
df2.columns
df2.values
df2.T
df2.sort_values(by='age',ascending=False)
df2['age'].value_counts()
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
animal 10 non-null object
age 8 non-null float64
visits 10 non-null int64
priority 10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 280.0+ bytes
3.0 2
7.0 1
4.5 1
2.0 1
5.0 1
0.5 1
2.5 1
Name: age, dtype: int64
13.切片操作\查询操作、副本拷贝、是否为空,缺失多少空值
df2[:2]==df2.iloc[:2,:] #等价
df2.iloc[1:4,::2]
df2.iloc[2]==df2.iloc[2,:]#等价
df2.iat[3,2]=1 #根据下标修改元素值
df2.iat[3,2]
df2[df2.columns[2]]
df2.loc[:,df2.columns[2]]
df2.age==df2['age']
df2[['animal','age']]==df2.loc[:,['animal','age']] #等价
df3=df2.copy()
df3.isnull()
df3.isnull().sum(0)#查询每列空值多少
df3.isnull().sum(1)#查询每行空值多少
df3.dropna(how='any')#删除所有空值的行(any表示只要有一个空值就删除)
df3.isnull().sum()
animal 0
age 2
visits 0
priority 0
dtype: int64
14.添加数据
num=pd.Series(np.random.randn(len(df3)),index=df3.index) #索引要和要添加的数据保持一致,否者索引不一致以nan填充
df3['No']=num
print(df3)
animal age visits priority No
a cat 2.5 1 yes 1.663161
b cat 3.0 3 yes 0.391939
c snake 0.5 2 no -0.136955
d dog NaN 1 yes -0.253120
e dog 5.0 2 no -1.109632
f cat 2.0 3 no 0.444705
g snake 4.5 1 no 0.238463
h cat NaN 1 yes 0.153178
i dog 7.0 2 no 0.692350
j dog 3.0 1 no -1.069520
15.对指定索引行修改数据
df3.loc['f','g']=100 #对指定行列索引的位置赋值
print(df3)
animal age visits priority No g
a cat 2.5 1 yes 1.663161 NaN
b cat 3.0 3 yes 0.391939 NaN
c snake 0.5 2 no -0.136955 NaN
d dog NaN 1 yes -0.253120 NaN
e dog 5.0 2 no -1.109632 NaN
f cat 2.0 3 no 0.444705 100.0
g snake 4.5 1 no 0.238463 NaN
h cat NaN 1 yes 0.153178 NaN
i dog 7.0 2 no 0.692350 NaN
j dog 3.0 1 no -1.069520 NaN
df3.loc[['f','g']]=200#对指定索引行的位置统一修改内容
print(df3)
animal age visits priority No g
a cat 2.5 1 yes 1.663161 NaN
b cat 3.0 3 yes 0.391939 NaN
c snake 0.5 2 no -0.136955 NaN
d dog NaN 1 yes -0.253120 NaN
e dog 5.0 2 no -1.109632 NaN
f 200 200.0 200 200 200.000000 200.0
g 200 200.0 200 200 200.000000 200.0
h cat NaN 1 yes 0.153178 NaN
i dog 7.0 2 no 0.692350 NaN
j dog 3.0 1 no -1.069520 NaN
16.dataframe数据统计值计算和series类似,可以根据轴计算行,列的统计值
df3.mean() #行方向求均值,默认axis=0
df3.mean(axis=0)
df3.mean(axis=1)#列方向求均值
df3.mean(1)
df3.sum(0)
df3.sum(1)
df3.max()
df3.max(1)
df3.min()
df3.min(1)
#单列计算
df3['age'].mean()
52.625
17.字符串操作,series操作
s_str=pd.Series(['A', 'B', 'C', 'Aaba', 'Baca',
np.nan, 'CABA', 'dog', 'cat'])
s_str.str.lower()
s_str.str.upper()
s_str.str.title()
0 A
1 B
2 C
3 Aaba
4 Baca
5 NaN
6 Caba
7 Dog
8 Cat
dtype: object
dataframe中的字符串可单独选出来进行字符串操作
df_str=pd.DataFrame(s_str)
df_str.iloc[:,0].str.lower()
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
Name: 0, dtype: object
18.缺失值操作 填充、删除
df4=df3.copy()
df4.fillna(value=0)
df5=df3.copy()
df5.dropna(how='all')#删除所有值均为空值的行
df5.dropna(how='any')#删除所有包含任一空值的行
19.数据库连接类似操作
pd.merge(),默认为内连接(inner join),只保留连接键(key)取值相同的行
left = pd.DataFrame({'key': ['foo1', 'foo2'], 'one': [1, 2]})
right = pd.DataFrame({'key': ['foo2', 'foo3'], 'two': [4, 5]})
pd.merge(left,right,on='key')
20.文件操作,写入和读取
df3.to_csv('animal.csv') #dataframe转换成csv格式,会默认将索引也存为一列,所以再次读取的时候将第0列设为索引
pd.read_csv('animal.csv',index_col=0)
21.写入excel
df3.to_excel('animal.xlsx',sheet_name='sheet1')
df4=pd.read_excel('animal.xlsx','sheet1',index_col=0,na_values=['NA'])
print(df4)
animal age visits priority No g
a cat 2.5 1 yes 1.663161 NaN
b cat 3.0 3 yes 0.391939 NaN
c snake 0.5 2 no -0.136955 NaN
d dog NaN 1 yes -0.253120 NaN
e dog 5.0 2 no -1.109632 NaN
f 200 200.0 200 200 200.000000 200.0
g 200 200.0 200 200 200.000000 200.0
h cat NaN 1 yes 0.153178 NaN
i dog 7.0 2 no 0.692350 NaN
j dog 3.0 1 no -1.069520 NaN
进阶
22.建立一个以时间为索引,值为随机数的series
index=pd.date_range(start='2018-01-01',end='2018-12-31',freq='D')
s=pd.Series(np.random.rand(len(index)),index=index)
s
2018-01-01 0.063296
2018-01-02 0.342126
2018-01-03 0.089079
2018-01-04 0.257992
2018-01-05 0.901294
...
2018-12-27 0.083605
2018-12-28 0.567538
2018-12-29 0.123146
2018-12-30 0.813415
2018-12-31 0.768627
Freq: D, Length: 365, dtype: float64
23.统计每一个周三对应值的和
s[s.index.weekday==2].sum()
24.966913035758164
24.统计每个月的平均值
s.resample('M').mean()
2018-01-31 0.412672
2018-02-28 0.654985
2018-03-31 0.500802
2018-04-30 0.576454
2018-05-31 0.490485
2018-06-30 0.571304
2018-07-31 0.488079
2018-08-31 0.488980
2018-09-30 0.510299
2018-10-31 0.469717
2018-11-30 0.558550
2018-12-31 0.478610
Freq: M, dtype: float64
25.将series中的时间进行转换(秒转分)
s=pd.date_range('today',periods=100,freq='S')
ts=pd.Series(np.random.randint(0,500,len(s)),index=s)
ts
2019-12-09 16:52:20.507986 378
2019-12-09 16:52:21.507986 142
2019-12-09 16:52:22.507986 171
2019-12-09 16:52:23.507986 54
2019-12-09 16:52:24.507986 152
...
2019-12-09 16:53:55.507986 317
2019-12-09 16:53:56.507986 122
2019-12-09 16:53:57.507986 71
2019-12-09 16:53:58.507986 226
2019-12-09 16:53:59.507986 239
Freq: S, Length: 100, dtype: int32
ts.resample('Min').sum()
2019-12-09 16:52:00 9619
2019-12-09 16:53:00 14919
Freq: T, dtype: int32
26.世界时间标准
s=pd.date_range('today',periods=1,freq='D')
ts=pd.Series(np.random.randn(len(s)),s)
ts_utc=ts.tz_localize('UTC')
ts_utc
2019-12-09 16:52:20.883007+00:00 1.175586
Freq: D, dtype: float64
27.转换为上海所在时区
ts_utc.tz_convert('Asia/Shanghai')
2019-12-10 00:52:20.883007+08:00 1.175586
Freq: D, dtype: float64
28.不同时间表示方式的转换
rng = pd.date_range('1/1/2018', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)
2018-01-31 0.012957
2018-02-28 -0.749021
2018-03-31 0.504755
2018-04-30 0.963639
2018-05-31 1.975850
Freq: M, dtype: float64
ps = ts.to_period()
print(ps)
2018-01 0.012957
2018-02 -0.749021
2018-03 0.504755
2018-04 0.963639
2018-05 1.975850
Freq: M, dtype: float64
ps.to_timestamp()
2018-01-01 0.012957
2018-02-01 -0.749021
2018-03-01 0.504755
2018-04-01 0.963639
2018-05-01 1.975850
Freq: MS, dtype: float64
29.创建多重索引series
letters = ['A', 'B', 'C']
numbers = list(range(10))
mi = pd.MultiIndex.from_product([letters, numbers]) # 设置多重索引
s = pd.Series(np.random.rand(30), index=mi) # 随机数
s
A 0 0.135384
1 0.323504
2 0.155028
3 0.483293
4 0.992802
5 0.174233
6 0.323222
7 0.243424
8 0.488358
9 0.927360
B 0 0.699627
1 0.636198
2 0.427656
3 0.609661
4 0.229332
5 0.233044
6 0.759226
7 0.012801
8 0.817615
9 0.220503
C 0 0.280586
1 0.902414
2 0.006040
3 0.259447
4 0.585277
5 0.072649
6 0.278152
7 0.490477
8 0.746673
9 0.050695
dtype: float64
30.多重索引series查询、切片
s.loc[:,[1,3,6]]
A 1 0.323504
3 0.483293
6 0.323222
B 1 0.636198
3 0.609661
6 0.759226
C 1 0.902414
3 0.259447
6 0.278152
dtype: float64
s.loc[pd.IndexSlice[:'B',5:]]
A 5 0.174233
6 0.323222
7 0.243424
8 0.488358
9 0.927360
B 5 0.233044
6 0.759226
7 0.012801
8 0.817615
9 0.220503
dtype: float64
31.dataframe数据的多重索引
frame=pd.DataFrame(np.arange(12).reshape(6,2),index=[list('AAABBB'),list('123123')],columns=['hello','shiyanlou'])
print(frame)
hello shiyanlou
A 1 0 1
2 2 3
3 4 5
B 1 6 7
2 8 9
3 10 11
32.多重索引设置列名称
frame.index.names=['first','second']
print(frame)
hello shiyanlou
first second
A 1 0 1
2 2 3
3 4 5
B 1 6 7
2 8 9
3 10 11
33.多重索引分组求和
sum_group=frame.groupby('first').sum()
print(sum_group)
hello shiyanlou
first
A 6 9
B 24 27
34.dataframe行列转换
print(frame)
frame_stack=frame.stack()
print(frame_stack)
hello shiyanlou
first second
A 1 0 1
2 2 3
3 4 5
B 1 6 7
2 8 9
3 10 11
first second
A 1 hello 0
shiyanlou 1
2 hello 2
shiyanlou 3
3 hello 4
shiyanlou 5
B 1 hello 6
shiyanlou 7
2 hello 8
shiyanlou 9
3 hello 10
shiyanlou 11
dtype: int32
35.索引转换
print(frame)
frame_unstack=frame.unstack()
print(frame_unstack)
hello shiyanlou
first second
A 1 0 1
2 2 3
3 4 5
B 1 6 7
2 8 9
3 10 11
hello shiyanlou
second 1 2 3 1 2 3
first
A 0 2 4 1 3 5
B 6 8 10 7 9 11
36.按条件查找
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)
print(df)
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
37.单一条件查询
df_slice=df[df['age']>3]
print(df_slice)
animal age visits priority
e dog 5.0 2 no
g snake 4.5 1 no
i dog 7.0 2 no
df_slice=df[df.age>3]
print(df_slice)
animal age visits priority
e dog 5.0 2 no
g snake 4.5 1 no
i dog 7.0 2 no
38.多重条件查询
df_slice=df[(df['animal']=='cat')&(df['age']<3)]
print(df_slice)
animal age visits priority
a cat 2.5 1 yes
f cat 2.0 3 no
df_slice=df[(df.animal=='cat')&(df.age<3)]
print(df_slice)
animal age visits priority
a cat 2.5 1 yes
f cat 2.0 3 no
39.关键字查询
df_slice=df[df['animal'].isin(['cat','dog'])]
print(df_slice)
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
df_slice=df[df.animal.isin(['cat','dog'])]
print(df_slice)
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
40.按照标签及列名查询
df_slice=df.loc[df2.index[[3,4,8]],['animal','age']]
print(df_slice)
animal age
d dog NaN
e dog 5.0
i dog 7.0
41.多条件排序
df_slice_sort=df.sort_values(by=['age','visits'],ascending=[False,True])
print(df_slice_sort)
animal age visits priority
i dog 7.0 2 no
e dog 5.0 2 no
g snake 4.5 1 no
j dog 3.0 1 no
b cat 3.0 3 yes
a cat 2.5 1 yes
f cat 2.0 3 no
c snake 0.5 2 no
h cat NaN 1 yes
d dog NaN 3 yes
42.分组求和
df_sum_group=df4.groupby('animal').sum()
print(df_sum_group)
age visits No g
animal
200 400.0 400 400.000000 400.0
cat 5.5 5 2.208277 0.0
dog 15.0 6 -1.739923 0.0
snake 0.5 2 -0.136955 0.0
43.使用列表拼接多个DataFrame
temp_df1 = pd.DataFrame(np.random.randn(3, 4)) # 生成由随机数组成的 DataFrame 1
temp_df2 = pd.DataFrame(np.random.randn(3, 4)) # 生成由随机数组成的 DataFrame 2
temp_df3 = pd.DataFrame(np.random.randn(3, 4)) # 生成由随机数组成的 DataFrame 3
print(temp_df1)
print(temp_df2)
print(temp_df3)
pieces = [temp_df1, temp_df2, temp_df3]
df=pd.concat(pieces)
print(df)
0 1 2 3
0 -0.778653 1.585000 -1.144196 0.378433
1 0.904217 0.382427 -0.130134 -0.868192
2 -0.464003 -0.623726 -0.377522 -0.646894
0 1 2 3
0 -0.464497 1.317456 -0.929411 1.622154
1 -0.560659 -0.099662 -1.286837 -0.451764
2 1.108570 -0.673523 -1.240363 1.306632
0 1 2 3
0 -2.318692 0.166132 0.697082 -0.263406
1 -1.366781 0.035830 1.609133 -1.727403
2 1.503694 -0.575755 0.447589 1.310894
0 1 2 3
0 -0.778653 1.585000 -1.144196 0.378433
1 0.904217 0.382427 -0.130134 -0.868192
2 -0.464003 -0.623726 -0.377522 -0.646894
0 -0.464497 1.317456 -0.929411 1.622154
1 -0.560659 -0.099662 -1.286837 -0.451764
2 1.108570 -0.673523 -1.240363 1.306632
0 -2.318692 0.166132 0.697082 -0.263406
1 -1.366781 0.035830 1.609133 -1.727403
2 1.503694 -0.575755 0.447589 1.310894
44.找出表中和最小的列
np.random.seed(0)
df=pd.DataFrame(np.random.random(size=(5,10)),columns=list(np.arange(10)))
column_index=df.sum(0).idxmin()# idxmax(),idxmin()为series函数返回最大最小值的索引值
print('The index of column whose sum is min:{}\n'.format(column_index))
df_sum_min=df.loc[:,column_index]
print(df_sum_min)
The index of column whose sum is min:4
0 0.423655
1 0.071036
2 0.118274
3 0.018790
4 0.666767
Name: 4, dtype: float64
45.dataframe中每个元素减去每一行的平均值
mean=df.mean(axis=1)
df_sub_mean=df.sub(mean,axis=0)
print(df_sub_mean)
0 1 2 3 4 5 6 \
0 -0.066953 0.099423 -0.013003 -0.070883 -0.192111 0.030128 -0.178179
1 0.244382 -0.018448 0.020701 0.378253 -0.476307 -0.460214 -0.527125
2 0.398367 0.218907 -0.118772 0.200278 -0.461977 0.059670 -0.436898
3 -0.290884 0.218794 -0.099289 0.012994 -0.536650 0.062196 0.056656
4 -0.031517 0.046007 0.306606 -0.330799 0.275742 0.279613 -0.180642
7 8 9
0 0.276007 0.347896 -0.232325
1 0.285276 0.230813 0.322669
2 0.364418 -0.058403 -0.165589
3 0.061494 0.388308 0.126381
4 -0.262099 -0.075597 -0.027314
46.分组,并得到每一组中最大三个数之和
df6 = pd.DataFrame({'A':list('aaabbcaabcccbbc')
,'B':[12, 345, 3, 1, 45, 14, 4, 52, 54, 23, 235, 21, 57, 3, 87]})
print(df6)
A B
0 a 12
1 a 345
2 a 3
3 b 1
4 b 45
5 c 14
6 a 4
7 a 52
8 b 54
9 c 23
10 c 235
11 c 21
12 b 57
13 b 3
14 c 87
df_sum_largest_3=df6.groupby('A')['B'].nlargest(3)
print(df_sum_largest_3)
A
a 1 345
7 52
0 12
b 12 57
8 54
4 45
c 10 235
14 87
9 23
Name: B, dtype: int64
df6.groupby('A')['B'].nlargest(3).sum()
910
透视表 pivot_table
47.新建表将A,B,C列作为索引进行聚合
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
'B': ['A', 'B', 'C'] * 4,
'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D': np.random.randn(12),
'E': np.random.randn(12)})
print(df)
df_pivot_table=pd.pivot_table(df, index=['A', 'B'])
print(df_pivot_table)
A B C D E
0 one A foo -1.706270 -0.028182
1 one B foo 1.950775 0.428332
2 two C foo -0.509652 0.066517
3 three A bar -0.438074 0.302472
4 one B bar -1.252795 -0.634322
5 one C bar 0.777490 -0.362741
6 two A foo -1.613898 -0.672460
7 three B foo -0.212740 -0.359553
8 one C foo -0.895467 -0.813146
9 one A bar 0.386902 -1.726283
10 two B bar -0.510805 0.177426
11 three C bar -1.180632 -0.401781
D E
A B
one A -0.659684 -0.877232
B 0.348990 -0.102995
C -0.058988 -0.587944
three A -0.438074 0.302472
B -0.212740 -0.359553
C -1.180632 -0.401781
two A -1.613898 -0.672460
B -0.510805 0.177426
C -0.509652 0.066517
48.透视表按指定行进行聚合
将该 DataFrame 的 D 列聚合,按照 A,B 列为索引进行聚合,聚合的方式为默认求均值参数aggfunc=np.mean。
df_group_pivot_table=pd.pivot_table(df, values=['D'], index=['A', 'B'])\
.sort_values(by=['A','B'])#文本索引按首字母排序
print(df_group_pivot_table)
D
A B
one A -0.659684
B 0.348990
C -0.058988
three A -0.438074
B -0.212740
C -1.180632
two A -1.613898
B -0.510805
C -0.509652
#help(pd.pivot_table)#查看pandas透视表文档
49.上一题中 D 列聚合时,采用默认求均值的方法,若想使用更多的方式可以在 aggfunc 中实现
df_pivot=pd.pivot_table(df, values=['D'], index=['A', 'B'], aggfunc=[np.sum, len])
print(df_pivot)
sum len
D D
A B
one A -1.319368 2.0
B 0.697980 2.0
C -0.117976 2.0
three A -0.438074 1.0
B -0.212740 1.0
C -1.180632 1.0
two A -1.613898 1.0
B -0.510805 1.0
C -0.509652 1.0
50.透视表利用额外列进行辅助分割
D 列按照 A,B 列进行聚合时,若关心 C 列对 D 列的影响,可以加入 columns 值进行分析。
df_pivot=pd.pivot_table(df,values=['D'],index=['A', 'B'],
columns=['C'],aggfunc=np.sum)
print(df_pivot)
D
C bar foo
A B
one A 0.386902 -1.706270
B -1.252795 1.950775
C 0.777490 -0.895467
three A -0.438074 NaN
B NaN -0.212740
C -1.180632 NaN
two A NaN -1.613898
B -0.510805 NaN
C NaN -0.509652
51.透视表的缺失值处理
在透视表中由于不同的聚合方式,相应缺少的组合将为缺失值(NaN),可以加入 fill_value 对缺失值进行填充处理。
df_pivot_fillna=pd.pivot_table(df, values=['D'], index=['A', 'B'],
columns=['C'], aggfunc=np.sum, fill_value=0)
print(df_pivot_fillna)
D
C bar foo
A B
one A 0.386902 -1.706270
B -1.252795 1.950775
C 0.777490 -0.895467
three A -0.438074 0.000000
B 0.000000 -0.212740
C -1.180632 0.000000
two A 0.000000 -1.613898
B -0.510805 0.000000
C 0.000000 -0.509652
绝对类型
在数据的形式上主要包括数量型和性质型,数量型表示着数据可数范围可变,而性质型表示范围已经确定不可改变,绝对型数据就是性质型数据的一种。
52.绝对型数据定义:
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": [
'a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")
df.info()
print(df)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
id 6 non-null int64
raw_grade 6 non-null object
grade 6 non-null category
dtypes: category(1), int64(1), object(1)
memory usage: 202.0+ bytes
id raw_grade grade
0 1 a a
1 2 b b
2 3 b b
3 4 a a
4 5 a a
5 6 e e
53.对绝对型数据重命名
df["grade"].cat.categories = ["very good", "good", "very bad"]
print(df)
id raw_grade grade
0 1 a very good
1 2 b good
2 3 b good
3 4 a very good
4 5 a very good
5 6 e very bad
54.重新排列绝对型数据并补充相应的缺省值
df["grade"] = df["grade"].cat.set_categories(
["very bad", "bad", "medium", "good", "very good"])
print(df)
id raw_grade grade
0 1 a very good
1 2 b good
2 3 b good
3 4 a very good
4 5 a very good
5 6 e very bad
55.对绝对型数据进行排序、分组
df.sort_values(by='grade')
df.groupby('grade').size()
grade
very bad 1
bad 0
medium 0
good 2
very good 3
dtype: int64
数据清洗
常常我们得到的数据是不符合我们最终处理的数据要求,包括许多缺省值以及坏的数据,需要我们对数据进行清洗。
56.缺失值拟合
在FlightNumber中有数值缺失,其中数值按 10 递增,补充相应的缺失值使得数据完整,并让数据为 int 类型。
df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm',
'Budapest_PaRis', 'Brussels_londOn'],
'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )',
'12. Air France', '"Swiss Air"']})
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
print(df)
From_To FlightNumber RecentDelays Airline
0 LoNDon_paris 10045 [23, 47] KLM(!)
1 MAdrid_miLAN 10055 [] <Air France> (12)
2 londON_StockhOlm 10065 [24, 43, 87] (British Airways. )
3 Budapest_PaRis 10075 [13] 12. Air France
4 Brussels_londOn 10085 [67, 32] "Swiss Air"
57.数据拆分
其中From_to应该为两独立的两列From和To,将From_to依照_拆分为独立两列建立为一个新表。
temp = df.From_To.str.split('_', expand=True)
temp.columns = ['From', 'To']
print(temp)
From To
0 LoNDon paris
1 MAdrid miLAN
2 londON StockhOlm
3 Budapest PaRis
4 Brussels londOn
58.字符标准化:
其中注意到地点的名字都不规范(如:londON应该为London)需要对数据进行标准化处理。
temp['From'] = temp['From'].str.capitalize()
temp['To'] = temp['To'].str.capitalize()
59. 删除坏数据加入整理好的数据:
将最开始的 From_to 列删除,加入整理好的 From 和 to 列。
df = df.drop('From_To', axis=1)
df = df.join(temp)
print(df)
FlightNumber RecentDelays Airline From To
0 10045 [23, 47] KLM(!) London Paris
1 10055 [] <Air France> (12) Madrid Milan
2 10065 [24, 43, 87] (British Airways. ) London Stockholm
3 10075 [13] 12. Air France Budapest Paris
4 10085 [67, 32] "Swiss Air" Brussels London
60.去除多余字符:
如同 airline 列中许多数据有许多其他字符,会对后期的数据分析有较大影响,需要对这类数据进行修正。
df['Airline'] = df['Airline'].str.extract(
'([a-zA-Z\s]+)', expand=False).str.strip()
print(df)
FlightNumber RecentDelays Airline From To
0 10045 [23, 47] KLM London Paris
1 10055 [] Air France Madrid Milan
2 10065 [24, 43, 87] British Airways London Stockholm
3 10075 [13] Air France Budapest Paris
4 10085 [67, 32] Swiss Air Brussels London
61.格式规范:
在 RecentDelays 中记录的方式为列表类型,由于其长度不一,这会为后期数据分析造成很大麻烦。这里将 RecentDelays 的列表拆开,取出列表中的相同位置元素作为一列,若为空值即用 NaN 代替。
delays = df['RecentDelays'].apply(pd.Series)#将'RecentDelays'中的列表元素装换成n列,n为最大列表的长度,,空值以nan代替
print(delays)
0 1 2
0 23.0 47.0 NaN
1 NaN NaN NaN
2 24.0 43.0 87.0
3 13.0 NaN NaN
4 67.0 32.0 NaN
#新产生列的列名
delays.columns = ['delay_{}'.format(n)
for n in range(1, len(delays.columns)+1)]
df = df.drop('RecentDelays', axis=1).join(delays)
print(df)
FlightNumber Airline From To delay_1 delay_2 \
0 10045 KLM London Paris 23.0 47.0
1 10055 Air France Madrid Milan NaN NaN
2 10065 British Airways London Stockholm 24.0 43.0
3 10075 Air France Budapest Paris 13.0 NaN
4 10085 Swiss Air Brussels London 67.0 32.0
delay_3
0 NaN
1 NaN
2 87.0
3 NaN
4 NaN
数据预处理
62.信息区间划分:
班级一部分同学的数学成绩表,如下图所示
df=pd.DataFrame({'name':['Alice','Bob','Candy','Dany',\
'Ella','Frank','Grace','Jenny'],'grades':[58,83,79,65,93,45,61,88]})
但我们更加关心的是该同学是否及格,将该数学成绩按照是否>60来进行划分。
df = pd.DataFrame({'name': ['Alice', 'Bob', 'Candy', 'Dany', 'Ella',
'Frank', 'Grace', 'Jenny'],
'grades': [58, 83, 79, 65, 93, 45, 61, 88]})
def choice(x):
    """Binarize a numeric grade: return 1 when the score is above 60, else 0."""
    return 1 if x > 60 else 0
df.grades = pd.Series(map(lambda x: choice(x), df.grades))
print(df)
name grades
0 Alice 0
1 Bob 1
2 Candy 1
3 Dany 1
4 Ella 1
5 Frank 0
6 Grace 1
7 Jenny 1
63.数据去重
尝试将 A 列中连续重复的数据清除。
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
df1=df.loc[df['A'].shift() != df['A']]
print(df1)
A
0 1
1 2
3 3
4 4
5 5
8 6
9 7
64. 数据归一化:
有时候,DataFrame 中不同列之间的数据差距太大,需要对其进行归一化处理。
其中,Max-Min 归一化是简单而常见的一种方式,公式如下:
Y=(X−Xmin)/(Xmax−Xmin)
def normalization(df):
    """Max-Min normalize every column of *df* to [0, 1].

    Applies Y = (X - Xmin) / (Xmax - Xmin) column-wise and returns the
    resulting DataFrame; the input is left unmodified.
    """
    col_min = df.min()
    col_range = df.max().sub(col_min)
    return df.sub(col_min).div(col_range)
df = pd.DataFrame(np.random.random(size=(5, 3)))
print(df)
df_normalized=normalization(df)
print(df_normalized)
0 1 2
0 0.566601 0.265389 0.523248
1 0.093941 0.575946 0.929296
2 0.318569 0.667410 0.131798
3 0.716327 0.289406 0.183191
4 0.586513 0.020108 0.828940
0 1 2
0 0.759433 0.378929 0.490848
1 0.000000 0.858700 1.000000
2 0.360915 1.000000 0.000000
3 1.000000 0.416032 0.064443
4 0.791425 0.000000 0.874161
65.数据标准化
Y=(X−X.mean)/X.std
def standardlization(df):
    """Z-score standardize every column of *df*.

    Applies Y = (X - mean) / std column-wise (pandas' sample standard
    deviation, ddof=1) and returns the resulting DataFrame; the input is
    left unmodified.
    """
    centered = df.sub(df.mean())
    return centered.div(df.std())
df = pd.DataFrame(np.random.random(size=(5, 3)))
print(df)
df_standardlized=standardlization(df)
print(df_standardlized)
0 1 2
0 0.004695 0.677817 0.270008
1 0.735194 0.962189 0.248753
2 0.576157 0.592042 0.572252
3 0.223082 0.952749 0.447125
4 0.846409 0.699479 0.297437
0 1 2
0 -1.335508 -0.583492 -0.700914
1 0.729611 1.091905 -0.854331
2 0.280014 -1.088839 1.480668
3 -0.718131 1.036291 0.577511
4 1.044014 -0.455865 -0.502933
66.Pandas 绘图操作
为了更好的了解数据包含的信息,最直观的方法就是将其绘制成图。
#series可视化
%matplotlib inline
import matplotlib.pyplot as plt
ts = pd.Series(np.random.randn(100), index=pd.date_range('today', periods=100))
ts = ts.cumsum()
ts.plot()
plt.show()

#dataframe折线图
df = pd.DataFrame(np.random.randn(100, 4), index=ts.index,
columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
df.plot()
plt.show()

散点图、柱状图
import matplotlib.pyplot as plt
df = pd.DataFrame({"xs": [1, 5, 2, 8, 1], "ys": [4, 2, 1, 9, 6]})
df = df.cumsum()
df.plot.scatter("xs", "ys", color='red', marker="*")
plt.show()

df = pd.DataFrame({"revenue": [57, 68, 63, 71, 72, 90, 80, 62, 59, 51, 47, 52],
"advertising": [2.1, 1.9, 2.7, 3.0, 3.6, 3.2, 2.7, 2.4, 1.8, 1.6, 1.3, 1.9],
"month": range(12)
})
ax = df.plot.bar("month", "revenue", color="yellow")
df.plot("month", "advertising", secondary_y=True, ax=ax)
plt.show()
