#合并——merge
#Pandas具有全功能的,高性能内存中连接操作,与SQL等关系数据库非常相似
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, sort=True,
suffixes=('_x', '_y'), copy=True, indicator=False)
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
[output]:
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
pd.merge(df3, df4)
pd.merge(df3, df4,on=['key1','key2'], how = 'inner')
[output]:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
pd.merge(df3, df4, on=['key1','key2'], how = 'outer')
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
pd.merge(df3, df4, on=['key1','key2'], how = 'left')
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
pd.merge(df3, df4, on=['key1','key2'], how = 'right')
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
# 参数 left_on, right_on, left_index, right_index → 当键不为一个列时,可以单独设置左键与右键
# df1以‘key’为键,df2以index为键
# left_index:为True时,第一个df以index为键,默认False
# right_index:为True时,第二个df以index为键,默认False
# 所以left_on, right_on, left_index, right_index可以相互组合:
# left_on + right_on, left_on + right_index, left_index + right_on, left_index + right_index
df1 = pd.DataFrame({'lkey':list('bbacaab'),
'data1':range(7)})
df2 = pd.DataFrame({'rkey':list('abd'),
'date2':range(3)})
pd.merge(df1, df2, left_on='lkey', right_on='rkey')
data1 lkey date2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
df1 = pd.DataFrame({'key':list('abcdfeg'),
'data1':range(7)})
df2 = pd.DataFrame({'date2':range(100,105)},
index = list('abcde'))
pd.merge(df1, df2, left_on='key', right_index=True)
data1 key date2
0 0 a 100
1 1 b 101
2 2 c 102
3 3 d 103
5 5 e 104
# 参数 sort
df1 = pd.DataFrame({'key':list('bbacaab'),
'data1':[1,3,2,4,5,9,7]})
df2 = pd.DataFrame({'key':list('abd'),
'date2':[11,2,33]})
x2 = pd.merge(df1,df2, on = 'key', sort=True, how = 'outer')
[output]:
data1 key date2
0 2.0 a 11.0
1 5.0 a 11.0
2 9.0 a 11.0
3 1.0 b 2.0
4 3.0 b 2.0
5 7.0 b 2.0
6 4.0 c NaN
7 NaN d 33.0
#x2.sort_values('data1')
# 合并 join
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
pd.merge(df1, df2, left_index=True, right_index=True, suffixes=('_1', '_2'))
#等价
df1.join(df2['date2'])
left.join(right, on = 'key'))
# 等价
pd.merge(left, right, left_on='key', right_index=True, how='left', sort=False)
连接 - 沿轴执行连接操作
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
keys=None, levels=None, names=None, verify_integrity=False,
copy=True)
s3 = pd.Series([1,2,3],index = ['a','c','h'])
s4 = pd.Series([2,3,4],index = ['b','e','d'])
pd.concat([s3,s4]))
[output]:
a 1
c 2
h 3
b 2
e 3
d 4
dtype: int64
pd.concat([s3,s4]).sort_index()
[output]:
a 1
b 2
c 2
d 4
e 3
h 3
dtype: int64
pd.concat([s3,s4], axis=1) # 列 + 列
[output]:
0 1
a 1.0 NaN
b NaN 2.0
c 2.0 NaN
d NaN 4.0
e NaN 3.0
h 3.0 NaN
pd.concat([[s3,s4], axis= 1, join='inner')
[output]:
0 1
b 2 2
c 3 3
pd.concat([s3,s4], axis= 1, join_axes=[['a','b','d']])
[output]:
0 1
a 1.0 NaN
b 2.0 2.0
d NaN 4.0
sre = pd.concat([s3,s4], keys = ['one','two'])
[output]:
one a 1
b 2
c 3
two b 2
c 3
d 4
dtype: int64 <class 'pandas.core.series.Series'>
sre = pd.concat([s5,s6], keys = ['one','two'], axis=1)
[output]:
one two
a 1.0 NaN
b 2.0 2.0
c 3.0 3.0
d NaN 4.0 <class 'pandas.core.frame.DataFrame'>
# 修补 pd.combine_first()
df1 = pd.DataFrame([[np.nan, 3., 5.], [-4.6, np.nan, np.nan],[np.nan, 7., np.nan]])
df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]],index=[1, 2])
0 1 2
0 NaN 3.0 5.0
1 -4.6 NaN NaN
2 NaN 7.0 NaN
0 1 2
1 -42.6 NaN -8.2
2 -5.0 1.6 4.0
df1.combine_first(df2)
# 根据index,df1的空值被df2替代
# 如果df2的index多于df1,则更新到df1上,比如index=['a',1]
0 1 2
0 NaN 3.0 5.0
1 -4.6 NaN -8.2
2 -5.0 7.0 4.0
df1.update(df2)
print(df1)
# update,直接df2覆盖df1,相同index位置
0 1 2
0 NaN 3.0 5.0
1 -42.6 NaN -8.2
2 -5.0 1.6 4.0