数据连接 merge
import pandas as pd
import numpy as np
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data2' : np.random.randint(0,10,3)})
print(df_obj1)
print(df_obj2)
data1 key
0 5 b
1 3 b
2 7 a
3 3 c
4 7 a
5 7 a
6 3 b
data2 key
0 9 a
1 3 b
2 8 d
默认将重叠列的列名作为“外键”进行连接
pd.merge(df_obj1, df_obj2)
on显示指定“外键”
pd.merge(df_obj1, df_obj2, on='key')
left_on,right_on分别指定左侧数据和右侧数据的“外键”
# 更改列名
df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')
“外连接”
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')
左连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')
右连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')
处理重复列名
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data' : np.random.randint(0,10,3)})
pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right'))
按索引连接
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)