pandas合并数据集
代码
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
np.random.seed(0)
# 配置pandas显示
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
# 合并数据集:Concat与Append操作
def make_df(cols, ind):
data = {c: [str(c) + str(i) for i in ind]
for c in cols}
return pd.DataFrame(data, ind)
print(make_df('ABC', range(3)))
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
print(pd.concat([ser1, ser2]))
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1)
print(df2)
print(pd.concat([df1, df2]))
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(pd.concat([df3, df4], axis=1))
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index
print(x); print(y)
print(pd.concat([x, y]))
print(pd.concat([x, y], ignore_index=True))
print(pd.concat([x, y], keys=['x', 'y']))
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5)
print(df6)
print(pd.concat([df5, df6]))
print(pd.concat([df5, df6], join='inner'))
# print(pd.concat([df5, df6], join_axes=[df5.columns]))
# Pandas的append()不直接更新原有对象的值,而是为合并后的数据创建一个新对象。
# 因此,它不能被称之为一个非常高效的解决方案,因为每次合并都需要重新创建索引和数据缓存。
# 总之,如果你需要进行多个append 操作,还是建议先创建一个DataFrame列表,
# 然后用 concat() 函数一次 性解决所有合并任务。
print(df1.append(df2))
# pd.merge()实现的功能基于关系代数(relational algebra)的一部分。
# 关系代数是处理关系型数据的通用理论,
# 绝大部分数据库的可用操作都以此为理论基础。
# pd.merge() 函数实现了三种数据连接的类型:一对一、多对一和多对多。
# 一对一
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
print(df1)
print(df2)
df3 = pd.merge(df1, df2)
print(df3)
# 多对一
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})
print(pd.merge(df3, df4))
# 多对多
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting', 'Engineering', 'Engineering',
'HR', 'HR'],
'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
print(pd.merge(df1, df5))
# 设置数据合并的键
print(pd.merge(df1, df2, on='employee'))
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
print(pd.merge(df1, df3, left_on='employee', right_on='name').drop('name', axis=1))
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(pd.merge(df1a, df2a, left_index=True, right_index=True))
print(df1a.join(df2a))
print(pd.merge(df1a, df3, left_index=True, right_on='name'))
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
print(pd.merge(df6, df7))
print(pd.merge(df6, df7, how='inner'))
print(pd.merge(df6, df7, how='outer'))
print(pd.merge(df6, df7, how='left'))
print(pd.merge(df6, df7, how='right'))
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
print(pd.merge(df8, df9, on='name'))
print(pd.merge(df8, df9, on='name', suffixes=['_L', '_R']))
输出
A B C
0 A0 B0 C0
1 A1 B1 C1
2 A2 B2 C2
1 A
2 B
3 C
4 D
5 E
6 F
dtype: object
A B
1 A1 B1
2 A2 B2
A B
3 A3 B3
4 A4 B4
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
A B
0 A0 B0
1 A1 B1
A B
0 A2 B2
1 A3 B3
A B
0 A0 B0
1 A1 B1
0 A2 B2
1 A3 B3
A B
0 A0 B0
1 A1 B1
2 A2 B2
3 A3 B3
A B
x 0 A0 B0
1 A1 B1
y 0 A2 B2
1 A3 B3
A B C
1 A1 B1 C1
2 A2 B2 C2
B C D
3 B3 C3 D3
4 B4 C4 D4
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4
B C
1 B1 C1
2 B2 C2
3 B3 C3
4 B4 C4
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
employee group hire_date supervisor
0 Bob Accounting 2008 Carly
1 Jake Engineering 2012 Guido
2 Lisa Engineering 2004 Guido
3 Sue HR 2014 Steve
employee group skills
0 Bob Accounting math
1 Bob Accounting spreadsheets
2 Jake Engineering coding
3 Jake Engineering linux
4 Lisa Engineering coding
5 Lisa Engineering linux
6 Sue HR spreadsheets
7 Sue HR organization
employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
employee group salary
0 Bob Accounting 70000
1 Jake Engineering 80000
2 Lisa Engineering 120000
3 Sue HR 90000
group hire_date
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
group hire_date
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
group name salary
0 Accounting Bob 70000
1 Engineering Jake 80000
2 Engineering Lisa 120000
3 HR Sue 90000
name food drink
0 Mary bread wine
name food drink
0 Mary bread wine
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
3 Joseph NaN beer
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
name food drink
0 Mary bread wine
1 Joseph NaN beer
name rank_x rank_y
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
name rank_L rank_R
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2