pandas 多级索引(MultiIndex,也称层级索引)属于进阶内容,下面给出示例代码及其运行输出。
代码
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
# Seed NumPy's global RNG so the random demo data below is reproducible.
np.random.seed(0)

# Limit how many rows / columns pandas prints.
for _option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option, 10)

# Hierarchical indexing (a.k.a. multi-indexing) packs several index levels
# into a single index, so higher-dimensional data can be represented with the
# familiar one-dimensional Series and two-dimensional DataFrame objects.
# (Early pandas releases offered Panel / Panel4D for 3-D and 4-D data; those
# classes have since been removed from pandas.)

# First attempt: plain Python tuples used as index labels.
states = ['California', 'New York', 'Texas']
years = [2000, 2010]
index = [(state, year) for state in states for year in years]
population = [
    33871468, 37353956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(population, index=index)
print(pop)

# Tuple labels can be sliced directly ...
first_key, last_key = ('California', 2010), ('Texas', 2000)
print(pop[first_key:last_key])
# ... but selecting every 2010 entry needs a hand-written filter over the labels.
keys_2010 = [key for key in pop.index if key[1] == 2010]
print(pop[keys_2010])

# Turn the tuples into a real MultiIndex and re-index the Series with it.
index = pd.MultiIndex.from_tuples(index)
print(index)
pop = pop.reindex(index)
print(pop)
# With a MultiIndex the second level can be selected on directly.
print(pop[:, 2010])

# unstack() pivots the inner index level into columns; stack() reverses it.
pop_df = pop.unstack()
print(pop_df)
print(pop_df.stack())

# Add a second column that shares the same MultiIndex.
under18 = [
    9267089, 9284094,
    4687374, 4318033,
    5906301, 6879014,
]
pop_df = pd.DataFrame({'total': pop, 'under18': under18})
print(pop_df)

# Fraction of the population under 18, shown as a state x year table.
f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18.unstack())
# Passing two parallel label lists as the index gives the DataFrame a
# two-level MultiIndex.
outer_labels = ['a', 'a', 'b', 'b']
inner_labels = [1, 2, 1, 2]
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[outer_labels, inner_labels],
    columns=['data1', 'data2'],
)
print(df)

# A dict keyed by tuples is likewise turned into a multi-indexed Series.
# NOTE(review): the California figures here differ from the ones in the
# `population` list earlier in this script -- confirm which set is intended.
data = {
    ('California', 2000): 33871648,
    ('California', 2010): 37253956,
    ('Texas', 2000): 20851820,
    ('Texas', 2010): 25145561,
    ('New York', 2000): 18976457,
    ('New York', 2010): 19378102,
}
print(pd.Series(data))

# Explicit MultiIndex constructors; each call below builds the same index.
print(pd.MultiIndex.from_arrays([outer_labels, inner_labels]))
print(pd.MultiIndex.from_tuples(list(zip(outer_labels, inner_labels))))
print(pd.MultiIndex.from_product([['a', 'b'], [1, 2]]))
# Lowest-level form: the distinct labels per level plus integer codes into them.
print(pd.MultiIndex(
    levels=[['a', 'b'], [1, 2]],
    codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
))
# Label the two index levels so they can be referred to by name later on.
level_names = ['state', 'year']
pop.index.names = level_names
print(pop)
# Hierarchical columns: a DataFrame whose rows AND columns are multi-indexed.
visit_levels = [[2013, 2014], [1, 2]]
measurement_levels = [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']]
index = pd.MultiIndex.from_product(visit_levels, names=['year', 'visit'])
columns = pd.MultiIndex.from_product(measurement_levels,
                                     names=['subject', 'type'])

# Mock readings: standard-normal noise rounded to one decimal place.
data = np.round(np.random.randn(4, 6), 1)
# Every second column (0, 2, 4 -- these line up with the 'HR' columns) is
# scaled by 10 before the whole array is shifted by 37.
data[:, ::2] *= 10
data += 37

health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)
# Selecting on the top column level returns that subject's sub-frame.
print(health_data['Guido'])
# Indexing and slicing with a MultiIndex.
print(pop)
# A full (state, year) key selects a single value.
print(pop[('California', 2000)])
# A partial key (outer level only) returns the sub-Series for that state.
print(pop['California'])
# Label slice on the outer level.
outer_range = slice('California', 'New York')
print(pop.loc[outer_range])
# An empty slice on the outer level selects on the inner level only.
print(pop[:, 2000])
# Boolean-mask selection.
over_22m = pop > 22000000
print(pop[over_22m])
# Selection with a list of outer-level labels.
print(pop[['California', 'Texas']])

# For the DataFrame, plain [] indexing addresses the (multi-level) columns.
print(health_data[('Guido', 'HR')])
# Positional selection: first two rows, first two columns.
print(health_data.iloc[:2, :2])
# .loc accepts a tuple to address a single multi-level column.
bob_hr = ('Bob', 'HR')
print(health_data.loc[:, bob_hr])
# IndexSlice builds the per-level slice tuples for .loc.
idx = pd.IndexSlice
first_visits = idx[:, 1]
heart_rates = idx[:, 'HR']
print(health_data.loc[first_visits, heart_rates])
# Rearranging multi-indices.
# Build a Series whose outer level ('a', 'c', 'b') is not in sorted order,
# then sort the index before label-slicing it.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]],
                                   names=['char', 'int'])
data = pd.Series(np.random.rand(6), index=index).sort_index()
print(data)
print(data['a':'b'])
# unstack(level=k) moves index level k into the columns.
for level in (0, 1):
    print(pop.unstack(level=level))

# stack() undoes unstack(), giving back the multi-indexed Series.
round_trip = pop.unstack().stack()
print(round_trip)

# reset_index() turns the index levels into ordinary columns; `name` labels
# the value column.
pop_flat = pop.reset_index(name='population')
print(pop_flat)
# set_index() with those columns rebuilds the MultiIndex.
index_columns = ['state', 'year']
print(pop_flat.set_index(index_columns))
# Data aggregation on multi-indices.
print(health_data)

# Average the visits within each year.  The `level=` keyword of
# DataFrame.mean() was deprecated in pandas 1.3 and removed in 2.0, so the
# per-level reduction is expressed with groupby(level=...) instead.
data_mean = health_data.groupby(level='year').mean()
print(data_mean)

# Average across subjects for each measurement type.  The grouping is on a
# *column* level, so transpose, group on the (now row) level 'type', and
# transpose back; this avoids the deprecated groupby(axis=1).
print(data_mean.T.groupby(level='type').mean().T)
输出
(California, 2000) 33871468
(California, 2010) 37353956
(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
(Texas, 2010) 25145561
dtype: int64
(California, 2010) 37353956
(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
dtype: int64
(California, 2010) 37353956
(New York, 2010) 19378102
(Texas, 2010) 25145561
dtype: int64
MultiIndex([('California', 2000),
('California', 2010),
( 'New York', 2000),
( 'New York', 2010),
( 'Texas', 2000),
( 'Texas', 2010)],
)
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
California 37353956
New York 19378102
Texas 25145561
dtype: int64
2000 2010
California 33871468 37353956
New York 18976457 19378102
Texas 20851820 25145561
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
total under18
California 2000 33871468 9267089
2010 37353956 9284094
New York 2000 18976457 4687374
2010 19378102 4318033
Texas 2000 20851820 5906301
2010 25145561 6879014
2000 2010
California 0.273596 0.248544
New York 0.247010 0.222831
Texas 0.283251 0.273568
data1 data2
a 1 0.548814 0.715189
2 0.602763 0.544883
b 1 0.423655 0.645894
2 0.437587 0.891773
California 2000 33871648
2010 37253956
Texas 2000 20851820
2010 25145561
New York 2000 18976457
2010 19378102
dtype: int64
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year visit
2013 1 36.0 37.4 38.0 38.5 45.0 37.1
2 41.0 37.3 52.0 36.8 40.0 36.1
2014 1 11.0 37.7 46.0 36.3 60.0 35.5
2 37.0 36.8 52.0 38.5 39.0 37.4
type HR Temp
year visit
2013 1 38.0 38.5
2 52.0 36.8
2014 1 46.0 36.3
2 52.0 38.5
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
33871468
year
2000 33871468
2010 37353956
dtype: int64
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
dtype: int64
state
California 33871468
New York 18976457
Texas 20851820
dtype: int64
state year
California 2000 33871468
2010 37353956
Texas 2010 25145561
dtype: int64
state year
California 2000 33871468
2010 37353956
Texas 2000 20851820
2010 25145561
dtype: int64
year visit
2013 1 38.0
2 52.0
2014 1 46.0
2 52.0
Name: (Guido, HR), dtype: float64
subject Bob
type HR Temp
year visit
2013 1 36.0 37.4
2 41.0 37.3
year visit
2013 1 36.0
2 41.0
2014 1 11.0
2 37.0
Name: (Bob, HR), dtype: float64
subject Bob Guido Sue
type HR HR HR
year visit
2013 1 36.0 38.0 45.0
2014 1 11.0 46.0 60.0
char int
a 1 0.359508
2 0.437032
b 1 0.666767
2 0.670638
c 1 0.697631
2 0.060225
dtype: float64
char int
a 1 0.359508
2 0.437032
b 1 0.666767
2 0.670638
dtype: float64
state California New York Texas
year
2000 33871468 18976457 20851820
2010 37353956 19378102 25145561
year 2000 2010
state
California 33871468 37353956
New York 18976457 19378102
Texas 20851820 25145561
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
state year population
0 California 2000 33871468
1 California 2010 37353956
2 New York 2000 18976457
3 New York 2010 19378102
4 Texas 2000 20851820
5 Texas 2010 25145561
population
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year visit
2013 1 36.0 37.4 38.0 38.5 45.0 37.1
2 41.0 37.3 52.0 36.8 40.0 36.1
2014 1 11.0 37.7 46.0 36.3 60.0 35.5
2 37.0 36.8 52.0 38.5 39.0 37.4
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year
2013 38.5 37.35 45.0 37.65 42.5 36.60
2014 24.0 37.25 49.0 37.40 49.5 36.45
type HR Temp
year
2013 42.000000 37.200000
2014 40.833333 37.033333
Process finished with exit code 0