练习书5-《Python数据科学手册》

本篇练习 pandas 的多级索引（MultiIndex，又称层级索引），属于较为进阶的内容。

代码

import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

# Walk-through of pandas hierarchical indexing (MultiIndex): building
# multi-indexed Series/DataFrames, selecting from them, reshaping with
# stack/unstack, and aggregating by index level. Every step prints its result.

np.random.seed(0)  # fixed seed so the random sample data below is reproducible
# Configure pandas display: truncate printed objects to 10 rows / 10 columns.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)


# --- A multiply indexed Series -------------------------------------------
# Older pandas releases offered Panel and Panel4D objects for three- and
# four-dimensional data (both have since been removed from pandas).
# In practice the more intuitive approach is hierarchical indexing (also
# called multi-indexing): several index levels are combined in one index,
# so higher-dimensional data can be represented compactly in a
# one-dimensional Series or a two-dimensional DataFrame.
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
population = [33871468, 37353956,
              18976457, 19378102,
              20851820, 25145561]
# The "bad way": plain Python tuples used as index keys.
pop = pd.Series(population, index=index)
print(pop)
# Slicing between two tuple keys works on the tuple-keyed index ...
print(pop[('California', 2010):('Texas', 2000)])
# ... but selecting every 2010 entry needs a clumsy Python-level scan.
print(pop[[i for i in pop.index if i[1] == 2010]])

# The better way: build a real MultiIndex from the same tuples.
index = pd.MultiIndex.from_tuples(index)
print(index)
# Re-indexing the Series with the MultiIndex yields the hierarchical form.
pop = pop.reindex(index)
print(pop)
# Partial indexing on the second level: all entries for 2010.
print(pop[:, 2010])
# unstack() pivots the inner index level into columns; stack() reverses it.
pop_df = pop.unstack()
print(pop_df)
print(pop_df.stack())
# Each extra "dimension" can simply become another DataFrame column.
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
print(pop_df)
# Fraction of the population under 18, shown as a state-by-year table.
f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18.unstack())

# --- Ways of constructing a MultiIndex -----------------------------------
# Passing a list of index arrays to the constructor builds a MultiIndex.
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
print(df)

# A dict keyed by tuples is also turned into a MultiIndex automatically.
# NOTE(review): the two California figures here differ from the `population`
# list above (33871648 vs 33871468, 37253956 vs 37353956) — one side is
# presumably a typo; verify against the original data source.
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
print(pd.Series(data))

# Explicit MultiIndex constructors; all four build the same index.
print(pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]))
print(pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)]))
print(pd.MultiIndex.from_product([['a', 'b'], [1, 2]]))
print(pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
                    codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
# Name the index levels so they can be referred to by label later on.
pop.index.names = ['state', 'year']
print(pop)

# --- MultiIndex on the columns -------------------------------------------
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# Mock measurements: random noise rounded to one decimal place; every other
# column (the 'HR' ones) is scaled by 10, then everything is shifted by 37.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)
# Selecting a top-level column label returns that subject's sub-frame.
print(health_data['Guido'])

# --- Indexing and slicing a MultiIndex -----------------------------------
print(pop)
print(pop['California', 2000])            # single element: both levels given
print(pop['California'])                  # partial indexing on the first level
print(pop.loc['California': 'New York'])  # label slice on the first level
print(pop[:, 2000])                       # partial indexing on the second level
print(pop[pop > 22000000])                # boolean mask
print(pop[['California', 'Texas']])       # fancy indexing with a label list

print(health_data['Guido', 'HR'])         # one column, addressed by both levels
print(health_data.iloc[:2, :2])           # positional selection
print(health_data.loc[:, ('Bob', 'HR')])  # tuple addresses a multi-level column
# Slices cannot be written inside a tuple key; pd.IndexSlice builds them.
idx = pd.IndexSlice
print(health_data.loc[idx[:, 1], idx[:, 'HR']])

# --- Rearranging multi-indices -------------------------------------------
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
# Sort first: the index above is built in 'a', 'c', 'b' order, and the
# partial label slice below is taken on the sorted index.
data = data.sort_index()
print(data)
print(data['a': 'b'])
print(pop.unstack(level=0))    # states become the columns
print(pop.unstack(level=1))    # years become the columns
print(pop.unstack().stack())   # round trip back to the original Series
# reset_index() turns the index levels into ordinary columns;
# set_index() rebuilds the MultiIndex from those columns.
pop_flat = pop.reset_index(name='population')
print(pop_flat)
print(pop_flat.set_index(['state', 'year']))

# --- Aggregating by index level ------------------------------------------
print(health_data)
# The `level=` argument of DataFrame.mean() was deprecated in pandas 1.3 and
# removed in pandas 2.0; grouping by the level is the supported equivalent.
data_mean = health_data.groupby(level='year').mean()
print(data_mean)
# Same aggregation along the columns: transpose so the 'type' column level
# becomes an index level, group on it, then transpose back.
type_mean = data_mean.T.groupby(level='type').mean().T
print(type_mean)

输出

(California, 2000)    33871468
(California, 2010)    37353956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64
(California, 2010)    37353956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64
(California, 2010)    37353956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64
MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
California    37353956
New York      19378102
Texas         25145561
dtype: int64
                2000      2010
California  33871468  37353956
New York    18976457  19378102
Texas       20851820  25145561
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
                    total  under18
California 2000  33871468  9267089
           2010  37353956  9284094
New York   2000  18976457  4687374
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2010  25145561  6879014
                2000      2010
California  0.273596  0.248544
New York    0.247010  0.222831
Texas       0.283251  0.273568
        data1     data2
a 1  0.548814  0.715189
  2  0.602763  0.544883
b 1  0.423655  0.645894
  2  0.437587  0.891773
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
state       year
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      36.0  37.4  38.0  38.5  45.0  37.1
     2      41.0  37.3  52.0  36.8  40.0  36.1
2014 1      11.0  37.7  46.0  36.3  60.0  35.5
     2      37.0  36.8  52.0  38.5  39.0  37.4
type          HR  Temp
year visit            
2013 1      38.0  38.5
     2      52.0  36.8
2014 1      46.0  36.3
     2      52.0  38.5
state       year
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
33871468
year
2000    33871468
2010    37353956
dtype: int64
state       year
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
dtype: int64
state
California    33871468
New York      18976457
Texas         20851820
dtype: int64
state       year
California  2000    33871468
            2010    37353956
Texas       2010    25145561
dtype: int64
state       year
California  2000    33871468
            2010    37353956
Texas       2000    20851820
            2010    25145561
dtype: int64
year  visit
2013  1        38.0
      2        52.0
2014  1        46.0
      2        52.0
Name: (Guido, HR), dtype: float64
subject      Bob      
type          HR  Temp
year visit            
2013 1      36.0  37.4
     2      41.0  37.3
year  visit
2013  1        36.0
      2        41.0
2014  1        11.0
      2        37.0
Name: (Bob, HR), dtype: float64
subject      Bob Guido   Sue
type          HR    HR    HR
year visit                  
2013 1      36.0  38.0  45.0
2014 1      11.0  46.0  60.0
char  int
a     1      0.359508
      2      0.437032
b     1      0.666767
      2      0.670638
c     1      0.697631
      2      0.060225
dtype: float64
char  int
a     1      0.359508
      2      0.437032
b     1      0.666767
      2      0.670638
dtype: float64
state  California  New York     Texas
year                                 
2000     33871468  18976457  20851820
2010     37353956  19378102  25145561
year            2000      2010
state                         
California  33871468  37353956
New York    18976457  19378102
Texas       20851820  25145561
state       year
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
        state  year  population
0  California  2000    33871468
1  California  2010    37353956
2    New York  2000    18976457
3    New York  2010    19378102
4       Texas  2000    20851820
5       Texas  2010    25145561
                 population
state      year            
California 2000    33871468
           2010    37353956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
           2010    25145561
subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      36.0  37.4  38.0  38.5  45.0  37.1
     2      41.0  37.3  52.0  36.8  40.0  36.1
2014 1      11.0  37.7  46.0  36.3  60.0  35.5
     2      37.0  36.8  52.0  38.5  39.0  37.4
subject   Bob        Guido          Sue       
type       HR   Temp    HR   Temp    HR   Temp
year                                          
2013     38.5  37.35  45.0  37.65  42.5  36.60
2014     24.0  37.25  49.0  37.40  49.5  36.45
type         HR       Temp
year                      
2013  42.000000  37.200000
2014  40.833333  37.033333

Process finished with exit code 0