练习书4-《Python数据科学手册》

pandas的索引对齐和缺失值处理

代码

import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

# Seed NumPy's global generator (the examples below draw from `rng` instead).
np.random.seed(0)
# Configure pandas display: show at most 10 rows and 10 columns when printing.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10


# Unary operations (e.g. exponential and trigonometric functions) preserve the
# index and column labels of their input in the output.
# Binary operations (e.g. addition and multiplication) make Pandas align the
# operands on their indices before the ufunc is applied.
# Keeping the context of the data and combining data from different sources,
# both error-prone with raw NumPy arrays, thereby become a strength of Pandas.
rng = np.random.RandomState(seed=42)
ser = pd.Series(rng.randint(0, 10, size=4))
print(ser)

df = pd.DataFrame(rng.randint(0, 10, size=(3, 4)), columns=list('ABCD'))
print(df)
# Ufuncs: index preservation
print(np.exp(ser))
print(np.sin(df.mul(np.pi).div(4)))
# Ufuncs: index alignment
#
# A binary operation between two Series is computed on the union of both
# indices; a label that exists in only one operand yields NaN in the result.
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')
print(population / area)
# Use Index.union() instead of the `|` operator: `|` as a set operation on
# Index objects was deprecated, and newer pandas treats it as an element-wise
# logical OR, which is not what is wanted for string labels.
print(area.index.union(population.index))

# Two integer-labelled Series whose indices only partly overlap (labels 1, 2).
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])
# Plain addition: labels found in only one operand (0 and 3) become NaN.
print(A.add(B))
# With fill_value=0 an entry missing from one operand is treated as 0.
print(A.add(B, fill_value=0))

# DataFrames align on both the row index and the column labels.
A = pd.DataFrame(rng.randint(0, 20, size=(2, 2)), columns=['A', 'B'])
print(A)
# Mean over every value of A (stack() turns the frame into a single Series);
# used below as the stand-in for entries missing from one operand.
fill = A.stack().mean()
B = pd.DataFrame(rng.randint(0, 10, size=(3, 3)), columns=['B', 'A', 'C'])

print(B)
# Column order is irrelevant; cells missing from either frame become NaN.
print(A.add(B))
print(A.add(B, fill_value=fill))

# NumPy broadcasting: subtracting the first row from a 2-D array is applied
# to every row.
A = rng.randint(0, 10, size=(3, 4))
print(A)
print(np.subtract(A, A[0]))

# The same subtraction on a DataFrame is also applied row by row by default.
df = pd.DataFrame(A, columns=['Q', 'R', 'S', 'T'])
print(df.sub(df.iloc[0]))
# Pass the index axis explicitly to subtract a column from every column.
print(df.sub(df['R'], axis='index'))

# A row slice containing only every second column (Q and S): the subtraction
# aligns on column labels, so columns absent from the slice become NaN.
halfrow = df.iloc[0, ::2]
print(halfrow)
print(df.sub(halfrow))

# Handling missing values
# In a numeric Series both np.nan and None are stored as NaN (float64).
print(pd.Series([1, np.nan, 2, None]))
# An integer Series cannot represent a missing value, so it has to become
# float before one is inserted. Cast explicitly: relying on item assignment to
# upcast the dtype silently is deprecated in recent pandas (FutureWarning,
# slated to become an error).
x = pd.Series(range(2), dtype=int).astype(float)
x[0] = None
print(x)
# Detecting missing values
data = pd.Series([1, np.nan, 'hello', None])
# Boolean mask marking the missing entries (both NaN and None count).
print(data.isna())
# Boolean indexing with the inverse mask keeps only the present entries.
print(data.loc[data.notna()])
# Dropping missing values
print(data.dropna())
# A 3x3 frame with one NaN in row 0 and one in row 2.
df = pd.DataFrame([
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
])
print(df)
# Default behaviour spelled out: drop every row containing any missing value.
print(df.dropna(axis='index', how='any'))
# Drop every column containing any missing value.
print(df.dropna(axis=1))
# Append a column (label 3) that consists entirely of NaN.
df[3] = np.nan
print(df)
# how='all' drops only the columns whose values are all missing.
print(df.dropna(axis=1, how='all'))
# thresh=3 keeps only the rows with at least three non-missing values.
print(df.dropna(axis=0, thresh=3))
# Filling missing values
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(data)
# Replace every missing entry with a constant.
print(data.fillna(0))
# Forward fill: propagate the previous valid value. Series.ffill()/bfill()
# replace the deprecated fillna(method='ffill'/'bfill') spelling.
print(data.ffill())
# Backward fill: propagate the next valid value.
print(data.bfill())

输出

0    6
1    3
2    7
3    4
dtype: int64
   A  B  C  D
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4
0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64
          A             B         C             D
0 -1.000000  7.071068e-01  1.000000 -1.000000e+00
1 -0.707107  1.224647e-16  0.707107 -7.071068e-01
2 -0.707107  1.000000e+00 -0.707107  1.224647e-16
Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64
Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')
0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64
0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64
   A   B
0  1  11
1  5   1
   B  A  C
0  4  0  9
1  5  8  0
2  9  2  6
      A     B   C
0   1.0  15.0 NaN
1  13.0   6.0 NaN
2   NaN   NaN NaN
      A     B     C
0   1.0  15.0  13.5
1  13.0   6.0   4.5
2   6.5  13.5  10.5
[[3 8 2 4]
 [2 6 4 8]
 [6 1 3 8]]
[[ 0  0  0  0]
 [-1 -2  2  4]
 [ 3 -7  1  4]]
   Q  R  S  T
0  0  0  0  0
1 -1 -2  2  4
2  3 -7  1  4
   Q  R  S  T
0 -5  0 -6 -4
1 -4  0 -2  2
2  5  0  2  7
Q    3
S    2
Name: 0, dtype: int64
     Q   R    S   T
0  0.0 NaN  0.0 NaN
1 -1.0 NaN  2.0 NaN
2  3.0 NaN  1.0 NaN
0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64
0    NaN
1    1.0
dtype: float64
0    False
1     True
2    False
3     True
dtype: bool
0        1
2    hello
dtype: object
0        1
2    hello
dtype: object
     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6
     0    1  2
1  2.0  3.0  5
   2
0  2
1  5
2  6
     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN
     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6
     0    1  2   3
1  2.0  3.0  5 NaN
a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64
a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64
a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64
a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

Process finished with exit code 0

练习书4-《Python数据科学手册》

推荐阅读更多精彩内容