pandas的索引对齐和缺失值处理
代码
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
# Seed NumPy's global generator; the examples visible in this script draw
# from the dedicated RandomState instance ``rng`` created below.
np.random.seed(0)

# Configure pandas display: show at most 10 rows and 10 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10

# For unary operations (such as exponential and trigonometric functions),
# ufuncs preserve the index and column labels in their output.
# For binary operations (such as addition and multiplication), pandas
# automatically aligns indices while applying the ufunc.
# This means that keeping the context of data and combining data from
# different sources -- both error-prone tasks with raw NumPy arrays --
# become one of pandas' key strengths.
rng = np.random.RandomState(seed=42)
ser = pd.Series(rng.randint(0, 10, size=4))
print(ser)

df = pd.DataFrame(rng.randint(0, 10, size=(3, 4)), columns=list('ABCD'))
print(df)

# Ufuncs: index preservation -- the results keep the labels of the inputs.
print(np.exp(ser))
print(np.sin(df * np.pi / 4))
# Ufuncs: index alignment
# A binary operation between two Series is computed over the union of their
# indices; a label that exists on only one side produces NaN in the result.
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')
print(population / area)

# Use Index.union() for the set union of the labels. The ``|`` operator on
# Index objects was deprecated as a set operation and, in pandas 2.0+, is an
# element-wise logical OR, which fails on string labels.
print(area.index.union(population.index))

# Labels 0 and 3 exist in only one operand, so plain addition yields NaN
# at those positions.
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A + B)

# The method form accepts fill_value, which stands in for the missing
# operand so that no NaN is produced here.
print(A.add(B, fill_value=0))
# DataFrame operations align on both the row index and the column labels,
# regardless of the order in which the columns appear in each operand.
A = pd.DataFrame(rng.randint(0, 20, size=(2, 2)), columns=['A', 'B'])
print(A)

B = pd.DataFrame(rng.randint(0, 10, size=(3, 3)), columns=['B', 'A', 'C'])
print(B)
print(A + B)

# Use the mean of all values in A as the stand-in for entries missing from A.
fill = A.stack().mean()
print(A.add(B, fill_value=fill))

# NumPy broadcasting: subtracting a 1-D row from a 2-D array works row-wise.
A = rng.randint(0, 10, size=(3, 4))
print(A)
print(A - A[0])

# pandas applies the same row-wise convention by default.
df = pd.DataFrame(A, columns=['Q', 'R', 'S', 'T'])
print(df - df.iloc[0])

# To operate column-wise instead, use the method form and name the axis.
print(df.sub(df['R'], axis='index'))

# The row slice only carries labels Q and S; alignment leaves the columns
# it lacks (R and T) as NaN in the result.
halfrow = df.iloc[0, ::2]
print(halfrow)
print(df - halfrow)
# Handling missing values
# Both np.nan and None are shown as NaN in a numeric Series.
print(pd.Series([1, np.nan, 2, None]))

# Assigning None into an integer Series upcasts it to float64 so that the
# missing entry can be stored as NaN.
x = pd.Series([0, 1], dtype=int)
x[0] = None
print(x)

# Detecting missing values
data = pd.Series([1, np.nan, 'hello', None])
print(data.isna())
print(data.loc[data.notna()])

# Dropping missing values
print(data.dropna())

df = pd.DataFrame([
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
])
print(df)

# By default every row that contains at least one missing value is dropped.
print(df.dropna(axis='index', how='any'))

# Drop every column that contains at least one missing value.
print(df.dropna(axis=1))

# Add a column made up entirely of missing values.
df[3] = np.nan
print(df)

# how='all' drops only the columns whose values are all missing.
print(df.dropna(axis=1, how='all'))

# thresh=3 keeps only the rows with at least three non-missing values.
print(df.dropna(axis=0, thresh=3))
# Filling missing values
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(data)

# Replace every missing entry with a constant.
print(data.fillna(0))

# Forward fill: propagate the previous valid value forward.
# The dedicated ffill()/bfill() methods replace fillna(method=...), whose
# ``method`` argument is deprecated in recent pandas releases.
print(data.ffill())

# Backward fill: propagate the next valid value backward.
print(data.bfill())
输出
0 6
1 3
2 7
3 4
dtype: int64
A B C D
0 6 9 2 6
1 7 4 3 7
2 7 2 5 4
0 403.428793
1 20.085537
2 1096.633158
3 54.598150
dtype: float64
A B C D
0 -1.000000 7.071068e-01 1.000000 -1.000000e+00
1 -0.707107 1.224647e-16 0.707107 -7.071068e-01
2 -0.707107 1.000000e+00 -0.707107 1.224647e-16
Alaska NaN
California 90.413926
New York NaN
Texas 38.018740
dtype: float64
Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')
0 NaN
1 5.0
2 9.0
3 NaN
dtype: float64
0 2.0
1 5.0
2 9.0
3 5.0
dtype: float64
A B
0 1 11
1 5 1
B A C
0 4 0 9
1 5 8 0
2 9 2 6
A B C
0 1.0 15.0 NaN
1 13.0 6.0 NaN
2 NaN NaN NaN
A B C
0 1.0 15.0 13.5
1 13.0 6.0 4.5
2 6.5 13.5 10.5
[[3 8 2 4]
[2 6 4 8]
[6 1 3 8]]
[[ 0 0 0 0]
[-1 -2 2 4]
[ 3 -7 1 4]]
Q R S T
0 0 0 0 0
1 -1 -2 2 4
2 3 -7 1 4
Q R S T
0 -5 0 -6 -4
1 -4 0 -2 2
2 5 0 2 7
Q 3
S 2
Name: 0, dtype: int64
Q R S T
0 0.0 NaN 0.0 NaN
1 -1.0 NaN 2.0 NaN
2 3.0 NaN 1.0 NaN
0 1.0
1 NaN
2 2.0
3 NaN
dtype: float64
0 NaN
1 1.0
dtype: float64
0 False
1 True
2 False
3 True
dtype: bool
0 1
2 hello
dtype: object
0 1
2 hello
dtype: object
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
0 1 2
1 2.0 3.0 5
2
0 2
1 5
2 6
0 1 2 3
0 1.0 NaN 2 NaN
1 2.0 3.0 5 NaN
2 NaN 4.0 6 NaN
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
0 1 2 3
1 2.0 3.0 5 NaN
a 1.0
b NaN
c 2.0
d NaN
e 3.0
dtype: float64
a 1.0
b 0.0
c 2.0
d 0.0
e 3.0
dtype: float64
a 1.0
b 1.0
c 2.0
d 2.0
e 3.0
dtype: float64
a 1.0
b 2.0
c 2.0
d 3.0
e 3.0
dtype: float64
Process finished with exit code 0