pandas data aggregation and grouping
Code
import numpy as np
import pandas as pd
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
# plt.style.use('classic')
plt.style.use('seaborn-whitegrid')
np.random.seed(0)
# Configure pandas display options
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
# Data summarization: compute aggregation metrics
# such as sum(), mean(), median(), min() and max(),
# each of which summarizes some characteristic of a larger dataset.
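# For a Series, aggregates such as sum() and mean() reduce the values to a single number.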
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
print(ser)
print(ser.sum())
print(ser.mean())
df = pd.DataFrame({'A': rng.rand(5),
                   'B': rng.rand(5)})
print(df)
print(df.mean())
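# By default mean() aggregates down each column; axis='columns' aggregates across the columns of each row instead.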
print(df.mean(axis='columns'))
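# describe() computes several common aggregates for each column at once (count, mean, std, min, quartiles, max).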
print(df.describe())
# Although "group by" borrows its name from a command in the SQL database language,
# the idea is perhaps better described with the terms coined by Hadley Wickham,
# well known in the R world: split, apply, and combine.
# The point of GroupBy is to abstract these steps away:
# the user does not need to know how the computation is done under the hood,
# and can treat the operation as a single whole.
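# Example below: split the DataFrame on the 'key' column, apply sum() to each group,
# and combine the results into a new DataFrame indexed by the group keys.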
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)},
                  columns=['key', 'data'])
print(df)
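# groupby() returns a lazy DataFrameGroupBy object; no computation happens
# until an aggregation (e.g. sum()) is applied to it.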
print(df.groupby('key'))
print(df.groupby('key').sum())
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                  columns=['key', 'data1', 'data2'])
print(df)
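# aggregate() accepts a list of functions or function names and returns a result
# with a hierarchical column index (one level per function).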
print(df.groupby('key').aggregate(['min', np.median, max]))
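# aggregate() also accepts a dict mapping column names to the operation to apply to each column.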
print(df.groupby('key').aggregate({'data1': 'min',
                                   'data2': 'max'}))
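# filter() drops whole groups for which the function returns False; here group 'A'
# is removed because its data2 standard deviation (about 1.41, see the std() output) is not greater than 4.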
def filter_func(x):
    return x['data2'].std() > 4
print(df.groupby('key').std())
print(df.groupby('key').filter(filter_func))
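# transform() returns an object with the same shape and index as the input;
# here every value is centered on its group mean.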
print(df.groupby('key').transform(lambda x: x - x.mean()))
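# apply() passes each group as a DataFrame to an arbitrary function;
# here data1 is normalized by the sum of data2 within each group.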
def norm_by_data(x):
    x['data1'] /= x['data2'].sum()
    return x
print(df.groupby('key').apply(norm_by_data))
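# The grouping key does not have to be a column name: any list, array, or Series
# of the right length works, and rows are grouped by its values.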
L = [0, 1, 0, 1, 2, 0]
print(df.groupby(L).sum())
Output
0 0.374540
1 0.950714
2 0.731994
3 0.598658
4 0.156019
dtype: float64
2.811925491708157
0.5623850983416314
A B
0 0.155995 0.020584
1 0.058084 0.969910
2 0.866176 0.832443
3 0.601115 0.212339
4 0.708073 0.181825
A 0.477888
B 0.443420
dtype: float64
0 0.088290
1 0.513997
2 0.849309
3 0.406727
4 0.444949
dtype: float64
A B
count 5.000000 5.000000
mean 0.477888 0.443420
std 0.353125 0.426952
min 0.058084 0.020584
25% 0.155995 0.181825
50% 0.601115 0.212339
75% 0.708073 0.832443
max 0.866176 0.969910
key data
0 A 0
1 B 1
2 C 2
3 A 3
4 B 4
5 C 5
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11edbb6d8>
data
key
A 3
B 5
C 7
key data1 data2
0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
    data1            data2
      min median max   min median max
key
A       0    1.5   3     3    4.0   5
B       1    2.5   4     0    3.5   7
C       2    3.5   5     3    6.0   9
data1 data2
key
A 0 5
B 1 7
C 2 9
data1 data2
key
A 2.12132 1.414214
B 2.12132 4.949747
C 2.12132 4.242641
key data1 data2
1 B 1 0
2 C 2 3
4 B 4 7
5 C 5 9
data1 data2
0 -1.5 1.0
1 -1.5 -3.5
2 -1.5 -3.0
3 1.5 -1.0
4 1.5 3.5
5 1.5 3.0
key data1 data2
0 A 0.000000 5
1 B 0.142857 0
2 C 0.166667 3
3 A 0.375000 3
4 B 0.571429 7
5 C 0.416667 9
data1 data2
0 7 17
1 4 3
2 4 7