Series的apply方法
import pandas as pd
def my_sq(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared
# Small two-column demo frame reused by the examples further down.
df = pd.DataFrame({'a':[10,20,30],
'b':[40,50,60]})
# Series.apply calls my_sq once for every element of column 'a'.
sq = df['a'].apply(my_sq)
print(sq)
'''
0 100
1 400
2 900
Name: a, dtype: int64
'''
对于带有额外参数的函数,需要在apply()中以关键字参数的形式指定这些参数;函数的第一个参数接收的是Series的元素
def my_exp1(x, e):
    """Raise ``x`` to the power ``e``.

    Args:
        x: The base (the Series element when used with ``Series.apply``).
        e: The exponent.

    Returns:
        ``x ** e``.
    """
    return pow(x, e)
def my_exp2(e, x):
    """Raise ``e`` to the power ``x``.

    Same computation as ``my_exp1`` with the parameter names swapped: the
    first positional argument is the base, the second is the exponent.

    Args:
        e: The base (the Series element when used with ``Series.apply``).
        x: The exponent.

    Returns:
        ``e ** x``.
    """
    return pow(e, x)
# Extra keyword arguments given to apply() are forwarded to the function;
# each element of column 'a' is passed as the first positional argument (x).
ex = df['a'].apply(my_exp1, e=2)
print(ex)
'''
0 100
1 400
2 900
Name: a, dtype: int64
'''
# The element still fills the first positional parameter (here named e),
# so the keyword x=2 supplies the exponent.
ex = df['a'].apply(my_exp2, x=2)
print(ex)
'''
0 100
1 400
2 900
Name: a, dtype: int64
'''
DataFrame的apply方法
DataFrame需要指定按行还是按列应用函数
按列应用,apply()的axis参数设为0(默认)
def avg_3(x, y, z):
    """Return the arithmetic mean of the three values ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3
# DataFrame.apply passes each whole column as the single first argument, so
# avg_3 never receives y and z and the call raises TypeError. Catch it so the
# demonstration shows the error without aborting the rest of the script.
try:
    print(df.apply(avg_3))
except TypeError as err:
    # Expected message:
    # avg_3() missing 2 required positional arguments: 'y' and 'z'
    print("TypeError:", err)
def avg_3_apply(row):
    """Return the mean of the first three values of ``row``.

    Written for ``DataFrame.apply``, which hands the function one whole
    column (or row) at a time.

    Args:
        row: A pandas Series, or any sequence supporting integer indexing,
            holding at least three numeric values.

    Returns:
        The arithmetic mean of the first three values.
    """
    # Index by position, not by label: ``row[0]`` on a Series is a label
    # lookup, which fails for integer indexes that do not contain 0/1/2 and
    # relies on a deprecated fallback for non-integer indexes.
    values = getattr(row, "iloc", row)
    x = values[0]
    y = values[1]
    z = values[2]
    return (x + y + z) / 3
# With the default axis=0, each column is passed to avg_3_apply as one Series.
print(df.apply(avg_3_apply))
'''
a 20.0
b 50.0
dtype: float64
'''
按行应用,apply()的axis参数设为1
def avg_2_apply(row):
    """Return the mean of the first two values of ``row``.

    Written for ``DataFrame.apply(..., axis=1)``, where each row arrives as a
    Series labelled by column name.

    Args:
        row: A pandas Series, or any sequence supporting integer indexing,
            holding at least two numeric values.

    Returns:
        The arithmetic mean of the first two values.
    """
    # Index by position: the row is labelled 'a'/'b', so ``row[0]`` would
    # depend on the deprecated integer-as-position fallback of Series.
    values = getattr(row, "iloc", row)
    x = values[0]
    y = values[1]
    # Two values are averaged, so the divisor is 2 (it was mistakenly 3).
    return (x + y) / 2


print(df.apply(avg_2_apply, axis=1))
'''
0 25.0
1 35.0
2 45.0
dtype: float64
'''
apply高级用法
需求:计算titanic每一行或每一列的完整案例(不含缺失值)的百分比
titanic = sns.load_dataset('titanic')命令可能会因为网络原因报错:
URLError: <urlopen error [Errno 11004] getaddrinfo failed>
此时需要先从 https://github.com/mwaskom/seaborn-data 手动下载数据集,再从本地导入。
当cache=True时,load_dataset()会优先从data_home指定的目录加载数据集。
# seaborn is used here only for its example-dataset loader.
import seaborn as sns
# Load the titanic dataset using the local ./seaborn-data directory as the
# cache location, so a manually downloaded copy can be used offline.
titanic = sns.load_dataset('titanic',cache=True,data_home="./seaborn-data")
import numpy as np
def count_missing(vec):
    """Count the missing (null) entries in ``vec``.

    Args:
        vec: A pandas Series or array-like of values.

    Returns:
        The number of entries for which ``pd.isnull`` is true.
    """
    return np.sum(pd.isnull(vec))
def prop_missing(vec):
    """Return the fraction of entries in ``vec`` that are missing.

    Args:
        vec: A pandas Series (anything exposing ``.size``).

    Returns:
        The missing-value count divided by the total number of entries.
    """
    return count_missing(vec) / vec.size
def prop_complete(vec):
    """Return the fraction of entries in ``vec`` that are not missing."""
    missing_share = prop_missing(vec)
    return 1 - missing_share
# Apply column-wise (default axis=0): missing-value count for each column.
print(titanic.apply(count_missing))
'''
survived 0
pclass 0
sex 0
age 177
sibsp 0
parch 0
fare 0
embarked 2
class 0
who 0
adult_male 0
deck 688
embark_town 2
alive 0
alone 0
dtype: int64
'''
# Column-wise: fraction of non-missing values in each column.
print(titanic.apply(prop_complete))
'''
survived 1.000000
pclass 1.000000
sex 1.000000
age 0.801347
sibsp 1.000000
parch 1.000000
fare 1.000000
embarked 0.997755
class 1.000000
who 1.000000
adult_male 1.000000
deck 0.227834
embark_town 0.997755
alive 1.000000
alone 1.000000
dtype: float64
'''
# Apply row-wise (axis=1): missing-value count for each row.
print(titanic.apply(count_missing, axis=1))
'''
0 1
1 0
2 1
3 0
4 1
..
886 1
887 0
888 2
889 0
890 1
Length: 891, dtype: int64
'''
# Row-wise: fraction of non-missing values in each row.
print(titanic.apply(prop_complete, axis=1))
'''
0 0.933333
1 1.000000
2 0.933333
3 1.000000
4 0.933333
...
886 0.933333
887 1.000000
888 0.866667
889 1.000000
890 0.933333
Length: 891, dtype: float64
'''
# Store the per-row missing-value count in a new column, then inspect it.
titanic['num_missing'] = titanic.apply(count_missing, axis=1)
print(titanic.head())
# Randomly sample 10 of the rows that have more than one missing value.
print(titanic.loc[titanic.num_missing > 1, :].sample(10))
'''
survived pclass sex age sibsp parch fare embarked class \
768 0 3 male NaN 1 0 24.1500 Q Third
364 0 3 male NaN 1 0 15.5000 Q Third
77 0 3 male NaN 0 0 8.0500 S Third
420 0 3 male NaN 0 0 7.8958 C Third
568 0 3 male NaN 0 0 7.2292 C Third
409 0 3 female NaN 3 1 25.4667 S Third
481 0 2 male NaN 0 0 0.0000 S Second
497 0 3 male NaN 0 0 15.1000 S Third
413 0 2 male NaN 0 0 0.0000 S Second
240 0 3 female NaN 1 0 14.4542 C Third
who adult_male deck embark_town alive alone num_missing
768 man True NaN Queenstown no False 2
364 man True NaN Queenstown no False 2
77 man True NaN Southampton no True 2
420 man True NaN Cherbourg no True 2
568 man True NaN Cherbourg no True 2
409 woman False NaN Southampton no False 2
481 man True NaN Southampton no True 2
497 man True NaN Southampton no True 2
413 man True NaN Southampton no True 2
240 woman False NaN Cherbourg no False 2
'''