总结一下数据处理的常用操作
理论
1、缺失值处理
# 去除缺失值
data.dropna() #等价于 data[data.notnull()](data为Series时;DataFrame默认丢弃任何含缺失值的行)
data.dropna(axis=1, how='all') #只丢弃全部为缺失值的列(axis=1按列操作;axis=0才是按行)
# 填补缺失值
df.fillna({1: 0.5, 2: 0},inplace=True)
df.fillna(method='ffill', limit=2) #前向填充(注:method参数在新版pandas中已弃用,可改用 df.ffill(limit=2))
data.fillna(data.mean()) #均值填充
2、重复值处理
# 去除重复值
data.drop_duplicates()
data.drop_duplicates(['k1', 'k2'], keep='last') #keep默认保留第一个
3、替换
data.replace(-999, np.nan)
data.replace([-999, -1000], [np.nan, 0]) #每个值可以有不同的替换值
data.replace({-999: np.nan, -1000: 0}) #也可以用字典形式替换
4、利用函数或映射进行数据转换
# 利用字典转换(map函数的使用)
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'], 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
meat_to_animal = { 'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow', 'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon' }
lowercased = data['food'].str.lower() #统一转为小写,保证与映射字典的键大小写一致
data['animal'] = lowercased.map(meat_to_animal)
# 上两条等价于
data['food'].map(lambda x: meat_to_animal[x.lower()])
5、连续数据离散化
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
pd.cut(ages, bins)
# 对整数年龄而言,上一条与下面这条分箱结果相同(right=False 表示区间左闭右开)
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
# 给面元(bin)命名
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
# 按分位数划分(各组样本数大致相等)
pd.qcut(ages, 4)
6、数据合并
# merge合并 → 类似excel的vlookup
pd.merge(df3, df4,on=['key1','key2'], how = 'inner') # 多个链接键
pd.merge(df3, df4, left_on='lkey', right_on='rkey') #两个对象的列名不同
# concat 连接
pd.concat([s1, s2, s3], axis=1) #默认axis=0纵向堆叠(列不变,增加行);此处axis=1为横向拼接(增加列)
pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']]) # 指定拼接后的行索引(注:join_axes在新版pandas已移除,可改用 .reindex(['a','c','b','e']))
7、数据的分组聚合
# groupby
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
a = df.groupby('A').mean()
#按key1进行分组,并计算data1列的平均值(注意:此例假设df含有key1、key2、data1列,与上面定义的df不是同一个)
grouped = df['data1'].groupby(df['key1'])
grouped.mean()
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means.unstack()
# 透视表
data.pivot_table(values='tip_pct', index=['time', 'size', 'smoker'], columns='day', aggfunc='mean', fill_value=0)
应用
1、数据读取
df = pd.read_csv('/home/kesci/商铺数据.csv')
print(type(df),df['name'].dtype) # 查看df类型,查看df中一列的数值类型
df.head()
2、数据提取
# 列索引
df1['quality'] #df[...] - 单列索引,结果series
df1[['quality','environment','service']] #df[...] - 多列索引,结果dataframe
# 行索引 - loc
# 按照index选择行,只选择一行输出Series,选择多行输出Dataframe
import numpy as np
df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
index = ['one','two','three'],
columns = ['a','b','c','d'])
data1 = df.loc['one']
data2 = df.loc[['one','two']]
# 行索引 - iloc
# 按照整数位置(从轴的0到length-1)选择行
data3 = df.iloc[0]
data4 = df.iloc[[0,2]]
# 行索引 - 布尔型索引
df1[df1['comment'] >50]
df1[df1['comment'] >50][['quality','environment','service']]
3、数据清洗
# 提取有用数据并更改数据类型
df1 = df[df['comment'].str.contains('条')]
df1['comment'] = df1['comment'].str.split('条').str[0].astype('int')
# 分列
df1['quality'] = df1['commentlist'].str.split(' ').str[0].str[2:].astype('float')
df1['environment'] = df1['commentlist'].str.split(' ').str[1].str[2:].astype('float')
df1['service'] = df1['commentlist'].str.split(' ').str[2].str[2:].astype('float')
del df1['commentlist']