pandas.DataFrame.dropna
(http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html#pandas-dataframe-dropna)
DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
axis:0 表示删除含缺失值的行,1 表示删除含缺失值的列
how:'any' 只要有一个缺失值就删除;'all' 全部为缺失值才删除
thresh:至少有 thresh 个非空值才保留,否则删除
# Missing-value statistics for data_last.
from collections import Counter

# Number of missing values in each column, and that count as a fraction of
# the total number of rows.
sum_na_col = data_last.isnull().sum(axis=0)
per_na_col = sum_na_col / data_last.shape[0]

# Number of missing values in each row, plotted as a histogram.
sum_na_row = data_last.isnull().sum(axis=1)
plt.hist(sum_na_row)

# Frequency table: {missing-count: number of rows having that count}.
# Counter makes a single pass instead of rescanning the list per distinct value.
sum_na_row = sum_na_row.tolist()
count_na_row = dict(Counter(sum_na_row))
def sum_na(data):
    """Return the number of missing values in *data*.

    Evaluates ``data.isnull().sum()``, so *data* must expose a pandas-style
    ``isnull()`` method (e.g. a Series, such as one row or column handed over
    by ``DataFrame.apply``).
    """
    return data.isnull().sum()
# Store each row's missing-value count in a new 'column_na' column.
# Vectorized equivalent of applying sum_na to every row (axis=1).
data_last['column_na'] = data_last.isnull().sum(axis=1)
# Method 1: count missing values per column with apply.
# axis=0 (the default) passes each column to the function.
df.apply(lambda col: col.isnull().sum(), axis=0)
# Method 2: vectorized per-column summary of missing values.
# Compute the null mask once and reuse it for both the totals and the ratios.
na_mask = df_train.isnull()
total = na_mask.sum().sort_values(ascending=False)
percent = (na_mask.sum() / na_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
# Drop every column with more than 50000 missing values.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df_train = df_train.drop(columns=missing_data[missing_data['Total'] > 50000].index)