观察数据,从整体上了解数据类型,分布,相关性
常用包:pandas,numpy, scipy(统计学包,可完成matlab诸多功能), seaborn(丰富的数据可视化包), matplotlib,missingno(缺失值可视化)
常用语句:
整体观察:.nunique() .value_counts() .info() .describe() .isnull().sum() .skew() .kurt()
缺失值可视化:missingno.matrix(data.sample(n)) missingno.bar(data.sample(n)) (n 为抽样行数;不传参数时 sample() 只返回 1 行)
分布可视化:
按类别特征和数字特征分类,对数字特征分析相关性,对类别特征看分布状态
数字特征:热力图,散点图,分布,回归
# Heat map of `correlation` on its own 7x7 figure.
# NOTE(review): `correlation` is defined outside this snippet — presumably
# the correlation matrix of the numeric features; confirm.
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(correlation, vmax=0.8, square=True, ax=ax)
# Distribution of every numeric feature, one facet per feature.
# Melt to long format: one row per (variable, value) pair for the columns in `num`.
f = data.melt(value_vars=num)
# Two facets per row; axes are not shared, so each feature keeps its own scale.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — consider
# histplot/displot when the seaborn version is upgraded.
g = sns.FacetGrid(
    f,
    col='variable',
    col_wrap=2,
    sharex=False,
    sharey=False,
).map(sns.distplot, 'value')
# Scatter plots: pairwise scatter matrix with KDE curves on the diagonal.
sns.set()
pair_columns = ['price', 'v_1', 'v_2']
sns.pairplot(data[pair_columns], kind='scatter', diag_kind='kde')
plt.show()
# Regression: scatter plus fitted regression line of price against each
# feature, drawn side by side on one 1x2 figure.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
for feature, axis in (('v_1', ax1), ('v_2', ax2)):
    sns.regplot(
        x=feature,
        y='price',
        data=data[[feature, 'price']],
        scatter=True,
        fit_reg=True,
        ax=axis,
    )
# Categorical features: box plots, violin plots, bar charts.
# Fill missing values first so that missing entries are plotted as their own
# 'MISSING' level.
for c in ['brand', 'gearbox']:
    data[c] = data[c].astype('category')
    if data[c].isnull().any():
        # Add the 'MISSING' level before filling: a categorical column only
        # accepts fill values that are already among its categories.
        data[c] = data[c].cat.add_categories(['MISSING'])
        data[c] = data[c].fillna('MISSING')
def boxplot(x, y, **kwargs):
    """Draw a box plot of ``y`` grouped by ``x`` on the current axes.

    Extra keyword arguments are accepted and ignored so the function can be
    used as a ``FacetGrid.map`` callback, which passes additional keywords.
    The x tick labels are rotated 90 degrees.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)


# Long format: one row per (price, variable, value) for the two categorical columns.
f = pd.melt(data, id_vars=['price'], value_vars=['brand', 'gearbox'])
# One facet per categorical feature; axes are not shared between facets.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, "value", "price")
#柱状图
def count_plot(x, **kwargs):
    """Draw a bar chart of the number of occurrences of each value in ``x``.

    Extra keyword arguments are accepted and ignored so the function can be
    used as a ``FacetGrid.map`` callback, which passes additional keywords.
    The x tick labels are rotated 90 degrees.
    """
    sns.countplot(x=x)
    plt.xticks(rotation=90)


# Long format: one row per (variable, value) for the two categorical columns.
f = pd.melt(data, value_vars=['brand', 'gearbox'])
# One facet per categorical feature; axes are not shared between facets.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")