一、数据集处理
1、数据集导入
# 读取房价数据
import pandas as pd
def load_housing_data(csv_path='./housing.csv'):
    """Load the housing dataset into a DataFrame.

    Args:
        csv_path: Path (or file-like object) of the CSV file to read.
            Defaults to ``./housing.csv`` so existing no-argument calls
            behave exactly as before.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(csv_path)
2、查看前五行数据
housing = load_housing_data()
# 查看前五行数据
housing.head()
3、获取数据的简单描述
# 获取数据的简单描述
housing.info()
输出数据描述
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude 20640 non-null float64
latitude 20640 non-null float64
housing_median_age 20640 non-null float64
total_rooms 20640 non-null float64
total_bedrooms 20433 non-null float64
population 20640 non-null float64
households 20640 non-null float64
median_income 20640 non-null float64
median_house_value 20640 non-null float64
ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
3、查看ocean_proximity数据的取值范围及各个值的数量
# 查看ocean_proximity数据的取值范围
housing['ocean_proximity'].value_counts()
输出数据:
<1H OCEAN 9136
INLAND 6551
NEAR OCEAN 2658
NEAR BAY 2290
ISLAND 5
Name: ocean_proximity, dtype: int64
4、了解数据的基本情况
# 了解数据的基本情况
housing.describe()
输出图例:
5、绘制每个属性的直方图,来快速了解数据。
# 绘制每个属性的直方图,来快速了解数据。
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50,figsize=(20,15))
plt.show()
二、 测试集处理
import numpy as np
def split_train_test(data, test_radio):
    """Split ``data`` into a training set and a test set by random sampling.

    The rows are shuffled with a private generator seeded with 42, so the
    same input always yields the same split. Using a private generator
    (instead of ``np.random.seed``) produces the same permutation while
    leaving the global NumPy random state untouched.

    Args:
        data: DataFrame to split.
        test_radio: Fraction of the rows to put into the test set (the
            parameter keeps its original spelling so keyword callers
            keep working).

    Returns:
        A ``(train_set, test_set)`` tuple of DataFrames.
    """
    rng = np.random.RandomState(42)
    # Random permutation of the row positions (no duplicates).
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_radio)
    # The first ``test_set_size`` shuffled positions form the test set ...
    test_indices = shuffled_indices[:test_set_size]
    # ... and the remaining positions form the training set.
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
输出训练集和测试集长度
train_set,test_set = split_train_test(housing, 0.2)
len(train_set),len(test_set)
输出:
(16512, 4128)
使用 hashlib
import hashlib
# 选择测试集,方式2.无惧新增数据
def test_set_check(identifier, test_radio, hash):
    """Decide from an identifier's hash whether it belongs to the test set.

    Args:
        identifier: Row identifier; it is converted to ``np.int64`` before
            being hashed.
        test_radio: Fraction of identifiers that should land in the test set.
        hash: Hash constructor (e.g. ``hashlib.md5``) whose result exposes
            ``digest()``.

    Returns:
        True when the last byte of the digest is below ``256 * test_radio``.
    """
    # The last digest byte is a value in 0..255, so comparing it with
    # 256 * test_radio selects roughly that fraction of identifiers.
    last_byte = hash(np.int64(identifier)).digest()[-1]
    return last_byte < (256 * test_radio)
def split_train_test_by_id(data, test_radio, id_column, hash=hashlib.md5):
    """Split ``data`` into train/test sets based on a hash of each row's id.

    Because membership depends only on the identifier, a row keeps its
    assignment when new rows are added to the dataset later.

    Args:
        data: DataFrame to split.
        test_radio: Fraction of the rows that should go to the test set.
        id_column: Name of the column holding the row identifiers.
        hash: Hash constructor forwarded to ``test_set_check``.

    Returns:
        A ``(train_set, test_set)`` tuple of DataFrames.
    """
    ids = data[id_column]
    # Boolean mask: True for the rows whose id hashes into the test set.
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_radio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
# 为数据添加index列
housing_width_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_width_id,0.2,"index")
len(train_set),len(test_set)
输出训练集和测试集长度如上。(16512, 4128)
判断:取最后一个字节,是否小于(256 * 0.2)
hashlib.md5(np.int64(0)).digest()[-1] < (256 * 0.2)
返回 False。
根据 median_income 划分收入类别(income_cat),并绘制其直方图
from sklearn.model_selection import train_test_split
train_set , test_set = train_test_split(housing,test_size=0.2,random_state=42)
# len(train_set),len(test_set)
# Create the income category column before using it: dividing median_income
# by 1.5 limits the number of categories and np.ceil makes them discrete.
housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)
# Merge every category above 5 into category 5. The result is assigned back
# explicitly, because where(..., inplace=True) on a column selection may act
# on a temporary object and leave the DataFrame unchanged.
housing['income_cat'] = housing['income_cat'].where(housing['income_cat'] < 5, 5.0)
housing['income_cat'].hist(bins=5, histtype='stepfilled')
分层随机抽取测试集
from sklearn.model_selection import StratifiedShuffleSplit
# 分层随机抽取测试集
# Stratified random sampling: draw the test set so that the income_cat
# proportions match those of the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
result = split.split(housing, housing['income_cat'])
for train_index, test_index in result:
    # split() yields positional row indices, so select by position (iloc);
    # label-based loc only works while the index is the default 0..n-1 range.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
len(strat_test_set), len(strat_train_set)
查看每个类别在总数中所占比例。
housing['income_cat'].value_counts() / len(housing)
'''
3.0 0.350581
2.0 0.318847
4.0 0.176308
5.0 0.114438
1.0 0.039826
Name: income_cat, dtype: float64
'''
每个类别在测试集中的数据比例
strat_test_set['income_cat'].value_counts()/len(strat_test_set)
'''
3.0 0.350533
2.0 0.318798
4.0 0.176357
5.0 0.114583
1.0 0.039729
Name: income_cat, dtype: float64
'''
删除收入类别列
# Remove the helper column from both splits. The loop variable is not called
# ``set`` so the builtin of that name is not shadowed afterwards.
for subset in (strat_test_set, strat_train_set):
    # With inplace=True the frame itself is modified and None is returned.
    subset.drop(['income_cat'], axis=1, inplace=True)
strat_train_set.head()
探索数据
# Explore a copy so the training set itself stays untouched.
housing = strat_train_set.copy()
# Visualise the geographic data: one point per row, sized by population and
# coloured by median house value.
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.4,
    s=housing['population'] / 100,
    label='population',
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()
plt.show()
寻找特征之间的相关性
【相关系数】:用来衡量两组变量之间的相关性,取值范围是 -1 到 1。越接近 1 表示正相关性越强;越接近 -1 表示负相关性越强;越接近 0 表示线性相关性越弱。
计算相关系数矩阵
# Correlate the numeric columns only: ocean_proximity holds text, and recent
# pandas versions raise on non-numeric columns instead of skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"]
使用 Pandas 的 scatter_matrix() 函数
绘制每个属性相对于其他属性的相关性。
from pandas.plotting import scatter_matrix
attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]
scatter_matrix(housing[attributes],figsize=(12,8))
plt.show()
# Derive ratio attributes from the raw counts.
housing["rooms_per_household"] = housing['total_rooms'] / housing["households"]
housing["bedrooms_per_room"] = housing['total_bedrooms'] / housing["total_rooms"]
housing["population_per_household"] = housing['population'] / housing["households"]
# Recompute the correlations on the numeric columns only: ocean_proximity
# holds text, and recent pandas versions raise on non-numeric columns.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
输出:
'''
median_house_value 1.000000
median_income 0.687160
rooms_per_household 0.146285
total_rooms 0.135097
housing_median_age 0.114110
households 0.064506
total_bedrooms 0.047689
population_per_household -0.021985
population -0.026920
longitude -0.047432
latitude -0.142724
bedrooms_per_room -0.259984
Name: median_house_value, dtype: float64
'''
数据准备
【问】:如何处理缺失数据?
- 放弃有缺失值的数据
# Drop only the rows whose total_bedrooms is missing. The column list has to
# be passed as ``subset``; the first positional argument of dropna is ``axis``.
housing.dropna(subset=['total_bedrooms'])
- 放弃这个属性
housing.drop("total_bedrooms",axis=1)
- 将缺失值设置为某个值,例如平均值
mean = housing['total_bedrooms'].mean()
housing['total_bedrooms'].fillna(mean)
# 获取训练数据(去除标签)
housing = strat_train_set.drop("median_house_value",axis=1)
# 获取训练数据的标签
housing_labels = strat_train_set['median_house_value'].copy()
## Scikit-Learn 提供了容易的缺失值处理方式:imputer
from sklearn.preprocessing import Imputer
# Drop the ocean_proximity attribute (an object/text column) before fitting.
# NOTE(review): presumably because the median strategy needs numeric data — confirm.
housing_num = housing.drop('ocean_proximity',axis=1)
# NOTE(review): Imputer is imported from sklearn.preprocessing here; newer
# scikit-learn releases provide SimpleImputer in sklearn.impute instead —
# confirm against the installed version.
imputer = Imputer(strategy="median")
# Fit the imputer on all remaining attributes to learn the fill values.
imputer.fit(housing_num)
'''输出:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)
'''
每个属性要替换的值
imputer.statistics_
'''输出:
array([-118.51 , 34.26 , 29. , 2119.5 , 433. , 1164. ,
408. , 3.5409])
'''
数据填充
# 完成填充,结果是Numpy数组
X = imputer.transform(housing_num)
# 将Numpy数组转换回DataFrame格式
housing_tr = pd.DataFrame(X,columns=housing_num.columns)
housing_tr.info()
'''输出:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 8 columns):
longitude 16512 non-null float64
latitude 16512 non-null float64
housing_median_age 16512 non-null float64
total_rooms 16512 non-null float64
total_bedrooms 16512 non-null float64
population 16512 non-null float64
households 16512 non-null float64
median_income 16512 non-null float64
dtypes: float64(8)
memory usage: 1.0 MB
'''
Scikit-Learn API设计
Scikit-Learn API设计具有较高的一致性,一般来说Scikit-Learn由以下模块组成:
估算器
【解释】:能够根据数据集对某些参数进行估算的任意对象,都可以称为估算器(例如:Imputer)。
估算过程由 fit() 方法执行,需要一个数据集作为参数;对于监督式学习,需要两个参数,第二个是包含标签的数据集。创建估算器时可以指定估算策略(超参数),例如:strategy='mean'。
转换器:
【解释】:可以用来转换数据集的估算器,称为转换器 。API一般包括:
- transform() : 该方法传入一个待转换的数据集,返回一个转换后的数据集。
所有的转换器都有一个方法:
- fit_transform(): 相当于先调用了 fit() 方法,再调用了 transform() 方法。并且这个经过了优化,会更快。
预测器
【解释】:能够基于一个给定的数据集进行预测的估算器,称为预测器。
预测器有一个predict()方法,该方法接受一个新的实例数据集,返回一个包含相应预测的数据集。 还存在一个 score() 方法,可以用来衡量给定测试集的预测质量。
处理文本和分类属性
# 将分类文本转换为数字形式
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing['ocean_proximity']
housing_cat_encoded = encoder.fit_transform(housing_cat)
# 转换后的值
housing_cat_encoded
# 查看转换器学习的映射关系,'<1H OCEAN' 是0
encoder.classes_
'''(输出)
array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
dtype=object)
'''
housing_cat_encoded.reshape(-1,1)
'''(输出)
array([[0],
[0],
[4],
...,
[1],
[0],
[3]])
'''
独热编码
# 将数字形式的编码,转换为独热编码
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
# 返回的结果是一个稀疏矩阵,以节省内存空间。可以调用.toarray()方法,将其转换为numpy数组。
housing_cat_1hot
了解独热编码?稀疏矩阵
可以一次将文本转换为独热编码
LabelBinarizer 转换器
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
'''(输出)
array([[1, 0, 0, 0, 0],
[1, 0, 0, 0, 0],
[0, 0, 0, 0, 1],
...,
[0, 1, 0, 0, 0],
[1, 0, 0, 0, 0],
[0, 0, 0, 1, 0]])
'''
自定义转换器
转换器需要实现三个方法:fit() (返回自身) 、transform() 、fit_transform()
【问】:添加 TransformerMixin 作为基类有什么好处?
【答】:可以直接获得 fit_transform() 方法,无需自己实现。
【知识补充】:如果再将 BaseEstimator 作为基类(并且构造方法中不使用 *args、**kwargs 参数),还可以获得 get_params() 和 set_params() 方法(用来调整超参数)。
from sklearn.base import BaseEstimator,TransformerMixin
# Column positions of the attributes used below inside the housing value array.
rooms_ix, bedrooms_ix, population_ix, househould_ix = 3, 4, 5, 6


class CombineAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends combined ratio attributes as extra columns.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Hyperparameter: whether transform() also appends bedrooms_per_room.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, Y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, Y=None):
        """Return ``X`` with the combined attribute columns appended.

        Args:
            X: 2-D array indexed by the module-level ``*_ix`` positions.
            Y: Unused.

        Returns:
            A new array: the columns of ``X`` followed by
            rooms_per_household, population_per_household and, optionally,
            bedrooms_per_room.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, househould_ix]
        population_per_household = X[:, population_ix] / X[:, househould_ix]
        # Echoes the derived column; the accompanying notes show this output.
        print(rooms_per_household)
        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household, population_per_household]
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        # Concatenate the new attributes column-wise after the original ones.
        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
attr_adder = CombineAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing_extra_attribs
输出:
[4.625368731563422 6.008849557522124 4.225108225108225 ...
6.34640522875817 5.50561797752809 4.843505477308295]
Out[33]:
array([[-121.89, 37.29, 38.0, ..., '<1H OCEAN', 4.625368731563422,
2.094395280235988],
[-121.93, 37.05, 14.0, ..., '<1H OCEAN', 6.008849557522124,
2.7079646017699117],
[-117.2, 32.77, 31.0, ..., 'NEAR OCEAN', 4.225108225108225,
2.0259740259740258],
...,
[-116.4, 34.09, 9.0, ..., 'INLAND', 6.34640522875817,
2.742483660130719],
[-118.01, 33.82, 31.0, ..., '<1H OCEAN', 5.50561797752809,
3.808988764044944],
[-122.45, 37.77, 52.0, ..., 'NEAR BAY', 4.843505477308295,
1.9859154929577465]], dtype=object)