1. Feature selection ------ sklearn code
1.1 Feature selection ------ variance method
Ignore warnings
import warnings
warnings.filterwarnings("ignore")
# Variance threshold method
from sklearn.feature_selection import VarianceThreshold
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
X_new = selector.fit_transform(X)
print('X_new:\n', X_new)
print('get_params:\n',selector.get_params())
print('get_support:\n',selector.get_support())
print('inverse_transform:\n',selector.inverse_transform(X_new))
Output
X_new:
[[2 0]
[1 4]
[1 1]]
get_params:
{'threshold': 0.0}
get_support:
[False True True False]
inverse_transform:
[[0 2 0 0]
[0 1 4 0]
[0 1 1 0]]
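With the default threshold=0.0, VarianceThreshold only drops constant columns. A minimal sketch below (the toy data and the 0.8 cut-off are illustrative assumptions, not part of the original example) shows a non-zero threshold that removes boolean features taking the same value in more than 80% of the samples:
# Sketch: drop boolean features whose variance is below p*(1-p) with p = 0.8,
# i.e. features that take the same value in more than 80% of the samples.
from sklearn.feature_selection import VarianceThreshold
X_bool = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
selector = VarianceThreshold(threshold=0.8 * (1 - 0.8))
X_kept = selector.fit_transform(X_bool)
print('variances :', selector.variances_)   # per-feature variances used for the cut
print('kept shape:', X_kept.shape)          # the first (mostly-0) column is dropped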
1.2 Feature selection ------ univariate feature selection (chi-squared, F-test, mutual information)
1.2.1 What the chi-squared test does
Compute chi-squared stats between each non-negative feature and class. This score can be used to select the n_features features with the highest values for the test chi-squared statistic from X, which must contain only non-negative features.
It measures the dependence between each feature and the class label, and can be used to weed out the features most likely to be independent of the class. This is generally more informative than the variance threshold.
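To make the score concrete, here is a rough sketch of how I understand the statistic to be tabulated (per-class sums of each feature as the observed counts, class frequency times the total feature sum as the expected counts); treat it as a sanity check rather than the library's exact implementation:
# Rough sketch (my reading, not the library source): for each feature,
# observed = per-class sum of the feature, expected = class frequency * total sum,
# chi2 = sum over classes of (observed - expected)^2 / expected.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import chi2

X, y = load_iris(return_X_y=True)
classes = np.unique(y)
observed = np.array([X[y == c].sum(axis=0) for c in classes])   # (n_classes, n_features)
class_prob = np.array([(y == c).mean() for c in classes])
expected = np.outer(class_prob, X.sum(axis=0))
print(((observed - expected) ** 2 / expected).sum(axis=0))      # manual scores
print(chi2(X, y)[0])                                            # library scores
The O(n_classes * n_features) cost quoted in 1.2.2 matches this tabulation: one pass builds the class-by-feature table and the statistic is summed over it.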
1.2.2 Time complexity
O(n_classes * n_features)
1.2.3 The two selector classes in sklearn
* SelectKBest(score_func, k=...): keeps the K highest-scoring features;
* SelectPercentile(score_func, percentile=...): keeps the top percentile of highest-scoring features;
1.2.4 Choosing score_func
Both classes in 1.2.3 default to f_classif. To use the chi-squared test here, pass chi2.
For regression:
f_regression: computes the correlation between each feature and the target, then derives the F-value and p-value;
mutual_info_regression: mutual information;
For classification:
chi2: chi-squared test;
f_classif: analysis of variance (ANOVA) F-value (between-group mean square / within-group mean square);
mutual_info_classif: mutual information;
Note: chi2, mutual_info_classif and mutual_info_regression preserve data sparsity.
Note: for background on f_classif and f_regression, see the linked reference.
1.2.5 Two attributes
scores_: array-like, shape=(n_features,). Scores of features.
pvalues_: array-like, shape=(n_features,). p-values of feature scores; None if score_func returned only scores.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest,SelectPercentile
from sklearn.feature_selection import chi2,f_classif,mutual_info_classif
iris = load_iris()
X, y = iris.data, iris.target
print('old_data_shape:\n',X.shape)
# SelectKBest keeps the K highest-scoring features
X_SelectKBest = SelectKBest(chi2, k=2)
X_new_SelectKBest = X_SelectKBest.fit_transform(X, y)
print('new_SelectKBest_data_shape:\n',X_new_SelectKBest.shape)
# SelectPercentile keeps the top percentile of highest-scoring features
# Default score_func: f_classif
# For regression: f_regression, mutual_info_regression
# For classification: chi2, f_classif, mutual_info_classif
X_SelectPercentile = SelectPercentile(chi2,percentile=50)
X_new_SelectPercentile = X_SelectPercentile.fit_transform(X, y)
print('new_SelectPercentile_data_shape:\n',X_new_SelectPercentile.shape)
# Attributes
# scores_ : array-like, shape=(n_features,)
# Scores of features.
# pvalues_ : array-like, shape=(n_features,)
# p-values of feature scores, None if `score_func` returned only scores.
print('KBest scores: \n',X_SelectKBest.scores_)
print('Percentile scores:\n',X_SelectPercentile.scores_)
print('KBest pvalues_: \n',X_SelectKBest.pvalues_)
print('Percentile pvalues_:\n',X_SelectPercentile.pvalues_)
# Check which two columns were selected
# Both SelectKBest and SelectPercentile end up picking the last two columns
print('old_data_10:\n',X[0:10,:])
print('X_new_SelectKBest:\n',X_new_SelectKBest[0:10,:])
print('X_new_SelectPercentile:\n',X_new_SelectPercentile[0:10,:])
Output
old_data_shape:
(150, 4)
new_SelectKBest_data_shape:
(150, 2)
new_SelectPercentile_data_shape:
(150, 2)
KBest scores:
[ 10.81782088 3.59449902 116.16984746 67.24482759]
Percentile scores:
[ 10.81782088 3.59449902 116.16984746 67.24482759]
KBest pvalues_:
[4.47651499e-03 1.65754167e-01 5.94344354e-26 2.50017968e-15]
Percentile pvalues_:
[4.47651499e-03 1.65754167e-01 5.94344354e-26 2.50017968e-15]
old_data_10:
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5. 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]]
X_new_SelectKBest:
[[1.4 0.2]
[1.4 0.2]
[1.3 0.2]
[1.5 0.2]
[1.4 0.2]
[1.7 0.4]
[1.4 0.3]
[1.5 0.2]
[1.4 0.2]
[1.5 0.1]]
X_new_SelectPercentile:
[[1.4 0.2]
[1.4 0.2]
[1.3 0.2]
[1.5 0.2]
[1.4 0.2]
[1.7 0.4]
[1.4 0.3]
[1.5 0.2]
[1.4 0.2]
[1.5 0.1]]
# Compare different score_func choices: do they all select the same features?
# Note: this is a small sample, so the result may be coincidental; compare on your own project data.
# In principle, different score_func choices can select different features.
# Classification functions compared: chi2, f_classif, mutual_info_classif
X_SelectPercentile_chi2 = SelectPercentile(chi2,percentile=50)
X_SelectPercentile_f_classif = SelectPercentile(f_classif,percentile=50)
X_SelectPercentile_mutual_info_classif = SelectPercentile(mutual_info_classif,percentile=50)
X_new_SelectKBest_chi2 = X_SelectPercentile_chi2.fit_transform(X, y)
X_new_SelectKBest_f_classif = X_SelectPercentile_f_classif.fit_transform(X, y)
X_new_SelectKBest_mutual_info_classif = X_SelectPercentile_mutual_info_classif.fit_transform(X, y)
print('chi2 scores: \n',X_SelectPercentile_chi2.scores_)
print('f_classif scores: \n',X_SelectPercentile_f_classif.scores_)
print('mutual_info_classif scores:\n',X_SelectPercentile_mutual_info_classif.scores_)
print('chi2 pvalues_: \n',X_SelectPercentile_chi2.pvalues_)
print('f_classif pvalues_: \n',X_SelectPercentile_f_classif.pvalues_)
print('mutual_info_classif pvalues_:\n',X_SelectPercentile_mutual_info_classif.pvalues_)
Output
chi2 scores:
[ 10.81782088 3.59449902 116.16984746 67.24482759]
f_classif scores:
[ 119.26450218 47.3644614 1179.0343277 959.32440573]
mutual_info_classif scores:
[0.47980703 0.26374852 0.98914392 0.97626377]
chi2 pvalues_:
[4.47651499e-03 1.65754167e-01 5.94344354e-26 2.50017968e-15]
f_classif pvalues_:
[1.66966919e-31 1.32791652e-16 3.05197580e-91 4.37695696e-85]
mutual_info_classif pvalues_:
None
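The comparison above only covers the classification score functions. A parallel sketch for a regression target, using f_regression and mutual_info_regression (the diabetes dataset and k=4 are arbitrary choices of mine):
# Regression counterpart (dataset and k are arbitrary illustrative choices)
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

X_reg, y_reg = load_diabetes(return_X_y=True)
for func in (f_regression, mutual_info_regression):
    sel = SelectKBest(func, k=4).fit(X_reg, y_reg)
    print(func.__name__, 'keeps columns:', sel.get_support(indices=True))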
# Mutual information (MIC, via minepy)
from minepy import MINE
import pandas as pd
def my_mine(x_data, y):
    # collect the MIC of each column in a list
    mic_score = []
    m = MINE()
    # iterate over every column of x
    for column in x_data.columns:
        m.compute_score(x_data[column], y)
        mic_score.append(m.mic())
    # convert to a DataFrame aligned with the original column names
    mic_score = pd.DataFrame(mic_score)
    # set the index to the feature names
    mic_score.index = x_data.columns
    # rename the score column
    mic_score = mic_score.rename(columns={0: 'MICSCORE'})
    return mic_score
from sklearn.datasets import load_iris
iris = load_iris()
x ,y= pd.DataFrame(iris.data).rename(columns={0:'a',1:'b',2:'c',3:'d'}), iris.target
my_mine(x,y)
# Notes:
# The mutual information in sklearn.feature_selection differs slightly from the MIC values here, but not by much
# The sklearn.feature_selection implementation is the recommended one
# [0.50056966 0.24913013 0.99109155 0.99213811]
Output
MICSCORE
a 0.642196
b 0.401504
c 0.918296
d 0.918296
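For comparison with the note above, sklearn's own estimator can be called directly. A minimal sketch (random_state is set only for repeatability; the values differ slightly from the MIC scores and from the array quoted above because the estimator is k-NN based and randomized):
# Minimal sketch: sklearn's mutual information estimator on the same data.
from sklearn.datasets import load_iris
from sklearn.feature_selection import mutual_info_classif

X_mi, y_mi = load_iris(return_X_y=True)
print(mutual_info_classif(X_mi, y_mi, random_state=0))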
1.3 Feature selection ------ Pearson correlation coefficient
The Pearson correlation coefficient is one of the simplest ways to understand the relationship between a feature and the response variable. It measures linear correlation, and the result lies in [-1, 1]: -1 means a perfect negative correlation, +1 a perfect positive correlation, and 0 no linear correlation. Its obvious drawback as a feature-ranking mechanism is that it is only sensitive to linear relationships; if the relationship is non-linear, the Pearson correlation can be close to 0 even when the two variables have a one-to-one correspondence.
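The coefficient itself is just the covariance normalized by the two standard deviations; a minimal sketch checking that definition against scipy on synthetic data:
# Minimal sketch of the definition r = cov(x, y) / (std(x) * std(y)),
# checked against scipy.stats.pearsonr on synthetic data.
import numpy as np
from scipy.stats import pearsonr
rng = np.random.RandomState(0)
x_demo = rng.normal(size=200)
y_demo = 2 * x_demo + rng.normal(size=200)
r_manual = np.cov(x_demo, y_demo, ddof=1)[0, 1] / (np.std(x_demo, ddof=1) * np.std(y_demo, ddof=1))
print(r_manual, pearsonr(x_demo, y_demo)[0])   # the two values agree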
# scipy's pearsonr function
import pandas as pd
from pandas import Series
from scipy.stats import pearsonr
def my_pearsonr(x_data, y_data):
    # iterate over every column of x_data
    score_p_value = []
    for column in x_data.columns:
        score = pearsonr(x_data[column], y_data)
        score_p_value.append(list(score))
    # convert to a DataFrame
    score_p_value = pd.DataFrame(score_p_value)
    # set the index to the feature names
    score_p_value.index = x_data.columns
    # rename the columns
    score_p_value = score_p_value.rename(columns={0: 'score', 1: 'p-value'})
    return score_p_value
from sklearn.datasets import load_iris
iris = load_iris()
x, y = pd.DataFrame(iris.data).rename(columns={0:'a',1:'b',2:'c',3:'d'}), iris.target
score_p_value = my_pearsonr(x, y)
score_p_value
Output
score p-value
a 0.782561 2.890478e-32
b -0.419446 9.159985e-08
c 0.949043 4.155478e-76
d 0.956464 4.775002e-81
# Pearson correlation is insensitive to non-linear relationships
import numpy as np
x = np.random.uniform(-1, 1, 100000)
print(pearsonr(x, x**2))
Output
(-0.006741987892572058, 0.03300673581074622)
1.4 Feature selection ------ distance correlation
# Preliminaries
# Dimension handling
## atleast_xd treats its input as at least x-dimensional, where x is 1, 2 or 3.
from scipy.spatial.distance import pdist, squareform
import numpy as np
np.atleast_1d([1])
np.atleast_2d([1])
np.atleast_3d([1])
print('np.atleast_1d:',np.atleast_1d([1]))
print('np.atleast_2d:',np.atleast_2d([1]))
print('np.atleast_3d:',np.atleast_3d([1]))
# Euclidean distance
# Pairwise distance between pairs of objects (pdist generates pairwise distances under various metrics)
X = [[1,2,3,4,5],
[2,4,6,8,10],
[3,6,9,12,15],
[1,2,3,4,5]]
x_pdist=pdist(X)
# pdist(X) computes the Euclidean distances between the rows of an m*n data matrix
# The result is a condensed vector of length m(m-1)/2, holding the distances in the order (1,2),(1,3),...,(1,m),(2,3),...,(m-1,m)
print('Euclidean distances:\n',x_pdist)
# Entry (i, j) of the square form is the Euclidean distance between rows i and j of the original data
print('Square matrix:\n',squareform(x_pdist))
# Element product with np.prod
print(np.prod([1,2]))                  # 1*2
print(np.prod([[1,2],[3,4]]))          # 1*2*3*4
print(np.prod([[1,2],[3,4]],axis=0))   # [1*3, 2*4]
print(np.prod([[1,2],[3,4]],axis=1))   # [1*2, 3*4]
Output
np.atleast_1d: [1]
np.atleast_2d: [[1]]
np.atleast_3d: [[[1]]]
Euclidean distances:
[ 7.41619849 14.83239697 0. 7.41619849 7.41619849 14.83239697]
Square matrix:
[[ 0. 7.41619849 14.83239697 0. ]
[ 7.41619849 0. 7.41619849 7.41619849]
[14.83239697 7.41619849 0. 14.83239697]
[ 0. 7.41619849 14.83239697 0. ]]
2
24
[3 8]
[ 2 12]
from scipy.spatial.distance import pdist, squareform
import numpy as np
def my_distcorr(X, Y):
    """Compute the distance correlation between X and Y.
    >>> a = [1,2,3,4,5]
    >>> b = np.array([2,4,6,8,10])
    >>> my_distcorr(a, b)
    1.0
    """
X = np.atleast_1d(X)
Y = np.atleast_1d(Y)
if np.prod(X.shape) == len(X):
X = X[:, None]
if np.prod(Y.shape) == len(Y):
Y = Y[:, None]
X = np.atleast_2d(X)
Y = np.atleast_2d(Y)
n = X.shape[0]
if Y.shape[0] != X.shape[0]:
raise ValueError('Number of samples must match')
a = squareform(pdist(X))
b = squareform(pdist(Y))
A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
dcov2_xy = (A * B).sum()/float(n * n)
dcov2_xx = (A * A).sum()/float(n * n)
dcov2_yy = (B * B).sum()/float(n * n)
dcor = np.sqrt(dcov2_xy)/np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
return dcor
a = [1,2,3,4,5]
b = np.array([2,4,6,8,10])
print(my_distcorr(a, b))
# For the case above where the Pearson correlation is close to zero, check the distance correlation
my_distcorr(a, [i*i for i in a])
Output
1.0
0.9869160440537483
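As a usage sketch, the helper above can rank the iris features against the target in the same way as the earlier sections (feature order follows iris.feature_names):
# Usage sketch: distance correlation of each iris feature with the target,
# using the my_distcorr helper defined above.
from sklearn.datasets import load_iris
iris = load_iris()
for i, name in enumerate(iris.feature_names):
    print(name, round(my_distcorr(iris.data[:, i], iris.target), 4))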
1.5 Feature selection ------ model-based feature ranking (Model based ranking)
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import numpy as np
# Load the Boston housing dataset as an example
# (load_boston was removed in scikit-learn 1.2; run this on an older version or swap in another regression dataset)
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]
rf = RandomForestRegressor(n_estimators=20, max_depth=4)
scores = []
# Build a model on each feature individually and cross-validate it
for i in range(X.shape[1]):
    score = cross_val_score(rf, X[:, i:i+1], Y, scoring="r2",  # note the difference between X[:, i] and X[:, i:i+1]
                            cv=ShuffleSplit(n_splits=5, test_size=0.3),
                            # cv=5,
                            n_jobs=-1)
    scores.append((format(np.mean(score), '.3f'), names[i]))
print(sorted(scores, reverse=True))
Output
[('0.665', 'LSTAT'), ('0.545', 'RM'), ('0.394', 'NOX'), ('0.327', 'INDUS'), ('0.320', 'TAX'), ('0.318', 'PTRATIO'), ('0.201', 'CRIM'), ('0.179', 'ZN'), ('0.161', 'RAD'), ('0.124', 'DIS'), ('0.114', 'B'), ('0.078', 'AGE'), ('0.031', 'CHAS')]
1.6 Feature selection ------ recursive feature elimination (RFE)
Attributes:
support_: mask of the selected features (boolean array).
ranking_: ranking position of each feature; selected features are assigned rank 1, and worse features get higher rank numbers.
# Without cross-validation
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
# Any base estimator can be used here
estimator = SVR(kernel="linear")
# estimator is the base model; n_features_to_select is the number of features to keep
selector = RFE(estimator, n_features_to_select=5)
selector = selector.fit(X, y)
print('selector.support_:\n',selector.support_)
print('selector.ranking_:\n',selector.ranking_)
print(selector)
Output
selector.support_:
[ True True True True True False False False False False]
selector.ranking_:
[1 1 1 1 1 6 4 3 2 5]
RFE(estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
n_features_to_select=5, step=1, verbose=0)
# With cross-validation
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = SVR(kernel="linear")
# Parameters:
# step: if >= 1, the (integer) number of features to remove at each iteration;
#       if within (0.0, 1.0), the fraction of features (rounded down) to remove at each iteration.
selector = RFECV(estimator, step=1, cv=6)
selector = selector.fit(X, y)
print('selector.support_:\n',selector.support_)
print('selector.ranking_:\n',selector.ranking_)
Output
selector.support_:
[ True True True True True False False False False False]
selector.ranking_:
[1 1 1 1 1 6 4 3 2 5]
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
n_redundant=2, n_repeated=0, n_classes=8,
n_clusters_per_class=1, random_state=0)
# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
scoring='accuracy')
rfecv.fit(X, y)
print("Optimal number of features : %d" % rfecv.n_features_)
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)  # on scikit-learn >= 1.2 use rfecv.cv_results_['mean_test_score'] instead
plt.show()
Output
Optimal number of features : 3
1.7 Feature selection ------ L1-based feature selection
- Linear models penalized with the L1 norm yield sparse solutions: most feature coefficients are zero. When you want to reduce the dimensionality of the data before feeding it to another classifier, feature_selection.SelectFromModel can be used to keep the features with non-zero coefficients.
- In particular, the sparse estimators commonly used for this purpose are linear_model.Lasso (regression), and linear_model.LogisticRegression and svm.LinearSVC (classification).
- SelectFromModel is a meta-transformer that works with any fitted estimator exposing a coef_ or feature_importances_ attribute. Features whose coef_ or feature_importances_ value falls below the given threshold are removed. Besides a numeric threshold, built-in heuristics can be requested as strings: "mean", "median", or either multiplied by a float, e.g. "0.1*mean" (a short sketch of these string thresholds follows this list).
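A minimal sketch of the string-threshold heuristics mentioned in the last point (the Lasso base estimator, the alpha value and the "median" heuristic are illustrative choices of mine, not a recommendation):
# Sketch: keep features whose |coef_| is above the median of all coefficients.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

X_t, y_t = load_iris(return_X_y=True)
sfm_demo = SelectFromModel(Lasso(alpha=0.01), threshold="median").fit(X_t, y_t)
print('threshold used:', sfm_demo.threshold_)
print('kept features :', sfm_demo.get_support())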
# svm.LinearSVC
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
iris = load_iris()
X, y = iris.data, iris.target
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
print('X_shape:',X.shape)
print('X_new.shape:',X_new.shape)
Output
X_shape: (150, 4)
X_new.shape: (150, 3)
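The shapes only show that one column was dropped; to see which one, get_support() on the fitted SelectFromModel can be inspected (a small continuation of the block above):
# Which column did the L1-penalized LinearSVC drop? (continuation of the block above)
print('kept features:', model.get_support())
print('kept indices :', model.get_support(indices=True))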
## Feature selection with an L1/L2-penalized logistic regression as the base model
# For SVMs and logistic regression, C controls sparsity: the smaller C, the fewer features are selected. For Lasso, the larger alpha, the fewer features are selected.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression as LR
X_new = SelectFromModel(LR(C=0.1)).fit_transform(iris.data, iris.target)
print('X_new.shape:',X_new.shape)
Output
X_new.shape: (150, 2)
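A small sketch of the note in the comment above, showing that for Lasso a larger alpha leaves fewer non-zero coefficients and hence fewer selected features (the alpha grid is an arbitrary illustrative choice):
# Sketch: larger alpha -> sparser Lasso -> fewer features kept by SelectFromModel.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

X_a, y_a = load_iris(return_X_y=True)
for alpha in (0.001, 0.01, 0.1):
    n_kept = SelectFromModel(Lasso(alpha=alpha)).fit_transform(X_a, y_a).shape[1]
    print('alpha=%g -> %d features kept' % (alpha, n_kept))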
# Use SelectFromModel meta-transformer along with Lasso to select the best couple of features from the Boston dataset.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
# Load the boston dataset.
boston = load_boston()
X, y = boston['data'], boston['target']
# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()
# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]
print('X_shape:',X.shape)
print('new_X_shape:',sfm.transform(X).shape)
# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
sfm.threshold += 0.1
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]
# Plot the selected two features from X.
plt.title(
"Features selected from Boston using SelectFromModel with "
"threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("Feature number 1")
plt.ylabel("Feature number 2")
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
Output
X_shape: (506, 13)
new_X_shape: (506, 5)
1.8 Feature selection ------ randomized sparse models
- A limitation of L1-based sparse models is that, when faced with a group of mutually correlated features, they tend to select only one of them. To mitigate this, randomization can be used: re-estimate the sparse model many times while perturbing the design matrix, or repeatedly subsample the data and count how often a given regressor is selected (a minimal hand-rolled sketch of this idea follows below).
- RandomizedLasso implements this strategy for the Lasso; RandomizedLogisticRegression uses logistic regression and is suited to classification tasks. To obtain the full stability path over the iterations, use lasso_stability_path.
- Note that for randomized sparse models to be more effective than the standard F statistic at detecting non-zero features, the ground-truth model needs to be sparse; in other words, only a small fraction of the features should be non-zero.
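RandomizedLasso, RandomizedLogisticRegression and lasso_stability_path were removed from newer scikit-learn releases, so the example below needs an older version (around 0.19/0.20). The same idea can be sketched by hand: refit a Lasso on random subsamples and count how often each feature receives a non-zero coefficient. This is my own minimal substitute, not the library's implementation; alpha and the subsample size are arbitrary choices.
# Hand-rolled stability-selection sketch (a substitute for the removed RandomizedLasso):
# fit Lasso on random subsamples and record how often each feature gets a non-zero coefficient.
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso

X_d, y_d = load_diabetes(return_X_y=True)
rng = np.random.RandomState(0)
n_rounds, frac = 100, 0.5
counts = np.zeros(X_d.shape[1])
for _ in range(n_rounds):
    idx = rng.choice(len(X_d), int(frac * len(X_d)), replace=False)
    lasso = Lasso(alpha=0.5).fit(X_d[idx], y_d[idx])
    counts += (np.abs(lasso.coef_) > 1e-8)
print('selection frequency per feature:\n', counts / n_rounds)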
import matplotlib.pyplot as plt
import numpy as np
from scipy import linalg
from sklearn.linear_model import (RandomizedLasso, lasso_stability_path,
LassoLarsCV)
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc, precision_recall_curve
from sklearn.ensemble import ExtraTreesRegressor
from scipy.linalg import pinvh  # pinvh is no longer available from sklearn.utils.extmath
def mutual_incoherence(X_relevant, X_irelevant):
"""Mutual incoherence, as defined by formula (26a) of [Wainwright2006].
"""
projector = np.dot(np.dot(X_irelevant.T, X_relevant),
pinvh(np.dot(X_relevant.T, X_relevant)))
return np.max(np.abs(projector).sum(axis=1))
for conditioning in (1, 1e-4):
###########################################################################
# Simulate regression data with a correlated design
n_features = 501
n_relevant_features = 3
noise_level = .2
coef_min = .2
# The Donoho-Tanner phase transition is around n_samples=25: below we
# will completely fail to recover in the well-conditioned case
n_samples = 25
block_size = n_relevant_features
rng = np.random.RandomState(42)
# The coefficients of our model
coef = np.zeros(n_features)
coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)
# The correlation of our design: variables correlated by blocs of 3
corr = np.zeros((n_features, n_features))
for i in range(0, n_features, block_size):
corr[i:i + block_size, i:i + block_size] = 1 - conditioning
corr.flat[::n_features + 1] = 1
corr = linalg.cholesky(corr)
# Our design
X = rng.normal(size=(n_samples, n_features))
X = np.dot(X, corr)
# Keep [Wainwright2006] (26c) constant
X[:n_relevant_features] /= np.abs(
linalg.svdvals(X[:n_relevant_features])).max()
X = StandardScaler().fit_transform(X.copy())
# The output variable
y = np.dot(X, coef)
y /= np.std(y)
# We scale the added noise as a function of the average correlation
# between the design and the output variable
y += noise_level * rng.normal(size=n_samples)
mi = mutual_incoherence(X[:, :n_relevant_features],
X[:, n_relevant_features:])
###########################################################################
# Plot stability selection path, using a high eps for early stopping
# of the path, to save computation time
alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
eps=0.05)
plt.figure()
# We plot the path as a function of alpha/alpha_max to the power 1/3: the
# power 1/3 scales the path less brutally than the log, and enables to
# see the progression along the path
hg = plt.plot(alpha_grid[1:] ** .333, scores_path[coef != 0].T[1:], 'r')
hb = plt.plot(alpha_grid[1:] ** .333, scores_path[coef == 0].T[1:], 'k')
ymin, ymax = plt.ylim()
plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
plt.ylabel('Stability score: proportion of times selected')
plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
plt.axis('tight')
plt.legend((hg[0], hb[0]), ('relevant features', 'irrelevant features'),
loc='best')
###########################################################################
# Plot the estimated stability scores for a given alpha
# Use 6-fold cross-validation rather than the default 3-fold: it leads to
# a better choice of alpha:
# Stop the user warnings outputs- they are not necessary for the example
# as it is specifically set up to be challenging.
with warnings.catch_warnings():
warnings.simplefilter('ignore', UserWarning)
lars_cv = LassoLarsCV(cv=6).fit(X, y)
# Run the RandomizedLasso: we use a paths going down to .1*alpha_max
# to avoid exploring the regime in which very noisy variables enter
# the model
alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
trees = ExtraTreesRegressor(100).fit(X, y)
# Compare with F-score
F, _ = f_regression(X, y)
plt.figure()
for name, score in [('F-test', F),
('Stability selection', clf.scores_),
('Lasso coefs', np.abs(lars_cv.coef_)),
('Trees', trees.feature_importances_),
]:
precision, recall, thresholds = precision_recall_curve(coef != 0,
score)
plt.semilogy(np.maximum(score / np.max(score), 1e-4),
label="%s. AUC: %.3f" % (name, auc(recall, precision)))
plt.plot(np.where(coef != 0)[0], [2e-4] * n_relevant_features, 'mo',
label="Ground truth")
plt.xlabel("Features")
plt.ylabel("Score")
# Plot only the 100 first coefficients
plt.xlim(0, 100)
plt.legend(loc='best')
plt.title('Feature selection scores - Mutual incoherence: %.1f'
% mi)
plt.show()
1.9 Feature selection ------ tree-based feature selection
- Tree-based estimators can compute feature importances, which in turn can be used to discard irrelevant features:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=2,
random_state=0,
shuffle=False)
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
Output
Feature ranking:
1. feature 1 (0.295902)
2. feature 2 (0.208351)
3. feature 0 (0.177632)
4. feature 3 (0.047121)
5. feature 6 (0.046303)
6. feature 8 (0.046013)
7. feature 7 (0.045575)
8. feature 4 (0.044614)
9. feature 9 (0.044577)
10. feature 5 (0.043912)
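The ranking above only orders the features; to actually discard the unimportant ones, the fitted forest can be passed to SelectFromModel (a small sketch continuing from the block above; the default "mean" threshold is just one possible choice):
# Sketch: select features via the fitted forest's feature_importances_.
# prefit=True reuses the forest trained above; "mean" is the default heuristic.
from sklearn.feature_selection import SelectFromModel

sfm_forest = SelectFromModel(forest, threshold="mean", prefit=True)
X_important = sfm_forest.transform(X)
print('original shape:', X.shape)        # (1000, 10)
print('selected shape:', X_important.shape)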
# Applied to the Olivetti faces (face recognition) dataset
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.ensemble import ExtraTreesClassifier
# Number of cores to use to perform parallel fitting of the forest model
n_jobs = 1
# Load the faces dataset
data = fetch_olivetti_faces()
X = data.images.reshape((len(data.images), -1))
y = data.target
mask = y < 5 # Limit to 5 classes
X = X[mask]
y = y[mask]
# Build a forest and compute the pixel importances
print("Fitting ExtraTreesClassifier on faces data with %d cores..." % n_jobs)
t0 = time()
forest = ExtraTreesClassifier(n_estimators=1000,
max_features=128,
n_jobs=n_jobs,
random_state=0)
forest.fit(X, y)
print("done in %0.3fs" % (time() - t0))
importances = forest.feature_importances_
importances = importances.reshape(data.images[0].shape)
# Plot pixel importances
plt.matshow(importances, cmap=plt.cm.hot)
plt.title("Pixel importances with forests of trees")
plt.show()