降低维度的方法
- 选择特征
从原有的特征中挑选出对结果影响最大的特征
- 抽取特征
将数据从高维度空间投影到低维度空间
选择特征
移除低方差的特征
import pandas
from sklearn.feature_selection import VarianceThreshold
# Load the customer table and keep the four candidate predictor columns.
df = pandas.read_csv('Data/customer_behavior.csv')
feature_columns = ['bachelor', 'gender', 'age', 'salary']
X = df[feature_columns]

# VarianceThreshold removes every feature whose variance does not exceed
# `threshold`. With threshold=0 (also the library default) only columns that
# never change across the data set are dropped, since a constant feature
# cannot have any influence on the result.
sel = VarianceThreshold(threshold=0)
sel.fit(X)
X_val = sel.transform(X)

# get_support() is a boolean mask over the input columns; use it to recover
# the names of the features that were kept.
kept_mask = sel.get_support()
names = X.columns[kept_mask]
print(names)
单变量特征筛选
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Univariate feature selection: score each candidate feature against the
# target with the chi-squared test and keep the k best-scoring ones.
# NOTE(review): chi2 expects non-negative feature values - confirm that this
# holds for every column selected below.
X = df[['bachelor', 'gender', 'age', 'salary']]
y = df['purchased'].values
clf = SelectKBest(chi2, k=2)  # k=2: keep the 2 features most related to y
clf.fit(X, y)
print(clf.scores_)  # chi-squared score of each feature (higher = more related to y)
# The selector is already fitted above, so only transform here; calling
# fit_transform would recompute the very same scores a second time.
X_new = clf.transform(X)
print(X_new)  # the 2 selected columns (a NumPy array by default, not a DataFrame)
逐步剔除特征(Recursive feature elimination)
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
# Recursive feature elimination (RFE): repeatedly fit the estimator and drop
# the weakest feature until only `n_features_to_select` remain.
# RFE needs an estimator that exposes coef_ or feature_importances_; SVC only
# provides coef_ with a linear kernel, hence kernel='linear'.
clf = SVC(kernel='linear')
rfe = RFE(clf, n_features_to_select=1)  # eliminate down to a single feature
rfe.fit(X_val, y)
# ranking_[i] is the rank of the i-th input column (1 = kept until the end),
# so it is aligned position-by-position with `names`. Pair them directly;
# using the rank value itself as an index would match names to the wrong ranks.
for name, rank in zip(names, rfe.ranking_):
    print(name, rank)
使用随机森林筛选特征
from sklearn.ensemble import RandomForestClassifier
# Fit a small random forest (fixed seed for reproducibility) on the retained
# features and report how much each one contributes to the prediction.
clf = RandomForestClassifier(n_estimators=10, random_state=123)
clf.fit(X_val, y)
# feature_importances_ holds one importance value per input feature, in the
# same order as the columns of X_val (and therefore as `names`).
for name, importance in zip(names, clf.feature_importances_):
    print((name, importance))
特征重要性可视化
import matplotlib.pyplot as plt
# Bar chart of the fitted model's feature importances, one bar per feature,
# with the feature names as x-axis tick labels.
bar_positions = range(len(names))
plt.title('Feature Importance')
plt.bar(bar_positions, clf.feature_importances_)
plt.xticks(bar_positions, names)
plt.show()
抽取特征
主成分分析 PCA
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

# Load the iris data set: X holds the measurements, y the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Compress the features into 2 principal components.
pca = PCA(n_components=2)
pca.fit(X)
X_reduced = pca.transform(X)
print(X_reduced.shape)

# Scatter plot of the samples in the 2-D principal-component space,
# coloured by class label.
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)
plt.show()

# Show how each principal component is built from the original features:
# one line per component, written as a weighted sum of the feature names.
for component in pca.components_:
    terms = ["{:.3f} * {}".format(value, name)
             for value, name in zip(component, iris.feature_names)]
    print('+'.join(terms))
# 0.362 * sepal length (cm)+-0.082 * sepal width (cm)+0.857 * petal length (cm)+0.359 * petal width (cm)
# 0.657 * sepal length (cm)+0.730 * sepal width (cm)+-0.176 * petal length (cm)+-0.075 * petal width (cm)

# Variance explained by each component, as an absolute value and as a ratio.
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
# Result [ 0.92461621 0.05301557]: component 1 explains a far larger share
# of the variance than component 2.
奇异值分解 SVD
import numpy as np
# Imported under an alias because the name `svd` is bound to the TruncatedSVD
# estimator further down; without the alias, re-running the decomposition
# after that assignment would call the estimator object instead of SciPy.
from scipy.linalg import svd as scipy_svd
from sklearn.decomposition import TruncatedSVD

# Factor X into three matrices whose product reproduces X. SciPy returns the
# right singular vectors already transposed, so V can be multiplied as-is.
U, S, V = scipy_svd(X, full_matrices=False)
# The singular values come back as a 1-D array; rebuild the diagonal matrix.
S = np.diag(S)
print(U.dot(S).dot(V))  # equals X up to floating-point rounding

# Use scikit-learn's TruncatedSVD to reduce the data to 2 dimensions.
svd = TruncatedSVD(n_components=2)
X_new = svd.fit_transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1], c=y)
plt.show()