一、泰坦尼克号数据准备
import pandas as pd
import numpy as np
#选择对结果影响最大的K个特征
from sklearn.feature_selection import SelectKBest
#卡方检验,作为SelectKBest的参数
from sklearn.feature_selection import chi2
# Read the Titanic training CSV and keep only the columns used below.
# The trailing .copy() gives df its own data, independent of the frame
# that read_csv returned.
df = pd.read_csv(
    r"D:\node\nd\Pandas_study\pandas_test\titanic_train.csv"
)[
    [
        "PassengerId",
        "Survived",
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
    ]
].copy()
print(df.head())
二、数据清理和转换
1、查看是否有空值的列
# DataFrame.info() writes its summary (dtypes, non-null counts) directly to
# stdout and returns None, so wrapping it in print() only appended a stray
# "None" line after the report.
df.info()
2、给Age列填充中位数
# Fill any missing Age values with the column median (note: median, not mean).
age_median = df["Age"].median()
df["Age"] = df["Age"].fillna(age_median)
3、将性别列变成数字
# Encode Sex as integers (male -> 0, female -> 1) in one vectorised pass.
# Series.map builds a fresh numeric column. Writing ints into the text column
# through .loc instead leaves a mixed object column, and is rejected outright
# when pandas stores the column with its dedicated string dtype.
# NOTE: any value other than "male"/"female" becomes NaN.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
print(df.head())
4、给Embarked列填充空值,字符串转换成数字
# Show the distinct Embarked values before encoding; unique() also reports
# NaN when entries are missing.
a = df["Embarked"].unique()
print(a)
# Encode the embarkation codes as integers: S -> 1, C -> 2, Q -> 3, and
# missing -> 0. Mapping first and filling afterwards means the integer 0 is
# only ever written into a numeric column (filling a text column with 0 is
# rejected under pandas' dedicated string dtype), and the final astype(int)
# gives a clean integer column instead of a mixed object one.
# NOTE: any code other than S/C/Q is also treated as missing (0).
embarked_codes = {"S": 1, "C": 2, "Q": 3}
df["Embarked"] = df["Embarked"].map(embarked_codes).fillna(0).astype(int)
print(df.head())
四、将特征列和结果列拆分开
# Target column: take "Survived" out of df so it is not scored as a feature.
y = df["Survived"]
del df["Survived"]
# Feature columns: everything that remains. X is the same object as df,
# not a copy.
X = df
print(y.head())
print(X.head())
五、使用卡方检验选择topK的特征
# Score every feature against the target with the chi-squared test
# (score_func=chi2). k is how many top-scoring features to keep; passing the
# full column count keeps them all, so the fit is used here only to obtain
# a score for each column.
bestfeatures = SelectKBest(score_func=chi2, k=X.shape[1])
fit = bestfeatures.fit(X, y)
print(fit)
六、按照重要性顺序打印特征列表
1、返回每一列分数
# The fitted selector exposes one score per input column through ``scores_``;
# wrap those scores in a single-column DataFrame.
df_scores = pd.DataFrame(data=fit.scores_)
print(df_scores)
2、把每一列的列名变成新的df
# Show the feature column labels, then put the same labels into a
# single-column DataFrame, one row per feature.
print("X这个df的列是:", X.columns)
df_columns = pd.DataFrame(data=X.columns)
print(df_columns)
3、合并两个df
# Place the scores and the feature names side by side (column-wise concat),
# so that each row pairs a score with the feature it belongs to.
df_feature_scores = pd.concat((df_scores, df_columns), axis="columns")
print(df_feature_scores)
4、设置合并后的df列名,查看最终结果
# Give the merged frame readable column labels: the score comes first,
# followed by the name of the feature it was computed for.
df_feature_scores.columns = pd.Index(["Score", "feature_name"])
print(df_feature_scores)
5、对数据进行降序排列,可以得知哪些因素对结果有影响
# Rank the features from highest to lowest score.
df_sort = df_feature_scores.sort_values("Score", ascending=False)
print(df_sort)