from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
import pandas as pd
# Configure how pandas prints frames to the console for this script.
pd.set_option("display.max_columns", None)  # None: no limit on printed columns
pd.set_option("display.max_rows", None)  # None: no limit on printed rows
# Use the fully-qualified option name: an abbreviated pattern such as
# "max_colwidth" only resolves while it matches exactly one registered option.
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.width", 1000)
# Read both splits, indexed by PassengerId, and combine them so the steps
# below are applied to all rows at once.
train = pd.read_csv("data/train.csv", index_col="PassengerId")
test = pd.read_csv("data/test.csv", index_col="PassengerId")
df = pd.concat([train, test], sort=False)


def _extract_title(full_name):
    """Take the second comma-separated piece of the name, keep what precedes
    its first period, and strip surrounding whitespace."""
    second_piece = full_name.split(",")[1]
    return second_piece.split(".")[0].strip()


# Fill missing fields
embarked_mode = df["Embarked"].mode()[0]
df["Embarked"] = df["Embarked"].fillna(embarked_mode)
fare_median = df["Fare"].quantile(0.5)
df["Fare"] = df["Fare"].fillna(fare_median)
# From here on "Name" holds only the extracted piece, not the full name.
df["Name"] = df["Name"].apply(_extract_title)
df["Cabin"] = df["Cabin"].fillna("NoCabin")
# Missing ages take the mean age of the rows sharing the same extracted Name.
ages_by_name = df.groupby("Name")["Age"]
filled_age = ages_by_name.transform(lambda ages: ages.fillna(ages.mean()))
# Bin the continuous columns into 10 intervals each.
df["Age"] = pd.cut(filled_age, 10)
df["Fare"] = pd.cut(df["Fare"], 10)
# Standardize fields
# One-hot encode the text columns.
dummies = pd.get_dummies(df[["Name", "Sex", "Embarked"]])
# Replace the remaining non-numeric columns with integer codes.
df["Age"] = pd.factorize(df["Age"])[0]
df["Ticket"] = pd.factorize(df["Ticket"])[0]
df["Fare"] = pd.factorize(df["Fare"])[0]
df["Cabin"] = pd.factorize(df["Cabin"])[0]
datas = pd.concat([df, dummies], axis=1)
# Keep only the columns whose names match the pattern.
# NOTE(review): no alternative in the pattern matches "Age", so the Age codes
# computed above are dropped here - confirm that this is intended.
datas = datas.filter(regex="Survived|Pclass|Name_.*|Sex_.*|SibSp|Parch|Ticket|Fare|Cabin|Embarked_.*")
scaler = StandardScaler()
# Scale one column at a time; fit_transform refits the scaler for each column.
# This runs before the rows are split by Survived below, so the statistics
# are computed over every row in `datas`.
for c in ["Pclass", "SibSp", "Parch", "Ticket", "Fare", "Cabin"]:
    datas[c] = scaler.fit_transform(datas[[c]])
# Rows with a non-null Survived are used for fitting; rows where it is null
# are the ones to predict.
train = datas[datas["Survived"].notnull()]
test = datas[datas["Survived"].isnull()]
y = train.loc[:, "Survived"]
X = train.drop(columns="Survived")
test_X = test.drop(columns="Survived")
# Base learners for the ensemble. The SVC is built with probability=True
# because soft voting needs predict_proba from every member.
lr = LogisticRegression(solver="lbfgs")
svc = SVC(gamma="scale", probability=True)
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(n_estimators=95, max_depth=6)
xgb = XGBClassifier(n_estimators=295, max_depth=2, learning_rate=0.01)

# The (name, estimator) pairs are passed in the order xgb, rfc, dtc, svc, lr.
base_learners = [
    ("xgb", xgb),
    ("rfc", rfc),
    ("dtc", dtc),
    ("svc", svc),
    ("lr", lr),
]
vc = VotingClassifier(estimators=base_learners, voting="soft")
vc.fit(X, y)
predictions = vc.predict(test_X)

# Two-column frame cast to int, written to CSV without the row index.
submission = pd.DataFrame(
    {'PassengerId': test.index, 'Survived': predictions},
    dtype=int,
)
submission.to_csv("data/submission.csv", index=False)
Kaggle Titanic 0.8
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。