from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
import pandas as pd
# Configure how pandas prints DataFrames: lift the column/row limits and
# allow wider cells and wider output lines.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("max_colwidth", 100),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)
# Load the train and test CSVs (indexed by PassengerId) and stack them so
# that every feature below is engineered once over both sets.
train, test = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in ("data/train.csv", "data/test.csv")
)
df = pd.concat([train, test], sort=False)

# Fill fields
# Missing embarkation port -> most frequent port; missing fare -> median fare.
most_common_port = df["Embarked"].mode()[0]
df["Embarked"] = df["Embarked"].fillna(most_common_port)
median_fare = df["Fare"].quantile(0.5)
df["Fare"] = df["Fare"].fillna(median_fare)
# Keep only the part of each name after the first comma and before the next
# period (presumably the passenger's title -- confirm against the data).
df["Name"] = df["Name"].map(
    lambda full_name: full_name.split(",")[1].split(".")[0].strip()
)
df["Cabin"] = df["Cabin"].fillna("NoCabin")
# Missing ages take the mean age of the rows sharing the same (reduced) Name
# value; Age and Fare are then each cut into 10 bins.
age_filled = df.groupby("Name")["Age"].transform(
    lambda ages: ages.fillna(ages.mean())
)
df["Age"] = pd.cut(age_filled, 10)
df["Fare"] = pd.cut(df["Fare"], 10)
# Encode fields
# One-hot encode the nominal columns.
dummies = pd.get_dummies(df[["Name", "Sex", "Embarked"]])
# Age and Fare hold the interval bins produced by pd.cut. Take the categorical
# codes so the integers follow bin order; pd.factorize would number the bins
# by order of first appearance and scramble that ordering before the column
# is standardized and used as a numeric feature. Missing values map to -1,
# the same sentinel pd.factorize uses.
df["Age"] = df["Age"].cat.codes
df["Fare"] = df["Fare"].cat.codes
# Ticket and Cabin are unordered labels: one arbitrary integer id per
# distinct value is all that is needed.
df["Ticket"] = pd.factorize(df["Ticket"])[0]
df["Cabin"] = pd.factorize(df["Cabin"])[0]
datas = pd.concat([df, dummies], axis=1)
# Keep the label, the numeric features and the one-hot columns.
# NOTE(review): "Age" is not part of this pattern, so the encoded Age column
# is dropped here -- confirm that this is intentional.
datas = datas.filter(regex="Survived|Pclass|Name_.*|Sex_.*|SibSp|Parch|Ticket|Fare|Cabin|Embarked_.*")
# Standardize the numeric feature columns. StandardScaler standardizes each
# column independently, so a single call over all six columns yields the same
# values as fitting the scaler on one column at a time.
scaler = StandardScaler()
scaled_columns = ["Pclass", "SibSp", "Parch", "Ticket", "Fare", "Cabin"]
datas[scaled_columns] = scaler.fit_transform(datas[scaled_columns])
# Rows with a Survived label form the training set; rows whose label is
# missing form the set to predict.
has_label = datas["Survived"].notnull()
train = datas[has_label]
test = datas[~has_label]
y = train["Survived"]
X = train.drop(columns="Survived")
test_X = test.drop(columns="Survived")
# Soft-voting ensemble of five classifiers, fitted on the labeled rows and
# used to predict the unlabeled ones.
# Seed the estimators that draw random numbers so that repeated runs produce
# the same predictions.
RANDOM_STATE = 0
xgb = XGBClassifier(max_depth=2, n_estimators=295, learning_rate=0.01, random_state=RANDOM_STATE)
rfc = RandomForestClassifier(max_depth=6, n_estimators=95, random_state=RANDOM_STATE)
dtc = DecisionTreeClassifier(random_state=RANDOM_STATE)
# probability=True makes SVC expose class probabilities, which soft voting needs.
svc = SVC(probability=True, gamma="scale", random_state=RANDOM_STATE)
lr = LogisticRegression(solver="lbfgs")
vc = VotingClassifier(
    estimators=[("xgb", xgb), ("rfc", rfc), ("dtc", dtc), ("svc", svc), ("lr", lr)],
    voting="soft",
)
vc.fit(X, y)
predictions = vc.predict(test_X)
# Write the submission file: one row per test passenger, both columns cast
# to integers.
submission_columns = {"PassengerId": test.index, "Survived": predictions}
submission = pd.DataFrame(submission_columns, dtype=int)
submission.to_csv("data/submission.csv", index=False)
Kaggle Titanic 0.8
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...