# Read the Titanic training and test CSV files, keeping text columns as
# character vectors instead of factors.
train <- read.csv("C:/Users/袁良杰/Desktop/Titanic/train.csv", stringsAsFactors = FALSE)

test <- read.csv("C:/Users/袁良杰/Desktop/Titanic/test.csv", stringsAsFactors = FALSE)

# Stack both sets into one data frame so feature engineering is applied to
# every row; bind_rows() fills columns present in only one input with NA.
# NOTE(review): bind_rows() comes from dplyr and no library() call is visible
# in this section -- confirm the package is attached before this point.
full <- bind_rows(train, test)

# Inspect the structure of the combined data (was fused onto the previous
# statement, which made the line unparseable).
str(full)

## 'data.frame':    1309 obs. of  12 variables:

##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...

##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...

##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...

##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...

##  $ Sex        : chr  "male" "female" "female" "female" ...

##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...

##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...

##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...

##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...

##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...

##  $ Cabin      : chr  "" "C85" "" "C123" ...

##  $ Embarked   : chr  "S" "C" "S" "S" ...





# Derive the passenger's title from Name: the pattern deletes everything up to
# and including ", " as well as everything from the first "." onwards.
full[["Title"]] <- gsub(
  pattern = "(.*, )|(\\..*)",
  replacement = "",
  x = full[["Name"]]
)

# Cross-tabulate sex against the extracted titles.
table(full[["Sex"]], full[["Title"]])


##          Capt Col Don Dona  Dr Jonkheer Lady Major Master Miss Mlle Mme

##   female    0   0   0    1   1        0    1     0      0  260    2   1

##   male      1   4   1    0   7        1    0     2     61    0    0   0


##           Mr Mrs  Ms Rev Sir the Countess

##   female   0 197   2   0   0            1

##   male   757   0   0   8   1            0


# Titles that are pooled into a single "Rare.Title" category below.
rare.title <- c(
  "Capt", "Col", "Don", "Dona", "Dr", "Jonkheer", "Lady", "Major",
  "Rev", "Sir", "the Countess"
)

# Map variant titles onto "Miss" / "Mrs", then pool the rare ones.
full$Title[full$Title %in% c("Mlle", "Ms")] <- "Miss"
full$Title[full$Title %in% "Mme"] <- "Mrs"
full$Title[full$Title %in% rare.title] <- "Rare.Title"

# Re-check the sex-by-title counts after merging.
table(full$Sex, full$Title)


##          Master Miss  Mr Mrs Rare.Title

##   female      0  264   0 198          4

##   male       61    0 757   0         25



# Surname is the part of Name before the first comma or period.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# character vector with exactly one element per passenger, whatever the input.
full$Surname <- vapply(
  full$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][1],
  character(1)
)

# Print the number of distinct surnames (the message text means
# "distinct surnames").
cat(paste(nlevels(factor(full$Surname)), "个不同的姓氏"))

## 875个不同的姓氏


# Family size = the passenger plus their SibSp and Parch counts.
full$Fsize <- 1 + full$SibSp + full$Parch

# Family key: surname and family size joined by "-".
full$Family <- paste0(full$Surname, "-", full$Fsize)


# Proportion of each Survived value per family size, using the first 891 rows
# only. Axis labels read "family size" (x) and "survived vs. died ratio" (y).
ggplot(full[seq_len(891), ], aes(x = Fsize, fill = factor(Survived))) +
  geom_bar(position = "fill") +
  scale_x_continuous(breaks = 1:12) +
  labs(x = "家庭规模", y = "生存与遇难比")

# The plot shows that passengers travelling alone and families of five or more
# have lower survival rates, so family size is binned into three categories.
# Start from an all-NA column so every row has a defined slot.
full$Fsize2 <- NA_character_

full$Fsize2[full$Fsize == 1] <- "single"

# Two to four members (the lower bound was missing in the original condition).
full$Fsize2[full$Fsize > 1 & full$Fsize <= 4] <- "small"

full$Fsize2[full$Fsize > 4] <- "large"

# Mosaic plot of family-size category against survival; the title reads
# "family size and survival rate".
mosaicplot(table(full$Fsize2, full$Survived), main = "家庭规模与生存率", shade = TRUE)


# Count missing values per column. This replaces a data(full, package = "VIM")
# call that only looked for a packaged data set named "full" and warned that
# none exists, without inspecting this data frame at all.
colSums(is.na(full))

# Locate the row whose Fare is missing.
which(is.na(full$Fare))

## [1] 1044

# Row 1044 is a third-class passenger who embarked at port S; the fares of the
# other passengers in that same group are visualised next.

# Fare density for passengers with Pclass 3 and Embarked "S"; the dashed red
# vertical line marks the group's median fare.
ggplot(full[full$Pclass == "3" & full$Embarked == "S", ], aes(x = Fare)) +
  geom_density(alpha = 0.4, fill = "green") +
  geom_vline(
    aes(xintercept = median(Fare, na.rm = TRUE)),
    colour = "red", linetype = 2, lwd = 1
  ) +
  scale_x_continuous(breaks = 0:60)

## Warning: Removed 1 rows containing non-finite values (stat_density).

# Fill the fare of row 1044 with the median fare of that same group.
full$Fare[1044] <- median(
  full$Fare[full$Pclass == "3" & full$Embarked == "S"],
  na.rm = TRUE
)


# Columns to be treated as categorical.
Factor.Vars <- c("PassengerId", "Pclass", "Sex", "Title", "Surname", "Fsize2", "Family")

full[Factor.Vars] <- lapply(full[Factor.Vars], as.factor)

# Multiple imputation with mice on the selected columns (this statement was
# fused onto the previous line, which did not parse). "SibSp" was misspelled
# as "SbiSp", so %in% silently left that column out of the imputation model.
imp <- mice(
  full[, names(full) %in% c("Pclass", "Sex", "Age", "Fare", "Title", "Fsize2", "SibSp", "Parch")],
  seed = 1234
)

# Extract completed data set number 5 from the imputation object.
mice.imp <- complete(imp, action = 5)



# Density-scaled histograms of Age before and after imputation, drawn with the
# same y-axis range so the two shapes can be compared.
hist(full$Age, main = "full$Age", col = "orange", freq = FALSE, ylim = c(0, 0.04))

hist(mice.imp$Age, main = "mice.imp$Age", col = "lightblue", freq = FALSE, ylim = c(0, 0.04))

# Replace Age with the imputed values.
full$Age <- mice.imp$Age



## [1] 0



# Flag each passenger as under 18 or 18-and-over.
# NOTE(review): "Chid" looks like a typo for "Child"; it is left unchanged
# because the label is a runtime factor level that other code may rely on.
full$Aduch <- ifelse(full$Age < 18, "Chid", "Adult")

# Survival counts by age group.
table(full$Aduch, full$Survived)


##           0   1

##   Adult 480 271

##   Chid   69  71

# Store the age group as a factor.
full$Aduch <- factor(full$Aduch)

# Summarise the remaining missing values per column. This replaces a
# data(full, package = "VIM") call that only searched for a packaged data set
# named "full" and emitted a "not found" warning.
colSums(is.na(full))



# Split the combined data back by row position: rows 1-891 become the training
# set and rows 892-1309 the test set.
train <- full[seq_len(891), ]

test <- full[seq(892, 1309), ]



# Fit a random forest classifier for Survived on the engineered features.
# na.roughfix handles any remaining missing predictor values, and
# importance = TRUE makes the model record variable-importance measures.
rf.model <- randomForest(
  factor(Survived) ~ Pclass + Sex + Age + Fare + Title + Fsize2 + SibSp + Parch + Aduch,
  data = train,
  na.action = na.roughfix,
  importance = TRUE
)


# Variable importance, type 2 (reported as MeanDecreaseGini in the output).
importance(rf.model, type = 2)

##        MeanDecreaseGini

## Pclass        33.465766

## Sex           51.314368

## Age           53.151593

## Fare          67.072898

## Title         80.053375

## Fsize2        17.148189

## SibSp         12.992146

## Parch          8.156831

## Aduch          3.940061

# Predict survival for the test rows with the fitted forest.
forest.pred <- predict(rf.model, newdata = test)

# Cross-tabulate test$Survived against the predictions.
# NOTE(review): if Survived is NA for every test row this table is empty --
# confirm whether an evaluation on labelled data was intended.
forest.perf <- table(test$Survived, forest.pred)



# Assemble the prediction table that is written to disk afterwards.
# NOTE(review): the column is named "PassengerID" while the data column is
# "PassengerId" -- confirm which header the consumer of the CSV expects.
answer <- data.frame(
  PassengerID = test$PassengerId,
  Survived = forest.pred
)

write.csv(answer, file = "ans_predict.csv", row.names

