Apply machine learning techniques to predict what types of people were likely to survive the Titanic disaster.

Models used : logistic regression, decision tree, and random forest.

Achieved 80% accuracy in prediction, ranking in the top 11% of 5,601 competitors as of the end of October 2016.

Load Packages

library(mice)
library(randomForest)
library(caTools)
library(rpart)
library(rpart.plot)
library(dplyr)
library(ggplot2)

Import Data

train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)

Combine the training and test datasets for feature engineering

full  <- bind_rows(train, test)

Look at the full dataset

str(full)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

Survived - Survival (0 = No, 1 = Yes)

Pclass - Passenger Class

Name - Passenger Name

Sex - Sex

Age - Age

SibSp - Number of Siblings/Spouses Aboard

Parch - Number of Parents/Children Aboard

Ticket - Ticket Number

Fare - Fare

Cabin - Cabin

Embarked - Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

Feature Engineering

Create new variable : Title

full$Title <- gsub('(.*, )|(\\..*)', '', full$Name)
officer <- c("Capt", "Col", "Major", "Dr", "Rev")
Royality <- c("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady")
full$Title[full$Title %in% officer] <- "Officer"
full$Title[full$Title %in% Royality] <- "Royality"
full$Title[full$Title == "Mlle"] <- "Miss"
full$Title[full$Title == "Mme"] <- "Mrs"
full$Title[full$Title == "Ms"] <- "Mrs"
table(full$Title)
## 
##   Master     Miss       Mr      Mrs  Officer Royality 
##       61      262      757      200       23        6
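
The survival pattern across the new titles can be visualised the same way as the other engineered features (an illustrative plot, not part of the original output):

ggplot(full[1:891,], aes(x = Title, fill = factor(Survived))) + 
  geom_bar(position = 'dodge')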

Create new variable : Family Size

full$FamilySize <- full$SibSp + full$Parch + 1
ggplot(full[1:891,], aes(x = FamilySize, fill = factor(Survived))) + 
  geom_bar(stat = 'count', position = 'dodge') +
  scale_x_continuous(breaks = c(1:11))

Create new variable : Family Size Category

full <- full %>% 
  mutate(FamilySizeCat = ifelse(FamilySize == 1, "Single", ifelse(FamilySize >= 5, "Large", "Small")))
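
As a quick check on the new category (illustrative, not part of the original output), the same kind of survival bar plot can be drawn for FamilySizeCat:

ggplot(full[1:891,], aes(x = FamilySizeCat, fill = factor(Survived))) + 
  geom_bar(position = 'dodge')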

Create new variable : Single Fare

fare.table <- full %>% 
  group_by(Ticket) %>% 
  summarise(Fare = mean(Fare, na.rm = TRUE), 
            TicketCount = n(), 
            SingleFare = Fare/TicketCount)
full <- left_join(full, fare.table, by = "Ticket")
full$Fare.y <- NULL
ggplot(full[1:891,], aes(x = SingleFare, fill = factor(Survived))) + 
  geom_histogram(position = 'dodge') +
  xlim(0, 50)

Create new variables : Cabin and Deck

full$Cabin[full$Cabin == ""] <- "U000"
full$Deck <- substr(full$Cabin, 1, 1)
full$Deck[full$Deck == "T"] <- "A"
table(full$Deck, full$Pclass)
##    
##       1   2   3
##   A  23   0   0
##   B  65   0   0
##   C  94   0   0
##   D  40   6   0
##   E  34   4   3
##   F   0  13   8
##   G   0   0   5
##   U  67 254 693
ggplot(full[1:891,], aes(x = Deck, fill = factor(Survived))) + 
  geom_bar(position = 'dodge')

Impute Fare.x
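
The filter below conditions on third class and an unknown cabin because that is the profile of the single passenger with a missing fare, which can be confirmed first (an illustrative check):

full %>% 
  filter(is.na(Fare.x)) %>% 
  select(PassengerId, Pclass, Cabin, Embarked, Ticket)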

full %>% 
  filter(Pclass == 3 & Cabin == "U000") %>% 
  group_by(Pclass, Cabin, Embarked) %>% 
  summarise(mean = mean(SingleFare, na.rm = TRUE))
## Source: local data frame [3 x 4]
## Groups: Pclass, Cabin [?]
## 
##   Pclass Cabin Embarked     mean
##    (int) (chr)    (chr)    (dbl)
## 1      3  U000        C 6.829740
## 2      3  U000        Q 7.575783
## 3      3  U000        S 7.392706
full$Fare.x[is.na(full$Fare.x)] <- 7.392706
full$SingleFare[is.na(full$SingleFare)] <- 7.392706

Impute Embarked

full %>% 
  filter(Pclass == 1) %>% 
  group_by(Embarked) %>% 
  summarise(mean = mean(SingleFare, na.rm = TRUE))
## Source: local data frame [4 x 2]
## 
##   Embarked     mean
##      (chr)    (dbl)
## 1          40.00000
## 2        C 38.32432
## 3        Q 30.00000
## 4        S 30.39188
full$Embarked[full$Embarked == ""] <- "C"

Before we impute Age, take a look at which factors might affect the age distribution

ggplot(full, aes(x = as.factor(Pclass), y = Age)) + geom_boxplot()

ggplot(full, aes(x = as.factor(Sex), y = Age)) + geom_boxplot()

ggplot(full, aes(x = as.factor(SibSp), y = Age)) + geom_boxplot()

ggplot(full, aes(x = as.factor(Parch), y = Age)) + geom_boxplot()

ggplot(full, aes(x = as.factor(Deck), y = Age)) + geom_boxplot()

ggplot(full, aes(x = Fare.x, y = Age)) + 
  geom_point(aes(color = factor(Survived))) + 
  geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1))

ggplot(full, aes(x = as.factor(TicketCount), y = Age)) + geom_boxplot()

Run imputation with selected variables

full.impute.1 <- full %>% select(Pclass, Sex, Age, SibSp, Parch, Title, FamilySize)
full.impute.1 <- complete(mice(full.impute.1))
## 
##  iter imp variable
##   1   1  Age
##   1   2  Age
##   1   3  Age
##   1   4  Age
##   1   5  Age
##   2   1  Age
##   2   2  Age
##   2   3  Age
##   2   4  Age
##   2   5  Age
##   3   1  Age
##   3   2  Age
##   3   3  Age
##   3   4  Age
##   3   5  Age
##   4   1  Age
##   4   2  Age
##   4   3  Age
##   4   4  Age
##   4   5  Age
##   5   1  Age
##   5   2  Age
##   5   3  Age
##   5   4  Age
##   5   5  Age
full.impute.1$Survived <- full$Survived
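
Note that mice() draws random imputations, so the filled-in ages change from run to run. A minimal reproducibility sketch (the seed value is an arbitrary choice):

# to make the imputation reproducible, fix a seed inside mice(), e.g.
# full.impute.1 <- complete(mice(full.impute.1, seed = 129))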

Run another imputation with all variables

full.impute.2 <- full %>% select(Pclass, Sex, Age, SibSp, Parch, Fare.x, Embarked, Title, FamilySize, SingleFare, Deck)
full.impute.2 <- complete(mice(full.impute.2))
## 
##  iter imp variable
##   1   1  Age
##   1   2  Age
##   1   3  Age
##   1   4  Age
##   1   5  Age
##   2   1  Age
##   2   2  Age
##   2   3  Age
##   2   4  Age
##   2   5  Age
##   3   1  Age
##   3   2  Age
##   3   3  Age
##   3   4  Age
##   3   5  Age
##   4   1  Age
##   4   2  Age
##   4   3  Age
##   4   4  Age
##   4   5  Age
##   5   1  Age
##   5   2  Age
##   5   3  Age
##   5   4  Age
##   5   5  Age
full.impute.2$Survived <- full$Survived

Look at the result after imputation

ggplot(full[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram() + ylim(0,120)

ggplot(full.impute.1[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram()

ggplot(full.impute.2[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram()

ggplot(full[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram(aes(y =..density..)) + ylim(0,0.1)

ggplot(full.impute.1[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram(aes(y =..density..)) + ylim(0,0.1)

ggplot(full.impute.2[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram(aes(y =..density..)) + ylim(0,0.1)

Comparing the 1st and 2nd imputations, the 1st one looks closer to the original age distribution, so we will use it for running the models.
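
A quick numeric check (an illustrative sketch, not part of the original output) is to compare summary statistics of the two imputed versions against the observed ages:

summary(full$Age)           # original ages, with NAs
summary(full.impute.1$Age)  # imputation with selected variables
summary(full.impute.2$Age)  # imputation with all variables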

Logistic Regression

Run logistic regression with most variables

full$Age <- full.impute.1$Age
train.impute <- full[1:891,]
test.impute <- full[892:1309,]
lg_fit1 <- glm(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare.x + Embarked + FamilySize + SingleFare + Deck + Title, data = train.impute, family = "binomial")
summary(lg_fit1)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch + 
##     Fare.x + Embarked + FamilySize + SingleFare + Deck + Title, 
##     family = "binomial", data = train.impute)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3815  -0.5562  -0.3686   0.5499   2.4743  
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    19.863520 504.071639   0.039   0.9686    
## Pclass         -0.872740   0.220964  -3.950 7.83e-05 ***
## Sexmale       -14.784280 504.070448  -0.029   0.9766    
## Age            -0.027625   0.008529  -3.239   0.0012 ** 
## SibSp          -0.549119   0.128476  -4.274 1.92e-05 ***
## Parch          -0.338400   0.140148  -2.415   0.0158 *  
## Fare.x          0.002588   0.004444   0.583   0.5602    
## EmbarkedQ      -0.284929   0.395314  -0.721   0.4711    
## EmbarkedS      -0.418922   0.254871  -1.644   0.1002    
## FamilySize            NA         NA      NA       NA    
## SingleFare      0.007233   0.019722   0.367   0.7138    
## DeckB           0.115772   0.755810   0.153   0.8783    
## DeckC          -0.272818   0.702655  -0.388   0.6978    
## DeckD           0.767585   0.767021   1.001   0.3170    
## DeckE           1.089875   0.763804   1.427   0.1536    
## DeckF           0.504067   1.083470   0.465   0.6418    
## DeckG          -0.870956   1.244861  -0.700   0.4842    
## DeckU          -0.340483   0.673794  -0.505   0.6133    
## TitleMiss     -15.345195 504.070708  -0.030   0.9757    
## TitleMr        -3.550740   0.535861  -6.626 3.44e-11 ***
## TitleMrs      -14.504139 504.070772  -0.029   0.9770    
## TitleOfficer   -3.310967   0.830858  -3.985 6.75e-05 ***
## TitleRoyality  -3.400136   1.375808  -2.471   0.0135 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.66  on 890  degrees of freedom
## Residual deviance:  712.49  on 869  degrees of freedom
## AIC: 756.49
## 
## Number of Fisher Scoring iterations: 13
predict_fit1 <- predict(lg_fit1, newdata = train.impute, type = "response")
table(train.impute$Survived, predict_fit1 > 0.5)
##    
##     FALSE TRUE
##   0   484   65
##   1    78  264
(484+264)/891
## [1] 0.8395062

Select only the variables with p-value < 0.05. Note that FamilySize drops out here: its coefficient was NA in the first model because FamilySize = SibSp + Parch + 1, so it is perfectly collinear with variables already in the model.

lg_fit2 <- glm(Survived ~ Pclass + Age + SibSp + Embarked + Parch + Title, data = train.impute, family = "binomial")
summary(lg_fit2)
## 
## Call:
## glm(formula = Survived ~ Pclass + Age + SibSp + Embarked + Parch + 
##     Title, family = "binomial", data = train.impute)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3848  -0.5801  -0.3705   0.5370   2.4849  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    5.896863   0.689258   8.555  < 2e-16 ***
## Pclass        -1.242591   0.142074  -8.746  < 2e-16 ***
## Age           -0.025929   0.008222  -3.154  0.00161 ** 
## SibSp         -0.525550   0.122538  -4.289 1.80e-05 ***
## EmbarkedQ     -0.277634   0.389159  -0.713  0.47559    
## EmbarkedS     -0.470539   0.238882  -1.970  0.04887 *  
## Parch         -0.309174   0.130983  -2.360  0.01826 *  
## TitleMiss     -0.574150   0.492522  -1.166  0.24372    
## TitleMr       -3.565390   0.526791  -6.768 1.30e-11 ***
## TitleMrs       0.175215   0.540416   0.324  0.74577    
## TitleOfficer  -3.383071   0.780935  -4.332 1.48e-05 ***
## TitleRoyality -2.763221   1.072860  -2.576  0.01001 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.66  on 890  degrees of freedom
## Residual deviance:  731.05  on 879  degrees of freedom
## AIC: 755.05
## 
## Number of Fisher Scoring iterations: 5
predict_fit2 <- predict(lg_fit2, newdata = train.impute, type = "response")
table(train.impute$Survived, predict_fit2 > 0.5)
##    
##     FALSE TRUE
##   0   486   63
##   1    81  261
(486+261)/891
## [1] 0.8383838

The 1st model, with a greater number of variables, seems to have slightly better accuracy on the training set. It will be interesting to see whether it still has better predictive power when applied to the test set, or whether it is simply overfitting.

Decision Tree

dt.fit1 <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare.x + Embarked + FamilySize + SingleFare + Deck + Title, data = train.impute, method = "class")
rpart.plot(dt.fit1)

predict_df_fit1 <- predict(dt.fit1, newdata = train.impute)
table(train.impute$Survived, predict_df_fit1[,2] > 0.5)
##    
##     FALSE TRUE
##   0   521   28
##   1   104  238
(521+238)/891
## [1] 0.8518519

The training accuracy from the decision tree seems too good to be true, especially compared to the logistic regression and the random forest model we will see next; it could be overfitting. But the tree does give us a good logical path on what types of people were likely to survive.
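
One way to probe the overfitting concern (a sketch, not part of the original analysis) is to look at the cross-validated error in rpart's complexity-parameter table and prune the tree at the cp value with the lowest xerror:

printcp(dt.fit1)   # xerror is the cross-validated error for each subtree size
best.cp <- dt.fit1$cptable[which.min(dt.fit1$cptable[, "xerror"]), "CP"]
dt.fit1.pruned <- prune(dt.fit1, cp = best.cp)
rpart.plot(dt.fit1.pruned)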

Random Forest

train.impute$Pclass <- as.factor(train.impute$Pclass)
train.impute$SibSp <- as.factor(train.impute$SibSp)
train.impute$Parch <- as.factor(train.impute$Parch)
train.impute$Embarked <- as.factor(train.impute$Embarked)
train.impute$Title <- as.factor(train.impute$Title)
train.impute$Sex <- as.factor(train.impute$Sex)
train.impute$FamilySize <- as.factor(train.impute$FamilySize)
train.impute$Deck <- as.factor(train.impute$Deck)
train.impute.rf <- train.impute %>%
  select(Survived, Pclass, Age, SibSp, Parch, Embarked, Title, Sex, FamilySize, Deck, Fare.x, SingleFare)
rf_fit1 <- randomForest(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare.x + Embarked + FamilySize + SingleFare + Deck + Title, data = train.impute.rf)
predict.rf.fit1 <- predict(rf_fit1, data = train.impute.rf)
table(train.impute$Survived, predict.rf.fit1 > 0.5)
##    
##     FALSE TRUE
##   0   496   53
##   1    91  251
(496+251)/891
## [1] 0.8383838
varImpPlot(rf_fit1,type=2)
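
The numeric scores behind the plot can also be listed to see which variables to keep (illustrative; for this regression-style forest the measure is IncNodePurity):

sort(importance(rf_fit1)[, 1], decreasing = TRUE)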

Choose the five most important variables and run random forest again

rf_fit2 <- randomForest(Survived ~ Sex + Age + Fare.x + SingleFare + Title, data = train.impute.rf)
predict.rf.fit2 <- predict(rf_fit2, data = train.impute.rf)
table(train.impute$Survived, predict.rf.fit2 > 0.5)
##    
##     FALSE TRUE
##   0   487   62
##   1    91  251
(487+251)/891
## [1] 0.8282828

It is a little surprising that the random forest results are not as good as those from logistic regression and the decision tree. Note, though, that predict() was called here without a newdata argument (data = is ignored), so randomForest returns its out-of-bag predictions; those behave more like an out-of-sample estimate than the in-sample fits reported for the other two models. But again, all of these numbers come from the training data.
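
To see the difference (an illustrative sketch), compare the out-of-bag predictions with in-sample predictions on the same rows:

pred.oob   <- predict(rf_fit1)                              # out-of-bag predictions
pred.train <- predict(rf_fit1, newdata = train.impute.rf)   # in-sample predictions
mean((pred.oob > 0.5) == train.impute.rf$Survived)          # OOB accuracy
mean((pred.train > 0.5) == train.impute.rf$Survived)        # training accuracy (optimistic)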

Since I do not have the final results on the test set, it is difficult to say which model gives the best result. One thing that can be done is to split the original training set into training and validation sets and evaluate the models on the held-out portion, as sketched below.
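
A minimal sketch of that idea using sample.split() from caTools (loaded above but not yet used); the split ratio, the seed, and the reduced set of predictors are arbitrary choices made to keep the example self-contained:

set.seed(123)
train.all <- full[1:891,]                       # rows with known Survived
split     <- sample.split(train.all$Survived, SplitRatio = 0.7)
train.sub <- subset(train.all, split == TRUE)
valid.sub <- subset(train.all, split == FALSE)

lg_valid   <- glm(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare.x,
                  data = train.sub, family = "binomial")
valid.pred <- predict(lg_valid, newdata = valid.sub, type = "response")
mean((valid.pred > 0.5) == valid.sub$Survived)  # hold-out accuracy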