Load the file

From the Kaggle site, https://www.kaggle.com/c/titanic, we can download the train and test set files. The test set is missing the Survived column.

train <- read.csv(file = 'train.csv', header = TRUE, na.strings = '')
test <- read.csv(file = 'test.csv', header = TRUE, na.strings = '')
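
A quick check of the dimensions confirms the split sizes that we rely on later when re-splitting the combined data (891 training rows, 418 test rows):

dim(train)
dim(test)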

## fill in Survived for the test set, then bind the two datasets
test$Survived <- NA
total_dataset <- rbind(train, test)

## quick look at the dataset
summary(total_dataset)
##   PassengerId      Survived          Pclass          Name          
##  Min.   :   1   Min.   :0.0000   Min.   :1.000   Length:1309       
##  1st Qu.: 328   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median : 655   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   : 655   Mean   :0.3838   Mean   :2.295                     
##  3rd Qu.: 982   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :1309   Max.   :1.0000   Max.   :3.000                     
##                 NA's   :418                                        
##      Sex                 Age            SibSp            Parch      
##  Length:1309        Min.   : 0.17   Min.   :0.0000   Min.   :0.000  
##  Class :character   1st Qu.:21.00   1st Qu.:0.0000   1st Qu.:0.000  
##  Mode  :character   Median :28.00   Median :0.0000   Median :0.000  
##                     Mean   :29.88   Mean   :0.4989   Mean   :0.385  
##                     3rd Qu.:39.00   3rd Qu.:1.0000   3rd Qu.:0.000  
##                     Max.   :80.00   Max.   :8.0000   Max.   :9.000  
##                     NA's   :263                                     
##     Ticket               Fare            Cabin             Embarked        
##  Length:1309        Min.   :  0.000   Length:1309        Length:1309       
##  Class :character   1st Qu.:  7.896   Class :character   Class :character  
##  Mode  :character   Median : 14.454   Mode  :character   Mode  :character  
##                     Mean   : 33.295                                        
##                     3rd Qu.: 31.275                                        
##                     Max.   :512.329                                        
##                     NA's   :1
str(total_dataset)
## 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  NA "C85" NA "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
library(tidyverse)
library(mice)
library(caret)

Dataset cleaning

First we look at the variable Name. We want to extract the title from it, as it may encode gender, age, or social status, all of which could be relevant for the prediction.

## parse out the title between the comma and the following period
total_dataset$Title <- str_sub(total_dataset$Name,
                               str_locate(total_dataset$Name, ",")[, 1] + 2,
                               str_locate(total_dataset$Name, "\\.")[, 1] - 1)
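
The same extraction can be written as a single regular expression; a sketch of an equivalent alternative, relying on the lookaround support in stringr's default ICU engine:

## equivalent: grab the text between ", " and the next "."
total_dataset$Title <- str_extract(total_dataset$Name, "(?<=, )[^.]+(?=\\.)")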

## plot to see the frequency
total_dataset %>% 
  group_by(Title) %>% 
  summarise(n = n()) %>% 
  ggplot() +
  geom_bar(aes(x= reorder(Title, desc(n)), y= n), stat = 'identity')
## `summarise()` ungrouping output (override with `.groups` argument)

We see that Mr, Miss, Mrs and Master are by far the most common, while the other titles have negligible frequency, so we can merge the low-frequency ones into these standard titles.

## merge rare titles into the four standard ones ('Master' denotes a boy, so it stays separate)
total_dataset$Title[total_dataset$Title %in% c('Mme', 'Lady', 'Ms', 'the Countess', 'Dona')] <- 'Mrs'
total_dataset$Title[total_dataset$Title %in% c('Capt', 'Don', 'Col', 'Dr', 'Jonkheer', 'Major', 'Rev', 'Sir')] <- 'Mr'
total_dataset$Title[total_dataset$Title %in% c('Mlle')] <- 'Miss'

## after grouping
table(total_dataset$Title)
## 
## Master   Miss     Mr    Mrs 
##     61    262    783    203
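
Since titles such as 'Dr' can apply to either sex, a quick cross-tabulation against Sex is a worthwhile check that the regrouping behaves as expected:

## titles broken down by sex
table(total_dataset$Sex, total_dataset$Title)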

We convert the categorical columns, including the integer-coded Survived and Pclass, into factors.

total_dataset$Survived <- as.factor(total_dataset$Survived)
total_dataset$Pclass <- as.factor(total_dataset$Pclass)
total_dataset$Sex <- as.factor(total_dataset$Sex)
total_dataset$Embarked <- as.factor(total_dataset$Embarked)
total_dataset$Title <- as.factor(total_dataset$Title)
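
The same conversion can be written more compactly with across(); an equivalent alternative, assuming dplyr >= 1.0.0:

## convert all categorical columns in one step
total_dataset <- total_dataset %>% 
  mutate(across(c(Survived, Pclass, Sex, Embarked, Title), as.factor))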

For the numerical variables SibSp and Parch, we create a new variable that combines them into a single family-size measure.

## only consider total family size
total_dataset <- mutate(total_dataset, FamSize = SibSp + Parch)
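
If an explicit factor is preferred over the raw count, family size could also be bucketed; an optional sketch with illustrative (untuned) breakpoints, not used in the rest of this analysis:

## illustrative only: three family-size bands
table(cut(total_dataset$FamSize,
          breaks = c(-Inf, 0, 3, Inf),
          labels = c('alone', 'small', 'large')))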

Impute NA’s

The dataset has many missing values in Age (263), and a few in Fare (1) and Embarked (2). We will use the mice package to impute them.

set.seed(123)
total_dataset_clean <- total_dataset %>% 
  select(-c(PassengerId, Survived, Name, SibSp, Parch)) %>% 
  mice(print = FALSE)
## Warning: Number of logged events: 2

By default mice produces 5 imputed datasets. After comparing how the different sets fill in the NA’s, we choose a sensible one.
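
One quick diagnostic is mice's lattice helper, which overlays observed and imputed distributions (by convention, observed values in blue, imputed in red); a sketch for Age:

## observed vs imputed Age distributions across the 5 imputation sets
densityplot(total_dataset_clean, ~Age)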

dataset <- complete(total_dataset_clean, 1)

Now we select the final columns. Ticket and Cabin could be considered, but Cabin has too many missing values and both largely overlap with the information already carried by Fare, so we drop them for this analysis.

dataset <- dataset %>% 
  select(-c(Ticket, Cabin)) %>% 
  mutate(Survived = total_dataset$Survived)
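
A quick sanity check that the imputation left no missing values in the final columns:

## all counts should be zero (except Survived, which is NA for the test rows)
colSums(is.na(dataset))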

Modelling

Because we will use xgboost, we one-hot encode the categorical variables. Note that dummyVars keeps every factor level by default (fullRank = FALSE), so the encoded columns are linearly dependent; this is why glm warns about a rank-deficient fit further down.

## build the one-hot encoder (levelsOnly = TRUE keeps just the level names as column names)
dummy_var <- dummyVars(~Pclass + Sex + Title,
                       data = dataset,
                       levelsOnly = TRUE)
dataset_dummy <- predict(dummy_var, dataset)

## attach the dummy columns and drop the original factor columns
dataset2 <- dataset %>% 
  bind_cols(as.data.frame(dataset_dummy)) %>% 
  select(-c(Pclass, Sex, Title))
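
If the rank deficiency is a concern, dummyVars can instead produce a full-rank encoding that drops one reference level per factor; a minimal sketch, not used below:

## full-rank alternative: one level per factor becomes the reference
dummy_var_fr <- dummyVars(~ Pclass + Sex + Title,
                          data = dataset,
                          fullRank = TRUE)
head(predict(dummy_var_fr, dataset))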

As the combined data preserves the original order, we can use the row numbers to recover the training and testing sets.

## rows 1:891 came from train.csv, the rest from test.csv
train_dataset <- dataset2[1:nrow(train), ]
test_dataset <- dataset2[(nrow(train) + 1):nrow(dataset2), ]

Set up the train control for the caret models.

## 10-fold CV, repeated 3 times
train_control <- trainControl(method = 'repeatedcv',
                              number = 10,
                              repeats = 3,
                              search = 'grid')

We will try various models, then compare their performance.

## logistic regression
set.seed(1234)
model_lg <- train(Survived ~ .,
                  data = train_dataset,
                  method = 'glm',
                  family = binomial(),
                  trControl = train_control)

## random forest (slow)
set.seed(1234)
model_rf <- train(Survived ~ .,
                  data = train_dataset,
                  method = 'rf',
                  tuneLength = 5,
                  trControl = train_control)

## naive Bayes
set.seed(1234)
model_nb <- train(Survived ~ .,
                  data = train_dataset,
                  method = 'naive_bayes',
                  tuneLength = 5,
                  trControl = train_control)

## knn
set.seed(1234)
model_knn <- train(Survived ~ .,
                   data = train_dataset,
                   method = 'knn',
                   tuneLength = 5,
                   preProcess = c('center', 'scale'),
                   trControl = train_control)

## gradient boosting
set.seed(1234)
model_gbm <- train(Survived ~ .,
                   data = train_dataset,
                   method = 'gbm',
                   tuneLength = 5,
                   trControl = train_control,
                   verbose = FALSE)

## xgboost (slow)
set.seed(1234)
model_xgb <- train(Survived ~ .,
                   data = train_dataset,
                   method = 'xgbTree',
                   tuneLength = 5,
                   trControl = train_control)
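
Each fitted train object stores the hyper-parameters selected by cross-validation in its bestTune element, which is worth a glance before comparing models; for example:

## winning tuning parameters for xgboost
model_xgb$bestTune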

We can use resamples from caret to compare the models, then decide which ones to keep or combine.

## compare models
results <- resamples(list(log=model_lg, rf=model_rf, nb=model_nb, knn = model_knn, xgb=model_xgb, gbm=model_gbm))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: log, rf, nb, knn, xgb, gbm 
## Number of resamples: 30 
## 
## Accuracy 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## log 0.7528090 0.8112870 0.8314607 0.8353979 0.8647855 0.9222222    0
## rf  0.7191011 0.8128788 0.8323970 0.8379873 0.8651685 0.9000000    0
## nb  0.7191011 0.7733657 0.7966037 0.7945907 0.8089888 0.8651685    0
## knn 0.7528090 0.7977528 0.8146067 0.8159689 0.8426966 0.8750000    0
## xgb 0.7888889 0.8314607 0.8426966 0.8462723 0.8634831 0.9325843    0
## gbm 0.7865169 0.8186925 0.8426966 0.8425270 0.8662921 0.9438202    0
## 
## Kappa 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## log 0.4764706 0.5928866 0.6348348 0.6480386 0.7074114 0.8337731    0
## rf  0.3948871 0.5887276 0.6348348 0.6457644 0.7095171 0.7826323    0
## nb  0.4336981 0.5177730 0.5717516 0.5685024 0.5965776 0.7176097    0
## knn 0.4822845 0.5592411 0.5921934 0.6041449 0.6649313 0.7290034    0
## xgb 0.5488127 0.6369323 0.6644218 0.6693879 0.7050655 0.8539387    0
## gbm 0.5348006 0.6025020 0.6591904 0.6602655 0.7129447 0.8789774    0
modelCor(results)
##             log          rf          nb         knn        xgb         gbm
## log  1.00000000 -0.04519476 -0.05458422  0.38706243  0.2793724 -0.06636379
## rf  -0.04519476  1.00000000 -0.05383537 -0.05964235  0.1647165 -0.13359733
## nb  -0.05458422 -0.05383537  1.00000000 -0.03405458 -0.1031276 -0.05869008
## knn  0.38706243 -0.05964235 -0.03405458  1.00000000  0.4181593 -0.07832586
## xgb  0.27937245  0.16471651 -0.10312762  0.41815934  1.0000000 -0.22633993
## gbm -0.06636379 -0.13359733 -0.05869008 -0.07832586 -0.2263399  1.00000000

Accuracy is similar across the models, except naive Bayes, which falls below 80%. The correlation matrix shows the models' resampling results are largely uncorrelated, so combining them by bagging or stacking should be worthwhile.

## box plots of the resampled accuracy and kappa
bwplot(results)

We create a new data frame holding each model's predicted class probabilities, then average them (soft voting) to form the ensemble prediction.

pred_table_voting <- NULL

## model predictions
pred_table_voting$gbm <- predict(model_gbm, newdata = train_dataset, type = 'prob')
pred_table_voting$lg <- predict(model_lg, newdata = train_dataset, type = 'prob')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
pred_table_voting$rf <- predict(model_rf, newdata = train_dataset, type = 'prob')
pred_table_voting$nb <- predict(model_nb, newdata = train_dataset, type = 'prob')
pred_table_voting$knn <- predict(model_knn, newdata = train_dataset, type = 'prob')
pred_table_voting$xgb <- predict(model_xgb, newdata = train_dataset, type = 'prob')

## true labels from the training set
pred_table_voting$Survived <- train_dataset$Survived
pred_table_voting <- as.data.frame(pred_table_voting)

## nb performed worst above, so we leave it out of the average
pred_table_voting <- pred_table_voting %>% 
  mutate(Survived_prob = (gbm.1+lg.1+rf.1+knn.1+xgb.1)/5) %>% 
  mutate(Survived_pred = if_else(Survived_prob>=0.5, 1, 0))

Check the ensemble performance

## confusion matrix
cm_final <- confusionMatrix(as.factor(pred_table_voting$Survived_pred), as.factor(train_dataset$Survived), positive = '1')

cm_final
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 520  71
##          1  29 271
##                                           
##                Accuracy : 0.8878          
##                  95% CI : (0.8652, 0.9077)
##     No Information Rate : 0.6162          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7571          
##                                           
##  Mcnemar's Test P-Value : 4.132e-05       
##                                           
##             Sensitivity : 0.7924          
##             Specificity : 0.9472          
##          Pos Pred Value : 0.9033          
##          Neg Pred Value : 0.8799          
##              Prevalence : 0.3838          
##          Detection Rate : 0.3042          
##    Detection Prevalence : 0.3367          
##       Balanced Accuracy : 0.8698          
##                                           
##        'Positive' Class : 1               
## 
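
For a like-for-like comparison with the ensemble's 0.8878, the individual models' accuracies on the same training data can be computed as follows (a quick sketch; training accuracy is optimistic for every model, the ensemble included):

## training-set accuracy of each individual model
sapply(list(lg = model_lg, rf = model_rf, knn = model_knn,
            xgb = model_xgb, gbm = model_gbm),
       function(m) mean(predict(m, newdata = train_dataset) == train_dataset$Survived))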

The ensemble's training accuracy improves on the cross-validated accuracy of the individual models, though the comparison is optimistic because it is measured on the training data. We now apply the same averaging to the test set.

## probability predictions on the test set (nb again left out)
pred_table_voting2 <- NULL
pred_table_voting2$gbm <- predict(model_gbm, newdata = test_dataset, type = 'prob')
pred_table_voting2$lg <- predict(model_lg, newdata = test_dataset, type = 'prob')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
pred_table_voting2$rf <- predict(model_rf, newdata = test_dataset, type = 'prob')
pred_table_voting2$knn <- predict(model_knn, newdata = test_dataset, type = 'prob')
pred_table_voting2$xgb <- predict(model_xgb, newdata = test_dataset, type = 'prob')

## convert to a data frame
pred_table_voting2 <- as.data.frame(pred_table_voting2)

## average the probabilities and threshold at 0.5
pred_table_voting2 <- pred_table_voting2 %>% 
  mutate(Survived_prob = (gbm.1 + lg.1 + rf.1 + knn.1 + xgb.1) / 5) %>% 
  mutate(Survived = if_else(Survived_prob >= 0.5, 1, 0)) %>%
  mutate(PassengerId = test$PassengerId) %>% 
  select(PassengerId, Survived)

Produce the final CSV output

write.csv(pred_table_voting2, 'Output.final.csv', row.names = FALSE)

The final Kaggle score is 0.77033.