1 Packages used:

library(tidyverse)
library(VIM)
library(RColorBrewer)
library(viridis)
library(DT)
library(magrittr)
library(scales)
library(ggstance)
library(mice)
library(stringr)
library(party)
library(caret)
library(ROCR)
library(e1071)
library(randomForest)
library(adabag)

2 Load data:

The datasets given to us are split into two tables - train & test. The logic behind this division is to clean, tidy, impute and explore the important variables in the train set, then build a model to predict survival for the test set. We have survival data for the train dataset but none for the test dataset.

VARIABLE DESCRIPTIONS:

survival - Survival (0 = No; 1 = Yes)

pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)

name - Name

sex - Sex

age - Age

sibsp - Number of Siblings/Spouses Aboard

parch - Number of Parents/Children Aboard

ticket - Ticket Number

fare - Passenger Fare

cabin - Cabin

embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

raw_train <- read_csv("d:/dropbox/r_project/titanic 2017/train.csv", na=c(""))

raw_test <- read_csv("d:/dropbox/r_project/titanic 2017/test.csv", na=c(""))

2.1 Check data

raw_train data has 891 obs. and 12 variables

raw_test data has 418 obs. and 11 variables
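We can verify these dimensions with base R's dim() (a quick check of my own):

dim(raw_train)
## [1] 891  12

dim(raw_test)
## [1] 418  11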

summary (raw_train)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
summary (raw_test)
##   PassengerId         Pclass          Name               Sex           
##  Min.   : 892.0   Min.   :1.000   Length:418         Length:418        
##  1st Qu.: 996.2   1st Qu.:1.000   Class :character   Class :character  
##  Median :1100.5   Median :3.000   Mode  :character   Mode  :character  
##  Mean   :1100.5   Mean   :2.266                                        
##  3rd Qu.:1204.8   3rd Qu.:3.000                                        
##  Max.   :1309.0   Max.   :3.000                                        
##                                                                        
##       Age            SibSp            Parch           Ticket         
##  Min.   : 0.17   Min.   :0.0000   Min.   :0.0000   Length:418        
##  1st Qu.:21.00   1st Qu.:0.0000   1st Qu.:0.0000   Class :character  
##  Median :27.00   Median :0.0000   Median :0.0000   Mode  :character  
##  Mean   :30.27   Mean   :0.4474   Mean   :0.3923                     
##  3rd Qu.:39.00   3rd Qu.:1.0000   3rd Qu.:0.0000                     
##  Max.   :76.00   Max.   :8.0000   Max.   :9.0000                     
##  NA's   :86                                                          
##       Fare            Cabin             Embarked        
##  Min.   :  0.000   Length:418         Length:418        
##  1st Qu.:  7.896   Class :character   Class :character  
##  Median : 14.454   Mode  :character   Mode  :character  
##  Mean   : 35.627                                        
##  3rd Qu.: 31.500                                        
##  Max.   :512.329                                        
##  NA's   :1
colnames_train <- names(raw_train)

colnames_test <- names(raw_test)

Let's check which variables are in the raw_train set but not in raw_test:

setdiff (colnames_train, colnames_test)
## [1] "Survived"

Let's check which variables are in the raw_test set but not in raw_train:

setdiff (colnames_test, colnames_train)
## character(0)

2.2 Bind rows

We will combine the two datasets - when we need to impute values or look for patterns, more observations are better.

data1 <- bind_rows(raw_train, raw_test)

data1$Survived <- as.factor(data1$Survived)
data1$SibSp <- as.factor(data1$SibSp)
data1$Parch <- as.factor(data1$Parch)

3 Missing values

Missing value count per variable in data1:

data1 %>% 
        map_dbl(~sum(is.na(.)))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           1        1014           2
map_dbl(data1, ~mean(is.na(.)) * 100)
## PassengerId    Survived      Pclass        Name         Sex         Age 
##  0.00000000 31.93277311  0.00000000  0.00000000  0.00000000 20.09167303 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##  0.00000000  0.00000000  0.00000000  0.07639419 77.46371276  0.15278839

We know that the test data had 418 obs. and no survival data, which accounts for all of the missing Survived values. Beyond that, there are a lot of missing values in the Cabin and Age variables.

# aggr_col<- viridis (n = 4, alpha = 0.5,  begin = 0, end =1, option = "D" )

aggr_col <- c('lightseagreen', 'red')

aggr (data1, col = aggr_col, combined = T, sortVars = T, sortCombs = T, cex.axis = .8)

The above plot displays all existing combinations of missing (red) and observed (lightseagreen) values. For example, it reveals rows where Cabin is missing together with Age and Fare, whereas in the rows where Embarked is missing, Cabin, Age and Fare all have data. We are not concerned with Survived's missing observations since they all come from the test dataset.

4 Feature Engineering

4.1 Extracting Title from name

The Sex variable doesn't tell us whether a person is married or single, but the title in their name does. Title can also hint at whether someone is travelling with family.

Let's extract the title from the names.

data1$Title <- str_replace_all(data1$Name, '(.*, )|(\\..*)', "")

table(data1$Sex,data1$Title)
##         
##          Capt Col Don Dona  Dr Jonkheer Lady Major Master Miss Mlle Mme
##   female    0   0   0    1   1        0    1     0      0  260    2   1
##   male      1   4   1    0   7        1    0     2     61    0    0   0
##         
##           Mr Mrs  Ms Rev Sir the Countess
##   female   0 197   2   0   0            1
##   male   757   0   0   8   1            0
print("Unique Titles are: "); unique(data1$Title)
## [1] "Unique Titles are: "
##  [1] "Mr"           "Mrs"          "Miss"         "Master"      
##  [5] "Don"          "Rev"          "Dr"           "Mme"         
##  [9] "Ms"           "Major"        "Lady"         "Sir"         
## [13] "Mlle"         "Col"          "Capt"         "the Countess"
## [17] "Jonkheer"     "Dona"

We will replace Capt, Col, Don, Dr, Jonkheer, Major, Master, Rev, Sir and the Countess with Others. Also, "Mme", "Ms", "Lady" and "Mlle" are effectively variants of Miss for our purposes, so we will replace them with Miss.

replace_var <- c("Mme", "Ms", "Lady", "Mlle")

data1$Title <- gsub(paste(replace_var, collapse = "|"), "Miss", data1$Title)

# Anchor the pattern so "Dona" is matched whole instead of leaving "Othersa" behind
data1$Title <- gsub("^(Capt|Col|Don|Dona|Dr|Jonkheer|Major|Master|Rev|Sir|the Countess)$", "Others", data1$Title)

4.2 Family numbers

Another variable we can create from given data is the size of the family.

sibsp - Number of Siblings/Spouses Aboard

parch - Number of Parents/Children Aboard

Large Family is more than 4 people, Mid Family is more than 1 but fewer than 5, and Single is exactly 1.

data1 <- data1 %>% 
        # SibSp and Parch are factors at this point, so convert via as.character() --
        # as.numeric() on a factor returns level indices rather than the underlying
        # counts, which would inflate Fam_size and leave no "Single" passengers
        mutate(Fam_size = as.numeric(as.character(SibSp)) + as.numeric(as.character(Parch)) + 1) %>% 
        mutate(Family = case_when(Fam_size == 1 ~ "Single",
                                  Fam_size > 1 & Fam_size < 5 ~ "Mid Family",
                                  Fam_size > 4 ~ "Large Family"))

table(data1$Family)

There are a lot of observations missing from Cabin - about 1,014 of them. I am not sure how to deal with this, so I'll just extract the first letter of the Cabin from the available observations and keep it for future analysis.

data1$Cabin_initial <- str_sub(data1$Cabin, 1,1)

unique(data1$Cabin_initial)
## [1] NA  "C" "E" "G" "D" "A" "B" "F" "T"

5 Dealing with missing values

The data can be missing for many reasons. Generally, there are 3 steps required in dealing with missing data:

  1. Find the missing data
  2. Investigate the cause of missing data &
  3. Replace the missing values with sensible data or delete the missing data rows.

We do not know the cause of missing data in variables - Cabin, Age, Embarked and Fare. However, for this analysis we will treat those variables as Missing Completely At Random (MCAR) which means there is no systematic reason as to why the data are missing.

So far we know where the missing data are and have decided to treat them as MCAR. How do we replace or delete the missing values?
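One more way to see the missingness structure (a supplementary check on my part, using the already-loaded mice package) is md.pattern(), which prints one row per observed combination of missing and non-missing cells:

md.pattern(data1[c("Age", "Fare", "Cabin", "Embarked")])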

5.1 Imputing missing values

  1. First, let us look at the missing values of the Embarked variable. We know from our missing data plot (Missing values) that no other variables are missing in the rows where Embarked is missing.
embarked_missing <- data1 %>% 
        filter(is.na(Embarked))

datatable(embarked_missing)

There are two observations with missing Embarked values. We also know that there are three Ports of Embarkation -> C = Cherbourg; Q = Queenstown; S = Southampton. The table above shows that both passengers were in Pclass 1, paid a Fare of $80, and shared Ticket no. 113572 and Cabin B28.

data1 %$% 
        unique(Cabin) %>% 
        summary()
##    Length     Class      Mode 
##       187 character character

There are 187 unique Cabin numbers.

data1 %$%
        unique(Ticket) %>% 
        summary()
##    Length     Class      Mode 
##       929 character character

There are 929 unique ticket numbers.

We know that there are three passenger classes - 1, 2 & 3. Let us check their fares.

data1 %>% 
        filter(!is.na(Embarked)) %>% 
        ggplot(aes(x = as.factor(Pclass), y = Fare, fill = as.factor(Pclass)))+
        geom_boxplot()+
        facet_wrap(~as.factor(Embarked), scales = "free")+
        labs(x = "Pclass")+
        scale_y_continuous(labels = scales::dollar)

We can see from the box plot that the Pclass 1 median fare is approximately $80 when embarked from C, which is Cherbourg. Let's confirm it.

data1 %>% 
        filter(Pclass == 1, Embarked == "C") %>% 
        summarise(Pclass1_median = median(Fare, na.rm=T))
## # A tibble: 1 × 1
##   Pclass1_median
##            <dbl>
## 1        76.7292

I think it is sensible to assign "C" to those two passengers whose Embarked values were missing.

data1$Embarked[data1$PassengerId == 62 | data1$PassengerId == 830] <- "C"
  2. We also know that PassengerId 1044 has a missing Fare value. Let's have a look:
fare_na <- data1 %>% 
        filter(is.na(Fare))
datatable(fare_na)

PassengerId 1044 has Pclass 3 and embarked from S = Southampton.

data1 %>% 
        filter(Pclass == 3, Embarked == "S") %>% 
        ggplot(aes(x = Fare, y = ..density..))+
        geom_density(show.legend = F, fill = "peachpuff3")+
        geom_vline(aes(xintercept = median(Fare, na.rm = T)),linetype = "dashed", lwd = 1, col = "red")+
        scale_x_continuous(labels = scales::dollar)+
        labs(title = "Density Plot - $Fares for Pclass = 3 & Embarked = 'S' ")

data1 %>% 
        filter(Pclass == 3, Embarked == "S") %>% 
        ggplot(aes(x = Pclass, y = Fare))+
        geom_boxplot()+
        labs(x = "Pclass 3", title = "Box Plot - $Fares for Pclass = 3 & Embarked = 'S' ")

median_fare_p3_S <- data1 %>% 
        filter(Pclass == 3, Embarked == "S") %$%
        median(Fare, na.rm=T)

paste0("The median fare for Pclas 3 & Embarked = 'S' is $", median_fare_p3_S)
## [1] "The median fare for Pclas 3 & Embarked = 'S' is $8.05"

In the density plot the area under the curve is one; the curve peaks at a density of roughly 0.15 right around $8.05, the median fare (dashed red line) for Pclass 3 passengers embarked from S. I think it is sensible to replace the missing Fare with this median, $8.05.

data1$Fare[data1$PassengerId == 1044] <- median_fare_p3_S
  3. The Age variable:
# names(data1)

var_factors <- c("PassengerId", "Pclass", "Sex", "Embarked", "Title", "Fam_size", "Family")

data1[var_factors] <- map_df(data1[var_factors], ~as.factor(.))

We’ll use MICE imputation method to impute missing Age variables.

Info: https://cran.r-project.org/web/packages/mice/mice.pdf

We’ll use Pclass, SibSp, Parch, Fare, Embarked, Title, Fam_size variables as predictors to impute missing values in Age.

set.seed(1)

# names(data1)

data1$Age_org <- data1$Age

data1$Age_imp <- is.na(data1$Age)

mice_m1 <- data1 %>% 
        select(Pclass, SibSp, Parch, Fare, Embarked, Title, Fam_size, Age) %>% 
        mice(method = "rf")

mice_m1_output <- complete(mice_m1)

data1$Age <- mice_m1_output$Age

data1 %>% 
        ggplot(aes( x= Age_org, fill = Age_imp))+
        geom_histogram(col = "black", show.legend = F)+
        labs(title = "Histogram of Age")

data1 %>% 
        ggplot(aes(x = Age, fill = Age_imp))+
        geom_histogram(col = "black")+
        labs(title = "Histogram of Age with imputed value highlighted")

The shape of the histogram has not changed significantly after imputing the missing values. I think it's sensible to use the imputed values.
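As an extra sanity check (my addition, not part of the original workflow), mice also provides a lattice densityplot() method for the mids object that overlays the density of the imputed Age values on the observed ones; similar shapes again support keeping the imputation:

densityplot(mice_m1, ~Age)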

5.2 Data Exploration

Let us split data1 back into the original train and test sets, naming them train_edited & test_edited. They are exactly the same as the originals apart from the imputed values and the new features.

train_edited <- data1[1:891,]
test_edited <- data1[892:1309, ]

Sex ratio vs survival:

prop.table(table(train_edited$Survived, train_edited$Sex),2)
##    
##        female      male
##   0 0.2579618 0.8110919
##   1 0.7420382 0.1889081

~74% of all the females in the train data survived, whereas only ~19% of all males survived. That is a huge difference!

pclass_Sex <- train_edited %>% 
        group_by(Survived, Pclass, Sex, Family, Title) %>% 
        summarise(n=n())

datatable(pclass_Sex, caption = "Survivor Table")
pclass_Sex %>% 
        ggplot(aes(x = Pclass, y = n))+
        geom_bar(aes(fill = Title), stat="identity")+
        facet_wrap(Survived~Sex, scales = "free_y")

pclass_Sex %>% 
        ggplot(aes(x = Pclass, y = n))+
        geom_bar(aes(fill = Family), stat="identity")+
        facet_wrap(Survived~Sex)

The above plots clearly show that there are a lot more female survivors than male; in all 3 classes female survivors outnumber male survivors. If you were female you had a better chance of survival in Pclass 1 & 2, and a good chance if you were single or in a mid-size family.

Let's confirm visually that small families survived better than large families:

train_edited %>% 
        ggplot(aes(x = Fam_size, fill = Survived))+
        geom_bar(position = "dodge")+
        labs(x = "Family Size")

Age vs Survival Boxplot

train_edited %>% 
        ggplot(aes(x = Age, y = Survived, fill = Sex))+
        geom_boxploth()+
        geom_jitter(alpha = 0.09)+
        labs(x = "Age")

Among those who did not survive, females were on average much younger than males.

datatable(train_edited %>% 
        group_by(Survived, Sex) %>% 
        summarise(median_age = median(Age), Count = n()))

6 Prediction

Let's predict which passengers survived on the Titanic. I'll build two models, using a conditional inference tree and randomForest.

6.1 Using conditional inference tree (ctree)

We'll split the training set - train_edited - into trainset & testset using a 70/30 ratio.

split.data <- function(data, p = .7, s = 007) {
        set.seed(s)                                     # reproducible split
        n <- nrow(data)
        train_index <- sample(1:n, size = round(p*n))   # random p-share of row indices
        trainset <- data[train_index, ]
        testset <- data[-train_index, ]
        list(trainset = trainset, testset = testset)
}



allset <- split.data(train_edited)


trainset <- allset$trainset
testset <- allset$testset
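As an aside, since caret is already loaded, createDataPartition() offers a stratified alternative to the random split above: it samples within each Survived class so both subsets keep similar class proportions. A minimal sketch (not used in what follows; strat_index, trainset_strat and testset_strat are names of my own):

set.seed(007)
strat_index <- createDataPartition(train_edited$Survived, p = 0.7, list = FALSE)[, 1]
trainset_strat <- train_edited[strat_index, ]
testset_strat <- train_edited[-strat_index, ]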

We'll build our ctree model using Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + Fam_size + Age + Family

trainset.ctree <- ctree(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + Fam_size + Age + Family, trainset)

Conditional inference tree of the train_edited set:

plot(trainset.ctree)

The above tree diagram confirms what we already knew from our data exploration. It identifies Title as the most important variable: if you were female travelling in Pclass 1 & 2, you had a much better chance of survival. If you were male, the worst case scenario was to travel in Pclass 2 & 3.

Let's validate the predictive power with a confusion matrix:

Since we split the train_edited dataset into trainset and testset, we know the survival status of the testset passengers. We can use this information to check our model's predictive power.

ctree_predict <- predict(trainset.ctree, testset)

conf_matrix_ctree<- confusionMatrix(ctree_predict, testset$Survived)

conf_matrix_ctree
## $positive
## [1] "0"
## 
## $table
##           Reference
## Prediction   0   1
##          0 138  24
##          1  25  80
## 
## $overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.164794e-01   6.147867e-01   7.647412e-01   8.610376e-01   6.104869e-01 
## AccuracyPValue  McnemarPValue 
##   2.964914e-13   1.000000e+00 
## 
## $byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8466258            0.7692308            0.8518519 
##       Neg Pred Value            Precision               Recall 
##            0.7619048            0.8518519            0.8466258 
##                   F1           Prevalence       Detection Rate 
##            0.8492308            0.6104869            0.5168539 
## Detection Prevalence    Balanced Accuracy 
##            0.6067416            0.8079283 
## 
## $mode
## [1] "sens_spec"
## 
## $dots
## list()
## 
## attr(,"class")
## [1] "confusionMatrix"
accuracy_ctree <- round(conf_matrix_ctree$overall["Accuracy"] * 100, 2)

paste0("The confusion matrix overall accuracy is ", accuracy_ctree, "% when using the conditional inference tree")
## [1] "The confusion matrix overall accuracy is 81.65% when using the conditional inference tree"

Overall accuracy tells us how often the classifier is correct. ~82%, not bad!
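We can recompute that figure by hand from the confusion matrix counts above, as correct predictions divided by all predictions:

(138 + 80) / (138 + 24 + 25 + 80)
## [1] 0.8164794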

6.2 randomForest

randomForest is an ensemble learning method that grows many decision trees, each producing its own prediction. The ensemble then combines the individual predictions into a single result; for classification this is done by majority vote.

trainset.rf <- randomForest(Survived ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + Fam_size + Age + Family, data=trainset, importance = T)

# Note: predicting back on trainset measures in-sample accuracy, which is
# optimistic compared with the held-out testset evaluation used for ctree above
rf_predict <- predict(trainset.rf, trainset)

conf_rf <- confusionMatrix(rf_predict, trainset$Survived)

conf_rf
## $positive
## [1] "0"
## 
## $table
##           Reference
## Prediction   0   1
##          0 381  41
##          1   5 197
## 
## $overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   9.262821e-01   8.391104e-01   9.028925e-01   9.455268e-01   6.185897e-01 
## AccuracyPValue  McnemarPValue 
##   2.157280e-70   2.463327e-07 
## 
## $byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9870466            0.8277311            0.9028436 
##       Neg Pred Value            Precision               Recall 
##            0.9752475            0.9028436            0.9870466 
##                   F1           Prevalence       Detection Rate 
##            0.9430693            0.6185897            0.6105769 
## Detection Prevalence    Balanced Accuracy 
##            0.6762821            0.9073889 
## 
## $mode
## [1] "sens_spec"
## 
## $dots
## list()
## 
## attr(,"class")
## [1] "confusionMatrix"
rf_accuracy <- round(conf_rf$overall["Accuracy"] * 100, 2)

paste0("The confusion matrix overall accuracy is ", rf_accuracy, "% when using randomForest")
## [1] "The confusion matrix overall accuracy is 92.63% when using randomForest"
plot(trainset.rf, main = "Error rates by class")
legend('topright', legend = colnames(trainset.rf$err.rate), col=1:3, fill=1:3)

The above plot shows the classification error rate as more trees are added. The green line shows the error rate for survivors, the red line for the dead, and the black line the overall out-of-bag (OOB) error. The plot is telling us that we are more successful predicting death than survival.
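The exact error rates after the final tree can be read straight off the model's err.rate matrix (columns: OOB, class 0, class 1) - a small check of my own:

tail(trainset.rf$err.rate, 1)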

datatable(importance(trainset.rf), caption = "Relative Variable Importance")
varImpPlot(trainset.rf, main = "Plot of relative variable importance as measured by randomForest")


The above plot agrees with what ctree found: Title is the most important variable.

7 Final prediction

# predictions <- predict(trainset.rf, test_edited)
# 
# solution <- data.frame(PassengerID = test_edited$PassengerId, Survived = predictions)
# 
# write.csv(solution, file = "Titanic_Survival_pred_rf.csv", row.names = F)

## 0.77990 kaggle score

After submitting my predictions I got a Kaggle score of 0.77990. I think it's not a bad score, and there is always room for model tuning to get a better one.

8 References:

  1. https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic/notebook

  2. Machine Learning with R Cookbook - Yu-Wei Chiu