Xin Ma
09/30/2016

Contents

  1. Introduction

  2. Feature Engineering

  3. Predictions

  4. Conclusion

1. Introduction

1.1 Libraries

The following packages are used for this project.

library("reshape2") # data restructure
library("plyr") # data manipulation
library("dplyr") # data manipulation
library("ggplot2") # display graphics
library("mice") # missing value imputation
library("VIM") # missing value imputation graphics
library("PerformanceAnalytics") # correlation graphics
library("randomForest") # random forest classification
library("stepPlr") # penalized classification
library("caTools") # boosted logistic regression
library("caret") # classification
library("caretEnsemble") # ensemble

1.2 Data Loading

Load the full dataset. A table of variable descriptions follows at the end of this subsection.

full = read.csv(file = '/Users/maxin/Desktop/titanic/full.csv', header = T, stringsAsFactors = T, na.strings = "") 

Then tweak the variable types: Name is each passenger's name, so it should be character rather than factor, while the other categorical variables are coded as factors. The numeric codes for Class and Survival are also recoded into descriptive labels.

full$Class[full$Class == 1] = 'upper'
full$Class[full$Class == 2] = 'middle'
full$Class[full$Class == 3] = 'lower'

full$Survival[full$Survival == 1] = 'survive'
full$Survival[full$Survival == 0] = 'die'

full$Name = as.character(full$Name)
full$Survival = as.factor(full$Survival)
full$Class = as.factor(full$Class)
str(full)
## 'data.frame':    1309 obs. of  9 variables:
##  $ Survival     : Factor w/ 2 levels "die","survive": 2 2 1 1 1 2 2 1 2 1 ...
##  $ Name         : chr  "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
##  $ Class        : Factor w/ 3 levels "lower","middle",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Gender       : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
##  $ Age          : num  29 0.917 2 30 25 ...
##  $ SiblingSpouse: int  0 1 1 1 1 0 1 0 2 0 ...
##  $ ParentChild  : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ Fare         : num  211 152 152 152 152 ...
##  $ Embark       : Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...

Features and their descriptions:

feature        description
-------        -----------
Survival       factor, 2 levels: 'die', 'survive'
Name           character: Surname, Title. Given Name (or Nickname)
Class          factor, cabin class, 3 levels: 'upper', 'middle', 'lower'
Gender         factor, 2 levels: 'female', 'male'
Age            numeric; 263 missing values
SiblingSpouse  integer, number of siblings or spouses of this passenger aboard Titanic
ParentChild    integer, number of parents or children of this passenger aboard Titanic
Fare           numeric, ticket fare; 1 missing value
Embark         factor, port of embarkation, 3 levels: 'Southampton', 'Cherbourg', 'Queenstown'; 2 missing values
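
As a quick check of the missing-value counts quoted above (a sketch, not part of the original write-up):

colSums(is.na(full))  # expect Age 263, Fare 1, Embark 2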

2. Feature Engineering

2.1 Existing Features

Before we do any predictive modelling, let's first explore the feature profiles and how they relate to the probability of survival.

Age distribution, by cabin

In the middle and lower cabins, most passengers are \( 15 - 40 \) years old, since these classes largely consist of ship staff and servants. In the upper cabin, however, the age distribution is more even.

p_age_by_class = ggplot(data = filter(full, !is.na(Age))) + 
                 geom_density(aes(group = Class, color = Class, x = Age)) + 
                 scale_color_discrete(name = 'Passenger Class', labels = c('upper cabin', 'middle cabin', 'lower cabin'), breaks = c('upper', 'middle', 'lower')) + 
                 labs(title = 'Age distribution on Titanic, by passenger class')
(p_age_by_class)

[Figure: Age distribution on Titanic, by passenger class]

Age distribution, by gender

p_age_by_gender = ggplot(data = filter(full, !is.na(Age))) + 
                  geom_density(aes(group = Gender, color = Gender, x = Age)) + 
                  scale_color_manual(values = c('red', 'blue')) + 
                  labs(title = 'Age distribution on Titanic, by gender')
(p_age_by_gender)

[Figure: Age distribution on Titanic, by gender]

Fare distribution, by cabin

p_fare_by_class = ggplot(data = filter(full, !is.na(Fare) )) + 
                  geom_point(aes(x = Class, y = Fare), alpha = 0.5, color = 'black', size = 1) + 
                  geom_violin(aes(x = Class, y = Fare, color = Class), alpha = 0.1)  + 
                  coord_flip() + 
                  scale_color_discrete(name = 'Passenger Class', labels = c('upper cabin', 'middle cabin', 'lower cabin'), breaks = c('upper', 'middle', 'lower')) + 
                  labs(title = 'Fare structure on Titanic, by cabin class') + 
                  scale_y_log10()
(p_fare_by_class)
## Warning: Removed 17 rows containing non-finite values (stat_ydensity).

[Figure: Fare structure on Titanic, by cabin class]

Fare distribution, by embark port

p_fare_by_embark = ggplot(data = filter(full, !is.na(Embark) & !is.na(Fare))) + 
                   geom_point(aes(x = Embark, y = Fare), alpha = 0.5, color = 'black', size = 1) + 
                   geom_violin(aes(x = Embark, y = Fare, color = Embark), alpha = 0.1)  + 
                   coord_flip() + scale_color_discrete(name = 'Embarkation') + 
                   labs(title = 'Fare structure on Titanic, by embark port') + 
                   scale_y_continuous(trans = 'log2')
(p_fare_by_embark)
## Warning: Removed 17 rows containing non-finite values (stat_ydensity).

[Figure: Fare structure on Titanic, by embark port]

Survival or not, by gender and cabin

p_survival_by_gender_by_class = ggplot(full) + 
                                geom_bar(aes(x = Survival, fill = Survival), alpha = 1) + 
                                facet_wrap(~Gender + Class) + 
                                labs(title = 'Survival status, by gender and by cabin class')
(p_survival_by_gender_by_class)

[Figure: Survival status, by gender and by cabin class]

2.2 Missing Values

This dataset contains missing values in Age (263 missing), Fare (1 missing), and Embark (2 missing). Without further ado, assume they are Missing At Random. For Fare and Embark we impute directly: Embark with 'Cherbourg' and Fare with the median fare of comparable passengers (same class and embarkation port). For Age, however, we resort to predictive imputation.

full$Embark[c(169,285)] = 'Cherbourg'
full$Fare[1226] = median(filter(full, Embark == 'Southampton' & Class == 'lower')$Fare, na.rm = T)
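
One way to sanity-check these direct imputations (a sketch, run before the two assignments above; rows 169 and 285 are the passengers with missing Embark, per the code) is to compare their fares with per-port, per-class fare medians:

full[c(169, 285), c('Class', 'Fare', 'Embark')]  # the two missing-Embark rows
full %>% filter(!is.na(Embark)) %>% group_by(Embark, Class) %>%
  summarise(median_fare = median(Fare, na.rm = TRUE))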

I use the mice package (Multivariate Imputation by Chained Equations) to impute Age. The missing values are imputed by iteratively running the "predictive mean matching" algorithm, and I then take the row-wise median across the 5 completed datasets (the mice default, m = 5) as the imputed Age values.

set.seed(606)
# impute with predictive mean matching; mice defaults to m = 5 imputed datasets
mice_mod = mice(full[, !names(full) %in% c('Name', 'LifeStage', 'Survival')], method = 'pmm')
# 'repeated' lays the imputed datasets out side by side (columns Age.1, ..., Age.5)
mice_output = complete(mice_mod, 'repeated')
#imputed_age = as.data.frame(complete(mice_mod, 2)$Age)
# take the row-wise median of the five imputed Age columns
imputed_age = as.data.frame(apply(select(mice_output, matches("Age.")), 1, median))
names(imputed_age) = 'Age'
full$ImputeAge = imputed_age$Age
ages = melt(full[, c('ImputeAge', 'Age')])
## No id variables; using all as measure variables
p_original_vs_imputed_age = ggplot(ages) +
                            geom_histogram(aes(x = value, y = (..count..)/sum(..count..)), color = 'pink', alpha = 0.5, bins = 20) + 
                            facet_wrap(~variable) +
                            labs(title = 'Comparison: original age vs imputed age distributions', x = 'Age', y = 'Percent')
(p_original_vs_imputed_age)       
## Warning: Removed 263 rows containing non-finite values (stat_bin).

[Figure: Comparison of original vs imputed age distributions]
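
Beyond the visual comparison, a quick numeric check (a sketch, not in the original analysis) confirms the imputed ages track the originals:

summary(full$Age)        # original, 263 NAs
summary(full$ImputeAge)  # imputed, fully observed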

2.3 New Features

Travel size: does each passenger travel alone or with company?

Here we create a new feature TravelSize as \( TravelSize = SiblingSpouse + ParentChild + 1 \). Note that this does not account for other types of companions such as nannies, in-laws, close friends, or neighbors.

full$TravelSize = full$SiblingSpouse + full$ParentChild + 1
p_survival_by_travel = ggplot(full) +
                       geom_bar(aes(x = TravelSize, fill = Survival), position = 'dodge', width = 0.5) +
                       labs(title = 'Survival status, by passenger travel size', x = 'passenger travelling accompany size') + 
                       scale_x_continuous(breaks = 1:11) 
(p_survival_by_travel)     

[Figure: Survival status, by passenger travel size]

From the plot, it can be seen that survival probability dwindles sharply when a passenger is travelling alone or in a party of five or more. So it makes sense to collapse the TravelSize variable into a categorical variable Company indicating travel-party size.

full$Company[full$TravelSize == 1] = 'alone'
full$Company[full$TravelSize > 1 & full$TravelSize < 5] = 'small'
full$Company[full$TravelSize > 4] = 'large'
full$Company = factor(full$Company, levels = c('alone', 'small', 'large'))

p_survival_by_company = ggplot(full) +
                        geom_bar(aes(x = Company, fill = Survival), position = 'dodge', width = 0.5) + 
                        labs(title = 'Survival status, by passenger company', x = 'passenger company') 
(p_survival_by_company)

[Figure: Survival status, by passenger company]

Life stage, will this affect survival rates?

There were not enough lifeboats to carry all the passengers when Titanic began to sink. Who would be the luckier ones? Some people sacrificed themselves to save spots for others; most infants survived, and many men died. So I construct a new variable indicating life stage: is this person a grown-up man, a teenager, an infant, and so on?

full$LifeStage[full$ImputeAge <= 1] = 'infant'
full$LifeStage[full$ImputeAge > 1 &  full$ImputeAge <= 16] = 'teenage'
full$LifeStage[full$ImputeAge > 16 &  full$ImputeAge <= 50 & full$Gender == 'male'] = 'grownup-man'
full$LifeStage[full$ImputeAge > 16 &  full$ImputeAge <= 50 & full$Gender == 'female'] = 'grownup-woman'
full$LifeStage[full$ImputeAge > 50 & full$Gender == 'male'] = 'elder-man'
full$LifeStage[full$ImputeAge > 50 & full$Gender == 'female'] = 'elder-woman'

full$LifeStage = factor(full$LifeStage, levels = c('infant', 'teenage', 'grownup-man', 'grownup-woman', 'elder-man', 'elder-woman'))
p_survival_by_lifestage = ggplot(full) +
                        geom_bar(aes(x = LifeStage, fill = Survival), position = 'dodge', width = 0.5) + 
                        labs(title = 'Survival status, by passenger life stage', x = 'passenger life stage') 
(p_survival_by_lifestage)

[Figure: Survival status, by passenger life stage]
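
A quick tabulation (a sketch) verifies that every passenger received a life stage; any NA would indicate a gap in the age/gender conditions above:

table(full$LifeStage, useNA = 'ifany')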

Titles, a proxy for passenger socio-economic status

Titles can partially reflect a person's socio-economic status, which should in turn affect his or her chances of survival.

full$Titles = gsub(pattern = '(.*, )|(\\..*)', replacement= '',x=full$Name)
unique(full$Titles)
##  [1] "Miss"         "Master"       "Mr"           "Mrs"         
##  [5] "Col"          "Mme"          "Dr"           "Major"       
##  [9] "Capt"         "Lady"         "Sir"          "Mlle"        
## [13] "Dona"         "Jonkheer"     "the Countess" "Don"         
## [17] "Rev"          "Ms"

For generality, I consolidate some variants of honorific titles into common ones; for example, Mlle is the French title for Miss, so I change it to Miss. At the same time, some titles such as the Countess indicate nobility, so I construct another variable denoting a passenger's nobility. The table below shows the consolidated honorific titles on Titanic.

nobles = c('Lady', 'the Countess', 'Sir', 'Jonkheer')
full$Noble = rep('civic', 1309)
full$Noble[full$Titles %in% nobles] = 'noble'
full$Noble = factor(full$Noble)
full$Titles[full$Titles %in% c('Dona', 'Mme', 'Ms', 'Lady', 'the Countess')] = 'Mrs'
full$Titles[full$Titles %in% c('Mlle')] = 'Miss'
full$Titles[full$Titles %in% c('Rev', 'Major', 'Sir', 'Jonkheer', 'Col', 'Capt', 'Don', 'Dr')] = 'Mr'
full$Titles = factor(full$Titles)
table(full$Titles)
## 
## Master   Miss     Mr    Mrs 
##     61    262    783    203
# quick practice run: fit a random forest on a preliminary train/test split
set.seed(616)
train_idx = sample(x = nrow(full), replace=F, size= 891)
train = full[train_idx,]
test = full[-train_idx,]
set.seed(616)
rf_model = randomForest(data = train, Survival ~ Class + Gender + ImputeAge + SiblingSpouse + ParentChild + Fare + Embark + Company + LifeStage + Titles + Noble, importance = T)
varImpPlot(rf_model)

[Figure: random forest variable importance (varImpPlot)]

imp = importance(rf_model)  # avoid shadowing the importance() function
VarImportance = data.frame(Features = rownames(imp), Importance = round(imp[, 'MeanDecreaseGini'], 2))
RankImportance = mutate(.data = VarImportance, Rank = paste("#", dense_rank(desc(Importance))))
pred = predict(rf_model, select(test, -Survival))
table(pred, test$Survival)
##          
## pred      die survive
##   die     240      57
##   survive  23      98
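
From the table above, the holdout accuracy of this practice model is (240 + 98) / 418 ≈ 0.81; a one-line check (a sketch):

mean(pred == test$Survival)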

3. Predictions

# drop unused columns: Name, raw Age (replaced by ImputeAge), TravelSize
# (collapsed into Company), and Noble
predictors = names(full)[!names(full) %in% c("Name", "Age", "TravelSize", "Noble")]

set.seed(616)
train_idx = sample(x = nrow(full), replace=F, size= 900)
train = full[train_idx,]
test = full[-train_idx,]

train = train[, predictors]
test = test[, predictors]

3.1 Method 1: Stochastic Gradient Boosting

set.seed(616)
ctrl = trainControl(method="repeatedcv",number=5, repeats=5, classProbs= TRUE, summaryFunction= twoClassSummary)
gbmGrid = expand.grid(interaction.depth = seq(1,7,by=2),
                      n.trees = seq(50, 250, by = 25),
                      shrinkage = c(0.01, 0.1),
                      n.minobsinnode = 5)

gbmTune = train(Survival ~ ., 
                 data = train,
                 method = "gbm",
                 metric = "ROC",
                 preProc = c("center", "scale"), 
                 verbose = FALSE,
                 trControl = ctrl,
                 tuneGrid = gbmGrid)

gbmPreds = predict(gbmTune, test)
confusionMatrix(gbmPreds, test$Survival)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction die survive
##    die     228      45
##    survive  32     104
##                                           
##                Accuracy : 0.8117          
##                  95% CI : (0.7704, 0.8485)
##     No Information Rate : 0.6357          
##     P-Value [Acc > NIR] : 5.658e-15       
##                                           
##                   Kappa : 0.5858          
##  Mcnemar's Test P-Value : 0.1715          
##                                           
##             Sensitivity : 0.8769          
##             Specificity : 0.6980          
##          Pos Pred Value : 0.8352          
##          Neg Pred Value : 0.7647          
##              Prevalence : 0.6357          
##          Detection Rate : 0.5575          
##    Detection Prevalence : 0.6675          
##       Balanced Accuracy : 0.7875          
##                                           
##        'Positive' Class : die             
## 
ggplot(gbmTune) + theme(legend.position = "top") + labs(title = "Stochastic Gradient Boosting 5-fold CV tuning parameter(s): tree depth, iterations, learning rate")

[Figure: Stochastic Gradient Boosting 5-fold CV tuning profile: tree depth, iterations, learning rate]
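
Since the models are tuned on ROC, a held-out AUC complements the accuracy above; a minimal sketch using colAUC from caTools (loaded earlier; an extra check rather than part of the original analysis):

gbmProbs = predict(gbmTune, test, type = "prob")
colAUC(gbmProbs$survive, test$Survival)  # area under the ROC curve on the holdout set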

3.2 Method 2: Boosted Logistic Regression

set.seed(616)
ctrl = trainControl(method="repeatedcv",number=5, repeats=5, classProbs= TRUE, summaryFunction= twoClassSummary)

blrGrid = expand.grid(nIter = seq(5, 100, 5))

blrTune = train(Survival~., 
                data= train, 
                method="LogitBoost", 
                preProc = c("center", "scale"), 
                verbose=FALSE, 
                trControl=ctrl, 
                metric="ROC", 
                tuneGrid=blrGrid)

blrPreds = predict(blrTune, test)
confusionMatrix(blrPreds, test$Survival)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction die survive
##    die     223      53
##    survive  37      96
##                                           
##                Accuracy : 0.78            
##                  95% CI : (0.7366, 0.8192)
##     No Information Rate : 0.6357          
##     P-Value [Acc > NIR] : 2.174e-10       
##                                           
##                   Kappa : 0.5138          
##  Mcnemar's Test P-Value : 0.1138          
##                                           
##             Sensitivity : 0.8577          
##             Specificity : 0.6443          
##          Pos Pred Value : 0.8080          
##          Neg Pred Value : 0.7218          
##              Prevalence : 0.6357          
##          Detection Rate : 0.5452          
##    Detection Prevalence : 0.6748          
##       Balanced Accuracy : 0.7510          
##                                           
##        'Positive' Class : die             
## 
ggplot(blrTune) + theme(legend.position = "top")+  labs(title = "Boosted Logistic Regression 5-fold CV tuning parameter(s): iterations")

[Figure: Boosted Logistic Regression 5-fold CV tuning profile: iterations]

3.3 Method 3: Support Vector Classifier

set.seed(616)
ctrl = trainControl(method="repeatedcv",number=5, repeats=5, classProbs= TRUE, summaryFunction= twoClassSummary)

svmGrid = expand.grid(C = sapply(-3:3, exp), sigma = sapply(-3:1, exp))

svmTune = train(Survival ~., 
                data=train, 
                method="svmRadial", 
                preProc = c("center", "scale"),
                metric="ROC", 
                trControl=ctrl, 
                tuneGrid=svmGrid)

svmPreds = predict(svmTune, test)
confusionMatrix(svmPreds, test$Survival)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction die survive
##    die     230      45
##    survive  30     104
##                                           
##                Accuracy : 0.8166          
##                  95% CI : (0.7757, 0.8529)
##     No Information Rate : 0.6357          
##     P-Value [Acc > NIR] : 8.879e-16       
##                                           
##                   Kappa : 0.5954          
##  Mcnemar's Test P-Value : 0.106           
##                                           
##             Sensitivity : 0.8846          
##             Specificity : 0.6980          
##          Pos Pred Value : 0.8364          
##          Neg Pred Value : 0.7761          
##              Prevalence : 0.6357          
##          Detection Rate : 0.5623          
##    Detection Prevalence : 0.6724          
##       Balanced Accuracy : 0.7913          
##                                           
##        'Positive' Class : die             
## 
ggplot(svmTune) + theme(legend.position = "top") + labs(title = "Support Vector Classifier 5-fold CV tuning parameter(s): cost, sigma")

[Figure: Support Vector Classifier 5-fold CV tuning profile: cost, sigma]

3.4 Method 4: Flexible Discriminant Analysis

set.seed(616)
ctrl = trainControl(method="repeatedcv",number=5, repeats=5, classProbs= TRUE, summaryFunction= twoClassSummary)

fdaGrid = expand.grid(nprune = seq(2,32,by = 6), degree = 1:5)

fdaTune = train(Survival ~., 
                data=train, 
                method="fda",
                preProc = c("center", "scale"), 
                metric="ROC", 
                trControl=ctrl, 
                tuneGrid=fdaGrid)

fdaPreds = predict(fdaTune, test)
confusionMatrix(fdaPreds, test$Survival)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction die survive
##    die     228      44
##    survive  32     105
##                                          
##                Accuracy : 0.8142         
##                  95% CI : (0.773, 0.8507)
##     No Information Rate : 0.6357         
##     P-Value [Acc > NIR] : 2.259e-15      
##                                          
##                   Kappa : 0.5918         
##  Mcnemar's Test P-Value : 0.207          
##                                          
##             Sensitivity : 0.8769         
##             Specificity : 0.7047         
##          Pos Pred Value : 0.8382         
##          Neg Pred Value : 0.7664         
##              Prevalence : 0.6357         
##          Detection Rate : 0.5575         
##    Detection Prevalence : 0.6650         
##       Balanced Accuracy : 0.7908         
##                                          
##        'Positive' Class : die            
## 
ggplot(fdaTune) + theme(legend.position = "top") + labs(title = "Flexible Discriminant Analysis 5-fold CV tuning parameter(s): number of retained terms, degree")

[Figure: Flexible Discriminant Analysis 5-fold CV tuning profile: number of retained terms, degree]
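
All four models were tuned with the same seed and identical repeated-CV settings, so their resampling results are directly comparable; a minimal sketch with caret's resamples() (an extra comparison, not part of the original write-up):

model_list = list(gbm = gbmTune, blr = blrTune, svm = svmTune, fda = fdaTune)
resamps = resamples(model_list)
summary(resamps)                  # ROC / Sens / Spec summaries per model
dotplot(resamps, metric = "ROC")  # visual comparison of cross-validated ROC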

4. Conclusion