Intro

This is yet another Titanic Kaggle modelling notebook, which has morphed into a demo of using list columns in data frames to simplify and organise the modelling process. Storing the models, predictions and metrics as columns of one tibble makes it easy to apply functional programming (purrr's map family) across all of them at once, which greatly shortens and simplifies the code.
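
As a minimal sketch of the pattern (a toy example on mtcars, not part of the Titanic analysis itself): each row of a tibble holds a fitted model in a list column, and purrr's map functions then operate over the whole column at once.

# toy sketch of the list-column workflow used throughout this notebook
library(tidyverse)

toy_models <- tibble(model_name = c("wt_only", "wt_and_hp"),
                     model = list(lm(mpg ~ wt, data = mtcars),
                                  lm(mpg ~ wt + hp, data = mtcars)))

toy_models %>% 
  mutate(r_squared = map_dbl(model, function(m) summary(m)$r.squared))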

Setup & Functions

library(tidyverse)
library(caret)
library(knitr)
library(ROCR)
titanic_raw <- read_csv("../data/train.csv")
plot_rocr <- function(roc_pred, model_name){
  # ggplot-style ROC plot based on the ROCR package.
  roc_perf <- performance(roc_pred, measure = "tpr", x.measure = "fpr")
  roc_data <- tibble(fpr = roc_perf@x.values[[1]],
                     tpr = roc_perf@y.values[[1]])
  
  ggplot(data = roc_data, aes(fpr, tpr, group = 1)) +
    geom_line(size = 1) +
    theme_minimal() +
    geom_abline(intercept = 0, slope = 1, linetype = 2) +
    annotate("text", x = 0.75, y = 0.25, 
             label = 
               paste("AUC =", 
                     performance(roc_pred, measure = "auc")@y.values[[1]])) +
    labs(title = model_name)
}

get_kappa <- function(model) {
  # extract the mean resampled Kappa from a caret train object
  mean(model$resample$Kappa)
}

get_accuracy <- function(model) {
  # extract the mean resampled Accuracy from a caret train object
  mean(model$resample$Accuracy)
}

map_predict <- function(model, test_data, predict_type = "prob") {
  # thin wrapper around predict() so it can be mapped over a list column of models
  predict(model, newdata = test_data, type = predict_type)
}

map_predict_class <- function(model, test_data){
  # as above, but returns predicted classes rather than class probabilities
  map_predict(model, test_data, "raw")
}

Explore

Check for NA’s

titanic_raw %>% head() %>% kable(caption = "Titanic Survivor Kaggle Dataset")
Titanic Survivor Kaggle Dataset
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NA | S
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Thayer) | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NA | S
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S
5 | 0 | 3 | Allen, Mr. William Henry | male | 35 | 0 | 0 | 373450 | 8.0500 | NA | S
6 | 0 | 3 | Moran, Mr. James | male | NA | 0 | 0 | 330877 | 8.4583 | NA | Q
# Next two do the same thing, check number of NA's
# colSums(is.na(titanic_raw)) 
titanic_raw %>% summarise_all(function(x) sum(is.na(x))) %>% 
  knitr::kable(caption = "NA's for Each Variable")
NA’s for Each Variable
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
0 | 0 | 0 | 0 | 0 | 177 | 0 | 0 | 0 | 0 | 687 | 2
titanic_raw %>% tally() %>%  
  knitr::kable(caption = "Number of Observations")
Number of Observations
n
891

Clean

titanic_clean <- 
  titanic_raw %>% select(-Cabin, -PassengerId, -Ticket) %>% 
  mutate(Age = ifelse(is.na(Age), median(Age, na.rm=TRUE), Age),
         NumRelations = SibSp + Parch,
         NameLength = nchar(Name),
         Title = stringr::str_extract(Name, "(\\S+)\\s*\\.")) %>% 
  filter(!is.na(Embarked)) %>% 
  mutate_at(vars(Survived, Pclass, SibSp, Parch),
            make.names) %>% # make the levels of factors valid names
  mutate_at(vars(Survived, Pclass, Sex, SibSp, Parch, Embarked, Title),
            as.factor)
titanic_train_index <- titanic_clean$Survived %>%
  createDataPartition(p=0.75, list = FALSE, times=1)

titanic_train <- titanic_clean[titanic_train_index, ]
titanic_test <- titanic_clean[-titanic_train_index, ]

Model

Let’s fit the models to the relevant variables, using the same 5-fold cross-validation setup for each. For the GLM we use a binomial family since there are two possible outcomes.

train_control <- 
  trainControl(method="cv", number = 5, savePredictions = TRUE, 
               classProbs = TRUE, preProcOptions = c("center", "scale"))
# NB: centring/scaling would only be applied if preProcess = c("center", "scale")
# were also passed to train(); preProcOptions on its own does not trigger it.

form <- Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
  NumRelations + NameLength + Title

tune_metric <- "Kappa"

models_df <- tibble(model_name =  c("GLM","RF", "SVM", "XGB" ),
                    model = list(
                      GLM = train(form, data = titanic_train,
                                  trControl=train_control,
                                  method="glm", family="binomial", 
                                  metric = tune_metric),
                      RF = train(form, data=titanic_train,
                                 trControl=train_control, 
                                 method="rf", metric = tune_metric),
                      SVM = train(form, data=titanic_train,
                                  trControl=train_control,
                                  method="svmLinear", 
                                  metric = tune_metric),
                      XGB = train(form, data=titanic_train,
                                  trControl=train_control,
                                  method="xgbLinear",
                                  metric = tune_metric)))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
models_df <-
  models_df %>% 
  mutate(confusion_matrix = map(model, confusionMatrix),
         train_mean_kappa = map(model, get_kappa),
         train_mean_accuracy = map(model, get_accuracy))

Thanks to caret we can extract comparable metrics from the different models quite easily. All the models come in at roughly the same accuracy and kappa.
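
For example, something along these lines (a sketch; output not shown) lays the stored metrics out side by side:

# side-by-side view of the mean resampled metrics held in the list columns
models_df %>% 
  select(model_name, train_mean_accuracy, train_mean_kappa) %>% 
  mutate_at(vars(-model_name), unlist) %>% 
  knitr::kable(digits = 3, caption = "Mean Resampled Accuracy and Kappa")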

Model Internal Assessment

Caret allows you to calculate a confusion matrix aggregated over all the folds of the resampled models. This gives an indication of performance across the resamples, but it does not tell you the performance of the final refit model on the training dataset.

models_df$confusion_matrix
## $GLM
## Cross-Validated (5 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction   X0   X1
##         X0 54.3 10.6
##         X1  7.5 27.6
##                             
##  Accuracy (average) : 0.8186
## 
## 
## $RF
## Cross-Validated (5 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction   X0   X1
##         X0 54.3 10.0
##         X1  7.5 28.2
##                             
##  Accuracy (average) : 0.8246
## 
## 
## $SVM
## Cross-Validated (5 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction   X0   X1
##         X0 55.2 10.0
##         X1  6.6 28.2
##                             
##  Accuracy (average) : 0.8336
## 
## 
## $XGB
## Cross-Validated (5 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction   X0   X1
##         X0 54.7  9.4
##         X1  7.0 28.8
##                             
##  Accuracy (average) : 0.8351

All the models seem to be performing very similarly, with the differences likely within model variance. Of course, the only way to tell for sure is to apply them to an external dataset.

Caret can also generate summary tables of the resampling metrics to compare model fits.

models_df$resample_metrics <- list(resamples(models_df$model))
  
models_df$resample_metrics[[1]] %>% summary()
## 
## Call:
## summary.resamples(object = .)
## 
## Models: GLM, RF, SVM, XGB 
## Number of resamples: 5 
## 
## Accuracy 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM 0.7819549 0.7985075 0.8045113 0.8186287 0.8134328 0.8947368    0
## RF  0.7761194 0.7985075 0.8345865 0.8246998 0.8421053 0.8721805    0
## SVM 0.7744361 0.8208955 0.8208955 0.8336214 0.8496241 0.9022556    0
## XGB 0.7443609 0.8283582 0.8571429 0.8350241 0.8646617 0.8805970    0
## 
## Kappa 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM 0.5405599 0.5575935 0.5673173 0.6093954 0.6057896 0.7757167    0
## RF  0.4986281 0.5836594 0.6501674 0.6232529 0.6571744 0.7266352    0
## SVM 0.5007508 0.6142960 0.6229777 0.6409173 0.6771845 0.7893775    0
## XGB 0.4553120 0.6345743 0.6874459 0.6467581 0.7116358 0.7448227    0

The random forest has the most consistent (narrowest) spread of resample results, while the XGB model has the highest median in both kappa and accuracy.

bwplot(models_df$resample_metrics[[1]])

densityplot(models_df$resample_metrics[[1]])

Interestingly, the models’ resample results do not seem to correlate with each other at all, but with only 5 data points that is not saying much.

splom(models_df$resample_metrics[[1]])

At the same time, we see no statistically significant difference in performance between the models (the lower-diagonal p-values are all high).

diffs <- diff(models_df$resample_metrics[[1]])
summary(diffs)
## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##     GLM RF        SVM       XGB      
## GLM     -0.006071 -0.014993 -0.016395
## RF  1             -0.008922 -0.010324
## SVM 1   1                   -0.001403
## XGB 1   1         1                  
## 
## Kappa 
##     GLM RF        SVM       XGB      
## GLM     -0.013857 -0.031522 -0.037363
## RF  1             -0.017664 -0.023505
## SVM 1   1                   -0.005841
## XGB 1   1         1

The calibration plots show some serious issues with the SVM in the middle of the probability range, and some mild mis-calibration for the XGB model. It is difficult to say whether the SVM is salvageable, but the XGB model could be recalibrated fairly easily (a sketch follows the plots below).

models_df <-
  models_df %>% 
  mutate(caret_pred = map(model, function(x) x$pred)) %>% 
  mutate(internal_calibrations = 
           map2(rep(list(obs ~ X0), 4), caret_pred, calibration)) %>% 
  select(-caret_pred)

(models_df %>% select(internal_calibrations, model_name) %>% 
    mutate(calibration_plots = 
             map2(internal_calibrations, model_name,
                  ~plot(.x, main = .y))))$calibration_plots
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]
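
As a rough illustration of how the XGB model could be recalibrated, a hypothetical Platt-scaling sketch using the saved cross-validation fold predictions might look like the following (not run in this notebook; new_probs stands in for future predicted probabilities):

# hypothetical sketch: Platt-style recalibration of the XGB probabilities,
# fitting a logistic regression of the observed class on the CV fold predictions
# (in practice, first filter $pred to the rows matching $bestTune)
xgb_pred <- models_df$model[[which(models_df$model_name == "XGB")]]$pred
recalibrate <- glm(obs ~ X1, data = xgb_pred, family = binomial)
# calibrated probabilities for new data would then come from
# predict(recalibrate, newdata = data.frame(X1 = new_probs), type = "response")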

Model External Assessment

Note that a plain glm fit on the training data is identical to the final model caret produces with cross-validation, since caret refits the chosen model on the full training set. Before predicting on the test set, we also need to patch each model’s stored factor levels for Title, so that levels appearing in only one of the splits don’t trip up predict().

# fix model factor levels: make sure each model knows about Title levels
# that may only appear in the test split
models_df$model <- map(models_df$model, function(m) {
  m$xlevels$Title <- union(m$xlevels$Title, levels(titanic_test$Title))
  m
})

# Insert test data and test observations 
models_df <-
  models_df %>% 
  mutate(test_data = list(titanic_test),
         observed = list(titanic_test$Survived))

# insert predictions
models_df <- 
  models_df %>% 
  mutate(pred_probability = map2(model, test_data, map_predict),
         pred_class = map2(model, test_data, map_predict_class),
         test_confusion_matrix = map2(pred_class, observed,
                                      confusionMatrix))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
models_df$test_confusion_matrix
## $GLM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  X0  X1
##         X0 116  27
##         X1  21  58
##                                           
##                Accuracy : 0.7838          
##                  95% CI : (0.7238, 0.8361)
##     No Information Rate : 0.6171          
##     P-Value [Acc > NIR] : 8.045e-08       
##                                           
##                   Kappa : 0.5363          
##  Mcnemar's Test P-Value : 0.4705          
##                                           
##             Sensitivity : 0.8467          
##             Specificity : 0.6824          
##          Pos Pred Value : 0.8112          
##          Neg Pred Value : 0.7342          
##              Prevalence : 0.6171          
##          Detection Rate : 0.5225          
##    Detection Prevalence : 0.6441          
##       Balanced Accuracy : 0.7645          
##                                           
##        'Positive' Class : X0              
##                                           
## 
## $RF
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  X0  X1
##         X0 122  25
##         X1  15  60
##                                          
##                Accuracy : 0.8198         
##                  95% CI : (0.7628, 0.868)
##     No Information Rate : 0.6171         
##     P-Value [Acc > NIR] : 4.694e-11      
##                                          
##                   Kappa : 0.61           
##  Mcnemar's Test P-Value : 0.1547         
##                                          
##             Sensitivity : 0.8905         
##             Specificity : 0.7059         
##          Pos Pred Value : 0.8299         
##          Neg Pred Value : 0.8000         
##              Prevalence : 0.6171         
##          Detection Rate : 0.5495         
##    Detection Prevalence : 0.6622         
##       Balanced Accuracy : 0.7982         
##                                          
##        'Positive' Class : X0             
##                                          
## 
## $SVM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  X0  X1
##         X0 120  27
##         X1  17  58
##                                           
##                Accuracy : 0.8018          
##                  95% CI : (0.7432, 0.8521)
##     No Information Rate : 0.6171          
##     P-Value [Acc > NIR] : 2.424e-09       
##                                           
##                   Kappa : 0.571           
##  Mcnemar's Test P-Value : 0.1748          
##                                           
##             Sensitivity : 0.8759          
##             Specificity : 0.6824          
##          Pos Pred Value : 0.8163          
##          Neg Pred Value : 0.7733          
##              Prevalence : 0.6171          
##          Detection Rate : 0.5405          
##    Detection Prevalence : 0.6622          
##       Balanced Accuracy : 0.7791          
##                                           
##        'Positive' Class : X0              
##                                           
## 
## $XGB
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  X0  X1
##         X0 121  28
##         X1  16  57
##                                           
##                Accuracy : 0.8018          
##                  95% CI : (0.7432, 0.8521)
##     No Information Rate : 0.6171          
##     P-Value [Acc > NIR] : 2.424e-09       
##                                           
##                   Kappa : 0.569           
##  Mcnemar's Test P-Value : 0.09725         
##                                           
##             Sensitivity : 0.8832          
##             Specificity : 0.6706          
##          Pos Pred Value : 0.8121          
##          Neg Pred Value : 0.7808          
##              Prevalence : 0.6171          
##          Detection Rate : 0.5450          
##    Detection Prevalence : 0.6712          
##       Balanced Accuracy : 0.7769          
##                                           
##        'Positive' Class : X0              
## 

All models achieve reasonably good accuracy, comfortably better than the no-information rate, though the lower specificity points to a weakness in predicting the negative class. The final random forest model appears to perform a little better than the others on the test set, and its test-set accuracy and kappa are close to the cross-validated averages from earlier, so the internal assessment translated well to the external one.
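
To pull the headline test-set numbers out of the list column into a single table, something like this would do (a sketch; output not shown):

# extract test-set accuracy and kappa from each stored confusionMatrix object
models_df %>% 
  transmute(model_name,
            test_accuracy = map_dbl(test_confusion_matrix,
                                    function(x) x$overall["Accuracy"]),
            test_kappa = map_dbl(test_confusion_matrix,
                                 function(x) x$overall["Kappa"])) %>% 
  knitr::kable(digits = 3, caption = "Test-Set Accuracy and Kappa")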

Question: Is the difference between the model performance significant?

Answer: No, as the resample difference test above shows.

ROC Analysis

Since the response classes are reasonably balanced, a ROC analysis is appropriate. Comparing the models this way is a bit moot, since their performance does not appear to differ, but we will do it for the sake of completeness.

models_df <-
  models_df %>% 
  mutate(pred_probability = map(pred_probability, function(x) x$X1),
         rocr_prediction = map2(pred_probability, observed, prediction)) 

map2(models_df$rocr_prediction, models_df$model_name, plot_rocr)
## $GLM

## 
## $RF

## 
## $SVM

## 
## $XGB
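
The AUC values themselves can also be tabulated straight from the list column (again just a sketch; output not shown):

# AUC per model, computed from the stored ROCR prediction objects
models_df %>% 
  transmute(model_name,
            auc = map_dbl(rocr_prediction,
                          function(p) performance(p, measure = "auc")@y.values[[1]])) %>% 
  knitr::kable(digits = 3, caption = "Test-Set AUC by Model")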