\(\Huge{\text{Predict survival on the Titanic}}\)

The objective is to train machine learning algorithms on a training set of Titanic survivors and to predict the survivors of a test set.

My results to date with several algorithms is the following:

rtree best entry: 0.79426
- logit 0.78469
- rf 0.77033
- boost 0.76555
- lsvm 0.76555
- qda 0.71292
- nlsvm 0.69856

July 13, 2014

Clean the dataset

Both the training and the test datasets had missing values and other anomalies; I used the following function to clean them up:

clean_the_dataset = function(input) {
    
    output   = input
    mean_age = mean(input$Age,na.rm=TRUE)
    
    output$child <- 0
    output$child[output$Age < 18] <- 1
    mean_child_age = mean(subset(output,child == 1,select=c(Age))$Age,na.rm=TRUE)
    mean_adult_age = mean(subset(output,child == 0,select=c(Age))$Age,na.rm=TRUE)
    
    median_fare = median(input$Fare,na.rm=TRUE)
        
    for (i in seq(len(input$Age))) {
        
        if (is.na(input$Fare[i])) {
            output$Fare[i] = median_fare
            }
        
        if (input$Embarked[i] == "") {
            # I tried to programmatically find the
            # modal value ... I finally just
            # looked it up
            output$Embarked[i] = "S"
            }
        
        if (input$Cabin[i] == "") {
            output$Cabin[i] = "Unk"
            }
        
        #
        # what do the tickets tell us?
        #
        
        if (is.na(input$Age[i])) {
            
            if (grepl("Mr.", input$Name[i]) ||
                grepl("Mrs.", input$Name[i]) ||
                grepl("Don.", input$Name[i]) ||
                grepl("Dona.", input$Name[i]) ||
                grepl("Jonkheer.", input$Name[i]) ||
                grepl("Jonkvrouw.", input$Name[i]) ||
                grepl("Lady.", input$Name[i]) ||
                grepl("Mme.", input$Name[i]) ||
                grepl("Sir.", input$Name[i]) || 
                grepl("countess.", input$Name[i]) ||
                grepl("Rev.", input$Name[i]) ||
                grepl("Major.", input$Name[i]) ||
                grepl("Capt.", input$Name[i]) ||
                grepl("Dr.", input$Name[i])) {
                    output$Age[i] = mean_adult_age
                    output$child[i] <- 0
            } else if (grepl("Miss.", input$Name[i]) ||
                       grepl("Master.", input$Name[i]) ||
                       grepl("Mlle.", input$Name[i])) {
                           output$Age[i] = mean_child_age
                           output$child[i] <- 1
            } else {
                output$Age[i] = mean_age
            }
        }
    }
    
    output$Fare2 <- '30+'
    output$Fare2[output$Fare < 30 & output$Fare >= 20] <- '20-30'
    output$Fare2[output$Fare < 20 & output$Fare >= 10] <- '10-20'
    output$Fare2[output$Fare < 10] <- '<10'
    output$Fare2 = factor(output$Fare2)
    
    output$Embarked = factor(output$Embarked)
    
    output
}

Training dataset

Read in the training dataset, clean it and look at the result

data = clean_the_dataset(read.csv('train.csv',
                                  as.is=c(4,9,11,12),
                                  strip.white=TRUE))
str(data)

## 'data.frame':    891 obs. of  14 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "Unk" "C85" "Unk" "C123" ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ child      : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ Fare2      : Factor w/ 4 levels "<10","10-20",..: 1 4 1 4 1 1 4 3 2 4 ...

Split train.csv into training and test sets

set.seed(123)
smp_size  <- floor(0.75 * nrow(data))
train_ind <- sample(seq_len(nrow(data)), size = smp_size)

train <- data[train_ind, ]
test <-  data[-train_ind, ]

Logistic Regression

Pass in all the meaningul variables and then use the step funciton to reduce the model to its AIC ‘best’. This model will be used by all the subsequent algorithms.

set.seed(123)
#
# taking out child got logit back to its #1 position
#
model  = glm(Survived~.-Name-Ticket-Cabin-child,data=train,family=binomial(link = "logit"))
logit  = step(model,trace=0)

(logit_formula = logit$formula) # save for later models

## Survived ~ Pclass + Sex + Age + SibSp + Parch + Embarked + Fare2

summary(logit)

## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch + 
##     Embarked + Fare2, family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.403  -0.609  -0.407   0.659   2.420  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.40504    0.78410    4.34  1.4e-05 ***
## Pclass      -0.50849    0.21452   -2.37  0.01777 *  
## Sexmale     -2.52854    0.22793  -11.09  < 2e-16 ***
## Age         -0.03715    0.00888   -4.18  2.9e-05 ***
## SibSp       -0.47457    0.13697   -3.46  0.00053 ***
## Parch       -0.25051    0.15696   -1.60  0.11048    
## EmbarkedQ   -0.25046    0.46058   -0.54  0.58659    
## EmbarkedS   -0.55309    0.26879   -2.06  0.03962 *  
## Fare210-20   0.60193    0.31268    1.93  0.05422 .  
## Fare220-30   0.88222    0.41972    2.10  0.03556 *  
## Fare230+     1.53745    0.48407    3.18  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 896.48  on 667  degrees of freedom
## Residual deviance: 597.77  on 657  degrees of freedom
## AIC: 619.8
## 
## Number of Fisher Scoring iterations: 5

predicted_logit = predict(logit, newdata = test, type = "response")
logit_mat = table(test$Survived,predicted_logit>0.5)
confusion.matrix_ROC(test$Survived,predicted_logit, threshold=0.5, tree=FALSE)

## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
## 
##          Predicted 0  Predicted 1
## Actual 0    TN           FP
## Actual 1    FN           TP
## 
## Confusion Matrix; threshold =  0.5             
## actual_values FALSE TRUE
##             0   121   24
##             1    20   58
## True Positive Rate ... sensitivity= 0.7436
## True Negative Rate ... specificity= 0.8345
## Model Accuracy =  80.27 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model

## Warning: package 'gplots' was built under R version 3.1.1

plot of chunk unnamed-chunk-5

## AUC = 0.877 , Accuracy =  80.27 %, Baseline =  65.02 %
## Area Under Curve: less than 50% -> worse than guessing

Regression Tree with 10-fold CV

set.seed(123)
library(rpart)
library(rpart.plot)
library(rattle)

## Warning: package 'rattle' was built under R version 3.1.1

## Rattle: A free graphical interface for data mining with R.
## Version 3.1.0 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(RColorBrewer)

#
# ######################################
# this says it's doing Cross Validations
# should I use the whole "data" set?
# ######################################
#

rtree = rpart(logit_formula, method="anova", data=train, control=rpart.control(xval=10, cp=0.01))

# check if a different cp would be better
(cpbest=rtree$cptable[which.min(rtree$cptable[,"xerror"]),"CP"])

## [1] 0.01

#prp(rtree)
fancyRpartPlot(rtree)

plot of chunk unnamed-chunk-6

predicted_rtree = predict(rtree, newdata = test)
rtree_mat = table(test$Survived,predicted_rtree>0.5)
confusion.matrix_ROC(test$Survived,predicted_rtree, threshold=0.5, tree=FALSE)

## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
## 
##          Predicted 0  Predicted 1
## Actual 0    TN           FP
## Actual 1    FN           TP
## 
## Confusion Matrix; threshold =  0.5             
## actual_values FALSE TRUE
##             0   137    8
##             1    22   56
## True Positive Rate ... sensitivity= 0.7179
## True Negative Rate ... specificity= 0.9448
## Model Accuracy =  86.55 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model

plot of chunk unnamed-chunk-6

## AUC = 0.9143 , Accuracy =  86.55 %, Baseline =  65.02 %
## Area Under Curve: less than 50% -> worse than guessing

Random Forest

set.seed(123)
library(randomForest)

## Warning: package 'randomForest' was built under R version 3.1.1

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

rforest = randomForest(logit_formula,   data=train)

## Warning: The response has five or fewer unique values.  Are you sure you
## want to do regression?

print(rforest) # view results

## 
## Call:
##  randomForest(formula = logit_formula, data = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 0.1415
##                     % Var explained: 40.8

importance(rforest) # importance of each predictor

##          IncNodePurity
## Pclass          11.575
## Sex             37.448
## Age             21.013
## SibSp            6.245
## Parch            4.511
## Embarked         5.032
## Fare2           11.302

varImpPlot(rforest)

plot of chunk unnamed-chunk-7

predicted_rf = predict(rforest, newdata = test)
thresh = 0.49
rf_mat = table(test$Survived,predicted_rf>thresh)
confusion.matrix_ROC(test$Survived,predicted_rf, threshold=thresh, tree=FALSE)

## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
## 
##          Predicted 0  Predicted 1
## Actual 0    TN           FP
## Actual 1    FN           TP
## 
## Confusion Matrix; threshold =  0.49             
## actual_values FALSE TRUE
##             0   131   14
##             1    19   59
## True Positive Rate ... sensitivity= 0.7564
## True Negative Rate ... specificity= 0.9034
## Model Accuracy =  85.2 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model

plot of chunk unnamed-chunk-7

## AUC = 0.9112 , Accuracy =  85.2 %, Baseline =  65.02 %
## Area Under Curve: less than 50% -> worse than guessing

Boosting

set.seed(123)
library(gbm) # Generalized Boosted Regression Modeling

## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1

# for a regression distribution='gaussian'; for classification, 'bernoulli'
# 'shrinkage=0.001' is the default lambda
boost = gbm(logit_formula, data=train, distribution="bernoulli", n.trees=5000, interaction.depth=4)
summary(boost, cex.lab = 0.8)

plot of chunk unnamed-chunk-8

##               var rel.inf
## Sex           Sex  46.470
## Age           Age  20.011
## Pclass     Pclass  14.767
## Fare2       Fare2   7.086
## Embarked Embarked   5.230
## SibSp       SibSp   4.604
## Parch       Parch   1.833

predicted_boost = predict(boost, newdata=test, n.trees = 5000)
boost_mat = table(test$Survived, predicted_boost>0.5)
confusion.matrix_ROC(test$Survived,predicted_boost, threshold=0.5, tree=FALSE)

## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
## 
##          Predicted 0  Predicted 1
## Actual 0    TN           FP
## Actual 1    FN           TP
## 
## Confusion Matrix; threshold =  0.5             
## actual_values FALSE TRUE
##             0   137    8
##             1    22   56
## True Positive Rate ... sensitivity= 0.7179
## True Negative Rate ... specificity= 0.9448
## Model Accuracy =  86.55 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model

plot of chunk unnamed-chunk-8

## AUC = 0.9113 , Accuracy =  86.55 %, Baseline =  65.02 %
## Area Under Curve: less than 50% -> worse than guessing

Linear SVM Classifier

set.seed(123)
library(e1071)

lsvm = svm(logit_formula, data=train, kernel="linear", cost=1, scale=FALSE)
print(lsvm)

## 
## Call:
## svm(formula = logit_formula, data = train, kernel = "linear", 
##     cost = 1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.09091 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  338

predicted_lsvm = predict(lsvm, newdata=test)
lsvm_mat = table(test$Survived, predicted_lsvm > 0.5)
confusion.matrix_ROC(test$Survived, predicted_lsvm, threshold=0.5, tree=FALSE)

## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
## 
##          Predicted 0  Predicted 1
## Actual 0    TN           FP
## Actual 1    FN           TP
## 
## Confusion Matrix; threshold =  0.5             
## actual_values FALSE TRUE
##             0   124   21
##             1    26   52
## True Positive Rate ... sensitivity= 0.6667
## True Negative Rate ... specificity= 0.8552
## Model Accuracy =  78.92 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model

plot of chunk unnamed-chunk-9

## AUC = 0.8425 , Accuracy =  78.92 %, Baseline =  65.02 %
## Area Under Curve: less than 50% -> worse than guessing

Non Linear SVM

library(e1071)
set.seed(123)
nlsvm = svm(logit_formula, data=train, kernel="radial", cost=5, scale = FALSE)
print(nlsvm)

## 
## Call:
## svm(formula = logit_formula, data = train, kernel = "radial", 
##     cost = 5, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  5 
##       gamma:  0.09091 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  425

predicted_nlsvm = predict(nlsvm, newdata=test)
nlsvm_mat       = table(test$Survived, predicted_nlsvm > 0.5)
confusion.matrix_ROC(test$Survived, predicted_nlsvm, threshold=0.5, tree=FALSE)

## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
## 
##          Predicted 0  Predicted 1
## Actual 0    TN           FP
## Actual 1    FN           TP
## 
## Confusion Matrix; threshold =  0.5             
## actual_values FALSE TRUE
##             0   124   21
##             1    20   58
## True Positive Rate ... sensitivity= 0.7436
## True Negative Rate ... specificity= 0.8552
## Model Accuracy =  81.61 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model

plot of chunk unnamed-chunk-10

## AUC = 0.8366 , Accuracy =  81.61 %, Baseline =  65.02 %
## Area Under Curve: less than 50% -> worse than guessing

Quadratic Discriminant Analysis

set.seed(123)
library(MASS)
qda.fit = qda(logit_formula, data=train)
qda.fit # there appears to be no summary() for qda

## Call:
## qda(logit_formula, data = train)
## 
## Prior probabilities of groups:
##      0      1 
## 0.6048 0.3952 
## 
## Group means:
##   Pclass Sexmale   Age  SibSp  Parch EmbarkedQ EmbarkedS Fare210-20
## 0  2.515  0.8515 30.96 0.5470 0.3094   0.08416    0.7822     0.1906
## 1  1.973  0.3144 27.74 0.4583 0.4432   0.09091    0.6402     0.2235
##   Fare220-30 Fare230+
## 0     0.1436   0.1733
## 1     0.1591   0.4167

predicted_qda = predict(qda.fit, test)$class
qda_mat       = table(test$Survived, predicted_qda)
confusion.matrix_ROC(test$Survived, predicted_qda, threshold=0.5, tree=TRUE)

## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
## 
##          Predicted 0  Predicted 1
## Actual 0    TN           FP
## Actual 1    FN           TP
## 
## Confusion Matrix; threshold =  0.5             predicted_values
## actual_values   0   1
##             0 122  23
##             1  15  63
## True Positive Rate ... sensitivity= 0.8077
## True Negative Rate ... specificity= 0.8414
## Model Accuracy =  82.96 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model

Predict Survival in the unknown sample

unknown = clean_the_dataset(read.csv('test.csv',
                                     as.is=c(3,8,10,11),
                                     strip.white=TRUE))

unknown$Survived = rep(0,dim(unknown)[1])

unknown_logit = unknown
unknown_rtree = unknown
unknown_rf    = unknown
unknown_boost = unknown
unknown_lsvm  = unknown
unknown_nlsvm = unknown
unknown_qda   = unknown

# using Logistic Regression
predicted_unk_logit    = predict(logit, newdata = unknown)
unk_survived_logit     = as.numeric(predicted_unk_logit > 0.5)
unknown_logit$Survived = unk_survived_logit

# using Regression Tree
predicted_unk_rtree    = predict(rtree, newdata = unknown)
unk_survived_rtree     = as.numeric(predicted_unk_rtree > 0.5)
unknown_rtree$Survived = unk_survived_rtree

# using Random Forest
predicted_unk_rf    = predict(rforest, newdata = unknown)
unk_survived_rf     = as.numeric(predicted_unk_rf > thresh)
unknown_rf$Survived = unk_survived_rf

# using Boosting
predicted_unk_boost = predict(boost, newdata = unknown, n.trees = 5000)
unk_survived_boost  = as.numeric(predicted_unk_boost > 0.5)
unknown_boost$Survived = unk_survived_boost

# using Linear SVM
predicted_unk_lsvm = predict(lsvm, newdata = unknown)
unk_survived_lsvm  = as.numeric(predicted_unk_lsvm > 0.5)
unknown_lsvm$Survived = unk_survived_lsvm

# using Non Linear SVM
predicted_unk_nlsvm = predict(nlsvm, newdata=unknown)
unk_survived_nlsvm  = as.numeric(predicted_unk_nlsvm > 0.5)
unknown_nlsvm$Survived = unk_survived_nlsvm

# using QDA
predicted_unk_qda = predict(qda.fit, unknown)$class
unk_survived_qda  = predicted_unk_qda
unknown_qda$Survived = unk_survived_qda

Model Comparison

cat('Model Accuracy on training data\n',(logit_mat[1]+logit_mat[4])/sum(logit_mat)*100,'% Logistic Regression\n',
    (rtree_mat[1]+rtree_mat[4])/sum(rtree_mat)*100,'% Regression Tree\n',
    (rf_mat[1]+rf_mat[4])/sum(rf_mat)*100,'% Random Forest\n',
    (boost_mat[1]+boost_mat[4])/sum(boost_mat)*100,'% Boost\n',
    (nlsvm_mat[1]+nlsvm_mat[4])/sum(nlsvm_mat)*100,'% Non Linear SVM\n',
    (lsvm_mat[1]+lsvm_mat[4])/sum(lsvm_mat)*100,'% Linear SVM\n',
    (qda_mat[1]+qda_mat[4])/sum(qda_mat)*100,'% QDA\n\n',
    sep="")

## Model Accuracy on training data
## 80.27% Logistic Regression
## 86.55% Regression Tree
## 85.2% Random Forest
## 86.55% Boost
## 81.61% Non Linear SVM
## 78.92% Linear SVM
## 82.96% QDA

cat('Survival Rates\n',table(data$Survived)[2]/(sum(table(data$Survived)))*100,'% training dataset\n\n',
    table(unknown_logit$Survived)[2]/(sum(table(unknown_logit$Survived)))*100,'% Logistic Regression\n',
    table(unknown_rtree$Survived)[2]/(sum(table(unknown_rtree$Survived)))*100,'% Regression Tree\n',
    table(unknown_rf$Survived)[2]/(sum(table(unknown_rf$Survived)))*100,'% Random Forest\n',
    table(unknown_boost$Survived)[2]/(sum(table(unknown_boost$Survived)))*100,'% Boost\n',
    table(unknown_nlsvm$Survived)[2]/(sum(table(unknown_nlsvm$Survived)))*100,'% Non Linear SVM\n',
    table(unknown_lsvm$Survived)[2]/(sum(table(unknown_lsvm$Survived)))*100,'% Linear SVM\n',
    table(unknown_qda$Survived)[2]/(sum(table(unknown_qda$Survived)))*100,'% QDA',
    sep="")

## Survival Rates
## 38.38% training dataset
## 
## 30.14% Logistic Regression
## 29.9% Regression Tree
## 36.12% Random Forest
## 29.43% Boost
## 36.36% Non Linear SVM
## 36.36% Linear SVM
## 45.69% QDA

Create the submission datasets

# You should submit a csv file with exactly 418 entries plus a header row.
# This must have exactly 2 columns: 
#    PassengerId (which can be sorted in any order)
#    Survived which contains your binary predictions: 
#         1 for survived, 0 for did not.
#
submit_logit = subset(unknown_logit,select=c(PassengerId,Survived))
write.csv(submit_logit, file = "submit_logit.csv", row.names = FALSE)

submit_rf    = subset(unknown_rf,select=c(PassengerId,Survived))
write.csv(submit_rf, file = "submit_rf.csv", row.names = FALSE)

submit_rtree = subset(unknown_rtree,select=c(PassengerId,Survived))
write.csv(submit_rtree, file = "submit_rtree.csv", row.names = FALSE)

submit_boost = subset(unknown_boost,select=c(PassengerId,Survived))
write.csv(submit_boost, file = "submit_boost.csv", row.names = FALSE)

submit_nlsvm = subset(unknown_nlsvm,select=c(PassengerId,Survived))
write.csv(submit_nlsvm, file = "submit_nlsvm.csv", row.names = FALSE)

submit_lsvm = subset(unknown_lsvm,select=c(PassengerId,Survived))
write.csv(submit_lsvm, file = "submit_lsvm.csv", row.names = FALSE)

submit_qda = subset(unknown_qda,select=c(PassengerId,Survived))
write.csv(submit_qda, file = "submit_qda.csv", row.names = FALSE)