Kaggle packaged the data for the Titanic challenge into two csv-format files:
train.csv (attributes and known outcomes [survived or perished] for a subset of the passengers)
test.csv (attributes, without outcomes, for the remaining passengers)
library(ggplot2)
library(lattice)
library(caret)
data <- read.csv("./Titanic/train.csv",header = T, stringsAsFactors = T)
data$Survived <- as.factor(data$Survived)
levels(data$Survived) <- c("0","1")
Putting some simple data visualization tools to work can take us a long way toward understanding what might influence the outcome we’re trying to predict, in this case whether or not a passenger survived. Below is the code and the graphs it produces:
par(mfrow=c(4,2))
barplot(table(data$Survived),
names.arg = c("Perished", "Survived"),
main="Survived (passenger fate)", col="black")
barplot(table(data$Pclass),
names.arg = c("first", "second", "third"),
main="Pclass (passenger traveling class)", col="firebrick")
barplot(table(data$Sex), main="Sex (gender)", col="darkviolet")
hist(data$Age, main="Age", xlab = NULL, col="brown")
barplot(table(data$SibSp), main="SibSp (siblings + spouse aboard)",
col="darkblue")
barplot(table(data$Parch), main="Parch (parents + kids aboard)",
col="gray50")
hist(data$Fare, main="Fare (fee paid for ticket[s])", xlab = NULL,
col="darkgreen")
barplot(table(data$Embarked),
names.arg = c("","Cherbourg", "Queenstown", "Southampton"),
main="Embarked (port of embarkation)", col="sienna")
counts <- table(data$Survived,data$Sex)
barplot(counts, xlab="Sex", ylab="Number Survived",main="Survived passengers by Sex")
counts[2] / (counts[1] + counts[2])
## [1] 0.7420382
counts[4] / (counts[3] + counts[4])
## [1] 0.1889081
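A more direct way to read off these survival rates, assuming counts still holds the Survived-by-Sex table built above, is prop.table():
# proportion surviving within each column (i.e. within each Sex)
prop.table(counts, 2)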
counts <- table(data$Survived,data$Pclass)
barplot(counts, xlab="Class", ylab="Number Survived",main="Survived passengers by Class")
counts[2] / (counts[1] + counts[2])
## [1] 0.6296296
counts[4] / (counts[3] + counts[4])
## [1] 0.4728261
counts[6] / (counts[5] + counts[6])
## [1] 0.2423625
counts <- table(data$Survived,data$SibSp)
barplot(counts, xlab="Sublings", ylab="Number Survived",main="Survived passengers by Number of Siblings/Spouses Aboard")
counts <- table(data$Survived,data$Parch)
barplot(counts, xlab="Children", ylab="Number Survived",main="Survived passengers by Number of Children Aboard")
library(Hmisc)
library(ISLR)
cutFare <- cut2(data$Fare, g=3)
qplot(cutFare, Parch, data=data, fill=cutFare, geom=c("boxplot", "jitter"))
t1 <- table(cutFare, data$Survived)
t1
##
## cutFare 0 1
## [ 0.00, 8.68) 247 61
## [ 8.68, 26.25) 172 116
## [26.25,512.33] 130 165
prop.table(t1, 1)
##
## cutFare 0 1
## [ 0.00, 8.68) 0.8019481 0.1980519
## [ 8.68, 26.25) 0.5972222 0.4027778
## [26.25,512.33] 0.4406780 0.5593220
boxplot(data$Age ~ data$Survived,
main="Passenger Fate by Age",
xlab="Survived", ylab="Age")
Let’s visualize the missing data by feature:
## map missing data by feature
require(Amelia)
missmap(data, main="Missings Values", col=c("yellow", "black"), legend=FALSE)
Let’s create a function to extract the honorific (i.e. title) from the Name feature. Then we check all the unique titles and look at how many Age values are missing for each title.
getTitle <- function(data) {
title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
title.comma.end <- title.dot.start + attr(title.dot.start, "match.length")-1
data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
return (data$Title)
}
data$title <- getTitle(data)
# checking for all the unique titles listed
unique(data$title)
## [1] "Mr" "Mrs" "Miss" "Master"
## [5] "Don" "Rev" "Dr" "Mme"
## [9] "Ms" "Major" "Lady" "Sir"
## [13] "Mlle" "Col" "Capt" "the Countess"
## [17] "Jonkheer"
# number of missing and non-missing Age values by title
bystats(data$Age, data$title)
##
## Mean of data$Age by
##
## N Missing Mean
## Capt 1 0 70.000000
## Col 2 0 58.000000
## Don 1 0 40.000000
## Dr 6 1 42.000000
## Jonkheer 1 0 38.000000
## Lady 1 0 48.000000
## Major 2 0 48.500000
## Master 36 4 4.574167
## Miss 146 36 21.773973
## Mlle 2 0 24.000000
## Mme 1 0 24.000000
## Mr 398 119 32.368090
## Mrs 108 17 35.898148
## Ms 1 0 28.000000
## Rev 6 0 43.166667
## Sir 1 0 49.000000
## the Countess 1 0 33.000000
## ALL 714 177 29.699118
summary(data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.42 20.12 28.00 29.70 38.00 80.00 177
var.levels <- c("Dr","Master","Miss","Mr","Mrs")
# impute missing values with the within-group median (Hmisc's impute() defaults to the median)
imputeMedian <- function(impute.var, filter.var, var.levels) {
for (v in var.levels) {
impute.var[ which( filter.var == v)] <- impute(impute.var[
which( filter.var == v)])
}
return (impute.var)
}
data$Age <- imputeMedian(data$Age,data$title,var.levels=var.levels)
summary(data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.42 21.00 30.00 29.39 35.00 80.00
checkNAs <- function(data) {
  # count the non-missing values in each column
  counts <- rep(NA, ncol(data))
  for (i in 1:ncol(data)) {
    counts[i] <- sum(!is.na(data[, i]))
  }
  return(counts)
}
checkNAs(data)
## [1] 891 891 891 891 891 891 891 891 891 891 891 891 891
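The same per-column count of non-missing values is also available as a base R one-liner:
# non-NA count for every column, with column names attached
colSums(!is.na(data))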
# analyzing Fare rates
summary(data$Fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.91 14.45 32.20 31.00 512.30
cheapFares <- subset(data, Fare < 7)
cheapFares[order(cheapFares$Fare, cheapFares$Pclass), c("Age", "title", "Pclass", "Fare")]
## Age title Pclass Fare
## 264 40.0 Mr 1 0.0000
## 634 30.0 Mr 1 0.0000
## 807 39.0 Mr 1 0.0000
## 816 30.0 Mr 1 0.0000
## 823 38.0 Jonkheer 1 0.0000
## 278 30.0 Mr 2 0.0000
## 414 30.0 Mr 2 0.0000
## 467 30.0 Mr 2 0.0000
## 482 30.0 Mr 2 0.0000
## 675 30.0 Mr 2 0.0000
## 733 30.0 Mr 2 0.0000
## 180 36.0 Mr 3 0.0000
## 272 25.0 Mr 3 0.0000
## 303 19.0 Mr 3 0.0000
## 598 49.0 Mr 3 0.0000
## 379 20.0 Mr 3 4.0125
## 873 33.0 Mr 1 5.0000
## 327 61.0 Mr 3 6.2375
## 844 34.5 Mr 3 6.4375
## 819 43.0 Mr 3 6.4500
## 203 34.0 Mr 3 6.4958
## 372 18.0 Mr 3 6.4958
## 144 19.0 Mr 3 6.7500
## 655 18.0 Miss 3 6.7500
## 412 30.0 Mr 3 6.8583
## 826 30.0 Mr 3 6.9500
## 130 45.0 Mr 3 6.9750
## 805 27.0 Mr 3 6.9750
dataFare <- subset(data,Fare==0)
barplot(table(dataFare$Survived))
data$Fare[which(data$Fare==0)] <- NA
data$Pclass <- as.factor(data$Pclass)
data$Fare <- imputeMedian(data$Fare,data$Pclass,var.levels=as.numeric(levels(data$Pclass)))
table(is.na(data$Fare))
##
## FALSE
## 891
# new feature creation
data$Family <- data$Parch + data$SibSp
barplot(log2(table(data$Family)))
unique(data$title)
## [1] "Mr" "Mrs" "Miss" "Master"
## [5] "Don" "Rev" "Dr" "Mme"
## [9] "Ms" "Major" "Lady" "Sir"
## [13] "Mlle" "Col" "Capt" "the Countess"
## [17] "Jonkheer"
changeTitles <- function(data, old.titles, new.title) {
for (honorific in old.titles) {
data$title[ which( data$title == honorific)] <- new.title
}
return (data$title)
}
## Title consolidation
data$title <- changeTitles(data,
c("Capt", "Col", "Don", "Dr",
"Jonkheer", "Lady", "Major",
"Rev", "Sir"),
"Noble")
data$title <- changeTitles(data, c("the Countess", "Ms"),
"Mrs")
data$title <- changeTitles(data, c("Mlle", "Mme"), "Miss")
data$title <- as.factor(data$title)
unique(data$title)
## [1] Mr Mrs Miss Master Noble
## Levels: Master Miss Mr Mrs Noble
# Recode Sex as a factor: male = 1, female = 2
data$Sex <- as.character(data$Sex)
data$Sex[which(data$Sex=="female")] <- "2"
data$Sex[which(data$Sex=="male")] <- "1"
data$Sex <- as.factor(data$Sex)
# Bin Age into deciles with cut2
split <- cut2(x = data$Age,g = 10)
table(split)
## split
## [ 0.42,17.0) [17.00,20.5) [20.50,23.0) [23.00,27.0) [27.00,30.5)
## 104 79 88 88 209
## [30.50,34.5) [34.50,39.0) [39.00,48.0) [48.00,80.0]
## 69 76 89 89
qplot(split, data$Age, data=data, fill=split, geom=c("boxplot", "jitter"))
data$Age[which(data$Age<17)] <- 0
data$Age[which(data$Age>=17&data$Age<20)] <- 1
data$Age[which(data$Age>=20&data$Age<23)] <- 2
data$Age[which(data$Age>=23&data$Age<27)] <- 3
data$Age[which(data$Age>=27&data$Age<30)] <- 4
data$Age[which(data$Age>=30&data$Age<34)] <- 5
data$Age[which(data$Age>=34&data$Age<39)] <- 6
data$Age[which(data$Age>=39&data$Age<48)] <- 7
data$Age[which(data$Age>=48)] <- 8
data$Age <- as.factor(data$Age)
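An equivalent, more compact alternative to the manual recoding above (a sketch, assuming Age were still numeric at this point) is base R's cut():
# left-closed bins [0,17), [17,20), ..., [48, Inf) labelled 0-8; right = FALSE
# matches the >=/< conditions used above
data$Age <- cut(data$Age, breaks = c(-Inf, 17, 20, 23, 27, 30, 34, 39, 48, Inf),
                labels = 0:8, right = FALSE)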
#Family plays an important role
# Single Mother Aged <27
data$indicator1 <- as.factor(ifelse(data$Sex==2&as.numeric(data$Age)<=4&data$Parch>0,1,0))
# Single Mother Aged >27
data$indicator2 <- as.factor(ifelse(data$Sex==2&as.numeric(data$Age)>4&data$Parch>0,1,0))
# Married couple with no kids
data$indicator3 <- as.factor(ifelse(data$SibSp==1&data$Parch==0,1,0))
# Single relatives with no kids
data$indicator4 <- as.factor(ifelse(data$SibSp>1&data$Parch==0,1,0))
# Young single males in 2nd or 3rd classes
data$indicator5 <- as.factor(ifelse(as.numeric(data$Age)<4&data$Sex=="1"&data$Family==0&as.numeric(data$Pclass)>1,1,0))
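Before modeling, it is worth a quick sanity check that these indicators actually separate survivors from non-survivors; a small sketch for one of them:
# survival rate for "married couple with no kids" vs. everyone else
prop.table(table(data$indicator3, data$Survived), 1)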
Let’s split the data into training (80%) and testing (20%) sets:
set.seed(1234)
inBuild <- createDataPartition(data$Survived,p=.8,list = F)
training <- data[inBuild,]
testing <- data[-inBuild,]
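createDataPartition() stratifies on the outcome, so both splits should keep roughly the original survival rate; a quick check:
# class balance in the training and testing splits
prop.table(table(training$Survived))
prop.table(table(testing$Survived))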
A generalized linear model (GLM) is a flexible generalization of ordinary linear regression that allows response variables with error distributions other than the normal distribution. The GLM relates the linear predictor to the response variable via a link function and allows the variance of each measurement to be a function of its predicted value.
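For a binary outcome with the logit link, the GLM models the log-odds of survival as a linear function of the predictors. A toy illustration (not part of the model fit) of how the linear predictor maps to a probability:
# logit link: probability = plogis(linear predictor)
eta <- c(-2, 0, 2)   # example values of the linear predictor x'beta
p   <- plogis(eta)   # inverse logit: exp(eta) / (1 + exp(eta))
log(p / (1 - p))     # applying the logit link recovers eta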
set.seed(1234)
mod1.1 <- glm(Survived~ Pclass+Sex+Age+Family+Fare+Embarked+title,family=binomial("logit"),data=training)
#shows importance of variables
anova(mod1.1,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 713 950.86
## Pclass 2 78.060 711 872.80 < 2.2e-16 ***
## Sex 1 216.975 710 655.83 < 2.2e-16 ***
## Age 8 22.927 702 632.90 0.003459 **
## Family 1 17.644 701 615.26 2.664e-05 ***
## Fare 1 2.815 700 612.44 0.093361 .
## Embarked 3 4.721 697 607.72 0.193419
## title 4 39.487 693 568.24 5.527e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Null deviance shows the error (distance of predicted from actual) of a model with no predictors (random guessing)
# Residual deviance shows the error of the fitted model (568.24)
summary(mod1.1)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Family + Fare +
## Embarked + title, family = binomial("logit"), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2369 -0.5780 -0.3844 0.5328 2.5669
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.572e+01 1.022e+03 0.015 0.98773
## Pclass2 -8.111e-01 3.691e-01 -2.197 0.02800 *
## Pclass3 -1.876e+00 3.716e-01 -5.047 4.50e-07 ***
## Sex2 1.656e+01 1.028e+03 0.016 0.98715
## Age1 -5.829e-01 6.124e-01 -0.952 0.34121
## Age2 -5.879e-01 5.236e-01 -1.123 0.26149
## Age3 -8.437e-01 5.927e-01 -1.423 0.15460
## Age4 -4.245e-01 6.258e-01 -0.678 0.49755
## Age5 -7.776e-01 5.690e-01 -1.366 0.17179
## Age6 -5.090e-01 6.310e-01 -0.807 0.41988
## Age7 -1.469e+00 6.504e-01 -2.258 0.02394 *
## Age8 -8.755e-01 6.316e-01 -1.386 0.16573
## Family -4.643e-01 9.714e-02 -4.779 1.76e-06 ***
## Fare 3.755e-03 3.099e-03 1.212 0.22561
## EmbarkedC -1.179e+01 1.022e+03 -0.012 0.99079
## EmbarkedQ -1.216e+01 1.022e+03 -0.012 0.99051
## EmbarkedS -1.239e+01 1.022e+03 -0.012 0.99032
## titleMiss -1.677e+01 1.028e+03 -0.016 0.98699
## titleMr -3.269e+00 6.651e-01 -4.915 8.89e-07 ***
## titleMrs -1.582e+01 1.028e+03 -0.015 0.98772
## titleNoble -3.634e+00 9.447e-01 -3.847 0.00012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 568.24 on 693 degrees of freedom
## AIC: 610.24
##
## Number of Fisher Scoring iterations: 14
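Both deviances reported in the summary can be pulled straight from the fitted object, which also gives a quick McFadden-style pseudo R-squared; a small sketch:
mod1.1$null.deviance                        # deviance of the intercept-only model
mod1.1$deviance                             # residual deviance of the fitted model
1 - mod1.1$deviance / mod1.1$null.deviance  # McFadden's pseudo R-squared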
# diagnostic plots for the fitted GLM
plot(mod1.1)
Let’s refit the model, dropping the low-importance Fare feature and adding SibSp:
mod1.1 <- glm(Survived~ Sex + Pclass + Age + Family + Embarked+title+SibSp,family=binomial("logit"),data=training)
summary(mod1.1)
##
## Call:
## glm(formula = Survived ~ Sex + Pclass + Age + Family + Embarked +
## title + SibSp, family = binomial("logit"), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1860 -0.5769 -0.3786 0.5260 2.5946
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.0357 1025.9097 0.016 0.98753
## Sex2 16.5045 1028.2749 0.016 0.98719
## Pclass2 -1.0102 0.3329 -3.035 0.00241 **
## Pclass3 -2.1189 0.3139 -6.751 1.47e-11 ***
## Age1 -0.5049 0.6140 -0.822 0.41085
## Age2 -0.5119 0.5240 -0.977 0.32860
## Age3 -0.7662 0.5910 -1.296 0.19483
## Age4 -0.3670 0.6257 -0.587 0.55752
## Age5 -0.7274 0.5707 -1.274 0.20250
## Age6 -0.3939 0.6234 -0.632 0.52748
## Age7 -1.4350 0.6458 -2.222 0.02628 *
## Age8 -0.8776 0.6327 -1.387 0.16542
## Family -0.3359 0.1540 -2.181 0.02915 *
## EmbarkedC -11.8017 1025.9096 -0.012 0.99082
## EmbarkedQ -12.1988 1025.9097 -0.012 0.99051
## EmbarkedS -12.4638 1025.9096 -0.012 0.99031
## titleMiss -16.7581 1028.2751 -0.016 0.98700
## titleMr -3.3255 0.6775 -4.908 9.19e-07 ***
## titleMrs -15.8955 1028.2752 -0.015 0.98767
## titleNoble -3.7130 0.9535 -3.894 9.85e-05 ***
## SibSp -0.1621 0.2228 -0.727 0.46700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 569.42 on 693 degrees of freedom
## AIC: 611.42
##
## Number of Fisher Scoring iterations: 14
Residual deviance is essentially unchanged (569.42 vs. 568.24), which suggests we should consider feature engineering around Fare and Embarked.
fitControl <- trainControl( method = "repeatedcv",
summaryFunction=twoClassSummary,
number = 20,
repeats = 5,
classProbs=TRUE,
savePredictions=TRUE)
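One caveat: with classProbs = TRUE and the twoClassSummary ROC metric, newer versions of caret require the outcome factor levels to be syntactically valid R names, which "0" and "1" are not. The run shown here went through with the numeric labels, but if train() complains, a recode along these lines (a sketch, not applied here) fixes it:
# make the outcome levels valid R names ("X0"/"X1") so class probabilities can be generated
# levels(training$Survived) <- make.names(levels(training$Survived))
# levels(testing$Survived)  <- make.names(levels(testing$Survived))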
PP <- c('center', 'scale')
# Setting tuneGrid parameter for boosted tree models
# grid <- expand.grid(interaction.depth = c(1, 5, 9),
# n.trees = (1:30)*50, shrinkage = 0.1,n.minobsinnode = 20)
# 1. GLM model
glm.v1 <- train(Survived ~ Sex + Pclass + Age + Family + Embarked+title+SibSp, data = training,
method = "glm", metric = "ROC",trControl = fitControl)
# Same formula as above, now fit through caret; residual deviance remains 569.42
summary(glm.v1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1860 -0.5769 -0.3786 0.5260 2.5946
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.0357 1025.9097 0.016 0.98753
## Sex2 16.5045 1028.2749 0.016 0.98719
## Pclass2 -1.0102 0.3329 -3.035 0.00241 **
## Pclass3 -2.1189 0.3139 -6.751 1.47e-11 ***
## Age1 -0.5049 0.6140 -0.822 0.41085
## Age2 -0.5119 0.5240 -0.977 0.32860
## Age3 -0.7662 0.5910 -1.296 0.19483
## Age4 -0.3670 0.6257 -0.587 0.55752
## Age5 -0.7274 0.5707 -1.274 0.20250
## Age6 -0.3939 0.6234 -0.632 0.52748
## Age7 -1.4350 0.6458 -2.222 0.02628 *
## Age8 -0.8776 0.6327 -1.387 0.16542
## Family -0.3359 0.1540 -2.181 0.02915 *
## EmbarkedC -11.8017 1025.9096 -0.012 0.99082
## EmbarkedQ -12.1988 1025.9097 -0.012 0.99051
## EmbarkedS -12.4638 1025.9096 -0.012 0.99031
## titleMiss -16.7581 1028.2751 -0.016 0.98700
## titleMr -3.3255 0.6775 -4.908 9.19e-07 ***
## titleMrs -15.8955 1028.2752 -0.015 0.98767
## titleNoble -3.7130 0.9535 -3.894 9.85e-05 ***
## SibSp -0.1621 0.2228 -0.727 0.46700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 569.42 on 693 degrees of freedom
## AIC: 611.42
##
## Number of Fisher Scoring iterations: 14
# Looking for ways to improve model through feature creation
# Most of the passengers were in 3rd class
table(data$Pclass)
##
## 1 2 3
## 216 184 491
# Most of the passengers embarked at Southampton
table(data$Embarked)
##
## C Q S
## 2 168 77 644
# Updating the model with a simplified Embarked feature (a two-level "embarked at Southampton" indicator)
glm.v2 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S")+title+SibSp, data = training,
method = "glm", metric = "ROC",trControl = fitControl)
# Residual deviance went up slightly (570.28)
summary(glm.v2)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2061 -0.5665 -0.3733 0.5315 2.5978
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.1326 0.6349 6.509 7.57e-11 ***
## Sex2 15.5490 624.1263 0.025 0.980124
## Pclass2 -1.0534 0.3289 -3.203 0.001361 **
## Pclass3 -2.1929 0.3029 -7.240 4.48e-13 ***
## Age1 -0.5604 0.6116 -0.916 0.359495
## Age2 -0.6058 0.5154 -1.175 0.239812
## Age3 -0.7964 0.5902 -1.349 0.177268
## Age4 -0.4141 0.6237 -0.664 0.506737
## Age5 -0.8003 0.5657 -1.415 0.157136
## Age6 -0.4508 0.6188 -0.729 0.466282
## Age7 -1.5110 0.6400 -2.361 0.018230 *
## Age8 -0.9421 0.6280 -1.500 0.133589
## Family -0.3246 0.1523 -2.130 0.033137 *
## `I(Embarked == "S")TRUE` -0.5287 0.2453 -2.155 0.031162 *
## titleMiss -15.7682 624.1266 -0.025 0.979844
## titleMr -3.2396 0.6662 -4.863 1.16e-06 ***
## titleMrs -14.8638 624.1268 -0.024 0.981000
## titleNoble -3.6582 0.9463 -3.866 0.000111 ***
## SibSp -0.1775 0.2210 -0.803 0.421823
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 570.28 on 695 degrees of freedom
## AIC: 608.28
##
## Number of Fisher Scoring iterations: 13
# Checking whether nobility played any role in survival
glm.v3 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S")+I(title=="Noble")+I(title=="Mr")+SibSp
, data = training,
method = "glm", metric = "ROC",trControl = fitControl)
# Residual deviance went up slightly (578.61)
summary(glm.v3)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2574 -0.6147 -0.3660 0.5329 2.5652
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.71240 0.59097 6.282 3.35e-10 ***
## Sex2 0.24178 0.53831 0.449 0.65332
## Pclass2 -0.99039 0.32009 -3.094 0.00197 **
## Pclass3 -2.14962 0.29895 -7.191 6.46e-13 ***
## Age1 -0.34472 0.59092 -0.583 0.55965
## Age2 -0.58761 0.51066 -1.151 0.24986
## Age3 -0.50604 0.55256 -0.916 0.35977
## Age4 -0.05523 0.56801 -0.097 0.92254
## Age5 -0.43822 0.51203 -0.856 0.39209
## Age6 -0.05727 0.55211 -0.104 0.91739
## Age7 -1.08863 0.55274 -1.970 0.04890 *
## Age8 -0.49351 0.57129 -0.864 0.38767
## Family -0.25545 0.14464 -1.766 0.07737 .
## `I(Embarked == "S")TRUE` -0.50951 0.24325 -2.095 0.03621 *
## `I(title == "Noble")TRUE` -3.26511 0.77863 -4.193 2.75e-05 ***
## `I(title == "Mr")TRUE` -3.21667 0.60925 -5.280 1.29e-07 ***
## SibSp -0.21177 0.21258 -0.996 0.31916
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 578.61 on 697 degrees of freedom
## AIC: 612.61
##
## Number of Fisher Scoring iterations: 5
# Checking whether the "women and children first" protocol played a pivotal role
glm.v4 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+indicator5, data = training,
method = "glm", metric = "ROC",trControl = fitControl,preProcess=PP)
# Residual deviance went down to 548.39
summary(glm.v4)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9580 -0.5615 -0.4347 0.3300 2.4708
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.4917 0.1261 -3.900
## Sex2 -0.2185 0.3702 -0.590
## Pclass2 -0.5822 0.1664 -3.499
## Pclass3 -1.9485 0.2927 -6.658
## Age1 -0.1502 0.1777 -0.845
## Age2 -0.2025 0.1915 -1.058
## Age3 -0.3254 0.1920 -1.694
## Age4 -0.1195 0.1855 -0.644
## Age5 -0.4000 0.2626 -1.524
## Age6 -0.1095 0.2010 -0.545
## Age7 -0.5003 0.2134 -2.344
## Age8 -0.2457 0.2065 -1.190
## Family -0.3497 0.3712 -0.942
## `I(Embarked == "S")TRUE` -0.2287 0.1103 -2.074
## `I(title == "Noble")TRUE` -0.7783 0.1556 -5.002
## `I(title == "Mr")TRUE` -2.5434 0.4372 -5.817
## `I(title == "Mr" & Pclass == "3")TRUE` 1.2101 0.2921 4.143
## SibSp -0.6186 0.3716 -1.665
## indicator11 0.1971 0.1808 1.090
## indicator21 0.1037 0.1636 0.634
## indicator31 0.3003 0.1306 2.300
## indicator41 0.2664 0.1099 2.424
## indicator51 -0.1102 0.1750 -0.630
## Pr(>|z|)
## (Intercept) 9.60e-05 ***
## Sex2 0.555079
## Pclass2 0.000468 ***
## Pclass3 2.77e-11 ***
## Age1 0.397944
## Age2 0.290240
## Age3 0.090188 .
## Age4 0.519508
## Age5 0.127599
## Age6 0.586003
## Age7 0.019058 *
## Age8 0.234137
## Family 0.346113
## `I(Embarked == "S")TRUE` 0.038087 *
## `I(title == "Noble")TRUE` 5.67e-07 ***
## `I(title == "Mr")TRUE` 5.99e-09 ***
## `I(title == "Mr" & Pclass == "3")TRUE` 3.43e-05 ***
## SibSp 0.095963 .
## indicator11 0.275731
## indicator21 0.526014
## indicator31 0.021461 *
## indicator41 0.015338 *
## indicator51 0.528743
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 548.39 on 691 degrees of freedom
## AIC: 594.39
##
## Number of Fisher Scoring iterations: 6
anova(glm.v4$finalModel,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: .outcome
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 713 950.86
## Sex2 1 227.535 712 723.33
## Pclass2 1 4.320 711 719.01
## Pclass3 1 63.181 710 655.83
## Age1 1 0.082 709 655.75
## Age2 1 0.223 708 655.52
## Age3 1 0.938 707 654.59
## Age4 1 0.287 706 654.30
## Age5 1 0.684 705 653.62
## Age6 1 0.304 704 653.31
## Age7 1 10.137 703 643.18
## Age8 1 10.273 702 632.90
## Family 1 17.644 701 615.26
## `I(Embarked == "S")TRUE` 1 5.434 700 609.82
## `I(title == "Noble")TRUE` 1 0.852 699 608.97
## `I(title == "Mr")TRUE` 1 29.368 698 579.60
## `I(title == "Mr" & Pclass == "3")TRUE` 1 21.339 697 558.27
## SibSp 1 0.515 696 557.75
## indicator11 1 0.152 695 557.60
## indicator21 1 0.082 694 557.52
## indicator31 1 3.164 693 554.35
## indicator41 1 5.556 692 548.80
## indicator51 1 0.404 691 548.39
## Pr(>Chi)
## NULL
## Sex2 < 2.2e-16 ***
## Pclass2 0.037661 *
## Pclass3 1.886e-15 ***
## Age1 0.774526
## Age2 0.636983
## Age3 0.332777
## Age4 0.592317
## Age5 0.408359
## Age6 0.581381
## Age7 0.001453 **
## Age8 0.001350 **
## Family 2.664e-05 ***
## `I(Embarked == "S")TRUE` 0.019753 *
## `I(title == "Noble")TRUE` 0.356047
## `I(title == "Mr")TRUE` 5.984e-08 ***
## `I(title == "Mr" & Pclass == "3")TRUE` 3.849e-06 ***
## SibSp 0.472955
## indicator11 0.696281
## indicator21 0.774385
## indicator31 0.075260 .
## indicator41 0.018421 *
## indicator51 0.525064
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Let’s start off by setting the boosting tuning parameters for an AdaBoost model.
ada.grid <- expand.grid(.iter = c(500),
.maxdepth = 4,
.nu = 0.1)
# fitting the model
set.seed(1234)
ada.v1 <- train(Survived ~ Sex + Pclass + Age + Family + Embarked+title+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "ada",
metric = "ROC",
tuneGrid = ada.grid,
trControl = fitControl,preProcess=PP)
# The model was fit with iter = 500, maxdepth = 4 and nu = 0.1 (all held fixed by ada.grid).
ada.v1
## Boosted Classification Trees
##
## 714 samples
## 18 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (20 fold, repeated 5 times)
##
## Summary of sample sizes: 678, 679, 678, 679, 679, 679, ...
##
## Resampling results
##
## ROC Sens Spec ROC SD Sens SD Spec SD
## 0.8729458 0.9009091 0.7202747 0.07049063 0.06787657 0.1259303
##
## Tuning parameter 'iter' was held constant at a value of 500
##
## Tuning parameter 'maxdepth' was held constant at a value of 4
##
## Tuning parameter 'nu' was held constant at a value of 0.1
##
# Comparing to a boosted tree model (GBM)
gbmGrid <- expand.grid(interaction.depth = 5,
n.trees = 100,
shrinkage = 0.1,
n.minobsinnode = 20)
set.seed(1234)
gbm.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
tuneGrid = gbmGrid,preProcess=PP)
# The values used for the model were n.trees = 100, interaction.depth = 5, shrinkage = 0.1
# and n.minobsinnode = 20 (all fixed by gbmGrid).
gbm.v1
plot(gbm.v1$finalModel$fit)
Next, a random forest model, with the mtry parameter held fixed via a small tuning grid.
rfGrid <- data.frame(.mtry = 20)
# fitting the model
set.seed(1234)
rf.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "rf",
metric = "ROC",
tuneGrid = rfGrid,
trControl = fitControl,preProcess=PP)
# The model was fit with mtry = 20 (held fixed by rfGrid).
rf.v1
## Random Forest
##
## 714 samples
## 18 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (20 fold, repeated 5 times)
##
## Summary of sample sizes: 678, 679, 678, 679, 679, 679, ...
##
## Resampling results
##
## ROC Sens Spec ROC SD Sens SD Spec SD
## 0.8580507 0.8954545 0.6984615 0.0754542 0.06973289 0.129839
##
## Tuning parameter 'mtry' was held constant at a value of 20
##
# SVM
set.seed(1234)
svm.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "svmRadial",
tuneLength = 10,
metric = "ROC",
trControl = fitControl,preProcess=PP)
# The final values used for the model were sigma = 0.02901079 and C = 4.
svm.v1
## Support Vector Machines with Radial Basis Function Kernel
##
## 714 samples
## 18 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (20 fold, repeated 5 times)
##
## Summary of sample sizes: 678, 679, 678, 679, 679, 679, ...
##
## Resampling results across tuning parameters:
##
## C ROC Sens Spec ROC SD Sens SD
## 0.25 0.8559590 0.8940909 0.7202747 0.07245976 0.06808380
## 0.50 0.8555195 0.9018182 0.7122527 0.07383021 0.06254475
## 1.00 0.8539386 0.9059091 0.7106044 0.07444788 0.06272301
## 2.00 0.8568007 0.9018182 0.7195055 0.07532675 0.06642832
## 4.00 0.8603447 0.9004545 0.7193956 0.07204604 0.07008962
## 8.00 0.8547777 0.8995455 0.7230220 0.07491671 0.06876700
## 16.00 0.8531543 0.9000000 0.7069780 0.07024009 0.07018038
## 32.00 0.8437812 0.9009091 0.6982967 0.07790353 0.06848875
## 64.00 0.8342732 0.8950000 0.6924725 0.08003484 0.07008962
## 128.00 0.8334940 0.8950000 0.6882967 0.07954788 0.07068263
## Spec SD
## 0.1325488
## 0.1300229
## 0.1277979
## 0.1337085
## 0.1270181
## 0.1237785
## 0.1251919
## 0.1263558
## 0.1248563
## 0.1270579
##
## Tuning parameter 'sigma' was held constant at a value of 0.02901079
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.02901079 and C = 4.
set.seed(1234)
nnet.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "nnet",
trControl = fitControl,preProcess=PP)
# caret tunes the nnet size and decay parameters over its default grid.
nnet.v1
set.seed(1234)
xgb.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "xgbTree",
trControl = fitControl,preProcess=PP)
# The final values used for the model were nrounds = 50, max_depth = 3 and eta = 0.3.
xgb.v1
## eXtreme Gradient Boosting
##
## 714 samples
## 18 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (20 fold, repeated 5 times)
##
## Summary of sample sizes: 678, 679, 678, 679, 679, 679, ...
##
## Resampling results across tuning parameters:
##
## max_depth nrounds ROC Sens Spec ROC SD
## 1 50 0.8637987 0.9090909 0.7088462 0.07532399
## 1 100 0.8635627 0.9090909 0.7109890 0.07655009
## 1 150 0.8636251 0.9086364 0.7138462 0.07812143
## 2 50 0.8705257 0.9045455 0.7276374 0.07802330
## 2 100 0.8714123 0.9095455 0.7066484 0.07774913
## 2 150 0.8704570 0.9168182 0.6986264 0.07811869
## 3 50 0.8761501 0.9150000 0.7003846 0.07236124
## 3 100 0.8705669 0.9145455 0.7180220 0.07472821
## 3 150 0.8685390 0.9113636 0.7179670 0.07534815
## Sens SD Spec SD
## 0.06060606 0.1419008
## 0.06060606 0.1425109
## 0.06145922 0.1414600
## 0.06572716 0.1296556
## 0.06698366 0.1332367
## 0.06526673 0.1346154
## 0.06545831 0.1331716
## 0.07114908 0.1270460
## 0.06787504 0.1206444
##
## Tuning parameter 'eta' was held constant at a value of 0.3
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 3
## and eta = 0.3.
glm.predictions <- predict(glm.v4,newdata=testing)
ada.predictions <- predict(ada.v1,newdata=testing)
rf.predictions <- predict(rf.v1,newdata=testing)
svm.predictions <- predict(svm.v1,newdata=testing)
gbm.predictions <- predict(gbm.v1,newdata=testing)
nnet.predictions <- predict(nnet.v1,newdata=testing)
xgb.predictions <- predict(xgb.v1,newdata=testing)
#finTesting <- data.frame(testing,GLM=glm.predictions,ADA=ada.predictions,RF=rf.predictions,
# SVM=svm.predictions,GBM=gbm.predictions,NNET=nnet.predictions,XGB=xgb.predictions)
#write.csv(finTesting,"./Titanic/Testing/testing.csv", row.names=FALSE)
confusionMatrix(glm.predictions,testing$Survived) #Accuracy : 0.8079 (no feature engineering 0.8136)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 96 21
## 1 13 47
##
## Accuracy : 0.8079
## 95% CI : (0.7421, 0.8632)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 2.854e-08
##
## Kappa : 0.5849
## Mcnemar's Test P-Value : 0.2299
##
## Sensitivity : 0.8807
## Specificity : 0.6912
## Pos Pred Value : 0.8205
## Neg Pred Value : 0.7833
## Prevalence : 0.6158
## Detection Rate : 0.5424
## Detection Prevalence : 0.6610
## Balanced Accuracy : 0.7860
##
## 'Positive' Class : 0
##
confusionMatrix(ada.predictions,testing$Survived) #Accuracy : 0.7966 (no feature engineering 0.8079)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93 20
## 1 16 48
##
## Accuracy : 0.7966
## 95% CI : (0.7297, 0.8533)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.87e-07
##
## Kappa : 0.5653
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.8532
## Specificity : 0.7059
## Pos Pred Value : 0.8230
## Neg Pred Value : 0.7500
## Prevalence : 0.6158
## Detection Rate : 0.5254
## Detection Prevalence : 0.6384
## Balanced Accuracy : 0.7795
##
## 'Positive' Class : 0
##
confusionMatrix(rf.predictions,testing$Survived) #Accuracy : 0.7684 (no feature engineering 0.8023)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 89 21
## 1 20 47
##
## Accuracy : 0.7684
## 95% CI : (0.6992, 0.8284)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.153e-05
##
## Kappa : 0.5091
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8165
## Specificity : 0.6912
## Pos Pred Value : 0.8091
## Neg Pred Value : 0.7015
## Prevalence : 0.6158
## Detection Rate : 0.5028
## Detection Prevalence : 0.6215
## Balanced Accuracy : 0.7538
##
## 'Positive' Class : 0
##
confusionMatrix(svm.predictions,testing$Survived) #Accuracy : 0.791 (no feature engineering 0.7966)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93 21
## 1 16 47
##
## Accuracy : 0.791
## 95% CI : (0.7236, 0.8483)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 4.548e-07
##
## Kappa : 0.552
## Mcnemar's Test P-Value : 0.5108
##
## Sensitivity : 0.8532
## Specificity : 0.6912
## Pos Pred Value : 0.8158
## Neg Pred Value : 0.7460
## Prevalence : 0.6158
## Detection Rate : 0.5254
## Detection Prevalence : 0.6441
## Balanced Accuracy : 0.7722
##
## 'Positive' Class : 0
##
confusionMatrix(gbm.predictions,testing$Survived) #Accuracy : 0.8192 (no feature engineering 0.7966)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 17
## 1 15 51
##
## Accuracy : 0.8192
## 95% CI : (0.7545, 0.8729)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 3.784e-09
##
## Kappa : 0.6158
## Mcnemar's Test P-Value : 0.8597
##
## Sensitivity : 0.8624
## Specificity : 0.7500
## Pos Pred Value : 0.8468
## Neg Pred Value : 0.7727
## Prevalence : 0.6158
## Detection Rate : 0.5311
## Detection Prevalence : 0.6271
## Balanced Accuracy : 0.8062
##
## 'Positive' Class : 0
##
confusionMatrix(nnet.predictions,testing$Survived) #Accuracy :0.8249 (no feature engineering 0.8192)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 95 17
## 1 14 51
##
## Accuracy : 0.8249
## 95% CI : (0.7607, 0.8778)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.304e-09
##
## Kappa : 0.6268
## Mcnemar's Test P-Value : 0.7194
##
## Sensitivity : 0.8716
## Specificity : 0.7500
## Pos Pred Value : 0.8482
## Neg Pred Value : 0.7846
## Prevalence : 0.6158
## Detection Rate : 0.5367
## Detection Prevalence : 0.6328
## Balanced Accuracy : 0.8108
##
## 'Positive' Class : 0
##
confusionMatrix(xgb.predictions,testing$Survived) #Accuracy : 0.7966 (no feature engineering 0.7853)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 92 19
## 1 17 49
##
## Accuracy : 0.7966
## 95% CI : (0.7297, 0.8533)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.87e-07
##
## Kappa : 0.5678
## Mcnemar's Test P-Value : 0.8676
##
## Sensitivity : 0.8440
## Specificity : 0.7206
## Pos Pred Value : 0.8288
## Neg Pred Value : 0.7424
## Prevalence : 0.6158
## Detection Rate : 0.5198
## Detection Prevalence : 0.6271
## Balanced Accuracy : 0.7823
##
## 'Positive' Class : 0
##
cv.values <- resamples(list(Logit = glm.v4, Ada = ada.v1,
RF = rf.v1, SVM = svm.v1,GBM=gbm.v1,NNET=nnet.v1,XGB=xgb.v1))
dotplot(cv.values, metric = "ROC")
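A tabular view of the same comparison, assuming cv.values as built above:
# mean and quantiles of ROC, sensitivity and specificity across resamples
summary(cv.values)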
pr.df <- data.frame(glm.predictions,ada.predictions,rf.predictions,svm.predictions,gbm.predictions,nnet.predictions,xgb.predictions,Survived=testing$Survived)
comb.mod <- train(Survived~.,method = "ada",
metric = "ROC",
tuneGrid = ada.grid,
trControl = fitControl,preProcess=PP,data=pr.df)
fin <- predict(comb.mod,pr.df)
confusionMatrix(fin,testing$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 98 13
## 1 11 55
##
## Accuracy : 0.8644
## 95% CI : (0.805, 0.9112)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 2.439e-13
##
## Kappa : 0.7118
## Mcnemar's Test P-Value : 0.8383
##
## Sensitivity : 0.8991
## Specificity : 0.8088
## Pos Pred Value : 0.8829
## Neg Pred Value : 0.8333
## Prevalence : 0.6158
## Detection Rate : 0.5537
## Detection Prevalence : 0.6271
## Balanced Accuracy : 0.8540
##
## 'Positive' Class : 0
##
# Combined (stacked) model accuracy on the local testing split: 0.8644
Each of the models described above was used to generate predictions that were then submitted to Kaggle.com. The resulting leaderboard accuracies were:
GLM: 0.78469
ADA: 0.77990
RF: 0.77512
SVM: 0.78469
GBM: 0.77033
NNET: 0.78947
XGB: 0.77033
Combined model: 0.76555
The neural network approach alone showed the best leaderboard score. The combined (stacked) model produced the best result on the local testing split, but was the least accurate in the Kaggle evaluation phase, mainly due to overfitting.
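For reference, a submission file can be produced by applying the same feature engineering to test.csv and writing PassengerId plus the predicted class. A rough sketch, assuming a prepTest() helper (hypothetical, not shown above) that repeats the preprocessing steps on the test data:
# hypothetical sketch: reuse the best single model (nnet.v1) for the Kaggle submission
kaggle <- read.csv("./Titanic/test.csv", header = TRUE, stringsAsFactors = TRUE)
kaggle <- prepTest(kaggle)  # hypothetical helper applying the same feature engineering
submission <- data.frame(PassengerId = kaggle$PassengerId,
                         Survived = as.numeric(as.character(predict(nnet.v1, newdata = kaggle))))
write.csv(submission, "./Titanic/submission.csv", row.names = FALSE)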