Kaggle packaged the data for the Titanic challenge into two csv-format files:
train.csv (attributes and known outcomes [survived or perished] for a subset of the passengers)
test.csv (attributes, without outcomes, for the remaining passengers)
library(ggplot2)
library(lattice)
library(caret)
data <- read.csv("./Titanic/train.csv",header = T, stringsAsFactors = T)
data$Survived <- as.factor(data$Survived)
levels(data$Survived) <- c("0","1")
Putting some simple data visualization tools to work can take us a long way toward understanding what might influence the outcome we’re trying to predict, in this case whether or not a passenger survived. Below is the code and the graphs it produces:
par(mfrow=c(4,2))
barplot(table(data$Survived),
names.arg = c("Perished", "Survived"),
main="Survived (passenger fate)", col="black")
barplot(table(data$Pclass),
names.arg = c("first", "second", "third"),
main="Pclass (passenger traveling class)", col="firebrick")
barplot(table(data$Sex), main="Sex (gender)", col="darkviolet")
hist(data$Age, main="Age", xlab = NULL, col="brown")
barplot(table(data$SibSp), main="SibSp (siblings + spouse aboard)",
col="darkblue")
barplot(table(data$Parch), main="Parch (parents + kids aboard)",
col="gray50")
hist(data$Fare, main="Fare (fee paid for ticket[s])", xlab = NULL,
col="darkgreen")
barplot(table(data$Embarked),
names.arg = c("","Cherbourg", "Queenstown", "Southampton"),
main="Embarked (port of embarkation)", col="sienna")
counts <- table(data$Survived,data$Sex)
barplot(counts, xlab="Sex", ylab="Number Survived",main="Survived passengers by Sex")
counts[2] / (counts[1] + counts[2])
## [1] 0.7420382
counts[4] / (counts[3] + counts[4])
## [1] 0.1889081
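A more direct way to read off these survival rates, assuming counts still holds the Survived-by-Sex table built above, is prop.table():
# proportion surviving within each column (i.e. within each Sex)
prop.table(counts, 2)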
counts <- table(data$Survived,data$Pclass)
barplot(counts, xlab="Class", ylab="Number Survived",main="Survived passengers by Class")
counts[2] / (counts[1] + counts[2])
## [1] 0.6296296
counts[4] / (counts[3] + counts[4])
## [1] 0.4728261
counts[6] / (counts[5] + counts[6])
## [1] 0.2423625
counts <- table(data$Survived,data$SibSp)
barplot(counts, xlab="Sublings", ylab="Number Survived",main="Survived passengers by Number of Siblings/Spouses Aboard")
counts <- table(data$Survived,data$Parch)
barplot(counts, xlab="Children", ylab="Number Survived",main="Survived passengers by Number of Children Aboard")
library(Hmisc)
library(ISLR)
cutFare <- cut2(data$Fare, g=3)
qplot(cutFare, Parch, data=data, fill=cutFare, geom=c("boxplot", "jitter"))
t1 <- table(cutFare, data$Survived)
t1
##
## cutFare 0 1
## [ 0.00, 8.68) 247 61
## [ 8.68, 26.25) 172 116
## [26.25,512.33] 130 165
prop.table(t1, 1)
##
## cutFare 0 1
## [ 0.00, 8.68) 0.8019481 0.1980519
## [ 8.68, 26.25) 0.5972222 0.4027778
## [26.25,512.33] 0.4406780 0.5593220
boxplot(data$Age ~ data$Survived,
main="Passenger Fate by Age",
xlab="Survived", ylab="Age")
Let’s visualize the missing data by feature:
## map missing data by feature
require(Amelia)
missmap(data, main="Missings Values", col=c("yellow", "black"), legend=FALSE)
Let’s create a function to extract the honorific (i.e. title) from the Name feature. Then we check all the unique titles and look at how many Age values are missing for each title.
getTitle <- function(data) {
title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
title.comma.end <- title.dot.start + attr(title.dot.start, "match.length")-1
data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
return (data$Title)
}
data$title <- getTitle(data)
# checking for all the unique titles listed
unique(data$title)
## [1] "Mr" "Mrs" "Miss" "Master"
## [5] "Don" "Rev" "Dr" "Mme"
## [9] "Ms" "Major" "Lady" "Sir"
## [13] "Mlle" "Col" "Capt" "the Countess"
## [17] "Jonkheer"
# number of missing and non-missing Age values by title
bystats(data$Age, data$title)
##
## Mean of data$Age by
##
## N Missing Mean
## Capt 1 0 70.000000
## Col 2 0 58.000000
## Don 1 0 40.000000
## Dr 6 1 42.000000
## Jonkheer 1 0 38.000000
## Lady 1 0 48.000000
## Major 2 0 48.500000
## Master 36 4 4.574167
## Miss 146 36 21.773973
## Mlle 2 0 24.000000
## Mme 1 0 24.000000
## Mr 398 119 32.368090
## Mrs 108 17 35.898148
## Ms 1 0 28.000000
## Rev 6 0 43.166667
## Sir 1 0 49.000000
## the Countess 1 0 33.000000
## ALL 714 177 29.699118
summary(data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.42 20.12 28.00 29.70 38.00 80.00 177
var.levels <- c("Dr","Master","Miss","Mr","Mrs")
# impute missing values with the within-group median (Hmisc's impute() defaults to the median)
imputeMedian <- function(impute.var, filter.var, var.levels) {
for (v in var.levels) {
impute.var[ which( filter.var == v)] <- impute(impute.var[
which( filter.var == v)])
}
return (impute.var)
}
data$Age <- imputeMedian(data$Age,data$title,var.levels=var.levels)
summary(data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.42 21.00 30.00 29.39 35.00 80.00
checkNAs <- function(data) {
  # count the non-missing values in each column
  counts <- rep(NA, ncol(data))
  for (i in 1:ncol(data)) {
    counts[i] <- sum(!is.na(data[, i]))
  }
  return(counts)
}
checkNAs(data)
## [1] 891 891 891 891 891 891 891 891 891 891 891 891 891
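The same per-column count of non-missing values is also available as a base R one-liner:
# non-NA count for every column, with column names attached
colSums(!is.na(data))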
# analyzing Fare rates
summary(data$Fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.91 14.45 32.20 31.00 512.30
cheapFares <- subset(data, Fare < 7)
cheapFares[order(cheapFares$Fare, cheapFares$Pclass), c("Age", "title", "Pclass", "Fare")]
## Age title Pclass Fare
## 264 40.0 Mr 1 0.0000
## 634 30.0 Mr 1 0.0000
## 807 39.0 Mr 1 0.0000
## 816 30.0 Mr 1 0.0000
## 823 38.0 Jonkheer 1 0.0000
## 278 30.0 Mr 2 0.0000
## 414 30.0 Mr 2 0.0000
## 467 30.0 Mr 2 0.0000
## 482 30.0 Mr 2 0.0000
## 675 30.0 Mr 2 0.0000
## 733 30.0 Mr 2 0.0000
## 180 36.0 Mr 3 0.0000
## 272 25.0 Mr 3 0.0000
## 303 19.0 Mr 3 0.0000
## 598 49.0 Mr 3 0.0000
## 379 20.0 Mr 3 4.0125
## 873 33.0 Mr 1 5.0000
## 327 61.0 Mr 3 6.2375
## 844 34.5 Mr 3 6.4375
## 819 43.0 Mr 3 6.4500
## 203 34.0 Mr 3 6.4958
## 372 18.0 Mr 3 6.4958
## 144 19.0 Mr 3 6.7500
## 655 18.0 Miss 3 6.7500
## 412 30.0 Mr 3 6.8583
## 826 30.0 Mr 3 6.9500
## 130 45.0 Mr 3 6.9750
## 805 27.0 Mr 3 6.9750
dataFare <- subset(data,Fare==0)
barplot(table(dataFare$Survived))
data$Fare[which(data$Fare==0)] <- NA
data$Pclass <- as.factor(data$Pclass)
data$Fare <- imputeMedian(data$Fare,data$Pclass,var.levels=as.numeric(levels(data$Pclass)))
table(is.na(data$Fare))
##
## FALSE
## 891
# new feature creation
data$Family <- data$Parch + data$SibSp
barplot(log2(table(data$Family)))
unique(data$title)
## [1] "Mr" "Mrs" "Miss" "Master"
## [5] "Don" "Rev" "Dr" "Mme"
## [9] "Ms" "Major" "Lady" "Sir"
## [13] "Mlle" "Col" "Capt" "the Countess"
## [17] "Jonkheer"
changeTitles <- function(data, old.titles, new.title) {
for (honorific in old.titles) {
data$title[ which( data$title == honorific)] <- new.title
}
return (data$title)
}
## Title consolidation
data$title <- changeTitles(data,
c("Capt", "Col", "Don", "Dr",
"Jonkheer", "Lady", "Major",
"Rev", "Sir"),
"Noble")
data$title <- changeTitles(data, c("the Countess", "Ms"),
"Mrs")
data$title <- changeTitles(data, c("Mlle", "Mme"), "Miss")
data$title <- as.factor(data$title)
unique(data$title)
## [1] Mr Mrs Miss Master Noble
## Levels: Master Miss Mr Mrs Noble
# Recode Sex as a factor: male = 1, female = 2
data$Sex <- as.character(data$Sex)
data$Sex[which(data$Sex=="female")] <- "2"
data$Sex[which(data$Sex=="male")] <- "1"
data$Sex <- as.factor(data$Sex)
# Bin Age into deciles with cut2
split <- cut2(x = data$Age,g = 10)
table(split)
## split
## [ 0.42,17.0) [17.00,20.5) [20.50,23.0) [23.00,27.0) [27.00,30.5)
## 104 79 88 88 209
## [30.50,34.5) [34.50,39.0) [39.00,48.0) [48.00,80.0]
## 69 76 89 89
qplot(split, data$Age, data=data, fill=split, geom=c("boxplot", "jitter"))
data$Age[which(data$Age<17)] <- 0
data$Age[which(data$Age>=17&data$Age<20)] <- 1
data$Age[which(data$Age>=20&data$Age<23)] <- 2
data$Age[which(data$Age>=23&data$Age<27)] <- 3
data$Age[which(data$Age>=27&data$Age<30)] <- 4
data$Age[which(data$Age>=30&data$Age<34)] <- 5
data$Age[which(data$Age>=34&data$Age<39)] <- 6
data$Age[which(data$Age>=39&data$Age<48)] <- 7
data$Age[which(data$Age>=48)] <- 8
data$Age <- as.factor(data$Age)
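An equivalent, more compact alternative to the manual recoding above (a sketch, assuming Age were still numeric at this point) is base R's cut():
# left-closed bins [0,17), [17,20), ..., [48, Inf) labelled 0-8; right = FALSE
# matches the >=/< conditions used above
data$Age <- cut(data$Age, breaks = c(-Inf, 17, 20, 23, 27, 30, 34, 39, 48, Inf),
                labels = 0:8, right = FALSE)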
#Family plays an important role
# Single Mother Aged <27
data$indicator1 <- as.factor(ifelse(data$Sex==2&as.numeric(data$Age)<=4&data$Parch>0,1,0))
# Single Mother Aged >27
data$indicator2 <- as.factor(ifelse(data$Sex==2&as.numeric(data$Age)>4&data$Parch>0,1,0))
# Married couple with no kids
data$indicator3 <- as.factor(ifelse(data$SibSp==1&data$Parch==0,1,0))
# Single relatives with no kids
data$indicator4 <- as.factor(ifelse(data$SibSp>1&data$Parch==0,1,0))
# Young single males in 2nd or 3rd classes
data$indicator5 <- as.factor(ifelse(as.numeric(data$Age)<4&data$Sex=="1"&data$Family==0&as.numeric(data$Pclass)>1,1,0))
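Before modeling, it is worth a quick sanity check that these indicators actually separate survivors from non-survivors; a small sketch for one of them:
# survival rate for "married couple with no kids" vs. everyone else
prop.table(table(data$indicator3, data$Survived), 1)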
Let’s split the data into training (80%) and testing (20%) sets:
set.seed(1234)
inBuild <- createDataPartition(data$Survived,p=.8,list = F)
training <- data[inBuild,]
testing <- data[-inBuild,]
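createDataPartition() stratifies on the outcome, so both splits should keep roughly the original survival rate; a quick check:
# class balance in the training and testing splits
prop.table(table(training$Survived))
prop.table(table(testing$Survived))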
A generalized linear model (GLM) is a flexible generalization of ordinary linear regression that allows response variables with error distributions other than the normal distribution. The GLM relates the linear predictor to the response variable via a link function and allows the variance of each measurement to be a function of its predicted value.
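For a binary outcome with the logit link, the GLM models the log-odds of survival as a linear function of the predictors. A toy illustration (not part of the model fit) of how the linear predictor maps to a probability:
# logit link: probability = plogis(linear predictor)
eta <- c(-2, 0, 2)   # example values of the linear predictor x'beta
p   <- plogis(eta)   # inverse logit: exp(eta) / (1 + exp(eta))
log(p / (1 - p))     # applying the logit link recovers eta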
set.seed(1234)
mod1.1 <- glm(Survived~ Pclass+Sex+Age+Family+Fare+Embarked+title,family=binomial("logit"),data=training)
#shows importance of variables
anova(mod1.1,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 713 950.86
## Pclass 2 78.060 711 872.80 < 2.2e-16 ***
## Sex 1 216.975 710 655.83 < 2.2e-16 ***
## Age 8 22.927 702 632.90 0.003459 **
## Family 1 17.644 701 615.26 2.664e-05 ***
## Fare 1 2.815 700 612.44 0.093361 .
## Embarked 3 4.721 697 607.72 0.193419
## title 4 39.487 693 568.24 5.527e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Null deviance shows the error (distance of predicted from actual) of a model with no predictors (random guessing)
# Residual deviance shows the error of the fitted model (568.24)
summary(mod1.1)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Family + Fare +
## Embarked + title, family = binomial("logit"), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2369 -0.5780 -0.3844 0.5328 2.5669
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.572e+01 1.022e+03 0.015 0.98773
## Pclass2 -8.111e-01 3.691e-01 -2.197 0.02800 *
## Pclass3 -1.876e+00 3.716e-01 -5.047 4.50e-07 ***
## Sex2 1.656e+01 1.028e+03 0.016 0.98715
## Age1 -5.829e-01 6.124e-01 -0.952 0.34121
## Age2 -5.879e-01 5.236e-01 -1.123 0.26149
## Age3 -8.437e-01 5.927e-01 -1.423 0.15460
## Age4 -4.245e-01 6.258e-01 -0.678 0.49755
## Age5 -7.776e-01 5.690e-01 -1.366 0.17179
## Age6 -5.090e-01 6.310e-01 -0.807 0.41988
## Age7 -1.469e+00 6.504e-01 -2.258 0.02394 *
## Age8 -8.755e-01 6.316e-01 -1.386 0.16573
## Family -4.643e-01 9.714e-02 -4.779 1.76e-06 ***
## Fare 3.755e-03 3.099e-03 1.212 0.22561
## EmbarkedC -1.179e+01 1.022e+03 -0.012 0.99079
## EmbarkedQ -1.216e+01 1.022e+03 -0.012 0.99051
## EmbarkedS -1.239e+01 1.022e+03 -0.012 0.99032
## titleMiss -1.677e+01 1.028e+03 -0.016 0.98699
## titleMr -3.269e+00 6.651e-01 -4.915 8.89e-07 ***
## titleMrs -1.582e+01 1.028e+03 -0.015 0.98772
## titleNoble -3.634e+00 9.447e-01 -3.847 0.00012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 568.24 on 693 degrees of freedom
## AIC: 610.24
##
## Number of Fisher Scoring iterations: 14
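Both deviances reported in the summary can be pulled straight from the fitted object, which also gives a quick McFadden-style pseudo R-squared; a small sketch:
mod1.1$null.deviance                        # deviance of the intercept-only model
mod1.1$deviance                             # residual deviance of the fitted model
1 - mod1.1$deviance / mod1.1$null.deviance  # McFadden's pseudo R-squared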
# diagnostic plots for the fitted GLM
plot(mod1.1)
Let’s refit the model, dropping the low-importance Fare feature and adding SibSp:
mod1.1 <- glm(Survived~ Sex + Pclass + Age + Family + Embarked+title+SibSp,family=binomial("logit"),data=training)
summary(mod1.1)
##
## Call:
## glm(formula = Survived ~ Sex + Pclass + Age + Family + Embarked +
## title + SibSp, family = binomial("logit"), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1860 -0.5769 -0.3786 0.5260 2.5946
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.0357 1025.9097 0.016 0.98753
## Sex2 16.5045 1028.2749 0.016 0.98719
## Pclass2 -1.0102 0.3329 -3.035 0.00241 **
## Pclass3 -2.1189 0.3139 -6.751 1.47e-11 ***
## Age1 -0.5049 0.6140 -0.822 0.41085
## Age2 -0.5119 0.5240 -0.977 0.32860
## Age3 -0.7662 0.5910 -1.296 0.19483
## Age4 -0.3670 0.6257 -0.587 0.55752
## Age5 -0.7274 0.5707 -1.274 0.20250
## Age6 -0.3939 0.6234 -0.632 0.52748
## Age7 -1.4350 0.6458 -2.222 0.02628 *
## Age8 -0.8776 0.6327 -1.387 0.16542
## Family -0.3359 0.1540 -2.181 0.02915 *
## EmbarkedC -11.8017 1025.9096 -0.012 0.99082
## EmbarkedQ -12.1988 1025.9097 -0.012 0.99051
## EmbarkedS -12.4638 1025.9096 -0.012 0.99031
## titleMiss -16.7581 1028.2751 -0.016 0.98700
## titleMr -3.3255 0.6775 -4.908 9.19e-07 ***
## titleMrs -15.8955 1028.2752 -0.015 0.98767
## titleNoble -3.7130 0.9535 -3.894 9.85e-05 ***
## SibSp -0.1621 0.2228 -0.727 0.46700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 569.42 on 693 degrees of freedom
## AIC: 611.42
##
## Number of Fisher Scoring iterations: 14
Residual deviance is essentially unchanged (569.42 vs. 568.24), which suggests we should consider feature engineering around Fare and Embarked.
fitControl <- trainControl( method = "repeatedcv",
summaryFunction=twoClassSummary,
number = 20,
repeats = 5,
classProbs=TRUE,
savePredictions=TRUE)
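One caveat: with classProbs = TRUE and the twoClassSummary ROC metric, newer versions of caret require the outcome factor levels to be syntactically valid R names, which "0" and "1" are not. The run shown here went through with the numeric labels, but if train() complains, a recode along these lines (a sketch, not applied here) fixes it:
# make the outcome levels valid R names ("X0"/"X1") so class probabilities can be generated
# levels(training$Survived) <- make.names(levels(training$Survived))
# levels(testing$Survived)  <- make.names(levels(testing$Survived))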
PP <- c('center', 'scale')
# Setting tuneGrid parameter for boosted tree models
# grid <- expand.grid(interaction.depth = c(1, 5, 9),
# n.trees = (1:30)*50, shrinkage = 0.1,n.minobsinnode = 20)
# 1. GLM model
glm.v1 <- train(Survived ~ Sex + Pclass + Age + Family + Embarked+title+SibSp, data = training,
method = "glm", metric = "ROC",trControl = fitControl)
# Same formula as above, now fit through caret; residual deviance remains 569.42
summary(glm.v1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1860 -0.5769 -0.3786 0.5260 2.5946
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.0357 1025.9097 0.016 0.98753
## Sex2 16.5045 1028.2749 0.016 0.98719
## Pclass2 -1.0102 0.3329 -3.035 0.00241 **
## Pclass3 -2.1189 0.3139 -6.751 1.47e-11 ***
## Age1 -0.5049 0.6140 -0.822 0.41085
## Age2 -0.5119 0.5240 -0.977 0.32860
## Age3 -0.7662 0.5910 -1.296 0.19483
## Age4 -0.3670 0.6257 -0.587 0.55752
## Age5 -0.7274 0.5707 -1.274 0.20250
## Age6 -0.3939 0.6234 -0.632 0.52748
## Age7 -1.4350 0.6458 -2.222 0.02628 *
## Age8 -0.8776 0.6327 -1.387 0.16542
## Family -0.3359 0.1540 -2.181 0.02915 *
## EmbarkedC -11.8017 1025.9096 -0.012 0.99082
## EmbarkedQ -12.1988 1025.9097 -0.012 0.99051
## EmbarkedS -12.4638 1025.9096 -0.012 0.99031
## titleMiss -16.7581 1028.2751 -0.016 0.98700
## titleMr -3.3255 0.6775 -4.908 9.19e-07 ***
## titleMrs -15.8955 1028.2752 -0.015 0.98767
## titleNoble -3.7130 0.9535 -3.894 9.85e-05 ***
## SibSp -0.1621 0.2228 -0.727 0.46700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 569.42 on 693 degrees of freedom
## AIC: 611.42
##
## Number of Fisher Scoring iterations: 14
# Looking for ways to improve model through feature creation
# Most of the passengers were in 3rd class
table(data$Pclass)
##
## 1 2 3
## 216 184 491
# Most of the passengers embarked at Southampton
table(data$Embarked)
##
## C Q S
## 2 168 77 644
# Updating the model with a simplified Embarked feature (a two-level "embarked at Southampton" indicator)
glm.v2 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S")+title+SibSp, data = training,
method = "glm", metric = "ROC",trControl = fitControl)
# Residual deviance went up slightly (570.28)
summary(glm.v2)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2061 -0.5665 -0.3733 0.5315 2.5978
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.1326 0.6349 6.509 7.57e-11 ***
## Sex2 15.5490 624.1263 0.025 0.980124
## Pclass2 -1.0534 0.3289 -3.203 0.001361 **
## Pclass3 -2.1929 0.3029 -7.240 4.48e-13 ***
## Age1 -0.5604 0.6116 -0.916 0.359495
## Age2 -0.6058 0.5154 -1.175 0.239812
## Age3 -0.7964 0.5902 -1.349 0.177268
## Age4 -0.4141 0.6237 -0.664 0.506737
## Age5 -0.8003 0.5657 -1.415 0.157136
## Age6 -0.4508 0.6188 -0.729 0.466282
## Age7 -1.5110 0.6400 -2.361 0.018230 *
## Age8 -0.9421 0.6280 -1.500 0.133589
## Family -0.3246 0.1523 -2.130 0.033137 *
## `I(Embarked == "S")TRUE` -0.5287 0.2453 -2.155 0.031162 *
## titleMiss -15.7682 624.1266 -0.025 0.979844
## titleMr -3.2396 0.6662 -4.863 1.16e-06 ***
## titleMrs -14.8638 624.1268 -0.024 0.981000
## titleNoble -3.6582 0.9463 -3.866 0.000111 ***
## SibSp -0.1775 0.2210 -0.803 0.421823
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 570.28 on 695 degrees of freedom
## AIC: 608.28
##
## Number of Fisher Scoring iterations: 13
# Checking whether nobility played any role in survival
glm.v3 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S")+I(title=="Noble")+I(title=="Mr")+SibSp
, data = training,
method = "glm", metric = "ROC",trControl = fitControl)
# Residual deviance went up slightly (578.61)
summary(glm.v3)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2574 -0.6147 -0.3660 0.5329 2.5652
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.71240 0.59097 6.282 3.35e-10 ***
## Sex2 0.24178 0.53831 0.449 0.65332
## Pclass2 -0.99039 0.32009 -3.094 0.00197 **
## Pclass3 -2.14962 0.29895 -7.191 6.46e-13 ***
## Age1 -0.34472 0.59092 -0.583 0.55965
## Age2 -0.58761 0.51066 -1.151 0.24986
## Age3 -0.50604 0.55256 -0.916 0.35977
## Age4 -0.05523 0.56801 -0.097 0.92254
## Age5 -0.43822 0.51203 -0.856 0.39209
## Age6 -0.05727 0.55211 -0.104 0.91739
## Age7 -1.08863 0.55274 -1.970 0.04890 *
## Age8 -0.49351 0.57129 -0.864 0.38767
## Family -0.25545 0.14464 -1.766 0.07737 .
## `I(Embarked == "S")TRUE` -0.50951 0.24325 -2.095 0.03621 *
## `I(title == "Noble")TRUE` -3.26511 0.77863 -4.193 2.75e-05 ***
## `I(title == "Mr")TRUE` -3.21667 0.60925 -5.280 1.29e-07 ***
## SibSp -0.21177 0.21258 -0.996 0.31916
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 578.61 on 697 degrees of freedom
## AIC: 612.61
##
## Number of Fisher Scoring iterations: 5
# Checking whether the "women and children first" protocol played a pivotal role
glm.v4 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+indicator5, data = training,
method = "glm", metric = "ROC",trControl = fitControl,preProcess=PP)
# Residual deviance went down to 548.39
summary(glm.v4)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9580 -0.5615 -0.4347 0.3300 2.4708
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.4917 0.1261 -3.900
## Sex2 -0.2185 0.3702 -0.590
## Pclass2 -0.5822 0.1664 -3.499
## Pclass3 -1.9485 0.2927 -6.658
## Age1 -0.1502 0.1777 -0.845
## Age2 -0.2025 0.1915 -1.058
## Age3 -0.3254 0.1920 -1.694
## Age4 -0.1195 0.1855 -0.644
## Age5 -0.4000 0.2626 -1.524
## Age6 -0.1095 0.2010 -0.545
## Age7 -0.5003 0.2134 -2.344
## Age8 -0.2457 0.2065 -1.190
## Family -0.3497 0.3712 -0.942
## `I(Embarked == "S")TRUE` -0.2287 0.1103 -2.074
## `I(title == "Noble")TRUE` -0.7783 0.1556 -5.002
## `I(title == "Mr")TRUE` -2.5434 0.4372 -5.817
## `I(title == "Mr" & Pclass == "3")TRUE` 1.2101 0.2921 4.143
## SibSp -0.6186 0.3716 -1.665
## indicator11 0.1971 0.1808 1.090
## indicator21 0.1037 0.1636 0.634
## indicator31 0.3003 0.1306 2.300
## indicator41 0.2664 0.1099 2.424
## indicator51 -0.1102 0.1750 -0.630
## Pr(>|z|)
## (Intercept) 9.60e-05 ***
## Sex2 0.555079
## Pclass2 0.000468 ***
## Pclass3 2.77e-11 ***
## Age1 0.397944
## Age2 0.290240
## Age3 0.090188 .
## Age4 0.519508
## Age5 0.127599
## Age6 0.586003
## Age7 0.019058 *
## Age8 0.234137
## Family 0.346113
## `I(Embarked == "S")TRUE` 0.038087 *
## `I(title == "Noble")TRUE` 5.67e-07 ***
## `I(title == "Mr")TRUE` 5.99e-09 ***
## `I(title == "Mr" & Pclass == "3")TRUE` 3.43e-05 ***
## SibSp 0.095963 .
## indicator11 0.275731
## indicator21 0.526014
## indicator31 0.021461 *
## indicator41 0.015338 *
## indicator51 0.528743
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 548.39 on 691 degrees of freedom
## AIC: 594.39
##
## Number of Fisher Scoring iterations: 6
anova(glm.v4$finalModel,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: .outcome
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 713 950.86
## Sex2 1 227.535 712 723.33
## Pclass2 1 4.320 711 719.01
## Pclass3 1 63.181 710 655.83
## Age1 1 0.082 709 655.75
## Age2 1 0.223 708 655.52
## Age3 1 0.938 707 654.59
## Age4 1 0.287 706 654.30
## Age5 1 0.684 705 653.62
## Age6 1 0.304 704 653.31
## Age7 1 10.137 703 643.18
## Age8 1 10.273 702 632.90
## Family 1 17.644 701 615.26
## `I(Embarked == "S")TRUE` 1 5.434 700 609.82
## `I(title == "Noble")TRUE` 1 0.852 699 608.97
## `I(title == "Mr")TRUE` 1 29.368 698 579.60
## `I(title == "Mr" & Pclass == "3")TRUE` 1 21.339 697 558.27
## SibSp 1 0.515 696 557.75
## indicator11 1 0.152 695 557.60
## indicator21 1 0.082 694 557.52
## indicator31 1 3.164 693 554.35
## indicator41 1 5.556 692 548.80
## indicator51 1 0.404 691 548.39
## Pr(>Chi)
## NULL
## Sex2 < 2.2e-16 ***
## Pclass2 0.037661 *
## Pclass3 1.886e-15 ***
## Age1 0.774526
## Age2 0.636983
## Age3 0.332777
## Age4 0.592317
## Age5 0.408359
## Age6 0.581381
## Age7 0.001453 **
## Age8 0.001350 **
## Family 2.664e-05 ***
## `I(Embarked == "S")TRUE` 0.019753 *
## `I(title == "Noble")TRUE` 0.356047
## `I(title == "Mr")TRUE` 5.984e-08 ***
## `I(title == "Mr" & Pclass == "3")TRUE` 3.849e-06 ***
## SibSp 0.472955
## indicator11 0.696281
## indicator21 0.774385
## indicator31 0.075260 .
## indicator41 0.018421 *
## indicator51 0.525064
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Let’s start off by setting the boosting tuning parameters for an AdaBoost model.
ada.grid <- expand.grid(.iter = c(500),
.maxdepth = 4,
.nu = 0.1)
# fitting the model
set.seed(1234)
ada.v1 <- train(Survived ~ Sex + Pclass + Age + Family + Embarked+title+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "ada",
metric = "ROC",
tuneGrid = ada.grid,
trControl = fitControl,preProcess=PP)
# The model was fit with iter = 500, maxdepth = 4 and nu = 0.1 (all held fixed by ada.grid).
ada.v1
## Boosted Classification Trees
##
## 714 samples
## 18 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (20 fold, repeated 5 times)
##
## Summary of sample sizes: 678, 679, 678, 679, 679, 679, ...
##
## Resampling results
##
## ROC Sens Spec ROC SD Sens SD Spec SD
## 0.8729458 0.9009091 0.7202747 0.07049063 0.06787657 0.1259303
##
## Tuning parameter 'iter' was held constant at a value of 500
##
## Tuning parameter 'maxdepth' was held constant at a value of 4
##
## Tuning parameter 'nu' was held constant at a value of 0.1
##
# Comparing to a boosted tree model (GBM)
gbmGrid <- expand.grid(interaction.depth = 5,
n.trees = 100,
shrinkage = 0.1,
n.minobsinnode = 20)
set.seed(1234)
gbm.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
tuneGrid = gbmGrid,preProcess=PP)
# The values used for the model were n.trees = 100, interaction.depth = 5, shrinkage = 0.1
# and n.minobsinnode = 20 (all fixed by gbmGrid).
gbm.v1
plot(gbm.v1$finalModel$fit)
Next, a random forest model, with the mtry parameter held fixed via a small tuning grid.
rfGrid <- data.frame(.mtry = 20)
# fitting the model
set.seed(1234)
rf.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "rf",
metric = "ROC",
tuneGrid = rfGrid,
trControl = fitControl,preProcess=PP)
# The model was fit with mtry = 20 (held fixed by rfGrid).
rf.v1
## Random Forest
##
## 714 samples
## 18 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (20 fold, repeated 5 times)
##
## Summary of sample sizes: 678, 679, 678, 679, 679, 679, ...
##
## Resampling results
##
## ROC Sens Spec ROC SD Sens SD Spec SD
## 0.8580507 0.8954545 0.6984615 0.0754542 0.06973289 0.129839
##
## Tuning parameter 'mtry' was held constant at a value of 20
##
# SVM
set.seed(1234)
svm.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "svmRadial",
tuneLength = 10,
metric = "ROC",
trControl = fitControl,preProcess=PP)
# The final values used for the model were sigma = 0.02901079 and C = 4.
svm.v1
## Support Vector Machines with Radial Basis Function Kernel
##
## 714 samples
## 18 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (20 fold, repeated 5 times)
##
## Summary of sample sizes: 678, 679, 678, 679, 679, 679, ...
##
## Resampling results across tuning parameters:
##
## C ROC Sens Spec ROC SD Sens SD
## 0.25 0.8559590 0.8940909 0.7202747 0.07245976 0.06808380
## 0.50 0.8555195 0.9018182 0.7122527 0.07383021 0.06254475
## 1.00 0.8539386 0.9059091 0.7106044 0.07444788 0.06272301
## 2.00 0.8568007 0.9018182 0.7195055 0.07532675 0.06642832
## 4.00 0.8603447 0.9004545 0.7193956 0.07204604 0.07008962
## 8.00 0.8547777 0.8995455 0.7230220 0.07491671 0.06876700
## 16.00 0.8531543 0.9000000 0.7069780 0.07024009 0.07018038
## 32.00 0.8437812 0.9009091 0.6982967 0.07790353 0.06848875
## 64.00 0.8342732 0.8950000 0.6924725 0.08003484 0.07008962
## 128.00 0.8334940 0.8950000 0.6882967 0.07954788 0.07068263
## Spec SD
## 0.1325488
## 0.1300229
## 0.1277979
## 0.1337085
## 0.1270181
## 0.1237785
## 0.1251919
## 0.1263558
## 0.1248563
## 0.1270579
##
## Tuning parameter 'sigma' was held constant at a value of 0.02901079
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.02901079 and C = 4.
set.seed(1234)
nnet.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "nnet",
trControl = fitControl,preProcess=PP)
# caret tunes the nnet size and decay parameters over its default grid.
nnet.v1
set.seed(1234)
xgb.v1 <- train(Survived ~ Sex + Pclass + Age + Family + I(Embarked=="S") +
I(title=="Noble") +
I(title=="Mr") +
I(title=="Mr"&Pclass=="3")+SibSp+
indicator1+indicator2+indicator3+indicator4+
indicator5,
data = training,
method = "xgbTree",
trControl = fitControl,preProcess=PP)
# The final values used for the model were nrounds = 50, max_depth = 3 and eta = 0.3.
xgb.v1
## eXtreme Gradient Boosting
##
## 714 samples
## 18 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (20 fold, repeated 5 times)
##
## Summary of sample sizes: 678, 679, 678, 679, 679, 679, ...
##
## Resampling results across tuning parameters:
##
## max_depth nrounds ROC Sens Spec ROC SD
## 1 50 0.8637987 0.9090909 0.7088462 0.07532399
## 1 100 0.8635627 0.9090909 0.7109890 0.07655009
## 1 150 0.8636251 0.9086364 0.7138462 0.07812143
## 2 50 0.8705257 0.9045455 0.7276374 0.07802330
## 2 100 0.8714123 0.9095455 0.7066484 0.07774913
## 2 150 0.8704570 0.9168182 0.6986264 0.07811869
## 3 50 0.8761501 0.9150000 0.7003846 0.07236124
## 3 100 0.8705669 0.9145455 0.7180220 0.07472821
## 3 150 0.8685390 0.9113636 0.7179670 0.07534815
## Sens SD Spec SD
## 0.06060606 0.1419008
## 0.06060606 0.1425109
## 0.06145922 0.1414600
## 0.06572716 0.1296556
## 0.06698366 0.1332367
## 0.06526673 0.1346154
## 0.06545831 0.1331716
## 0.07114908 0.1270460
## 0.06787504 0.1206444
##
## Tuning parameter 'eta' was held constant at a value of 0.3
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 3
## and eta = 0.3.
glm.predictions <- predict(glm.v4,newdata=testing)
ada.predictions <- predict(ada.v1,newdata=testing)
rf.predictions <- predict(rf.v1,newdata=testing)
svm.predictions <- predict(svm.v1,newdata=testing)
gbm.predictions <- predict(gbm.v1,newdata=testing)
nnet.predictions <- predict(nnet.v1,newdata=testing)
xgb.predictions <- predict(xgb.v1,newdata=testing)
#finTesting <- data.frame(testing,GLM=glm.predictions,ADA=ada.predictions,RF=rf.predictions,
# SVM=svm.predictions,GBM=gbm.predictions,NNET=nnet.predictions,XGB=xgb.predictions)
#write.csv(finTesting,"./Titanic/Testing/testing.csv", row.names=FALSE)
confusionMatrix(glm.predictions,testing$Survived) #Accuracy : 0.8079 (no feature engineering 0.8136)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 96 21
## 1 13 47
##
## Accuracy : 0.8079
## 95% CI : (0.7421, 0.8632)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 2.854e-08
##
## Kappa : 0.5849
## Mcnemar's Test P-Value : 0.2299
##
## Sensitivity : 0.8807
## Specificity : 0.6912
## Pos Pred Value : 0.8205
## Neg Pred Value : 0.7833
## Prevalence : 0.6158
## Detection Rate : 0.5424
## Detection Prevalence : 0.6610
## Balanced Accuracy : 0.7860
##
## 'Positive' Class : 0
##
confusionMatrix(ada.predictions,testing$Survived) #Accuracy : 0.7966 (no feature engineering 0.8079)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93 20
## 1 16 48
##
## Accuracy : 0.7966
## 95% CI : (0.7297, 0.8533)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.87e-07
##
## Kappa : 0.5653
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.8532
## Specificity : 0.7059
## Pos Pred Value : 0.8230
## Neg Pred Value : 0.7500
## Prevalence : 0.6158
## Detection Rate : 0.5254
## Detection Prevalence : 0.6384
## Balanced Accuracy : 0.7795
##
## 'Positive' Class : 0
##
confusionMatrix(rf.predictions,testing$Survived) #Accuracy : 0.7684 (no feature engineering 0.8023)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 89 21
## 1 20 47
##
## Accuracy : 0.7684
## 95% CI : (0.6992, 0.8284)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.153e-05
##
## Kappa : 0.5091
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8165
## Specificity : 0.6912
## Pos Pred Value : 0.8091
## Neg Pred Value : 0.7015
## Prevalence : 0.6158
## Detection Rate : 0.5028
## Detection Prevalence : 0.6215
## Balanced Accuracy : 0.7538
##
## 'Positive' Class : 0
##
confusionMatrix(svm.predictions,testing$Survived) #Accuracy : 0.791 (no feature engineering 0.7966)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93 21
## 1 16 47
##
## Accuracy : 0.791
## 95% CI : (0.7236, 0.8483)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 4.548e-07
##
## Kappa : 0.552
## Mcnemar's Test P-Value : 0.5108
##
## Sensitivity : 0.8532
## Specificity : 0.6912
## Pos Pred Value : 0.8158
## Neg Pred Value : 0.7460
## Prevalence : 0.6158
## Detection Rate : 0.5254
## Detection Prevalence : 0.6441
## Balanced Accuracy : 0.7722
##
## 'Positive' Class : 0
##
confusionMatrix(gbm.predictions,testing$Survived) #Accuracy : 0.8192 (no feature engineering 0.7966)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 17
## 1 15 51
##
## Accuracy : 0.8192
## 95% CI : (0.7545, 0.8729)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 3.784e-09
##
## Kappa : 0.6158
## Mcnemar's Test P-Value : 0.8597
##
## Sensitivity : 0.8624
## Specificity : 0.7500
## Pos Pred Value : 0.8468
## Neg Pred Value : 0.7727
## Prevalence : 0.6158
## Detection Rate : 0.5311
## Detection Prevalence : 0.6271
## Balanced Accuracy : 0.8062
##
## 'Positive' Class : 0
##
confusionMatrix(nnet.predictions,testing$Survived) #Accuracy :0.8249 (no feature engineering 0.8192)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 95 17
## 1 14 51
##
## Accuracy : 0.8249
## 95% CI : (0.7607, 0.8778)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.304e-09
##
## Kappa : 0.6268
## Mcnemar's Test P-Value : 0.7194
##
## Sensitivity : 0.8716
## Specificity : 0.7500
## Pos Pred Value : 0.8482
## Neg Pred Value : 0.7846
## Prevalence : 0.6158
## Detection Rate : 0.5367
## Detection Prevalence : 0.6328
## Balanced Accuracy : 0.8108
##
## 'Positive' Class : 0
##
confusionMatrix(xgb.predictions,testing$Survived) #Accuracy : 0.7966 (no feature engineering 0.7853)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 92 19
## 1 17 49
##
## Accuracy : 0.7966
## 95% CI : (0.7297, 0.8533)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.87e-07
##
## Kappa : 0.5678
## Mcnemar's Test P-Value : 0.8676
##
## Sensitivity : 0.8440
## Specificity : 0.7206
## Pos Pred Value : 0.8288
## Neg Pred Value : 0.7424
## Prevalence : 0.6158
## Detection Rate : 0.5198
## Detection Prevalence : 0.6271
## Balanced Accuracy : 0.7823
##
## 'Positive' Class : 0
##
cv.values <- resamples(list(Logit = glm.v4, Ada = ada.v1,
RF = rf.v1, SVM = svm.v1,GBM=gbm.v1,NNET=nnet.v1,XGB=xgb.v1))
dotplot(cv.values, metric = "ROC")
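A tabular view of the same comparison, assuming cv.values as built above:
# mean and quantiles of ROC, sensitivity and specificity across resamples
summary(cv.values)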
pr.df <- data.frame(glm.predictions,ada.predictions,rf.predictions,svm.predictions,gbm.predictions,nnet.predictions,xgb.predictions,Survived=testing$Survived)
comb.mod <- train(Survived~.,method = "ada",
metric = "ROC",
tuneGrid = ada.grid,
trControl = fitControl,preProcess=PP,data=pr.df)
fin <- predict(comb.mod,pr.df)
confusionMatrix(fin,testing$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 98 13
## 1 11 55
##
## Accuracy : 0.8644
## 95% CI : (0.805, 0.9112)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 2.439e-13
##
## Kappa : 0.7118
## Mcnemar's Test P-Value : 0.8383
##
## Sensitivity : 0.8991
## Specificity : 0.8088
## Pos Pred Value : 0.8829
## Neg Pred Value : 0.8333
## Prevalence : 0.6158
## Detection Rate : 0.5537
## Detection Prevalence : 0.6271
## Balanced Accuracy : 0.8540
##
## 'Positive' Class : 0
##
# Combined (stacked) model accuracy on the local testing split: 0.8644
Each of the models described above was used to generate predictions that were then submitted to Kaggle.com. The resulting leaderboard accuracies were:
GLM: 0.78469
ADA: 0.77990
RF: 0.77512
SVM: 0.78469
GBM: 0.77033
NNET: 0.78947
XGB: 0.77033
Combined model: 0.76555
The neural network approach alone showed the best leaderboard score. The combined (stacked) model produced the best result on the local testing split, but was the least accurate in the Kaggle evaluation phase, mainly due to overfitting.
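For reference, a submission file can be produced by applying the same feature engineering to test.csv and writing PassengerId plus the predicted class. A rough sketch, assuming a prepTest() helper (hypothetical, not shown above) that repeats the preprocessing steps on the test data:
# hypothetical sketch: reuse the best single model (nnet.v1) for the Kaggle submission
kaggle <- read.csv("./Titanic/test.csv", header = TRUE, stringsAsFactors = TRUE)
kaggle <- prepTest(kaggle)  # hypothetical helper applying the same feature engineering
submission <- data.frame(PassengerId = kaggle$PassengerId,
                         Survived = as.numeric(as.character(predict(nnet.v1, newdata = kaggle))))
write.csv(submission, "./Titanic/submission.csv", row.names = FALSE)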