Deloitte Tackles Titanic: MAPLE Solution

Team Name: MAPLE
Date: Friday, August 20, 2014
Author: Hardeep Chagger [hchagger@deloitte.ca] & Jovan Sardinha [jsardinha@deloitte.ca]

Creative Commons License
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License


Architecture and Session Information

Session Info

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: i386-w64-mingw32/i386 (32-bit)
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
## [1] digest_0.6.4     evaluate_0.5.5   formatR_0.10     htmltools_0.2.4 
## [5] knitr_1.6        rmarkdown_0.2.49 stringr_0.6.2    tools_3.1.1     
## [9] yaml_2.1.13

Architecture Used

Amazon AWS

Snapshot of the instance used: AWS Instance

Libraries utilized

library(RCurl)
library(knitr)
library(caret)
library(ggplot2)
library(corrplot)
library(kernlab)
library(randomForest)
library(doParallel)
library(gbm)
library(survival)
library(splines)
library(xtable)
library(stargazer)
library(stringr)
library(reshape2)
library(plyr)
library(adabag)
library(rpart)

Loading the Data

# Downloading Training Data
if(!file.exists("./data")){dir.create("./data")}

#trainDataURL <- "https://inclass.kaggle.com/c/deloitte-tackles-titanic/download/train.csv"

trainDataURL <- "https://inclass.kaggle.com/c/deloitte-tackles-titanic/download/train2.csv"
trainFile <- paste("./data/", basename(trainDataURL), sep = "")
#download.file(trainDataURL, trainFile, method = "curl")

# Downloading Testing Data
if(!file.exists("./data")){dir.create("./data")}

testDataURL <- "https://inclass.kaggle.com/c/deloitte-tackles-titanic/download/test.csv"
testFile <- paste("./data/", basename(testDataURL), sep = "")

# Need username and password; do not use thi link
#download.file(testDataURL, testFile, method = "curl")

Data Pre-Processing

trainData$Type <- "Training"
testData$Type <- "Testing"

# $Survived initialization
testData$Survived <- NA

combi <- rbind(trainData, testData)

Exploratory Data Analysis and Feature Engineering

Initial exploratory data analysis was done independent of this script and was mainly conducted with Rattle and Microsoft Excel. Image below showcases the Rattle GUI Rattle Session

#Factor conversion
combi$Survived <- as.factor(combi$Survived)
combi$Pclass <- as.factor(combi$Pclass)

Title Analysis

combi$Name <- as.character(combi$Name)
combi$Title <-  sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][2]})

#Cleaning white spaces
combi$Title <- sapply(combi$Title, FUN = function(x){
     final <- "_"
     if(str_sub(x,end = -nchar(x))== " "){
          final <- str_sub(x,start = -nchar(x)+1)
     }
     
     final
})



table(combi$Title)
## 
##         Capt          Col          Don         Dona           Dr 
##            1            6            1            1            9 
##     Jonkheer         Lady        Major       Master         Miss 
##            1            1            2           75          324 
##         Mlle          Mme           Mr          Mrs           Ms 
##            2            1          949          250            3 
##          Rev          Sir the Countess 
##           10            1            1
table(combi$Title, combi$Sex)
##               
##                female male
##   Capt              0    1
##   Col               0    6
##   Don               0    1
##   Dona              1    0
##   Dr                1    8
##   Jonkheer          0    1
##   Lady              1    0
##   Major             0    2
##   Master            0   75
##   Miss            324    0
##   Mlle              2    0
##   Mme               1    0
##   Mr                0  949
##   Mrs             250    0
##   Ms                3    0
##   Rev               0   10
##   Sir               0    1
##   the Countess      1    0
titleCleaner <- function(title, sex){
     finalTitle = title

     menTitles <- c("Capt", "Col", "Don", "Jonkheer", "Major", "Rev", "Sir")
     womenTitles <- c("Dona", "Lady", "Mme", "the Countess")
     girlTitles <- c("Mlle", "Mme")
     
     if(title %in% menTitles ){
          finalTitle = "Mr"
     }

     if(title %in% womenTitles){
          finalTitle = "Mrs"
     }
     
     if(title %in% girlTitles){
          finalTitle = "Ms"
     }
     
     if(title == "Dr"){
          if(sex == "male"){
               finalTitle = "Mr"
          }else{
               finalTitle = "Mrs"
          }
     }
          
     
     finalTitle
}

combi$TitleCombined <- mapply(titleCleaner, combi$Title, combi$Sex)

#table(combi$Title, combi$TitleCombined)


combi$TitleCombined <- as.factor(combi$TitleCombined)

Family Size Analysis

combi$FamilySize <- combi$SibSp + combi$Parch + 1

combi$Surname <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][1]})  
combi$FamilyID <- paste(as.character(combi$FamilySize), combi$Surname, sep="")
#combi$FamilyID[combi$FamilySize <= 2] <- 'Small'
combi$FamilyID <- as.factor(combi$FamilyID)

combi$SmallFamily <- ifelse(combi$FamilySize <=2, 1, 0)

Parent Analysis

#isParent
isParent <- function(FamilySize, Parch, SibSp, TitleCombined){
     isParent <- "No"
     if(FamilySize > 1){
          if(Parch > 1){
               #if(SibSp > 1){
                    if(TitleCombined == "Mr"){
                         isParent <- "Father"
                    }
                    if(TitleCombined == "Mrs"){
                         isParent <- "Mother"
                    }
               #}
          }
     }
     isParent
}

combi$isParent <- mapply(isParent, FamilySize = combi$FamilySize, Parch = combi$Parch, SibSp = combi$SibSp, TitleCombined = combi$TitleCombined)
combi$isParent <- as.factor(combi$isParent)

Cabin Analysis

combi$Cabin <- as.character(combi$Cabin)
combi$CabinLetter <- colsplit(combi$Cabin,"",names = c("a", "b", "c"))[,2]
combi[which(combi$CabinLetter==""), grep("CabinLetter", colnames(combi))] <- "Unknown"

combi$CabinLetter <- as.factor(combi$CabinLetter)

Number of Cabin Analysis

combi$NumberOfCabins <- lapply(strsplit(combi$Cabin, split='[ ]'),length)
combi$NumberOfCabins <- as.numeric(combi$NumberOfCabins)
combi <- combi[,-which( colnames(combi)=="Cabin" )]

Fare Analysis

combi$FarePerPerson <- mapply(x=combi$Fare, y=combi$FamilySize, FUN = function(x,y){
     if(is.na(x)){x<-0};
     if(is.na(y)){y<-1};
     
     x/y
     
})


combi$TotalFare <- sapply(combi$Fare, FUN = function(x){
     if(is.na(x)){x<-0};
     x
})


combi$FareDifference <- combi$TotalFare - combi$FarePerPerson
combi$FareDifference_scaled <- scale(combi$FareDifference)
combi$FareDifference_scaled_centered <- scale(combi$FareDifference, center = TRUE, 
                             scale = max(combi$FareDifference, na.rm = TRUE)/100) 

combi$Dependency <- as.factor(ifelse(combi$FareDifference==0, 1,0))

Fare Bin

some of of the analysis was influenced by: http://www.statisticalconsultants.co.nz/blog/B48.html

fareBreaks <- c(-1,7.75, 8, 13, 14.50, 15.10, 26,30.65,60,Inf) 
combi$FareBin <- cut(combi$FarePerPerson,fareBreaks)

Age Analysis

#combi$Age <- round(combi$Age,0)
AgeModel <- lm(Age ~ TitleCombined+Sex, data = combi[-which(is.na(combi$Age)),])
combi$Age2 <- predict(AgeModel, combi)
## Warning: prediction from a rank-deficient fit may be misleading
combi$Age2 <- ifelse(is.na(combi$Age), combi$Age2, combi$Age )
ageBreaks <- c(0,5,10,15,20,30,40,50,60,70,80,90,Inf)
#ageBreaks <- c(0,5,10,15,20,30,40,Inf)
combi$AgeBin <- cut(combi$Age2,ageBreaks)

Title Analysis

isDeadlyMr <- function(TitleCombined, Parch, FareDifference){
     DeadlyMr <- 0
     if(TitleCombined=="Mr"){
          if(Parch <=1){
               if(FareDifference <=0){
                    DeadlyMr <- 1
               }
          }
     }
     DeadlyMr    
}

#combi$MrAnalysis <- mapply(isDeadlyMr, TitleCombined=combi$TitleCombined, Parch=combi$Parch, FareDifference=combi$FareDifference)
#combi$MrAnalysis <- as.factor(combi$MrAnalysis)
write.table(combi, file = "./postAnalysis/combi.csv", sep=",",row.names = FALSE)

Pre-processing

General pre-processing

#Removing Blank Embarked data
combi <- combi[-which(combi$Embarked == ""),]

#Dropping the column
combi <- combi[,-which( colnames(combi)=="Title" )]
combi <- combi[,-which( colnames(combi)=="Name" )]
combi <- combi[,-which( colnames(combi)=="Ticket" )]


combi <- combi[,-which( colnames(combi)=="Fare" )] #Removed as there was 1 missing value
#combi <- combi[,-which( colnames(combi)=="FarePerPerson" )]

#combi <- combi[,-which( colnames(combi)=="SibSp" )]
#combi <- combi[,-which( colnames(combi)=="Parch" )]
combi <- combi[,-which( colnames(combi)=="FamilyID" )]
combi <- combi[,-which( colnames(combi)=="Surname" )]

combi <- combi[,-which( colnames(combi)=="Age")]
combi <- combi[,-which( colnames(combi)=="Age2")]

Splitting Test and Train Data

testingData <- combi[which(combi$Type=="Testing"),]
trainingData <- combi[which(combi$Type=="Training"),]

#Removing Type column
testingData <- testingData[,-which( colnames(combi)=="Type" )]
trainingData <- trainingData[,-which( colnames(combi)=="Type" )]

Breaking to Cross Validation

set.seed(112)
inTest <- createDataPartition(y=trainingData$Survived, p=0.8, list=FALSE)
training <- trainingData[inTest,] 
crossVal <- trainingData[-inTest,]
rm(trainData)
rm(testData)
rm(trainingData)
#rm(combi)

Model Building

#rm(trainData)

Our ah-ha moment! stratified model building with replacement

cl <- makeCluster(detectCores())
registerDoParallel(cl)

# Random Forest
set.seed(112)

training_Mr <- training[which(training$TitleCombined == "Mr"),]
training_P3 <- training[which(training$Pclass == 3),]
training_P1 <- training[which(training$Pclass == 1),]
training_FareDiff <- training[which(training$FareDifference >= 14.5),]
training_FarePerson <- training[which(training$FarePerPerson <= 20),]


modelFit_rf <- train(Survived ~., data=training[,-1], method="rf", prox=TRUE, trControl = trainControl(method="cv"),number=5)

modelFit_rf_Mr <- train(Survived ~., data=training_Mr[,-1], method="rf",prox=TRUE, trControl = trainControl(method="cv"),number=5)

modelFit_rf_P3 <- train(Survived ~., data=training_P3[,-1], method="rf",prox=TRUE, trControl = trainControl(method="cv"),number=5)



modelFit_rf_FareDiff <- train(Survived ~., data=training_FareDiff[,-1],prox=TRUE, method="rf", trControl = trainControl(method="cv"),number=5)


modelFit_rf_P1 <- train(Survived ~., data=training_P1[,-1], method="rf",prox=TRUE, trControl = trainControl(method="cv"),number=5)


modelFit_rf_FarePerson <- train(Survived ~., data=training_FarePerson[,-1], method="rf", trControl = trainControl(method="cv"),number=5)

Predicting the results

testingData$Survived<- predict(modelFit_rf, newdata=testingData[,-1])

testingData[which(testingData$TitleCombined == "Mr"),2] <- predict(modelFit_rf_Mr, newdata=testingData[which(testingData$TitleCombined == "Mr"),-1])


testingData[which(testingData$Pclass == 3),2] <- predict(modelFit_rf_P3, newdata=testingData[which(testingData$Pclass == 3),-1])


testingData[which(testingData$FareDifference >= 14.5),2] <- predict(modelFit_rf_FareDiff, newdata=testingData[which(testingData$FareDifference >= 14.5),-1])


testingData[which(testingData$Pclass == 1),2] <- predict(modelFit_rf_P1, newdata=testingData[which(testingData$Pclass == 1),-1])##Put at end

testingData[which(testingData$FarePerPerson <= 20),2] <- predict(modelFit_rf_FarePerson, newdata=testingData[which(testingData$FarePerPerson <= 20),-1])

Identification Tag for debugging

testingData$Tag <- ""
Tag <- grep("Tag", colnames(testingData))
##Identification
testingData[which(testingData$TitleCombined == "Mr"), Tag] <-  "Mr"
testingData[which(testingData$Pclass == 3), Tag] <-  "P3"
testingData[which(testingData$Pclass == 1), Tag] <-  "P1"
testingData[which(testingData$FareDifference >= 14.5), Tag] <-  "FareDiff"
testingData[which(testingData$FarePerPerson <= 20), Tag] <-  "FarePerson"

Submission Preparation

finalResults <- testingData[,1:2]
write.table(finalResults, file = "./results/finalResults.csv", sep = ",", row.names = FALSE)

Post Analysis

plot(varImp(modelFit_rf, scale = TRUE))

plot of chunk unnamed-chunk-26

plot(varImp(modelFit_rf_Mr, scale = TRUE))

plot of chunk unnamed-chunk-26

plot(varImp(modelFit_rf_P3, scale = TRUE))

plot of chunk unnamed-chunk-26

plot(varImp(modelFit_rf_FareDiff, scale = TRUE))

plot of chunk unnamed-chunk-26

plot(varImp(modelFit_rf_P1, scale = TRUE))

plot of chunk unnamed-chunk-26

Writing out the results for analysis

crossVal$Predicted2 <- predict(modelFit_rf, crossVal)
write.table(crossVal, file = "./postAnalysis/crossVal.csv", sep=",",row.names = FALSE)
write.table(testingData, file = "./postAnalysis/testing.csv", sep=",",row.names = FALSE)

Total time taken to run this script

##    user  system elapsed 
##   11.25    1.47   14.07