The code presented in the blog post http://www.philippeadjiman.com/blog/?p=1095
library(caret)
## Loading required package: cluster
## Loading required package: foreach
## Loading required package: lattice
## Loading required package: plyr
## Loading required package: reshape2
library(pROC)
## Type 'citation("pROC")' for a citation.
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
##     cov, smooth, var
library(Amelia)
## Loading required package: foreign
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
## Amelia II: Multiple Imputation (Version 1.7.2, built: 2013-04-03)
## Copyright (C) 2005-2013 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
#replace the paths below with your own path on disk or in the cloud
#rawdata will be used for both training and the cross-validation test set
rawdata <- read.csv("~/Google Drive/padjiman/data/kaggle/titanic/train.csv", stringsAsFactors = F)
#this data won't be used in model evaluation. It will only be used for the submission.
test <- read.csv("~/Google Drive/padjiman/data/kaggle/titanic/test.csv", stringsAsFactors = F)
#unify the attribute names, which differ between the training and test sets in the original data
names(test) = c("PassengerId", names(rawdata)[2:11])
#plotting the missing-data (missingness) maps
par(mfrow = c(1, 2))
missmap(rawdata, main = "Missingness Map Train")
missmap(test, main = "Missingness Map Test")
par("mfrow" = c(1,1))
################################
###### DATA PREPARATION ######
################################
#functions to add the title feature to the data and to do some conversions to factors
#(a hypothetical sketch of such a function is shown after the two calls below)
source("~/Google Drive/padjiman/data/kaggle/titanic/src/processData.r")
rawdata <- processData(rawdata)
test <- processData(test)
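#NOTE: processData.r is not shown in the post. The function below is only a minimal, hypothetical
#sketch of what it might do, based on the comment above: lower-case the Kaggle column names (the
#rest of the script expects lowercase names such as fare and survived), extract a title feature
#from the passenger name, and convert a few character columns to factors. The regular expression
#and the exact set of conversions are assumptions, not the author's actual code.
processDataSketch <- function(df) {
  names(df) <- tolower(names(df))                   #e.g. "Fare" -> "fare", "Name" -> "name"
  #strip everything before ", " and from "." onwards: "Braund, Mr. Owen Harris" -> "Mr"
  df$title    <- as.factor(gsub("(.*, )|(\\..*)", "", df$name))
  df$sex      <- as.factor(df$sex)
  df$embarked <- as.factor(df$embarked)
  df
}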
#imputing the one missing fare in the test set by a simple mean
meanFare = mean(na.omit(rawdata$fare))
test$fare[is.na(test$fare)] <- meanFare
#for the ROC computation later on, the outcome needs to be a factor with real class labels rather than just 0 or 1
rawdata$survived <- factor(rawdata$survived, labels = c("yes","no"))
#passenger class should be treated as a factor rather than a numeric value
rawdata$pclass <- as.factor(rawdata$pclass)
test$pclass <- as.factor(test$pclass)
#create a training and test set for building and evaluating the model
#we set the seed so that the same (here 80/20) split is created each time
set.seed(4)
#samples <- createDataPartition(rawdata$survived, p = 0.8,list = FALSE)
samples <- sample(nrow(rawdata), nrow(rawdata)*0.8)
data.train <- rawdata[samples, ]
data.test <- rawdata[-samples, ]
#plotting the age distribution per title
boxplot(data.train$age ~ data.train$title, col = "blue", varwidth = TRUE)
################################
###### MODEL BUILDING ######
################################
#training a random forest (the default algorithm in caret)
#WARNING: the accuracy measures printed below are rounded for some reason and always show 0.8 no
#matter what; I couldn't find the cause. The real, unrounded accuracy shows up fine when the code
#is run in a regular R console (see also the note after the model output below)
model1 <- train(survived ~ pclass + sex + title + sibsp + parch, data = data.train, importance = TRUE)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
## Loading required package: class
## Warning: executing %dopar% sequentially: no parallel backend registered
model1
## 712 samples
## 13 predictors
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Bootstrap (25 reps)
##
## Summary of sample sizes: 712, 712, 712, 712, 712, 712, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.8 0.6 0.02 0.04
## 5 0.8 0.6 0.02 0.06
## 9 0.8 0.5 0.02 0.05
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
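#NOTE: regarding the rounding warning above, one way to see the unrounded resampling estimates is
#to inspect the results data frame stored on the train object (standard caret behaviour; this
#snippet is shown only as an illustration and is not part of the original post)
model1$results                                                 #unrounded Accuracy/Kappa per mtry
model1$results[model1$results$mtry == model1$bestTune$mtry, ]  #row for the selected model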
vaImp.model1 <- varImp(model1, verbose = FALSE)
vaImp.model1
## rf variable importance
##
## Importance
## titleMr 100.0
## pclass3 98.3
## sexmale 88.2
## sibsp 43.3
## titleMrs 29.7
## titleRev 29.0
## titleMiss 27.3
## parch 11.9
## pclass2 0.0
#training a gbm; it won't be used later but is shown here just for the sake of the example
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
## repeated ten times
repeats = 10)
model2 <- train(survived ~ pclass + sex + sibsp + parch,
data = data.train, distribution = "gaussian", method = "gbm", trControl = fitControl, verbose = FALSE)
## Loading required package: survival
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1
model2
## 712 samples
## 13 predictors
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Cross-Validation (10 fold, repeated 10 times)
##
## Summary of sample sizes: 642, 640, 642, 641, 640, 640, ...
##
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa Accuracy SD Kappa SD
## 1 50 0.8 0.5 0.05 0.1
## 1 100 0.8 0.5 0.05 0.1
## 1 200 0.8 0.5 0.05 0.1
## 2 50 0.8 0.5 0.04 0.1
## 2 100 0.8 0.5 0.04 0.1
## 2 200 0.8 0.5 0.04 0.1
## 3 50 0.8 0.5 0.04 0.1
## 3 100 0.8 0.5 0.04 0.1
## 3 200 0.8 0.5 0.05 0.1
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were interaction.depth = 1, n.trees
## = 100 and shrinkage = 0.1.
vaImp.model2 <- varImp(model2)
vaImp.model2
## gbm variable importance
##
## Overall
## sexmale 100.00
## pclass3 24.48
## sibsp 4.32
## parch 3.11
## pclass2 0.00
################################
###### MODEL ROC EVAL ######
################################
#code inspired by http://mkseo.pe.kr/stats/?p=790
result.predicted.prob.model1 <- predict(model1, data.test, type="prob")
result.roc.model1 <- roc(data.test$survived, result.predicted.prob.model1$yes)
plot(result.roc.model1, print.thres="best", print.thres.best.method="closest.topleft")
##
## Call:
## roc.default(response = data.test$survived, predictor = result.predicted.prob.model1$yes)
##
## Data: result.predicted.prob.model1$yes in 102 controls (data.test$survived yes) > 77 cases (data.test$survived no).
## Area under the curve: 0.862
result.coords.model1 <- coords( result.roc.model1, "best", best.method="closest.topleft",
ret=c("threshold", "accuracy"))
result.coords.model1
## threshold accuracy
## 0.5040 0.8603
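#NOTE: as an aside (not in the original post), coords() can also return the sensitivity and
#specificity obtained at the same "best" threshold, which complements the accuracy reported above;
#illustration only, using the pROC package already loaded here
coords(result.roc.model1, "best", best.method = "closest.topleft",
       ret = c("threshold", "accuracy", "sensitivity", "specificity"))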
################################
###### OBSERVING ERRORS #####
################################
result.predicted.train <- predict(model1, data.train, type="prob")
factorPredictions.train <- factor(ifelse(result.predicted.train[, 1] > result.coords.model1[1], 0, 1), labels = c("yes", "no"))
xtabs(~ factorPredictions.train + data.train$survived)
## data.train$survived
## factorPredictions.train yes no
## yes 396 75
## no 51 190
wrongClassification <- data.train[factorPredictions.train != data.train$survived, ]
#wrongClassification
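#NOTE: as an aside (not in the original post), caret's confusionMatrix() gives the same cross
#tabulation as xtabs() above plus accuracy, sensitivity and specificity in one call; shown here
#only as an illustration
confusionMatrix(factorPredictions.train, data.train$survived)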
################################
######### SUBMISSION #######
################################
##here using model 1
result.predicted.final <- predict(model1, test, type="prob")
numericPredictions <- ifelse(result.predicted.final[, 1] > result.coords.model1[1], 0, 1)
submission = data.frame(PassengerId = test[, 1], Survived = numericPredictions)
#this will generate quotes in the headers; make sure to remove them if you want to submit the file, otherwise the submission parser will break
#this output gives a score of 0.79426 on the leaderboard (corresponding to about the top 5% best submissions)
write.table(submission, file = "~/Google Drive/padjiman/data/kaggle/titanic/finalOutput.csv", sep = ",", row.names = FALSE, col.names = TRUE)
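#NOTE: as an alternative (not in the original post), passing quote = FALSE to write.table should
#avoid the quoted headers altogether, so no manual clean-up of the file would be needed:
#write.table(submission, file = "~/Google Drive/padjiman/data/kaggle/titanic/finalOutput.csv",
#            sep = ",", row.names = FALSE, col.names = TRUE, quote = FALSE)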