The code presented in the blog post http://www.philippeadjiman.com/blog/?p=1095
library(caret)
## Loading required package: cluster
## Loading required package: foreach
## Loading required package: lattice
## Loading required package: plyr
## Loading required package: reshape2
library(pROC)
## Type 'citation("pROC")' for a citation.
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
##     cov, smooth, var
library(Amelia)
## Loading required package: foreign
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
## Amelia II: Multiple Imputation (Version 1.7.2, built: 2013-04-03)
## Copyright (C) 2005-2013 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
#replace the paths below with your own path on disk or in the cloud
#rawdata will be used for both training and the cross-validation test set
rawdata <- read.csv("~/Google Drive/padjiman/data/kaggle/titanic/train.csv", stringsAsFactors = F)
#this data won't be used in model evaluation. It will only be used for the submission.
test <- read.csv("~/Google Drive/padjiman/data/kaggle/titanic/test.csv", stringsAsFactors = F)
#unify the attribute names, which differ between the training and test sets in the original data
names(test) = c("PassengerId", names(rawdata)[2:11])
#plotting the missing-data (missingness) maps
par(mfrow = c(1, 2))
missmap(rawdata, main = "Missingness Map Train")
missmap(test, main = "Missingness Map Test")
par("mfrow" = c(1,1))
################################
###### DATA PREPARATION ######
################################
#functions to add the title feature to the data and to do some conversions to factors
#(a hypothetical sketch of such a function is shown after the two calls below)
source("~/Google Drive/padjiman/data/kaggle/titanic/src/processData.r")
rawdata <- processData(rawdata)
test <- processData(test)
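#NOTE: processData.r is not shown in the post. The function below is only a minimal, hypothetical
#sketch of what it might do, based on the comment above: lower-case the Kaggle column names (the
#rest of the script expects lowercase names such as fare and survived), extract a title feature
#from the passenger name, and convert a few character columns to factors. The regular expression
#and the exact set of conversions are assumptions, not the author's actual code.
processDataSketch <- function(df) {
  names(df) <- tolower(names(df))                   #e.g. "Fare" -> "fare", "Name" -> "name"
  #strip everything before ", " and from "." onwards: "Braund, Mr. Owen Harris" -> "Mr"
  df$title    <- as.factor(gsub("(.*, )|(\\..*)", "", df$name))
  df$sex      <- as.factor(df$sex)
  df$embarked <- as.factor(df$embarked)
  df
}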
#imputing the one missing fare in the test set by a simple mean
meanFare = mean(na.omit(rawdata$fare))
test$fare[is.na(test$fare)] <- meanFare
#for the ROC computation later on, the outcome needs to be a factor with real class labels rather than just 0 or 1
rawdata$survived <- factor(rawdata$survived, labels = c("yes","no"))
#passenger class should be treated as a factor rather than a numeric value
rawdata$pclass <- as.factor(rawdata$pclass)
test$pclass <- as.factor(test$pclass)
#create a training and test set for building and evaluating the model
#we set the seed so that the same (here 80/20) split is created each time
set.seed(4)
#samples <- createDataPartition(rawdata$survived, p = 0.8,list = FALSE)
samples <- sample(nrow(rawdata), nrow(rawdata)*0.8)
data.train <- rawdata[samples, ]
data.test <- rawdata[-samples, ]
#plotting the age distribution per title
boxplot(data.train$age ~ data.train$title, col = "blue", varwidth = TRUE)
################################
###### MODEL BUILDING ######
################################
#training a random forest (the default algorithm in caret)
#WARNING: the accuracy measures printed below are rounded for some reason and always show 0.8 no
#matter what; I couldn't find the cause. The real, unrounded accuracy shows up fine when the code
#is run in a regular R console (see also the note after the model output below)
model1 <- train(survived ~ pclass + sex + title + sibsp + parch, data = data.train, importance = TRUE)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
## Loading required package: class
## Warning: executing %dopar% sequentially: no parallel backend registered
model1
## 712 samples
## 13 predictors
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Bootstrap (25 reps)
##
## Summary of sample sizes: 712, 712, 712, 712, 712, 712, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.8 0.6 0.02 0.04
## 5 0.8 0.6 0.02 0.06
## 9 0.8 0.5 0.02 0.05
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
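#NOTE: regarding the rounding warning above, one way to see the unrounded resampling estimates is
#to inspect the results data frame stored on the train object (standard caret behaviour; this
#snippet is shown only as an illustration and is not part of the original post)
model1$results                                                 #unrounded Accuracy/Kappa per mtry
model1$results[model1$results$mtry == model1$bestTune$mtry, ]  #row for the selected model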
vaImp.model1 <- varImp(model1, verbose = FALSE)
vaImp.model1
## rf variable importance
##
## Importance
## titleMr 100.0
## pclass3 98.3
## sexmale 88.2
## sibsp 43.3
## titleMrs 29.7
## titleRev 29.0
## titleMiss 27.3
## parch 11.9
## pclass2 0.0
#training a gbm; it won't be used later but is shown here just for the sake of the example
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
## repeated ten times
repeats = 10)
model2 <- train(survived ~ pclass + sex + sibsp + parch,
data = data.train, distribution = "gaussian", method = "gbm", trControl = fitControl, verbose = FALSE)
## Loading required package: survival
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1
model2
## 712 samples
## 13 predictors
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Cross-Validation (10 fold, repeated 10 times)
##
## Summary of sample sizes: 642, 640, 642, 641, 640, 640, ...
##
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa Accuracy SD Kappa SD
## 1 50 0.8 0.5 0.05 0.1
## 1 100 0.8 0.5 0.05 0.1
## 1 200 0.8 0.5 0.05 0.1
## 2 50 0.8 0.5 0.04 0.1
## 2 100 0.8 0.5 0.04 0.1
## 2 200 0.8 0.5 0.04 0.1
## 3 50 0.8 0.5 0.04 0.1
## 3 100 0.8 0.5 0.04 0.1
## 3 200 0.8 0.5 0.05 0.1
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were interaction.depth = 1, n.trees
## = 100 and shrinkage = 0.1.
vaImp.model2 <- varImp(model2)
vaImp.model2
## gbm variable importance
##
## Overall
## sexmale 100.00
## pclass3 24.48
## sibsp 4.32
## parch 3.11
## pclass2 0.00
################################
###### MODEL ROC EVAL ######
################################
#code inspired by http://mkseo.pe.kr/stats/?p=790
result.predicted.prob.model1 <- predict(model1, data.test, type="prob")
result.roc.model1 <- roc(data.test$survived, result.predicted.prob.model1$yes)
plot(result.roc.model1, print.thres="best", print.thres.best.method="closest.topleft")
##
## Call:
## roc.default(response = data.test$survived, predictor = result.predicted.prob.model1$yes)
##
## Data: result.predicted.prob.model1$yes in 102 controls (data.test$survived yes) > 77 cases (data.test$survived no).
## Area under the curve: 0.862
result.coords.model1 <- coords( result.roc.model1, "best", best.method="closest.topleft",
ret=c("threshold", "accuracy"))
result.coords.model1
## threshold accuracy
## 0.5040 0.8603
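#NOTE: as an aside (not in the original post), coords() can also return the sensitivity and
#specificity obtained at the same "best" threshold, which complements the accuracy reported above;
#illustration only, using the pROC package already loaded here
coords(result.roc.model1, "best", best.method = "closest.topleft",
       ret = c("threshold", "accuracy", "sensitivity", "specificity"))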
################################
###### OBSERVING ERRORS #####
################################
result.predicted.train <- predict(model1, data.train, type="prob")
factorPredictions.train <- factor(ifelse(result.predicted.train[, 1] > result.coords.model1[1], 0, 1), labels = c("yes", "no"))
xtabs(~ factorPredictions.train + data.train$survived)
## data.train$survived
## factorPredictions.train yes no
## yes 396 75
## no 51 190
wrongClassification <- data.train[factorPredictions.train != data.train$survived, ]
#wrongClassification
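#NOTE: as an aside (not in the original post), caret's confusionMatrix() gives the same cross
#tabulation as xtabs() above plus accuracy, sensitivity and specificity in one call; shown here
#only as an illustration
confusionMatrix(factorPredictions.train, data.train$survived)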
################################
######### SUBMISSION #######
################################
##here using model 1
result.predicted.final <- predict(model1, test, type="prob")
numericPredictions <- ifelse(result.predicted.final[, 1] > result.coords.model1[1], 0, 1)
submission = data.frame(PassengerId = test[, 1], Survived = numericPredictions)
#this will generate quotes in the headers; make sure to remove them if you want to submit the file, otherwise the submission parser will break
#this output gives a score of 0.79426 on the leaderboard (corresponding to about the top 5% best submissions)
write.table(submission, file = "~/Google Drive/padjiman/data/kaggle/titanic/finalOutput.csv", sep = ",", row.names = FALSE, col.names = TRUE)
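#NOTE: as an alternative (not in the original post), passing quote = FALSE to write.table should
#avoid the quoted headers altogether, so no manual clean-up of the file would be needed:
#write.table(submission, file = "~/Google Drive/padjiman/data/kaggle/titanic/finalOutput.csv",
#            sep = ",", row.names = FALSE, col.names = TRUE, quote = FALSE)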