Let’s clear the global environment before moving further

rm(list = ls())  # remove all objects from the workspace
cat("\014")      # clear the console (RStudio)

Let’s load the dataset, which comes from the Kaggle Titanic competition: https://www.kaggle.com/c/titanic

train_data <- read.csv("~/Downloads/TitanicTrain.csv")
test_data <- read.csv("~/Downloads/TitanicTest.csv")

Let’s take a look at the dataset

str(train_data)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

Let’s do some feature extraction. The first feature is a factor variable indicating whether the passenger is a child (under 18)

train_data$child <- as.factor(ifelse(train_data$Age < 18, 1, 0))
table(train_data$child)
## 
##   0   1 
## 601 113
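
Note that the counts sum to 714, not 891: Age is missing for 177 passengers, so child is NA for those rows and table() silently drops them. A quick check:

sum(is.na(train_data$Age))                # 177 passengers have no recorded age
table(train_data$child, useNA = "ifany")  # show the NA bucket as well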

The second feature categorizes the fare paid into price levels

train_data$price_paid <- cut(train_data$Fare,
                             breaks = c(-0.1, 0.1, 10, 20, 30, 1000),
                             labels = c("Free", "Cheap", "Medium", "High", "Expensive"))
table(train_data$price_paid)
## 
##      Free     Cheap    Medium      High Expensive 
##        15       321       179       142       234
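
As a sanity check on the bucket boundaries: cut() builds right-closed intervals by default, so a fare of exactly 10 lands in "Cheap" (the interval (0.1, 10]) and a fare of 0 lands in "Free". The fares below are illustrative values only:

cut(c(0, 7.25, 10, 10.5, 30, 512),
    breaks = c(-0.1, 0.1, 10, 20, 30, 1000),
    labels = c("Free", "Cheap", "Medium", "High", "Expensive"))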

Let’s take a closer look at the data

What does the distribution of survival look like with respect to gender?

table(train_data$Sex, train_data$Survived)
##         
##            0   1
##   female  81 233
##   male   468 109
prop.table(table(train_data$Sex, train_data$Survived))
##         
##                   0          1
##   female 0.09090909 0.26150393
##   male   0.52525253 0.12233446
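
Note that without a margin argument, prop.table() divides each cell by the grand total, so all four proportions above sum to 1. Passing margin = 1 normalizes within each row instead, giving per-sex survival rates (the same idiom used for class below):

prop.table(table(train_data$Sex, train_data$Survived), 1)  # row-wise proportions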

What does the distribution of survival look like with respect to class of travel?

table(train_data$Pclass, train_data$Survived)
##    
##       0   1
##   1  80 136
##   2  97  87
##   3 372 119
prop.table(table(train_data$Pclass, train_data$Survived),1)
##    
##             0         1
##   1 0.3703704 0.6296296
##   2 0.5271739 0.4728261
##   3 0.7576375 0.2423625

Now let’s look at how multiple variables impact our target variable ‘Survived’

An R formula has the form: dependent variable ~ independent variables (joined by + signs). The dependent variable is what you want to measure or predict; the independent variables are the features you use to measure/predict it
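
For example:

f <- Survived ~ Sex + Pclass  # predict Survived from Sex and Pclass
class(f)                      # "formula"
all.vars(f)                   # "Survived" "Sex" "Pclass"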

aggregate(Survived ~ child + Sex, data = train_data, FUN = sum)
##   child    Sex Survived
## 1     0 female      159
## 2     1 female       38
## 3     0   male       70
## 4     1   male       23
aggregate(Survived ~ child + Sex, data = train_data, FUN = function(x) { sum(x) / length(x) })
##   child    Sex  Survived
## 1     0 female 0.7718447
## 2     1 female 0.6909091
## 3     0   male 0.1772152
## 4     1   male 0.3965517
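
Since Survived is coded 0/1, the anonymous function above is just the mean, so FUN = mean would give the same survival rates:

aggregate(Survived ~ child + Sex, data = train_data, FUN = mean)
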
aggregate(Survived ~ price_paid + Pclass, data = train_data, FUN = function(x) { sum(x) / length(x) })
##    price_paid Pclass  Survived
## 1        Free      1 0.0000000
## 2       Cheap      1 0.0000000
## 3        High      1 0.5121951
## 4   Expensive      1 0.6804734
## 5        Free      2 0.0000000
## 6      Medium      2 0.4285714
## 7        High      2 0.5714286
## 8   Expensive      2 0.5416667
## 9        Free      3 0.2500000
## 10      Cheap      3 0.2062500
## 11     Medium      3 0.4197531
## 12       High      3 0.2222222
## 13  Expensive      3 0.1951220
res <- aggregate(Survived ~ Pclass + price_paid + Sex, data = train_data,
                 FUN = function(x) { sum(x) / length(x) })
res[order(res$Pclass), ]
##    Pclass price_paid    Sex  Survived
## 4       1       High female 0.8571429
## 7       1  Expensive female 0.9770115
## 10      1       Free   male 0.0000000
## 13      1      Cheap   male 0.0000000
## 17      1       High   male 0.4411765
## 20      1  Expensive   male 0.3658537
## 2       2     Medium female 0.9142857
## 5       2       High female 0.9032258
## 8       2  Expensive female 1.0000000
## 11      2       Free   male 0.0000000
## 15      2     Medium   male 0.1587302
## 18      2       High   male 0.1600000
## 21      2  Expensive   male 0.2142857
## 1       3      Cheap female 0.5937500
## 3       3     Medium female 0.5813953
## 6       3       High female 0.3333333
## 9       3  Expensive female 0.1250000
## 12      3       Free   male 0.2500000
## 14      3      Cheap   male 0.1093750
## 16      3     Medium   male 0.2368421
## 19      3       High   male 0.1250000
## 22      3  Expensive   male 0.2400000

Let’s look at Decision Trees

library('rpart')
library('rpart.plot')
?rpart
?rpart.control

Building a decision tree

decision_tree <- rpart(Survived ~ Pclass + price_paid + Sex + Age,
                       data = train_data, method = "class")
decision_tree
## n= 891 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 891 342 0 (0.61616162 0.38383838)  
##    2) Sex=male 577 109 0 (0.81109185 0.18890815)  
##      4) Age>=6.5 553  93 0 (0.83182640 0.16817360) *
##      5) Age< 6.5 24   8 1 (0.33333333 0.66666667) *
##    3) Sex=female 314  81 1 (0.25796178 0.74203822)  
##      6) Pclass>=2.5 144  72 0 (0.50000000 0.50000000)  
##       12) price_paid=High,Expensive 37   9 0 (0.75675676 0.24324324) *
##       13) price_paid=Cheap,Medium 107  44 1 (0.41121495 0.58878505)  
##         26) Age>=16.5 87  40 1 (0.45977011 0.54022989)  
##           52) Age>=27.5 19   6 0 (0.68421053 0.31578947) *
##           53) Age< 27.5 68  27 1 (0.39705882 0.60294118) *
##         27) Age< 16.5 20   4 1 (0.20000000 0.80000000) *
##      7) Pclass< 2.5 170   9 1 (0.05294118 0.94705882) *
rpart.plot(decision_tree)

Looking at the figure we can say that:

- The most important feature for the prediction (the evidence that would have the greatest impact on our beliefs) is sex.
- For males, age comes next; price and class have little impact.
- For females, it is class and price_paid that matter most, and only then age.
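
As a rough sanity check (an optimistic one, since the tree was fit on this very data), we can tabulate the tree’s training predictions against the actual outcomes:

train_pred <- predict(decision_tree, train_data, type = "class")
table(predicted = train_pred, actual = train_data$Survived)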

Now let’s see how our model performs with the test data

Before going there, we need to perform the same feature extraction on the test data

test_data$price_paid <- cut(test_data$Fare,
                            breaks = c(-0.1, 0.1, 10, 20, 30, 1000),
                            labels = c("Free", "Cheap", "Medium", "High", "Expensive"))
head(predict(decision_tree, test_data))
##           0         1
## 1 0.8318264 0.1681736
## 2 0.6842105 0.3157895
## 3 0.8318264 0.1681736
## 4 0.8318264 0.1681736
## 5 0.3970588 0.6029412
## 6 0.8318264 0.1681736
test_data$survival_odds <- predict(decision_tree, test_data)[, 2]  # column 2 = predicted probability of survival
head(test_data)
##   PassengerId Pclass                                         Name    Sex  Age
## 1         892      3                             Kelly, Mr. James   male 34.5
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3         894      2                    Myles, Mr. Thomas Francis   male 62.0
## 4         895      3                             Wirz, Mr. Albert   male 27.0
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6         897      3                   Svensson, Mr. Johan Cervin   male 14.0
##   SibSp Parch  Ticket    Fare Cabin Embarked price_paid survival_odds
## 1     0     0  330911  7.8292              Q      Cheap     0.1681736
## 2     1     0  363272  7.0000              S      Cheap     0.3157895
## 3     0     0  240276  9.6875              Q      Cheap     0.1681736
## 4     0     0  315154  8.6625              S      Cheap     0.1681736
## 5     1     1 3101298 12.2875              S     Medium     0.6029412
## 6     0     0    7538  9.2250              S      Cheap     0.1681736
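
If you wanted to submit these predictions to Kaggle, a minimal sketch would be the following (the output path is just an example; the competition expects exactly the PassengerId and Survived columns):

pred_class <- predict(decision_tree, test_data, type = "class")  # hard 0/1 labels
submission <- data.frame(PassengerId = test_data$PassengerId,
                         Survived = as.integer(as.character(pred_class)))
write.csv(submission, "~/Downloads/titanic_submission.csv", row.names = FALSE)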

Let’s understand the performance of the model

printcp(decision_tree)
## 
## Classification tree:
## rpart(formula = Survived ~ Pclass + price_paid + Sex + Age, data = train_data, 
##     method = "class")
## 
## Variables actually used in tree construction:
## [1] Age        Pclass     price_paid Sex       
## 
## Root node error: 342/891 = 0.38384
## 
## n= 891 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.444444      0   1.00000 1.00000 0.042446
## 2 0.027778      1   0.55556 0.55556 0.035750
## 3 0.023392      3   0.50000 0.53216 0.035188
## 4 0.010234      4   0.47661 0.51462 0.034749
## 5 0.010000      6   0.45614 0.50585 0.034524

- One row per pruning level of the tree (one per value of the complexity parameter CP).
- rel error is the relative error of the tree on the training data, measured relative to the root node error (for now you want to minimize it). So here, using the full tree, the error is about 0.456.
- xerror is the cross-validation error, which guards against overfitting.
- xstd is the standard deviation of the cross-validation error.
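
To act on the xerror column, a common recipe is to prune the tree at the CP value with the lowest cross-validation error; a sketch, using the cptable stored in the fitted object:

best_cp <- decision_tree$cptable[which.min(decision_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(decision_tree, cp = best_cp)
rpart.plot(pruned_tree)

Below, we instead regularize up front, by restricting how the tree is allowed to grow (first with minsplit, then with minbucket).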

decision_tree1 <- rpart(Survived ~ Pclass + price_paid + Sex + Age,
                        data = train_data, method = "class",
                        minsplit = 75)  # minsplit: smallest node rpart will attempt to split
rpart.plot(decision_tree1)

Let’s look at the performance of this decision tree

printcp(decision_tree1)
## 
## Classification tree:
## rpart(formula = Survived ~ Pclass + price_paid + Sex + Age, data = train_data, 
##     method = "class", minsplit = 75)
## 
## Variables actually used in tree construction:
## [1] Age        Pclass     price_paid Sex       
## 
## Root node error: 342/891 = 0.38384
## 
## n= 891 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.444444      0   1.00000 1.00000 0.042446
## 2 0.027778      1   0.55556 0.55556 0.035750
## 3 0.014620      3   0.50000 0.51170 0.034675
## 4 0.010000      4   0.48538 0.51170 0.034675
decision_tree2 <- rpart(Survived ~ Pclass + price_paid + Sex + Age,
                        data = train_data, method = "class",
                        minbucket = 40)  # minbucket: minimum observations in any terminal node
printcp(decision_tree2)
## 
## Classification tree:
## rpart(formula = Survived ~ Pclass + price_paid + Sex + Age, data = train_data, 
##     method = "class", minbucket = 40)
## 
## Variables actually used in tree construction:
## [1] Pclass     price_paid Sex       
## 
## Root node error: 342/891 = 0.38384
## 
## n= 891 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.444444      0   1.00000 1.00000 0.042446
## 2 0.017544      1   0.55556 0.55556 0.035750
## 3 0.010000      3   0.52047 0.55263 0.035681
decision_tree3 <- rpart(Survived ~ Pclass + Fare + Sex + Age,
                        data = train_data, method = "class")  # raw Fare instead of price_paid
rpart.plot(decision_tree3)

printcp(decision_tree3)
## 
## Classification tree:
## rpart(formula = Survived ~ Pclass + Fare + Sex + Age, data = train_data, 
##     method = "class")
## 
## Variables actually used in tree construction:
## [1] Age    Fare   Pclass Sex   
## 
## Root node error: 342/891 = 0.38384
## 
## n= 891 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.444444      0   1.00000 1.00000 0.042446
## 2 0.030702      1   0.55556 0.55556 0.035750
## 3 0.023392      3   0.49415 0.51754 0.034823
## 4 0.011696      4   0.47076 0.52632 0.035043
## 5 0.010000      7   0.43275 0.49708 0.034295
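
To choose among these candidates, one option (a sketch, assuming the four trees above are still in memory) is to compare their minimum cross-validation errors:

sapply(list(base = decision_tree, minsplit75 = decision_tree1,
            minbucket40 = decision_tree2, raw_fare = decision_tree3),
       function(tree) min(tree$cptable[, "xerror"]))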

Further Resources