Let’s clear the global environment before moving on.
rm(list = ls())  # remove every object from the workspace
cat("\014")      # clear the console (works in RStudio)
Let’s load the dataset, which comes from this Kaggle competition: https://www.kaggle.com/c/titanic
train_data <- read.csv("~/Downloads/TitanicTrain.csv")
test_data <- read.csv("~/Downloads/TitanicTest.csv")
Let’s take a look at the dataset
str(train_data)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
Let’s do some feature extraction. The first feature is a factor variable indicating whether the passenger is a child.
train_data$child <- as.factor(ifelse(train_data$Age < 18, 1, 0))
table(train_data$child)
##
## 0 1
## 601 113
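Note that the two counts sum to 714, not 891: ifelse() returns NA when Age is missing, so those passengers get an NA child flag, and table() drops NAs by default. A quick check (a small sketch):
sum(is.na(train_data$Age))   # how many passengers have no recorded age
## [1] 177
table(train_data$child, useNA = "ifany")
##
## 0 1 <NA>
## 601 113 177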
The second feature categorizes the fare into price levels.
train_data$price_paid <- cut(train_data$Fare, breaks = c(-0.1, 0.1, 10, 20, 30, 1000), labels = c("Free", "Cheap", "Medium", "High", "Expensive"))
table(train_data$price_paid)
##
## Free Cheap Medium High Expensive
## 15 321 179 142 234
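cut() builds intervals that are open on the left and closed on the right, which is why the breaks start at -0.1: it guarantees that a fare of exactly 0 falls into the Free bucket. A quick illustration on a few made-up fares (a sketch):
cut(c(0, 5, 15, 25, 80), breaks = c(-0.1, 0.1, 10, 20, 30, 1000), labels = c("Free", "Cheap", "Medium", "High", "Expensive"))
## [1] Free Cheap Medium High Expensive
## Levels: Free Cheap Medium High Expensive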
Let’s take a closer look at the data.
What does the distribution of survival look like with respect to gender?
table(train_data$Sex, train_data$Survived)
##
## 0 1
## female 81 233
## male 468 109
prop.table(table(train_data$Sex, train_data$Survived))
##
## 0 1
## female 0.09090909 0.26150393
## male 0.52525253 0.12233446
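Without a margin argument, prop.table() divides by the grand total, so these are shares of all 891 passengers (26% of everybody was a surviving female). To get survival rates within each sex, pass margin 1, as we do for class below (a small sketch):
prop.table(table(train_data$Sex, train_data$Survived), 1)  # row-wise proportions
##
## 0 1
## female 0.2579618 0.7420382
## male 0.8110918 0.1889082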
What does the distribution of survival look like with respect to class of travel?
table(train_data$Pclass, train_data$Survived)
##
## 0 1
## 1 80 136
## 2 97 87
## 3 372 119
prop.table(table(train_data$Pclass, train_data$Survived), 1)
##
## 0 1
## 1 0.3703704 0.6296296
## 2 0.5271739 0.4728261
## 3 0.7576375 0.2423625
Now let’s look at how multiple variables impact our target variable, Survived.
An R formula has the form: dependent variable ~ independent variables (joined by + signs). The dependent variable is what you want to measure or predict; the independent variables are the features you use to measure/predict it.
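A formula is an ordinary R object that you can build and inspect before using it (a minimal sketch; the name f is only for illustration):
f <- Survived ~ child + Sex  # "Survived modeled as a function of child and Sex"
class(f)                     # "formula"
all.vars(f)                  # "Survived" "child" "Sex"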
aggregate(Survived ~ child + Sex, data = train_data, FUN = sum)
## child Sex Survived
## 1 0 female 159
## 2 1 female 38
## 3 0 male 70
## 4 1 male 23
aggregate(Survived ~ child + Sex, data = train_data, FUN = function(x) { sum(x) / length(x) })
## child Sex Survived
## 1 0 female 0.7718447
## 2 1 female 0.6909091
## 3 0 male 0.1772152
## 4 1 male 0.3965517
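Since Survived is coded 0/1, the anonymous function above is just the mean, so FUN = mean would produce the same survival rates (a sketch):
aggregate(Survived ~ child + Sex, data = train_data, FUN = mean)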
aggregate(Survived ~ price_paid + Pclass, data = train_data, FUN = function(x) { sum(x) / length(x) })
## price_paid Pclass Survived
## 1 Free 1 0.0000000
## 2 Cheap 1 0.0000000
## 3 High 1 0.5121951
## 4 Expensive 1 0.6804734
## 5 Free 2 0.0000000
## 6 Medium 2 0.4285714
## 7 High 2 0.5714286
## 8 Expensive 2 0.5416667
## 9 Free 3 0.2500000
## 10 Cheap 3 0.2062500
## 11 Medium 3 0.4197531
## 12 High 3 0.2222222
## 13 Expensive 3 0.1951220
res <- aggregate(Survived ~ Pclass + price_paid + Sex, data = train_data, FUN = function(x) { sum(x) / length(x) })
res[order(res$Pclass), ]
## Pclass price_paid Sex Survived
## 4 1 High female 0.8571429
## 7 1 Expensive female 0.9770115
## 10 1 Free male 0.0000000
## 13 1 Cheap male 0.0000000
## 17 1 High male 0.4411765
## 20 1 Expensive male 0.3658537
## 2 2 Medium female 0.9142857
## 5 2 High female 0.9032258
## 8 2 Expensive female 1.0000000
## 11 2 Free male 0.0000000
## 15 2 Medium male 0.1587302
## 18 2 High male 0.1600000
## 21 2 Expensive male 0.2142857
## 1 3 Cheap female 0.5937500
## 3 3 Medium female 0.5813953
## 6 3 High female 0.3333333
## 9 3 Expensive female 0.1250000
## 12 3 Free male 0.2500000
## 14 3 Cheap male 0.1093750
## 16 3 Medium male 0.2368421
## 19 3 High male 0.1250000
## 22 3 Expensive male 0.2400000
library(rpart)
library(rpart.plot)
?rpart
?rpart.control
Building a decision tree
decision_tree <- rpart(Survived ~ Pclass + price_paid + Sex + Age, data = train_data, method = "class")
decision_tree
## n= 891
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 891 342 0 (0.61616162 0.38383838)
## 2) Sex=male 577 109 0 (0.81109185 0.18890815)
## 4) Age>=6.5 553 93 0 (0.83182640 0.16817360) *
## 5) Age< 6.5 24 8 1 (0.33333333 0.66666667) *
## 3) Sex=female 314 81 1 (0.25796178 0.74203822)
## 6) Pclass>=2.5 144 72 0 (0.50000000 0.50000000)
## 12) price_paid=High,Expensive 37 9 0 (0.75675676 0.24324324) *
## 13) price_paid=Cheap,Medium 107 44 1 (0.41121495 0.58878505)
## 26) Age>=16.5 87 40 1 (0.45977011 0.54022989)
## 52) Age>=27.5 19 6 0 (0.68421053 0.31578947) *
## 53) Age< 27.5 68 27 1 (0.39705882 0.60294118) *
## 27) Age< 16.5 20 4 1 (0.20000000 0.80000000) *
## 7) Pclass< 2.5 170 9 1 (0.05294118 0.94705882) *
rpart.plot(decision_tree)
Looking at the figure we can say that:
- The most important feature for the prediction (the evidence with the greatest impact on our beliefs) is sex.
- For males, age comes next; price and class have little impact.
- For females, it is class and price_paid first, and only then age.
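rpart stores a numeric version of this ordering in the fitted object, which is worth inspecting alongside the plot (a small sketch; the exact values depend on the fit):
decision_tree$variable.importance  # higher value = bigger impact on the splits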
Now let’s see how our model performs on the test data.
Before going there, we need to perform the same feature extraction on the test data too.
test_data$price_paid <- cut(test_data$Fare, breaks = c(-0.1, 0.1, 10, 20, 30, 1000), labels = c("Free", "Cheap", "Medium", "High", "Expensive"))
head(predict(decision_tree, test_data))
## 0 1
## 1 0.8318264 0.1681736
## 2 0.6842105 0.3157895
## 3 0.8318264 0.1681736
## 4 0.8318264 0.1681736
## 5 0.3970588 0.6029412
## 6 0.8318264 0.1681736
test_data$survival_odds <- predict(decision_tree, test_data)[, 2]  # column 2 = P(Survived = 1); these are probabilities, not odds
head(test_data)
## PassengerId Pclass Name Sex Age
## 1 892 3 Kelly, Mr. James male 34.5
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3 894 2 Myles, Mr. Thomas Francis male 62.0
## 4 895 3 Wirz, Mr. Albert male 27.0
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6 897 3 Svensson, Mr. Johan Cervin male 14.0
## SibSp Parch Ticket Fare Cabin Embarked price_paid survival_odds
## 1 0 0 330911 7.8292 Q Cheap 0.1681736
## 2 1 0 363272 7.0000 S Cheap 0.3157895
## 3 0 0 240276 9.6875 Q Cheap 0.1681736
## 4 0 0 315154 8.6625 S Cheap 0.1681736
## 5 1 1 3101298 12.2875 S Medium 0.6029412
## 6 0 0 7538 9.2250 S Cheap 0.1681736
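If you want hard 0/1 predictions instead of probabilities, for instance to build a Kaggle submission, predict() accepts type = "class" (a sketch; the column and file names here are just examples):
test_data$Survived_pred <- predict(decision_tree, test_data, type = "class")
# write.csv(data.frame(PassengerId = test_data$PassengerId, Survived = test_data$Survived_pred), "submission.csv", row.names = FALSE)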
Let’s understand the performance of the model
printcp(decision_tree)
##
## Classification tree:
## rpart(formula = Survived ~ Pclass + price_paid + Sex + Age, data = train_data,
## method = "class")
##
## Variables actually used in tree construction:
## [1] Age Pclass price_paid Sex
##
## Root node error: 342/891 = 0.38384
##
## n= 891
##
## CP nsplit rel error xerror xstd
## 1 0.444444 0 1.00000 1.00000 0.042446
## 2 0.027778 1 0.55556 0.55556 0.035750
## 3 0.023392 3 0.50000 0.53216 0.035188
## 4 0.010234 4 0.47661 0.51462 0.034749
## 5 0.010000 6 0.45614 0.50585 0.034524
- One row per CP value, i.e., per candidate size of the tree.
- rel error is the relative error of the tree on the training data, scaled so that the root node has error 1 (for now, you want to minimize it). Here, with all six splits, the relative error is about 0.456.
- xerror is the cross-validated error, there to guard against overfitting.
- xstd is the standard deviation of the cross-validated error.
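Since xerror is what we ultimately care about, a standard follow-up is to prune the tree at the CP value that minimizes the cross-validated error (a minimal sketch using rpart’s cptable and prune()):
cp_table <- decision_tree$cptable
best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]
pruned <- prune(decision_tree, cp = best_cp)  # smaller tree, chosen for generalization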
Let’s try constraining the tree’s growth with minsplit:
decision_tree1 <- rpart(Survived ~ Pclass + price_paid + Sex + Age, data = train_data, method = "class", minsplit = 75)
rpart.plot(decision_tree1)
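minsplit is the minimum number of observations a node must contain before a split is attempted. Extra arguments to rpart() are forwarded to rpart.control(), so the call above should be equivalent to this more explicit form (a sketch):
rpart(Survived ~ Pclass + price_paid + Sex + Age, data = train_data, method = "class", control = rpart.control(minsplit = 75))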
Let’s look at the performance of this decision tree
printcp(decision_tree1)
##
## Classification tree:
## rpart(formula = Survived ~ Pclass + price_paid + Sex + Age, data = train_data,
## method = "class", minsplit = 75)
##
## Variables actually used in tree construction:
## [1] Age Pclass price_paid Sex
##
## Root node error: 342/891 = 0.38384
##
## n= 891
##
## CP nsplit rel error xerror xstd
## 1 0.444444 0 1.00000 1.00000 0.042446
## 2 0.027778 1 0.55556 0.55556 0.035750
## 3 0.014620 3 0.50000 0.51170 0.034675
## 4 0.010000 4 0.48538 0.51170 0.034675
Now let’s constrain the leaf size instead, with minbucket:
decision_tree2 <- rpart(Survived ~ Pclass + price_paid + Sex + Age, data = train_data, method = "class", minbucket = 40)
printcp(decision_tree2)
##
## Classification tree:
## rpart(formula = Survived ~ Pclass + price_paid + Sex + Age, data = train_data,
## method = "class", minbucket = 40)
##
## Variables actually used in tree construction:
## [1] Pclass price_paid Sex
##
## Root node error: 342/891 = 0.38384
##
## n= 891
##
## CP nsplit rel error xerror xstd
## 1 0.444444 0 1.00000 1.00000 0.042446
## 2 0.017544 1 0.55556 0.55556 0.035750
## 3 0.010000 3 0.52047 0.55263 0.035681
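minbucket is the minimum number of observations allowed in any terminal (leaf) node; by default it is minsplit/3. With minbucket = 40 the Age splits disappear entirely, as the “Variables actually used” line shows. One way to see how much each constraint shrank the tree is to count leaves (a sketch):
sapply(list(base = decision_tree, minsplit75 = decision_tree1, minbucket40 = decision_tree2), function(m) sum(m$frame$var == "<leaf>"))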
Finally, let’s see how the tree does with the raw Fare instead of the binned price_paid:
decision_tree3 <- rpart(Survived ~ Pclass + Fare + Sex + Age, data = train_data, method = "class")
rpart.plot(decision_tree3)
printcp(decision_tree3)
##
## Classification tree:
## rpart(formula = Survived ~ Pclass + Fare + Sex + Age, data = train_data,
## method = "class")
##
## Variables actually used in tree construction:
## [1] Age Fare Pclass Sex
##
## Root node error: 342/891 = 0.38384
##
## n= 891
##
## CP nsplit rel error xerror xstd
## 1 0.444444 0 1.00000 1.00000 0.042446
## 2 0.030702 1 0.55556 0.55556 0.035750
## 3 0.023392 3 0.49415 0.51754 0.034823
## 4 0.011696 4 0.47076 0.52632 0.035043
## 5 0.010000 7 0.43275 0.49708 0.034295
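To choose among the four models, one reasonable criterion is the smallest cross-validated error each achieves; on this run the raw-Fare tree looks best at roughly 0.497 (a hedged sketch using cptable):
sapply(list(base = decision_tree, minsplit75 = decision_tree1, minbucket40 = decision_tree2, raw_fare = decision_tree3), function(m) min(m$cptable[, "xerror"]))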