# titanic-decision-tree.R

# Read in data
df <- read.csv(file = "titanic.csv")

# Give the survival outcome readable labels so the tree reads better.
# The labels are mapped from the recorded values instead of being drawn with
# sample(): random labels throw away the real outcome, leave the tree with
# nothing to learn, and change on every run (consistent with the ~50% root
# node error and ~55% accuracy recorded further down, which were produced with
# random labels and need regenerating).
# NOTE(review): assumes titanic.csv has a Survived column coded 0 = died,
# 1 = survived -- confirm against the file.
stopifnot(
  "Survived" %in% names(df),
  all(df$Survived %in% c(0, 1))
)
df$Survived <- factor(
  df$Survived,
  levels = c(0, 1),
  labels = c("Died", "Survived")
)



# Decision Tree Time!
library(rpart)
library(rpart.plot)
# Fit a classification tree for Survived from passenger class, age and sex,
# then draw it.
tre <- rpart(
  formula = Survived ~ Pclass + Age + Sex,
  data = df,
  method = "class"
)
rpart.plot(tre, main = "Titanic Survivorship Tree")

# I tried using more variables and I found that the tree was harder to read
# when I included FamilyMember and Embarked. I settled on using Class, Age,
# and Sex, as they made the tree more readable and easy to interpret.

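# Reading this tree, the factors that led to survival were: being an infant
# (age less than 1.5) or being older than 49. It's interesting to see that
# class comes into the tree only when looking at folks who are older than
# 49, and that Sex was not used in creating this tree!
# (Note: the printcp() output recorded further down lists Age as the only
# variable used in the tree, so that output comes from a different run than
# the tree described here.)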


# Testing the accuracy of tree...
# Score the training rows with the first tree: the predicted class and the
# per-class probabilities. predict.rpart() receives the rows to score through
# `newdata`; `data =` is not one of its parameters and is silently absorbed
# by `...`, so it must not be used here.
p_survivor <- predict(tre, newdata = df, type = "class")
p_prob <- predict(tre, newdata = df, type = "prob")

# Keep the predictions next to the original columns for inspection.
df2 <- cbind(df, p_survivor, p_prob)

# Share of rows whose predicted class equals the recorded label. Computed from
# the vectors directly so the result does not depend on column-name lookup in
# df2. This is accuracy on the data the tree was fitted to, not on held-out
# data.
acc <- mean(df$Survived == p_survivor)
acc

## [1] 0.5521886

# Accuracy of about 55%... a bit better than a guess.

# Print the complexity-parameter table for the first tree.
printcp(x = tre)

## 
## Classification tree:
## rpart(formula = Survived ~ Pclass + Age + Sex, data = df, method = "class")
## 
## Variables actually used in tree construction:
## [1] Age
## 
## Root node error: 442/891 = 0.49607
## 
## n= 891 
## 
##         CP nsplit rel error xerror     xstd
## 1 0.022624      0   1.00000 1.0362 0.033753
## 2 0.015837      1   0.97738 1.1267 0.033531
## 3 0.013575      4   0.92986 1.0611 0.033720
## 4 0.010000      6   0.90271 1.0633 0.033715

# Adding in all the columns to see if more data makes a difference in accuracy
# Second tree: same outcome, with family size, cabin and port of embarkation
# added to the predictors, then drawn with the same title.
tre2 <- rpart(
  formula = Survived ~ Pclass + Age + Sex + FamilyMember + Cabin + Embarked,
  data = df,
  method = "class"
)
rpart.plot(tre2, main = "Titanic Survivorship Tree")

# Score the training rows with the second tree: the predicted class and the
# per-class probabilities. predict.rpart() receives the rows to score through
# `newdata`; `data =` is not one of its parameters and is silently absorbed
# by `...`, so it must not be used here.
p_survivor <- predict(tre2, newdata = df, type = "class")
p_prob <- predict(tre2, newdata = df, type = "prob")

# Keep the predictions next to the original columns for inspection.
df2 <- cbind(df, p_survivor, p_prob)

# Share of rows whose predicted class equals the recorded label. Computed from
# the vectors directly so the result does not depend on column-name lookup in
# df2. This is accuracy on the data the tree was fitted to, not on held-out
# data.
acc <- mean(df$Survived == p_survivor)
acc

## [1] 0.5914703

# Accuracy of about 59% (measured on the training data) - at most a small
# advantage in making the model more complex.

# titanic-decision-tree.R
#
# andrewwhelan
#
# 2022-02-05