# Read in the passenger data.
df <- read.csv(file = "titanic.csv")
# Relabel the recorded outcome so the tree's leaves read "Died"/"Survived".
# The labels must come from the data itself: drawing them with sample()
# discards the real outcomes and leaves the tree fitting coin flips. Any
# accuracy figures or tree interpretations recorded in this file from a run
# with randomly drawn labels need to be regenerated.
# NOTE(review): assumes titanic.csv codes Survived as 0 = died, 1 = survived
# -- confirm against the data file. The check below stops the script rather
# than letting unexpected codes turn into NA labels silently.
stopifnot(all(df$Survived %in% c(0, 1)))
df$Survived <- factor(df$Survived, levels = c(0, 1),
                      labels = c("Died", "Survived"))
# Decision Tree Time!
library(rpart)
library(rpart.plot)
# Fit a classification tree on passenger class, age and sex, then draw it.
tre <- rpart(
  formula = Survived ~ Pclass + Age + Sex,
  data = df,
  method = "class"
)
rpart.plot(tre, main = "Titanic Survivorship Tree")

# I tried using more variables and I found that the tree was harder to read
# when I included FamilyMember and Embarked. I settled on using Class, Age,
# and Sex, as they made the tree more readable and easy to interpret.
# Reading this tree, the factors that led to survival were: being an infant
# (age less than 1.5) or being older than 49. It's interesting to see that
# class comes into the tree only when looking at folks who are older than
# 49, and that Sex was not used in creating this tree!
# Testing the accuracy of tree...
# Score the tree on the same rows it was fitted to (in-sample accuracy).
df2 <- df
# predict.rpart() takes the rows to score through `newdata`; it has no `data`
# parameter, so an argument with that name does not select the rows scored.
p_survivor <- predict(tre, newdata = df2, type = "class")
p_prob <- predict(tre, newdata = df2, type = "prob")
# Keep the predicted label and the per-class probabilities beside the
# passenger data. The probability columns are named after the outcome levels,
# so df2 ends up with a second column called "Survived"; `$` returns the
# first match, which is the observed outcome.
df2 <- cbind(df2, p_survivor)
df2 <- cbind(df2, p_prob)
# Share of rows whose predicted label equals the observed one.
acc <- mean(df2$Survived == df2$p_survivor)
acc
## [1] 0.5521886
# Accuracy of about 55%... a bit better than a guess.
printcp(tre)
##
## Classification tree:
## rpart(formula = Survived ~ Pclass + Age + Sex, data = df, method = "class")
##
## Variables actually used in tree construction:
## [1] Age
##
## Root node error: 442/891 = 0.49607
##
## n= 891
##
## CP nsplit rel error xerror xstd
## 1 0.022624 0 1.00000 1.0362 0.033753
## 2 0.015837 1 0.97738 1.1267 0.033531
## 3 0.013575 4 0.92986 1.0611 0.033720
## 4 0.010000 6 0.90271 1.0633 0.033715
# Refit with every candidate predictor to see whether the extra columns
# change the accuracy, and draw the resulting tree.
tre2 <- rpart(
  formula = Survived ~ Pclass + Age + Sex + FamilyMember + Cabin + Embarked,
  data = df,
  method = "class"
)
rpart.plot(tre2, main = "Titanic Survivorship Tree")

# Score the larger tree on the same rows it was fitted to (in-sample
# accuracy), starting again from a fresh copy of the passenger data.
df2 <- df
# predict.rpart() takes the rows to score through `newdata`; it has no `data`
# parameter, so an argument with that name does not select the rows scored.
p_survivor <- predict(tre2, newdata = df2, type = "class")
p_prob <- predict(tre2, newdata = df2, type = "prob")
# Keep the predicted label and the per-class probabilities beside the
# passenger data. The probability columns are named after the outcome levels,
# so df2 ends up with a second column called "Survived"; `$` returns the
# first match, which is the observed outcome.
df2 <- cbind(df2, p_survivor)
df2 <- cbind(df2, p_prob)
# Share of rows whose predicted label equals the observed one.
acc <- mean(df2$Survived == df2$p_survivor)
acc
## [1] 0.5914703
# Accuracy of about 59% on the training data - only slightly higher than the
# simpler tree, so there doesn't seem to be an advantage in making the model
# more complex.