We will explore the Titanic dataset and build a model that predicts whether or not a passenger survived.
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(caret)
library(randomForest)
train <- read.csv("C:\\Users\\Vadim_Katsemba\\Downloads\\Titanictrain.csv")
test <- read.csv("C:\\Users\\Vadim_Katsemba\\Downloads\\Titanictest.csv")
# The test set has no Survived column, so add a placeholder before combining
test$Survived <- NA
all_data <- rbind(train, test)
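Age is missing for a number of passengers (hence the warnings under the histograms below), so it is worth checking NA counts in the combined data before plotting. A quick sketch:
# Count missing values per column (Survived is NA for the test rows by construction)
sapply(all_data, function(x) sum(is.na(x)))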
ggplot(all_data[1:891,], aes(Age, fill = factor(Survived))) + geom_histogram(bins = 30) + xlab("Age") + scale_fill_discrete(name = "Survived") + ggtitle("Survivors by Age")
## Warning: Removed 177 rows containing non-finite values (stat_bin).
ggplot(all_data[1:891,], aes(Sex, fill = factor(Survived))) + geom_bar(stat = "count", position = "dodge") + xlab("Sex") + scale_fill_discrete(name = "Survived") + ggtitle("Survivors by Sex")
ggplot(all_data[1:891,], aes(Age, fill = factor(Survived))) + geom_histogram(bins = 30) + xlab("Age") + facet_grid(.~Sex) + scale_fill_discrete(name = "Survived") + ggtitle("Survivors by Age and Sex")
## Warning: Removed 177 rows containing non-finite values (stat_bin).
ggplot(all_data[1:891,], aes(Pclass, fill = factor(Survived))) + geom_bar(stat = "count") + xlab("Pclass") + facet_grid(.~Sex) + scale_fill_discrete(name = "Survived") + ggtitle("Survivors by Passenger Class and Sex")
ggplot(all_data[1:891,], aes(x = Age, y = Sex)) +
  geom_jitter(aes(colour = factor(Survived))) +
  facet_wrap(~Pclass) +
  labs(x = "Age", y = "Sex", title = "Survivors by Age, Sex and Passenger Class") +
  scale_colour_discrete(name = "Survived") +
  scale_x_continuous(name = "Age", limits = c(0, 81))
## Warning: Removed 177 rows containing missing values (geom_point).
# Extract each passenger's title: the text between the comma and the period in Name
all_data$Title <- gsub('(.*, )|(\\..*)', '', all_data$Name)
table(all_data$Sex, all_data$Title)
##
## Capt Col Don Dona Dr Jonkheer Lady Major Master Miss Mlle Mme
## female 0 0 0 1 1 0 1 0 0 260 2 1
## male 1 4 1 0 7 1 0 2 61 0 0 0
##
## Mr Mrs Ms Rev Sir the Countess
## female 0 197 2 0 0 1
## male 757 0 0 8 1 0
# Group rare titles and map French/alternate forms to their standard equivalents
officer <- c('Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev')
royalty <- c('Dona', 'Lady', 'the Countess', 'Sir', 'Jonkheer')
all_data$Title[all_data$Title == 'Mlle'] <- 'Miss'
all_data$Title[all_data$Title == 'Ms'] <- 'Miss'
all_data$Title[all_data$Title == 'Mme'] <- 'Mrs'
all_data$Title[all_data$Title %in% royalty] <- 'Royalty'
all_data$Title[all_data$Title %in% officer] <- 'Officer'
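To confirm that the rare titles were folded into the new categories, the sex-by-title cross-tabulation can be re-run:
table(all_data$Sex, all_data$Title)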
ggplot(all_data[1:891,], aes(Title, fill = factor(Survived))) +
  geom_bar(stat = "count") +
  xlab("Title") +
  scale_fill_discrete(name = "Survived") +
  ggtitle("Survivors by Title")
# Family size = siblings/spouses aboard + parents/children aboard + the passenger
all_data$FamilySize <- all_data$SibSp + all_data$Parch + 1
ggplot(all_data[1:891,], aes(x = FamilySize, fill = factor(Survived))) +
  geom_bar(stat = "count", position = "dodge") +
  scale_x_continuous(breaks = c(1:11)) +
  xlab("Family Size") +
  scale_fill_discrete(name = "Survived") +
  ggtitle("Survivors by Family Size")
# Convert the categorical predictors to factors for modelling
all_data$Sex <- factor(all_data$Sex)
all_data$Title <- factor(all_data$Title)
all_data$Pclass <- factor(all_data$Pclass)
all_data$FamilySize <- factor(all_data$FamilySize)
all_data$Embarked <- factor(all_data$Embarked)
# Keep the engineered features for the 891 training rows and attach the response
features <- all_data[1:891, c("Pclass", "Title", "Sex", "Embarked", "FamilySize")]
response <- as.factor(train$Survived)
features$Survived <- response
# Hold out 20% of the training data for model evaluation
set.seed(1234)
inTrain <- createDataPartition(features$Survived, times = 1, p = 0.8, list = FALSE)
train_eval <- features[inTrain,]
test_eval <- features[-inTrain,]
# Check that the class balance is preserved across the full set and both splits
round(prop.table(table(train$Survived)), digits = 2)
##
## 0 1
## 0.62 0.38
round(prop.table(table(train_eval$Survived)), digits = 2)
##
## 0 1
## 0.62 0.38
round(prop.table(table(test_eval$Survived)), digits = 2)
##
## 0 1
## 0.62 0.38
# Fit a decision tree (rpart) via caret on the 80% split
set.seed(1234)
DT_model <- train(Survived ~ ., data = train_eval, method = "rpart")
library(rattle)
fancyRpartPlot(DT_model$finalModel)
# Score the tree on the data it was trained on (training-set performance)
DT_Pred <- predict(DT_model, newdata = train_eval, type = "raw")
confusionMatrix(DT_Pred, train_eval$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 423 114
## 1 17 160
##
## Accuracy : 0.8165
## 95% CI : (0.7861, 0.8443)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5843
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9614
## Specificity : 0.5839
## Pos Pred Value : 0.7877
## Neg Pred Value : 0.9040
## Prevalence : 0.6162
## Detection Rate : 0.5924
## Detection Prevalence : 0.7521
## Balanced Accuracy : 0.7727
##
## 'Positive' Class : 0
##
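The confusion matrix above scores the tree on the same 80% split it was trained on. For a held-out estimate comparable to the random forest below, the tree can also be scored on test_eval; a minimal sketch (DT_test_pred is just an illustrative name):
DT_test_pred <- predict(DT_model, newdata = test_eval, type = "raw")
confusionMatrix(DT_test_pred, test_eval$Survived)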
# Fit a random forest on the same features (column 6 of train_eval is Survived)
set.seed(1234)
rf_model <- randomForest(x = train_eval[, -6], y = train_eval[, 6], importance = TRUE, ntree = 1000)
# Score the forest on the held-out 20% split
rf_predict <- predict(rf_model, newdata = test_eval)
confusionMatrix(rf_predict, test_eval$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 95 23
## 1 14 45
##
## Accuracy : 0.791
## 95% CI : (0.7236, 0.8483)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 4.548e-07
##
## Kappa : 0.5469
##
## Mcnemar's Test P-Value : 0.1884
##
## Sensitivity : 0.8716
## Specificity : 0.6618
## Pos Pred Value : 0.8051
## Neg Pred Value : 0.7627
## Prevalence : 0.6158
## Detection Rate : 0.5367
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.7667
##
## 'Positive' Class : 0
##
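Since the forest was grown with importance = TRUE, the relative contribution of each feature can be inspected, and the fitted model can be applied to the unlabelled test passengers (rows 892 onward of all_data). The sketch below assumes the test file keeps Kaggle's PassengerId column; the object names and output file name are illustrative.
# Plot variable importance (available because importance = TRUE above)
varImpPlot(rf_model)
# Predict on the unlabelled test passengers using the same engineered features
holdout <- all_data[892:nrow(all_data), c("Pclass", "Title", "Sex", "Embarked", "FamilySize")]
holdout_pred <- predict(rf_model, newdata = holdout)
# Write a Kaggle-style submission file (file name is illustrative)
submission <- data.frame(PassengerId = test$PassengerId, Survived = holdout_pred)
write.csv(submission, "titanic_submission.csv", row.names = FALSE)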