1 Intro

This project aims to explore the Titanic passenger’s survival likelihood through machine learning techniques.




2 Explore the Data

2.1 Load the raw data and setting up

# load packages
pacman::p_load(tidyverse,here,ggplot2,knitr,gridExtra,kableExtra,caTools,stargazer)

# change settings
options(scipen = 999,digits = 3)

# create file path using here()
file_path <- here("raw_data", "titanic1.csv")

# load the raw data
titanic <- read.csv(file_path)

2.2 Check the Data

kable(head(titanic,n=5), format = "html", escape = FALSE, align = "c") %>%
  kable_styling(latex_options="scale_down")
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body
1 1 Allen, Miss. Elisabeth Walton female 29.000 0 0 24160 211 B5 S 2 NA
1 1 Allison, Master. Hudson Trevor male 0.917 1 2 113781 152 C22 C26 S 11 NA
1 0 Allison, Miss. Helen Loraine female 2.000 1 2 113781 152 C22 C26 S NA
1 0 Allison, Mr. Hudson Joshua Creighton male 30.000 1 2 113781 152 C22 C26 S 135
1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.000 1 2 113781 152 C22 C26 S NA
str(titanic)
## 'data.frame':    1309 obs. of  13 variables:
##  $ pclass  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ survived: int  1 1 0 0 0 1 1 0 1 0 ...
##  $ name    : chr  "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
##  $ sex     : chr  "female" "male" "female" "male" ...
##  $ age     : num  29 0.917 2 30 25 ...
##  $ sibsp   : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket  : chr  "24160" "113781" "113781" "113781" ...
##  $ fare    : num  211 152 152 152 152 ...
##  $ cabin   : chr  "B5" "C22 C26" "C22 C26" "C22 C26" ...
##  $ embarked: chr  "S" "S" "S" "S" ...
##  $ boat    : chr  "2" "11" "" "" ...
##  $ body    : int  NA NA NA 135 NA NA NA NA NA 22 ...
kable(summary(titanic), format = "html", escape = FALSE, align = "c") %>%
  kable_styling(latex_options="scale_down",font_size = 10)
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body
Min. :1.00 Min. :0.000 Length:1309 Length:1309 Min. : 0.2 Min. :0.0 Min. :0.00 Length:1309 Min. : 0 Length:1309 Length:1309 Length:1309 Min. : 1
1st Qu.:2.00 1st Qu.:0.000 Class :character Class :character 1st Qu.:21.0 1st Qu.:0.0 1st Qu.:0.00 Class :character 1st Qu.: 8 Class :character Class :character Class :character 1st Qu.: 72
Median :3.00 Median :0.000 Mode :character Mode :character Median :28.0 Median :0.0 Median :0.00 Mode :character Median : 14 Mode :character Mode :character Mode :character Median :155
Mean :2.29 Mean :0.382 NA NA Mean :29.9 Mean :0.5 Mean :0.39 NA Mean : 33 NA NA NA Mean :161
3rd Qu.:3.00 3rd Qu.:1.000 NA NA 3rd Qu.:39.0 3rd Qu.:1.0 3rd Qu.:0.00 NA 3rd Qu.: 31 NA NA NA 3rd Qu.:256
Max. :3.00 Max. :1.000 NA NA Max. :80.0 Max. :8.0 Max. :9.00 NA Max. :512 NA NA NA Max. :328
NA NA NA NA NA’s :263 NA NA NA NA’s :1 NA NA NA NA’s :1188

2.3 Preparing the data and dealing with missing values

# convert relevant columns to factor

titanic$pclass<-as.factor(titanic$pclass)
titanic$survived<-as.factor(titanic$survived)
titanic$sex<-as.factor(titanic$sex)
titanic$embarked<-as.factor(titanic$embarked)

# check for missing values 
titanic[titanic == ""] <- NA
missing_values <- any(is.na(titanic))

cat("Any missing values in the titanicset:\n", missing_values)
## Any missing values in the titanicset:
##  TRUE
# check for missing values by var
missing_per_column <- colSums(is.na(titanic))

cat("Missing values per column\n")
## Missing values per column
print(missing_per_column)
##   pclass survived     name      sex      age    sibsp    parch   ticket 
##        0        0        0        0      263        0        0        0 
##     fare    cabin embarked     boat     body 
##        1     1014        2      823     1188
# count the total number of missing values 
total_missing <- sum(is.na(titanic))

cat("Total missing values in the titanicset:\n", total_missing)
## Total missing values in the titanicset:
##  3291
# create a df with missing counts by var
missing_df <- data.frame(
  var_name = names(missing_per_column),
  missing_count = missing_per_column,
  row.names = NULL)

missing_df <- arrange(missing_df, desc(missing_count))

kable(missing_df, format = "html", escape = FALSE, align = "c") %>%
  kable_styling()
var_name missing_count
body 1188
cabin 1014
boat 823
age 263
embarked 2
fare 1
pclass 0
survived 0
name 0
sex 0
sibsp 0
parch 0
ticket 0
# create bar plot of missing values
ggplot(missing_df , aes(x = reorder(var_name, missing_count), y = missing_count)) +
  geom_bar(stat = "identity", fill= "darkblue") +
  labs(title = "Missing Values Count",x = "Variable",y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2.3.1 fill the missing ‘age’ values

There are several ways to deal with missing values. For example, the median can be used when we want to avoid the effect of extreme values, or the mean can be used in the case of a normal distribution. In our case, 263 values are missing. To make the results more accurate, we will examine the use of the median or the average as functions of ‘pclass’ and ‘sex’, with the reasonable assumption that there is a correlation between them

Let’s start with the histogram of ‘age’ variable

# create a histogram of 'age' ignore NA 
p_old <- ggplot(titanic, aes(x = age)) +
  geom_histogram(fill = "darkblue", color = "black",na.rm = TRUE) +
  labs(title = "Histogram of Age", x = "Age", y = "Frequency")

p_old
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

In our case, it seems to make sense to use the median (by ‘pclass’ and ‘sex’), so we will fill in the missing values accordingly.

missing_age<-titanic %>%
  group_by(pclass,sex) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'pclass'. You can override using the
## `.groups` argument.
titanic <- titanic %>%
  left_join(missing_age, by = c("pclass", "sex")) 

titanic$age<-ifelse(is.na(titanic$age),titanic$median_age,titanic$age)

titanic<-titanic %>%
  select(-median_age,-mean_age)

Now let’s examine how the new ‘age’ distribution looks like

# create a new histogram of 'age' ignore NA 
p_new <-ggplot(titanic, aes(x = age)) +
  geom_histogram(fill = "darkred", color = "black",na.rm = TRUE) +
  labs(title = "NEW - Histogram of Age", x = "Age", y = "Frequency")

combined_plots <- grid.arrange(p_new,p_old, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

combined_plots
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]

2.3.2 fill the missing ‘fare’ values

We have only one missing observation for the variable ‘fare’. Using mean or median will not have a significant effect in this case. However, we will use the median to avoid extreme values that pull the average upwards.

# create a histogram of 'fare' ignore NA 
ggplot(titanic, aes(x = fare)) +
  geom_histogram(fill = "darkblue", color = "black",na.rm = TRUE) +
  labs(title = "Histogram of Fare", x = "Fare", y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cat("Median of fare:\n", median(titanic$fare,na.rm = TRUE))
## Median of fare:
##  14.5
cat("Mean of fare:\n", mean(titanic$fare,na.rm = TRUE))
## Mean of fare:
##  33.3
titanic$fare<-ifelse(is.na(titanic$fare),median(titanic$fare,na.rm = TRUE),titanic$fare)

2.3.3 fill the missing ‘embarked’ values

We have two missing values for the ‘embarked’ variable. To complete them, I will use the most common value under the assumption that this is the most probable option.

# create a bar plot of 'embarked' ignore NA 
ggplot(subset(titanic, !is.na(embarked)), aes(x = embarked)) +
  geom_bar(fill = "darkblue") +
  labs(title = "Embarked count", x = "Embarked", y = "Count")

titanic$embarked <- replace(titanic$embarked, is.na(titanic$embarked), "S")

2.3.4 Double check for missing values

Double-check for missing values and remove columns that should not affect the chance of survival (‘name’, ‘boat’, ‘body’, ‘ticket’). Note - The variable ‘cabin’ has 1014 missing observations. While it could be assumed that any missing observation indicates a passenger without a cabin, I find this assumption unreasonable. Therefore, I have chosen to remove the column to avoid artificially affecting the results

# removing unnecessary columns

titanic<-titanic %>%
  select(-name,-boat,-body,-cabin,-ticket)

# check again for missing values in the data
any_missing <- any(is.na(titanic))

cat("Any missing values in the titanicset:\n", any_missing)
## Any missing values in the titanicset:
##  FALSE

3 Descriptive Statistics

In this chapter, I will examine the relationship of different variables and ‘survived’ to get an initial indication of the relationship and influence between them.

# survival count 
ggplot(titanic, aes(x = survived,fill =survived)) +
  geom_bar() +
  labs(title = "Survival Count", x = "Survived", y = "Count")

# survival by gender
ggplot(titanic, aes(x = survived, fill = sex)) +
  geom_bar() +
  labs(title = "Survival by Gender", x = "Survived", y = "Count") +
  scale_fill_discrete(name = "Gender", labels = c("Female", "Male")) +
  geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# survival by passenger class
ggplot(titanic, aes(x = survived, fill = pclass)) +
  geom_bar() +
  labs(title = "Survival by Passenger Class", x = "Survived", y = "Count") +
  scale_fill_discrete(name = "Passenger Class", labels = c("1st", "2nd", "3rd"))+
  geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())

# survival by embarked Port
ggplot(titanic, aes(x = survived, fill = embarked)) +
  geom_bar() +
  labs(title = "Survival by Embarked Port", x = "Survived", y = "Count") +
  scale_fill_discrete(name = "Embarked Port", labels = c("Cherbourg", "Queenstown", "Southampton"))+
  geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())

# survival by age
ggplot(titanic, aes(x = age,fill = survived )) + 
  geom_histogram() +
  labs(title = "Survival by Age", x = "Age", y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4 Prediction

4.1 Logistic regression

Step 1: Split data into training and test sets (80-20)
Step 2: Train the model
Step 3: Test the model
Step 4: Visualization of the results

# split the data 
set.seed(42) # for reproducibility
split <- sample.split(titanic$survived, SplitRatio = 0.8)
train_data <- subset(titanic, split == TRUE)
test_data <- subset(titanic, split == FALSE)

# train a logistic regression model
log_model <- glm(survived ~ pclass + sex + age + sibsp + parch + fare + embarked,
                 data = train_data, family = binomial)

# test the model
test_data$predicted <- predict(log_model, newdata = test_data, type = "response")
test_data$predicted <- ifelse(test_data$predicted > 0.5, 1, 0)

model_output <- summary(log_model)

stargazer(log_model, type = "html", title = "Titanic Logistic Regression Model Results")
Titanic Logistic Regression Model Results
Dependent variable:
survived
pclass2 -1.000***
(0.274)
pclass3 -2.060***
(0.277)
sexmale -2.620***
(0.183)
age -0.038***
(0.007)
sibsp -0.322***
(0.100)
parch -0.068
(0.101)
fare 0.001
(0.002)
embarkedQ -0.859**
(0.346)
embarkedS -0.662***
(0.208)
Constant 4.160***
(0.439)
Observations 1,047
Log Likelihood -481.000
Akaike Inf. Crit. 983.000
Note: p<0.1; p<0.05; p<0.01

4.1.1 Model accuracy

model_accuracy <- test_data %>%
  summarise(
    actual_1_count = sum(survived == 1),
    predicted_1_correct = sum(ifelse(survived == 1 & predicted==1, 1, 0)),
    actual_0_count = sum(survived == 0),
    predicted_0_correct = sum(ifelse(survived == 0 & predicted==0, 1, 0))
  )
  
 
  kable(model_accuracy, format = "html", escape = FALSE, align = "c") %>%
  kable_styling()
actual_1_count predicted_1_correct actual_0_count predicted_0_correct
100 73 162 140
  cat("Total correct prediction:", 
      sprintf("%.2f%%",
              ((model_accuracy$predicted_1_correct+model_accuracy$predicted_0_correct) / 
                (model_accuracy$actual_1_count+model_accuracy$actual_0_count)*100)))

Total correct prediction: 81.30%

  cat("Correct survival prediction percentage:", 
      sprintf("%.2f%%",
              ((model_accuracy$predicted_1_correct / model_accuracy$actual_1_count)*100)))

Correct survival prediction percentage: 73.00%

  cat("Correct not survival prediction percentage:", 
      sprintf("%.2f%%",
              ((model_accuracy$predicted_0_correct / model_accuracy$actual_0_count)*100)))

Correct not survival prediction percentage: 86.42%