1 Intro
2 Explore the Data
3 Descriptive Statistics
4 Prediction
- 4.1 Logistic regression
  - 4.1.1 Model accuracy

1 Intro

This project aims to explore the Titanic passenger’s survival likelihood through machine learning techniques.

2 Explore the Data

2.1 Load the raw data and setting up

# load packages
pacman::p_load(tidyverse,here,ggplot2,knitr,gridExtra,kableExtra,caTools,stargazer)

# change settings
options(scipen = 999,digits = 3)

# create file path using here()
file_path <- here("raw_data", "titanic1.csv")

# load the raw data
titanic <- read.csv(file_path)

2.2 Check the Data

kable(head(titanic,n=5), format = "html", escape = FALSE, align = "c") %>%
  kable_styling(latex_options="scale_down")

pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body
1	1	Allen, Miss. Elisabeth Walton	female	29.000	0	0	24160	211	B5	S	2	NA
1	1	Allison, Master. Hudson Trevor	male	0.917	1	2	113781	152	C22 C26	S	11	NA
1	0	Allison, Miss. Helen Loraine	female	2.000	1	2	113781	152	C22 C26	S		NA
1	0	Allison, Mr. Hudson Joshua Creighton	male	30.000	1	2	113781	152	C22 C26	S		135
1	0	Allison, Mrs. Hudson J C (Bessie Waldo Daniels)	female	25.000	1	2	113781	152	C22 C26	S		NA

str(titanic)

## 'data.frame':    1309 obs. of  13 variables:
##  $ pclass  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ survived: int  1 1 0 0 0 1 1 0 1 0 ...
##  $ name    : chr  "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
##  $ sex     : chr  "female" "male" "female" "male" ...
##  $ age     : num  29 0.917 2 30 25 ...
##  $ sibsp   : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket  : chr  "24160" "113781" "113781" "113781" ...
##  $ fare    : num  211 152 152 152 152 ...
##  $ cabin   : chr  "B5" "C22 C26" "C22 C26" "C22 C26" ...
##  $ embarked: chr  "S" "S" "S" "S" ...
##  $ boat    : chr  "2" "11" "" "" ...
##  $ body    : int  NA NA NA 135 NA NA NA NA NA 22 ...

kable(summary(titanic), format = "html", escape = FALSE, align = "c") %>%
  kable_styling(latex_options="scale_down",font_size = 10)

pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body
Min. :1.00	Min. :0.000	Length:1309	Length:1309	Min. : 0.2	Min. :0.0	Min. :0.00	Length:1309	Min. : 0	Length:1309	Length:1309	Length:1309	Min. : 1
1st Qu.:2.00	1st Qu.:0.000	Class :character	Class :character	1st Qu.:21.0	1st Qu.:0.0	1st Qu.:0.00	Class :character	1st Qu.: 8	Class :character	Class :character	Class :character	1st Qu.: 72
Median :3.00	Median :0.000	Mode :character	Mode :character	Median :28.0	Median :0.0	Median :0.00	Mode :character	Median : 14	Mode :character	Mode :character	Mode :character	Median :155
Mean :2.29	Mean :0.382	NA	NA	Mean :29.9	Mean :0.5	Mean :0.39	NA	Mean : 33	NA	NA	NA	Mean :161
3rd Qu.:3.00	3rd Qu.:1.000	NA	NA	3rd Qu.:39.0	3rd Qu.:1.0	3rd Qu.:0.00	NA	3rd Qu.: 31	NA	NA	NA	3rd Qu.:256
Max. :3.00	Max. :1.000	NA	NA	Max. :80.0	Max. :8.0	Max. :9.00	NA	Max. :512	NA	NA	NA	Max. :328
NA	NA	NA	NA	NA’s :263	NA	NA	NA	NA’s :1	NA	NA	NA	NA’s :1188

2.3 Preparing the data and dealing with missing values

# convert relevant columns to factor

titanic$pclass<-as.factor(titanic$pclass)
titanic$survived<-as.factor(titanic$survived)
titanic$sex<-as.factor(titanic$sex)
titanic$embarked<-as.factor(titanic$embarked)

# check for missing values 
titanic[titanic == ""] <- NA
missing_values <- any(is.na(titanic))

cat("Any missing values in the titanicset:\n", missing_values)

## Any missing values in the titanicset:
##  TRUE

# check for missing values by var
missing_per_column <- colSums(is.na(titanic))

cat("Missing values per column\n")

## Missing values per column

print(missing_per_column)

##   pclass survived     name      sex      age    sibsp    parch   ticket 
##        0        0        0        0      263        0        0        0 
##     fare    cabin embarked     boat     body 
##        1     1014        2      823     1188

# count the total number of missing values 
total_missing <- sum(is.na(titanic))

cat("Total missing values in the titanicset:\n", total_missing)

## Total missing values in the titanicset:
##  3291

# create a df with missing counts by var
missing_df <- data.frame(
  var_name = names(missing_per_column),
  missing_count = missing_per_column,
  row.names = NULL)

missing_df <- arrange(missing_df, desc(missing_count))

kable(missing_df, format = "html", escape = FALSE, align = "c") %>%
  kable_styling()

var_name	missing_count
body	1188
cabin	1014
boat	823
age	263
embarked	2
fare	1
pclass	0
survived	0
name	0
sex	0
sibsp	0
parch	0
ticket	0

# create bar plot of missing values
ggplot(missing_df , aes(x = reorder(var_name, missing_count), y = missing_count)) +
  geom_bar(stat = "identity", fill= "darkblue") +
  labs(title = "Missing Values Count",x = "Variable",y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2.3.1 fill the missing ‘age’ values

There are several ways to deal with missing values. For example, the median can be used when we want to avoid the effect of extreme values, or the mean can be used in the case of a normal distribution. In our case, 263 values are missing. To make the results more accurate, we will examine the use of the median or the average as functions of ‘pclass’ and ‘sex’, with the reasonable assumption that there is a correlation between them

Let’s start with the histogram of ‘age’ variable

# create a histogram of 'age' ignore NA 
p_old <- ggplot(titanic, aes(x = age)) +
  geom_histogram(fill = "darkblue", color = "black",na.rm = TRUE) +
  labs(title = "Histogram of Age", x = "Age", y = "Frequency")

p_old

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

In our case, it seems to make sense to use the median (by ‘pclass’ and ‘sex’), so we will fill in the missing values accordingly.

missing_age<-titanic %>%
  group_by(pclass,sex) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE)
  )

## `summarise()` has grouped output by 'pclass'. You can override using the
## `.groups` argument.

titanic <- titanic %>%
  left_join(missing_age, by = c("pclass", "sex")) 

titanic$age<-ifelse(is.na(titanic$age),titanic$median_age,titanic$age)

titanic<-titanic %>%
  select(-median_age,-mean_age)

Now let’s examine how the new ‘age’ distribution looks like

# create a new histogram of 'age' ignore NA 
p_new <-ggplot(titanic, aes(x = age)) +
  geom_histogram(fill = "darkred", color = "black",na.rm = TRUE) +
  labs(title = "NEW - Histogram of Age", x = "Age", y = "Frequency")

combined_plots <- grid.arrange(p_new,p_old, ncol=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

combined_plots

## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]

2.3.2 fill the missing ‘fare’ values

We have only one missing observation for the variable ‘fare’. Using mean or median will not have a significant effect in this case. However, we will use the median to avoid extreme values that pull the average upwards.

# create a histogram of 'fare' ignore NA 
ggplot(titanic, aes(x = fare)) +
  geom_histogram(fill = "darkblue", color = "black",na.rm = TRUE) +
  labs(title = "Histogram of Fare", x = "Fare", y = "Frequency")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cat("Median of fare:\n", median(titanic$fare,na.rm = TRUE))

## Median of fare:
##  14.5

cat("Mean of fare:\n", mean(titanic$fare,na.rm = TRUE))

## Mean of fare:
##  33.3

titanic$fare<-ifelse(is.na(titanic$fare),median(titanic$fare,na.rm = TRUE),titanic$fare)

2.3.3 fill the missing ‘embarked’ values

We have two missing values for the ‘embarked’ variable. To complete them, I will use the most common value under the assumption that this is the most probable option.

# create a bar plot of 'embarked' ignore NA 
ggplot(subset(titanic, !is.na(embarked)), aes(x = embarked)) +
  geom_bar(fill = "darkblue") +
  labs(title = "Embarked count", x = "Embarked", y = "Count")

titanic$embarked <- replace(titanic$embarked, is.na(titanic$embarked), "S")

2.3.4 Double check for missing values

Double-check for missing values and remove columns that should not affect the chance of survival (‘name’, ‘boat’, ‘body’, ‘ticket’). Note - The variable ‘cabin’ has 1014 missing observations. While it could be assumed that any missing observation indicates a passenger without a cabin, I find this assumption unreasonable. Therefore, I have chosen to remove the column to avoid artificially affecting the results

# removing unnecessary columns

titanic<-titanic %>%
  select(-name,-boat,-body,-cabin,-ticket)

# check again for missing values in the data
any_missing <- any(is.na(titanic))

cat("Any missing values in the titanicset:\n", any_missing)

## Any missing values in the titanicset:
##  FALSE

3 Descriptive Statistics

In this chapter, I will examine the relationship of different variables and ‘survived’ to get an initial indication of the relationship and influence between them.

# survival count 
ggplot(titanic, aes(x = survived,fill =survived)) +
  geom_bar() +
  labs(title = "Survival Count", x = "Survived", y = "Count")

# survival by gender
ggplot(titanic, aes(x = survived, fill = sex)) +
  geom_bar() +
  labs(title = "Survival by Gender", x = "Survived", y = "Count") +
  scale_fill_discrete(name = "Gender", labels = c("Female", "Male")) +
  geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# survival by passenger class
ggplot(titanic, aes(x = survived, fill = pclass)) +
  geom_bar() +
  labs(title = "Survival by Passenger Class", x = "Survived", y = "Count") +
  scale_fill_discrete(name = "Passenger Class", labels = c("1st", "2nd", "3rd"))+
  geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())

# survival by embarked Port
ggplot(titanic, aes(x = survived, fill = embarked)) +
  geom_bar() +
  labs(title = "Survival by Embarked Port", x = "Survived", y = "Count") +
  scale_fill_discrete(name = "Embarked Port", labels = c("Cherbourg", "Queenstown", "Southampton"))+
  geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())

# survival by age
ggplot(titanic, aes(x = age,fill = survived )) + 
  geom_histogram() +
  labs(title = "Survival by Age", x = "Age", y = "Frequency")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4 Prediction

4.1 Logistic regression

Step 1: Split data into training and test sets (80-20)
Step 2: Train the model
Step 3: Test the model
Step 4: Visualization of the results

# split the data 
set.seed(42) # for reproducibility
split <- sample.split(titanic$survived, SplitRatio = 0.8)
train_data <- subset(titanic, split == TRUE)
test_data <- subset(titanic, split == FALSE)

# train a logistic regression model
log_model <- glm(survived ~ pclass + sex + age + sibsp + parch + fare + embarked,
                 data = train_data, family = binomial)

# test the model
test_data$predicted <- predict(log_model, newdata = test_data, type = "response")
test_data$predicted <- ifelse(test_data$predicted > 0.5, 1, 0)

model_output <- summary(log_model)

stargazer(log_model, type = "html", title = "Titanic Logistic Regression Model Results")

**Titanic Logistic Regression Model Results**

	Dependent variable:

	survived

pclass2	-1.000^***
	(0.274)

pclass3	-2.060^***
	(0.277)

sexmale	-2.620^***
	(0.183)

age	-0.038^***
	(0.007)

sibsp	-0.322^***
	(0.100)

parch	-0.068
	(0.101)

fare	0.001
	(0.002)

embarkedQ	-0.859^**
	(0.346)

embarkedS	-0.662^***
	(0.208)

Constant	4.160^***
	(0.439)


Observations	1,047
Log Likelihood	-481.000
Akaike Inf. Crit.	983.000

Note:	p<0.1; p<0.05; p<0.01

4.1.1 Model accuracy

model_accuracy <- test_data %>%
  summarise(
    actual_1_count = sum(survived == 1),
    predicted_1_correct = sum(ifelse(survived == 1 & predicted==1, 1, 0)),
    actual_0_count = sum(survived == 0),
    predicted_0_correct = sum(ifelse(survived == 0 & predicted==0, 1, 0))
  )
  
 
  kable(model_accuracy, format = "html", escape = FALSE, align = "c") %>%
  kable_styling()

actual_1_count	predicted_1_correct	actual_0_count	predicted_0_correct
100	73	162	140

  cat("Total correct prediction:", 
      sprintf("%.2f%%",
              ((model_accuracy$predicted_1_correct+model_accuracy$predicted_0_correct) / 
                (model_accuracy$actual_1_count+model_accuracy$actual_0_count)*100)))

Total correct prediction: 81.30%

  cat("Correct survival prediction percentage:", 
      sprintf("%.2f%%",
              ((model_accuracy$predicted_1_correct / model_accuracy$actual_1_count)*100)))

Correct survival prediction percentage: 73.00%

  cat("Correct not survival prediction percentage:", 
      sprintf("%.2f%%",
              ((model_accuracy$predicted_0_correct / model_accuracy$actual_0_count)*100)))

Correct not survival prediction percentage: 86.42%

Titanic - identify the factors that influenced the likelihood of survival

Aviad Kad

Last updated: 20-08-2023