This project aims to explore the Titanic passenger’s survival likelihood through machine learning techniques.
# load packages
pacman::p_load(tidyverse,here,ggplot2,knitr,gridExtra,kableExtra,caTools,stargazer)
# change settings
options(scipen = 999,digits = 3)
# create file path using here()
file_path <- here("raw_data", "titanic1.csv")
# load the raw data
titanic <- read.csv(file_path)
kable(head(titanic,n=5), format = "html", escape = FALSE, align = "c") %>%
kable_styling(latex_options="scale_down")
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.000 | 0 | 0 | 24160 | 211 | B5 | S | 2 | NA |
| 1 | 1 | Allison, Master. Hudson Trevor | male | 0.917 | 1 | 2 | 113781 | 152 | C22 C26 | S | 11 | NA |
| 1 | 0 | Allison, Miss. Helen Loraine | female | 2.000 | 1 | 2 | 113781 | 152 | C22 C26 | S | NA | |
| 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000 | 1 | 2 | 113781 | 152 | C22 C26 | S | 135 | |
| 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.000 | 1 | 2 | 113781 | 152 | C22 C26 | S | NA |
str(titanic)
## 'data.frame': 1309 obs. of 13 variables:
## $ pclass : int 1 1 1 1 1 1 1 1 1 1 ...
## $ survived: int 1 1 0 0 0 1 1 0 1 0 ...
## $ name : chr "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
## $ sex : chr "female" "male" "female" "male" ...
## $ age : num 29 0.917 2 30 25 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : chr "24160" "113781" "113781" "113781" ...
## $ fare : num 211 152 152 152 152 ...
## $ cabin : chr "B5" "C22 C26" "C22 C26" "C22 C26" ...
## $ embarked: chr "S" "S" "S" "S" ...
## $ boat : chr "2" "11" "" "" ...
## $ body : int NA NA NA 135 NA NA NA NA NA 22 ...
kable(summary(titanic), format = "html", escape = FALSE, align = "c") %>%
kable_styling(latex_options="scale_down",font_size = 10)
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :1.00 | Min. :0.000 | Length:1309 | Length:1309 | Min. : 0.2 | Min. :0.0 | Min. :0.00 | Length:1309 | Min. : 0 | Length:1309 | Length:1309 | Length:1309 | Min. : 1 | |
| 1st Qu.:2.00 | 1st Qu.:0.000 | Class :character | Class :character | 1st Qu.:21.0 | 1st Qu.:0.0 | 1st Qu.:0.00 | Class :character | 1st Qu.: 8 | Class :character | Class :character | Class :character | 1st Qu.: 72 | |
| Median :3.00 | Median :0.000 | Mode :character | Mode :character | Median :28.0 | Median :0.0 | Median :0.00 | Mode :character | Median : 14 | Mode :character | Mode :character | Mode :character | Median :155 | |
| Mean :2.29 | Mean :0.382 | NA | NA | Mean :29.9 | Mean :0.5 | Mean :0.39 | NA | Mean : 33 | NA | NA | NA | Mean :161 | |
| 3rd Qu.:3.00 | 3rd Qu.:1.000 | NA | NA | 3rd Qu.:39.0 | 3rd Qu.:1.0 | 3rd Qu.:0.00 | NA | 3rd Qu.: 31 | NA | NA | NA | 3rd Qu.:256 | |
| Max. :3.00 | Max. :1.000 | NA | NA | Max. :80.0 | Max. :8.0 | Max. :9.00 | NA | Max. :512 | NA | NA | NA | Max. :328 | |
| NA | NA | NA | NA | NA’s :263 | NA | NA | NA | NA’s :1 | NA | NA | NA | NA’s :1188 |
# convert relevant columns to factor
titanic$pclass<-as.factor(titanic$pclass)
titanic$survived<-as.factor(titanic$survived)
titanic$sex<-as.factor(titanic$sex)
titanic$embarked<-as.factor(titanic$embarked)
# check for missing values
titanic[titanic == ""] <- NA
missing_values <- any(is.na(titanic))
cat("Any missing values in the titanicset:\n", missing_values)
## Any missing values in the titanicset:
## TRUE
# check for missing values by var
missing_per_column <- colSums(is.na(titanic))
cat("Missing values per column\n")
## Missing values per column
print(missing_per_column)
## pclass survived name sex age sibsp parch ticket
## 0 0 0 0 263 0 0 0
## fare cabin embarked boat body
## 1 1014 2 823 1188
# count the total number of missing values
total_missing <- sum(is.na(titanic))
cat("Total missing values in the titanicset:\n", total_missing)
## Total missing values in the titanicset:
## 3291
# create a df with missing counts by var
missing_df <- data.frame(
var_name = names(missing_per_column),
missing_count = missing_per_column,
row.names = NULL)
missing_df <- arrange(missing_df, desc(missing_count))
kable(missing_df, format = "html", escape = FALSE, align = "c") %>%
kable_styling()
| var_name | missing_count |
|---|---|
| body | 1188 |
| cabin | 1014 |
| boat | 823 |
| age | 263 |
| embarked | 2 |
| fare | 1 |
| pclass | 0 |
| survived | 0 |
| name | 0 |
| sex | 0 |
| sibsp | 0 |
| parch | 0 |
| ticket | 0 |
# create bar plot of missing values
ggplot(missing_df , aes(x = reorder(var_name, missing_count), y = missing_count)) +
geom_bar(stat = "identity", fill= "darkblue") +
labs(title = "Missing Values Count",x = "Variable",y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
There are several ways to deal with missing values. For example, the median can be used when we want to avoid the effect of extreme values, or the mean can be used in the case of a normal distribution. In our case, 263 values are missing. To make the results more accurate, we will examine the use of the median or the average as functions of ‘pclass’ and ‘sex’, with the reasonable assumption that there is a correlation between them
Let’s start with the histogram of ‘age’ variable
# create a histogram of 'age' ignore NA
p_old <- ggplot(titanic, aes(x = age)) +
geom_histogram(fill = "darkblue", color = "black",na.rm = TRUE) +
labs(title = "Histogram of Age", x = "Age", y = "Frequency")
p_old
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
In our case, it seems to make sense to use the median (by ‘pclass’ and ‘sex’), so we will fill in the missing values accordingly.
missing_age<-titanic %>%
group_by(pclass,sex) %>%
summarise(
mean_age = mean(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE)
)
## `summarise()` has grouped output by 'pclass'. You can override using the
## `.groups` argument.
titanic <- titanic %>%
left_join(missing_age, by = c("pclass", "sex"))
titanic$age<-ifelse(is.na(titanic$age),titanic$median_age,titanic$age)
titanic<-titanic %>%
select(-median_age,-mean_age)
Now let’s examine how the new ‘age’ distribution looks like
# create a new histogram of 'age' ignore NA
p_new <-ggplot(titanic, aes(x = age)) +
geom_histogram(fill = "darkred", color = "black",na.rm = TRUE) +
labs(title = "NEW - Histogram of Age", x = "Age", y = "Frequency")
combined_plots <- grid.arrange(p_new,p_old, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
combined_plots
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
We have only one missing observation for the variable ‘fare’. Using mean or median will not have a significant effect in this case. However, we will use the median to avoid extreme values that pull the average upwards.
# create a histogram of 'fare' ignore NA
ggplot(titanic, aes(x = fare)) +
geom_histogram(fill = "darkblue", color = "black",na.rm = TRUE) +
labs(title = "Histogram of Fare", x = "Fare", y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cat("Median of fare:\n", median(titanic$fare,na.rm = TRUE))
## Median of fare:
## 14.5
cat("Mean of fare:\n", mean(titanic$fare,na.rm = TRUE))
## Mean of fare:
## 33.3
titanic$fare<-ifelse(is.na(titanic$fare),median(titanic$fare,na.rm = TRUE),titanic$fare)
We have two missing values for the ‘embarked’ variable. To complete them, I will use the most common value under the assumption that this is the most probable option.
# create a bar plot of 'embarked' ignore NA
ggplot(subset(titanic, !is.na(embarked)), aes(x = embarked)) +
geom_bar(fill = "darkblue") +
labs(title = "Embarked count", x = "Embarked", y = "Count")
titanic$embarked <- replace(titanic$embarked, is.na(titanic$embarked), "S")
Double-check for missing values and remove columns that should not affect the chance of survival (‘name’, ‘boat’, ‘body’, ‘ticket’). Note - The variable ‘cabin’ has 1014 missing observations. While it could be assumed that any missing observation indicates a passenger without a cabin, I find this assumption unreasonable. Therefore, I have chosen to remove the column to avoid artificially affecting the results
# removing unnecessary columns
titanic<-titanic %>%
select(-name,-boat,-body,-cabin,-ticket)
# check again for missing values in the data
any_missing <- any(is.na(titanic))
cat("Any missing values in the titanicset:\n", any_missing)
## Any missing values in the titanicset:
## FALSE
In this chapter, I will examine the relationship of different variables and ‘survived’ to get an initial indication of the relationship and influence between them.
# survival count
ggplot(titanic, aes(x = survived,fill =survived)) +
geom_bar() +
labs(title = "Survival Count", x = "Survived", y = "Count")
# survival by gender
ggplot(titanic, aes(x = survived, fill = sex)) +
geom_bar() +
labs(title = "Survival by Gender", x = "Survived", y = "Count") +
scale_fill_discrete(name = "Gender", labels = c("Female", "Male")) +
geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# survival by passenger class
ggplot(titanic, aes(x = survived, fill = pclass)) +
geom_bar() +
labs(title = "Survival by Passenger Class", x = "Survived", y = "Count") +
scale_fill_discrete(name = "Passenger Class", labels = c("1st", "2nd", "3rd"))+
geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())
# survival by embarked Port
ggplot(titanic, aes(x = survived, fill = embarked)) +
geom_bar() +
labs(title = "Survival by Embarked Port", x = "Survived", y = "Count") +
scale_fill_discrete(name = "Embarked Port", labels = c("Cherbourg", "Queenstown", "Southampton"))+
geom_text(aes(label = ..count..), stat = "count", vjust = 1.5, colour = "white",position = position_stack())
# survival by age
ggplot(titanic, aes(x = age,fill = survived )) +
geom_histogram() +
labs(title = "Survival by Age", x = "Age", y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Step 1: Split data into training and test sets (80-20)
Step 2:
Train the model
Step 3: Test the model
Step 4: Visualization of
the results
# split the data
set.seed(42) # for reproducibility
split <- sample.split(titanic$survived, SplitRatio = 0.8)
train_data <- subset(titanic, split == TRUE)
test_data <- subset(titanic, split == FALSE)
# train a logistic regression model
log_model <- glm(survived ~ pclass + sex + age + sibsp + parch + fare + embarked,
data = train_data, family = binomial)
# test the model
test_data$predicted <- predict(log_model, newdata = test_data, type = "response")
test_data$predicted <- ifelse(test_data$predicted > 0.5, 1, 0)
model_output <- summary(log_model)
stargazer(log_model, type = "html", title = "Titanic Logistic Regression Model Results")
| Dependent variable: | |
| survived | |
| pclass2 | -1.000*** |
| (0.274) | |
| pclass3 | -2.060*** |
| (0.277) | |
| sexmale | -2.620*** |
| (0.183) | |
| age | -0.038*** |
| (0.007) | |
| sibsp | -0.322*** |
| (0.100) | |
| parch | -0.068 |
| (0.101) | |
| fare | 0.001 |
| (0.002) | |
| embarkedQ | -0.859** |
| (0.346) | |
| embarkedS | -0.662*** |
| (0.208) | |
| Constant | 4.160*** |
| (0.439) | |
| Observations | 1,047 |
| Log Likelihood | -481.000 |
| Akaike Inf. Crit. | 983.000 |
| Note: | p<0.1; p<0.05; p<0.01 |
model_accuracy <- test_data %>%
summarise(
actual_1_count = sum(survived == 1),
predicted_1_correct = sum(ifelse(survived == 1 & predicted==1, 1, 0)),
actual_0_count = sum(survived == 0),
predicted_0_correct = sum(ifelse(survived == 0 & predicted==0, 1, 0))
)
kable(model_accuracy, format = "html", escape = FALSE, align = "c") %>%
kable_styling()
| actual_1_count | predicted_1_correct | actual_0_count | predicted_0_correct |
|---|---|---|---|
| 100 | 73 | 162 | 140 |
cat("Total correct prediction:",
sprintf("%.2f%%",
((model_accuracy$predicted_1_correct+model_accuracy$predicted_0_correct) /
(model_accuracy$actual_1_count+model_accuracy$actual_0_count)*100)))
Total correct prediction: 81.30%
cat("Correct survival prediction percentage:",
sprintf("%.2f%%",
((model_accuracy$predicted_1_correct / model_accuracy$actual_1_count)*100)))
Correct survival prediction percentage: 73.00%
cat("Correct not survival prediction percentage:",
sprintf("%.2f%%",
((model_accuracy$predicted_0_correct / model_accuracy$actual_0_count)*100)))
Correct not survival prediction percentage: 86.42%