library(dplyr)
library(ggplot2)
library(gt)
library(ggiraph)
library(tidyr)
library(tidyverse)
The below table provides an overview on the Titanic dataset, as provided by Kaggle.
| Titanic Dataset Overview | ||
| Kaggle - May 2026 | ||
| Variable | Definition | Key |
|---|---|---|
| survival | survival | 0=No, 1=Yes |
| pclass | ticket class | 1=1st, 2=2nd, 3=3rd |
| sex | sex | |
| age | age in years | |
| sibsp | # of siblings / spouses aboard Titanic | |
| parch | # of parents / children aboard the Titanic | |
| ticket | ticket number | |
| fare | passenger fare | |
| cabin | cabin number | |
| embarked | port of embarkation | C=Cherbourg, Q=Queenstown, S=Southhampton |
| pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower | ||
| age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5 | ||
| sibsp: The dataset defines family relations in this way… Sibling = brother, sister, stepbrother, stepsister Spouse = husband, wife (mistresses and fiancés were ignored) | ||
| parch: The dataset defines family relations in this way… Parent = mother, father Child = daughter, son, stepdaughter, stepson Some children travelled only with a nanny, therefore parch=0 for them. | ||
We begin first by identifying missing variables within each column. Both the training and test data sets will be evaluated. The test set is only explored to ensure that categorical variables do not differ from the training set.
train <- read.csv("train.csv")
test <- read.csv("test.csv")
missing_train <- train |>
dplyr::mutate(across(where(is.character), ~ dplyr::na_if(.x, ""))) |>
dplyr::summarise(across(everything(), ~ sum(is.na(.x)))) |>
tidyr::pivot_longer(cols = everything(), names_to = "Variables", values_to = "Total_train")
missing_test <- test |>
dplyr::mutate(across(where(is.character), ~ dplyr::na_if(.x, ""))) |>
dplyr::summarise(across(everything(), ~ sum(is.na(.x)))) |>
tidyr::pivot_longer(cols = everything(), names_to = "Variables", values_to = "Total_test")
missing <- dplyr::left_join(missing_train, missing_test, by="Variables")
missing |> gt()
| Variables | Total_train | Total_test |
|---|---|---|
| PassengerId | 0 | 0 |
| Survived | 0 | NA |
| Pclass | 0 | 0 |
| Name | 0 | 0 |
| Sex | 0 | 0 |
| Age | 177 | 86 |
| SibSp | 0 | 0 |
| Parch | 0 | 0 |
| Ticket | 0 | 0 |
| Fare | 0 | 1 |
| Cabin | 687 | 327 |
| Embarked | 2 | 0 |
From the table above, we can see that age requires a suitable method to fill in the missing values. However, Fare will also require treatment, in order to accurately predict the target variable. Let us now deep dive into each of the variables.
train <- train |>
dplyr::mutate(
Survived = dplyr::if_else(Survived == "1", "Survived", "Died"),
Pclass = paste("Class", Pclass))
ggplot(train, aes(y = factor(Pclass), fill = factor(Survived), color = factor(Survived))) +
geom_bar(position = "dodge", alpha=0.3) +
facet_wrap(~Sex) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
scale_color_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
labs(
title = "Survival By Gender, per Class",
y = "Passenger Class",
x = "Total"
) +
theme_minimal(base_size = 12, base_family = "sans") +
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
plot.margin = margin(16, 16, 16, 16),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_text(color = "grey30", size = 10),
axis.text = element_text(color = "grey30"),
axis.ticks.x = element_line(color = "grey80"),
legend.position = "bottom",
legend.title = element_blank()
)
From the plot below, we observe that the majority of people embarked in Southampton (S). Therefore, any missing values will be imputed with Southampton.
ggplot(train, aes(y = factor(Embarked), fill = factor(Pclass), color = factor(Pclass))) +
geom_bar(position = "dodge", alpha = 0.3) +
#facet_wrap(~Sex) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
labs(
title = "Embarkated Locations, by Class",
x = "Total Count",
y = "Embark Location"
) +
theme_minimal(base_size = 12, base_family = "sans") +
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
plot.margin = margin(16, 16, 16, 16),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_text(color = "grey30", size = 10),
axis.text = element_text(color = "grey30"),
axis.ticks.x = element_line(color = "grey80"),
legend.position = c(0.9, 0.15),
legend.title = element_blank()
)
When reviewing the titles, we observe that there are 17 distinct titles. However, many occur infrequently, or are simply translations of the same title repeated. The number of titles can be significantly reduced to 5 groups:
The separation between Ms and Miss is designed to helped us distinguish the likely age (for the missing values) between girls under the age of 18, and women over the age of 18.
train$Title <- str_extract(train$Name, "[A-Za-z]+(?=\\.)")
train |>
dplyr::group_by(Title, Survived) |>
dplyr::summarise(total = n(), .groups = "drop") |>
tidyr::pivot_wider(names_from = Survived, values_from = total, values_fill = 0) |>
gt()
| Title | Died | Survived |
|---|---|---|
| Capt | 1 | 0 |
| Col | 1 | 1 |
| Countess | 0 | 1 |
| Don | 1 | 0 |
| Dr | 4 | 3 |
| Jonkheer | 1 | 0 |
| Lady | 0 | 1 |
| Major | 1 | 1 |
| Master | 17 | 23 |
| Miss | 55 | 127 |
| Mlle | 0 | 2 |
| Mme | 0 | 1 |
| Mr | 436 | 81 |
| Mrs | 26 | 99 |
| Ms | 0 | 1 |
| Rev | 6 | 0 |
| Sir | 0 | 1 |
train <- train %>%
dplyr::mutate(
New_Title = dplyr::case_when(
Title %in% c("Don", "Rev", "Dr", "Major", "Sir", "Col", "Capt", "Jonkheer") ~ "Mr",
Title %in% c("Mme", "Countess", "Lady", "Dona") ~ "Mrs",
Title %in% c("Mlle", "Ms") ~ "Miss",
TRUE ~ Title
),
# Split Miss by age, fall back to Parch when Age is NA
New_Title = dplyr::case_when(
New_Title == "Miss" & !is.na(Age) & Age < 18 ~ "Ms",
New_Title == "Miss" & !is.na(Age) & Age >= 18 ~ "Miss",
New_Title == "Miss" & is.na(Age) & Parch > 0 ~ "Ms",
New_Title == "Miss" & is.na(Age) & Parch == 0 ~ "Miss",
TRUE ~ New_Title
),
Travel_Alone = dplyr::if_else(SibSp == 0 & Parch == 0, 1L, 0L)
)
survival_counts <- train |>
dplyr::group_by(New_Title, Pclass, Survived) |>
dplyr::summarise(total = n(), .groups = "drop") |>
tidyr::pivot_wider(names_from = Survived, values_from = total, values_fill = 0)
age_stats <- train |>
dplyr::group_by(New_Title, Pclass) |>
dplyr::summarise(
min_age = round(min(Age, na.rm = TRUE),0),
median_age = round(median(Age, na.rm = TRUE),0),
mean_age = round(mean(Age, na.rm = TRUE),0),
max_age = round(max(Age, na.rm = TRUE),0),
.groups = "drop"
)
survival_counts <- survival_counts |>
dplyr::left_join(age_stats, by = c("New_Title", "Pclass"))
survival_counts |>
tidyr::pivot_longer(
cols = c(Survived, Died),
names_to = "Survived",
values_to = "total"
) |>
ggplot(aes(y = New_Title, x = total, fill = Survived, colour = Survived)) +
geom_col(position = "dodge", alpha = 0.3) +
facet_wrap(~Pclass) +
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
scale_colour_manual(values = c("#E69F00", "#56B4E9")) +
labs(
title = "Survival Counts by Title and Passenger Class",
x = "Count",
y = "Title"
) +
theme_minimal(base_size = 12, base_family = "sans") +
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
plot.margin = margin(16, 16, 16, 16),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_text(color = "grey30", size = 10),
axis.text = element_text(color = "grey30"),
axis.text.x = element_text(angle = 0, hjust = 1), # titles overlap without this
axis.ticks.x = element_line(color = "grey80"),
legend.position = "bottom",
legend.title = element_blank()
)
ggplot(data = train, aes(y = New_Title, x = Age, fill = New_Title, color = New_Title)) +
geom_boxplot(position = "dodge", alpha = 0.3) +
facet_wrap(~Pclass) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
labs(
title = "Age Ranges by Title",
x = "Age Range",
y = "Title"
) +
theme_minimal(base_size = 12, base_family = "sans") +
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
plot.margin = margin(16, 16, 16, 16),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_text(color = "grey30", size = 10),
axis.text = element_text(color = "grey30"),
axis.text.x = element_text(angle = 0, hjust = 1), # titles overlap without this
axis.ticks.x = element_line(color = "grey80"),
legend.position = "None",
legend.title = element_blank()
)
Since we are missing a Fare value in test set, we need to identify a strategy to fill this value in. 115 tickets overlap in both the training and the test data set. We therefore will determine the per passenger price using the formula
\[ \text{Price per Passenger} = \frac{\text{Fare}}{\text{SibSp} + \text{Parch} + 1} \]
tickets_train = unique(train$Ticket)
tickets_test = unique(test$Ticket)
#any(tickets_train %in% tickets_test)
# How many overlap?
print(paste('Total overlapping ticket numbers:', sum(tickets_train %in% tickets_test)))
## [1] "Total overlapping ticket numbers: 115"
train <- train |>
dplyr::mutate(price_per_ticket = Fare / (SibSp + Parch + 1),
cabin_letter = if_else(is.na(Cabin) | str_trim(Cabin) == "", "M", str_sub(Cabin, 1, 1))
)
ggplot(data=train, aes(y=factor(Pclass), x=price_per_ticket, colour=Sex, fill=Sex)) +
geom_boxplot(alpha=0.3) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
labs(
title = "Price Per Ticket, by Class and Gender",
y = "Passenger Class",
x = "Total"
) +
theme_minimal(base_size = 12, base_family = "sans") +
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
plot.margin = margin(16, 16, 16, 16),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_text(color = "grey30", size = 10),
axis.text = element_text(color = "grey30"),
axis.ticks.x = element_line(color = "grey80"),
legend.position = "bottom",
legend.title = element_blank()
)
First class tickets show some strong variation, notably among women. Zooming in on 1st class passengers, we can see female passengers appear to pay more for B cabins, and where cabin information is missing. Without further analysis, it is impossible to determine why these prices where higher (e.g. more luxurious cabins, closer to amenities, etc.)
first_class <- train |>
dplyr::filter(Pclass == "Class 1")
ggplot(data=first_class, aes(y=cabin_letter, x=price_per_ticket, colour=Sex, fill=Sex)) +
geom_boxplot(alpha=0.3) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
labs(
title = "First Class Price Per Ticket, by Gender",
y = "Cabin Location",
x = "Ticket Price"
) +
theme_minimal(base_size = 12, base_family = "sans") +
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
plot.margin = margin(16, 16, 16, 16),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_text(color = "grey30", size = 10),
axis.text = element_text(color = "grey30"),
axis.ticks.x = element_line(color = "grey80"),
legend.position = "bottom",
legend.title = element_blank()
)
Here, we want to determine if the provided cabin locations may have played a role in improving survivability. Interestingly, cabin located in B, C, D, and E locations appear to have higher survival rates. Survival rates by class and cabin information does not convey any meaningful information.
ggplot(train, aes(y = cabin_letter, fill = factor(Survived), color = factor(Survived))) +
geom_bar(position = "dodge", alpha=0.3) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
labs(
title = "First Class Survival, by Cabin Group",
y = "Cabin Location",
x = "Total"
) +
theme_minimal(base_size = 12, base_family = "sans") +
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
plot.margin = margin(16, 16, 16, 16),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_text(color = "grey30", size = 10),
axis.text = element_text(color = "grey30"),
axis.ticks.x = element_line(color = "grey80"),
legend.position = "bottom",
legend.title = element_blank()
)
ggplot(train, aes(y = cabin_letter, fill = factor(Survived), color = factor(Survived))) +
geom_bar(position = "dodge", alpha = 0.3) +
facet_wrap(~Pclass) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7")) +
labs(
title = "Survival by Cabin Location and Passenger Class",
y = "Cabin Location",
x = "Total"
) +
theme_minimal(base_size = 12, base_family = "sans") +
theme(
plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
plot.margin = margin(16, 16, 16, 16),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_text(color = "grey30", size = 10),
axis.text = element_text(color = "grey30"),
axis.ticks.x = element_line(color = "grey80"),
legend.position = "bottom",
legend.title = element_blank()
)