Kaggle: Titanic Dataset EDA (May 2026)

Titanic Dataset Overview
survival	survival	0=No, 1=Yes
pclass	ticket class	1=1st, 2=2nd, 3=3rd
sex	sex
age	age in years
sibsp	# of siblings / spouses aboard Titanic
parch	# of parents / children aboard the Titanic
ticket	ticket number
fare	passenger fare
cabin	cabin number
embarked	port of embarkation	C=Cherbourg, Q=Queenstown, S=Southhampton
pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower
age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
sibsp: The dataset defines family relations in this way… Sibling = brother, sister, stepbrother, stepsister Spouse = husband, wife (mistresses and fiancés were ignored)
parch: The dataset defines family relations in this way… Parent = mother, father Child = daughter, son, stepdaughter, stepson Some children travelled only with a nanny, therefore parch=0 for them.

Data Exploration

Missing Values

We begin first by identifying missing variables within each column. Both the training and test data sets will be evaluated. The test set is only explored to ensure that categorical variables do not differ from the training set.

train <- read.csv("train.csv")
test <- read.csv("test.csv")

missing_train <- train |>
  dplyr::mutate(across(where(is.character), ~ dplyr::na_if(.x, ""))) |>
  dplyr::summarise(across(everything(), ~ sum(is.na(.x)))) |>
  tidyr::pivot_longer(cols = everything(), names_to = "Variables", values_to = "Total_train")

missing_test <- test |>
  dplyr::mutate(across(where(is.character), ~ dplyr::na_if(.x, ""))) |>
  dplyr::summarise(across(everything(), ~ sum(is.na(.x)))) |>
  tidyr::pivot_longer(cols = everything(), names_to = "Variables", values_to = "Total_test")

missing <- dplyr::left_join(missing_train, missing_test, by="Variables")

missing |> gt()

Variables	Total_train	Total_test
PassengerId	0	0
Survived	0	NA
Pclass	0	0
Name	0	0
Sex	0	0
Age	177	86
SibSp	0	0
Parch	0	0
Ticket	0	0
Fare	0	1
Cabin	687	327
Embarked	2	0

From the table above, we can see that age requires a suitable method to fill in the missing values. However, Fare will also require treatment, in order to accurately predict the target variable. Let us now deep dive into each of the variables.

train <- train |>
  dplyr::mutate(
    Survived = dplyr::if_else(Survived == "1", "Survived", "Died"),
    Pclass   = paste("Class", Pclass))

ggplot(train, aes(y = factor(Pclass), fill = factor(Survived), color = factor(Survived))) +
  geom_bar(position = "dodge", alpha=0.3) +
  facet_wrap(~Sex) + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  scale_color_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  labs(
    title = "Survival By Gender, per Class",
    y = "Passenger Class",
    x = "Total"
  ) +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(
    plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
    plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
    plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
    plot.margin = margin(16, 16, 16, 16),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_text(color = "grey30", size = 10),
    axis.text = element_text(color = "grey30"),
    axis.ticks.x = element_line(color = "grey80"),
    legend.position = "bottom",
    legend.title = element_blank()
  )

From the plot below, we observe that the majority of people embarked in Southampton (S). Therefore, any missing values will be imputed with Southampton.

ggplot(train, aes(y = factor(Embarked), fill = factor(Pclass), color = factor(Pclass))) +
  geom_bar(position = "dodge", alpha = 0.3) +
  #facet_wrap(~Sex) + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  labs(
    title = "Embarkated Locations, by Class",
    x = "Total Count",
    y = "Embark Location"
  ) +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(
    plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
    plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
    plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
    plot.margin = margin(16, 16, 16, 16),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_text(color = "grey30", size = 10),
    axis.text = element_text(color = "grey30"),
    axis.ticks.x = element_line(color = "grey80"),
    legend.position = c(0.9, 0.15),
    legend.title = element_blank()
  )

Titles

When reviewing the titles, we observe that there are 17 distinct titles. However, many occur infrequently, or are simply translations of the same title repeated. The number of titles can be significantly reduced to 5 groups:

Mr
Mrs
Ms
Miss
Master

The separation between Ms and Miss is designed to helped us distinguish the likely age (for the missing values) between girls under the age of 18, and women over the age of 18.

train$Title <- str_extract(train$Name, "[A-Za-z]+(?=\\.)")

train |>
  dplyr::group_by(Title, Survived) |>
  dplyr::summarise(total = n(), .groups = "drop") |>
  tidyr::pivot_wider(names_from = Survived, values_from = total, values_fill = 0) |>
  gt()

Title	Died	Survived
Capt	1	0
Col	1	1
Countess	0	1
Don	1	0
Dr	4	3
Jonkheer	1	0
Lady	0	1
Major	1	1
Master	17	23
Miss	55	127
Mlle	0	2
Mme	0	1
Mr	436	81
Mrs	26	99
Ms	0	1
Rev	6	0
Sir	0	1

train <- train %>%
  dplyr::mutate(
    New_Title = dplyr::case_when(
      Title %in% c("Don", "Rev", "Dr", "Major", "Sir", "Col", "Capt", "Jonkheer") ~ "Mr",
      Title %in% c("Mme", "Countess", "Lady", "Dona") ~ "Mrs",
      Title %in% c("Mlle", "Ms") ~ "Miss",
      TRUE ~ Title
    ),
    # Split Miss by age, fall back to Parch when Age is NA
    New_Title = dplyr::case_when(
      New_Title == "Miss" & !is.na(Age) & Age <  18 ~ "Ms",
      New_Title == "Miss" & !is.na(Age) & Age >= 18 ~ "Miss",
      New_Title == "Miss" &  is.na(Age) & Parch >  0 ~ "Ms",
      New_Title == "Miss" &  is.na(Age) & Parch == 0 ~ "Miss",
      TRUE ~ New_Title
    ),
    Travel_Alone = dplyr::if_else(SibSp == 0 & Parch == 0, 1L, 0L)
  )

survival_counts <- train |>
  dplyr::group_by(New_Title, Pclass, Survived) |>
  dplyr::summarise(total = n(), .groups = "drop") |>
  tidyr::pivot_wider(names_from = Survived, values_from = total, values_fill = 0)

age_stats <- train |>
  dplyr::group_by(New_Title, Pclass) |>
  dplyr::summarise(
    min_age    = round(min(Age, na.rm = TRUE),0),
    median_age = round(median(Age, na.rm = TRUE),0),
    mean_age   = round(mean(Age, na.rm = TRUE),0),
    max_age    = round(max(Age, na.rm = TRUE),0),
    .groups = "drop"
  )

survival_counts <- survival_counts |>
  dplyr::left_join(age_stats, by = c("New_Title", "Pclass"))

survival_counts |>
  tidyr::pivot_longer(
    cols      = c(Survived, Died),
    names_to  = "Survived",
    values_to = "total"
  ) |>
  ggplot(aes(y = New_Title, x = total, fill = Survived, colour = Survived)) +
  geom_col(position = "dodge", alpha = 0.3) +
  facet_wrap(~Pclass) +
  scale_fill_manual(values   = c("#E69F00", "#56B4E9")) +
  scale_colour_manual(values = c("#E69F00", "#56B4E9")) +
  labs(
    title   = "Survival Counts by Title and Passenger Class",
    x       = "Count",
    y       = "Title"
  ) +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(
    plot.title          = element_text(face = "bold", size = 14, margin = margin(b = 4)),
    plot.subtitle       = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
    plot.caption        = element_text(color = "grey60", size = 9, hjust = 0),
    plot.margin         = margin(16, 16, 16, 16),
    panel.grid.major.x  = element_blank(),
    panel.grid.minor    = element_blank(),
    axis.title          = element_text(color = "grey30", size = 10),
    axis.text           = element_text(color = "grey30"),
    axis.text.x         = element_text(angle = 0, hjust = 1),  # titles overlap without this
    axis.ticks.x        = element_line(color = "grey80"),
    legend.position     = "bottom",
    legend.title        = element_blank()
  )

ggplot(data = train, aes(y = New_Title, x = Age, fill = New_Title, color = New_Title)) +
  geom_boxplot(position = "dodge", alpha = 0.3) +
  facet_wrap(~Pclass) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  labs(
    title   = "Age Ranges by Title",
    x       = "Age Range",
    y       = "Title"
  ) +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(
    plot.title          = element_text(face = "bold", size = 14, margin = margin(b = 4)),
    plot.subtitle       = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
    plot.caption        = element_text(color = "grey60", size = 9, hjust = 0),
    plot.margin         = margin(16, 16, 16, 16),
    panel.grid.major.x  = element_blank(),
    panel.grid.minor    = element_blank(),
    axis.title          = element_text(color = "grey30", size = 10),
    axis.text           = element_text(color = "grey30"),
    axis.text.x         = element_text(angle = 0, hjust = 1),  # titles overlap without this
    axis.ticks.x        = element_line(color = "grey80"),
    legend.position     = "None",
    legend.title        = element_blank()
  )

Fare by Class

Since we are missing a Fare value in test set, we need to identify a strategy to fill this value in. 115 tickets overlap in both the training and the test data set. We therefore will determine the per passenger price using the formula

\[ \text{Price per Passenger} = \frac{\text{Fare}}{\text{SibSp} + \text{Parch} + 1} \]

tickets_train = unique(train$Ticket)
tickets_test = unique(test$Ticket)

#any(tickets_train %in% tickets_test) 

# How many overlap?
print(paste('Total overlapping ticket numbers:', sum(tickets_train %in% tickets_test)))

## [1] "Total overlapping ticket numbers: 115"

train <- train |> 
  dplyr::mutate(price_per_ticket = Fare / (SibSp + Parch + 1),
                cabin_letter = if_else(is.na(Cabin) | str_trim(Cabin) == "", "M", str_sub(Cabin, 1, 1))
                )

ggplot(data=train, aes(y=factor(Pclass), x=price_per_ticket, colour=Sex, fill=Sex)) +
  geom_boxplot(alpha=0.3)  + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  labs(
    title = "Price Per Ticket, by Class and Gender",
    y = "Passenger Class",
    x = "Total"
  ) +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(
    plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
    plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
    plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
    plot.margin = margin(16, 16, 16, 16),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_text(color = "grey30", size = 10),
    axis.text = element_text(color = "grey30"),
    axis.ticks.x = element_line(color = "grey80"),
    legend.position = "bottom",
    legend.title = element_blank()
  )

First class tickets show some strong variation, notably among women. Zooming in on 1st class passengers, we can see female passengers appear to pay more for B cabins, and where cabin information is missing. Without further analysis, it is impossible to determine why these prices where higher (e.g. more luxurious cabins, closer to amenities, etc.)

first_class <- train |>
  dplyr::filter(Pclass == "Class 1")

ggplot(data=first_class, aes(y=cabin_letter, x=price_per_ticket, colour=Sex, fill=Sex)) +
  geom_boxplot(alpha=0.3)  + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  labs(
    title = "First Class Price Per Ticket, by Gender",
    y = "Cabin Location",
    x = "Ticket Price"
  ) +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(
    plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
    plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
    plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
    plot.margin = margin(16, 16, 16, 16),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_text(color = "grey30", size = 10),
    axis.text = element_text(color = "grey30"),
    axis.ticks.x = element_line(color = "grey80"),
    legend.position = "bottom",
    legend.title = element_blank()
  )

Survival by Cabin Location

Here, we want to determine if the provided cabin locations may have played a role in improving survivability. Interestingly, cabin located in B, C, D, and E locations appear to have higher survival rates. Survival rates by class and cabin information does not convey any meaningful information.

ggplot(train, aes(y = cabin_letter, fill = factor(Survived), color = factor(Survived))) +
  geom_bar(position = "dodge", alpha=0.3) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  labs(
    title = "First Class Survival, by Cabin Group",
    y = "Cabin Location",
    x = "Total"
  ) +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(
    plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
    plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
    plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
    plot.margin = margin(16, 16, 16, 16),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_text(color = "grey30", size = 10),
    axis.text = element_text(color = "grey30"),
    axis.ticks.x = element_line(color = "grey80"),
    legend.position = "bottom",
    legend.title = element_blank()
  )

ggplot(train, aes(y = cabin_letter, fill = factor(Survived), color = factor(Survived))) +
  geom_bar(position = "dodge", alpha = 0.3) +
  facet_wrap(~Pclass) + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  scale_colour_manual(values = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 
                                "#0072B2", "#D55E00", "#CC79A7")) +
  labs(
    title = "Survival by Cabin Location and Passenger Class",
    y = "Cabin Location",
    x = "Total"
  ) +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(
    plot.title = element_text(face = "bold", size = 14, margin = margin(b = 4)),
    plot.subtitle = element_text(color = "grey40", size = 11, margin = margin(b = 12)),
    plot.caption = element_text(color = "grey60", size = 9, hjust = 0),
    plot.margin = margin(16, 16, 16, 16),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_text(color = "grey30", size = 10),
    axis.text = element_text(color = "grey30"),
    axis.ticks.x = element_line(color = "grey80"),
    legend.position = "bottom",
    legend.title = element_blank()
  )