# QBS181 Final Project – Aim 1: Exploratory Analysis of Titanic Dataset

# The pipe, mutate(), drop_na() and every ggplot() call below come from the
# tidyverse, so it has to be attached before the first pipeline runs.
library(tidyverse)

# Read the cleaned passenger table.
titanic <- read.csv("Total_Cleaned.csv")

# Recode the categorical columns as factors, then keep only rows that have
# both Age and Fare.
# NOTE(review): factor(labels = ...) assigns labels to the sorted distinct
# values, so this assumes Survived has exactly two values and Pclass exactly
# three - confirm against the CSV.
titanic <- titanic %>%
  mutate(
    Survived = factor(Survived, labels = c("No", "Yes")),
    Pclass = factor(Pclass, labels = c("1st", "2nd", "3rd")),
    Sex = as.factor(Sex),
    Embarked = as.factor(Embarked)
  ) %>%
  drop_na(Age, Fare)

# 100% stacked bars: share of each survival outcome within each sex.
ggplot(data = titanic, mapping = aes(x = Sex, fill = Survived)) +
  geom_bar(position = position_fill()) +
  theme_minimal(base_size = 14) +
  scale_fill_viridis_d(option = "E") +
  labs(
    x = "Sex",
    y = "Proportion",
    title = "Survival Rate by Gender"
  )

# 100% stacked bars: share of each survival outcome within each passenger
# class.
ggplot(data = titanic, mapping = aes(x = Pclass, fill = Survived)) +
  geom_bar(position = position_fill()) +
  theme_minimal(base_size = 14) +
  scale_fill_viridis_d(option = "C") +
  labs(
    x = "Class",
    y = "Proportion",
    title = "Survival Rate by Passenger Class"
  )

# 100% stacked bars: share of each survival outcome within each port of
# embarkation.
ggplot(data = titanic, mapping = aes(x = Embarked, fill = Survived)) +
  geom_bar(position = position_fill()) +
  theme_minimal(base_size = 14) +
  scale_fill_viridis_d(option = "D") +
  labs(
    x = "Port",
    y = "Proportion",
    title = "Survival Rate by Embarked Port"
  )

# Fare distribution per survival outcome: untrimmed violins with a narrow
# boxplot on top; boxplot outliers are drawn in red.
ggplot(titanic, aes(x = Survived, y = Fare, fill = Survived)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_boxplot(width = 0.15, outlier.color = "red", alpha = 0.7) +
  # dollar_format() lives in scales, which this script never attaches
  # (tidyverse does not attach it), so call it through its namespace.
  scale_y_continuous(labels = scales::dollar_format(prefix = "$")) +
  scale_fill_viridis_d() +
  labs(
    title = "Fare Distribution by Survival Status",
    y = "Fare",
    x = "Survived"
  ) +
  theme_minimal(base_size = 14)

# Age distribution per survival outcome: untrimmed violins with a narrow
# boxplot on top; boxplot outliers are drawn in red.
ggplot(
  data = titanic,
  mapping = aes(x = Survived, y = Age, fill = Survived)
) +
  geom_violin(alpha = 0.5, trim = FALSE) +
  geom_boxplot(alpha = 0.7, width = 0.15, outlier.color = "red") +
  theme_minimal(base_size = 14) +
  scale_fill_viridis_d() +
  labs(
    x = "Survived",
    y = "Age",
    title = "Age Distribution by Survival"
  )

# Survival share per passenger class, drawn in one panel per sex.
ggplot(data = titanic, mapping = aes(x = Pclass, fill = Survived)) +
  geom_bar(position = position_fill()) +
  facet_wrap(vars(Sex)) +
  theme_minimal(base_size = 14) +
  scale_fill_viridis_d(option = "D") +
  labs(
    x = "Class",
    y = "Proportion",
    title = "Survival by Gender and Passenger Class"
  )

# Scatter of fare against age; colour encodes survival, point shape encodes
# sex.
ggplot(
  data = titanic,
  mapping = aes(x = Age, y = Fare, color = Survived, shape = Sex)
) +
  geom_point(size = 2.5, alpha = 0.6) +
  theme_light(base_size = 14) +
  scale_color_viridis_d() +
  labs(
    x = "Age",
    y = "Fare ($)",
    title = "Fare vs Age Colored by Survival and Sex"
  )

# Family size = siblings/spouses + parents/children + the passenger.
titanic <- mutate(titanic, FamilySize = SibSp + Parch + 1)

# 100% stacked bars: share of each survival outcome at each family size.
ggplot(data = titanic, mapping = aes(x = FamilySize, fill = Survived)) +
  geom_bar(position = position_fill()) +
  theme_minimal(base_size = 14) +
  scale_fill_viridis_d(option = "E") +
  labs(
    x = "Family Size",
    y = "Proportion",
    title = "Survival Rate by Family Size"
  )

# Numeric feature table. across() replaces the superseded mutate_all().
num_vars <- titanic %>%
  select(Age, Fare, SibSp, Parch, FamilySize) %>%
  mutate(across(everything(), as.numeric))

# Pairwise correlations, computed only on rows with no missing value.
cor_matrix <- cor(num_vars, use = "complete.obs")

# Long format with columns Var1, Var2 and value (one row per matrix cell).
# Base R is used here because no package providing melt() is loaded in this
# script; as.data.frame() on a table yields the same column layout.
cor_melt <- as.data.frame(as.table(cor_matrix), responseName = "value")

# Heatmap of the correlation matrix; each tile is labelled with its
# coefficient rounded to two decimals.
ggplot(data = cor_melt, mapping = aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(
    mapping = aes(label = round(value, 2)),
    size = 4,
    color = "white"
  ) +
  theme_minimal(base_size = 14) +
  scale_fill_viridis_c(option = "A") +
  labs(
    x = "",
    y = "",
    title = "Correlation Heatmap of Numeric Features"
  )

## 🔟 Age Distribution by Sex (Histogram + Density Overlay)

# Use Titanic dataset
df <- titanic

# Mean age per sex, drawn as a dashed reference line in each facet
mu <- df %>%
  group_by(Sex) %>%
  summarise(grp.mean = mean(Age, na.rm = TRUE))

# Facet strip labels in upper case
capitalize_labeller <- as_labeller(toupper)

# One explicit bin width shared by the histogram and the density overlay, so
# the two layers are on the same count scale by construction.
n_bins <- 30
age_range <- diff(range(df$Age, na.rm = TRUE))
bin_width <- age_range / n_bins

# Histogram + density + mean line, one panel per sex.
# The density is rescaled per group: stat_density's `count` is
# density * (number of points in that group), and multiplying by the bin width
# turns it into expected counts per bin. Scaling by the total row count
# instead would inflate the curve in every facet, because each facet only
# holds one sex.
ggplot(df, aes(x = Age, color = Sex, fill = Sex)) +
  geom_histogram(position = "identity", alpha = 0.5, binwidth = bin_width) +
  geom_density(aes(y = after_stat(count) * bin_width), alpha = 0.6) +
  geom_vline(
    data = mu,
    aes(xintercept = grp.mean, color = Sex),
    linetype = "dashed"
  ) +
  scale_color_brewer(palette = "Dark2") +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(~ Sex, ncol = 2, labeller = capitalize_labeller) +
  labs(
    title = "Age Distribution by Sex",
    x = "Age (years)",
    y = "Count"
  ) +
  theme_linedraw() +
  theme(
    plot.title = element_text(face = "bold", size = 12, hjust = 0.5),
    strip.text = element_text(face = "bold", size = 10)
  )

# ============================================================
# Survival Rate by Age Group and Sex
# ============================================================

# Load libraries
library(tidyverse)
library(ggpubr)

# Reload the cleaned passenger table from disk
titanic <- read.csv("Total_Cleaned.csv")

# Recode Survived and Sex as factors, bucket Age into three groups, then drop
# rows where Age, Sex or Survived is missing
titanic <- titanic %>%
  mutate(
    Survived = factor(Survived, labels = c("No", "Yes")),
    Sex = factor(Sex),
    # Branches are tried in order, so "Adult" only sees ages that are not
    # below 18; a missing age matches no branch and stays NA
    AgeGroup = case_when(
      Age < 18 ~ "Child",
      Age < 60 ~ "Adult",
      Age >= 60 ~ "Senior"
    )
  ) %>%
  drop_na(Age, Sex, Survived)

# Survival percentage for every AgeGroup x Sex combination.
# .groups = "drop" returns an ungrouped tibble; without it summarise() keeps
# the AgeGroup grouping on the result and emits a regrouping message.
survival_summary <- titanic %>%
  group_by(AgeGroup, Sex) %>%
  summarise(
    SurvivalRate = mean(Survived == "Yes") * 100,
    .groups = "drop"
  )

# Fisher's exact test of Sex against Survived, run separately within each
# age group (one p-value per group)
fisher_results <- titanic %>%
  group_by(AgeGroup) %>%
  summarise(p.value = fisher.test(table(Sex, Survived))$p.value)

# FDR adjustment across the per-group p-values
fisher_results$p.adj <- p.adjust(fisher_results$p.value, method = "fdr")

fisher_results

## # A tibble: 3 × 3
##   AgeGroup   p.value     p.adj
##   <chr>        <dbl>     <dbl>
## 1 Adult    8.16e-131 2.45e-130
## 2 Child    1.59e-  9 2.39e-  9
## 3 Senior   1.57e-  7 1.57e-  7

# Plot data: one row per AgeGroup x Sex with its survival percentage
plot_data <- survival_summary

# One significance label per age group, derived from the FDR-adjusted Fisher
# p-values computed above. Each of those tests compares the sexes *within* an
# age group, so the label belongs above that group - not on a bracket joining
# two different age groups - and must follow the computed p-value rather than
# being typed in by hand.
signif_labels <- fisher_results %>%
  mutate(
    stars = case_when(
      is.na(p.adj) ~ NA_character_,
      p.adj < 0.001 ~ "***",
      p.adj < 0.01 ~ "**",
      p.adj < 0.05 ~ "*",
      TRUE ~ "ns"
    )
  )

# Lollipop chart: survival percentage per age group, coloured by sex, with
# the within-group significance label drawn above each age group
ggplot(plot_data, aes(x = AgeGroup, y = SurvivalRate, color = Sex)) +
  geom_point(size = 4) +
  geom_segment(
    aes(xend = AgeGroup, y = 0, yend = SurvivalRate, color = Sex),
    size = 1
  ) +
  geom_text(
    aes(label = round(SurvivalRate, 1)),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  ) +
  # Stars sit at y = 110, above the 0-100 data range; the y limits below
  # leave that headroom
  geom_text(
    data = signif_labels,
    aes(x = AgeGroup, y = 110, label = stars),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_color_manual(values = c("darkorange2", "seagreen3")) +
  scale_y_continuous(limits = c(0, 120), expand = c(0, 0)) +
  labs(
    title = "Survival Rate by Age Group and Sex",
    subtitle = paste0(
      "Fisher’s Exact Test of sex vs. survival within each age group ",
      "(FDR-adjusted)"
    ),
    x = "Age Group",
    y = "Survival (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "right"
  )

library(tidyverse)
library(ggpubr)

# Reload the cleaned passenger table from disk
titanic <- read.csv("Total_Cleaned.csv")

# Recode Survived as a two-level factor labelled "Died" / "Survived"
titanic <- mutate(
  titanic,
  Survived = factor(Survived, labels = c("Died", "Survived"))
)

# Wilcoxon rank-sum test of Fare between the two survival groups
wilcox_test <- wilcox.test(Fare ~ Survived, data = titanic)
p_value <- wilcox_test$p.value

# Map the scalar p-value to star notation. cut() with right = FALSE uses
# half-open bins [lower, upper), i.e. the same strict "<" thresholds as a
# chain of comparisons, without nesting vectorised ifelse() on a scalar.
significance <- as.character(cut(
  p_value,
  breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf),
  labels = c("****", "***", "**", "*", "ns"),
  right = FALSE
))

# Fare distribution per survival outcome: borderless, untrimmed violins with
# a narrow boxplot (outliers hidden) and a significance bracket carrying the
# star label computed from the Wilcoxon p-value
ggplot(
  data = titanic,
  mapping = aes(x = Survived, y = Fare, fill = Survived)
) +
  geom_violin(color = NA, alpha = 0.4, trim = FALSE) +
  geom_boxplot(alpha = 0.6, width = 0.15, outlier.shape = NA) +
  # Bracket between the two groups, placed at 90% of the largest fare
  geom_signif(
    comparisons = list(c("Died", "Survived")),
    annotations = significance,
    y_position = max(titanic$Fare, na.rm = TRUE) * 0.9,
    textsize = 5,
    tip_length = 0.02
  ) +
  scale_fill_manual(values = c("#E74C3C", "#27AE60")) +
  labs(
    x = NULL,
    y = "Fare (USD)",
    title = "Fare Distribution by Survival Status",
    subtitle = "Wilcoxon rank-sum test: Survivors paid significantly higher fares"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40")
  )

library(tidyverse)
library(ggforce)

# Reload the cleaned passenger table from disk
titanic <- read.csv("Total_Cleaned.csv")

# Turn the three variables shown in the flow diagram into factors;
# Survived gets the labels "Died" / "Survived"
titanic <- mutate(
  titanic,
  Survived = factor(Survived, labels = c("Died", "Survived")),
  Pclass = factor(Pclass),
  Sex = factor(Sex)
)

# Passenger count (column n) for every Sex x Pclass x Survived combination
titanic_pset <- count(titanic, Sex, Pclass, Survived)

# Reshape for ggforce's parallel-sets geoms, using the first three columns
# (Sex, Pclass, Survived) as the axes
titanic_long <- gather_set_data(titanic_pset, x = 1:3)

# Parallel-sets diagram: ribbons weighted by passenger count n and coloured
# by survival outcome, with grey axis boxes and black labels
ggplot(
  data = titanic_long,
  mapping = aes(x = x, id = id, split = y, value = n)
) +
  geom_parallel_sets(
    mapping = aes(fill = Survived),
    axis.width = 0.25,
    alpha = 0.7
  ) +
  geom_parallel_sets_axes(fill = "grey70", axis.width = 0.25) +
  geom_parallel_sets_labels(colour = "black", angle = 0, size = 4) +
  scale_fill_manual(values = c("#E74C3C", "#27AE60")) +
  labs(
    y = "Count",
    fill = "Survival Status",
    title = "Passenger Flow: Sex → Pclass → Survival"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

# QBS181 Final Project – Aim 1: Exploratory Analysis of Titanic Dataset

# Group Data Wrangling Project