# Load the cleaned passenger table and prepare it for the plots below:
#   * Survived and Pclass become labelled factors,
#   * Sex and Embarked become factors,
#   * rows with a missing Age or Fare are removed.
# NOTE(review): factor() assigns `labels` to the sorted distinct values, so
# this presumes Survived has exactly two codes and Pclass exactly three.
titanic <- read.csv("Total_Cleaned.csv") %>%
  mutate(
    Survived = factor(Survived, labels = c("No", "Yes")),
    Pclass   = factor(Pclass, labels = c("1st", "2nd", "3rd")),
    Sex      = as.factor(Sex),
    Embarked = as.factor(Embarked)
  ) %>%
  drop_na(Age, Fare)
# Three stacked bar charts, one per categorical feature. position = "fill"
# stretches every bar to height 1, so the fill colours show the share of
# survivors vs. non-survivors inside each category rather than raw counts.

# Share of each outcome by sex
ggplot(data = titanic, mapping = aes(x = Sex, fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_viridis_d(option = "E") +
  ggtitle("Survival Rate by Gender") +
  xlab("Sex") +
  ylab("Proportion") +
  theme_minimal(base_size = 14)

# Share of each outcome by passenger class
ggplot(data = titanic, mapping = aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_viridis_d(option = "C") +
  ggtitle("Survival Rate by Passenger Class") +
  xlab("Class") +
  ylab("Proportion") +
  theme_minimal(base_size = 14)

# Share of each outcome by port of embarkation
ggplot(data = titanic, mapping = aes(x = Embarked, fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_viridis_d(option = "D") +
  ggtitle("Survival Rate by Embarked Port") +
  xlab("Port") +
  ylab("Proportion") +
  theme_minimal(base_size = 14)
# Layers shared by the two distribution plots below: a translucent, untrimmed
# violin, a narrow box plot with red outlier markers on top of it, the default
# discrete viridis fill, and the minimal theme.
violin_box_layers <- list(
  geom_violin(trim = FALSE, alpha = 0.5),
  geom_boxplot(width = 0.15, outlier.color = "red", alpha = 0.7),
  scale_fill_viridis_d(),
  theme_minimal(base_size = 14)
)

# Fare by survival status, with the y axis formatted as dollar amounts
ggplot(titanic, aes(x = Survived, y = Fare, fill = Survived)) +
  violin_box_layers +
  scale_y_continuous(labels = dollar_format(prefix = "$")) +
  labs(
    title = "Fare Distribution by Survival Status",
    x = "Survived",
    y = "Fare"
  )

# Age by survival status
ggplot(titanic, aes(x = Survived, y = Age, fill = Survived)) +
  violin_box_layers +
  labs(
    title = "Age Distribution by Survival",
    x = "Survived",
    y = "Age"
  )
# Outcome shares per passenger class, drawn in one panel per sex
ggplot(data = titanic, mapping = aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_viridis_d(option = "D") +
  facet_wrap(vars(Sex)) +
  ggtitle("Survival by Gender and Passenger Class") +
  xlab("Class") +
  ylab("Proportion") +
  theme_minimal(base_size = 14)

# Age against fare: point colour encodes the outcome, point shape the sex
ggplot(
  data = titanic,
  mapping = aes(x = Age, y = Fare, color = Survived, shape = Sex)
) +
  geom_point(size = 2.5, alpha = 0.6) +
  scale_color_viridis_d() +
  ggtitle("Fare vs Age Colored by Survival and Sex") +
  xlab("Age") +
  ylab("Fare ($)") +
  theme_light(base_size = 14)
# Derive FamilySize as SibSp + Parch + 1 (the + 1 presumably counts the
# passenger themself -- confirm against the data dictionary).
titanic <- titanic %>%
  mutate(FamilySize = SibSp + Parch + 1)

# Outcome shares per family size. FamilySize stays numeric in the data frame
# (it is used in the correlation matrix further down); it is converted to a
# factor only for this plot so that every family size gets its own labelled
# tick instead of continuous-axis breaks that can fall between whole numbers.
ggplot(titanic, aes(x = factor(FamilySize), fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_viridis_d(option = "E") +
  labs(
    title = "Survival Rate by Family Size",
    x = "Family Size",
    y = "Proportion"
  ) +
  theme_minimal(base_size = 14)
# Numeric features for the correlation matrix. across() replaces the
# superseded mutate_all(); every selected column is coerced to numeric.
num_vars <- titanic %>%
  select(Age, Fare, SibSp, Parch, FamilySize) %>%
  mutate(across(everything(), as.numeric))

# Pairwise correlations, computed on rows that are complete in all columns
cor_matrix <- cor(num_vars, use = "complete.obs")

# Long format for ggplot: one row per cell of the matrix, with the row and
# column names in Var1 / Var2 and the coefficient in value.
# NOTE(review): melt() is presumably reshape2::melt -- it is not loaded in the
# visible part of the file.
cor_melt <- melt(cor_matrix)

# Heatmap with the rounded coefficient printed in each tile
ggplot(cor_melt, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), color = "white", size = 4) +
  scale_fill_viridis_c(option = "A") +
  labs(title = "Correlation Heatmap of Numeric Features", x = "", y = "") +
  theme_minimal(base_size = 14)
## 🔟 Age Distribution by Sex (Histogram + Density Overlay)
# Use Titanic dataset
df <- titanic

# Mean age per sex; drawn as a dashed vertical line in the matching facet
mu <- df %>%
  group_by(Sex) %>%
  summarise(grp.mean = mean(Age, na.rm = TRUE))

# Custom labeller for facet titles (uppercase)
capitalize_labeller <- as_labeller(toupper)

# One explicit bin width shared by the histogram and the density overlay.
# Setting `binwidth` directly (instead of `bins`) means the factor used to
# rescale the density below is exactly the width of the drawn bars.
age_range <- diff(range(df$Age, na.rm = TRUE))
n_bins <- 30
bin_width <- age_range / n_bins

# Histogram + density + mean line, one facet per sex
ggplot(df, aes(x = Age, color = Sex, fill = Sex)) +
  geom_histogram(position = "identity", alpha = 0.5, binwidth = bin_width) +
  # after_stat(count) is the density multiplied by the number of observations
  # in the group, so count * bin_width is the expected number of passengers
  # per bin for that sex. Scaling by the total row count instead would make
  # the curve overshoot the bars, because each facet shows only one sex.
  geom_density(aes(y = after_stat(count) * bin_width), alpha = 0.6) +
  geom_vline(
    data = mu,
    aes(xintercept = grp.mean, color = Sex),
    linetype = "dashed"
  ) +
  scale_color_brewer(palette = "Dark2") +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(~ Sex, ncol = 2, labeller = capitalize_labeller) +
  labs(
    title = "Age Distribution by Sex",
    x = "Age (years)",
    y = "Count"
  ) +
  theme_linedraw() +
  theme(
    plot.title = element_text(face = "bold", size = 12, hjust = 0.5),
    strip.text = element_text(face = "bold", size = 10)
  )
# ============================================================
# Survival Rate by Age Group and Sex
# ============================================================
# Load libraries
library(tidyverse)
library(ggpubr)
# Reload the cleaned passenger table and rebuild it for the age-group analysis:
# label the outcome, make Sex a factor, and bucket Age into three bands
# (under 18, 18 up to but not including 60, 60 and over). Rows with a missing
# Age, Sex or Survived are dropped afterwards.
titanic <- read.csv("Total_Cleaned.csv") %>%
  mutate(
    Survived = factor(Survived, labels = c("No", "Yes")),
    Sex = factor(Sex),
    # Conditions are tried top to bottom and the first match wins; a missing
    # Age matches none of them and yields NA.
    AgeGroup = case_when(
      Age >= 60 ~ "Senior",
      Age >= 18 ~ "Adult",
      Age < 18 ~ "Child"
    )
  ) %>%
  drop_na(Age, Sex, Survived)
# Survival percentage for every age-group / sex combination.
# .groups = "drop" returns an ungrouped table, which avoids the regrouping
# message and keeps later verbs from silently operating per AgeGroup.
survival_summary <- titanic %>%
  group_by(AgeGroup, Sex) %>%
  summarise(SurvivalRate = mean(Survived == "Yes") * 100, .groups = "drop")

# Fisher's exact test of Sex against Survived, run separately inside each age
# group; the resulting p-values are then adjusted together with the FDR
# method.
fisher_results <- titanic %>%
  group_by(AgeGroup) %>%
  summarise(
    p.value = fisher.test(table(Sex, Survived))$p.value
  ) %>%
  mutate(p.adj = p.adjust(p.value, method = "fdr"))
fisher_results
## # A tibble: 3 × 3
## AgeGroup p.value p.adj
## <chr> <dbl> <dbl>
## 1 Adult 8.16e-131 2.45e-130
## 2 Child 1.59e- 9 2.39e- 9
## 3 Senior 1.57e- 7 1.57e- 7
# Merge for plotting: attach the per-age-group Fisher results to the
# survival rates so the annotation is driven by the computed p-values.
plot_data <- survival_summary %>%
  left_join(fisher_results, by = "AgeGroup")

# One significance label per age group. The Fisher tests compare the sexes
# *within* an age group, so the label belongs above that age group (not on a
# bracket between two age groups). It is placed 10 points above the higher of
# the group's survival rates, which stays inside the 0-120 axis range.
signif_labels <- plot_data %>%
  group_by(AgeGroup) %>%
  summarise(
    y_position = max(SurvivalRate) + 10,
    p.adj = first(p.adj),
    .groups = "drop"
  ) %>%
  mutate(
    stars = case_when(
      p.adj < 0.001 ~ "***",
      p.adj < 0.01 ~ "**",
      p.adj < 0.05 ~ "*",
      TRUE ~ "ns"
    )
  )

# Lollipop chart: a stem from 0 up to the survival rate, a dot at the rate,
# and the rounded percentage printed just above the dot.
ggplot(plot_data, aes(x = AgeGroup, y = SurvivalRate, color = Sex)) +
  geom_segment(
    aes(xend = AgeGroup, y = 0, yend = SurvivalRate, color = Sex),
    size = 1
  ) +
  geom_point(size = 4) +
  geom_text(
    aes(label = round(SurvivalRate, 1)),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  ) +
  # Significance stars taken from the FDR-adjusted Fisher p-values
  geom_text(
    data = signif_labels,
    aes(x = AgeGroup, y = y_position, label = stars),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_color_manual(values = c("darkorange2", "seagreen3")) +
  scale_y_continuous(limits = c(0, 120), expand = c(0, 0)) +
  labs(
    title = "Survival Rate by Age Group and Sex",
    subtitle = "Sex difference within each age group: Fisher’s Exact Test (FDR-adjusted)",
    caption = "*** p < 0.001, ** p < 0.01, * p < 0.05, ns: not significant",
    x = "Age Group",
    y = "Survival (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "right"
  )
library(tidyverse)
library(ggpubr)
# Read Titanic dataset
titanic <- read.csv("Total_Cleaned.csv")

# Data preparation: label the outcome for display
titanic <- titanic %>%
  mutate(
    Survived = factor(Survived, labels = c("Died", "Survived"))
  )

# Wilcoxon rank-sum test of Fare between the two outcome groups
wilcox_test <- wilcox.test(Fare ~ Survived, data = titanic)
p_value <- wilcox_test$p.value

# Map the single p-value to a star label. cut() with right = FALSE uses
# half-open intervals [lower, upper), so the thresholds are identical to the
# former nested ifelse() chain: p < 0.0001 "****", p < 0.001 "***",
# p < 0.01 "**", p < 0.05 "*", otherwise "ns".
significance <- as.character(cut(
  p_value,
  breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf),
  labels = c("****", "***", "**", "*", "ns"),
  right = FALSE
))
# Fare by outcome: borderless violins with a narrow box plot on top (outlier
# points hidden), plus one bracket between the two groups that carries the
# star label computed from the Wilcoxon p-value. The bracket is placed at 90%
# of the largest observed fare.
bracket_height <- max(titanic$Fare, na.rm = TRUE) * 0.9

ggplot(data = titanic, mapping = aes(x = Survived, y = Fare, fill = Survived)) +
  geom_violin(trim = FALSE, alpha = 0.4, color = NA) +
  geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.6) +
  geom_signif(
    comparisons = list(c("Died", "Survived")),
    annotations = significance,
    y_position = bracket_height,
    tip_length = 0.02,
    textsize = 5
  ) +
  scale_fill_manual(values = c("#E74C3C", "#27AE60")) +
  labs(
    title = "Fare Distribution by Survival Status",
    subtitle = "Wilcoxon rank-sum test: Survivors paid significantly higher fares",
    x = NULL,
    y = "Fare (USD)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    axis.title.y = element_text(face = "bold"),
    legend.title = element_text(face = "bold"),
    legend.position = "right"
  )
library(tidyverse)
library(ggforce)
# Read Titanic data
titanic <- read.csv("Total_Cleaned.csv")

# Prepare data: label the outcome and treat class and sex as categorical
titanic <- titanic %>%
  mutate(
    Survived = factor(Survived, labels = c("Died", "Survived")),
    Pclass = factor(Pclass),
    Sex = factor(Sex)
  )

# Number of passengers for every Sex / Pclass / Survived combination
titanic_pset <- titanic %>%
  count(Sex, Pclass, Survived)

# Convert to the long format expected by ggforce's parallel-sets geoms
# (columns 1:3 are Sex, Pclass and Survived).
# The axis order is then pinned explicitly so the plot reads
# Sex -> Pclass -> Survived, as its title says.
# NOTE(review): gather_set_data() appears to store the source column name in
# `x`; left as plain text, a discrete axis would sort it alphabetically
# (Pclass, Sex, Survived). Setting the factor levels is harmless if the
# installed ggforce version already preserves the column order.
titanic_long <- titanic_pset %>%
  gather_set_data(1:3) %>%
  mutate(x = factor(x, levels = c("Sex", "Pclass", "Survived")))
# Parallel-sets diagram of the passenger counts: ribbons are coloured by
# outcome, the axis blocks are drawn in grey, and each block carries a
# horizontal black label.
ggplot(
  data = titanic_long,
  mapping = aes(x = x, id = id, split = y, value = n)
) +
  geom_parallel_sets(
    mapping = aes(fill = Survived),
    alpha = 0.7,
    axis.width = 0.25
  ) +
  geom_parallel_sets_axes(axis.width = 0.25, fill = "grey70") +
  geom_parallel_sets_labels(size = 4, angle = 0, colour = "black") +
  scale_fill_manual(values = c("#E74C3C", "#27AE60")) +
  labs(
    title = "Passenger Flow: Sex → Pclass → Survival",
    y = "Count",
    fill = "Survival Status"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )