Introduction
The Titanic data analysis explores the impact of various factors on
passenger survival rates. First-class passengers had a much greater
likelihood of survival (63%) than those in second and third class
(47% and 24%, respectively). Additionally, the findings show that women and
children had significantly higher survival rates, consistent with the
“women and children first” policy of the times. The analysis also
highlights fare price variations and their correlation with survival.
Female passengers paid an average of 19 pounds more than male
passengers, and first-class passengers paid an average fare of about 84
pounds, compared with an average of about 14 pounds for third-class
passengers. This finding could emphasize the economic disparities in
survival outcomes. Further statistical analyses and visualizations
provide insights into the relationships between passenger demographics
and survival probability. First- and second-class female passengers had
survival rates of 97% and 92%, respectively. However, male passengers in
third class recorded a survival rate of just 14%. The plots suggest that
sex and class were the strongest predictors of survival examined here. These
findings align with historical records of the Titanic disaster,
reinforcing the social and economic factors that influenced survival
chances.
# Report-wide chunk defaults: keep package start-up messages and warnings
# out of the rendered document.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
# `rm(list = ls())` was removed on purpose: it deletes every object in the
# environment the document is rendered in, yet leaves loaded packages and
# options untouched, so it never provided a truly clean start. Render the
# report from a fresh R session instead.
# Run a garbage collection; its memory summary is printed in the report.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 589159 31.5 1353118 72.3 660385 35.3
## Vcells 1094928 8.4 8388608 64.0 1770207 13.6
# Fix the random-number seed so that any step drawing random numbers is
# repeatable between runs.
# NOTE(review): nothing in the visible part of this analysis uses randomness;
# confirm whether the seed is still needed.
set.seed(123123)
# Load necessary libraries
library(readr)
library(dplyr)
library(ggplot2)
library(knitr)
# Folder that holds the course data files. The location is kept in a variable
# and combined with the file name via file.path(), so the script no longer
# changes the session's working directory with setwd(). Only this one line
# has to be edited to run the analysis on another machine.
data_dir <- "C:/Users/Yung Cho/Documents/DATA712/Data"
# Read the passenger-level Titanic data; the first row holds the column names.
tidata <- read_csv(file.path(data_dir, "titanic_data.csv"), col_names = TRUE)
# Preview the first rows to check that the columns were parsed as expected.
head(tidata)
## # A tibble: 6 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braund… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cuming… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikki… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futrel… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen,… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran,… male NA 0 0 330877 8.46 <NA>
## # ℹ 1 more variable: Embarked <chr>
# Mean ticket fare for each sex, rounded to two decimals; missing fares are
# ignored.
avg_fare_by_sex <- summarise(
  group_by(tidata, Sex),
  avg_fare = round(mean(Fare, na.rm = TRUE), 2)
)
# Mean ticket fare for each passenger class, computed the same way.
avg_fare_by_class <- summarise(
  group_by(tidata, Pclass),
  avg_fare = round(mean(Fare, na.rm = TRUE), 2)
)
# Show each summary table under its own heading.
print("Average Fare by Sex:")
## [1] "Average Fare by Sex:"
print(avg_fare_by_sex)
## # A tibble: 2 × 2
## Sex avg_fare
## <chr> <dbl>
## 1 female 44.5
## 2 male 25.5
print("Average Fare by Passenger Class:")
## [1] "Average Fare by Passenger Class:"
print(avg_fare_by_class)
## # A tibble: 3 × 2
## Pclass avg_fare
## <dbl> <dbl>
## 1 1 84.2
## 2 2 20.7
## 3 3 13.7
# Share of passengers who survived within each sex. Survived is coded 0/1,
# so its mean is the survival rate; the result is rounded to two decimals.
avg_survival_by_sex <- summarise(
  group_by(tidata, Sex),
  avg_survival = round(mean(Survived, na.rm = TRUE), 2)
)
# Share of passengers who survived within each passenger class.
avg_survival_by_class <- summarise(
  group_by(tidata, Pclass),
  avg_survival = round(mean(Survived, na.rm = TRUE), 2)
)
# Show each summary table under its own heading.
print("Average Survival Rate by Sex:")
## [1] "Average Survival Rate by Sex:"
print(avg_survival_by_sex)
## # A tibble: 2 × 2
## Sex avg_survival
## <chr> <dbl>
## 1 female 0.74
## 2 male 0.19
print("Average Survival Rate by Passenger Class:")
## [1] "Average Survival Rate by Passenger Class:"
print(avg_survival_by_class)
## # A tibble: 3 × 2
## Pclass avg_survival
## <dbl> <dbl>
## 1 1 0.63
## 2 2 0.47
## 3 3 0.24
# Calculate survival rate by age group and sex
# Ages are binned into 10-year, left-closed intervals: [0, 10), [10, 20), ...
# (`right = FALSE`). The labels are derived from the breaks so that each one
# names the ages its bin really contains ("0-9", "10-19", ..., "80-89").
# Hand-written labels such as "0-10", "11-20", ... do not match these bins:
# they would file a 10-year-old under "11-20" and an 80-year-old under "81-90".
age_breaks <- seq(0, 90, by = 10)
age_labels <- paste0(
  age_breaks[-length(age_breaks)], "-", age_breaks[-1] - 1
)
tidata <- tidata %>%
  mutate(
    AgeGroup = cut(Age, breaks = age_breaks, right = FALSE, labels = age_labels)
  )
# One row per age group and sex: number of passengers, number of survivors,
# and the survival rate rounded to two decimals. Passengers with a missing
# Age end up in their own NA group.
survival_by_age_group_and_sex <- tidata %>%
  group_by(AgeGroup, Sex) %>%
  summarise(
    count = n(),
    # Same missing-value handling as the rate below, so both stay consistent.
    survived = sum(Survived, na.rm = TRUE),
    survival_rate = round(mean(Survived, na.rm = TRUE), 2),
    # Return a plain tibble instead of one still grouped by AgeGroup.
    .groups = "drop"
  )
# Display the results using kable
kable(survival_by_age_group_and_sex, caption = "Survival Rates by Age Group and Sex")
Survival Rates by Age Group and Sex
0-10 |
female |
30 |
19 |
0.63 |
0-10 |
male |
32 |
19 |
0.59 |
11-20 |
female |
45 |
34 |
0.76 |
11-20 |
male |
57 |
7 |
0.12 |
21-30 |
female |
72 |
52 |
0.72 |
21-30 |
male |
148 |
25 |
0.17 |
31-40 |
female |
60 |
50 |
0.83 |
31-40 |
male |
107 |
23 |
0.21 |
41-50 |
female |
32 |
22 |
0.69 |
41-50 |
male |
57 |
12 |
0.21 |
51-60 |
female |
18 |
16 |
0.89 |
51-60 |
male |
30 |
4 |
0.13 |
61-70 |
female |
4 |
4 |
1.00 |
61-70 |
male |
15 |
2 |
0.13 |
71-80 |
male |
6 |
0 |
0.00 |
81-90 |
male |
1 |
1 |
1.00 |
NA |
female |
53 |
36 |
0.68 |
NA |
male |
124 |
16 |
0.13 |
# Create a crosstabulation of survival rates by sex and passenger class
# One row per sex/class combination: number of passengers, number of
# survivors, and the survival rate rounded to two decimals.
crosstab_survival <- tidata %>%
  group_by(Sex, Pclass) %>%
  summarise(
    count = n(),
    # Same missing-value handling as the rate below, so both stay consistent.
    survived = sum(Survived, na.rm = TRUE),
    survival_rate = round(mean(Survived, na.rm = TRUE), 2),
    # Return a plain tibble instead of one still grouped by Sex.
    .groups = "drop"
  )
# Display the crosstabulation using kable
kable(crosstab_survival, caption = "Crosstabulation of Survival Rates by Sex and Passenger Class")
Crosstabulation of Survival Rates by Sex and Passenger
Class
female |
1 |
94 |
91 |
0.97 |
female |
2 |
76 |
70 |
0.92 |
female |
3 |
144 |
72 |
0.50 |
male |
1 |
122 |
45 |
0.37 |
male |
2 |
108 |
17 |
0.16 |
male |
3 |
347 |
47 |
0.14 |
# Proportional stacked bars: within each passenger class, the share of
# passengers by survival status, drawn in one panel per sex.
ggplot(
  data = tidata,
  mapping = aes(x = factor(Pclass), fill = factor(Survived))
) +
  geom_bar(position = "fill") +
  facet_wrap(vars(Sex)) +
  labs(
    title = "Survival Rates by Sex and Passenger Class",
    x = "Passenger Class",
    y = "Proportion",
    fill = "Survived"
  )

# Proportional stacked bars: within each age group, the share of passengers
# by survival status, drawn in one panel per sex.
ggplot(
  data = tidata,
  mapping = aes(x = AgeGroup, fill = factor(Survived))
) +
  geom_bar(position = "fill") +
  facet_wrap(vars(Sex)) +
  labs(
    title = "Survival Rates by Age Group and Sex",
    x = "Age Group",
    y = "Proportion",
    fill = "Survived"
  )
