Introduction

This analysis of the Titanic passenger data explores how several factors affected survival rates. First-class passengers had a much greater likelihood of survival, at 63%, than those in second and third class, at 47% and 24% respectively. The findings also show that women and children had markedly higher survival rates, consistent with the “women and children first” policy of the time. The analysis further highlights variation in fare prices and its relationship with survival. Female passengers paid an average of 19 pounds more than male passengers, and first-class passengers paid an average fare of about 84 pounds, compared with about 14 pounds for third-class passengers. This finding may point to economic disparities in survival outcomes. Further summary tables and visualizations provide insight into the relationships between passenger demographics and survival probability. Female passengers in first and second class had survival rates of 97% and 92%, respectively, whereas male passengers in third class had a survival rate of just 14%. The plots indicate that sex and passenger class were strongly associated with survival. These findings align with historical records of the Titanic disaster, reinforcing the social and economic factors that influenced survival chances.

# Hide messages and warnings emitted by code chunks in the knitted report.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

# The workspace is intentionally NOT cleared with rm(list = ls()): every
# object used below is created by this script, and wiping the global
# environment would destroy the caller's objects if the file is sourced.
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  589159 31.5    1353118 72.3   660385 35.3
## Vcells 1094928  8.4    8388608 64.0  1770207 13.6
# Fix the random seed so any randomised step is reproducible across runs.
set.seed(123123)
# Load necessary libraries
library(readr)
library(dplyr)
library(ggplot2)
library(knitr)

# Read the Titanic passenger data without changing the working directory
# (setwd() is a global side effect, and knitr resets it after the chunk).
# The data directory defaults to the original location and can be overridden
# by setting the TITANIC_DATA_DIR environment variable.
data_dir <- Sys.getenv(
  "TITANIC_DATA_DIR",
  unset = "C:/Users/Yung Cho/Documents/DATA712/Data"
)
tidata <- read_csv(file.path(data_dir, "titanic_data.csv"), col_names = TRUE)

# Preview the first rows to confirm the columns were parsed as expected.
head(tidata)
## # A tibble: 6 × 12
##   PassengerId Survived Pclass Name    Sex     Age SibSp Parch Ticket  Fare Cabin
##         <dbl>    <dbl>  <dbl> <chr>   <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
## 1           1        0      3 Braund… male     22     1     0 A/5 2…  7.25 <NA> 
## 2           2        1      1 Cuming… fema…    38     1     0 PC 17… 71.3  C85  
## 3           3        1      3 Heikki… fema…    26     0     0 STON/…  7.92 <NA> 
## 4           4        1      1 Futrel… fema…    35     1     0 113803 53.1  C123 
## 5           5        0      3 Allen,… male     35     0     0 373450  8.05 <NA> 
## 6           6        0      3 Moran,… male     NA     0     0 330877  8.46 <NA> 
## # ℹ 1 more variable: Embarked <chr>
# Mean ticket fare per sex, ignoring missing fares, rounded to 2 decimals
avg_fare_by_sex <- tidata %>%
  group_by(Sex) %>%
  summarise(avg_fare = mean(Fare, na.rm = TRUE)) %>%
  mutate(avg_fare = round(avg_fare, 2))

# Mean ticket fare per passenger class, computed the same way
avg_fare_by_class <- tidata %>%
  group_by(Pclass) %>%
  summarise(avg_fare = mean(Fare, na.rm = TRUE)) %>%
  mutate(avg_fare = round(avg_fare, 2))

# Show both fare tables, each preceded by a heading line
print("Average Fare by Sex:")
## [1] "Average Fare by Sex:"
print(avg_fare_by_sex)
## # A tibble: 2 × 2
##   Sex    avg_fare
##   <chr>     <dbl>
## 1 female     44.5
## 2 male       25.5
print("Average Fare by Passenger Class:")
## [1] "Average Fare by Passenger Class:"
print(avg_fare_by_class)
## # A tibble: 3 × 2
##   Pclass avg_fare
##    <dbl>    <dbl>
## 1      1     84.2
## 2      2     20.7
## 3      3     13.7
# Share of passengers who survived, per sex. Survived is coded 0/1, so its
# mean is the survival rate; the result is rounded to 2 decimals.
avg_survival_by_sex <- tidata %>%
  group_by(Sex) %>%
  summarise(avg_survival = mean(Survived, na.rm = TRUE)) %>%
  mutate(avg_survival = round(avg_survival, 2))

# Share of passengers who survived, per passenger class
avg_survival_by_class <- tidata %>%
  group_by(Pclass) %>%
  summarise(avg_survival = mean(Survived, na.rm = TRUE)) %>%
  mutate(avg_survival = round(avg_survival, 2))

# Show both survival tables, each preceded by a heading line
print("Average Survival Rate by Sex:")
## [1] "Average Survival Rate by Sex:"
print(avg_survival_by_sex)
## # A tibble: 2 × 2
##   Sex    avg_survival
##   <chr>         <dbl>
## 1 female         0.74
## 2 male           0.19
print("Average Survival Rate by Passenger Class:")
## [1] "Average Survival Rate by Passenger Class:"
print(avg_survival_by_class)
## # A tibble: 3 × 2
##   Pclass avg_survival
##    <dbl>        <dbl>
## 1      1         0.63
## 2      2         0.47
## 3      3         0.24
# Calculate survival rate by age group and sex

# Bin Age into 10-year, left-closed intervals: [0, 10), [10, 20), ..., [80, 90).
# The labels are derived from the breaks so they describe the bins that
# right = FALSE actually produces (a 10-year-old falls in "10-19", an
# 80-year-old in "80-89"). Rows with a missing Age get AgeGroup = NA.
age_breaks <- seq(0, 90, by = 10)
age_labels <- paste0(head(age_breaks, -1), "-", tail(age_breaks, -1) - 1)

tidata <- tidata %>%
  mutate(AgeGroup = cut(Age, breaks = age_breaks, right = FALSE,
                        labels = age_labels))

# One row per (AgeGroup, Sex): group size, number of survivors and the
# survival rate rounded to 2 decimals. NA handling is the same for the count
# of survivors and the rate, and the result is returned ungrouped.
survival_by_age_group_and_sex <- tidata %>%
  group_by(AgeGroup, Sex) %>%
  summarise(
    count = n(),
    survived = sum(Survived, na.rm = TRUE),
    survival_rate = round(mean(Survived, na.rm = TRUE), 2),
    .groups = "drop"
  )

# Display the results using kable
kable(survival_by_age_group_and_sex, caption = "Survival Rates by Age Group and Sex")
Survival Rates by Age Group and Sex
AgeGroup Sex count survived survival_rate
0-9 female 30 19 0.63
0-9 male 32 19 0.59
10-19 female 45 34 0.76
10-19 male 57 7 0.12
20-29 female 72 52 0.72
20-29 male 148 25 0.17
30-39 female 60 50 0.83
30-39 male 107 23 0.21
40-49 female 32 22 0.69
40-49 male 57 12 0.21
50-59 female 18 16 0.89
50-59 male 30 4 0.13
60-69 female 4 4 1.00
60-69 male 15 2 0.13
70-79 male 6 0 0.00
80-89 male 1 1 1.00
NA female 53 36 0.68
NA male 124 16 0.13
# Create a crosstabulation of survival rates by sex and passenger class:
# one row per (Sex, Pclass) with the group size, the number of survivors and
# the survival rate rounded to 2 decimals.
crosstab_survival <- tidata %>%
  group_by(Sex, Pclass) %>%
  summarise(
    count = n(),
    # Use the same NA handling for the survivor count and the rate.
    survived = sum(Survived, na.rm = TRUE),
    survival_rate = round(mean(Survived, na.rm = TRUE), 2),
    # Return a plain, ungrouped table instead of one still grouped by Sex.
    .groups = "drop"
  )

# Display the crosstabulation using kable
kable(crosstab_survival, caption = "Crosstabulation of Survival Rates by Sex and Passenger Class")
Crosstabulation of Survival Rates by Sex and Passenger Class
Sex Pclass count survived survival_rate
female 1 94 91 0.97
female 2 76 70 0.92
female 3 144 72 0.50
male 1 122 45 0.37
male 2 108 17 0.16
male 3 347 47 0.14
# Visualization of survival rates by passenger class and sex:
# stacked bars scaled to proportions (position = "fill"), one panel per sex
ggplot(tidata) +
  geom_bar(
    aes(x = factor(Pclass), fill = factor(Survived)),
    position = "fill"
  ) +
  facet_wrap(vars(Sex)) +
  labs(
    title = "Survival Rates by Sex and Passenger Class",
    x = "Passenger Class",
    y = "Proportion",
    fill = "Survived"
  )

# Visualization of survival rates by age group and sex, same layout
ggplot(tidata) +
  geom_bar(
    aes(x = AgeGroup, fill = factor(Survived)),
    position = "fill"
  ) +
  facet_wrap(vars(Sex)) +
  labs(
    title = "Survival Rates by Age Group and Sex",
    x = "Age Group",
    y = "Proportion",
    fill = "Survived"
  )