# Load the packages
library(readxl)
library(here)
## here() starts at /Users/varad/Documents/Academics/Year 2024/DACSS 601
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
On April 15, 1912, during her maiden voyage, the RMS Titanic, widely considered “unsinkable,” sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, and 1502 of the 2224 passengers and crew died.
While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.
# Read the Titanic workbook; here() resolves the path against the project root.
titanic_data <- read_excel(here("titanic3.xls"))
## Warning: Coercing text to numeric in M1306 / R1306C13: '328'
titanic_data
## # A tibble: 1,309 × 14
## pclass survived name sex age sibsp parch ticket fare cabin embarked
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 1 1 Allen, … fema… 29 0 0 24160 211. B5 S
## 2 1 1 Allison… male 0.917 1 2 113781 152. C22 … S
## 3 1 0 Allison… fema… 2 1 2 113781 152. C22 … S
## 4 1 0 Allison… male 30 1 2 113781 152. C22 … S
## 5 1 0 Allison… fema… 25 1 2 113781 152. C22 … S
## 6 1 1 Anderso… male 48 0 0 19952 26.6 E12 S
## 7 1 1 Andrews… fema… 63 1 0 13502 78.0 D7 S
## 8 1 0 Andrews… male 39 0 0 112050 0 A36 S
## 9 1 1 Appleto… fema… 53 2 0 11769 51.5 C101 S
## 10 1 0 Artagav… male 71 0 0 PC 17… 49.5 <NA> C
## # ℹ 1,299 more rows
## # ℹ 3 more variables: boat <chr>, body <dbl>, home.dest <chr>
# Cleaning the data ----
titanic_clean <- titanic_data %>%
  # Convert any factor columns to character. `across(where(...))` replaces the
  # superseded `mutate_if()`. Every column printed for titanic_data is already
  # <dbl> or <chr>, so this step is a defensive no-op for this file.
  mutate(across(where(is.factor), as.character)) %>%
  # Fill missing ages with the median of the observed ages. `coalesce()` is
  # type-stable and keeps column attributes, unlike base `ifelse()`.
  # Note: imputing a single value concentrates observations at the median,
  # which affects the spread and shape of any later age summaries or plots.
  mutate(age = coalesce(age, median(age, na.rm = TRUE)))
titanic_clean
## # A tibble: 1,309 × 14
## pclass survived name sex age sibsp parch ticket fare cabin embarked
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 1 1 Allen, … fema… 29 0 0 24160 211. B5 S
## 2 1 1 Allison… male 0.917 1 2 113781 152. C22 … S
## 3 1 0 Allison… fema… 2 1 2 113781 152. C22 … S
## 4 1 0 Allison… male 30 1 2 113781 152. C22 … S
## 5 1 0 Allison… fema… 25 1 2 113781 152. C22 … S
## 6 1 1 Anderso… male 48 0 0 19952 26.6 E12 S
## 7 1 1 Andrews… fema… 63 1 0 13502 78.0 D7 S
## 8 1 0 Andrews… male 39 0 0 112050 0 A36 S
## 9 1 1 Appleto… fema… 53 2 0 11769 51.5 C101 S
## 10 1 0 Artagav… male 71 0 0 PC 17… 49.5 <NA> C
## # ℹ 1,299 more rows
## # ℹ 3 more variables: boat <chr>, body <dbl>, home.dest <chr>
# Inspect the structure (dimensions, column names and types) of the cleaned data
str(titanic_clean)
## tibble [1,309 × 14] (S3: tbl_df/tbl/data.frame)
## $ pclass : num [1:1309] 1 1 1 1 1 1 1 1 1 1 ...
## $ survived : num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
## $ name : chr [1:1309] "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
## $ sex : chr [1:1309] "female" "male" "female" "male" ...
## $ age : num [1:1309] 29 0.917 2 30 25 ...
## $ sibsp : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : chr [1:1309] "24160" "113781" "113781" "113781" ...
## $ fare : num [1:1309] 211 152 152 152 152 ...
## $ cabin : chr [1:1309] "B5" "C22 C26" "C22 C26" "C22 C26" ...
## $ embarked : chr [1:1309] "S" "S" "S" "S" ...
## $ boat : chr [1:1309] "2" "11" NA NA ...
## $ body : num [1:1309] NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: chr [1:1309] "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
The Titanic dataset contains information about passengers on the ill-fated maiden voyage of the RMS Titanic, a British passenger liner that sank in the North Atlantic Ocean in April 1912 after hitting an iceberg while sailing from Southampton to New York City. The version used here has 1,309 rows, one per passenger, and 14 variables. It offers a comprehensive look into the tragedy and provides a window into the lives of those onboard, a diverse group of passengers from different socio-economic backgrounds, captured by the three passenger classes. The variables are:
pclass: Passenger class (1st, 2nd, 3rd) - This categorical variable divides passengers into three classes (1st, 2nd, and 3rd), reflecting the socio-economic stratification of the early 20th century.
survived: Survival status (1 = Yes, 0 = No) - A binary categorical variable indicating survival (1) or non-survival (0) of the passengers.
name: Name of the passenger - Textual data providing the names of the passengers, allowing for individual identification and historical research.
sex: Gender - A categorical variable recording the gender of passengers.
age: Age in years - A numerical variable detailing the age of each passenger, giving insights into the age distribution onboard.
sibsp: Number of siblings/spouses aboard - This numerical variable counts the number of siblings or spouses that a passenger had aboard the Titanic.
parch: Number of parents/children aboard - Similar to sibsp, this numerical variable tallies the number of parents or children a passenger had on the ship.
ticket: Ticket number - A combination of text and numeric data representing each passenger’s ticket number.
fare: Passenger fare - A numerical variable showing how much each passenger paid, potentially indicating their financial status.
cabin: Cabin number - Textual data providing the cabin number for passengers, which can be linked to their class and location on the ship.
embarked: Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton) - A categorical variable indicating the port where passengers boarded the Titanic, with codes for Cherbourg, Queenstown, and Southampton.
boat: Lifeboat (if survived) - For survivors, this text/numeric variable identifies the lifeboat they were on, giving insight into the rescue process.
body: Body identification number (if did not survive) - For those who did not survive, this numeric variable provides a body identification number, if available.
home.dest: Home/Destination - Textual data about the passengers’ home or intended destination, offering a glimpse into their personal journeys and backgrounds.
# Summary statistics ----
# Centre and spread of age, plus head counts of survivors and non-survivors.
# Age was median-imputed during cleaning, so `na.rm = TRUE` is only a safeguard.
summary_statistics <- summarise(
  titanic_clean,
  MeanAge = mean(age, na.rm = TRUE),
  MedianAge = median(age, na.rm = TRUE),
  SdAge = sd(age, na.rm = TRUE),
  SurvivedCount = sum(survived == 1, na.rm = TRUE),
  NotSurvivedCount = sum(survived == 0, na.rm = TRUE)
)
summary_statistics
## # A tibble: 1 × 5
## MeanAge MedianAge SdAge SurvivedCount NotSurvivedCount
## <dbl> <dbl> <dbl> <int> <int>
## 1 29.5 28 12.9 500 809
# Share of passengers who survived within each passenger class, as a bar chart.
titanic_clean %>%
  summarise(SurvivalRate = mean(survived == 1), .by = pclass) %>%
  ggplot(aes(x = factor(pclass), y = SurvivalRate, fill = factor(pclass))) +
  # geom_col() draws bars whose heights are the precomputed y values
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Survival Rates by Passenger Class",
    x = "Passenger Class",
    y = "Survival Rate"
  )
# Overlaid age histograms for non-survivors (0) and survivors (1).
# Missing ages were replaced by the median during cleaning, so the bin that
# contains the median is inflated by the imputed values.
ggplot(titanic_clean, aes(x = age, fill = as.factor(survived))) +
  # position = "identity" overlays the two groups instead of stacking them;
  # the transparency keeps both visible where they overlap
  geom_histogram(bins = 30, position = "identity", alpha = 0.6) +
  scale_fill_brewer(
    palette = "Set1",
    name = "Survived",
    labels = c("No", "Yes")
  ) +
  labs(
    title = "Age Distribution of Survivors and Non-Survivors",
    x = "Age",
    y = "Count"
  )
# Survival rate by gender: share of passengers of each sex who survived.
titanic_clean %>%
  summarise(SurvivalRate = mean(survived == 1), .by = sex) %>%
  ggplot(aes(x = sex, y = SurvivalRate, fill = sex)) +
  # geom_col() draws bars whose heights are the precomputed y values
  geom_col() +
  scale_fill_brewer(palette = "Pastel1") +
  labs(
    title = "Survival Rates by Gender",
    x = "Gender",
    y = "Survival Rate"
  )
# Histogram of ticket fares across all passengers.
# Rows with a missing fare are dropped by stat_bin() with a warning.
ggplot(titanic_clean, aes(x = fare)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of Fares",
    x = "Fare",
    y = "Count"
  )
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
Several research questions came up while exploring the dataset.