# This project analyzes the Titanic dataset to understand survival patterns
# among passengers. It focuses on passenger class, gender, age, fare, and
# embarkation point. The objective is to extract meaningful insights using
# statistical summaries and visualizations, showing how these variables
# influenced survival chances.
#Libraries required
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the raw passenger records. Text columns are kept as character vectors
# rather than being converted to factors.
titanic <- read.csv(
  file = "Titanic.csv",
  stringsAsFactors = FALSE
)
# Preview the leading rows (six is the head() default).
head(x = titanic, n = 6L)
## X survived pclass sex age sibsp parch fare embarked class who
## 1 0 0 3 male 22 1 0 7.2500 S Third man
## 2 1 1 1 female 38 1 0 71.2833 C First woman
## 3 2 1 3 female 26 0 0 7.9250 S Third woman
## 4 3 1 1 female 35 1 0 53.1000 S First woman
## 5 4 0 3 male 35 0 0 8.0500 S Third man
## 6 5 0 3 male NA 0 0 8.4583 Q Third man
## adult_male deck embark_town alive alone
## 1 TRUE Southampton no FALSE
## 2 FALSE C Cherbourg yes FALSE
## 3 FALSE Southampton yes TRUE
## 4 FALSE C Southampton yes FALSE
## 5 TRUE Southampton no TRUE
## 6 TRUE Queenstown no TRUE
# Build a cleaned working copy and leave the raw import (`titanic`) untouched.
# Cells that are empty ("") or a single space (" ") are recoded to NA. Only
# character columns can hold such values, so only those are rewritten.
tc <- titanic %>%
  mutate(
    across(
      where(is.character),
      function(cell) replace(cell, cell %in% c("", " "), NA)
    )
  )
# Compact overview: one line per column with its type and first values.
str(object = tc)
## 'data.frame': 891 obs. of 16 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ sex : chr "male" "female" "female" "female" ...
## $ age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ sibsp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ embarked : chr "S" "C" "S" "S" ...
## $ class : chr "Third" "First" "Third" "First" ...
## $ who : chr "man" "woman" "woman" "woman" ...
## $ adult_male : logi TRUE FALSE FALSE FALSE TRUE TRUE ...
## $ deck : chr NA "C" NA "C" ...
## $ embark_town: chr "Southampton" "Cherbourg" "Southampton" "Southampton" ...
## $ alive : chr "no" "yes" "yes" "yes" ...
## $ alone : logi FALSE FALSE TRUE FALSE TRUE TRUE ...
# Per-column descriptive statistics for the cleaned copy: quantiles and NA
# counts for numeric columns, length/class for character columns, and
# TRUE/FALSE tallies for logical columns.
summary(object = tc)
## X survived pclass sex
## Min. : 0.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:222.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :445.0 Median :0.0000 Median :3.000 Mode :character
## Mean :445.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:667.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :890.0 Max. :1.0000 Max. :3.000
##
## age sibsp parch fare
## Min. : 0.42 Min. :0.000 Min. :0.0000 Min. : 0.00
## 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 7.91
## Median :28.00 Median :0.000 Median :0.0000 Median : 14.45
## Mean :29.70 Mean :0.523 Mean :0.3816 Mean : 32.20
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.: 31.00
## Max. :80.00 Max. :8.000 Max. :6.0000 Max. :512.33
## NA's :177
## embarked class who adult_male
## Length:891 Length:891 Length:891 Mode :logical
## Class :character Class :character Class :character FALSE:354
## Mode :character Mode :character Mode :character TRUE :537
##
##
##
##
## deck embark_town alive alone
## Length:891 Length:891 Length:891 Mode :logical
## Class :character Class :character Class :character FALSE:354
## Mode :character Mode :character Mode :character TRUE :537
##
##
##
##
# Size of the cleaned copy, reported as (rows, columns).
c(nrow(tc), ncol(tc))
## [1] 891 16
# Number of NA cells in each column of the cleaned copy, as a named vector.
vapply(tc, function(column) sum(is.na(column)), numeric(1))
## X survived pclass sex age sibsp
## 0 0 0 0 177 0
## parch fare embarked class who adult_male
## 0 0 2 0 0 0
## deck embark_town alive alone
## 688 2 0 0
# Analysis Question 1:
# How many passengers belonged to each passenger class?
# One row per pclass value, with the passenger tally in column `n`.
count(titanic, pclass)
## pclass n
## 1 1 216
## 2 2 184
## 3 3 491
#Interpretation:
#This shows the number of passengers in 1st, 2nd, and 3rd class.
# Analysis Question 2:
# What is the distribution of male and female passengers?
# One row per sex, with the passenger tally in column `n`.
count(titanic, sex)
## sex n
## 1 female 314
## 2 male 577
#Interpretation:
#This shows whether there were more male or female passengers onboard.
# Analysis Question 3:
# What are the minimum, maximum, and average ages of passengers?
# Missing ages are skipped (na.rm = TRUE) so they do not turn every
# statistic into NA.
with(
  titanic,
  data.frame(
    Min_Age = min(age, na.rm = TRUE),
    Max_Age = max(age, na.rm = TRUE),
    Avg_Age = mean(age, na.rm = TRUE)
  )
)
## Min_Age Max_Age Avg_Age
## 1 0.42 80 29.69912
#Interpretation:
#This helps understand the age spread of passengers.
# Analysis Question 4:
# How many passengers survived and how many did not survive?
# One row per value of `survived`, with the passenger tally in column `n`.
count(titanic, survived)
## survived n
## 1 0 549
## 2 1 342
#Interpretation:
#0 = Did not survive, 1 = Survived
# Analysis Question 5:
# Which passenger class had the highest survival count?
# Tally passengers for every (pclass, survived) pair; the result is returned
# ungrouped.
summarise(
  group_by(titanic, pclass, survived),
  count = n(),
  .groups = "drop"
)
## # A tibble: 6 × 3
## pclass survived count
## <int> <int> <int>
## 1 1 0 80
## 2 1 1 136
## 3 2 0 97
## 4 2 1 87
## 5 3 0 372
## 6 3 1 119
#Interpretation:
#This shows how survival differed among passenger classes.
# Analysis Question 6:
# How does survival differ between male and female passengers?
# Tally passengers for every (sex, survived) pair. Converting to a tibble
# first keeps the tibble-style result and printout.
count(as_tibble(titanic), sex, survived, name = "count")
## # A tibble: 4 × 3
## sex survived count
## <chr> <int> <int>
## 1 female 0 81
## 2 female 1 233
## 3 male 0 468
## 4 male 1 109
#Interpretation:
#This helps compare male and female survival counts.
# Analysis Question 7:
# Create age groups (Child, Adult, Senior) and count passengers in each.
# findInterval() returns 0 for ages below 18, 1 for 18 up to (not including)
# 60, and 2 for 60 and over; adding 1 turns that into an index into the label
# vector. Missing ages stay NA through the lookup and are then labelled
# "Unknown".
titanic_age <- mutate(
  titanic,
  Age_Group = coalesce(
    c("Child", "Adult", "Senior")[findInterval(age, c(18, 60)) + 1L],
    "Unknown"
  )
)
count(titanic_age, Age_Group)
## Age_Group n
## 1 Adult 575
## 2 Child 113
## 3 Senior 26
## 4 Unknown 177
#Interpretation:
#Passengers are grouped into meaningful age categories.
# Analysis Question 8:
# Did females and children have higher survival rates?
#
# Each passenger is placed in exactly one category:
#   "Child"        - age recorded and under 18, regardless of sex
#   "Age Unknown"  - age missing, so child/adult status cannot be decided
#   "Adult Female" - age recorded, 18 or over, female
#   "Adult Male"   - age recorded, 18 or over, male
# Age is tested before sex. Testing sex first files every girl under the
# female label (leaving only boys as children), and a catch-all male branch
# counts men of unknown age as adults.
titanic_rule <- titanic %>%
  mutate(Category = case_when(
    age < 18 ~ "Child",
    is.na(age) ~ "Age Unknown",
    sex == "female" ~ "Adult Female",
    TRUE ~ "Adult Male"
  ))
# The question is about rates, so report the share of survivors per category
# next to the group size. `survived` is coded 0/1, so its mean is the
# proportion that survived.
titanic_rule %>%
  group_by(Category) %>%
  summarise(
    Passengers = n(),
    Survivors = sum(survived),
    Survival_Rate = mean(survived),
    .groups = "drop"
  ) %>%
  arrange(desc(Survival_Rate))
# NOTE: the table previously recorded here belonged to the old categorisation
# and no longer applies; re-run this chunk to regenerate the output.
#Interpretation:
#This checks whether women and children had better survival chances.
# Analysis Question 9:
# Did passengers who paid higher fares survive more often?
# Mean fare for non-survivors (0) and survivors (1); missing fares are
# skipped.
summarise(
  group_by(titanic, survived),
  Avg_Fare = mean(fare, na.rm = TRUE)
)
## # A tibble: 2 × 2
## survived Avg_Fare
## <int> <dbl>
## 1 0 22.1
## 2 1 48.4
#Interpretation:
#This compares average fare of survivors and non-survivors.
# Analysis Question 10:
# Which embarkation point had the highest number of passengers and survivors?
# Uses the cleaned copy `tc`, where blank port codes were recoded to NA, so
# passengers with no recorded port appear as a missing-value group instead of
# as a port named "".
tc %>%
  group_by(embarked, survived) %>%
  summarise(count = n(), .groups = "drop")
## # A tibble: 7 × 3
## embarked survived count
## <chr> <int> <int>
## 1 C 0 75
## 2 C 1 93
## 3 Q 0 47
## 4 Q 1 30
## 5 S 0 427
## 6 S 1 217
## 7 NA 1 2
#Interpretation:
#This compares survival patterns across embarkation points.
# Analysis Question 11:
# Which combination of class, gender, and age group had the lowest survival
# rate?
# Age bands: under 18 = Child, 18 up to (not including) 60 = Adult, 60 and
# over = Senior; passengers with no recorded age are labelled "Unknown".
titanic_risk <- titanic %>%
  mutate(Age_Group = case_when(
    age < 18 ~ "Child",
    age >= 18 & age < 60 ~ "Adult",
    age >= 60 ~ "Senior",
    TRUE ~ "Unknown"
  ))
# One row per (class, sex, age group) with its survival rate, sorted so the
# lowest rate comes first. `survived` is coded 0/1, so its mean is the
# proportion that survived. Ties on rate are broken by group size, largest
# first; `passengers` is kept so very small groups can be recognised as such.
titanic_risk %>%
  group_by(pclass, sex, Age_Group) %>%
  summarise(
    passengers = n(),
    survivors = sum(survived),
    survival_rate = mean(survived),
    .groups = "drop"
  ) %>%
  arrange(survival_rate, desc(passengers))
# NOTE: the table previously recorded here listed raw counts per outcome and
# no longer matches this code; re-run this chunk to regenerate the output.
#Interpretation:
#This helps identify the most vulnerable passenger group.
# Analysis Question 12:
# Among survivors, who paid the highest fares?
# Keep survivors only, narrow to the columns of interest, sort by fare from
# highest to lowest, and keep the first ten rows.
titanic %>%
  filter(survived == 1) %>%
  select(pclass, fare, sex, age) %>%
  arrange(desc(fare)) %>%
  slice_head(n = 10)
## pclass fare sex age
## 1 1 512.3292 female 35
## 2 1 512.3292 male 36
## 3 1 512.3292 male 35
## 4 1 263.0000 female 23
## 5 1 263.0000 female 24
## 6 1 262.3750 female 18
## 7 1 262.3750 female 21
## 8 1 247.5208 female 50
## 9 1 227.5250 female 42
## 10 1 227.5250 female 18
#Interpretation:
#This lists the top 10 survivors who paid the highest fares.
# Analysis Question 13:
# Create a bar plot showing total passengers in each class.
# pclass is wrapped in factor() so the axis shows three discrete classes
# rather than a continuous numeric scale.
ggplot(data = titanic, mapping = aes(x = factor(pclass))) +
  geom_bar(fill = "skyblue") +
  ggtitle("Passenger Count by Class") +
  xlab("Passenger Class") +
  ylab("Count")
#Interpretation:
#This bar chart shows which passenger class had the most passengers.
# Analysis Question 14:
# Create a bar plot comparing survival counts between male and female
# passengers.
# Bars are dodged so the two outcomes sit side by side within each sex.
ggplot(titanic) +
  geom_bar(
    mapping = aes(x = sex, fill = factor(survived)),
    position = "dodge"
  ) +
  ggtitle("Survival by Gender") +
  xlab("Gender") +
  ylab("Count") +
  labs(fill = "Survived")
#Interpretation:
#This bar chart compares male and female survival outcomes.
# Analysis Question 15:
# Survival by Passenger Class
# Side-by-side bars of passenger counts per outcome within each class.
ggplot(
  data = titanic,
  mapping = aes(x = factor(pclass), fill = factor(survived))
) +
  geom_bar(position = position_dodge()) +
  ggtitle("Survival by Passenger Class") +
  xlab("Passenger Class") +
  ylab("Count") +
  labs(fill = "Survived")
#Interpretation:
#This graph shows how survival differed across ticket classes.
# Analysis Question 16:
# A transport analyst compares survival across ports.
# Plots from the cleaned copy `tc` and drops passengers with no recorded
# port, so the chart contains only real ports; the raw data would add an
# unlabelled bar for the blank port code. The axis uses the full town names
# from `embark_town` instead of the one-letter codes.
tc %>%
  filter(!is.na(embark_town)) %>%
  ggplot(aes(x = embark_town, fill = factor(survived))) +
  geom_bar(position = "dodge") +
  labs(
    title = "Survival by Embarkation Port",
    x = "Port",
    y = "Count",
    fill = "Survived"
  )
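#Interpretation:
#Southampton (S) has the most passengers and the most survivors (217 of 644),
#but Cherbourg (C) is the only port where survivors (93) outnumber
#non-survivors (75). This chart does not show passenger class, so any link
#between a port's share of upper-class passengers and its survival needs a
#separate class-by-port comparison.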
# Analysis Question 17:
# A health analyst studies survival across age groups.
# Age bands: missing age = "Unknown", under 18 = "Child", 18 up to (not
# including) 60 = "Adult", 60 and over = "Senior". The missing-age test comes
# first so the comparisons below only decide rows with a recorded age.
titanic_age <- mutate(
  titanic,
  Age_Group = if_else(
    is.na(age),
    "Unknown",
    if_else(
      age < 18,
      "Child",
      if_else(age < 60, "Adult", "Senior")
    )
  )
)
# Side-by-side bars of passenger counts per outcome within each age group.
ggplot(
  data = titanic_age,
  mapping = aes(x = Age_Group, fill = factor(survived))
) +
  geom_bar(position = "dodge") +
  ggtitle("Survival by Age Group") +
  xlab("Age Group") +
  ylab("Count") +
  labs(fill = "Survived")
#Interpretation:
#Children show relatively better survival
#Adults show higher death counts
# Analysis Question 18:
# A demographic analyst studies gender distribution across classes.
# Grouped bar plot (gender vs class): one bar per sex within each passenger
# class.
ggplot(data = titanic, mapping = aes(x = factor(pclass), fill = sex)) +
  geom_bar(position = position_dodge()) +
  ggtitle("Gender Distribution by Class") +
  xlab("Class") +
  ylab("Count")
#Interpretation:
#3rd class has more male passengers
# Analysis Question 19:
# A behavioral analyst studies travel patterns.
# Family size is siblings/spouses (sibsp) plus parents/children (parch) plus
# one for the passenger, so a passenger with no relatives aboard has size 1.
titanic_family <- transform(titanic, Family_Size = sibsp + parch + 1)
# Number of passengers at each family size.
ggplot(data = titanic_family, mapping = aes(x = Family_Size)) +
  geom_bar(fill = "purple") +
  ggtitle("Family Size Distribution") +
  xlab("Family Size") +
  ylab("Count")
#Interpretation:
#Most passengers traveled alone or in small families
# Analysis Question 20:
# Survival Rate by Age Group (%)
# For each age group, the share of its passengers in each outcome.
# - count() tallies the (Age_Group, survived) pairs and returns ungrouped
#   data, so no regrouping message is emitted.
# - The share is scaled by 100 so the plotted values match the "Percentage"
#   axis label.
# - ungroup() prevents the per-group state from leaking into later steps.
titanic_age %>%
  count(Age_Group, survived) %>%
  group_by(Age_Group) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = Age_Group, y = percent, fill = factor(survived))) +
  geom_col(position = "dodge") +
  labs(
    title = "Survival Percentage by Age Group",
    x = "Age Group",
    y = "Percentage",
    fill = "Survived"
  )
#Interpretation:
#Children show higher survival percentage
# Analysis Question 21:
# Faceted Bar Plot: Class vs Gender vs Survival
# One panel per passenger class; within each panel, side-by-side bars of
# passenger counts per outcome for each sex.
# label_both puts the variable name on each panel strip (e.g. "pclass: 1")
# instead of a bare number, and the legend gets the same "Survived" title as
# the other survival plots.
ggplot(titanic, aes(x = sex, fill = factor(survived))) +
  geom_bar(position = "dodge") +
  facet_wrap(~pclass, labeller = label_both) +
  labs(
    title = "Class vs Gender vs Survival",
    x = "Gender",
    y = "Count",
    fill = "Survived"
  )
#Interpretation:
#Female survival higher across all classes
#3rd class males most affected