Introduction

This project analyzes the Titanic dataset to understand survival patterns among passengers. It focuses on factors such as passenger class, gender, age, fare, and embarkation point. The objective is to extract meaningful insights using statistical summaries and visualizations. This analysis helps in understanding how different variables influenced survival chances.

#Libraries required
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the raw passenger records from the working directory. Text columns are
# kept as plain character vectors rather than factors.
titanic <- read.csv(file = "Titanic.csv", stringsAsFactors = FALSE)
# Preview the first six rows
head(titanic, n = 6L)
##   X survived pclass    sex age sibsp parch    fare embarked class   who
## 1 0        0      3   male  22     1     0  7.2500        S Third   man
## 2 1        1      1 female  38     1     0 71.2833        C First woman
## 3 2        1      3 female  26     0     0  7.9250        S Third woman
## 4 3        1      1 female  35     1     0 53.1000        S First woman
## 5 4        0      3   male  35     0     0  8.0500        S Third   man
## 6 5        0      3   male  NA     0     0  8.4583        Q Third   man
##   adult_male deck embark_town alive alone
## 1       TRUE      Southampton    no FALSE
## 2      FALSE    C   Cherbourg   yes FALSE
## 3      FALSE      Southampton   yes  TRUE
## 4      FALSE    C Southampton   yes FALSE
## 5       TRUE      Southampton    no  TRUE
## 6       TRUE       Queenstown    no  TRUE
# Work on a copy so the raw import in `titanic` stays untouched
tc <- titanic
# Treat empty or whitespace-only text as missing. Only character columns are
# inspected, and trimws() makes the check catch blanks of any length instead of
# just "" and " ". Non-blank values are left exactly as they were read.
tc[] <- lapply(tc, function(column) {
  if (is.character(column)) {
    column[!is.na(column) & trimws(column) == ""] <- NA_character_
  }
  column
})
# Compact overview of the cleaned copy: column names, types and first values
str(object = tc)
## 'data.frame':    891 obs. of  16 variables:
##  $ X          : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ sex        : chr  "male" "female" "female" "female" ...
##  $ age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ sibsp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ embarked   : chr  "S" "C" "S" "S" ...
##  $ class      : chr  "Third" "First" "Third" "First" ...
##  $ who        : chr  "man" "woman" "woman" "woman" ...
##  $ adult_male : logi  TRUE FALSE FALSE FALSE TRUE TRUE ...
##  $ deck       : chr  NA "C" NA "C" ...
##  $ embark_town: chr  "Southampton" "Cherbourg" "Southampton" "Southampton" ...
##  $ alive      : chr  "no" "yes" "yes" "yes" ...
##  $ alone      : logi  FALSE FALSE TRUE FALSE TRUE TRUE ...
# Per-column summary of the cleaned copy (quartiles for numeric columns,
# TRUE/FALSE tallies for logical columns, length/class for text columns)
summary(object = tc)
##        X            survived          pclass          sex           
##  Min.   :  0.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:222.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :445.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :445.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:667.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :890.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##       age            sibsp           parch             fare       
##  Min.   : 0.42   Min.   :0.000   Min.   :0.0000   Min.   :  0.00  
##  1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:  7.91  
##  Median :28.00   Median :0.000   Median :0.0000   Median : 14.45  
##  Mean   :29.70   Mean   :0.523   Mean   :0.3816   Mean   : 32.20  
##  3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.: 31.00  
##  Max.   :80.00   Max.   :8.000   Max.   :6.0000   Max.   :512.33  
##  NA's   :177                                                      
##    embarked            class               who            adult_male     
##  Length:891         Length:891         Length:891         Mode :logical  
##  Class :character   Class :character   Class :character   FALSE:354      
##  Mode  :character   Mode  :character   Mode  :character   TRUE :537      
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##      deck           embark_town           alive             alone        
##  Length:891         Length:891         Length:891         Mode :logical  
##  Class :character   Class :character   Class :character   FALSE:354      
##  Mode  :character   Mode  :character   Mode  :character   TRUE :537      
##                                                                          
##                                                                          
##                                                                          
## 
# Number of rows and columns in the cleaned copy
dim(x = tc)
## [1] 891  16
# Count the missing (NA) entries in every column of the cleaned copy
vapply(tc, function(column) sum(is.na(column)), numeric(1))
##           X    survived      pclass         sex         age       sibsp 
##           0           0           0           0         177           0 
##       parch        fare    embarked       class         who  adult_male 
##           0           0           2           0           0           0 
##        deck embark_town       alive       alone 
##         688           2           0           0
# Analysis Question 1:
#   How many passengers belonged to each passenger class?
# Produces one row per value of `pclass` with the passenger tally in `n`.
count(titanic, pclass)
##   pclass   n
## 1      1 216
## 2      2 184
## 3      3 491
#Interpretation:
  #Third class was by far the largest group (491 passengers), followed by first class (216) and second class (184).
# Analysis Question 2:
#   What is the distribution of male and female passengers?
# Produces one row per value of `sex` with the passenger tally in `n`.
count(titanic, sex)
##      sex   n
## 1 female 314
## 2   male 577
#Interpretation:
  #There were considerably more male passengers (577) than female passengers (314) onboard.
# Analysis Question 3:
#   What are the minimum, maximum, and average ages of passengers?
# Missing ages are excluded from all three statistics (na.rm = TRUE).
summarise(
  titanic,
  Min_Age = min(age, na.rm = TRUE),
  Max_Age = max(age, na.rm = TRUE),
  Avg_Age = mean(age, na.rm = TRUE)
)
##   Min_Age Max_Age  Avg_Age
## 1    0.42      80 29.69912
#Interpretation:
  #This helps understand the age spread of passengers.
# Analysis Question 4:
#   How many passengers survived and how many did not survive?
# Produces one row per value of `survived` with the passenger tally in `n`.
count(titanic, survived)
##   survived   n
## 1        0 549
## 2        1 342
#Interpretation:
  #0 = Did not survive, 1 = Survived
# Analysis Question 5:
#   Which passenger class had the highest survival count?
# Tallies passengers for every (class, survival outcome) pair and returns an
# ungrouped tibble.
summarise(
  group_by(titanic, pclass, survived),
  count = n(),
  .groups = "drop"
)
## # A tibble: 6 × 3
##   pclass survived count
##    <int>    <int> <int>
## 1      1        0    80
## 2      1        1   136
## 3      2        0    97
## 4      2        1    87
## 5      3        0   372
## 6      3        1   119
#Interpretation:
  #First class had the highest survival count (136), ahead of third class (119) and second class (87); third class had by far the most deaths (372).
# Analysis Question 6:
#   How does survival differ between male and female passengers?
# Tallies passengers for every (sex, survival outcome) pair and returns an
# ungrouped tibble.
summarise(
  group_by(titanic, sex, survived),
  count = n(),
  .groups = "drop"
)
## # A tibble: 4 × 3
##   sex    survived count
##   <chr>     <int> <int>
## 1 female        0    81
## 2 female        1   233
## 3 male          0   468
## 4 male          1   109
#Interpretation:
  #This helps compare male and female survival counts.
# Analysis Question 7:
#   Create age groups (Child, Adult, Senior) and count passengers in each.
# Missing ages are tested first and labelled "Unknown". The remaining
# conditions are evaluated in order, so each upper bound implies the lower one:
# under 18 is "Child", 18 up to (not including) 60 is "Adult", 60+ is "Senior".
titanic_age <- mutate(
  titanic,
  Age_Group = case_when(
    is.na(age) ~ "Unknown",
    age < 18 ~ "Child",
    age < 60 ~ "Adult",
    TRUE ~ "Senior"
  )
)

# Passenger tally per age group
count(titanic_age, Age_Group)
##   Age_Group   n
## 1     Adult 575
## 2     Child 113
## 3    Senior  26
## 4   Unknown 177
 #Interpretation:
  #Passengers are grouped into meaningful age categories.
# Analysis Question 8:
#   Did females and children have higher survival rates?
# Conditions are checked in order: every female is labelled "Female"
# (whatever her age), the remaining passengers under 18 are "Child", and
# everyone else is "Adult Male".
# NOTE(review): for a missing age, `age < 18` is NA, so males of unknown age
# fall through to "Adult Male" -- confirm that this is intended.
titanic_rule <- mutate(
  titanic,
  Category = case_when(
    sex == "female" ~ "Female",
    age < 18 ~ "Child",
    TRUE ~ "Adult Male"
  )
)

# Passenger tally for every (category, survival outcome) pair
summarise(
  group_by(titanic_rule, Category, survived),
  Count = n(),
  .groups = "drop"
)
## # A tibble: 6 × 3
##   Category   survived Count
##   <chr>         <int> <int>
## 1 Adult Male        0   433
## 2 Adult Male        1    86
## 3 Child             0    35
## 4 Child             1    23
## 5 Female            0    81
## 6 Female            1   233
#Interpretation:
  #This checks whether women and children had better survival chances.
# Analysis Question 9:
#   Did passengers who paid higher fares survive more often?
# Mean fare per survival outcome; missing fares are ignored.
summarise(
  group_by(titanic, survived),
  Avg_Fare = mean(fare, na.rm = TRUE)
)
## # A tibble: 2 × 2
##   survived Avg_Fare
##      <int>    <dbl>
## 1        0     22.1
## 2        1     48.4
#Interpretation:
  #This compares average fare of survivors and non-survivors.
# Analysis Question 10:
#   Which embarkation point had the highest number of passengers and survivors?
# Two records have no recorded port (an empty string in the raw data). They are
# excluded so that every row of the table corresponds to a real port.
titanic %>%
  filter(!is.na(embarked), embarked != "") %>%
  group_by(embarked, survived) %>%
  summarise(count = n(), .groups = "drop")
## # A tibble: 6 × 3
##   embarked survived count
##   <chr>       <int> <int>
## 1 C               0    75
## 2 C               1    93
## 3 Q               0    47
## 4 Q               1    30
## 5 S               0   427
## 6 S               1   217
#Interpretation:
  #This compares survival patterns across embarkation points.
# Analysis Question 11:
#   Which combination of class, gender, and age group had the lowest survival rate?
# Age bands: missing age is "Unknown", under 18 is "Child", 18 up to (not
# including) 60 is "Adult", 60 and over is "Senior".
titanic_risk <- mutate(
  titanic,
  Age_Group = case_when(
    is.na(age) ~ "Unknown",
    age < 18 ~ "Child",
    age < 60 ~ "Adult",
    TRUE ~ "Senior"
  )
)

# Passenger tally for every (class, sex, age group, survival outcome) pair.
# NOTE(review): this yields counts, not survival rates, so the lowest-rate
# group still has to be worked out from the table.
summarise(
  group_by(titanic_risk, pclass, sex, Age_Group, survived),
  count = n(),
  .groups = "drop"
)
## # A tibble: 39 × 5
##    pclass sex    Age_Group survived count
##     <int> <chr>  <chr>        <int> <int>
##  1      1 female Adult            0     2
##  2      1 female Adult            1    72
##  3      1 female Child            0     1
##  4      1 female Child            1     7
##  5      1 female Senior           1     3
##  6      1 female Unknown          1     9
##  7      1 male   Adult            0    49
##  8      1 male   Adult            1    34
##  9      1 male   Child            1     4
## 10      1 male   Senior           0    12
## # ℹ 29 more rows
#Interpretation:
  #This helps identify the most vulnerable passenger group.
# Analysis Question 12:
#   Among survivors, who paid the highest fares?
# Keep survivors only, reduce to the four columns of interest, order by fare
# from highest to lowest, and show the first ten rows.
titanic %>%
  filter(survived == 1) %>%
  select(pclass, fare, sex, age) %>%
  arrange(desc(fare)) %>%
  slice_head(n = 10)
##    pclass     fare    sex age
## 1       1 512.3292 female  35
## 2       1 512.3292   male  36
## 3       1 512.3292   male  35
## 4       1 263.0000 female  23
## 5       1 263.0000 female  24
## 6       1 262.3750 female  18
## 7       1 262.3750 female  21
## 8       1 247.5208 female  50
## 9       1 227.5250 female  42
## 10      1 227.5250 female  18
#Interpretation:
  #This lists the top 10 survivors who paid the highest fares.
# Analysis Question 13:
#   Create a bar plot showing total passengers in each class.
# `pclass` is converted to a factor so each class gets its own discrete bar.
ggplot(data = titanic, mapping = aes(x = factor(pclass))) +
  geom_bar(fill = "skyblue") +
  ggtitle("Passenger Count by Class") +
  xlab("Passenger Class") +
  ylab("Count")

#Interpretation:
  #This bar chart shows which passenger class had the most passengers.
# Analysis Question 14:
#   Create a bar plot comparing survival counts between male and female passengers.
# Bars for the two survival outcomes are drawn side by side within each gender.
ggplot(data = titanic, mapping = aes(x = sex, fill = factor(survived))) +
  geom_bar(position = position_dodge()) +
  ggtitle("Survival by Gender") +
  labs(x = "Gender", y = "Count", fill = "Survived")

#Interpretation:
  #This bar chart compares male and female survival outcomes.
# Analysis Question 15:
#   Survival by Passenger Class
# Bars for the two survival outcomes are drawn side by side within each class.
ggplot(data = titanic) +
  geom_bar(
    mapping = aes(x = factor(pclass), fill = factor(survived)),
    position = "dodge"
  ) +
  ggtitle("Survival by Passenger Class") +
  labs(x = "Passenger Class", y = "Count", fill = "Survived")

#Interpretation:
  #This graph shows how survival differed across ticket classes.
# Analysis Question 16:
#   A transport analyst compares survival across ports.
# Rows with no recorded port (empty string or NA) are dropped first; otherwise
# they are drawn as an extra bar with a blank axis label.
titanic %>%
  filter(!is.na(embarked), embarked != "") %>%
  ggplot(aes(x = embarked, fill = factor(survived))) +
  geom_bar(position = "dodge") +
  labs(title = "Survival by Embarkation Port",
       x = "Port",
       y = "Count",
       fill = "Survived")

#Interpretation:
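  #Southampton (S) had the most passengers and the most survivors (217), but Cherbourg (C) is the only port where survivors (93) outnumber non-survivors (75). This may reflect a larger share of upper-class passengers boarding there, which this plot alone does not show.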
# Analysis Question 17:
#   A health analyst studies survival across age groups.
# Reuses `titanic_age` (the passenger data plus its Age_Group column) built for
# Analysis Question 7 instead of recomputing the identical age bands here.
ggplot(titanic_age, aes(x = Age_Group, fill = factor(survived))) +
  geom_bar(position = "dodge") +
  labs(title = "Survival by Age Group",
       x = "Age Group",
       y = "Count",
       fill = "Survived")

#Interpretation:
#Children show relatively better survival
#Adults show higher death counts
# Grouped Bar Plot: Gender vs Class
# Analysis Question 18
#   A demographic analyst studies gender distribution across classes.
# The legend title is set explicitly; otherwise it shows the raw column name.
ggplot(titanic, aes(x = factor(pclass), fill = sex)) +
  geom_bar(position = "dodge") +
  labs(title = "Gender Distribution by Class",
       x = "Class",
       y = "Count",
       fill = "Gender")

#Interpretation:
  #3rd class has more male passengers
# Analysis Question 19:
#   A behavioral analyst studies travel patterns.
# Family size = siblings/spouses + parents/children + the passenger themselves.
titanic_family <- mutate(titanic, Family_Size = sibsp + parch + 1)

# One bar per family size, showing how many passengers fall in each
ggplot(data = titanic_family, mapping = aes(x = Family_Size)) +
  geom_bar(fill = "purple") +
  ggtitle("Family Size Distribution") +
  labs(x = "Family Size", y = "Count")

#Interpretation:
  #Most passengers traveled alone or in small families
# Analysis Question 20:
#   Survival Rate by Age Group (%)
# Count passengers per (age group, outcome), then express each count as a
# percentage of its age group's total.
titanic_age %>%
  group_by(Age_Group, survived) %>%
  # "drop_last" keeps the result grouped by Age_Group, so sum(n) below is the
  # per-age-group total, and it silences the regrouping message
  summarise(n = n(), .groups = "drop_last") %>%
  # Scale to 0-100 so the values match the "Percentage" axis label
  mutate(percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = Age_Group, y = percent, fill = factor(survived))) +
  geom_col(position = "dodge") +
  labs(title = "Survival Percentage by Age Group",
       x = "Age Group",
       y = "Percentage",
       fill = "Survived")

#Interpretation:
  #Children show higher survival percentage
# Analysis Question 21:
#   Faceted Bar Plot: Class vs Gender vs Survival
# One panel per passenger class; within each panel the two survival outcomes
# are drawn side by side for each gender. The legend title is set explicitly so
# it does not display the raw expression `factor(survived)`.
ggplot(titanic, aes(x = sex, fill = factor(survived))) +
  geom_bar(position = "dodge") +
  facet_wrap(~pclass) +
  labs(title = "Class vs Gender vs Survival",
       x = "Gender",
       y = "Count",
       fill = "Survived")

#Interpretation:
#Female survival higher across all classes
#3rd class males most affected