Load Packages

library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)

Load Dataset

# Hard-coded absolute path to the Titanic CSV; adjust it to wherever
# titanic.csv lives when running this analysis on another machine.
titanic_path <- "C:/Users/usre/Downloads/Coursework 07/titanic.csv"
# Read the file into a data frame using read.csv() defaults.
titanic_data <- read.csv(titanic_path)

Data description:

Question 1

Calculate the survival rate for male and female passengers. Visualize the survival rates using a bar plot.

# Survival rate per sex: the mean of Survived within each Sex group,
# ignoring any missing Survived values.
survival_rate <- summarise(
  group_by(titanic_data, Sex),
  SurvivalRate = mean(Survived, na.rm = TRUE)
)

# One bar per sex; the y axis is formatted as a percentage.
ggplot(
  data = survival_rate,
  mapping = aes(x = Sex, y = SurvivalRate, fill = Sex)
) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Survival Rate by Gender",
    x = "Gender",
    y = "Survival Rate"
  ) +
  theme_minimal()

comment:

The survival rate for female passengers on the Titanic was 74.2%, while for male passengers it was 18.9%. The bar plot clearly shows that female passengers had a much higher survival rate than male passengers.

Question 2

Investigate the relationship between ticket class (Pclass) and survival. What is the survival rate for each passenger class? Visualize it using a stacked bar plot.

# Survival rate per ticket class: the mean of Survived within each Pclass,
# ignoring any missing Survived values.
survival_by_class <- titanic_data %>%
  group_by(Pclass) %>%
  summarise(SurvivalRate = mean(Survived, na.rm = TRUE))
# Print the table so the rates appear in the rendered report. view() only
# opens a data viewer in interactive sessions and shows nothing when knitted.
print(survival_by_class)
# Number of passengers for every (Pclass, Survived) combination; this is the
# input for the stacked bar plot.
# .groups = "drop" returns an ungrouped table and avoids the
# "summarise() has grouped output by 'Pclass'" message.
survival_counts <- titanic_data %>%
  group_by(Pclass, Survived) %>%
  summarise(Count = n(), .groups = "drop")
# Print rather than view() so the counts are visible in the rendered report.
print(survival_counts)
# Stacked bar plot. position = "fill" rescales every class's bar to the same
# height, so the segments show proportions rather than raw counts.
ggplot(
  data = survival_counts,
  mapping = aes(x = factor(Pclass), y = Count, fill = factor(Survived))
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Survival Rate by Passenger Class",
    x = "Passenger Class",
    y = "Proportion of Passengers",
    fill = "Survived"
  ) +
  theme_minimal()

Question 3

Analyze the age distribution of passengers on the Titanic. Compare the distribution for those who survived and those who did not using a boxplot.

# Missing values: count the passengers whose Age is NA.
sum(is.na(titanic_data[["Age"]]))
## [1] 177
# Keep only the passengers with a recorded age for the age-based analyses.
titanic_age_data <- filter(titanic_data, !is.na(Age))
# Five-number summary plus mean of the remaining ages.
summary(titanic_age_data[["Age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.42   20.12   28.00   29.70   38.00   80.00
# Boxplot of age by survival outcome, drawn from the rows that have a
# recorded Age.
ggplot(
  titanic_age_data,
  aes(x = factor(Survived), y = Age, fill = factor(Survived))
) +
  geom_boxplot() +
  labs(
    title = "Age Distribution of Titanic Passengers",
    x = "Survived (0 = No, 1 = Yes)",
    y = "Age",
    # Explicit legend title; without it the legend is headed by the raw
    # expression "factor(Survived)".
    fill = "Survived"
  ) +
  # Replace the 0/1 tick labels on the x axis with readable names.
  scale_x_discrete(labels = c("0" = "Did not Survive", "1" = "Survived")) +
  theme_minimal()

Question 4

Analyze how family size (number of siblings, spouses, parents, or children aboard) affects the likelihood of survival. Visualize the relationship between family size and survival rate using an appropriate plot.

# Family size = SibSp + Parch + 1 (the + 1 presumably counts the passenger
# themselves, so a lone traveller has size 1).
titanic_data <- mutate(titanic_data, FamilySize = SibSp + Parch + 1)
# Preview the first rows to confirm the new column is present.
head(titanic_data)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked FamilySize
## 1        A/5 21171  7.2500              S          2
## 2         PC 17599 71.2833   C85        C          2
## 3 STON/O2. 3101282  7.9250              S          1
## 4           113803 53.1000  C123        S          2
## 5           373450  8.0500              S          1
## 6           330877  8.4583              Q          1
# Survival rate for each family size: the mean of Survived per FamilySize
# value, ignoring any missing Survived values.
family_survival_rate <- summarise(
  group_by(titanic_data, FamilySize),
  SurvivalRate = mean(Survived, na.rm = TRUE)
)

# Line plot of survival rate against family size, with a point marking each
# observed family size.
ggplot(family_survival_rate, aes(x = FamilySize, y = SurvivalRate)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and raises a warning.
  geom_line(color = "blue", linewidth = 1) +
  # `size` is still the correct aesthetic for point diameter.
  geom_point(color = "red", size = 2) +
  labs(
    title = "Survival Rate by Family Size",
    x = "Family Size",
    y = "Survival Rate"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal()

Question 5

Visualize the age distribution of passengers based on their class (Pclass) and gender (Sex) using a facet grid plot.

# Age histograms in 5-year bins, arranged as a grid with one row per
# passenger class and one column per sex.
ggplot(data = titanic_age_data, mapping = aes(x = Age, fill = Sex)) +
  geom_histogram(binwidth = 5, color = "black", alpha = 0.7) +
  facet_grid(rows = vars(Pclass), cols = vars(Sex)) +
  theme_minimal() +
  labs(
    title = "Age Distribution of Passengers by Class and Gender",
    x = "Age",
    y = "Count"
  )

Question 6

Group passengers into age categories (children, adults, elderly) and calculate the survival rate for each age group. Visualize the survival rate using a bar plot. Hints: You may need to create a new column.

# Bucket passengers by age. case_when() evaluates the conditions in order, so
# each later branch only sees rows the earlier ones did not match:
#   up to 12 -> "Children", over 12 up to 64 -> "Adults", over 64 -> "Elderly".
# Rows with a missing Age match no condition and fall through to "Unknown".
titanic_data <- mutate(
  titanic_data,
  AgeGroup = case_when(
    Age <= 12 ~ "Children",
    Age <= 64 ~ "Adults",
    Age > 64 ~ "Elderly",
    .default = "Unknown"
  )
)

# Survival rate per age group: the mean of Survived within each AgeGroup,
# ignoring any missing Survived values.
survival_by_age_group <- summarise(
  group_by(titanic_data, AgeGroup),
  SurvivalRate = mean(Survived, na.rm = TRUE)
)

# Bar plot with one bar per age group; the y axis is shown as a percentage.
ggplot(
  data = survival_by_age_group,
  mapping = aes(x = AgeGroup, y = SurvivalRate, fill = AgeGroup)
) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Survival Rate by Age Group",
    x = "Age Group",
    y = "Survival Rate"
  ) +
  theme_minimal()

Question 7

Perform a chi-square test to determine if there is a significant association between gender (Sex) and survival.

# Cross-tabulate sex against survival, then test the two variables for
# independence with Pearson's chi-squared test.
contingency_table <- table(titanic_data[["Sex"]], titanic_data[["Survived"]])
chi_square_test <- chisq.test(x = contingency_table)
print(chi_square_test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency_table
## X-squared = 260.72, df = 1, p-value < 2.2e-16

comment: The test result (X-squared = 260.72, p-value < 2.2e-16) indicates a significant association between gender and survival. This suggests that gender significantly influenced survival chances, with women being more likely to survive.

Question 8

Analyze the correlation between fare and survival. Use scatter plots to visualize the relationships and calculate correlation coefficients. Hints: Use cor.test() to test the significance of the relationship.

# Pearson correlation between fare and the survival indicator. With a 0/1
# outcome this is the point-biserial correlation.
correlation_test <- cor.test(
  titanic_data$Fare,
  titanic_data$Survived,
  method = "pearson"
)
# Print the result so the coefficient and p-value appear in the report;
# the assignment on its own produces no output.
print(correlation_test)

# Scatter plot of survival against fare with a least-squares trend line.
# The formula is given explicitly so geom_smooth() does not emit its
# "using formula = 'y ~ x'" message.
ggplot(titanic_data, aes(x = Fare, y = Survived)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
  labs(
    title = "Scatter Plot of Fare vs. Survival",
    x = "Fare",
    y = "Survival (0 = No, 1 = Yes)"
  ) +
  theme_minimal()

comment: The Pearson correlation shows a weak positive correlation (r = 0.2573) between fare and survival on the Titanic, meaning passengers who paid higher fares had a slightly better chance of survival. The p-value (6.12e-15) indicates this result is statistically significant.