Load Required Libraries and Data

#Libraries
library(readr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(corrplot)
library(stringr)
library(randomForest)
library(pROC)
library(ROSE)
library(caret)
library(rmarkdown)
library(ggExtra)
library(heatmaply)
library(plotly)
library(GGally)
library(wordcloud)
#Read Data
train <- read_csv("Desktop/train.csv")

Preliminary Data Exploration

cat("\nSummary of the train data:\n")
## 
## Summary of the train data:
print(summary(train))
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
cat("\nMissing data by variable:\n")
## 
## Missing data by variable:
print(sapply(train, function(x) sum(is.na(x))))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2
#Exploring Patterns in Missing Data: Missing data in Age vs Survival
ggplot(train, aes(x = is.na(Age), fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  labs(x = "Missing Age", y = "Proportion", fill = "Survived") +
  theme_minimal()

#Investigating Outliers: Boxplot for Age
ggplot(train, aes(y = Age)) +
  geom_boxplot() +
  theme_minimal()

Dealing With Outliers

# Function to identify outliers
identify_outliers <- function(x) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
  H <- 1.5 * IQR(x, na.rm = T)
  y <- ifelse(x < (qnt[1] - H), (qnt[1] - H), 
              ifelse(x > (qnt[2] + H), (qnt[2] + H), x))
  y
}

# Apply the function to Fare and Age
train$Fare <- identify_outliers(train$Fare)
train$Age <- identify_outliers(train$Age)

Handling Missing Data

# Replace missing ages with median age
train$Age[is.na(train$Age)] <- median(train$Age, na.rm = TRUE)
# Replace missing embarked with the mode
most_common_embarked <- names(which.max(table(train$Embarked)))
train$Embarked[is.na(train$Embarked)] <- most_common_embarked

# Replace missing cabins with 'Unknown'
train$Cabin[is.na(train$Cabin)] <- 'Unknown'
# Print summaries after handling missing data
cat("\nSummary of the train data after handling missing data:\n")
## 
## Summary of the train data after handling missing data:
print(summary(train))
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:22.00   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.30   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:35.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :64.81   Max.   :8.000   Max.   :6.0000  
##     Ticket               Fare          Cabin             Embarked        
##  Length:891         Min.   : 0.00   Length:891         Length:891        
##  Class :character   1st Qu.: 7.91   Class :character   Class :character  
##  Mode  :character   Median :14.45   Mode  :character   Mode  :character  
##                     Mean   :24.05                                        
##                     3rd Qu.:31.00                                        
##                     Max.   :65.63
cat("\nMissing data by variable after handling missing data:\n")
## 
## Missing data by variable after handling missing data:
print(sapply(train, function(x) sum(is.na(x))))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0

Exploratory Data Analysis

#Distribution by Sex
ggplot(train, aes(x = Sex, fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  labs(x = "Sex", y = "Proportion", fill = "Survived") +
  theme_minimal()

#Age and Sex Distribution
ggplot(train, aes(Age)) + 
  geom_histogram(binwidth = 5, fill = 'blue', color = 'black') + 
  facet_grid(. ~ Sex) + 
  theme_minimal()

#Survival by Passenger Class
ggplot(train, aes(x = Pclass, fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  labs(x = "Passenger Class", y = "Percent", fill = "Survived") +
  scale_fill_discrete(labels = c("No", "Yes")) +
  theme_minimal()

Correlation Matrix and Heatmap for Numeric Columns

num.cols <- sapply(train, is.numeric)
cor.data <- cor(train[, num.cols], use = "pairwise.complete.obs")
corrplot(cor.data, method = "color")

Feature Engineering

#Drop PassengerId
train <- select(train, -PassengerId)
#Combine SIbSp and Parch into New Variable: Family Size
train <- mutate(train, FamilySize = SibSp + Parch)
#Check the Data after Preprocessing
cat("\nSummary of the train data after preprocessing:\n")
## 
## Summary of the train data after preprocessing:
print(summary(train))
##     Survived          Pclass          Name               Sex           
##  Min.   :0.0000   Min.   :1.000   Length:891         Length:891        
##  1st Qu.:0.0000   1st Qu.:2.000   Class :character   Class :character  
##  Median :0.0000   Median :3.000   Mode  :character   Mode  :character  
##  Mean   :0.3838   Mean   :2.309                                        
##  3rd Qu.:1.0000   3rd Qu.:3.000                                        
##  Max.   :1.0000   Max.   :3.000                                        
##       Age            SibSp           Parch           Ticket         
##  Min.   : 0.42   Min.   :0.000   Min.   :0.0000   Length:891        
##  1st Qu.:22.00   1st Qu.:0.000   1st Qu.:0.0000   Class :character  
##  Median :28.00   Median :0.000   Median :0.0000   Mode  :character  
##  Mean   :29.30   Mean   :0.523   Mean   :0.3816                     
##  3rd Qu.:35.00   3rd Qu.:1.000   3rd Qu.:0.0000                     
##  Max.   :64.81   Max.   :8.000   Max.   :6.0000                     
##       Fare          Cabin             Embarked           FamilySize     
##  Min.   : 0.00   Length:891         Length:891         Min.   : 0.0000  
##  1st Qu.: 7.91   Class :character   Class :character   1st Qu.: 0.0000  
##  Median :14.45   Mode  :character   Mode  :character   Median : 0.0000  
##  Mean   :24.05                                         Mean   : 0.9046  
##  3rd Qu.:31.00                                         3rd Qu.: 1.0000  
##  Max.   :65.63                                         Max.   :10.0000

Advanced Exploratory Data Analysis

#Investigating by Embarked
ggplot(train, aes(x = Embarked, fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  labs(x = "Embarked", y = "Proportion", fill = "Survived") +
  theme_minimal()

#Histogram of Fare
ggplot(train, aes(Fare)) +
  geom_histogram(fill = 'blue', color = 'black', binwidth = 5) +
  theme_minimal()

#Boxplot for Fare
ggplot(train, aes(y = Fare)) +
  geom_boxplot() +
  theme_minimal()

#Bivariate Analysis: Age vs Survival
ggplot(train, aes(x = Age, y = as.factor(Survived))) +
  geom_boxplot() +
  theme_minimal()

#Pclass vs Survival
ggplot(train, aes(x = Pclass, y = as.factor(Survived))) +
  geom_boxplot() +
  theme_minimal()

Creating Meaningful Derived Features

#Deriving title from Passenger Names
train <- train %>%
  mutate(Title = str_extract(Name, "\\b\\w+\\b\\."))
# Examine Title vs Survival
ggplot(train, aes(x = Title, fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  labs(x = "Title", y = "Proportion", fill = "Survived") +
  theme_minimal()

#Survival by Sex, Stratified by Pclass
ggplot(train, aes(x = Sex, fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  facet_wrap(~Pclass) +
  labs(x = "Sex", y = "Proportion", fill = "Survived") +
  theme_minimal()

Examining Cabin

train <- train %>%
  mutate(Deck = str_sub(Cabin, 1, 1))  

# Display survival by Deck
ggplot(train, aes(x = Deck, fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  theme_minimal()

Creating Age Groups

train <- train %>%
  mutate(AgeGroup = cut(Age, breaks = c(0, 18, 40, 65, Inf), labels = c('Child', 'Young Adult', 'Adult', 'Senior')))

# Display survival by Age Group
ggplot(train, aes(x = AgeGroup, fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  theme_minimal()

Ticket Analysis

ticket_counts <- table(train$Ticket)
train <- train %>%
  mutate(TicketShared = ticket_counts[Ticket] > 1)

# Display survival by Ticket Shared
ggplot(train, aes(x = TicketShared, fill = as.factor(Survived))) +
  geom_bar(position = "fill") +
  theme_minimal()

# Scatterplot with Density
p <- ggplot(train, aes(x=Age, y=Fare)) + 
  geom_point(aes(color = as.factor(Survived)), alpha = 0.6) +
  theme_minimal()
ggMarginal(p, type = "histogram")

# Faceted Plots
ggplot(train, aes(x=Age, y=Fare)) +
  geom_point(aes(color=Survived)) +
  facet_grid(Sex ~ Pclass) +
  theme_minimal()

# Correlation Heatmap
num.cols <- sapply(train, is.numeric)
correlation <- cor(train[, num.cols], use = "pairwise.complete.obs")
heatmaply(correlation)
# Complex Plot with ggplot2
train %>%
  group_by(Pclass, Survived) %>%
  summarise(n=n()) %>%
  mutate(freq=n/sum(n)) %>%
  ggplot(aes(x=Pclass, y=n, fill=Survived)) +
  geom_bar(stat="identity") +
  geom_text(aes(label=paste0(sprintf("%.1f", freq*100),"%")), position=position_stack(0.5), size=4) +
  theme_minimal()

# Interactive plots
p <- ggplot(data=train, aes(x=Age, y=Fare)) +
  geom_point(aes(color=Survived), size=2, alpha=0.6) +
  theme_minimal()
ggplotly(p)
# Violin plot for Age distribution per Sex and Survived status
ggplot(train, aes(x = Sex, y = Age, fill = as.factor(Survived))) +
  geom_violin(scale = "width", trim = FALSE) +
  theme_minimal()

# Box plot of Fare across different Embarked ports
ggplot(train, aes(x = Embarked, y = Fare, fill = Embarked)) +
  geom_boxplot() +
  theme_minimal()

# Density plot of Age across Survived status
ggplot(train, aes(x = Age, fill = as.factor(Survived))) +
  geom_density(alpha = 0.5) +
  theme_minimal()