Load Required Libraries and Data
#Libraries
library(readr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(corrplot)
library(stringr)
library(randomForest)
library(pROC)
library(ROSE)
library(caret)
library(rmarkdown)
library(ggExtra)
library(heatmaply)
library(plotly)
library(GGally)
library(wordcloud)
#Read Data
train <- read_csv("Desktop/train.csv")
Preliminary Data Exploration
cat("\nSummary of the train data:\n")
##
## Summary of the train data:
print(summary(train))
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
cat("\nMissing data by variable:\n")
##
## Missing data by variable:
print(sapply(train, function(x) sum(is.na(x))))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
#Exploring Patterns in Missing Data: Missing data in Age vs Survival
ggplot(train, aes(x = is.na(Age), fill = as.factor(Survived))) +
geom_bar(position = "fill") +
labs(x = "Missing Age", y = "Proportion", fill = "Survived") +
theme_minimal()
#Investigating Outliers: Boxplot for Age
ggplot(train, aes(y = Age)) +
geom_boxplot() +
theme_minimal()
Dealing With Outliers
# Function to identify outliers
identify_outliers <- function(x) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
y <- ifelse(x < (qnt[1] - H), (qnt[1] - H),
ifelse(x > (qnt[2] + H), (qnt[2] + H), x))
y
}
# Apply the function to Fare and Age
train$Fare <- identify_outliers(train$Fare)
train$Age <- identify_outliers(train$Age)
Handling Missing Data
# Replace missing ages with median age
train$Age[is.na(train$Age)] <- median(train$Age, na.rm = TRUE)
# Replace missing embarked with the mode
most_common_embarked <- names(which.max(table(train$Embarked)))
train$Embarked[is.na(train$Embarked)] <- most_common_embarked
# Replace missing cabins with 'Unknown'
train$Cabin[is.na(train$Cabin)] <- 'Unknown'
# Print summaries after handling missing data
cat("\nSummary of the train data after handling missing data:\n")
##
## Summary of the train data after handling missing data:
print(summary(train))
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:22.00 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.30 Mean :0.523 Mean :0.3816
## 3rd Qu.:35.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :64.81 Max. :8.000 Max. :6.0000
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median :14.45 Mode :character Mode :character
## Mean :24.05
## 3rd Qu.:31.00
## Max. :65.63
cat("\nMissing data by variable after handling missing data:\n")
##
## Missing data by variable after handling missing data:
print(sapply(train, function(x) sum(is.na(x))))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
Exploratory Data Analysis
#Distribution by Sex
ggplot(train, aes(x = Sex, fill = as.factor(Survived))) +
geom_bar(position = "fill") +
labs(x = "Sex", y = "Proportion", fill = "Survived") +
theme_minimal()
#Age and Sex Distribution
ggplot(train, aes(Age)) +
geom_histogram(binwidth = 5, fill = 'blue', color = 'black') +
facet_grid(. ~ Sex) +
theme_minimal()
#Survival by Passenger Class
ggplot(train, aes(x = Pclass, fill = as.factor(Survived))) +
geom_bar(position = "fill") +
labs(x = "Passenger Class", y = "Percent", fill = "Survived") +
scale_fill_discrete(labels = c("No", "Yes")) +
theme_minimal()
Correlation Matrix and Heatmap for Numeric Columns
num.cols <- sapply(train, is.numeric)
cor.data <- cor(train[, num.cols], use = "pairwise.complete.obs")
corrplot(cor.data, method = "color")
Feature Engineering
#Drop PassengerId
train <- select(train, -PassengerId)
#Combine SIbSp and Parch into New Variable: Family Size
train <- mutate(train, FamilySize = SibSp + Parch)
#Check the Data after Preprocessing
cat("\nSummary of the train data after preprocessing:\n")
##
## Summary of the train data after preprocessing:
print(summary(train))
## Survived Pclass Name Sex
## Min. :0.0000 Min. :1.000 Length:891 Length:891
## 1st Qu.:0.0000 1st Qu.:2.000 Class :character Class :character
## Median :0.0000 Median :3.000 Mode :character Mode :character
## Mean :0.3838 Mean :2.309
## 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1.0000 Max. :3.000
## Age SibSp Parch Ticket
## Min. : 0.42 Min. :0.000 Min. :0.0000 Length:891
## 1st Qu.:22.00 1st Qu.:0.000 1st Qu.:0.0000 Class :character
## Median :28.00 Median :0.000 Median :0.0000 Mode :character
## Mean :29.30 Mean :0.523 Mean :0.3816
## 3rd Qu.:35.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :64.81 Max. :8.000 Max. :6.0000
## Fare Cabin Embarked FamilySize
## Min. : 0.00 Length:891 Length:891 Min. : 0.0000
## 1st Qu.: 7.91 Class :character Class :character 1st Qu.: 0.0000
## Median :14.45 Mode :character Mode :character Median : 0.0000
## Mean :24.05 Mean : 0.9046
## 3rd Qu.:31.00 3rd Qu.: 1.0000
## Max. :65.63 Max. :10.0000
Advanced Exploratory Data Analysis
#Investigating by Embarked
ggplot(train, aes(x = Embarked, fill = as.factor(Survived))) +
geom_bar(position = "fill") +
labs(x = "Embarked", y = "Proportion", fill = "Survived") +
theme_minimal()
#Histogram of Fare
ggplot(train, aes(Fare)) +
geom_histogram(fill = 'blue', color = 'black', binwidth = 5) +
theme_minimal()
#Boxplot for Fare
ggplot(train, aes(y = Fare)) +
geom_boxplot() +
theme_minimal()
#Bivariate Analysis: Age vs Survival
ggplot(train, aes(x = Age, y = as.factor(Survived))) +
geom_boxplot() +
theme_minimal()
#Pclass vs Survival
ggplot(train, aes(x = Pclass, y = as.factor(Survived))) +
geom_boxplot() +
theme_minimal()
Creating Meaningful Derived Features
#Deriving title from Passenger Names
train <- train %>%
mutate(Title = str_extract(Name, "\\b\\w+\\b\\."))
# Examine Title vs Survival
ggplot(train, aes(x = Title, fill = as.factor(Survived))) +
geom_bar(position = "fill") +
labs(x = "Title", y = "Proportion", fill = "Survived") +
theme_minimal()
#Survival by Sex, Stratified by Pclass
ggplot(train, aes(x = Sex, fill = as.factor(Survived))) +
geom_bar(position = "fill") +
facet_wrap(~Pclass) +
labs(x = "Sex", y = "Proportion", fill = "Survived") +
theme_minimal()
Examining Cabin
train <- train %>%
mutate(Deck = str_sub(Cabin, 1, 1))
# Display survival by Deck
ggplot(train, aes(x = Deck, fill = as.factor(Survived))) +
geom_bar(position = "fill") +
theme_minimal()
Creating Age Groups
train <- train %>%
mutate(AgeGroup = cut(Age, breaks = c(0, 18, 40, 65, Inf), labels = c('Child', 'Young Adult', 'Adult', 'Senior')))
# Display survival by Age Group
ggplot(train, aes(x = AgeGroup, fill = as.factor(Survived))) +
geom_bar(position = "fill") +
theme_minimal()
Ticket Analysis
ticket_counts <- table(train$Ticket)
train <- train %>%
mutate(TicketShared = ticket_counts[Ticket] > 1)
# Display survival by Ticket Shared
ggplot(train, aes(x = TicketShared, fill = as.factor(Survived))) +
geom_bar(position = "fill") +
theme_minimal()
# Scatterplot with Density
p <- ggplot(train, aes(x=Age, y=Fare)) +
geom_point(aes(color = as.factor(Survived)), alpha = 0.6) +
theme_minimal()
ggMarginal(p, type = "histogram")
# Faceted Plots
ggplot(train, aes(x=Age, y=Fare)) +
geom_point(aes(color=Survived)) +
facet_grid(Sex ~ Pclass) +
theme_minimal()
# Correlation Heatmap
num.cols <- sapply(train, is.numeric)
correlation <- cor(train[, num.cols], use = "pairwise.complete.obs")
heatmaply(correlation)
# Complex Plot with ggplot2
train %>%
group_by(Pclass, Survived) %>%
summarise(n=n()) %>%
mutate(freq=n/sum(n)) %>%
ggplot(aes(x=Pclass, y=n, fill=Survived)) +
geom_bar(stat="identity") +
geom_text(aes(label=paste0(sprintf("%.1f", freq*100),"%")), position=position_stack(0.5), size=4) +
theme_minimal()
# Interactive plots
p <- ggplot(data=train, aes(x=Age, y=Fare)) +
geom_point(aes(color=Survived), size=2, alpha=0.6) +
theme_minimal()
ggplotly(p)
# Violin plot for Age distribution per Sex and Survived status
ggplot(train, aes(x = Sex, y = Age, fill = as.factor(Survived))) +
geom_violin(scale = "width", trim = FALSE) +
theme_minimal()
# Box plot of Fare across different Embarked ports
ggplot(train, aes(x = Embarked, y = Fare, fill = Embarked)) +
geom_boxplot() +
theme_minimal()
# Density plot of Age across Survived status
ggplot(train, aes(x = Age, fill = as.factor(Survived))) +
geom_density(alpha = 0.5) +
theme_minimal()