Load Required Libraries

# Load essential packages
library(ggplot2)
library(titanic)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Dataset Overview

# Make the training split bundled with the titanic package available
data(titanic_train)

# Preview the first six passengers
head(titanic_train, n = 6L)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q

1. Descriptive Statistics

1.1 Structure and Summary

# Compact overview of the data frame: each column's type and first values
str(titanic_train)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
# Per-column summary: quantiles and mean for numeric columns, length and
# class for character columns; NA counts are reported where present
summary(titanic_train)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 

1.2 Dimensions of the Dataset

# Report the dataset dimensions. sep = "" stops cat() from inserting its
# default separator after the label (which already ends in a space), and
# each call ends with "\n" so later output starts on a fresh line.
cat("Number of Rows: ", nrow(titanic_train), "\n", sep = "")
## Number of Rows: 891
cat("Number of Columns: ", ncol(titanic_train), "\n", sep = "")
## Number of Columns: 12

1.3 Basic Statistical Measures

# Mean, median and standard deviation of passenger age.
# na.rm = TRUE drops missing ages before each statistic is computed.
summarise(
  titanic_train,
  Mean_Age = mean(Age, na.rm = TRUE),
  Median_Age = median(Age, na.rm = TRUE),
  SD_Age = sd(Age, na.rm = TRUE)
)
##   Mean_Age Median_Age  SD_Age
## 1 29.69912         28 14.5265

2. Data Visualization

Visual exploration helps identify patterns and relationships among variables.

2.1 Histogram of Passenger Ages

# Base-graphics histogram of passenger age
hist(
  titanic_train[["Age"]],
  xlab = "Age",
  main = "Age Distribution of Titanic Passengers",
  border = "black",
  col = "lightblue"
)

2.2 Boxplot

# Side-by-side boxplots of age, one box per value of Sex; the two fill
# colours are applied to the boxes in plotting order
boxplot(
  Age ~ Sex,
  data = titanic_train,
  col = c("pink", "lightblue"),
  xlab = "Sex",
  ylab = "Age",
  main = "Age Distribution by Sex"
)

2.3 Scatter Plot

# Scatter plot of fare (y) against age (x), drawn with solid dark-green
# points (pch = 19)
plot(
  x = titanic_train$Age,
  y = titanic_train$Fare,
  pch = 19,
  col = "darkgreen",
  xlab = "Age",
  ylab = "Fare",
  main = "Scatter Plot of Age vs Fare"
)

2.4 Pair Plot

# Numeric columns compared in the scatter-plot matrix
titanic_num <- titanic_train[c("Age", "Fare", "Pclass", "Survived")]

# Point fill encodes survival: Survived = 0 -> "red", Survived = 1 -> "green3"
pairs(
  titanic_num,
  pch = 21,
  bg = c("red", "green3")[factor(titanic_train$Survived)],
  main = "Pair Plot of Titanic Numeric Features"
)

2.5 Density Plot

# Kernel density of passenger age, split by survival status.
# Age is missing for 177 passengers; na.rm = TRUE drops those rows
# explicitly so the plot renders without the "Removed 177 rows" warning.
ggplot(titanic_train, aes(x = Age, fill = as.factor(Survived))) +
  geom_density(alpha = 0.6, na.rm = TRUE) +
  labs(
    title = "Density Plot of Passenger Age by Survival Status",
    x = "Age",
    y = "Density",
    fill = "Survived"
  ) +
  theme_minimal()