# Load essential packages
library(ggplot2)
library(titanic)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Bring the packaged training data frame into the workspace
data(list = "titanic_train")
# Preview the leading six records
head(titanic_train, n = 6L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
# Compact overview of the data frame: each column's type and first values
str(object = titanic_train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Per-column summary: quantiles and mean for numeric columns, length and
# class for character columns
summary(object = titanic_train)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Report the dataset dimensions, one per line. sep = "" stops cat() from
# inserting an extra space after the label, and each call ends with a
# newline so whatever is printed next starts on a fresh line.
cat("Number of Rows: ", nrow(titanic_train), "\n", sep = "")
## Number of Rows: 891
cat("Number of Columns: ", ncol(titanic_train), "\n", sep = "")
## Number of Columns: 12
# Central tendency and spread of passenger age; missing ages are excluded
# from every statistic via na.rm = TRUE.
summarise(
  titanic_train,
  Mean_Age = mean(Age, na.rm = TRUE),
  Median_Age = median(Age, na.rm = TRUE),
  SD_Age = sd(Age, na.rm = TRUE)
)
## Mean_Age Median_Age SD_Age
## 1 29.69912 28 14.5265
# Visual exploration helps identify patterns and relationships among variables.
# Histogram of passenger ages
hist(
  titanic_train[["Age"]],
  col = "lightblue",
  border = "black",
  xlab = "Age",
  main = "Age Distribution of Titanic Passengers"
)
# Side-by-side box plots comparing the age distribution of each sex; the
# two fill colours are applied to the groups in plotting order.
boxplot(
  Age ~ Sex,
  data = titanic_train,
  col = c("pink", "lightblue"),
  ylab = "Age",
  xlab = "Sex",
  main = "Age Distribution by Sex"
)
# Scatter plot of ticket fare against passenger age, drawn with solid
# (pch = 19) dark-green points.
with(
  titanic_train,
  plot(
    Age, Fare,
    pch = 19,
    col = "darkgreen",
    ylab = "Fare",
    xlab = "Age",
    main = "Scatter Plot of Age vs Fare"
  )
)
## 2.4 Pair Plot
# Keep only the numeric columns that feed the scatter-plot matrix
titanic_num <- titanic_train[c("Age", "Fare", "Pclass", "Survived")]
# Point fill encodes the outcome: Survived is coded 0/1, so adding 1 picks
# "red" for 0 and "green3" for 1.
pairs(
  titanic_num,
  pch = 21,
  bg = c("red", "green3")[titanic_train$Survived + 1L],
  main = "Pair Plot of Titanic Numeric Features"
)
## 2.5 Density Plot
# Overlaid kernel density estimates of passenger age, one filled curve per
# survival outcome (Survived is coded 0/1 and converted to a factor so it
# maps to a discrete fill). Age is missing for 177 passengers; na.rm = TRUE
# drops those rows explicitly instead of leaving stat_density() to remove
# them with a warning.
ggplot(titanic_train, aes(x = Age, fill = as.factor(Survived))) +
  geom_density(alpha = 0.6, na.rm = TRUE) +
  labs(
    title = "Density Plot of Passenger Age by Survival Status",
    x = "Age",
    y = "Density",
    fill = "Survived"
  ) +
  theme_minimal()