Importing/Cleaning Data
# Load the Titanic training data.
# The script deliberately does not clear the global environment first: wiping
# the workspace destroys the caller's objects when the file is sourced and
# still does not give a clean session (attached packages and options persist).
# Restart R to get a fresh state instead.
# NOTE(review): the path points at one user's Downloads folder -- confirm the
# file location before running on another machine.
train <- read.csv("~/Downloads/train.csv")

# Count the missing values in each column to see what needs cleaning.
colSums(is.na(train))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
# Keep only the rows that have no missing value in any column.
train_clean <- stats::na.omit(train)
Summary Statistics
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Print a summary-statistics table (N, mean, standard deviation, min, max) for
# the cleaned data and also write it to "table1.txt".
# No interactive help lookup here: `?stargazer` only opens a help page and has
# no place in a script that is sourced or knitted.
stargazer(
  train_clean,
  type = "text",
  title = "Titanic",
  digits = 2,
  out = "table1.txt"
)
##
## Titanic
## ===========================================
## Statistic N Mean St. Dev. Min Max
## -------------------------------------------
## PassengerId 714 448.58 259.12 1 891
## Survived 714 0.41 0.49 0 1
## Pclass 714 2.24 0.84 1 3
## Age 714 29.70 14.53 0.42 80.00
## SibSp 714 0.51 0.93 0 5
## Parch 714 0.43 0.85 0 6
## Fare 714 34.69 52.92 0.00 512.33
## -------------------------------------------
Boxplot
# Box plot of survival (0/1) split by passenger class.
# The formula groups Survived by Pclass so the plot matches its title and axis
# labels; plotting train_clean$Survived on its own never involves Pclass.
# horizontal = TRUE puts the Survived values on the x-axis and one box per
# Pclass level on the y-axis, which is what xlab/ylab describe.
boxplot(
  Survived ~ Pclass,
  data = train_clean,
  main = "Survived Based on Pclass",
  xlab = "Survived",
  ylab = "Pclass",
  border = "blue",
  col = "red",
  horizontal = TRUE
)

Histogram
# Histogram of ticket fares.
# The x-axis carries the fare and the y-axis the number of passengers in each
# bin, so the y label is "Frequency"; the currency unit belongs on the x-axis.
# Fare is kept as a variable so hist() titles the plot "Histogram of Fare".
Fare <- train_clean$Fare
hist(Fare, xlab = "Fare (pounds)", ylab = "Frequency", col = "blue")

Density Plot
library(ggplot2)
library(readxl)
# Density plot of ticket fares drawn with ggplot2 (attached above).
# geom_density() computes the density estimate itself, so no separate
# density() result is needed, and ggplot2 does not need to be attached twice.
ggplot(train_clean, aes(x = Fare)) +
  geom_density(fill = "skyblue", alpha = 0.7) +
  labs(
    title = "Titanic Fare",
    x = "Fare",
    y = "Density"
  )

Key Takeaways
- Most tickets cost less than 100 pounds, which is equivalent to about
15k USD today.