# Load the Titanic test split; text columns stay character, not factor.
titanic <- read.csv("test.csv", stringsAsFactors = FALSE)

# Keep complete cases only: any row with an NA in any column is dropped.
df_clean <- na.omit(titanic)

# Plain-text table of descriptive statistics for the cleaned data.
stargazer(
  df_clean,
  type = "text",
  summary.stat = c("min", "median", "mean", "max", "sd")
)
##
## ===================================================
## Statistic Min Median Mean Max St. Dev.
## ---------------------------------------------------
## PassengerId 892 1,100 1,100.233 1,307 122.910
## Pclass 1 2 2.142 3 0.846
## Age 0.170 27.000 30.181 76.000 14.105
## SibSp 0 0 0.483 8 0.875
## Parch 0 0 0.399 6 0.812
## Fare 0.000 16.000 40.982 512.329 61.229
## ---------------------------------------------------
One key observation is that passengers in the test set are relatively young: the mean age is about 30 and the median is 27. Most passengers fall within the 20–40 age range.
# Fare distribution across age bands.
# Age is continuous (fractional values such as 0.17 occur), so factor(Age)
# would draw one box per distinct age, crowding the axis and leaving many
# boxes with very few passengers. Binning into 10-year bands starting at 0
# keeps the axis readable and gives each box more observations.
ggplot(
  df_clean,
  aes(x = cut_width(Age, width = 10, boundary = 0), y = Fare)
) +
  geom_boxplot() +
  labs(x = "Age", y = "Fare", title = "Fare Distribution by Age") +
  theme(
    # Extra space between the band labels and the axis title. The 90-degree
    # label rotation is no longer needed with only a handful of bands.
    axis.title.x = element_text(margin = margin(t = 10))
  )
# Histogram of passenger ages in the cleaned data, using 5-unit-wide bins.
ggplot(data = df_clean, mapping = aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    fill = "skyblue",
    color = "black"
  ) +
  ggtitle("Age Distribution") +
  xlab("Age") +
  ylab("Count")