library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
# Load the Titanic dataset from the datasciencedojo/datasets GitHub repository.
titanic <- read.csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
# View dataset structure
str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Clean the data for plotting:
#   1. drop the identifier and free-text columns (not used by any chart below),
#   2. turn blank strings in the remaining character columns into NA,
#   3. keep complete cases only.
# Blanks are replaced column-by-column on character columns only, rather than
# by string-comparing every cell of the data frame.
# NOTE: step 3 removes every row that has any missing value (e.g. a missing
# Age), so all charts below are drawn from the complete-case subset.
titanic <- titanic %>%
  select(-PassengerId, -Name, -Ticket, -Cabin) %>%
  mutate(across(where(is.character), \(col) na_if(col, ""))) %>%
  na.omit()
# 1. Bar charts
# Passenger counts by gender.
titanic %>%
  ggplot(mapping = aes(x = Sex)) +
  geom_bar(fill = "pink") +
  ggtitle("Pass dist. by Gender") +
  xlab("Gender") +
  ylab("Nmb of Pass.")

# Passenger counts by class, one bar per survival outcome (side by side).
titanic %>%
  ggplot(mapping = aes(x = factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = position_dodge()) +
  ggtitle("Survl by Pass' class.") +
  labs(x = "Class", y = "Num of Pass.", fill = "Survived")

# Passenger counts by gender, one bar per survival outcome (side by side).
titanic %>%
  ggplot(mapping = aes(x = Sex, fill = factor(Survived))) +
  geom_bar(position = position_dodge()) +
  ggtitle("Survive by Gender/Sex") +
  labs(x = "Gender", y = "Num of Pass.", fill = "Survived")

# 2. Box plots
# Fare distribution for each value of Survived.
# The fill legend is named explicitly, as in the bar and scatter plots;
# without it the legend is titled with the raw expression "factor(Survived)".
ggplot(titanic, aes(x = factor(Survived), y = Fare, fill = factor(Survived))) +
  geom_boxplot() +
  scale_fill_manual(values = c("gray", "blue")) +
  labs(title = "FD by Survival", x = "Survived", y = "Fare", fill = "Survived") +
  theme_minimal()

# Age distribution for each passenger class.
# Same legend fix: title the fill legend "Class" instead of "factor(Pclass)".
ggplot(titanic, aes(x = factor(Pclass), y = Age, fill = factor(Pclass))) +
  geom_boxplot() +
  scale_fill_manual(values = c("blue", "gray", "green")) +
  labs(title = "AD by Class", x = "Class", y = "Age", fill = "Class") +
  theme_minimal()

# 3. Scatter plots
# Age vs. fare, coloured by Survived, with one linear fit per group.
# The colour mapping also defines the grouping, so geom_smooth() fits a
# separate line for each group. The lines therefore inherit the group colour
# (instead of all being forced to black) so each one can be matched to the
# legend. The formula is passed explicitly, which also stops geom_smooth()
# from printing its "using formula = 'y ~ x'" message.
ggplot(titanic, aes(x = Age, y = Fare, color = factor(Survived))) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_color_manual(values = c("gray", "blue")) +
  labs(title = "Age vs Fare || Survival", x = "Age", y = "Fare", color = "Survived") +
  theme_minimal()

# Age vs. fare, coloured by passenger class, with one linear fit per class
# (lines coloured to match their class, as above).
ggplot(titanic, aes(x = Age, y = Fare, color = factor(Pclass))) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_color_manual(values = c("blue", "gray", "green")) +
  labs(title = "Age vs Fare || Class", x = "Age", y = "Fare", color = "Class") +
  theme_minimal()
