# Read the data set and assign it to the variable `Titanic`.
# stringsAsFactors is spelled out because its default changed to FALSE in
# R 4.0.0; the structure summary further down lists sex, embarked and deck as
# factors, so the text columns are expected to be loaded as factors.
Titanic <- read.csv("kapal_titanic.csv", stringsAsFactors = TRUE)
# Print the loaded data frame.
Titanic
pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower
age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
sibsp: The dataset defines family relations in this way: Sibling = brother, sister, stepbrother, stepsister; Spouse = husband, wife (mistresses and fiancés were ignored).
parch: The dataset defines family relations in this way: Parent = mother, father; Child = daughter, son, stepdaughter, stepson. Some children travelled only with a nanny, therefore parch=0 for them.
Variable | Definition | Key
survival | Survival | 0 = No, 1 = Yes
pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd
embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton
## [1] 891 9
## 'data.frame': 891 obs. of 9 variables:
## $ survived: int 0 1 1 1 0 0 0 0 1 1 ...
## $ pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ sibsp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## $ deck : Factor w/ 8 levels "","A","B","C",..: 1 4 1 4 1 1 6 1 1 1 ...
# Change data types: store the survival flag and the ticket class as factors.
Titanic <- mutate(
  Titanic,
  survived = as.factor(survived),
  pclass = as.factor(pclass)
)
## survived pclass sex age sibsp parch fare embarked
## 0 0 0 177 0 0 0 0
## deck
## 0
Karena yang memiliki nilai NA hanya baris pada kolom age, maka tidak dilakukan drop missing value karena hal itu dapat menghilangkan informasi pada kolom yang lain.
# Bar chart of passenger counts per survival outcome; every bar carries a
# label showing its count. Rows with a missing `survived` value are excluded.
p1 <- ggplot(
  Titanic[!is.na(Titanic$survived), ],
  aes(x = survived, fill = survived)
) +
  geom_bar(stat = "count") +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_label(stat = "count", aes(label = after_stat(count)), size = 7) +
  labs(x = "How many people died and survived on the Titanic?") +
  theme_grey()
p1
Terlihat pada plot diatas bahwa penumpang kapal yang berhasil selamat sebanyak 342 orang dan yang tidak selamat 549
# Bar chart of passenger counts per sex; every bar carries a label showing
# its count, and the two sexes get fixed fill colours.
s1 <- ggplot(Titanic, aes(x = sex, fill = sex)) +
  geom_bar(stat = "count", position = "dodge") +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_label(stat = "count", aes(label = after_stat(count))) +
  # The legend title is passed as `name =` instead of an unnamed argument, so
  # it does not depend on how the scale forwards positional arguments.
  scale_fill_manual(
    name = "legend",
    values = c("female" = "chocolate", "male" = "green")
  ) +
  labs(x = "Number of Humans by Sex") +
  theme_grey()
# Dodged bar chart of survival outcome within each sex. Rows with a missing
# `survived` value are excluded.
s2 <- ggplot(
  Titanic[!is.na(Titanic$survived), ],
  aes(x = sex, fill = survived)
) +
  geom_bar(stat = "count", position = "dodge") +
  # The labels are dodged by the default bar width (0.9) so that each count
  # sits on its own bar instead of both labels sharing the same x position.
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_label(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(width = 0.9)
  ) +
  labs(x = "Survived or Not Based on Sex") +
  theme_grey()
# Draw the two sex-based charts next to each other in a single row.
grid.arrange(s1, s2, nrow = 1)
dari plot diatas terlihat bahwa penumpang perempuan lebih banyak yang selamat ketimbang penumpang pria.
# Stacked bar chart of survival outcome per sex, drawn as one panel per
# ticket class.
pcls <- ggplot(data = Titanic, mapping = aes(x = sex)) +
  geom_bar(mapping = aes(fill = survived)) +
  # Facet by column name so the panels are derived from the plot's own data
  # rather than from a vector looked up outside of it.
  facet_wrap(~pclass) +
  scale_y_continuous(breaks = c(0, 50, 100, 200, 300, 400, 500)) +
  ggtitle("Pclass Survived Distribution Based on pclass") +
  # The x aesthetic is sex (pclass is the facet), so the axis is labelled
  # after the variable it actually shows.
  xlab("Sex") +
  ylab("Number of Customers") +
  theme_linedraw()
# Hand the ggplot object to plotly for rendering.
ggplotly(pcls)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Terlihat bahwa penumpang yang memesan tiket di kelas 1 lebih banyak yang selamat dan lebih sedikit yang tidak selamat dibanding 2 kelas yang lain.
# Overlaid age density curves, one per survival outcome, built only from rows
# where neither `survived` nor `age` is missing.
sd1 <- ggplot(
  Titanic[!(is.na(Titanic$survived) | is.na(Titanic$age)), ],
  aes(x = age, fill = survived)
) +
  geom_density(aes(fill = factor(survived)), alpha = 0.5) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  labs(title = "Survival density and Age") +
  theme_grey()
sd1
Dari plot diatas kebanyakan penumpang yang tidak selamat ada di rentang usia antara 20 - 40 tahun dan yang selamat ada di rentang usia sekitar <=10 - 40 tahun.