This data set describes the passengers of the Titanic. For each passenger it records sex, whether they survived, ticket class, age, the number of siblings/spouses aboard (SibSp), the number of parents/children aboard (Parch), the passenger fare, and the port of embarkation.
This analysis addresses the following questions:
1. How many passengers survived the Titanic, by ticket class?
2. How many passengers survived the Titanic, by sex?
3. How many passengers survived the Titanic, by port of embarkation?
The steps of the analysis are as follows:
library(ggplot2)
library(GGally)
library(ggthemes)
library(ggpubr)
library(lubridate)
library(readr)
library(dplyr)
library(tidyr)
library(glue)
library(plotly)
library(tidyverse)
# Load the Titanic training data.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0, read.csv() keeps
# text columns as character by default, while the str()/summary()/levels()
# steps that follow expect Name, Sex, Ticket, Cabin and Embarked to be factors.
# NOTE(review): absolute, machine-specific path -- adjust to where train.csv
# lives on your machine.
train <- read.csv(
  "D:/Meinari/algoritma/Tugas/titanic/train.csv",
  stringsAsFactors = TRUE
)
# Per-column summary: quartiles/mean for numeric columns, level counts for
# factor columns, and the number of NA values where present.
summary(train)
## PassengerId Survived Pclass
## Min. : 1.0 Min. :0.0000 Min. :1.000
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
## Median :446.0 Median :0.0000 Median :3.000
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Name Sex Age
## Abbing, Mr. Anthony : 1 female:314 Min. : 0.42
## Abbott, Mr. Rossmore Edward : 1 male :577 1st Qu.:20.12
## Abbott, Mrs. Stanton (Rosa Hunt) : 1 Median :28.00
## Abelson, Mr. Samuel : 1 Mean :29.70
## Abelson, Mrs. Samuel (Hannah Wizosky): 1 3rd Qu.:38.00
## Adahl, Mr. Mauritz Nils Martin : 1 Max. :80.00
## (Other) :885 NA's :177
## SibSp Parch Ticket Fare
## Min. :0.000 Min. :0.0000 1601 : 7 Min. : 0.00
## 1st Qu.:0.000 1st Qu.:0.0000 347082 : 7 1st Qu.: 7.91
## Median :0.000 Median :0.0000 CA. 2343: 7 Median : 14.45
## Mean :0.523 Mean :0.3816 3101295 : 6 Mean : 32.20
## 3rd Qu.:1.000 3rd Qu.:0.0000 347088 : 6 3rd Qu.: 31.00
## Max. :8.000 Max. :6.0000 CA 2144 : 6 Max. :512.33
## (Other) :852
## Cabin Embarked
## :687 : 2
## B96 B98 : 4 C:168
## C23 C25 C27: 4 Q: 77
## G6 : 4 S:644
## C22 C26 : 3
## D : 3
## (Other) :186
# Show the dimensions of the data frame and the type of every column.
str(train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Count missing (NA) values in each column. Blank strings, such as the ""
# level seen in Cabin and Embarked, are not NA and are therefore not counted.
colSums(is.na(train))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
# Preview the first rows of the data.
head(train)
# List the distinct ticket classes present in the data.
unique(train$Pclass)
## [1] 3 1 2
# List the distinct Age values; NA shows up as one of them because some ages
# are missing.
unique(train$Age)
## [1] 22.00 38.00 26.00 35.00 NA 54.00 2.00 27.00 14.00 4.00 58.00 20.00
## [13] 39.00 55.00 31.00 34.00 15.00 28.00 8.00 19.00 40.00 66.00 42.00 21.00
## [25] 18.00 3.00 7.00 49.00 29.00 65.00 28.50 5.00 11.00 45.00 17.00 32.00
## [37] 16.00 25.00 0.83 30.00 33.00 23.00 24.00 46.00 59.00 71.00 37.00 47.00
## [49] 14.50 70.50 32.50 12.00 9.00 36.50 51.00 55.50 40.50 44.00 1.00 61.00
## [61] 56.00 50.00 36.00 45.50 20.50 62.00 41.00 52.00 63.00 23.50 0.92 43.00
## [73] 60.00 10.00 64.00 13.00 48.00 0.75 53.00 57.00 80.00 70.00 24.50 6.00
## [85] 0.67 30.50 0.42 34.50 74.00
# List the distinct Cabin values; blank entries appear as the "" level.
unique(train$Cabin)
## [1] C85 C123 E46
## [5] G6 C103 D56 A6
## [9] C23 C25 C27 B78 D33 B30
## [13] C52 B28 C83 F33
## [17] F G73 E31 A5 D10 D12
## [21] D26 C110 B58 B60 E101
## [25] F E69 D47 B86 F2
## [29] C2 E33 B19 A7
## [33] C49 F4 A32 B4
## [37] B80 A31 D36 D15
## [41] C93 C78 D35 C87
## [45] B77 E67 B94 C125
## [49] C99 C118 D7 A19
## [53] B49 D C22 C26 C106
## [57] C65 E36 C54 B57 B59 B63 B66
## [61] C7 E34 C32 B18
## [65] C124 C91 E40 T
## [69] C128 D37 B35 E50
## [73] C82 B96 B98 E10 E44
## [77] A34 C104 C111 C92
## [81] E38 D21 E12 E63
## [85] A14 B37 C30 D20
## [89] B79 E25 D46 B73
## [93] C95 B38 B39 B22
## [97] C86 C70 A16 C101
## [101] C68 A10 E68 B41
## [105] A20 D19 D50 D9
## [109] A23 B50 A26 D48
## [113] E58 C126 B71 B51 B53 B55
## [117] D49 B5 B20 F G63
## [121] C62 C64 E24 C90 C45
## [125] E8 B101 D45 C46
## [129] D30 E121 D11 E77
## [133] F38 B3 D6 B82 B84
## [137] D17 A36 B102 B69
## [141] E49 C47 D28 E17
## [145] A24 C50 B42 C148
## 148 Levels: A10 A14 A16 A19 A20 A23 A24 A26 A31 A32 A34 A36 A5 A6 A7 ... T
# Number of passengers in each ticket class.
table(train$Pclass)
##
## 1 2 3
## 216 184 491
# Inspect the Embarked levels before relabelling them; besides the port codes
# there is a blank "" level.
levels(train$Embarked)
## [1] "" "C" "Q" "S"
# Relabel the coded columns with readable names:
#   Embarked: "" / C / Q / S  ->  other / Cherbourg / Queenstown / Southampton
#   Survived: 0 / 1           ->  Died / Survived
# Both results are stored as factors.
train2 <- train %>%
  mutate(
    Embarked = as.factor(
      case_when(
        Embarked == "S" ~ "Southampton",
        Embarked == "Q" ~ "Queenstown",
        Embarked == "C" ~ "Cherbourg",
        Embarked == "" ~ "other"
      )
    ),
    Survived = as.factor(
      case_when(
        Survived == 0 ~ "Died",
        Survived == 1 ~ "Survived"
      )
    )
  )
# Passenger counts for every ticket class / outcome pair, plus the label shown
# in the interactive tooltip.
train1 <- train2 %>%
  group_by(Pclass, Survived) %>%
  summarise(total = n()) %>%
  ungroup() %>%
  mutate(text = glue("Number of Survival/Death = {total}"))

# Side-by-side bars (Died vs Survived) for each class. reorder() arranges the
# classes on the x axis by their count rather than by class number.
plot1 <- ggplot(
  data = train1,
  mapping = aes(x = reorder(Pclass, total), y = total, text = text)
) +
  geom_col(mapping = aes(fill = Survived), position = position_dodge()) +
  theme() + # no arguments: keeps the default theme
  labs(
    title = "Titanic's Passenger Survived based on their class",
    x = "Class",
    y = "Number of Survival",
    caption = "Made by Meinari Claudia"
  )

# Convert to an interactive plotly chart; hovering shows only the `text` label.
ggplotly(plot1, tooltip = "text")
Conclusion 1: By ticket class, third-class passengers had the highest number of deaths (372 people).
# Passenger counts for every sex / outcome pair, plus the label shown in the
# interactive tooltip.
train3 <- train2 %>%
  group_by(Sex, Survived) %>%
  summarise(total = n()) %>%
  ungroup() %>%
  mutate(text = glue("Number of Survival/Death = {total}"))

# Side-by-side bars (Died vs Survived) for each sex. reorder() arranges the
# x-axis categories by their count.
plot2 <- ggplot(
  data = train3,
  mapping = aes(x = reorder(Sex, total), y = total, text = text)
) +
  geom_col(mapping = aes(fill = Survived), position = position_dodge()) +
  theme() + # no arguments: keeps the default theme
  labs(
    title = "Titanic's Passenger Survived based on Sex",
    x = "Sex",
    y = "Number of Survival",
    caption = "Made by Meinari Claudia"
  )

# Convert to an interactive plotly chart; hovering shows only the `text` label.
ggplotly(plot2, tooltip = "text")
Conclusion 2: By sex, far more men died (468) than women (81). The survival rate was 19% for men, compared with 74% for women.
# Passenger counts for every port / outcome pair, plus the label shown in the
# interactive tooltip.
train4 <- train2 %>%
  group_by(Embarked, Survived) %>%
  summarise(total = n()) %>%
  ungroup() %>%
  mutate(text = glue("Number of Survival/Death = {total}"))

# Side-by-side bars (Died vs Survived) for each port of embarkation. reorder()
# arranges the ports on the x axis by their count.
plot3 <- ggplot(
  data = train4,
  mapping = aes(x = reorder(Embarked, total), y = total, text = text)
) +
  geom_col(mapping = aes(fill = Survived), position = position_dodge()) +
  theme() + # no arguments: keeps the default theme
  labs(
    title = "Titanic's Passenger Survived based on Port Embarked",
    x = "Port Embarked",
    y = "Number of Survival",
    caption = "Made by Meinari Claudia"
  )

# Convert to an interactive plotly chart; hovering shows only the `text` label.
ggplotly(plot3, tooltip = "text")
Conclusion 3: By port of embarkation, most Titanic passengers boarded at Southampton, followed by Cherbourg and Queenstown. Southampton passengers also had the highest death rate (66%).