This is the Titanic dataset, which I chose from Kaggle. The data set has the columns below.
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.3
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# location of the Titanic CSV file in the Git repository
url <- "https://raw.githubusercontent.com/SubhalaxmiRout002/tidyverse/master/titanic.csv"
# load the CSV into a data frame, keeping text columns as character vectors
titanic_data <- read.csv(file = url, stringsAsFactors = FALSE)
# preview the first 6 rows of the data
head(x = titanic_data, n = 6)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
# clean the data in a single pipeline, step by step:
#   1. drop the columns that are not needed for the analysis
#   2. keep only the rows where Name is not NA
#   3. remove duplicate rows, if any are present
#   4. rename Pclass to Class_Type
titanic_data <- titanic_data %>%
  select(
    -SibSp,
    -Ticket,
    -Fare,
    -Cabin,
    -Embarked,
    -Parch
  ) %>%
  filter(!is.na(Name)) %>%
  unique() %>%
  rename(Class_Type = Pclass)
# preview the first 5 rows of the cleaned data
head(titanic_data, n = 5)
## PassengerId Survived Class_Type
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## Name Sex Age
## 1 Braund, Mr. Owen Harris male 22
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38
## 3 Heikkinen, Miss. Laina female 26
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35
## 5 Allen, Mr. William Henry male 35
# number of survivors grouped by sex
# (ungroup() drops the grouping left behind by group_by() so it cannot
# leak into later operations on survivor_sex)
survivor_sex <- titanic_data %>%
  filter(Survived == 1) %>%
  group_by(Sex) %>%
  count(Survived) %>%
  ungroup()
# draw a bar chart of the survivor count per sex, with the count printed
# inside each bar; columns are referenced by bare name inside aes()
# rather than with survivor_sex$..., which ggplot2 discourages
ggplot(survivor_sex, aes(x = Sex, y = n)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = n), vjust = 1.6, color = "white", size = 4.5) +
  xlab("Sex") +
  ylab("Number of survivor(#)") +
  ggtitle("Number of survivors by sex") +
  theme(plot.title = element_text(hjust = 0.5))
# number of survivor group by class type
# count the survivors for every Class_Type / Sex combination
# (ungroup() drops the grouping left behind by group_by())
survivor_class <- titanic_data %>%
  filter(Survived == 1) %>%
  group_by(Class_Type, Sex) %>%
  count(Survived) %>%
  ungroup()
# draw a stacked bar chart: one bar per class, split by sex.
# Columns are referenced by bare name inside aes() rather than with
# survivor_class$..., which ggplot2 discourages.
# The labels use position_stack() so each count is centred inside its own
# segment; without it the text is placed at the raw value of n, which is
# not where the stacked segment is drawn.
ggplot(survivor_class, aes(x = Class_Type, y = n, fill = Sex)) +
  geom_col(position = position_stack()) +
  geom_text(
    aes(label = n),
    position = position_stack(vjust = 0.5),
    color = "white",
    size = 4.5
  ) +
  xlab("Class Type") +
  ylab("Number of survivor(#)") +
  ggtitle("Number of survivors by Class") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(values = c("#1f5f76", "#92acb8")) +
  labs(fill = "Sex")
From Plot 4.1 and 4.2 we found:
I have used the select(), filter(), mutate(), and rename() functions of the tidyverse package to clean and manipulate the data.