Discussion thread created by: Samuel Bellows
This UNICEF dataset gives the under-5 mortality for many countries across the years 1950-2015. The problem is that the year variable is spread out over 66 different columns, one for each year, which need to be gathered into a single column. To make this dataset tidy, we gather the year columns into one column, leaving a three-column dataset of Country name, Year, and Mortality.
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("DT")
library(tidyr)
library(ggplot2)
library(DT)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The data is stored on GitHub and is loaded from GitHub into RStudio using the read.csv() function.
# Load the UNICEF under-5 mortality data straight from GitHub. read.csv()
# already returns a data frame, so no extra data.frame() conversion is needed.
data <- read.csv(
  "https://raw.githubusercontent.com/SubhalaxmiRout002/Data-607-Project-2-Dataset-1/master/unicef-u5mr.csv",
  header = TRUE,
  # spelled out in full: `F` is an ordinary variable that can be reassigned
  stringsAsFactors = FALSE
)

# Display the raw data as a paged, horizontally scrollable table.
datatable(
  data,
  options = list(scrollX = TRUE, paging = TRUE, fixedHeader = TRUE)
)

# The dataset covers the years 1950 to 2015, with each year stored in its own
# column. tidyr is used below to convert these columns into a single Year column.
# Reshape from wide to long: collapse the U5MR.1950 ... U5MR.2015 columns into
# a Year / Mortality pair, dropping country-years that have no recorded value.
# pivot_longer() replaces the superseded gather(); names_prefix removes the
# literal "U5MR." prefix from the year names (the dot is escaped so that it
# cannot match an arbitrary character).
data <- data %>%
  pivot_longer(
    cols = U5MR.1950:U5MR.2015,
    names_to = "Year",
    values_to = "Mortality",
    names_prefix = "^U5MR\\.",
    values_drop_na = TRUE
  ) %>%
  # Highest mortality first. Year breaks ties so the row order is
  # deterministic (year-major, the order in which gather() stacked the rows).
  arrange(desc(Mortality), Year) %>%
  # Keep the result a plain data frame, as it was before the reshape
  # (pivot_longer() may hand back a tibble).
  as.data.frame()

# Display the reshaped data as a paged, horizontally scrollable table.
datatable(
  data,
  options = list(scrollX = TRUE, paging = TRUE, fixedHeader = TRUE)
)

# Mortality based on year:
# Horizontal bar chart of mortality per country, titled for the year 1950.
# NOTE(review): `data1` is not created anywhere in the visible script --
# presumably it is `data` restricted to Year 1950; confirm against the
# missing step.
data1 <- data.frame(data1)

ggplot(
  data = data1,
  # reorder() sorts the bars by mortality; both layers inherit this mapping
  mapping = aes(x = reorder(CountryName, Mortality), y = Mortality)
) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity")
  geom_col(fill = "steelblue") +
  coord_flip() +
  xlab("Country") +
  ylab("#Mortality") +
  ggtitle("Highest to lowest mortality in 1950") +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", color = NA)
  ) +
  # print each value just past the end of its bar
  geom_text(aes(label = Mortality), hjust = -0.20, color = "black", size = 3.5)

# First 10 rows of data2, used for the next chart.
# NOTE(review): `data2` is not defined in the visible script -- presumably the
# 2015 rows sorted by descending mortality; confirm.
data3 <- data.frame(head(data2, 10))
# Horizontal bar chart of the rows held in data3, titled as the top 10
# countries by mortality in 2015. Bars are sorted by mortality and each one
# carries its value as a text label.
ggplot(
  data = data3,
  mapping = aes(x = reorder(CountryName, Mortality), y = Mortality)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  xlab("Country") +
  ylab("#Mortality") +
  ggtitle("Top 10 countries with mortality in 2015") +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", color = NA)
  ) +
  geom_text(aes(label = Mortality), hjust = -0.20, color = "black", size = 3.5)

# The last 10 rows are stored in the next variable.
data4 <- data.frame(tail(data2, 10))

# Horizontal bar chart of the rows held in data4, titled as the bottom 10
# countries by mortality in 2015. Bars are sorted by mortality and each one
# carries its value as a text label.
ggplot(
  data = data4,
  mapping = aes(x = reorder(CountryName, Mortality), y = Mortality)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  xlab("Country") +
  ylab("#Mortality") +
  ggtitle("Bottom 10 countries with mortality in 2015") +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", color = NA)
  ) +
  geom_text(aes(label = Mortality), hjust = -0.20, color = "black", size = 3.5)

# The full data set is copied into a data frame next.
# Average mortality per country over all rows of `data`, rounded to two
# decimals. The summary column is named directly inside summarise(), which
# replaces the former summarise() + mutate() + select() renaming sequence.
data5 <- data %>%
  group_by(CountryName) %>%
  summarise(Avg_mortality = round(mean(Mortality), 2)) %>%
  # Keep the developed countries of interest (the pattern lists 11 names).
  # NOTE(review): grepl() matches substrings, so a country whose name merely
  # contains one of these names would be kept too; use %in% if exact matching
  # is intended.
  filter(grepl('United States of America|Japan|Norway|Switzerland|Australia|Ireland|Germany|Iceland|Singapore|Sweden|Netherlands', CountryName)) %>%
  arrange(desc(Avg_mortality))

# Bar chart of the average mortality of the selected developed countries,
# sorted by average mortality, with the value printed above each bar.
ggplot(
  data = data5,
  mapping = aes(x = reorder(CountryName, Avg_mortality), y = Avg_mortality)
) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity")
  geom_col(fill = "steelblue") +
  xlab("Country") +
  ylab("Average Mortality") +
  ggtitle("Developed countries Average Mortality") +
  geom_text(
    aes(label = Avg_mortality),
    vjust = -0.2, color = "black", size = 3.5
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    # slanted country names so neighbouring axis labels do not overlap
    axis.text.x = element_text(angle = 30, hjust = 1),
    panel.background = element_rect(fill = "white", color = NA)
  )

# store data in data frame
# Box plot of mortality for the United States over all of its rows in `data`,
# with the mean highlighted.
data6 <- data %>%
  filter(CountryName == "United States of America")

ggplot(data6, aes(x = CountryName, y = Mortality)) +
  geom_boxplot() +
  xlab("Country") +
  # mark the mean with a dark-red point ...
  stat_summary(
    fun = mean, colour = "darkred", geom = "point", size = 3,
    show.legend = FALSE
  ) +
  # ... and print its value (one decimal) just above that point.
  # after_stat(y) replaces the deprecated ..y.. notation.
  stat_summary(
    fun = mean, colour = "red", geom = "text", show.legend = FALSE,
    vjust = -0.7, aes(label = round(after_stat(y), digits = 1))
  )

# Console output of a six-number summary. The call that produced it is not
# shown here -- presumably summary(data6$Mortality); confirm.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.500 8.625 13.650 17.467 26.225 37.700
Plots 4.2, 4.4, and 4.5 show that mortality goes down year over year.