Project 2 (Global Child Mortality Rates data)

This is a dataset from Unicef with mortality rates under 5 years across different countries from 1950 to 2015. Data Reference: https://raw.githubusercontent.com/baruab/msdsrepo/main/DATA-607/unicef-u5mr.csv

Let’s load the required libraries in R for data analysis
library(dplyr)
library(downloader)
library(stringr)
library(htmlTable)
library(tidyverse)
library(reshape2)


Download and read the CSV file to a dataframe object
View the data
# Import the data from the csv file
unicef_data <- read.csv("https://raw.githubusercontent.com/baruab/msdsrepo/main/DATA-607/unicef-u5mr.csv", stringsAsFactors = FALSE)

head(unicef_data, 5) 
##   CountryName U5MR.1950 U5MR.1951 U5MR.1952 U5MR.1953 U5MR.1954 U5MR.1955
## 1 Afghanistan        NA        NA        NA        NA        NA        NA
## 2     Albania        NA        NA        NA        NA        NA        NA
## 3     Algeria        NA        NA        NA        NA       251     249.9
## 4     Andorra        NA        NA        NA        NA        NA        NA
## 5      Angola        NA        NA        NA        NA        NA        NA
##   U5MR.1956 U5MR.1957 U5MR.1958 U5MR.1959 U5MR.1960 U5MR.1961 U5MR.1962
## 1        NA        NA        NA        NA        NA     356.5     350.6
## 2        NA        NA        NA        NA        NA        NA        NA
## 3       249       248     247.5     246.7     246.3     246.1     246.2
## 4        NA        NA        NA        NA        NA        NA        NA
## 5        NA        NA        NA        NA        NA        NA        NA
##   U5MR.1963 U5MR.1964 U5MR.1965 U5MR.1966 U5MR.1967 U5MR.1968 U5MR.1969
## 1     345.0     339.7     334.1     328.7     323.3     318.1     313.0
## 2        NA        NA        NA        NA        NA        NA        NA
## 3     246.8     247.4     248.2     248.7     248.4     247.4     245.3
## 4        NA        NA        NA        NA        NA        NA        NA
## 5        NA        NA        NA        NA        NA        NA        NA
##   U5MR.1970 U5MR.1971 U5MR.1972 U5MR.1973 U5MR.1974 U5MR.1975 U5MR.1976
## 1     307.8     302.1     296.4     290.8     284.9     279.4     273.6
## 2        NA        NA        NA        NA        NA        NA        NA
## 3     241.7     236.5     230.0     222.5     214.2     205.0     195.2
## 4        NA        NA        NA        NA        NA        NA        NA
## 5        NA        NA        NA        NA        NA        NA        NA
##   U5MR.1977 U5MR.1978 U5MR.1979 U5MR.1980 U5MR.1981 U5MR.1982 U5MR.1983
## 1     267.8     261.6     255.5     249.1     242.7     236.2     229.7
## 2        NA      91.1      84.7      78.6      73.0      67.8      62.8
## 3     184.9     173.8     161.8     148.1     132.5     115.8      99.2
## 4        NA        NA        NA        NA        NA        NA        NA
## 5        NA        NA        NA     234.1     232.8     231.5     230.2
##   U5MR.1984 U5MR.1985 U5MR.1986 U5MR.1987 U5MR.1988 U5MR.1989 U5MR.1990
## 1     222.9     216.0     209.2     202.1     195.0     187.8     181.0
## 2      58.3      54.3      50.7      47.6      44.9      42.5      40.6
## 3      83.8      71.2      61.9      55.4      51.2      48.5      46.8
## 4        NA        NA        NA        NA        NA        NA       8.5
## 5     229.1     228.3     227.5     226.9     226.5     226.2     226.0
##   U5MR.1991 U5MR.1992 U5MR.1993 U5MR.1994 U5MR.1995 U5MR.1996 U5MR.1997
## 1     174.2     167.8     162.0     156.8     152.3     148.6     145.5
## 2      38.8      37.3      36.0      34.6      33.2      31.8      30.3
## 3      45.7      44.9      44.1      43.3      42.5      41.8      41.1
## 4       7.9       7.4       6.9       6.4       6.0       5.7       5.3
## 5     225.9     226.0     225.8     225.5     224.8     224.0     222.6
##   U5MR.1998 U5MR.1999 U5MR.2000 U5MR.2001 U5MR.2002 U5MR.2003 U5MR.2004
## 1     142.6     139.9     137.0     133.8     130.3     126.8     123.2
## 2      28.9      27.5      26.2      24.9      23.6      22.5      21.5
## 3      40.6      40.2      39.7      38.9      37.8      36.5      35.1
## 4       5.0       4.8       4.6       4.4       4.2       4.1       4.0
## 5     220.8     218.9     216.7     214.1     211.7     209.2     206.7
##   U5MR.2005 U5MR.2006 U5MR.2007 U5MR.2008 U5MR.2009 U5MR.2010 U5MR.2011
## 1     119.6     116.3     113.2     110.4     107.6     105.0     102.3
## 2      20.5      19.5      18.7      17.9      17.3      16.6      16.0
## 3      33.6      32.1      30.7      29.4      28.3      27.3      26.6
## 4       3.9       3.7       3.6       3.5       3.4       3.3       3.2
## 5     203.9     200.5     196.4     192.0     187.3     182.5     177.3
##   U5MR.2012 U5MR.2013 U5MR.2014 U5MR.2015
## 1      99.5      96.7      93.9      91.1
## 2      15.5      14.9      14.4      14.0
## 3      26.1      25.8      25.6      25.5
## 4       3.1       3.0       2.9       2.8
## 5     172.2     167.1     162.2     156.9
Clean up the column names
names(unicef_data) <- gsub("U5MR.", "", names(unicef_data))
Remove the first column of the data frame
subset_unicef_data <- select( unicef_data, -1)

# Assign row averages to new variable named row_mean
subset_unicef_data$row_mean <- round(rowMeans(subset_unicef_data, na.rm=TRUE),2)

# Add the first column back to the dataframe 
bind_unicef_data <- cbind(select(unicef_data, 1), subset_unicef_data)

# Sort the dataframe by the mean of the mortality rate 
highest_child_mortality <-  arrange(bind_unicef_data, desc(row_mean)) %>% head(5) 

# Display the countries with highest rates
htmlTable( select( highest_child_mortality, c('CountryName', 'row_mean')))
CountryName row_mean
1 Mali 273.7
2 Sierra Leone 267.95
3 Niger 262.15
4 Burkina Faso 247.16
5 Guinea 235.47

Lets do some analysis on the top 5 countries, to find any relationship.

Subset last 5 yrs from the dataset for the top 5 countries from (2011 - 2015)

Transform the dataset from wide format to long format

subset_child_mortality <-  select( highest_child_mortality, c('CountryName', '2011','2012','2013', '2014','2015'))

subset_child_mortality1 <- subset_child_mortality %>%
    gather("year", "cases", "2011":"2012":"2013":"2014":"2015")

# Plot the country line graph by the year to identify trend 
ggplot(subset_child_mortality1, aes(CountryName,cases, group=year)) + 
   geom_line(aes(color=year))+
 geom_point(aes(shape=year))

The mortality rates consistently decreased from 2011 - 2015 for all the countries.


For the last 5 years, find mean, standard deviation, variance of the cases

mean_child_mortality <- subset_child_mortality1 %>% group_by(CountryName) %>%  summarise(avg_deaths = mean(cases, na.rm=TRUE)) 

sd_child_mortality <- subset_child_mortality1 %>% group_by(CountryName) %>%  summarise(sd_deaths = sd(cases, na.rm=TRUE))

min_child_mortality <- subset_child_mortality1 %>% group_by(CountryName) %>%  summarise(min_deaths = min(cases, na.rm=TRUE))

max_child_mortality <- subset_child_mortality1 %>% group_by(CountryName) %>%  summarise(max_deaths = max(cases, na.rm=TRUE))

variance_child_mortality <- subset_child_mortality1 %>% group_by(CountryName) %>%  summarise(variance_deaths = var(cases, na.rm=TRUE))

child_mortality_stats <- mean_child_mortality %>% inner_join(sd_child_mortality, by = 'CountryName') %>% inner_join(min_child_mortality, by = 'CountryName') %>% inner_join(max_child_mortality, by = 'CountryName') %>% inner_join(variance_child_mortality, by = 'CountryName') 

arrange(child_mortality_stats, desc(avg_deaths)) 
## # A tibble: 5 x 6
##   CountryName  avg_deaths sd_deaths min_deaths max_deaths variance_deaths
##   <chr>             <dbl>     <dbl>      <dbl>      <dbl>           <dbl>
## 1 Sierra Leone      134.      12.0       120.        151.           144. 
## 2 Mali              123.       6.76      115.        132.            45.7
## 3 Niger             105.       8.13       95.5       116.            66.1
## 4 Guinea            101.       5.54       93.7       108.            30.6
## 5 Burkina Faso       97.2      7.23       88.6       107.            52.3

The trend of different statistical values is consistent across countries.

child_mortality_stats1 <- child_mortality_stats 

library(reshape2)
#melt data frame into long format
df <- melt(child_mortality_stats1 ,  id.vars = 'CountryName')

# Everything on the same plot
ggplot(df, aes(CountryName,value, col=variable)) + 
  geom_line(aes(color=variable))+
  geom_point()  

Load another dataset with Continent/Country mapping data

# Import the data from the csv file
continent_data <- read.csv("https://pkgstore.datahub.io/JohnSnowLabs/country-and-continent-codes-list/country-and-continent-codes-list-csv_csv/data/b7876b7f496677669644f3d1069d3121/country-and-continent-codes-list-csv_csv.csv", stringsAsFactors = FALSE)

head(continent_data, 5)
##   Continent_Name Continent_Code                                 Country_Name
## 1           Asia             AS             Afghanistan, Islamic Republic of
## 2         Europe             EU                         Albania, Republic of
## 3     Antarctica             AN Antarctica (the territory South of 60 deg S)
## 4         Africa             AF     Algeria, People's Democratic Republic of
## 5        Oceania             OC                               American Samoa
##   Two_Letter_Country_Code Three_Letter_Country_Code Country_Number
## 1                      AF                       AFG              4
## 2                      AL                       ALB              8
## 3                      AQ                       ATA             10
## 4                      DZ                       DZA             12
## 5                      AS                       ASM             16
continent_data$Country_Name <- gsub(",.*$", "", continent_data$Country_Name)

continent_data = continent_data %>% dplyr::mutate(CountryName = gsub(",.*$", "", continent_data$Country_Name))

Join the two datasets

Group the data by Continent, summarize and sort them by their cumulative totals

joined_df <- left_join( continent_data,bind_unicef_data, by = "CountryName")

sum_cont_df <- joined_df  %>% select( Continent_Name, Continent_Code, row_mean) %>% group_by( Continent_Code)  %>% mutate(sum_mean = sum(row_mean, na.rm=TRUE)) %>% summarize(sum_mean = max(sum_mean)) %>% arrange(desc(sum_mean))


htmlTable(sum_cont_df)
Continent_Code sum_mean
1 AF 7965.11
2 AS 3363.82
3 1093.4
4 EU 1046.75
5 SA 816.34
6 OC 750.77
7 AN 0

Join the top 5 counties dataset with the continent dataset

top_cont_country_df <- inner_join( continent_data,subset_child_mortality, by = "CountryName")
top_cont_country_df <- top_cont_country_df  %>% select( Continent_Name, Continent_Code, CountryName) 
htmlTable(top_cont_country_df)
Continent_Name Continent_Code CountryName
1 Africa AF Guinea
2 Africa AF Mali
3 Africa AF Niger
4 Africa AF Sierra Leone
5 Africa AF Burkina Faso

This is to concluded the Africa has the highest cases of child mortality, with all the countries with high cases are in Africa.