Gini Values Worldwide

According to Wikipedia the Gini Coefficient is “a measure of statistical dispersion intended to represent the income or wealth distribution of a nation’s residents, and is the most commonly used measure of inequality.” [https://en.wikipedia.org/wiki/Gini_coefficient]. The data was collected from World Bank’s website at [http://databank.worldbank.org/data/reports.aspx?source=2&series=SI.POV.GINI&country=#]

The Gini Coefficient is given in percent form with 100 being perfect inequality and 0 being perfect equality.

Read in the data, which was downloaded from the reference and uploaded to github

GiniData <- read.csv("https://raw.githubusercontent.com/NNedd/MSDA-Github-repository/master/Data607/Project2/Data_Extract_From_World_Development_Indicators/49d6d15c-fab2-4acf-b212-5d1c8222b8b8_Data.csv", stringsAsFactors = FALSE, na.strings = c("", ".."))
head(GiniData)
##                     ï..Series.Name Series.Code   Country.Name Country.Code
## 1 GINI index (World Bank estimate) SI.POV.GINI    Afghanistan          AFG
## 2 GINI index (World Bank estimate) SI.POV.GINI        Albania          ALB
## 3 GINI index (World Bank estimate) SI.POV.GINI        Algeria          DZA
## 4 GINI index (World Bank estimate) SI.POV.GINI American Samoa          ASM
## 5 GINI index (World Bank estimate) SI.POV.GINI        Andorra          ADO
## 6 GINI index (World Bank estimate) SI.POV.GINI         Angola          AGO
##   X1990..YR1990. X2000..YR2000. X2001..YR2001. X2002..YR2002.
## 1             NA             NA             NA             NA
## 2             NA             NA             NA          31.74
## 3             NA             NA             NA             NA
## 4             NA             NA             NA             NA
## 5             NA             NA             NA             NA
## 6             NA          51.96             NA             NA
##   X2003..YR2003. X2004..YR2004. X2005..YR2005. X2006..YR2006.
## 1             NA             NA             NA             NA
## 2             NA             NA           30.6             NA
## 3             NA             NA             NA             NA
## 4             NA             NA             NA             NA
## 5             NA             NA             NA             NA
## 6             NA             NA             NA             NA
##   X2007..YR2007. X2008..YR2008. X2009..YR2009. X2010..YR2010.
## 1             NA             NA             NA             NA
## 2             NA          29.98             NA             NA
## 3             NA             NA             NA             NA
## 4             NA             NA             NA             NA
## 5             NA             NA             NA             NA
## 6             NA          42.72             NA             NA
##   X2011..YR2011. X2012..YR2012. X2013..YR2013. X2014..YR2014.
## 1             NA             NA             NA             NA
## 2             NA          28.96             NA             NA
## 3             NA             NA             NA             NA
## 4             NA             NA             NA             NA
## 5             NA             NA             NA             NA
## 6             NA             NA             NA             NA
##   X2015..YR2015. X2016..YR2016.
## 1             NA             NA
## 2             NA             NA
## 3             NA             NA
## 4             NA             NA
## 5             NA             NA
## 6             NA             NA

Select only the Country Name along with the GINI values from 2000 to 2014

GiniData_part <- select(GiniData, Country.Code, Country.Name, X2000..YR2000.:X2014..YR2014.)
head(GiniData_part)
##   Country.Code   Country.Name X2000..YR2000. X2001..YR2001. X2002..YR2002.
## 1          AFG    Afghanistan             NA             NA             NA
## 2          ALB        Albania             NA             NA          31.74
## 3          DZA        Algeria             NA             NA             NA
## 4          ASM American Samoa             NA             NA             NA
## 5          ADO        Andorra             NA             NA             NA
## 6          AGO         Angola          51.96             NA             NA
##   X2003..YR2003. X2004..YR2004. X2005..YR2005. X2006..YR2006.
## 1             NA             NA             NA             NA
## 2             NA             NA           30.6             NA
## 3             NA             NA             NA             NA
## 4             NA             NA             NA             NA
## 5             NA             NA             NA             NA
## 6             NA             NA             NA             NA
##   X2007..YR2007. X2008..YR2008. X2009..YR2009. X2010..YR2010.
## 1             NA             NA             NA             NA
## 2             NA          29.98             NA             NA
## 3             NA             NA             NA             NA
## 4             NA             NA             NA             NA
## 5             NA             NA             NA             NA
## 6             NA          42.72             NA             NA
##   X2011..YR2011. X2012..YR2012. X2013..YR2013. X2014..YR2014.
## 1             NA             NA             NA             NA
## 2             NA          28.96             NA             NA
## 3             NA             NA             NA             NA
## 4             NA             NA             NA             NA
## 5             NA             NA             NA             NA
## 6             NA             NA             NA             NA

Change to long format, omit N/A values and sort

giniLong <- gather(GiniData_part,Year, Gini, X2000..YR2000.:X2014..YR2014.)
#giniLong <- na.omit(giniLong)
giniLong_sort <- arrange(giniLong, Country.Name)
giniLong_sort <- na.omit(giniLong_sort)
giniLong_sort$Year <- as.numeric(str_extract(giniLong_sort$Year, "[[:digit:]]{4}"))
head(giniLong_sort)
##    Country.Code Country.Name Year  Gini
## 18          ALB      Albania 2002 31.74
## 21          ALB      Albania 2005 30.60
## 24          ALB      Albania 2008 29.98
## 28          ALB      Albania 2012 28.96
## 76          AGO       Angola 2000 51.96
## 84          AGO       Angola 2008 42.72

Display trend by country using sparkbars embedded in a table1

r <- range(giniLong_sort$Gini)

#Setup sparkbars
bar_string <- "type: 'bar', barColor: 'purple', negBarColor: 'green', highlightColor: 'grey'"
cb_bar = JS(paste0("function (oSettings, json) { $('.spark:not(:has(canvas))').sparkline('html', { ", 
    bar_string, " }); }"), collapse = "")

#Indicate which columns will use sparkbars(the second in this case)
table_columns <- list(list(targets = c(1), render = JS("function(data, type, full){ return '<span class=spark>' + data + '</span>' }")))

trend_details <- 
  giniLong_sort %>% 
  group_by(Country.Name) %>% 
  summarise(Trend = paste(Gini, collapse = ",")) 

d1 <- datatable(trend_details, rownames = FALSE, options = list(columnDefs = table_columns,     fnDrawCallback = cb_bar))
d1$dependencies <- append(d1$dependencies, htmlwidgets:::getDependency("sparkline"))
d1

Azerbaijan shows an interesting trend. There was high gini followed by many years of low gini then a spike again

AZE_data <- filter(giniLong_sort, Country.Code == "AZE")
ggplot(AZE_data, aes(Year, Gini)) + geom_col()

Summarise by country and find mean for each country

giniLong_group <- group_by(giniLong_sort, Country.Code)
giniSummary <- summarise(giniLong_group, mean(Gini))
giniSummary_sort <- arrange(giniSummary, `mean(Gini)`)

10 Highest/Lowest inequalities

bottom10 <- head(giniSummary_sort,10)
datatable(bottom10, caption = "Lowest Gini Values")
top10 <- tail(giniSummary_sort, 10)
datatable(top10, caption = "Highest Gini Values")

The higher the gini value the less equality there is. By the same token, the lower the gini value the greater equality there is

Based on the tables above the lowest average gini over 2000 to 2014 is Azerbaijan (relative high equality in the country), and the highest is Botswana which signifies high inequality in the country

Map of World Wide mean gini coefficient

#create a map-shaped window
par(mai=c(0,0,0.2,0),xaxs="i",yaxs="i")
#join to a coarse resolution map
spdf <- joinCountryData2Map(giniSummary_sort, joinCode="ISO3", nameJoinColumn="Country.Code")
## 144 codes from your data successfully matched countries in the map
## 5 codes from your data failed to match with a country code in the map
## 99 codes from the map weren't represented in your data
mapCountryData(spdf, nameColumnToPlot="mean(Gini)")

Consumer Complaints Data

In this result set we will be analyzing the Consumer complaints about financial services

“The Consumer Financial Protection Bureau (CFPB) is the first federal agency solely focused on consumer financial protection and consumer complaints are an integral part of that work. The CFPB helps connect consumers with financial companies to make their voices heard This Monthly Complaint Report provides a high-level snapshot of trends in consumer complaints. The Monthly Complaint Report uses a three-month rolling average, comparing the current average to the same period in the prior year where appropriate, to account for monthly and seasonal fluctuations. In some cases, we use month-to-month comparisons to highlight more immediate trends. For companylevel complaint data, we use a three-month rolling average of complaints sent to companies for response. This company-level complaint data lags other complaint data in this report by two months to reflect the 60 days companies have to respond to complaints, confirming a commercial relationship with the consumer” -https://s3.amazonaws.com/files.consumerfinance.gov/f/documents/201702_cfpb_Monthly-Complaint-Report.pdf

Read in Data

file_path <- c("https://raw.githubusercontent.com/nobieyi00/CUNY_MSDA_R/master/consumer_complaints.csv")

cc_df <- read.csv(file_path, header = TRUE, stringsAsFactors = FALSE,
           na.strings = c("N/A",''))

Find the number of sub-product complaints

cc_df %>%
group_by(Sub.product) %>%
summarise(count=n())
## # A tibble: 38 × 2
##                               Sub.product count
##                                     <chr> <int>
## 1             (CD) Certificate of deposit    12
## 2                                    Auto    11
## 3      Cashing a check without an account     2
## 4                        Checking account   231
## 5  Conventional adjustable mortgage (ARM)   100
## 6             Conventional fixed mortgage   308
## 7                             Credit card   133
## 8                         Debt settlement     2
## 9            Domestic (US) money transfer    13
## 10                   Federal student loan     8
## # ... with 28 more rows

Find the number of sub-product complaints

cc_df %>%
 group_by(Sub.product) %>%
  summarise(count=n()) %>%
  filter(Sub.product != '' )%>%
   arrange(desc(count))
## # A tibble: 37 × 2
##                               Sub.product count
##                                     <chr> <int>
## 1                          Other mortgage   315
## 2             Conventional fixed mortgage   308
## 3                        Checking account   231
## 4   Other (i.e. phone, health club, etc.)   207
## 5                             Credit card   133
## 6                           I do not know   126
## 7                            FHA mortgage   114
## 8              Other bank product/service   109
## 9  Conventional adjustable mortgage (ARM)   100
## 10                                Medical    87
## # ... with 27 more rows

Find the product that gets most complaints

cc_df %>%
 group_by(Product) %>%
  summarise(No_Products=n()) %>%
   arrange(No_Products)
## # A tibble: 11 × 2
##                    Product No_Products
##                      <chr>       <int>
## 1  Other financial service           4
## 2             Prepaid card          18
## 3          Money transfers          27
## 4              Payday loan          28
## 5             Student loan          44
## 6            Consumer Loan         159
## 7              Credit card         339
## 8  Bank account or service         383
## 9          Debt collection         625
## 10        Credit reporting         638
## 11                Mortgage         921

Find the product that gets most complaints with sub products

cc_df %>%
  filter(Sub.product != '' )%>%
 group_by(Product, Sub.product) %>%
  summarise(No_Products=n()) %>%
   arrange(No_Products)
## Source: local data frame [39 x 3]
## Groups: Product [9]
## 
##                    Product                        Sub.product No_Products
##                      <chr>                              <chr>       <int>
## 1            Consumer Loan                         Title loan           1
## 2             Prepaid card              Gift or merchant card           1
## 3             Prepaid card                       Payroll card           1
## 4             Student loan     Federal student loan servicing           1
## 5  Bank account or service Cashing a check without an account           2
## 6  Other financial service                    Debt settlement           2
## 7  Other financial service          Refund anticipation check           2
## 8             Prepaid card                    ID prepaid card           2
## 9             Prepaid card                      Mobile wallet           2
## 10            Prepaid card    Government benefit payment card           3
## # ... with 29 more rows

TOP 10 Companies with the least timely response and most complaints

company_df <- cc_df %>%
  filter(Timely.response.=='No')%>%
  group_by(Company)%>%
  summarise(No_complaints = n())%>%
  arrange(desc(No_complaints))


company_ten <-head(company_df , 10)
company_ten
## # A tibble: 10 × 2
##                                     Company No_complaints
##                                       <chr>         <int>
## 1                     Wells Fargo & Company            11
## 2                      High Point Asset Inc             3
## 3  Accelerated Receivables Management, Inc.             2
## 4                       B.C. Services, Inc.             2
## 5                          Bank of the West             2
## 6                  FirstBank of Puerto Rico             2
## 7                            Mobiloans, LLC             2
## 8                  Planet Home Lending, LLC             2
## 9                     Real Time Resolutions             2
## 10                             Alorica Inc.             1
# Generate data
 ggplot(company_ten, aes(x=Company,y=No_complaints,fill=Company)) + geom_bar(stat="identity", show.legend = FALSE) + coord_flip()

We clearly see it is Wells fargo is at fault

Which state has the most complaints, top 10

 top10_states<- cc_df %>%
  group_by(State)%>%
  summarise(No_complaints = n())%>%
  arrange(desc(No_complaints))%>%
  select(State)%>%
  head(10)

 top10_states
## # A tibble: 10 × 1
##    State
##    <chr>
## 1     CA
## 2     FL
## 3     TX
## 4     NY
## 5     GA
## 6     NJ
## 7     PA
## 8     VA
## 9     MD
## 10    OH

Analyze complaints on a monthly basis per product

Product_Month_df <- cc_df %>%
  separate(Date.received, c("Month","Day","Year"), sep = "/") %>%
  mutate(Month_num= as.numeric(Month)) %>%
  group_by(Product,Month_num)%>%
  summarise(No_complaints = n())%>%
  arrange(Product , Month_num)

pm_b <-Product_Month_df%>%
  filter(Product=="Bank account or service")
 

     
  
 ggplot(Product_Month_df, aes(Month_num, No_complaints,colour=Product)) + 
    geom_line() + 
    geom_point() +ggtitle(label="Trend of complaints per month") 

We notice that Debt collection is usually high in beginning of year

 plot(pm_b$Month_num, pm_b$No_complaints)
  lines(pm_b$Month_num[order(pm_b$Month_num)], pm_b$No_complaints)
   title(main="Bank Accout or Service trend per month", xlab="Month", ylab="Number of complaints")

Peform trend analysis compare last years Number of complaints to this years per states with most complaints

S_y_complain <-cc_df %>%
  separate(Date.received, c("Month","Day","Year"), sep = "/") %>%
  mutate(Year_num= as.numeric(Year)) %>%
  group_by(State,Year_num)%>%
  summarise(No_complaints = n())%>%
  filter(State %in%  top10_states$State )%>%
  arrange(State,Year_num)

S_y_complain 
## Source: local data frame [27 x 3]
## Groups: State [10]
## 
##    State Year_num No_complaints
##    <chr>    <dbl>         <int>
## 1     CA     2013            58
## 2     CA     2016           375
## 3     CA     2017             4
## 4     FL     2013            37
## 5     FL     2016           251
## 6     FL     2017             1
## 7     GA     2013            18
## 8     GA     2016           156
## 9     GA     2017             1
## 10    MD     2013             3
## # ... with 17 more rows
ggplot(S_y_complain, aes(x=State,y=No_complaints,fill=Year_num)) + geom_bar(stat="identity", show.legend = TRUE) 

Child Mortality Rates

Child mortality rate around the world is a bench mark used to know how healthy are the childeren around the world. World leaders need to pay attention to their countries child mortality to avoid catastrophic consequences in future generation. In this data set we would study the child mortality rates This data was retrieved from http://www.childmortality.org/files_v20/download/RatesDeaths_AllIndicators.xlsx

Load data with Child mortality rate data

file_path <- c("https://raw.githubusercontent.com/nobieyi00/CUNY_MSDA_R/master/childmortality.csv")

CMR_df <- read.csv(file_path, header = TRUE, stringsAsFactors = FALSE,
           na.strings = c("N/A",''))

Get the median values for each country as the value for analysis

CMR_1<-CMR_df %>%
  filter(Uncertainty.bounds. == 'Median')

Lets tidy the data and find Infant mortality so that it can be easily analyzed using gather()

IM <-CMR_1 %>%
gather("type_year","Value",4:ncol(CMR_1)) %>%
#filter(type_year %in% c('Infant.Deaths.','IMR'))%>%
mutate(year= as.numeric(str_sub(type_year,-4,-1)), type = str_sub(type_year,1,str_length(type_year)-5) )%>%
filter(type %in% c('Infant.Deaths','IMR'))%>%
filter(is.na(Value)==FALSE)%>%  
select (CountryName, year,type,Value)

Analyze some countries with high Mortality rates and notice trend in two year periods

Cnt_10<-IM%>% 
  filter(type=="IMR" )%>%
mutate(year_range=ifelse(year %in% c(1950:1970),  '1950_1970',
                  ifelse(year %in% c(1971:1990),'1971_1990',
                  ifelse(year %in% c(1991:2010),'1991_2010',
                  ifelse(year %in% c(2011:2030),'2011_2030',NA)))))%>%
    group_by(CountryName,year_range)  %>%
    summarise(avg  = sum(Value)/n())%>%
    arrange(desc(avg))%>%
select(CountryName)%>%
  head(10)

         
 tre<- IM%>% 
  filter(type=="IMR" )%>%
  filter(CountryName %in% Cnt_10$CountryName  )%>% 
    
mutate(year_range=ifelse(year %in% c(1950:1970),  '1950_1970',
                  ifelse(year %in% c(1971:1990),'1971_1990',
                  ifelse(year %in% c(1991:2010),'1991_2010',
                  ifelse(year %in% c(2011:2030),'2011_2030',NA)))))%>%
    group_by(CountryName,year_range)  %>%
    summarise(avg  = sum(Value)/n())%>%
    arrange(CountryName)


ggplot(tre, aes(x=CountryName,y=avg,fill=year_range)) + geom_bar(stat="identity", show.legend = TRUE) 

Cnt_10<-IM%>% 
  filter(type=="IMR" )%>%
mutate(year_range=ifelse(year %in% c(1950:1970),  '1950_1970',
                  ifelse(year %in% c(1971:1990),'1971_1990',
                  ifelse(year %in% c(1991:2010),'1991_2010',
                  ifelse(year %in% c(2011:2030),'2011_2030',NA)))))%>%
    group_by(CountryName,year_range)  %>%
    summarise(avg  = sum(Value)/n())%>%
    arrange(desc(avg))%>%
select(CountryName)%>%
  head(3)

         
 tre<- IM%>% 
  filter(type=="IMR" )%>%
  filter(CountryName %in% Cnt_10$CountryName  )%>% 
    
mutate(year_range=ifelse(year %in% c(1950:1970),  '1950_1970',
                  ifelse(year %in% c(1971:1990),'1971_1990',
                  ifelse(year %in% c(1991:2010),'1991_2010',
                  ifelse(year %in% c(2011:2030),'2011_2030',NA)))))%>%
    group_by(CountryName,year_range)  %>%
    summarise(avg  = sum(Value)/n())%>%
    arrange(CountryName)

ggplot(tre, aes(year_range, avg,colour=CountryName, group=CountryName
                  )) +  geom_point() +ggtitle(label="Trend of infant mortality rates over the years") +
    geom_line()

Infant Number of Deaths

Lets look the country that had the most reduced rate of child deaths over time. This country we can say made the most improvement

But first lets look at any country and plot it out over time

france_ND <-IM%>% 
  filter(type=="Infant.Deaths" & CountryName== "France" )

ggplot(france_ND, aes(year, Value,colour=CountryName, group=CountryName
                  )) +  geom_point() +ggtitle(label="Trend of Infant number of deaths over the years") +
    geom_line()

#find coorelation

corr_france <- cor(france_ND$Value,france_ND$year)

sd(france_ND$year)
## [1] 19.19635
sd(france_ND$Value)
## [1] 9336.837
Regression_slop <-corr_france*(sd(france_ND$Value)/sd(france_ND$year))

Lets look at the trend of Zambia

Zambia_ND <-IM%>% 
  filter(type=="Infant.Deaths" & CountryName== "Zambia" )

ggplot(Zambia_ND, aes(year, Value,colour=CountryName, group=CountryName
                  )) +  geom_point() +ggtitle(label="Trend of Infant number of deaths over the years") +
    geom_line()

#find coorelation

corr_Zambia <- cor(Zambia_ND$Value,Zambia_ND$year)

sd(Zambia_ND$year)
## [1] 18.3303
sd(Zambia_ND$Value)
## [1] 9402.08
Regression_slop <-corr_Zambia*(sd(Zambia_ND$Value)/sd(Zambia_ND$year))
Regression_slop
## [1] 379.2305

Lets look at Nigeria

Nigeria_ND <-IM%>% 
  filter(type=="Infant.Deaths" & CountryName== "Nigeria" )

ggplot(Nigeria_ND, aes(year, Value,colour=CountryName, group=CountryName
                  )) +  geom_point() +ggtitle(label="Trend of Infant number of deaths over the years") +
    geom_line()

##What is regression slope for all the countries ###Lets get the top 10 countries

IM%>% 
  filter(type=="Infant.Deaths" )%>% 
  group_by(CountryName)%>% 
  summarize(Cor_co = cor(Value,year), sd_year=sd(year), sd_value=sd(Value) ,Regression_slope=cor(Value,year)*(sd(Value)/sd(year)))%>% 
arrange(Regression_slope)%>%
head(10)
## Warning in cor(c(16874, 17437, 17478, 16961, 16345, 15696, 15158, 14891, :
## the standard deviation is zero

## Warning in cor(c(16874, 17437, 17478, 16961, 16345, 15696, 15158, 14891, :
## the standard deviation is zero
## # A tibble: 10 × 5
##    CountryName     Cor_co  sd_year  sd_value Regression_slope
##          <chr>      <dbl>    <dbl>     <dbl>            <dbl>
## 1        China -0.8917827 13.71131 640440.74       -41654.224
## 2        India -0.9458409 18.33030 674350.51       -34796.388
## 3    Indonesia -0.9812851 18.90767 166591.65        -8645.904
## 4       Brazil -0.9879252 17.75293 130914.41        -7285.198
## 5   Bangladesh -0.8016545 18.33030 116985.35        -5116.219
## 6        Egypt -0.9770570 17.75293  79095.60        -4353.134
## 7       Turkey -0.9913345 18.33030  70690.77        -3823.079
## 8         Iran -0.9697807 13.13393  47994.76        -3543.830
## 9       Mexico -0.9777475 15.44345  53748.73        -3402.912
## 10    Thailand -0.9758883 19.19635  43217.27        -2197.043

Lets look at China

China_ND <-IM%>% 
  filter(type=="Infant.Deaths" & CountryName== "China" )

ggplot(China_ND, aes(year, Value,colour=CountryName, group=CountryName
                  )) +  geom_point() +ggtitle(label="Trend of Infant number of deaths over the years") +
    geom_line()

###Lets get the bottom 10 countries

IM%>% 
  filter(type=="Infant.Deaths" )%>% 
  group_by(CountryName)%>% 
  summarize(Cor_co = cor(Value,year), sd_year=sd(year), sd_value=sd(Value) ,Regression_slope=cor(Value,year)*(sd(Value)/sd(year)))%>% 
arrange(desc(Regression_slope))%>%
head(10)
## Warning in cor(c(16874, 17437, 17478, 16961, 16345, 15696, 15158, 14891, :
## the standard deviation is zero

## Warning in cor(c(16874, 17437, 17478, 16961, 16345, 15696, 15158, 14891, :
## the standard deviation is zero
## # A tibble: 10 × 5
##      CountryName    Cor_co  sd_year  sd_value Regression_slope
##            <chr>     <dbl>    <dbl>     <dbl>            <dbl>
## 1        Nigeria 0.8135546 15.15476 59767.937        3208.5290
## 2       Congo DR 0.9606360 13.71131 40002.844        2802.6623
## 3         Angola 0.9765648 10.53565 17618.692        1633.1017
## 4         Uganda 0.7131499 18.04162 20326.875         803.4816
## 5           Chad 0.9927706 12.84523  9811.845         758.3289
## 6          Sudan 0.9040948 18.33030 13998.715         690.4504
## 7          Niger 0.8290873 14.28869  9388.198         544.7410
## 8       Tanzania 0.5495608 18.90767 17787.294         516.9965
## 9  Cote d Ivoire 0.8912637 17.17556  9685.933         502.6164
## 10       Somalia 0.9004450  9.66954  4583.354         426.8102

This shows Nigeria infant deaths increased over time and is the worst in the world. We can also say that the top 10 infant child deaths are in African countries. However, a plot above shows it is is reducing. I am from Nigeria and I am surprised that our infant mortality rose the highest.

Under 5 mortality analysis

Under5_data <- select(CMR_df, ISO.Code:U5MR.2015,Under.five.Deaths.1950:Under.five.Deaths.2015)
datatable(head(Under5_data))

Under 5 mortality rate analysis

Under5_rate <- Under5_data %>% select(ISO.Code:U5MR.2015) %>%
 filter(Uncertainty.bounds. == "Median")

Transform data into long form

under5_rateLong <- gather(Under5_rate,Year, MortalityRate, U5MR.1950:U5MR.2015)

under5_rateLong$Year <- as.numeric(str_extract(under5_rateLong$Year, "[[:digit:]]{4}"))
head(under5_rateLong)
##   ISO.Code          CountryName Uncertainty.bounds. Year MortalityRate
## 1      AFG          Afghanistan              Median 1950            NA
## 2      AGO               Angola              Median 1950            NA
## 3      ALB              Albania              Median 1950            NA
## 4      AND              Andorra              Median 1950            NA
## 5      ARE United Arab Emirates              Median 1950            NA
## 6      ARG            Argentina              Median 1950            NA

Eliminate N/A values and redundant Uncertainty Bounds Column

Under5_rateSome <- na.omit(under5_rateLong)
Under5_rateSome <- select(Under5_rateSome, ISO.Code, CountryName, Year, MortalityRate)

Display Map of Mortality rates

#create a map-shaped window
par(mai=c(0,0,0.2,0),xaxs="i",yaxs="i")
#join to a coarse resolution map
spdf <- joinCountryData2Map(Under5_rateSome, joinCode="ISO3", nameJoinColumn="ISO.Code")
## 10244 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 48 codes from the map weren't represented in your data
mapCountryData(spdf, nameColumnToPlot="MortalityRate",missingCountryCol = "grey",colourPalette = c("green", "yellow", "red"))
## Warning in rwmGetColours(colourPalette, numColours): 3 colours specified
## and 7 required, using interpolation to calculate colours

From the map it can be seen that countries in South America, Africa and Asia tend to have higher mortality rates than the rest of the world

One exeption in Africa is Namibia

namibia_data <- filter(Under5_rateSome, CountryName == "Namibia")
boxplot(namibia_data$MortalityRate)

Isolate Numbers

Under5_count <- Under5_data %>% select(ISO.Code:Uncertainty.bounds.,Under.five.Deaths.1950:Under.five.Deaths.2015) %>%
 filter(Uncertainty.bounds. == "Median")

Transform data into long form

under5_countLong <- gather(Under5_count,Year, MortalityRate, Under.five.Deaths.1950:Under.five.Deaths.2015)
under5_countLong$Year <- as.numeric(str_extract(under5_countLong$Year, "[[:digit:]]{4}"))
head(under5_countLong)
##   ISO.Code          CountryName Uncertainty.bounds. Year MortalityRate
## 1      AFG          Afghanistan              Median 1950            NA
## 2      AGO               Angola              Median 1950            NA
## 3      ALB              Albania              Median 1950            NA
## 4      AND              Andorra              Median 1950            NA
## 5      ARE United Arab Emirates              Median 1950            NA
## 6      ARG            Argentina              Median 1950            NA

Eliminate N/A values and redundant Uncertainty Bounds Column

Under5_countLong <- na.omit(under5_countLong)
Under5_countLong <- select(Under5_countLong, ISO.Code, CountryName, Year, MortalityRate)
head(Under5_countLong)
##    ISO.Code  CountryName Year MortalityRate
## 9       AUS    Australia 1950          6169
## 14      BEN        Benin 1950         27932
## 15      BFA Burkina Faso 1950         72277
## 30      CAN       Canada 1950         17848
## 31      CHE  Switzerland 1950          3365
## 49      DNK      Denmark 1950          2917
r <- range(Under5_countLong$MortalityRate)

#Setup sparklines
box_string <- "type: 'box', lineColor: 'black', whiskerColor: 'black', outlierFillColor: 'black', outlierLineColor: 'black', medianColor: 'black', boxFillColor: 'orange', boxLineColor: 'black'"
cb_box = JS(paste0("function (oSettings, json) { $('.spark:not(:has(canvas))').sparkline('html', { ", 
    box_string, ", chartRangeMin: ", r[1], ", chartRangeMax: ", r[2], " }); }"), 
    collapse = "")


#Indicate which columns will use sparklines (the second in this case)
table_columns2 <- list(list(targets = c(1), render = JS("function(data, type, full){ return '<span class=spark>' + data + '</span>' }")))

trend_details2 <- 
  Under5_countLong %>% 
  group_by(CountryName) %>% 
  summarise(Trend = paste(MortalityRate, collapse = ",")) 

d2 <- datatable(trend_details2, rownames = FALSE, options = list(columnDefs = table_columns2, fnDrawCallback = cb_box))
d2$dependencies <- append(d2$dependencies, htmlwidgets:::getDependency("sparkline"))
d2

China and India show great variability in their mortality numbers, almost all other countries show very little variablity in the numbers

china_data <- filter(Under5_countLong, CountryName == "China")
f <- ggplot(china_data, aes(Year,MortalityRate))
f+geom_col()

in the 1900’s China hand a large number of deaths of children under 5. This amount has decreased over the years.

Neonatal Deaths

Lets tidy the data and find Neonatal mortality so that it can be easily analyzed using gather()

NM <-CMR_1 %>%
gather("type_year","Value",4:ncol(CMR_1)) %>%
#filter(type_year %in% c('Infant.Deaths.','IMR'))%>%
mutate(year= as.numeric(str_sub(type_year,-4,-1)), type = str_sub(type_year,1,str_length(type_year)-5) )%>%
filter(type %in% c('Neonatal.Deaths','NMR'))%>%
filter(is.na(Value)==FALSE)%>%  
select (ISO.Code,CountryName, year,type,Value)

Analyse the trend of those with highest rates

TopNM<-NM%>% 
  filter(type=="NMR" )%>%
mutate(year_range=ifelse(year %in% c(1950:1970),  '1950_1970',
                  ifelse(year %in% c(1971:1990),'1971_1990',
                  ifelse(year %in% c(1991:2010),'1991_2010',
                  ifelse(year %in% c(2011:2030),'2011_2030',NA)))))%>%
    group_by(CountryName,year_range)  %>%
    summarise(avg  = sum(Value)/n())%>%
    arrange(desc(avg))%>%
select(CountryName)%>%
  head(6)

         
 tre<- NM%>% 
  filter(type=="NMR" )%>%
  filter(CountryName %in% TopNM$CountryName  )%>% 
    
mutate(year_range=ifelse(year %in% c(1950:1970),  '1950_1970',
                  ifelse(year %in% c(1971:1990),'1971_1990',
                  ifelse(year %in% c(1991:2010),'1991_2010',
                  ifelse(year %in% c(2011:2030),'2011_2030',NA)))))%>%
    group_by(CountryName,year_range)  %>%
    summarise(avg  = sum(Value)/n())%>%
    arrange(CountryName)

ggplot(tre, aes(year_range, avg,colour=CountryName, group=CountryName
                  )) +  geom_point() +ggtitle(label="Trend of neonatal mortality rates over the years") +
    geom_line()

###World View

NM_map <- NM%>% 
  filter(type=="NMR" )
#create a map-shaped window
par(mai=c(0,0,0.2,0),xaxs="i",yaxs="i")
#join to a coarse resolution map
spdf <- joinCountryData2Map(NM_map, joinCode="ISO3", nameJoinColumn="ISO.Code")
## 8494 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 48 codes from the map weren't represented in your data
mapCountryData(spdf, nameColumnToPlot="Value",missingCountryCol = "grey",colourPalette = "diverging", mapTitle = "World wide neonatal mortality rates")


  1. This code was adapted from the following two sources: https://leonawicz.github.io/HtmlWidgetExamples/ex_dt_sparkline.html and https://github.com/htmlwidgets/sparkline/issues/3 both accessed on March 12, 2017.