library(tidyverse)
library(kableExtra)
library(knitr)
library(stringr)
library(readr)
library(plyr)
library(dplyr)
library(data.table)
library(readxl)
require(xlsx)
library(rJava)
library(xlsx)

Tidy dataset 1: Movies database

Load in Data from xlsx file

  • Dataframe requires tidying
    • Join individual movies into one movies column
    • Separate timestamp by day and time
      • Could perform some interesting time series analysis on the effects of time of day of review
    • Omit NA Values
movies <- read.xlsx("Movie_Ratings.xlsx",1)
movies%<>% 
    gather(Movies,n,3:12) %>% 
    select(Timestamp,Name,Movies,n,Number.of.Movies.Seen) %>%   
    separate('Timestamp',c("day","time")," ") %>% 
    na.omit() %>% 
    arrange(Name)
View(movies)
    
movies %>% 
    dplyr::group_by(Movies) %>% 
    dplyr::summarise(round(mean(n),2))
## # A tibble: 10 x 2
##    Movies                                      `round(mean(n), 2)`
##    <chr>                                                     <dbl>
##  1 Alien..Covenant                                            0.38
##  2 Blade.Runner.2049                                          1.54
##  3 Ghost.in.the.Shell                                         0.69
##  4 Guardians.of.the.Galaxy.2                                  2.73
##  5 Spider.Man..Homecoming                                     1.58
##  6 Star.Wars..The.Last.Jedi                                   2.92
##  7 Thor..Ragnarok                                             2.08
##  8 Valerian.and.the.City.of.a.Thousand.Planets                0.54
##  9 War.for.the.Planet.of.the.Apes                             0.96
## 10 Wonder.Woman                                               3.12

data Visualization

  • Display average rating by user +Display average rating by movie
movies %>%  
    dplyr::group_by(Name) %>% 
    dplyr::summarise(Average_Rating_by_user=round(mean(n),2)) %>% 
    arrange(desc(Average_Rating_by_user)) %>% 
    ggplot(., aes(x=Name , Average_Rating_by_user)) + 
    geom_bar(aes(fill = Name), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=90,hjust=1))+
    coord_flip()

movies %>%  
    dplyr::group_by(Movies) %>% 
    dplyr::summarise(Average_movie_rating=round(mean(n),2)) %>% 
    arrange(desc(Average_movie_rating)) %>% 
    ggplot(., aes(x=Movies , Average_movie_rating)) + 
    geom_bar(aes(fill = Movies), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=45,hjust=1))

World Bank info for Infant Mortality, Tuberculosis, and Aids

  • Binish recommended this dataset website
  • link to source
  • Randomly chose 4 countires with the letter U
  • Read in files individually/with functional
#Manually Load in CSV
Uganda<- read.csv("https://raw.githubusercontent.com/justinherman42/Justin-Data-607/master/tidy_data_week4/uganda.csv",na.strings=c("","NA"))
Ukraine <- read.csv("https://raw.githubusercontent.com/justinherman42/Justin-Data-607/master/tidy_data_week4/Ukraine.csv",na.strings=c("","NA"))
United_kingdom <- read.csv("https://raw.githubusercontent.com/justinherman42/Justin-Data-607/master/tidy_data_week4/United_Kingdom.csv",na.strings=c("","NA"))
United_states <- read.csv("https://raw.githubusercontent.com/justinherman42/Justin-Data-607/master/tidy_data_week4/United_states.csv",na.strings=c("","NA"))


# Functional to load csv
files <- c("https://raw.githubusercontent.com/justinherman42/Justin-Data-607/master/tidy_data_week4/uganda.csv", "https://raw.githubusercontent.com/justinherman42/Justin-Data-607/master/tidy_data_week4/Ukraine.csv", "https://raw.githubusercontent.com/justinherman42/Justin-Data-607/master/tidy_data_week4/United_Kingdom.csv", "https://raw.githubusercontent.com/justinherman42/Justin-Data-607/master/tidy_data_week4/United_states.csv")
temp = files
myfiles <-lapply(temp,read.csv)

Join on shared row index values(survey questions)

  • This will decrease our sample to only those questions that are answered in every dataset
merged_df<- inner_join(United_states,United_kingdom) %>% 
    inner_join(.,Ukraine) %>% 
    inner_join(.,Uganda)

# With Functional data
merged_df_2<- inner_join(myfiles[[1]],myfiles[[2]]) %>% 
    inner_join(myfiles[[3]]) %>%   
    inner_join(myfiles[[4]])
#kable(merged_df_2[1:15,])

Tidy Dataset

getting_tidy <- merged_df[-(1),] %>% 
    gather(United_states,Values,2:57) %>% 
    mutate(Year=rep(unlist(lapply(2015:2002,function(x) rep(x,23))),4)) %>% 
    spread(X, Values) 
## Warning: attributes are not identical across measure variables;
## they will be dropped

Get rid of special characters ] [ - and add spacing for next pipe of tidying

  • Use pipe to Tidy Mean[min-max] columns into mean,min,max columns
getting_tidy <-  as_data_frame(lapply(getting_tidy,function(x){ str_replace_all(x,"\\]|\\["," ")}))
## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
getting_tidy <-  as_data_frame(lapply(getting_tidy,function(x){ str_replace_all(x,"-","  ")}))



final_df <- getting_tidy %>% 
    separate(.,`Incidence of tuberculosis (per 100 000 population per year)`,into=c('Mean_Incidence_Tuberculosis_100,000','Min_Tuberculosis',"Max_Tuberculosis")) %>% 
    separate(.,`Antiretroviral therapy coverage among people with HIV infection eligible for ART according to 2010 guidelines (%)`,into=c('Mean_Antiretroviral_coverage','Min_Antiretroviral_coverage','Max_Antiretroviral_coverage')) %>% 
    separate(.,`Infant mortality rate (probability of dying between birth and age 1 per 1000 live births)`,into=c('Mean_Infant mortality_Rate/1000','Min_Infant_Mortality rate',"Max_Infant_Mortality_Rate"),sep= "  ") %>% 
    separate(.,`Under-five mortality rate (probability of dying by age 5 per 1000 live births)`,into=c('Mean_Under-five_Mortality_Rate/1000','Min_Under-five_Mortality_Rate',"Max_Under-five_Mortality_Rate"), sep="  ") %>% 
    separate(.,`Tuberculosis treatment coverage`,into=c('Mean_Tuberculosis_Coverage','Min_Tuberculosis_Coverage',"Max_Tuberculosis_Coverage"))
## Warning: Expected 3 pieces. Additional pieces discarded in 56 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Warning: Expected 3 pieces. Additional pieces discarded in 2 rows [8, 22].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [36,
## 50].
## Warning: Expected 3 pieces. Additional pieces discarded in 55 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [18].

More piping

  • First column needs to be renamed to country column
  • Eliminate the .integers after the values in country column
  • Trim df
  • Convert columns to numeric for analysis
final_df %<>% 
    plyr::rename(.,c('United_states'= 'Country')) %>% 
    mutate(Country=rep(c("Uganda","Ukraine","United_Kingdom","United_States"),each=14)) %>% 
    select(colnames(.)[c(1,2,10,11,12,13,14,15,16,17,21,22,23,30,31,32,33,34,35)]) %>% 
    arrange(.,`Country`)  

final_df[] <- lapply(final_df,function(x){ str_trim(x)}) 
final_df[3:19] <- lapply(final_df[3:19],function(x) as.numeric(as.character(x))) 
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
final_df <- as_data_frame(final_df)
kable(final_df)
Country Year Gross national income per capita (PPP int. $) Hospital beds (per 10 000 population) Mean_Incidence_Tuberculosis_100,000 Min_Tuberculosis Max_Tuberculosis Mean_Infant mortality_Rate/1000 Min_Infant_Mortality rate Max_Infant_Mortality_Rate Number of neonatal deaths (thousands) Number of under-five deaths (thousands) Poliomyelitis - number of reported cases Mean_Tuberculosis_Coverage Min_Tuberculosis_Coverage Max_Tuberculosis_Coverage Mean_Under-five_Mortality_Rate/1000 Min_Under-five_Mortality_Rate Max_Under-five_Mortality_Rate
Uganda 2015 NA NA 202 120 304 39.2 35.4 43.3 38 93 NA 52 34 87 55.9 49.3 63.3
Uganda 2014 NA NA 202 127 294 41.0 37.8 44.5 38 97 0 56 39 89 60.1 54.4 66.4
Uganda 2005 890 NA 233 156 325 74.4 70.3 78.7 38 155 NA 62 44 92 120.3 112.8 128.3
Uganda 2004 840 NA 240 169 323 80.5 76.3 85.0 39 164 NA 66 49 94 131.3 123.5 139.6
Uganda 2003 800 NA 248 172 336 86.7 82.0 91.4 39 173 NA 65 48 94 142.5 133.9 151.3
Uganda 2002 760 NA 256 171 358 92.5 87.5 97.5 40 181 NA 62 44 92 153.1 143.8 162.6
Uganda 2013 NA NA 203 134 288 43.9 40.8 47.2 38 103 0 60 42 91 65.0 59.4 71.1
Uganda 2012 1320 NA 205 135 289 46.5 43.2 50.0 38 109 NA 60 43 91 70.2 64.1 76.8
Uganda 2011 1310 NA 207 143 283 49.5 46.1 53.3 38 114 NA 64 47 92 75.5 69.1 82.3
Uganda 2010 1240 5 210 139 296 52.7 49.2 56.4 38 119 NA 60 43 91 81.0 74.6 87.7
Uganda 2009 1200 4 213 140 302 55.7 52.2 59.3 38 124 NA 60 42 91 86.4 80.1 93.0
Uganda 2008 1150 NA 217 145 304 58.9 55.4 62.5 38 130 NA 61 44 92 92.6 86.2 99.4
Uganda 2007 1070 NA 222 146 313 63.3 59.6 67.3 38 137 NA 60 43 92 100.5 93.6 107.7
Uganda 2006 990 NA 227 150 319 68.6 64.7 72.7 38 145 NA 61 43 92 109.8 102.7 117.3
Ukraine 2015 NA NA 91 59 130 8.1 7.8 8.4 3 5 NA 74 52 110 9.4 9.1 9.8
Ukraine 2014 NA NA 94 61 135 8.4 8.1 8.7 3 5 0 75 52 120 9.8 9.5 10.1
Ukraine 2005 6410 87 127 82 181 12.5 12.0 12.9 4 6 NA 67 47 100 14.5 13.9 15.0
Ukraine 2004 6000 87 127 82 182 13.0 12.3 13.6 4 6 NA NA NA NA 15.1 14.3 15.8
Ukraine 2003 5170 88 126 81 180 13.7 12.8 14.5 4 6 NA 130 91 200 15.9 14.8 16.8
Ukraine 2002 4590 89 123 80 176 14.3 13.2 15.4 4 7 NA 68 48 110 16.7 15.4 18.0
Ukraine 2013 NA 88 96 62 138 8.8 8.5 9.0 3 5 0 84 59 130 10.2 9.9 10.5
Ukraine 2012 8670 89 101 65 144 9.2 8.9 9.4 3 5 NA 90 63 140 10.7 10.3 11.0
Ukraine 2011 8170 90 105 68 150 9.6 9.3 9.9 3 5 NA 71 50 110 11.2 10.8 11.5
Ukraine 2010 7590 94 110 71 157 10.1 9.8 10.4 3 6 NA 67 47 100 11.7 11.4 12.1
Ukraine 2009 7130 94 115 74 164 10.6 10.3 10.9 4 6 NA 68 48 110 12.3 12.0 12.7
Ukraine 2008 8370 87 119 77 170 11.1 10.8 11.4 4 6 NA 69 48 110 12.9 12.5 13.2
Ukraine 2007 7930 87 123 79 175 11.6 11.3 11.9 4 6 NA 66 46 100 13.4 13.1 13.8
Ukraine 2006 7110 87 125 81 179 12.0 11.7 12.3 4 6 NA 71 49 110 14.0 13.6 14.3
United_Kingdom 2015 NA NA 10 9 1 3.7 3.6 3.9 2 4 NA 89 81 98 4.4 4.3 4.6
United_Kingdom 2014 NA NA 11 10 13 3.8 3.7 3.9 2 4 0 89 81 98 4.5 4.4 4.6
United_Kingdom 2005 33820 37 15 14 17 5.1 5.0 5.2 3 4 NA 89 81 98 6.0 5.9 6.2
United_Kingdom 2004 32430 39 13 12 15 5.3 5.2 5.4 2 4 NA 89 81 98 6.2 6.0 6.3
United_Kingdom 2003 30450 40 13 12 15 5.4 5.3 5.4 2 4 NA 89 81 98 6.3 6.1 6.4
United_Kingdom 2002 29390 40 13 12 14 5.4 5.3 5.5 2 4 NA 89 81 98 6.3 6.2 6.5
United_Kingdom 2013 NA 28 13 12 14 3.9 3.8 4.0 2 4 0 89 81 98 4.6 4.5 4.7
United_Kingdom 2012 34640 28 14 13 16 4.1 4.0 4.2 2 4 NA 89 81 98 4.8 4.7 4.9
United_Kingdom 2011 35270 29 15 13 16 4.2 4.2 4.3 2 4 NA 89 81 98 5.0 4.9 5.1
United_Kingdom 2010 34510 30 14 13 15 4.4 4.3 4.5 2 4 NA 89 81 98 5.2 5.1 5.3
United_Kingdom 2009 35260 33 15 13 16 4.6 4.5 4.7 2 4 NA 89 81 98 5.4 5.3 5.5
United_Kingdom 2008 37110 34 15 13 16 4.8 4.7 4.9 2 4 NA 89 81 98 5.6 5.5 5.7
United_Kingdom 2007 36480 34 15 13 16 4.9 4.8 5.0 3 4 NA 89 81 98 5.8 5.6 5.9
United_Kingdom 2006 35620 36 15 14 17 5.0 4.9 5.1 3 4 NA 89 81 98 5.9 5.8 6.0
United_States 2015 NA NA 3 3 2 5.7 5.4 5.9 15 26 NA 87 75 100 6.6 6.4 6.9
United_States 2014 NA NA 3 2 2 5.8 5.7 6.0 16 27 0 87 75 100 6.8 6.6 7.0
United_States 2005 44740 NA 5 5 4 6.8 6.7 6.9 19 33 NA 87 75 100 8.0 7.8 8.1
United_States 2004 42260 NA 5 7 4 6.9 6.8 7.0 19 33 NA 87 75 100 8.1 7.9 8.2
United_States 2003 39960 NA 5 9 5 6.9 6.7 7.0 19 33 NA 87 75 100 8.1 8.0 8.3
United_States 2002 38590 NA 6 5 2 6.9 6.8 7.0 19 33 NA 87 75 100 8.2 8.1 8.4
United_States 2013 NA 29 3 3 2 5.9 5.8 6.0 16 27 0 87 75 100 6.9 6.8 7.1
United_States 2012 52620 29 3 7 3 6.0 5.9 6.1 16 28 NA 87 75 100 7.0 6.9 7.2
United_States 2011 50860 29 3 9 3 6.1 6.0 6.3 16 29 NA 87 75 100 7.2 7.1 7.3
United_States 2010 48880 30 4 2 3 6.2 6.1 6.3 17 30 NA 87 75 100 7.3 7.2 7.5
United_States 2009 47240 31 4 3 3 6.4 6.3 6.5 17 31 NA 87 75 100 7.5 7.4 7.6
United_States 2008 48650 NA 4 9 4 6.5 6.4 6.6 17 31 NA 87 75 100 7.6 7.5 7.8
United_States 2007 48420 NA 5 1 4 6.6 6.5 6.7 18 32 NA 87 75 100 7.8 7.6 7.9
United_States 2006 47390 NA 5 3 4 6.7 6.6 6.8 18 33 NA 87 75 100 7.9 7.7 8.0

Visual exploratory analysis of Infant Mortality

final_df %>%
    mutate(GNP=as.numeric(`Gross national income per capita (PPP int. $)`)) %>% 
    dplyr::group_by(Country) %>% 
    ggplot(., aes(x=Year ,y=GNP))+ 
    geom_bar(aes(fill = Country), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=45,hjust=1))
## Warning: Removed 12 rows containing missing values (geom_bar).

final_df %>%
    dplyr::group_by(Country) %>% 
    ggplot(., aes(x=Year ,y=`Mean_Infant mortality_Rate/1000`))+ 
    geom_bar(aes(fill = Country), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=45,hjust=1))

final_df_display <- final_df %>% 
    arrange(Country,Year) %>% 
    select(.,Country,`Mean_Infant mortality_Rate/1000`)
  • Uganda seems to have reduced it’s infant mortality rate by over 50%

Numerical display of Infant Mortality

## Numerical display     
kable(final_df_display)
Country Mean_Infant mortality_Rate/1000
Uganda 92.5
Uganda 86.7
Uganda 80.5
Uganda 74.4
Uganda 68.6
Uganda 63.3
Uganda 58.9
Uganda 55.7
Uganda 52.7
Uganda 49.5
Uganda 46.5
Uganda 43.9
Uganda 41.0
Uganda 39.2
Ukraine 14.3
Ukraine 13.7
Ukraine 13.0
Ukraine 12.5
Ukraine 12.0
Ukraine 11.6
Ukraine 11.1
Ukraine 10.6
Ukraine 10.1
Ukraine 9.6
Ukraine 9.2
Ukraine 8.8
Ukraine 8.4
Ukraine 8.1
United_Kingdom 5.4
United_Kingdom 5.4
United_Kingdom 5.3
United_Kingdom 5.1
United_Kingdom 5.0
United_Kingdom 4.9
United_Kingdom 4.8
United_Kingdom 4.6
United_Kingdom 4.4
United_Kingdom 4.2
United_Kingdom 4.1
United_Kingdom 3.9
United_Kingdom 3.8
United_Kingdom 3.7
United_States 6.9
United_States 6.9
United_States 6.9
United_States 6.8
United_States 6.7
United_States 6.6
United_States 6.5
United_States 6.4
United_States 6.2
United_States 6.1
United_States 6.0
United_States 5.9
United_States 5.8
United_States 5.7

Display of infant Mortality with the 3 samples closer in scale

final_df %>%
    dplyr::filter(., Country %in% c("United_Kingdom", "Ukraine","United_States")) %>% 
    dplyr::group_by(Country) %>% 
    ggplot(., aes(x=Year ,y=`Mean_Infant mortality_Rate/1000`))+ 
    geom_bar(aes(fill = Country), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=45,hjust=1))

  • Ukraine has seen a reduction of around 40%
  • US and UK have also seen a reduction, however UK seems to have a more substantial reduction(30%) over the US reduction of (17%)

Look at Tuberculosis

final_df %>%
    dplyr::filter(., Country %in% c("Uganda", "Ukraine","United_Kingdom","United_States")) %>% 
    dplyr::group_by(Country) %>% 
    ggplot(., aes(x=Year ,y=`Mean_Incidence_Tuberculosis_100,000`))+ 
    geom_bar(aes(fill = Country), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=45,hjust=1))

  • Uganda seems to have seen some moderate improvement over the past 13 years in tuberculosis, but the rate is relatively high still
  • Ukraine seems to have has a similar improvement

Look at Tuberculosis in Heavily industrialized nations( UK, US)

final_df %>%
    dplyr::filter(., Country %in% c("United_Kingdom","United_States")) %>% 
    dplyr::group_by(Country) %>% 
    ggplot(., aes(x=Year ,y=`Mean_Incidence_Tuberculosis_100,000`))+ 
    geom_bar(aes(fill = Country), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=45,hjust=1))

  • England appears to have made some moderate improvement in the past several years
  • Tuberculosis in the United states is almost non-existant and it appears it is decreasing

Third Dataset- Gapminder

  • Burcu recommended the Gapminder data website
  • link to source
  • mix of WHO datasets and

War or food?

  • Are nations dealing with war more food insecure?
  • How do state expenditures differ on war versus food?

Load in third dataset

delete.na <- function(DF, n=0) {
  DF[rowSums(is.na(DF)) <= n,]
}


War<- read.xlsx("War.xlsx", header=TRUE,1,)
food_security <- read.xlsx("indicator food_consumption.xlsx", header=TRUE,1,)
Agricultural_expend<- read.xlsx("Agricultural_expenditure.xlsx", header=TRUE,1,)
military_expend <- read.xlsx("military_expenditure.xlsx", header=TRUE,1,)

dim(food_security)
## [1] 259  48
dim(military_expend)
## [1] 270  25
dim(Agricultural_expend)
## [1] 270  52
dim(War)
## [1] 192   4

DF Preparation

  • Eliminate rows with substantial NA’s
  • Convert columns to numeric
  • Rename columns to acoid duplication of Column name sfor eventual Join
  • Filter df’s for common shared year range of 1988-2007
military_expend <- delete.na(military_expend, 20)
Agricultural_expend <- delete.na(Agricultural_expend, 15)
food_security <- delete.na(food_security, 20)
War <- as_data_frame(lapply(War, function(x) if(is.numeric(x)) round(x,2) else x))
military_expend <- as_data_frame(lapply(military_expend, function(x) if(is.numeric(x)) round(x,2) else x))
War <- War[2:4]
colnames(War) <- c("Country", 'war_2002','war_2004')
colnames(military_expend) <-  c("Country",paste(1988:2011,'Military_expend'))
colnames(food_security) <- c("Country",paste (1961:2007,"food_security"))
colnames(Agricultural_expend) <- c("Country",paste(1961:2011,"Agricultural_expend"))
Agricultural_expend <- Agricultural_expend[c(1,29:48)]
food_security <- food_security[c(1,29:48)]
military_expend <- military_expend[c(1:21)]


merged_df_3<- inner_join(War,military_expend) %>%
    inner_join(.,food_security) %>% 
    inner_join(.,Agricultural_expend)  
## Joining, by = "Country"
## Warning: Column `Country` joining factors with different levels, coercing
## to character vector
## Joining, by = "Country"
## Warning: Column `Country` joining character vector and factor, coercing
## into character vector
## Joining, by = "Country"
## Warning: Column `Country` joining character vector and factor, coercing
## into character vector
dim(merged_df_3)
## [1] 83 63

Exploratory Analysis of Death from wars data

  • explore war death df
merged_df_3$war_2002-merged_df_3$war_2004
##  [1]  14.90   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
## [11]   0.00 154.58   0.00   0.01  -0.20   0.00   0.00   0.34  52.60   0.00
## [21]   8.06  88.66  -0.02   0.01   0.00   0.00   0.00   0.00   0.00   0.00
## [31]   0.00   0.00   0.00   0.00   0.02   2.71   0.12   0.00  -0.02   0.00
## [41]   0.00   4.02   1.41   0.00   0.00   0.00   0.00   0.00   0.00   0.00
## [51]   0.00   0.00   0.18 -19.12  -0.02   0.00   0.01   0.00   0.32   0.00
## [61]   0.03   1.55   0.00   0.88  -0.75   9.93   0.00  -0.44  -0.07 -44.86
## [71]   0.00   0.00   0.00   0.00   0.00   0.16   7.46   0.00  -0.07  -0.33
## [81]   0.00   0.00  33.29
merged_df_3[1:3] %>% 
    mutate(.,average_deaths=(war_2002+war_2004)/2) %>% 
    arrange(.,average_deaths) %>%  
    kable(.)
Country war_2002 war_2004 average_deaths
Argentina 0.00 0.00 0.000
Australia 0.00 0.00 0.000
Austria 0.00 0.00 0.000
Belgium 0.00 0.00 0.000
Benin 0.00 0.00 0.000
Bolivia 0.00 0.00 0.000
Botswana 0.00 0.00 0.000
Brazil 0.00 0.00 0.000
Brunei 0.00 0.00 0.000
Burkina Faso 0.00 0.00 0.000
Cameroon 0.00 0.00 0.000
Chile 0.00 0.00 0.000
China 0.00 0.00 0.000
Cuba 0.00 0.00 0.000
Fiji 0.00 0.00 0.000
Finland 0.00 0.00 0.000
France 0.00 0.00 0.000
Gabon 0.00 0.00 0.000
Gambia 0.00 0.00 0.000
Germany 0.00 0.00 0.000
Ghana 0.00 0.00 0.000
Guyana 0.00 0.00 0.000
Honduras 0.00 0.00 0.000
Ireland 0.00 0.00 0.000
Japan 0.00 0.00 0.000
Lesotho 0.00 0.00 0.000
Madagascar 0.00 0.00 0.000
Malaysia 0.00 0.00 0.000
Mali 0.00 0.00 0.000
Malta 0.00 0.00 0.000
Mauritius 0.00 0.00 0.000
Mexico 0.00 0.00 0.000
New Zealand 0.00 0.00 0.000
Norway 0.00 0.00 0.000
Paraguay 0.00 0.00 0.000
Portugal 0.00 0.00 0.000
Swaziland 0.00 0.00 0.000
Sweden 0.00 0.00 0.000
Togo 0.00 0.00 0.000
Tunisia 0.00 0.00 0.000
United Arab Emirates 0.00 0.00 0.000
Venezuela 0.00 0.00 0.000
Canada 0.01 0.00 0.005
Egypt 0.01 0.00 0.005
Denmark 0.00 0.02 0.010
Hungary 0.01 0.01 0.010
Italy 0.00 0.02 0.010
Netherlands 0.00 0.02 0.010
South Africa 0.01 0.01 0.010
Peru 0.03 0.00 0.015
United Kingdom 0.00 0.07 0.035
Zambia 0.04 0.04 0.040
Niger 0.06 0.05 0.055
Iran 0.12 0.00 0.060
Jordan 0.07 0.07 0.070
Malawi 0.07 0.07 0.070
Turkey 0.16 0.00 0.080
Mauritania 0.09 0.09 0.090
United States 0.01 0.34 0.175
Spain 0.00 0.44 0.220
Saudi Arabia 0.03 0.78 0.405
India 0.53 0.51 0.520
Kuwait 1.41 0.00 0.705
Thailand 0.99 0.99 0.990
Chad 2.03 2.23 2.130
Indonesia 3.98 1.27 2.625
Philippines 4.10 2.55 3.325
Kenya 5.47 1.45 3.460
Pakistan 4.30 3.98 4.140
Sri Lanka 4.88 4.95 4.915
Sierra Leone 9.93 0.00 4.965
Rwanda 9.04 8.16 8.600
Myanmar 9.52 9.34 9.430
Algeria 18.31 3.41 10.860
Nepal 6.25 25.37 15.810
Zimbabwe 33.80 0.51 17.155
Colombia 20.33 19.99 20.160
Congo, Rep. 52.60 0.00 26.300
Cote d’Ivoire 32.25 24.19 28.220
Uganda 43.92 36.46 40.190
Sudan 57.91 102.77 80.340
Congo, Dem. Rep. 130.20 41.54 85.870
Burundi 193.64 39.06 116.350

Create bins of death from wars to categorize Nations

  • Add average deaths from war column
  • Categorize deaths from war by levels
    • no deaths, low_deaths, moderate_deaths, high_deaths
food_war <- merged_df_3 %>% 
    gather(Wars,Values,4:63) %>% 
    separate(.,Wars,into= c("Year", "Type"),sep=" ") %>% 
    spread(Type,Values) %>% 
    mutate(.,average_deaths=(war_2002+war_2004)/2) %>% 
    select(-c(war_2002,war_2004)) %>% 
    mutate(Deaths_from_war=cut(average_deaths, breaks=c(-1,0,1,10,Inf), labels= c("no_deaths", "low_deaths", "moderate_deaths", "high_deaths")))
   
    

kable(food_war[1:50,])
Country Year Agricultural_expend food_security Military_expend average_deaths Deaths_from_war
Algeria 1988 12.167656 2795.00 1.75 10.86 high_deaths
Algeria 1989 13.039015 2862.84 1.54 10.86 high_deaths
Algeria 1990 11.358267 2856.21 1.46 10.86 high_deaths
Algeria 1991 10.167058 2835.47 1.23 10.86 high_deaths
Algeria 1992 12.126846 2972.77 2.19 10.86 high_deaths
Algeria 1993 12.097073 2971.61 2.56 10.86 high_deaths
Algeria 1994 10.058437 2865.20 3.14 10.86 high_deaths
Algeria 1995 10.497818 2891.30 2.95 10.86 high_deaths
Algeria 1996 11.766699 2893.72 3.09 10.86 high_deaths
Algeria 1997 9.482320 2844.07 3.63 10.86 high_deaths
Algeria 1998 12.533281 2904.07 3.96 10.86 high_deaths
Algeria 1999 12.201079 2957.91 3.77 10.86 high_deaths
Algeria 2000 8.879884 2928.84 3.44 10.86 high_deaths
Algeria 2001 10.407560 3003.63 3.80 10.86 high_deaths
Algeria 2002 10.003598 3034.33 3.67 10.86 high_deaths
Algeria 2003 10.491237 3073.26 3.25 10.86 high_deaths
Algeria 2004 10.188493 3090.13 3.30 10.86 high_deaths
Algeria 2005 8.221657 3059.24 2.85 10.86 high_deaths
Algeria 2006 7.988789 3101.20 2.64 10.86 high_deaths
Algeria 2007 8.025346 3153.38 2.90 10.86 high_deaths
Argentina 1988 8.977868 3028.90 1.98 0.00 no_deaths
Argentina 1989 9.616065 2980.35 1.78 0.00 no_deaths
Argentina 1990 8.123676 2924.80 1.37 0.00 no_deaths
Argentina 1991 6.716492 3024.96 1.42 0.00 no_deaths
Argentina 1992 5.990787 3087.95 1.34 0.00 no_deaths
Argentina 1993 5.492025 3123.27 1.34 0.00 no_deaths
Argentina 1994 5.440196 3162.52 1.46 0.00 no_deaths
Argentina 1995 5.698112 3168.17 1.47 0.00 no_deaths
Argentina 1996 5.997435 3162.69 1.24 0.00 no_deaths
Argentina 1997 5.599945 3135.69 1.14 0.00 no_deaths
Argentina 1998 5.617612 3166.62 1.14 0.00 no_deaths
Argentina 1999 4.735356 3266.75 1.22 0.00 no_deaths
Argentina 2000 4.971221 3271.70 1.15 0.00 no_deaths
Argentina 2001 4.814450 3183.34 1.18 0.00 no_deaths
Argentina 2002 10.683021 2965.90 1.09 0.00 no_deaths
Argentina 2003 10.986832 3059.74 1.06 0.00 no_deaths
Argentina 2004 10.412916 3029.91 0.96 0.00 no_deaths
Argentina 2005 9.401029 3091.79 0.93 0.00 no_deaths
Argentina 2006 8.392957 2969.12 0.86 0.00 no_deaths
Argentina 2007 9.394875 2940.98 0.88 0.00 no_deaths
Australia 1988 5.208421 3121.54 2.28 0.00 no_deaths
Australia 1989 5.364690 3141.58 2.15 0.00 no_deaths
Australia 1990 4.923955 3177.79 2.10 0.00 no_deaths
Australia 1991 3.631461 3109.40 2.15 0.00 no_deaths
Australia 1992 3.520368 3116.99 2.19 0.00 no_deaths
Australia 1993 3.710071 3044.14 2.21 0.00 no_deaths
Australia 1994 3.799483 3038.66 2.17 0.00 no_deaths
Australia 1995 3.416733 3081.55 2.07 0.00 no_deaths
Australia 1996 3.819477 3056.62 1.97 0.00 no_deaths
Australia 1997 3.666767 3091.38 1.91 0.00 no_deaths

Summary Explorations

# 
food_war <-food_war %>%
    dplyr::group_by(Deaths_from_war,Year)
agricultural_spending_by_year <- food_war %>%
    dplyr::summarize(agri_exp_mean=mean(Agricultural_expend, na.rm = TRUE))
#food_war %>%
   # dplyr::summarize(mean(Military_expend, na.rm = TRUE))
food_security_by_year <- food_war %>%
    dplyr::summarize(daily_calories=mean(food_security, na.rm = TRUE))

food_war <-food_war %>% 
     dplyr::group_by(Deaths_from_war)
# food_war %>% 
#     dplyr::summarize(mean(Agricultural_expend, na.rm = TRUE))
# food_war %>% 
#     dplyr::summarize(mean(Military_expend, na.rm = TRUE))
# food_war %>% 
#     dplyr::summarize(mean(food_security, na.rm = TRUE))
# 
# food_war    

Observations

  • My categories aren’t of even sizes so my clustering could effect my observations and lead to large impact from outliers
  • I don’t have an overall GDP per country, this could make a big difference in relative % GDP expenditures

+Tentative Observations + Areas with the most death from wars seem to spend more of their GDP on agricultural expenditures + Areas with the most deaths don’t spend much on military expenditures, and low death areas actually spend more than high death areas + High and moderate death areas are significantly more food insecure. Calorie consumption is about 30% lower in these areas

Data Visualization

food_security_by_year %>% 
    ggplot(.,aes(x=Year,y=daily_calories))+
    geom_bar(aes(fill = Deaths_from_war), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=45,hjust=1))

  • Caloric consumption seems rather Flat in all the categories over the last three decades
agricultural_spending_by_year %>% 
    ggplot(.,aes(x=Year,y=agri_exp_mean))+
    geom_bar(aes(fill = Deaths_from_war), position = "dodge", stat = "identity")+
    theme(axis.text.x=element_text(angle=45,hjust=1))

  • As a percent of GDP agricultural expenditure has decreased across all categories over last three decades
  • Area’s with higher casualties of war seem to have made large cuts to expenditures
food_war %>% 
    ggplot(.,aes(x=Agricultural_expend,y=food_security,shape=Deaths_from_war,color=Deaths_from_war))+
    geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).

food_war %>% 
    filter(.,Military_expend <20) %>% 
    ggplot(.,aes(x=Military_expend,y=food_security,shape=Deaths_from_war,color=Deaths_from_war))+
    geom_point()

The Scatter plots are very Interesting

  • A handful of outliers were removed for better visualization

Graph 1 Agricultural exp vs Food Security

  • Two Clear Clusters develop in the first graph
    • Areas with high death rates seem to be the least food secure and silumtaneously spend the most on agricultural subsidy
    • Areas with low death rates spends the least on food subsidy, and are the most food secure

Graph 2 Military exp vs Food Security

  • Areas with low death rates seem to spend the most on the Military