# Load EDHS 2011
library(gridExtra)
library(maps)
library(ggplot2)
library(viridis)
library(rvest)
library(foreign)
library(haven)
library(survival)
library(car)
library(survey)
library(muhaz)
library(eha)
library(survminer)
library(car)
library(rgdal)
library(ggplot2)
library(sf)
library(dplyr)
library(lme4)
library(lmerTest)
require(MASS)
library(pROC)
pdhs_2016<-read.dta("~/Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etpr70dt/ETPR70FL.DTA", convert.factors = TRUE)

#wdhs_2016<-read.dta( "~//Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etir70dt/ETIR70FL.DTA", convert.factors = TRUE)

kdhs_2016<-read.dta( "~//Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etkr70dt/ETKR70FL.DTA", convert.factors = TRUE)


#cdhs_2016<-read.dta( "~//Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etcr70dt/ETCR70FL.DTA", convert.factors = TRUE)
 
attach(kdhs_2016)
kdhs_2016$hv001<-kdhs_2016$v001
kdhs_2016$hv002<-kdhs_2016$v002
kdhs_2016$hvidx<-kdhs_2016$b16
attach(pdhs_2016)
pdhs_2016 <- pdhs_2016[order(hv001, hv002,hvidx),]
df2 = left_join(pdhs_2016, kdhs_2016, by.x=c("hv001", "hv001", "hvidx"), by.y=c("hv001", "hv001", "hvidx"), no.dups = TRUE)

pedhs16<-df2


ethsp4<-readOGR(dsn="~//Google Drive/fikre/ethiopia_dhs/sdr_subnational_data_2017-03-28/shps",layer = "sdr_subnational_data_dhs_2016")
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/fikrewoldbitew/Google Drive/fikre/ethiopia_dhs/sdr_subnational_data_2017-03-28/shps", layer: "sdr_subnational_data_dhs_2016"
## with 11 features
## It has 55 fields

Used data from the Ethiopan Demographic and Health Surveys (EDHS 2000 - 2016).

pedhs16<-pedhs16[pedhs16$hv103=='yes'&pedhs16$hc70<=9990,]
pedhs16<-pedhs16[!is.na(pedhs16$hc70),]
pedhs16$stunt<-ifelse(pedhs16$hc70/100<=-2&pedhs16$hc70/100!=-2,1,0)
pedhs16$stunting<-ifelse(pedhs16$hc70/100<=-2&pedhs16$hc70/100!=-2,'yes','no')
pedhs16$stunt.r<-as.factor(pedhs16$stunt)
  
pedhs16$wasted<-ifelse(pedhs16$hc72/100<=-2&pedhs16$hc72/100!=-2,1,0)
pedhs16$wasting<-ifelse(pedhs16$hc72/100<=-2&pedhs16$hc72/100!=-2,'yes','no')

pedhs16$underweight<-ifelse(pedhs16$hc71/100<=-2&pedhs16$hc71/100!=-2,1,0)
pedhs16$combined<-ifelse(pedhs16$wasted==1|pedhs16$stunt==1|pedhs16$underweight==1, 1, 0)
pedhs16$tot_ch<-1 

pedhs16$age<-pedhs16$hc1
pedhs16$res<-pedhs16$hv025
pedhs16$sex<-pedhs16$hc27
pedhs16$time_water<-pedhs16$hv204


#pedhs16$any_anemia<-Recode(pedhs16$hc57, recodes="'severe' ='Yes';
#'moderate'='Yes'; 'mild'='Yes';'not anemic' = 'No'; else=NA", as.factor = F)

#pedhs16$toilet<-pedhs16$hv024
pedhs16$watersource<-Recode(pedhs16$hv201, recodes="'piped water' ='Improved'; 'piped into dwelling'='Improved'; 'piped to yard/plot'='Improved'; 'piped to neighbor'='Improved'; 'public tap/standpipe'='Improved';
'tube well water'='Unimproved'; 'tube well or borehole'= 'Improved';                        
'dug well (open/protected)'='Unimproved';
'surface from spring'='Unimproved';
'protected well' ='Improved';                                    
'unprotected well' ='Unimproved';
'surface from spring' ='Unimproved';                                  
'protected spring' ='Improved';  
'unprotected spring'='Unimproved';  
'river/dam/lake/ponds/stream/canal/irrigation channel'='Unimproved';  
'rainwater'='Improved';                                           
'tanker truck'='Unimproved';  
'cart with small tank'='Unimproved';   
'bottled water'='Improved';
'other' = 'Unimproved' ; else=NA", as.factor = F)

pedhs16$toilet_facility<-Recode(pedhs16$hv205,
recodes='"flush toilet" =   "Improved";
"flush to piped sewer system"   =   "Improved";
"flush to septic tank"  =   "Improved";
"flush to pit latrine"  =   "Improved";
"flush to somewhere else"   =   "Improved";
"pit toilet latrine" =  "Improved";
"ventilated improved pit latrine (vip)" =   "Improved";
"pit latrine with slab" =   "Improved";
"pit latrine without slab/open pit" =   "Unimproved";
"no facility"   =   "Unimproved";
"no facility/bush/field"=   "Unimproved";
"composting toilet" =   "Unimproved";
"bucket toilet" =   "Unimproved";
"hanging toilet/latrine"=   "Unimproved";
"other" =   "Unimproved"; else="Improved"', as.factor = F)

#Education 
pedhs16$meduc<-Recode(pedhs16$hc61, recodes="'no education' ='0Noeducation'; 'primary'='1Primary'; 'secondary'='2Secodary/Higher'; 'technical/ vocational'='2Secodary/Higher'; 'higher'='2Secodary/Higher';else=NA", as.factor=T)

#pedhs16$pno_educ<-Recode(pedhs16$v106, recodes="'no education'=0;else=1", as.factor=T)

#Residence
pedhs16$res<-pedhs16$hv025
pedhs16$prec_b_interval<-pedhs16$hc63 
pedhs16$prec_b_order<-pedhs16$hc64 

pedhs16$mother_age<-Recode(pedhs16$v013,recodes="'15-19'='15-24'; '20-24'='15-24'; '25-29'='25-34';
'30-34'='25-34';  '35-39'='35-49'; '40-44'='35-49';'45-49'='35-49'; else=NA", as.factor=T)


pedhs16$Age<-Recode(pedhs16$hc1,recodes="0:29='< 30 months'; 30:60='> 30 months';else=NA", as.factor=T)
pedhs16$b_order<-Recode(pedhs16$hc64, recodes="1='1st-2nd'; 2='1st-2nd'; 3:14='3rd or later';else=NA", as.factor=T)
pedhs16$b_interval<-Recode(pedhs16$hc63, recodes="8:23='< 2 years'; 24:48='2-4 yrs'; 48:215='> 4 yrs';else=NA", as.factor=T)
pedhs16$sex<-pedhs16$hc27



#write.csv(pedhs16, '~//Google Drive/pedhs10.csv')

#time_to_water_source
pedhs16$time_water<-pedhs16$hv204

#wealth Index
pedhs16$mar_status<-Recode(pedhs16$v501, recodes="'married'='1in union';
'living with partner'='1in union';
'widowed'='1Not in union';
'divorced'='1Not in union';
'no longer living together/separated'='1Not in union';
'never in union '='1Not in union'; else=NA", as.factor=T)

#Wealth Index
pedhs16$wealth_index<-Recode(pedhs16$hv270, recodes="'poorest'='1Poor'; 'poorer'='1Poor';'middle'='2middle'; 'richer'='3rich';  'richest'='3rich'; else=NA", as.factor=T)

#anemia-level
pedhs16$anemia<-Recode(pedhs16$hc57, recodes="'severe'='Yes'; 'moderate'='Yes'; 'mild'='Yes'; 'not anemic'='No'; else=NA", as.factor=F)

#anemia-level
pedhs16$diahrea<-Recode(pedhs16$h11, recodes='"no"="1No"; "yes, last two weeks"="2Yes";
"don\'t know"= "1No";  else=NA', as.factor=F)
 

#anemia-level
pedhs16$anemia<-Recode(pedhs16$hc57, recodes="'severe'='Yes'; 'moderate'='Yes'; 'mild'='Yes'; 'not anemic'='No'; else=NA", as.factor=F)

#Children Ever Born 
pedhs16$parity<-Recode(pedhs16$v201, recodes="0='0'; 1:2='1-2'; 3:4='3-4'; 5:18='5+'; else=NA", as.factor=T)

pedhs16$fam_size<-Recode(pedhs16$hv009, recodes="0:3='1_Small(<4)'; 4:8='2_Medium (4-8) members) '; 9:19='3_>8 members';else=NA", as.factor=T)


pedhs16$family_size<-pedhs16$hv009
 
  
pedhs16$no_children<-Recode(pedhs16$hv014, recodes="1='1'; 2='2'; 3:10='3+'; else=NA", as.factor=T)

pedhs16$ceb<-pedhs16$v201

  
#Age at first Birth
pedhs16$age_fb<-Recode(pedhs16$v212, recodes="10:19='<20 years'; 20:49='20+years'; else=NA", as.factor=T)
#BMI 
pedhs16$bmi<-ifelse(pedhs16$v445==9999 | pedhs16$v445==9998 , NA, pedhs16$v445/100)

pedhs16$bmi_rec<-Recode(pedhs16$bmi, recodes="0:18.49='Underweight'; 18.5:24.99='Normal';
                       25:49='Overweight'; else=NA", as.factor=T)

#Occupation
pedhs16$occup<-Recode(pedhs16$v717, recodes="'not working'='1Unemployed'; 
                      'professional/technical/managerial'='2Non-manual/Professional';
                      'clerical' = '2Non-manual/Professional';
                               'sales' = '2Non-manual/Professional';
                      'agricultural - employee'='3Agricultural/Manual(skilled/unskilled)';
                      'agricultural - self employed'='3Agricultural/Manual(skilled/unskilled)'; 
                      'skilled manual'='3Agricultural/Manual(skilled/unskilled)'; 
                      'unskilled manual'='3Agricultural/Manual(skilled/unskilled)'; 
                      'others'='3Agricultural/Manual(skilled/unskilled)'; 
                      else=NA", as.factor=T)

pedhs16$breastfed_1sthr<-Recode(pedhs16$m34, recodes="0='1Yes'; 100='1Yes'; 101:223='2No'; else=NA", as.factor=T)

#Partner"s/Husband's education level 
pedhs16$pl_delivery<-Recode(pedhs16$m15, recodes='"home"="0Home"; "other home"="0Home"; 
 "respondent\'s home"="0Home"; else="1Health facility"', as.factor=F)

pedhs16$region<-Recode(pedhs16$hv024, recodes="'tigray' ='Tigray'; 'afar'='Afar'; 'amhara'='Amhara'; 'oromia'='Oromia'; 'somali'='Somali';
'benishangul'='Benishangul';'dire dawa'='Dire Dawa';'addis adaba'='Addis Adaba';'snnpr'='SNNPR';'gambela'='Gambela';
'harari'='Harari';else=NA", as.factor=F)
write.csv(pedhs16, '~//Google Drive/pedhs10.csv')
pedhs16<-merge(pedhs16, ethsp4@data[,-37], by.x="region", by.y="REGNAME" ,all.x=T)
## identify the working variables and the all the edhs datasets
pedhs16$wt<-pedhs16$hv005/1000000

options(survey.lonely.psu = "adjust")
pedhs16$yrstratum<-paste(pedhs16$SVYYEAR, pedhs16$hv022, sep="-")

pedhs16$yrpsu<-paste(pedhs16$SVYYEAR, pedhs16$hv021, sep="-")

#table(pedhs16$yrstratum)

des<-svydesign(ids=~yrpsu, strata=~yrstratum, weights=~wt, data = pedhs16[is.na(pedhs16$wt)==F,], nest=T)
reg.est.s<-svyby(~stunt,~region+SVYYEAR,FUN = svymean, na.rm=T, design =des)
reg.est.stunt<-reshape(reg.est.s, idvar = "region", timevar = "SVYYEAR", direction = "wide")
reg.est.stunt
##                       region stunt.2016    se.2016
## Addis Adaba.2016 Addis Adaba  0.1459285 0.01925328
## Afar.2016               Afar  0.4110693 0.02464632
## Amhara.2016           Amhara  0.4632034 0.01941072
## Benishangul.2016 Benishangul  0.4274297 0.02203513
## Dire Dawa.2016     Dire Dawa  0.4019738 0.02895576
## Gambela.2016         Gambela  0.2352716 0.01826578
## Harari.2016           Harari  0.3195869 0.02520432
## Oromia.2016           Oromia  0.3647153 0.01700376
## SNNPR.2016             SNNPR  0.3863328 0.01982741
## Somali.2016           Somali  0.2743531 0.01738827
## Tigray.2016           Tigray  0.3927101 0.01806748
reg.est.w<-svyby(~wasted,~region+SVYYEAR,FUN = svymean, na.rm=T, design =des)
reg.est.wasted<-reshape(reg.est.w, idvar = "region", timevar = "SVYYEAR", direction = "wide")
reg.est.wasted 
##                       region wasted.2016     se.2016
## Addis Adaba.2016 Addis Adaba  0.03472510 0.008311278
## Afar.2016               Afar  0.17760728 0.019688532
## Amhara.2016           Amhara  0.09718662 0.009674930
## Benishangul.2016 Benishangul  0.11151593 0.017453261
## Dire Dawa.2016     Dire Dawa  0.09888908 0.017358085
## Gambela.2016         Gambela  0.14006701 0.019506216
## Harari.2016           Harari  0.10744200 0.017425028
## Oromia.2016           Oromia  0.10434400 0.008757773
## SNNPR.2016             SNNPR  0.05966669 0.007108052
## Somali.2016           Somali  0.22522663 0.025358474
## Tigray.2016           Tigray  0.11013005 0.010672763
reg.est.u<-svyby(~underweight,~region+SVYYEAR,FUN = svymean, na.rm=T, design =des)
reg.est.uwgt<-reshape(reg.est.u, idvar = "region", timevar = "SVYYEAR", direction = "wide")
reg.est.uwgt
##                       region underweight.2016    se.2016
## Addis Adaba.2016 Addis Adaba       0.05034569 0.01539083
## Afar.2016               Afar       0.35547650 0.02909223
## Amhara.2016           Amhara       0.28210266 0.01696283
## Benishangul.2016 Benishangul       0.34069793 0.03049209
## Dire Dawa.2016     Dire Dawa       0.25680589 0.02431406
## Gambela.2016         Gambela       0.19200296 0.01981707
## Harari.2016           Harari       0.20501442 0.02110583
## Oromia.2016           Oromia       0.22238856 0.01307474
## SNNPR.2016             SNNPR       0.20644331 0.01691886
## Somali.2016           Somali       0.28968000 0.02088827
## Tigray.2016           Tigray       0.23039544 0.01463603
reg.est.c<-svyby(~combined,~region+SVYYEAR,FUN = svymean, na.rm=T, design =des)
reg.est.comb<-reshape(reg.est.c, idvar = "region", timevar = "SVYYEAR", direction = "wide")
reg.est.comb
##                       region combined.2016    se.2016
## Addis Adaba.2016 Addis Adaba     0.1833118 0.02340984
## Afar.2016               Afar     0.5434065 0.02718943
## Amhara.2016           Amhara     0.5402633 0.01781279
## Benishangul.2016 Benishangul     0.5147896 0.02794263
## Dire Dawa.2016     Dire Dawa     0.4839289 0.03138024
## Gambela.2016         Gambela     0.3563432 0.02476112
## Harari.2016           Harari     0.4136740 0.02949159
## Oromia.2016           Oromia     0.4546517 0.01840233
## SNNPR.2016             SNNPR     0.4367614 0.02022106
## Somali.2016           Somali     0.4662297 0.02492501
## Tigray.2016           Tigray     0.4813284 0.01854586
ethsp.w<-ethsp4
ethsp.u<-ethsp4
ethsp.c<-ethsp4

m.geo.stunt<-merge(ethsp4@data, reg.est.stunt, by.x="REGNAME", by.y="region" , sort=F)
m.geo.wasted<-merge(ethsp4@data, reg.est.wasted, by.x="REGNAME", by.y="region" , sort=F)
m.geo.uwgt<-merge(ethsp4@data, reg.est.uwgt, by.x="REGNAME", by.y="region" , sort=F)
m.geo.combined<-merge(ethsp4@data, reg.est.comb, by.x="REGNAME", by.y="region" , sort=F)

ethsp4@data<-m.geo.stunt
ethsp.w@data<-m.geo.wasted
ethsp.u@data<-m.geo.uwgt
ethsp.c@data<-m.geo.combined


library(sp)
brk.s<-quantile(m.geo.stunt$stunt.2016, probs = seq(0,1,length.out = 6), na.rm = T)
brk.w<-quantile(m.geo.wasted$wasted.2016, probs = seq(0,1,length.out = 6), na.rm = T)
brk.u<-quantile(m.geo.uwgt$underweight.2016, probs = seq(0,1,length.out = 6), na.rm = T)
brk.c<-quantile(m.geo.combined$combined.2016, probs = seq(0,1,length.out = 6), na.rm = T)




eth.geo.stunt<-st_as_sf(ethsp4)
eth.geo.wasted<-st_as_sf(ethsp.w)
eth.geo.uwgt<-st_as_sf(ethsp.u)
eth.geo.comb<-st_as_sf(ethsp.c)

eth.geo.stunt$stunt_q_16<-cut(eth.geo.stunt$stunt.2016, brk.s, include.lowest = T)
eth.geo.wasted$wasted_q_16<-cut(eth.geo.wasted$wasted.2016, brk.w, include.lowest = T)
eth.geo.uwgt$uwgt_q_16<-cut(eth.geo.uwgt$underweight.2016, brk.u, include.lowest = T)
eth.geo.comb$comb_q_16<-cut(eth.geo.comb$combined.2016, brk.c, include.lowest = T)



centroids<-coordinates(ethsp4)
centroids<-coordinates(ethsp.w)
centroids<-coordinates(ethsp.u)
centroids<-coordinates(ethsp.c)


eth.geo.stunt<-cbind(eth.geo.stunt, centroids)
eth.geo.wasted<-cbind(eth.geo.wasted, centroids)
eth.geo.uwgt<-cbind(eth.geo.uwgt, centroids)
eth.geo.comb<-cbind(eth.geo.comb, centroids)



p1<-eth.geo.stunt%>%
  ggplot()+geom_sf(aes(fill=stunt_q_16))+ scale_colour_brewer(palette = "YlOrRd" )+scale_fill_brewer(palette = "YlOrRd", na.value="grey")+guides(fill=guide_legend(title="U5-Stunting"))+geom_text(aes(label = DHSREGEN, x = X1, y = X2), cex=5)+ylab(label = "")+xlab(label = "")+ggtitle(label = "Spatial Distribution of Stunting by Regions of Ethiopia, EDHS 2016", subtitle = "")+theme(legend.text = element_text(size =10))+theme(plot.title = element_text(color="Blue", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"), strip.text.x = element_text(size = 10))+theme(plot.title = element_text(size=10))


p1

p2<-eth.geo.wasted%>%
  ggplot()+geom_sf(aes(fill=wasted_q_16))+ scale_colour_brewer(palette = "YlOrRd" )+scale_fill_brewer(palette = "YlOrRd", na.value="grey")+guides(fill=guide_legend(title="U5-Wasting"))+geom_text(aes(label = DHSREGEN, x = X1, y = X2), cex=5)+ylab(label = "")+xlab(label = "")+ggtitle(label = "Spatial Distribution of Wasting by Regions of Ethiopia, EDHS 2016", subtitle = "")+theme(legend.text = element_text(size =10))+theme(plot.title = element_text(color="Blue", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"), strip.text.x = element_text(size = 10))+theme(plot.title = element_text(size=10))


p2

p3<-eth.geo.uwgt%>%
  ggplot()+geom_sf(aes(fill=uwgt_q_16))+ scale_colour_brewer(palette = "YlOrRd" )+scale_fill_brewer(palette = "YlOrRd", na.value="grey")+guides(fill=guide_legend(title="U5-Underweight"))+geom_text(aes(label = DHSREGEN, x = X1, y = X2), cex=5)+ylab(label = "")+xlab(label = "")+ggtitle(label = "Spatial Distribution of Underweight by Regions of Ethiopia, EDHS 2016", subtitle = "")+theme(legend.text = element_text(size =10))+theme(plot.title = element_text(color="Blue", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"), strip.text.x = element_text(size = 10))+theme(plot.title = element_text(size=10))
p3

p4<-eth.geo.comb%>%
  ggplot()+geom_sf(aes(fill=comb_q_16))+ scale_colour_brewer(palette = "YlOrRd" )+scale_fill_brewer(palette = "YlOrRd", na.value="grey")+guides(fill=guide_legend(title="Combined Risk"))+geom_text(aes(label = DHSREGEN, x = X1, y = X2), cex=5)+ylab(label = "")+xlab(label = "")+ggtitle(label = "Spatial Distribution of Combined Malnutrition Risk by Regions of Ethiopia, EDHS 2016", subtitle = "")+theme(legend.text = element_text(size =10))+theme(plot.title = element_text(color="Blue", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"), strip.text.x = element_text(size = 10))+theme(plot.title = element_text(size=10))
p4

#ggsave(plot=p1,filename = "~/Google Drive/PhD_Courses/Spring 2019/PublicHealth/FinalProject/Under_5_Mortality.png", width = 18, height = 12,units = "in", dpi=100)
library(leaflet)
library(sf)
library(sp)
library(raster)
library(rgeos)

popch = paste0("<strong>", eth.geo.stunt$DHSREGEN, "</strong>", "</br>", eth.geo.stunt$CNNUTSCHA2)

bins <- c(30,40, 50,60, 70,80, 90,100, 110, 130)
bin <- c(3, 5, 10, 15, 20, 25,30,40, 50,60,70, 100, Inf)

pall <- colorBin("YlOrRd", domain =  eth.geo.stunt$CNNUTSCHA2, bins = bins)

mypal <- colorQuantile(palette = "YlOrRd", domain =  eth.geo.stunt$CNNUTSCHA2, n = 6, reverse = FALSE)


eth.geo.stunt$pch_mort<-eth.geo.stunt$CNNUTSCHA2*100


U5S <- leaflet(eth.geo.stunt) %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
#setView(lat = 9, lng = 41, zoom = 6.2) %>%
  setView(lat = 9.292, lng =  40.71 , zoom = 6.2) %>%
 
   addProviderTiles("CartoDB.Positron") %>%
   addPolygons(data = eth.geo.stunt, fillColor = ~mypal(eth.geo.stunt$CNNUTSCHA2), 
   fillOpacity = 1,
   weight = 1,
   color = "white",
   popup = popch)%>%
  addLabelOnlyMarkers(data = eth.geo.stunt, lng = ~X1, lat = ~X2, label = ~DHSREGEN,
                    labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE)) %>%
  addLegend(pal = mypal, values = ~CNNUTSCHA2, title = "U5 Stunting",opacity = 0.5, position = "bottomright")

U5S
library(leaflet)
library(sf)
library(sp)
library(raster)
library(rgeos)

popch = paste0("<strong>", eth.geo.wasted$DHSREGEN, "</strong>", "</br>", eth.geo.wasted$CNNUTSCWH2)

bins <- c(30,40, 50,60, 70,80, 90,100, 110, 130)
bin <- c(3, 5, 10, 15, 20, 25,30,40, 50,60,70, 100, Inf)

pall <- colorBin("YlOrRd", domain =  eth.geo.wasted$CNNUTSCWH2, bins = bins)

mypal <- colorQuantile(palette = "YlOrRd", domain = eth.geo.wasted$CNNUTSCWH2, n = 6, reverse = FALSE)


eth.geo.wasted$pch_mort<-eth.geo.wasted$CNNUTSCWH2*100


U5W<- leaflet(eth.geo.wasted) %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
#setView(lat = 9, lng = 41, zoom = 6.2) %>%
  setView(lat = 9.292, lng =  40.71 , zoom = 6.2) %>%
 
   addProviderTiles("CartoDB.Positron") %>%
   addPolygons(data = eth.geo.wasted, fillColor = ~mypal(eth.geo.wasted$CNNUTSCWH2), 
   fillOpacity = 1,
   weight = 1,
   color = "white",
   popup = popch)%>%
  addLabelOnlyMarkers(data = eth.geo.wasted, lng = ~X1, lat = ~X2, label = ~DHSREGEN,
                    labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE)) %>%
  addLegend(pal = mypal, values = ~CNNUTSCWH2, title = "U5 Wasted",opacity = 0.5, position = "bottomright")

U5W
library(leaflet)
library(sf)
library(sp)
library(raster)
library(rgeos)

popch = paste0("<strong>", eth.geo.uwgt$DHSREGEN, "</strong>", "</br>", eth.geo.stunt$CNNUTSCWA2)

bins <- c(30,40, 50,60, 70,80, 90,100, 110, 130)
bin <- c(3, 5, 10, 15, 20, 25,30,40, 50,60,70, 100, Inf)

pall <- colorBin("YlOrRd", domain =  eth.geo.stunt$CNNUTSCWA2, bins = bins)

mypal <- colorQuantile(palette = "YlOrRd", domain = eth.geo.uwgt$CNNUTSCWA2, n = 6, reverse = FALSE)


eth.geo.uwgt$pch_mort<-eth.geo.uwgt$CNNUTSCWA2*100


U5U<- leaflet(eth.geo.uwgt) %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
#setView(lat = 9, lng = 41, zoom = 6.2) %>%
  setView(lat = 9.292, lng =  40.71 , zoom = 6.2) %>%
 
addProviderTiles("CartoDB.Positron") %>%
addPolygons(data = eth.geo.uwgt, fillColor = ~mypal(eth.geo.uwgt$CNNUTSCWA2), 
fillOpacity = 1,
weight = 1,
color = "white",
popup = popch)%>%
addLabelOnlyMarkers(data = eth.geo.uwgt, lng = ~X1, lat = ~X2, label = ~DHSREGEN,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE)) %>%
addLegend(pal = mypal, values = ~CNNUTSCWA2, title = "U5 Underweight",opacity = 0.5, position = "bottomright")

U5U
library(leaflet)
library(sf)
library(sp)
library(raster)
library(rgeos)


bins <- c(30,40, 50,60, 70,80, 90,100, 110, 130)
bin <- c(3, 5, 10, 15, 20, 25,30,40, 50,60,70, 100, Inf)

pall <- colorBin("YlOrRd", domain =  eth.geo.comb$combined.2016, bins = bins)

eth.geo.comb$combined<-eth.geo.comb$combined.2016*100
popch = paste0("<strong>", eth.geo.comb$DHSREGEN, "</strong>", "</br>", eth.geo.comb$combined)
mypal <- colorQuantile(palette = "YlOrRd", domain = eth.geo.comb$combined, n = 6, reverse = FALSE)


U5C<- leaflet(eth.geo.comb) %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
#setView(lat = 9, lng = 41, zoom = 6.2) %>%
  setView(lat = 9.292, lng =  40.71 , zoom = 6.2) %>%
 
addProviderTiles("CartoDB.Positron") %>%
addPolygons(data = eth.geo.comb, fillColor = ~mypal(eth.geo.comb$combined), 
fillOpacity = 1,
weight = 1,
color = "white",
popup = popch)%>%
addLabelOnlyMarkers(data = eth.geo.comb, lng = ~X1, lat = ~X2, label = ~DHSREGEN,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE)) %>%
addLegend(pal = mypal, values = ~combined, title = "U5 Combined Malnutrition",opacity = 0.5, position = "bottomright")

U5C

#Calculate percentage by background characterstics for all years

prop.table(cat<-svytable(~Age+SVYYEAR,design=des), margin = 2)*100
##              SVYYEAR
## Age               2016
##   < 30 months 50.35385
##   > 30 months 49.64615
prop.table(cat<-svytable(~age_fb+SVYYEAR,design=des), margin = 2)*100
##            SVYYEAR
## age_fb          2016
##   <20 years 64.63254
##   20+years  35.36746
prop.table(cat<-svytable(~ stunt+SVYYEAR,design=des), margin = 2)*100
##      SVYYEAR
## stunt    2016
##     0 61.6386
##     1 38.3614
prop.table(cat<-svytable(~ wasted+SVYYEAR,design=des), margin = 2)*100
##       SVYYEAR
## wasted      2016
##      0 90.186707
##      1  9.813293
prop.table(cat<-svytable(~ underweight+SVYYEAR,design=des), margin = 2)*100
##            SVYYEAR
## underweight     2016
##           0 76.67906
##           1 23.32094
prop.table(cat<-svytable(~ combined+SVYYEAR,design=des), margin = 2)*100
##         SVYYEAR
## combined     2016
##        0 53.40146
##        1 46.59854
prop.table(cat<-svytable(~sex+SVYYEAR,design=des), margin = 2)*100
##         SVYYEAR
## sex          2016
##   male   51.12845
##   female 48.87155
prop.table(cat<-svytable(~meduc+SVYYEAR,design=des), margin = 2)*100
##                   SVYYEAR
## meduc                   2016
##   0Noeducation     65.878324
##   1Primary         27.099239
##   2Secodary/Higher  7.022437
prop.table(cat<-svytable(~res+SVYYEAR,design=des), margin = 2)*100
##        SVYYEAR
## res         2016
##   urban 10.90268
##   rural 89.09732
prop.table(cat<-svytable(~watersource+SVYYEAR,design=des), margin = 2)*100
##             SVYYEAR
## watersource      2016
##   Improved   56.16005
##   Unimproved 43.83995
prop.table(cat<-svytable(~toilet_facility+SVYYEAR,design=des), margin = 2)*100
##                SVYYEAR
## toilet_facility      2016
##      Improved    9.140233
##      Unimproved 90.859767
prop.table(cat<-svytable(~b_order+SVYYEAR,design=des), margin = 2)*100
##               SVYYEAR
## b_order           2016
##   1st-2nd      34.6108
##   3rd or later 65.3892
prop.table(cat<-svytable(~b_interval+SVYYEAR,design=des), margin = 2)*100
##            SVYYEAR
## b_interval      2016
##   < 2 years 19.09225
##   > 4 yrs   25.08676
##   2-4 yrs   55.82099
prop.table(cat<-svytable(~wealth_index+SVYYEAR,design=des), margin = 2)*100
##             SVYYEAR
## wealth_index     2016
##      1Poor   46.31847
##      2middle 20.82901
##      3rich   32.85252
prop.table(cat<-svytable(~pl_delivery+SVYYEAR,design=des), margin = 2)*100
##                   SVYYEAR
## pl_delivery            2016
##   0Home            67.22107
##   1Health facility 32.77893
prop.table(cat<-svytable(~bmi_rec+SVYYEAR,design=des), margin = 2)*100
##              SVYYEAR
## bmi_rec            2016
##   Normal      74.249211
##   Overweight   6.140153
##   Underweight 19.610636
prop.table(cat<-svytable(~family_size+SVYYEAR,design=des), margin = 2)*100
##            SVYYEAR
## family_size        2016
##          2   0.87570691
##          3   8.68116119
##          4  16.24540971
##          5  17.23706530
##          6  17.80270857
##          7  14.73567613
##          8  11.41752444
##          9   7.13908670
##          10  2.88538122
##          11  1.32671733
##          12  0.86168118
##          13  0.31801751
##          14  0.17535449
##          15  0.18026642
##          16  0.08720861
##          17  0.01581034
##          19  0.01522393
prop.table(cat<-svytable(~no_children+SVYYEAR,design=des), margin = 2)*100
##            SVYYEAR
## no_children     2016
##          1  37.76292
##          2  44.88307
##          3+ 17.35401
prop.table(cat<-svytable(~diahrea+SVYYEAR,design=des), margin = 2)*100
##        SVYYEAR
## diahrea     2016
##    1No  87.98408
##    2Yes 12.01592
prop.table(cat<-svytable(~region+SVYYEAR,design=des), margin = 2)*100
##              SVYYEAR
## region              2016
##   Addis Adaba  2.0787686
##   Afar         0.9402692
##   Amhara      20.1131706
##   Benishangul  1.0223200
##   Dire Dawa    0.3866153
##   Gambela      0.2214537
##   Harari       0.1919811
##   Oromia      43.2798556
##   SNNPR       21.0838728
##   Somali       4.0201735
##   Tigray       6.6615196

Fit the GLM model for predicting U5 Nutritional Status

pedhs16_c<-subset(pedhs16,select=c("age","Age","res","sex", "stunt","stunt.r",  "b_interval", "stunting","wasted","underweight", "b_order", "combined", "meduc","wt", "wealth_index","ceb", "family_size", "mother_age", "occup","age_fb", "watersource","time_water", "wasting", "toilet_facility","region", "pl_delivery",  "diahrea", "bmi_rec","fam_size", "no_children", "age_fb", "v717", "m15","v501", "hv201", "hc1", "hv205", "hc61", "hv025", "v212", "hc63", "hc64", "v013", "hc27", "hc63", "hc64", "hv014", "hv024", "hv204", "hv270", "h11", "v201", "hv009", "v445"))

pedhs16_c<-na.omit(pedhs16_c)

library(caret)
library(randomForest) 
set.seed(22222)
intrain.s<-createDataPartition(y=pedhs16_c$stunt, p=.70, list=F)
train.s <- pedhs16_c[intrain.s,]
test.s<- pedhs16_c[-intrain.s,]

intrain.w<-createDataPartition(y=pedhs16_c$wasted, p=.60, list=F)
train.w <- pedhs16_c[intrain.w,]
test.w<- pedhs16_c[-intrain.w,]

intrain.u<-createDataPartition(y=pedhs16_c$underweight, p=.70, list=F)
train.u <- pedhs16_c[intrain.u,]
test.u<- pedhs16_c[-intrain.u,]

intrain.c<-createDataPartition(y=pedhs16_c$combined, p=.70, list=F)
train.c <- pedhs16_c[intrain.c,]
test.c<- pedhs16_c[-intrain.c,]

Stunting

Determinants of Stunting - Hierarchical and Logistic Regression M

library(lme4)
library(arm)
library(lmerTest)

fit.mix.s<-glmer(stunt~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+(1|region), train.s, weights=train.s$wt,family=binomial) 

display(fit.mix.s, detail=T)
## glmer(formula = stunt ~ Age + sex + age_fb + res + meduc + watersource + 
##     time_water + b_order + b_interval + wealth_index + toilet_facility + 
##     pl_delivery + bmi_rec + family_size + no_children + diahrea + 
##     (1 | region), data = train.s, family = binomial, weights = train.s$wt)
##                             coef.est coef.se z value Pr(>|z|)
## (Intercept)                  0.11     0.27    0.41    0.68   
## Age> 30 months               0.63     0.06    9.87    0.00   
## sexfemale                   -0.16     0.06   -2.69    0.01   
## age_fb20+years              -0.06     0.07   -0.92    0.36   
## resrural                    -0.19     0.16   -1.25    0.21   
## meduc1Primary               -0.21     0.08   -2.68    0.01   
## meduc2Secodary/Higher       -0.60     0.21   -2.90    0.00   
## watersourceUnimproved       -0.07     0.07   -1.15    0.25   
## time_water                   0.00     0.00   -2.21    0.03   
## b_order3rd or later          0.02     0.09    0.21    0.84   
## b_interval> 4 yrs           -0.43     0.11   -4.12    0.00   
## b_interval2-4 yrs           -0.38     0.08   -4.67    0.00   
## wealth_index2middle         -0.29     0.08   -3.63    0.00   
## wealth_index3rich           -0.42     0.08   -5.14    0.00   
## toilet_facilityUnimproved    0.14     0.15    0.98    0.33   
## pl_delivery1Health facility -0.16     0.09   -1.89    0.06   
## bmi_recOverweight           -0.33     0.16   -2.09    0.04   
## bmi_recUnderweight           0.25     0.08    3.21    0.00   
## family_size                 -0.03     0.02   -1.64    0.10   
## no_children2                 0.04     0.08    0.46    0.65   
## no_children3+               -0.21     0.11   -1.89    0.06   
## diahrea2Yes                  0.14     0.10    1.49    0.14   
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  region   (Intercept) 0.26    
##  Residual             1.00    
## ---
## number of obs: 4802, groups: region, 11
## AIC = 6304.9, DIC = 7079.4
## deviance = 6669.2
exp(fixef(fit.mix.s)[-1])
##              Age> 30 months                   sexfemale 
##                   1.8797997                   0.8491836 
##              age_fb20+years                    resrural 
##                   0.9383437                   0.8231073 
##               meduc1Primary       meduc2Secodary/Higher 
##                   0.8120705                   0.5475864 
##       watersourceUnimproved                  time_water 
##                   0.9278117                   0.9996689 
##         b_order3rd or later           b_interval> 4 yrs 
##                   1.0184773                   0.6472918 
##           b_interval2-4 yrs         wealth_index2middle 
##                   0.6817012                   0.7516247 
##           wealth_index3rich   toilet_facilityUnimproved 
##                   0.6581978                   1.1522166 
## pl_delivery1Health facility           bmi_recOverweight 
##                   0.8491396                   0.7202198 
##          bmi_recUnderweight                 family_size 
##                   1.2801233                   0.9702389 
##                no_children2               no_children3+ 
##                   1.0373795                   0.8145929 
##                 diahrea2Yes 
##                   1.1533205
exp(confint(fit.mix.s, method="Wald"))
##                                 2.5 %    97.5 %
## .sig01                             NA        NA
## (Intercept)                 0.6562476 1.9069293
## Age> 30 months              1.6583195 2.1308602
## sexfemale                   0.7537858 0.9566548
## age_fb20+years              0.8198654 1.0739433
## resrural                    0.6058534 1.1182665
## meduc1Primary               0.6973680 0.9456393
## meduc2Secodary/Higher       0.3644332 0.8227869
## watersourceUnimproved       0.8161335 1.0547717
## time_water                  0.9993754 0.9999625
## b_order3rd or later         0.8555829 1.2123853
## b_interval> 4 yrs           0.5263724 0.7959890
## b_interval2-4 yrs           0.5804882 0.8005615
## wealth_index2middle         0.6441328 0.8770546
## wealth_index3rich           0.5611353 0.7720497
## toilet_facilityUnimproved   0.8667017 1.5317878
## pl_delivery1Health facility 0.7167588 1.0059702
## bmi_recOverweight           0.5294697 0.9796906
## bmi_recUnderweight          1.1008494 1.4885922
## family_size                 0.9357517 1.0059971
## no_children2                0.8867672 1.2135724
## no_children3+               0.6587067 1.0073704
## diahrea2Yes                 0.9559023 1.3915107
fit.s<-glm(stunt~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+region, train.s, weights=train.s$wt,family=binomial) 
 
summary(fit.s)
## 
## Call:
## glm(formula = stunt ~ Age + sex + age_fb + res + meduc + watersource + 
##     time_water + b_order + b_interval + wealth_index + toilet_facility + 
##     pl_delivery + bmi_rec + family_size + no_children + diahrea + 
##     region, family = binomial, data = train.s, weights = train.s$wt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5041  -0.8562  -0.2713   0.5454   3.6701  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -0.2840555  0.4281200  -0.663  0.50701    
## Age> 30 months               0.6272541  0.0618239  10.146  < 2e-16 ***
## sexfemale                   -0.1640445  0.0588894  -2.786  0.00534 ** 
## age_fb20+years              -0.0500792  0.0663729  -0.755  0.45054    
## resrural                    -0.2372241  0.1472015  -1.612  0.10706    
## meduc1Primary               -0.2066572  0.0759290  -2.722  0.00649 ** 
## meduc2Secodary/Higher       -0.6324261  0.1974690  -3.203  0.00136 ** 
## watersourceUnimproved       -0.0575911  0.0636065  -0.905  0.36524    
## time_water                  -0.0003030  0.0001422  -2.130  0.03316 *  
## b_order3rd or later          0.0212012  0.0858925   0.247  0.80504    
## b_interval> 4 yrs           -0.4580140  0.1019896  -4.491 7.10e-06 ***
## b_interval2-4 yrs           -0.3759730  0.0786132  -4.783 1.73e-06 ***
## wealth_index2middle         -0.2918836  0.0776376  -3.760  0.00017 ***
## wealth_index3rich           -0.4170731  0.0799339  -5.218 1.81e-07 ***
## toilet_facilityUnimproved    0.0709825  0.1343625   0.528  0.59730    
## pl_delivery1Health facility -0.1528801  0.0840597  -1.819  0.06896 .  
## bmi_recOverweight           -0.2933491  0.1463661  -2.004  0.04505 *  
## bmi_recUnderweight           0.2497476  0.0740959   3.371  0.00075 ***
## family_size                 -0.0279581  0.0178042  -1.570  0.11634    
## no_children2                 0.0263130  0.0777530   0.338  0.73505    
## no_children3+               -0.2310168  0.1043433  -2.214  0.02683 *  
## diahrea2Yes                  0.1356023  0.0931520   1.456  0.14547    
## regionAfar                   0.3470527  0.4899757   0.708  0.47876    
## regionAmhara                 0.9330390  0.3951521   2.361  0.01822 *  
## regionBenishangul            0.6432586  0.4756408   1.352  0.17625    
## regionDire Dawa              0.8808982  0.6318780   1.394  0.16329    
## regionGambela               -0.3332688  0.8698620  -0.383  0.70162    
## regionHarari                 0.4032397  0.8211255   0.491  0.62337    
## regionOromia                 0.4961953  0.3944231   1.258  0.20838    
## regionSNNPR                  0.4744647  0.3951094   1.201  0.22981    
## regionSomali                -0.1782729  0.4141975  -0.430  0.66690    
## regionTigray                 0.6912256  0.4039281   1.711  0.08703 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7061.4  on 4801  degrees of freedom
## Residual deviance: 6664.8  on 4770  degrees of freedom
## AIC: 6483
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(coef(fit.s), confint(fit.s)))
##                                           2.5 %    97.5 %
## (Intercept)                 0.7527249 0.3095606 1.6821724
## Age> 30 months              1.8724620 1.6590902 2.1141288
## sexfemale                   0.8487042 0.7561411 0.9525033
## age_fb20+years              0.9511541 0.8349433 1.0830992
## resrural                    0.7888145 0.5912901 1.0534148
## meduc1Primary               0.8132984 0.7004936 0.9433995
## meduc2Secodary/Higher       0.5313012 0.3577036 0.7768312
## watersourceUnimproved       0.9440358 0.8333227 1.0693252
## time_water                  0.9996971 0.9994165 0.9999743
## b_order3rd or later         1.0214275 0.8633776 1.2090849
## b_interval> 4 yrs           0.6325386 0.5178054 0.7723743
## b_interval2-4 yrs           0.6866209 0.5885506 0.8010186
## wealth_index2middle         0.7468555 0.6411743 0.8693023
## wealth_index3rich           0.6589728 0.5631654 0.7704432
## toilet_facilityUnimproved   1.0735624 0.8265019 1.4001711
## pl_delivery1Health facility 0.8582326 0.7274207 1.0114179
## bmi_recOverweight           0.7457617 0.5573056 0.9898774
## bmi_recUnderweight          1.2837014 1.1099707 1.4841539
## family_size                 0.9724291 0.9389669 1.0068575
## no_children2                1.0266622 0.8815801 1.1957857
## no_children3+               0.7937261 0.6467368 0.9736370
## diahrea2Yes                 1.1452264 0.9534223 1.3738346
## regionAfar                  1.4148914 0.5527415 3.8261917
## regionAmhara                2.5422233 1.2210671 5.8419810
## regionBenishangul           1.9026708 0.7681820 5.0235480
## regionDire Dawa             2.4130661 0.6981855 8.5079493
## regionGambela               0.7165775 0.1019823 3.5482558
## regionHarari                1.4966657 0.2626331 7.1699873
## regionOromia                1.6424603 0.7900992 3.7695488
## regionSNNPR                 1.6071537 0.7719843 3.6928372
## regionSomali                0.8367140 0.3853086 1.9848902
## regionTigray                1.9961605 0.9405167 4.6552403

Confusion Matrix – Stunting - Hierarchical Model- (GLMER)

library(ROCR)
test_arr_predict <- predict(fit.mix.s, test.s, type="response")
#create a data frame of out-sample(test set) reference value and predicted value

out_sample_arr <- data.frame(labels = test.s$stunt, predictions = test_arr_predict)

#get the performance of the in-sample prediction
pred.out_sample_arr  <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")

#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y-1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1-x[[ind]], 
      cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
##                  [,1]
## sensitivity 0.7078788
## specificity 0.5154221
## cutoff      0.3430460
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass  = ifelse(out_sample_arr$predictions >0.6365748,1,0)

# Confusion matrix 
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
## 
##    
##        0    1
##   0 1224    8
##   1  808   17
##                                           
##                Accuracy : 0.6033          
##                  95% CI : (0.5818, 0.6245)
##     No Information Rate : 0.9878          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0168          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.60236         
##             Specificity : 0.68000         
##          Pos Pred Value : 0.99351         
##          Neg Pred Value : 0.02061         
##              Prevalence : 0.98785         
##          Detection Rate : 0.59504         
##    Detection Prevalence : 0.59893         
##       Balanced Accuracy : 0.64118         
##                                           
##        'Positive' Class : 0               
## 

Confusion Matrix – Stunting - Logistic Regression Model (GLM)

library(ROCR)
test_arr_predict <- predict(fit.s, test.s, type="response")
#create a data frame of out-sample(test set) reference value and predicted value

out_sample_arr <- data.frame(labels = test.s$stunt, predictions = test_arr_predict)

#get the performance of the in-sample prediction
pred.out_sample_arr  <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")

#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y-1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1-x[[ind]], 
      cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
##                  [,1]
## sensitivity 0.6703030
## specificity 0.5478896
## cutoff      0.3419350
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass  = ifelse(out_sample_arr$predictions >0.6365748,1,0)

# Confusion matrix 
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
## 
##    
##        0    1
##   0 1218   14
##   1  800   25
##                                           
##                Accuracy : 0.6043          
##                  95% CI : (0.5828, 0.6255)
##     No Information Rate : 0.981           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0225          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6036          
##             Specificity : 0.6410          
##          Pos Pred Value : 0.9886          
##          Neg Pred Value : 0.0303          
##              Prevalence : 0.9810          
##          Detection Rate : 0.5921          
##    Detection Prevalence : 0.5989          
##       Balanced Accuracy : 0.6223          
##                                           
##        'Positive' Class : 0               
## 

Random Forest Model - Stunting

library(party)
library(randomForest)
controlDTree <- trainControl(method="cv", 5)
train.s= na.omit(train.s)
train.s$stunting <- factor(train.s$stunting)
rf2 <-randomForest(stunting~v717+m15+v501+hv201+hc1+hv205+hc61+hv025+v212+hc63+hc64+v013+hc27+hc63+hc64+hv014+hv024+hv204+hv270+h11+v201+hv009+v445, data=train.s) 

print(rf2) 
## 
## Call:
##  randomForest(formula = stunting ~ v717 + m15 + v501 + hv201 +      hc1 + hv205 + hc61 + hv025 + v212 + hc63 + hc64 + v013 +      hc27 + hc63 + hc64 + hv014 + hv024 + hv204 + hv270 + h11 +      v201 + hv009 + v445, data = train.s) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 33.59%
## Confusion matrix:
##       no yes class.error
## no  2486 584   0.1902280
## yes 1029 703   0.5941109
print(importance(rf2,type = 2)) 
##       MeanDecreaseGini
## v717          74.07013
## m15           61.76096
## v501          19.58124
## hv201        168.94126
## hc1          320.66027
## hv205         58.80136
## hc61          39.55445
## hv025         11.32081
## v212         136.34160
## hc63         197.66523
## hc64          92.18071
## v013          98.41926
## hc27          34.54762
## hv014         58.50836
## hv024        177.45771
## hv204        127.27707
## hv270         85.92757
## h11           20.94379
## v201          91.05105
## hv009        102.17918
## v445         219.44327
pred = predict(rf2, newdata=test.s)
accuracy <- table(pred, test.s[,"stunting"])
sum(diag(accuracy))/sum(accuracy)
## [1] 0.6349052
confusionMatrix(data=pred, as.factor(test.s$stunting))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1002  521
##        yes  230  304
##                                           
##                Accuracy : 0.6349          
##                  95% CI : (0.6137, 0.6557)
##     No Information Rate : 0.5989          
##     P-Value [Acc > NIR] : 0.0004455       
##                                           
##                   Kappa : 0.193           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8133          
##             Specificity : 0.3685          
##          Pos Pred Value : 0.6579          
##          Neg Pred Value : 0.5693          
##              Prevalence : 0.5989          
##          Detection Rate : 0.4871          
##    Detection Prevalence : 0.7404          
##       Balanced Accuracy : 0.5909          
##                                           
##        'Positive' Class : no              
## 

ROC curves – Stunting

test_prob = predict(fit.mix.s, newdata = test.s, type = "response")
test_roc = roc(test.s$stunt ~ test_prob, plot = TRUE, print.auc = TRUE,percent=TRUE)

test_prob2 = predict(fit.s, newdata = test.s, type = "response")
test_roc2 = roc(test.s$stunt ~ test_prob2, plot = TRUE, print.auc = TRUE,percent=TRUE)

test_prob5 = predict(rf2, newdata=test.s, type = "prob")
test_roc5= roc(test.s$stunting, test_prob5[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)

plot(test_roc,    print.auc = TRUE,  col='blue',percent=TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=2, lwd=3, col="black")
legend("topleft", legend = c("Hierarchical Model","Random Forest Model","Logistic Regression Model") , pch = 15, bty = 'n', col = c("blue","darkgoldenrod4", "red"))

Wasting

Determinants of Wasting - Hierarchical and Logistic Regression M

library(lme4)
library(arm)
library(lmerTest)

fit.mix.w<-glmer(wasted~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+(1|region), train.w, weights=train.w$wt,family=binomial) 

display(fit.mix.w, detail=T)
## glmer(formula = wasted ~ Age + sex + age_fb + res + meduc + watersource + 
##     time_water + b_order + b_interval + wealth_index + toilet_facility + 
##     pl_delivery + bmi_rec + family_size + no_children + diahrea + 
##     (1 | region), data = train.w, family = binomial, weights = train.w$wt)
##                             coef.est coef.se z value Pr(>|z|)
## (Intercept)                 -1.90     0.46   -4.15    0.00   
## Age> 30 months              -0.65     0.11   -5.91    0.00   
## sexfemale                    0.09     0.10    0.90    0.37   
## age_fb20+years               0.06     0.11    0.55    0.58   
## resrural                    -1.07     0.26   -4.10    0.00   
## meduc1Primary               -0.06     0.14   -0.44    0.66   
## meduc2Secodary/Higher        0.33     0.35    0.93    0.35   
## watersourceUnimproved        0.13     0.11    1.15    0.25   
## time_water                   0.00     0.00   -0.74    0.46   
## b_order3rd or later          0.44     0.17    2.54    0.01   
## b_interval> 4 yrs           -0.36     0.19   -1.88    0.06   
## b_interval2-4 yrs           -0.07     0.13   -0.52    0.61   
## wealth_index2middle         -0.08     0.13   -0.64    0.52   
## wealth_index3rich           -0.61     0.15   -4.03    0.00   
## toilet_facilityUnimproved    0.34     0.28    1.22    0.22   
## pl_delivery1Health facility -0.29     0.15   -1.95    0.05   
## bmi_recOverweight           -0.95     0.36   -2.65    0.01   
## bmi_recUnderweight           0.46     0.12    3.79    0.00   
## family_size                  0.06     0.03    1.94    0.05   
## no_children2                 0.17     0.15    1.14    0.25   
## no_children3+                0.39     0.18    2.17    0.03   
## diahrea2Yes                 -0.03     0.16   -0.19    0.85   
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  region   (Intercept) 0.25    
##  Residual             1.00    
## ---
## number of obs: 4116, groups: region, 11
## AIC = 2771.3, DIC = 3230.5
## deviance = 2977.9
exp(fixef(fit.mix.w)[-1])
##              Age> 30 months                   sexfemale 
##                   0.5223870                   1.0976926 
##              age_fb20+years                    resrural 
##                   1.0647018                   0.3442388 
##               meduc1Primary       meduc2Secodary/Higher 
##                   0.9415679                   1.3856082 
##       watersourceUnimproved                  time_water 
##                   1.1336268                   0.9997998 
##         b_order3rd or later           b_interval> 4 yrs 
##                   1.5550297                   0.6981660 
##           b_interval2-4 yrs         wealth_index2middle 
##                   0.9335114                   0.9214214 
##           wealth_index3rich   toilet_facilityUnimproved 
##                   0.5459582                   1.4096182 
## pl_delivery1Health facility           bmi_recOverweight 
##                   0.7478579                   0.3852672 
##          bmi_recUnderweight                 family_size 
##                   1.5874810                   1.0597323 
##                no_children2               no_children3+ 
##                   1.1856966                   1.4814105 
##                 diahrea2Yes 
##                   0.9699378
exp(confint(fit.mix.w, method="Wald"))
##                                  2.5 %    97.5 %
## .sig01                              NA        NA
## (Intercept)                 0.06109223 0.3675539
## Age> 30 months              0.42112487 0.6479983
## sexfemale                   0.89697313 1.3433280
## age_fb20+years              0.85205993 1.3304110
## resrural                    0.20674733 0.5731652
## meduc1Primary               0.71951496 1.2321498
## meduc2Secodary/Higher       0.69848775 2.7486667
## watersourceUnimproved       0.91484469 1.4047299
## time_water                  0.99926869 1.0003312
## b_order3rd or later         1.10597760 2.1864073
## b_interval> 4 yrs           0.47959947 1.0163394
## b_interval2-4 yrs           0.71878444 1.2123850
## wealth_index2middle         0.71719718 1.1837992
## wealth_index3rich           0.40669384 0.7329110
## toilet_facilityUnimproved   0.81284626 2.4445255
## pl_delivery1Health facility 0.55804355 1.0022362
## bmi_recOverweight           0.19039903 0.7795776
## bmi_recUnderweight          1.24965274 2.0166370
## family_size                 0.99953621 1.1235536
## no_children2                0.88514621 1.5882986
## no_children3+               1.03904056 2.1121189
## diahrea2Yes                 0.70510420 1.3342417
fit.w<-glm(wasted~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+region, train.w, weights=train.w$wt,family=binomial) 
 
summary(fit.w)
## 
## Call:
## glm(formula = wasted ~ Age + sex + age_fb + res + meduc + watersource + 
##     time_water + b_order + b_interval + wealth_index + toilet_facility + 
##     pl_delivery + bmi_rec + family_size + no_children + diahrea + 
##     region, family = binomial, data = train.w, weights = train.w$wt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7037  -0.5345  -0.3086  -0.1230   5.0066  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.0638494  0.7498783  -2.752 0.005919 ** 
## Age> 30 months              -0.6067746  0.1042004  -5.823 5.77e-09 ***
## sexfemale                    0.0431081  0.0978459   0.441 0.659523    
## age_fb20+years               0.0701305  0.1075497   0.652 0.514352    
## resrural                    -0.9580582  0.2344433  -4.087 4.38e-05 ***
## meduc1Primary               -0.0460254  0.1318208  -0.349 0.726976    
## meduc2Secodary/Higher        0.3037539  0.3264259   0.931 0.352089    
## watersourceUnimproved        0.1199635  0.1045641   1.147 0.251269    
## time_water                  -0.0002752  0.0002476  -1.112 0.266342    
## b_order3rd or later          0.3811509  0.1617866   2.356 0.018479 *  
## b_interval> 4 yrs           -0.3196613  0.1830781  -1.746 0.080804 .  
## b_interval2-4 yrs           -0.0403607  0.1259107  -0.321 0.748551    
## wealth_index2middle         -0.0387915  0.1250856  -0.310 0.756470    
## wealth_index3rich           -0.5540636  0.1452054  -3.816 0.000136 ***
## toilet_facilityUnimproved    0.2571811  0.2393162   1.075 0.282532    
## pl_delivery1Health facility -0.2541685  0.1425645  -1.783 0.074614 .  
## bmi_recOverweight           -0.8028960  0.3007523  -2.670 0.007594 ** 
## bmi_recUnderweight           0.4376308  0.1152202   3.798 0.000146 ***
## family_size                  0.0509635  0.0283920   1.795 0.072654 .  
## no_children2                 0.1753368  0.1428609   1.227 0.219700    
## no_children3+                0.3486659  0.1724537   2.022 0.043198 *  
## diahrea2Yes                  0.0189803  0.1545471   0.123 0.902256    
## regionAfar                   0.5892031  0.7895075   0.746 0.455491    
## regionAmhara                 0.2905587  0.6911064   0.420 0.674175    
## regionBenishangul            0.2132111  0.8204631   0.260 0.794967    
## regionDire Dawa              0.0152304  1.1598419   0.013 0.989523    
## regionGambela                0.1853093  1.1974548   0.155 0.877016    
## regionHarari                 0.5700444  1.2101069   0.471 0.637591    
## regionOromia                 0.2002930  0.6904335   0.290 0.771742    
## regionSNNPR                 -0.3118742  0.6947639  -0.449 0.653509    
## regionSomali                 0.8315152  0.6963894   1.194 0.232463    
## regionTigray                 0.2818949  0.7037805   0.401 0.688756    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3168.0  on 4115  degrees of freedom
## Residual deviance: 2973.1  on 4084  degrees of freedom
## AIC: 2849.4
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(coef(fit.w), confint(fit.w)))
##                                            2.5 %     97.5 %
## (Intercept)                 0.1269643 0.02355019  0.4873491
## Age> 30 months              0.5451062 0.44380928  0.6678502
## sexfemale                   1.0440508 0.86192891  1.2651049
## age_fb20+years              1.0726482 0.86713583  1.3221742
## resrural                    0.3836371 0.24396980  0.6125594
## meduc1Primary               0.9550177 0.73436695  1.2318303
## meduc2Secodary/Higher       1.3549355 0.69471575  2.5122666
## watersourceUnimproved       1.1274557 0.91878629  1.3845514
## time_water                  0.9997248 0.99922665  1.0001984
## b_order3rd or later         1.4639685 1.07242401  2.0237463
## b_interval> 4 yrs           0.7263950 0.50650846  1.0387609
## b_interval2-4 yrs           0.9604430 0.75240787  1.2330221
## wealth_index2middle         0.9619512 0.75065285  1.2262007
## wealth_index3rich           0.5746101 0.43019297  0.7605241
## toilet_facilityUnimproved   1.2932793 0.82164227  2.1043788
## pl_delivery1Health facility 0.7755611 0.58339977  1.0207169
## bmi_recOverweight           0.4480296 0.23752080  0.7788119
## bmi_recUnderweight          1.5490329 1.23289146  1.9373669
## family_size                 1.0522845 0.99480573  1.1119712
## no_children2                1.1916475 0.90295959  1.5814797
## no_children3+               1.4171756 1.01215552  1.9907945
## diahrea2Yes                 1.0191615 0.74657052  1.3696391
## regionAfar                  1.8025514 0.41781167 10.2090817
## regionAmhara                1.3371744 0.39832463  6.6176777
## regionBenishangul           1.2376459 0.26201933  7.2948347
## regionDire Dawa             1.0153470 0.06668781  9.0823698
## regionGambela               1.2035906 0.07617710 11.7132129
## regionHarari                1.7683455 0.10439310 17.3553864
## regionOromia                1.2217607 0.36436279  6.0399047
## regionSNNPR                 0.7320736 0.21603796  3.6407777
## regionSomali                2.2967962 0.67531335 11.4493154
## regionTigray                1.3256394 0.38225432  6.6730234

Wasting - Confusion Matrix - Hierarchical Model (GLMER)

library(ROCR)
test_arr_predict <- predict(fit.mix.w, test.w, type="response")
#create a data frame of out-sample(test set) reference value and predicted value

out_sample_arr <- data.frame(labels = test.w$wasted, predictions = test_arr_predict)

#get the performance of the in-sample prediction
pred.out_sample_arr  <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")

#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y-1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1-x[[ind]], 
      cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
##                  [,1]
## sensitivity 0.6776119
## specificity 0.5859635
## cutoff      0.1132143
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass  = ifelse(out_sample_arr$predictions >0.1179763,1,0)

# Confusion matrix 
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
## 
##    
##        0    1
##   0 1464  944
##   1  121  214
##                                         
##                Accuracy : 0.6117        
##                  95% CI : (0.5932, 0.63)
##     No Information Rate : 0.5778        
##     P-Value [Acc > NIR] : 0.0001659     
##                                         
##                   Kappa : 0.1199        
##  Mcnemar's Test P-Value : < 2.2e-16     
##                                         
##             Sensitivity : 0.9237        
##             Specificity : 0.1848        
##          Pos Pred Value : 0.6080        
##          Neg Pred Value : 0.6388        
##              Prevalence : 0.5778        
##          Detection Rate : 0.5337        
##    Detection Prevalence : 0.8779        
##       Balanced Accuracy : 0.5542        
##                                         
##        'Positive' Class : 0             
## 

Wasting - Confusion Matrix - Logistic Regression Model (GLM)

library(ROCR)
test_arr_predict <- predict(fit.w, test.w, type="response")
#create a data frame of out-sample(test set) reference value and predicted value

out_sample_arr <- data.frame(labels = test.w$wasted, predictions = test_arr_predict)

#get the performance of the in-sample prediction
pred.out_sample_arr  <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")

#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y-1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1-x[[ind]], 
      cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
##                  [,1]
## sensitivity 0.6656716
## specificity 0.6058970
## cutoff      0.1231141
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass  = ifelse(out_sample_arr$predictions >0.1179763,1,0)

# Confusion matrix 
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
## 
##    
##        0    1
##   0 1406 1002
##   1  108  227
##                                           
##                Accuracy : 0.5953          
##                  95% CI : (0.5767, 0.6138)
##     No Information Rate : 0.552           
##     P-Value [Acc > NIR] : 2.488e-06       
##                                           
##                   Kappa : 0.1217          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9287          
##             Specificity : 0.1847          
##          Pos Pred Value : 0.5839          
##          Neg Pred Value : 0.6776          
##              Prevalence : 0.5520          
##          Detection Rate : 0.5126          
##    Detection Prevalence : 0.8779          
##       Balanced Accuracy : 0.5567          
##                                           
##        'Positive' Class : 0               
## 

Random Forest Model - Wasting

library(party)
library(randomForest)
controlDTree <- trainControl(method="cv", 5)
train.w= na.omit(train.w)
train.w$wasting <- factor(train.w$wasting)
rf.w <-randomForest(wasting~v717+m15+v501+hv201+hc1+hv205+hc61+hv025+v212+hc63+hc64+v013+hc27+hc63+hc64+hv014+hv024+hv204+hv270+h11+hv009+v445, data=train.w) 
                     

print(rf.w) 
## 
## Call:
##  randomForest(formula = wasting ~ v717 + m15 + v501 + hv201 +      hc1 + hv205 + hc61 + hv025 + v212 + hc63 + hc64 + v013 +      hc27 + hc63 + hc64 + hv014 + hv024 + hv204 + hv270 + h11 +      hv009 + v445, data = train.w) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 12.71%
## Confusion matrix:
##       no yes class.error
## no  3583  12 0.003337969
## yes  511  10 0.980806142
print(importance(rf.w,type = 2)) 
##       MeanDecreaseGini
## v717         28.796891
## m15          25.068397
## v501          9.316115
## hv201        72.753768
## hc1          92.058257
## hv205        22.506606
## hc61         15.904783
## hv025         7.036378
## v212         63.430490
## hc63         87.667776
## hc64         52.371618
## v013         46.006533
## hc27         16.226933
## hv014        28.003404
## hv024        69.962590
## hv204        61.078038
## hv270        32.935905
## h11          11.358217
## hv009        50.360687
## v445        108.266386
pred = predict(rf.w, newdata=test.w)
accuracy <- table(pred, test.w[,"wasting"])
sum(diag(accuracy))/sum(accuracy)
## [1] 0.8771418
confusionMatrix(data=pred, as.factor(test.w$wasting))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2402  331
##        yes    6    4
##                                           
##                Accuracy : 0.8771          
##                  95% CI : (0.8643, 0.8892)
##     No Information Rate : 0.8779          
##     P-Value [Acc > NIR] : 0.5608          
##                                           
##                   Kappa : 0.0162          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99751         
##             Specificity : 0.01194         
##          Pos Pred Value : 0.87889         
##          Neg Pred Value : 0.40000         
##              Prevalence : 0.87787         
##          Detection Rate : 0.87568         
##    Detection Prevalence : 0.99635         
##       Balanced Accuracy : 0.50472         
##                                           
##        'Positive' Class : no              
## 

ROC Curves – Wasting

test_prob = predict(fit.mix.w, newdata = test.w, type = "response")
test_roc = roc(test.w$wasted ~ test_prob, plot = TRUE, print.auc = TRUE,percent=TRUE)

test_prob2 = predict(fit.w, newdata = test.w, type = "response")
test_roc2 = roc(test.w$wasted ~ test_prob2, plot = TRUE, print.auc = TRUE,percent=TRUE)

test5 = predict(rf.w, newdata=test.w, type = "prob")
test5= roc(test.w$wasting, test5[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)

plot(test_roc,    print.auc = TRUE,  col='blue',percent=TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test5 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=2, lwd=3, col="black")
legend("topleft", legend = c("Hierarchical Model","Random Forest Model","Logistic Regression Model") , pch = 15, bty = 'n', col = c("blue","darkgoldenrod4", "red"))

Underweight

Determinants of Wasting - Hierarchical and Logistic Regression Model

library(lme4)
library(arm)
library(lmerTest)

fit.mix.u<-glmer(underweight~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+(1|region), train.u, weights=train.u$wt,family=binomial) 

display(fit.mix.u, detail=T)
## glmer(formula = underweight ~ Age + sex + age_fb + res + meduc + 
##     watersource + time_water + b_order + b_interval + wealth_index + 
##     toilet_facility + pl_delivery + bmi_rec + family_size + no_children + 
##     diahrea + (1 | region), data = train.u, family = binomial, 
##     weights = train.u$wt)
##                             coef.est coef.se z value Pr(>|z|)
## (Intercept)                 -1.22     0.30   -4.09    0.00   
## Age> 30 months               0.52     0.07    7.10    0.00   
## sexfemale                   -0.02     0.07   -0.25    0.80   
## age_fb20+years               0.17     0.07    2.25    0.02   
## resrural                     0.14     0.18    0.75    0.45   
## meduc1Primary               -0.35     0.09   -3.90    0.00   
## meduc2Secodary/Higher       -0.49     0.25   -1.96    0.05   
## watersourceUnimproved       -0.16     0.07   -2.19    0.03   
## time_water                   0.00     0.00    1.66    0.10   
## b_order3rd or later          0.06     0.10    0.64    0.52   
## b_interval> 4 yrs           -0.39     0.12   -3.22    0.00   
## b_interval2-4 yrs           -0.21     0.09   -2.29    0.02   
## wealth_index2middle         -0.40     0.09   -4.57    0.00   
## wealth_index3rich           -0.60     0.09   -6.37    0.00   
## toilet_facilityUnimproved    0.14     0.16    0.85    0.39   
## pl_delivery1Health facility -0.07     0.10   -0.69    0.49   
## bmi_recOverweight           -0.60     0.20   -3.04    0.00   
## bmi_recUnderweight           0.37     0.08    4.35    0.00   
## family_size                 -0.02     0.02   -0.86    0.39   
## no_children2                 0.23     0.09    2.50    0.01   
## no_children3+               -0.10     0.12   -0.82    0.41   
## diahrea2Yes                  0.76     0.10    7.60    0.00   
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  region   (Intercept) 0.12    
##  Residual             1.00    
## ---
## number of obs: 4802, groups: region, 11
## AIC = 5335.5, DIC = 6056.5
## deviance = 5673.0
exp(fixef(fit.mix.u)[-1])
##              Age> 30 months                   sexfemale 
##                   1.6781744                   0.9829806 
##              age_fb20+years                    resrural 
##                   1.1816876                   1.1475434 
##               meduc1Primary       meduc2Secodary/Higher 
##                   0.7030765                   0.6143373 
##       watersourceUnimproved                  time_water 
##                   0.8517313                   1.0002911 
##         b_order3rd or later           b_interval> 4 yrs 
##                   1.0652979                   0.6789157 
##           b_interval2-4 yrs         wealth_index2middle 
##                   0.8129021                   0.6717911 
##           wealth_index3rich   toilet_facilityUnimproved 
##                   0.5480417                   1.1500954 
## pl_delivery1Health facility           bmi_recOverweight 
##                   0.9342114                   0.5482468 
##          bmi_recUnderweight                 family_size 
##                   1.4417899                   0.9818509 
##                no_children2               no_children3+ 
##                   1.2533401                   0.9053286 
##                 diahrea2Yes 
##                   2.1398450
exp(confint(fit.mix.u, method="Wald"))
##                                 2.5 %    97.5 %
## .sig01                             NA        NA
## (Intercept)                 0.1641458 0.5290363
## Age> 30 months              1.4547664 1.9358913
## sexfemale                   0.8600561 1.1234742
## age_fb20+years              1.0215752 1.3668945
## resrural                    0.8010511 1.6439100
## meduc1Primary               0.5889721 0.8392869
## meduc2Secodary/Higher       0.3777872 0.9990026
## watersourceUnimproved       0.7378834 0.9831449
## time_water                  0.9999475 1.0006349
## b_order3rd or later         0.8780485 1.2924794
## b_interval> 4 yrs           0.5365251 0.8590959
## b_interval2-4 yrs           0.6806690 0.9708241
## wealth_index2middle         0.5664063 0.7967835
## wealth_index3rich           0.4554335 0.6594810
## toilet_facilityUnimproved   0.8344374 1.5851631
## pl_delivery1Health facility 0.7706928 1.1324240
## bmi_recOverweight           0.3721098 0.8077577
## bmi_recUnderweight          1.2224876 1.7004328
## family_size                 0.9415622 1.0238636
## no_children2                1.0502368 1.4957213
## no_children3+               0.7140887 1.1477842
## diahrea2Yes                 1.7585241 2.6038521
fit.u<-glm(underweight~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+region, train.u, weights=train.u$wt,family=binomial) 
 
summary(fit.u)
## 
## Call:
## glm(formula = underweight ~ Age + sex + age_fb + res + meduc + 
##     watersource + time_water + b_order + b_interval + wealth_index + 
##     toilet_facility + pl_delivery + bmi_rec + family_size + no_children + 
##     diahrea + region, family = binomial, data = train.u, weights = train.u$wt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9149  -0.8031  -0.3153   0.2558   4.6619  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.8347376  0.5387383  -3.406 0.000660 ***
## Age> 30 months               0.5149306  0.0701933   7.336 2.20e-13 ***
## sexfemale                   -0.0298697  0.0657615  -0.454 0.649676    
## age_fb20+years               0.1678109  0.0715927   2.344 0.019080 *  
## resrural                     0.0209434  0.1681171   0.125 0.900859    
## meduc1Primary               -0.3255210  0.0880248  -3.698 0.000217 ***
## meduc2Secodary/Higher       -0.4818543  0.2360587  -2.041 0.041226 *  
## watersourceUnimproved       -0.1562623  0.0709329  -2.203 0.027598 *  
## time_water                   0.0002221  0.0001644   1.351 0.176718    
## b_order3rd or later          0.0662753  0.0951990   0.696 0.486318    
## b_interval> 4 yrs           -0.4141767  0.1156535  -3.581 0.000342 ***
## b_interval2-4 yrs           -0.2157393  0.0863296  -2.499 0.012454 *  
## wealth_index2middle         -0.4005207  0.0860246  -4.656 3.23e-06 ***
## wealth_index3rich           -0.6008441  0.0928079  -6.474 9.54e-11 ***
## toilet_facilityUnimproved    0.1058933  0.1543293   0.686 0.492617    
## pl_delivery1Health facility -0.0633000  0.0956106  -0.662 0.507933    
## bmi_recOverweight           -0.5145038  0.1797880  -2.862 0.004213 ** 
## bmi_recUnderweight           0.3776844  0.0808199   4.673 2.97e-06 ***
## family_size                 -0.0124492  0.0203985  -0.610 0.541663    
## no_children2                 0.2063326  0.0873877   2.361 0.018220 *  
## no_children3+               -0.1093222  0.1163449  -0.940 0.347403    
## diahrea2Yes                  0.7414399  0.0972428   7.625 2.45e-14 ***
## regionAfar                   0.9151185  0.5857512   1.562 0.118218    
## regionAmhara                 1.0124412  0.5051151   2.004 0.045030 *  
## regionBenishangul            1.1097398  0.5746714   1.931 0.053472 .  
## regionDire Dawa              0.7634050  0.7408881   1.030 0.302826    
## regionGambela                0.0236018  1.0159028   0.023 0.981465    
## regionHarari                 0.6471579  0.9409270   0.688 0.491587    
## regionOromia                 0.5932016  0.5045315   1.176 0.239696    
## regionSNNPR                  0.6942139  0.5047661   1.375 0.169033    
## regionSomali                 0.7291826  0.5167284   1.411 0.158200    
## regionTigray                 0.6473674  0.5128288   1.262 0.206824    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6006.0  on 4801  degrees of freedom
## Residual deviance: 5666.5  on 4770  degrees of freedom
## AIC: 5527.5
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(coef(fit.u), confint(fit.u)))
##                                            2.5 %     97.5 %
## (Intercept)                 0.1596554 0.04968521  0.4258009
## Age> 30 months              1.6735224 1.45889823  1.9210682
## sexfemale                   0.9705720 0.85314975  1.1040648
## age_fb20+years              1.1827129 1.02746694  1.3604129
## resrural                    1.0211642 0.73753571  1.4265700
## meduc1Primary               0.7221510 0.60683047  0.8569938
## meduc2Secodary/Higher       0.6176371 0.38277206  0.9682594
## watersourceUnimproved       0.8553348 0.74421480  0.9828188
## time_water                  1.0002222 0.99989703  1.0005423
## b_order3rd or later         1.0685209 0.88737113  1.2888986
## b_interval> 4 yrs           0.6608842 0.52668744  0.8288889
## b_interval2-4 yrs           0.8059454 0.68089560  0.9551879
## wealth_index2middle         0.6699711 0.56542979  0.7922616
## wealth_index3rich           0.5483486 0.45661743  0.6570442
## toilet_facilityUnimproved   1.1117033 0.82553608  1.5128362
## pl_delivery1Health facility 0.9386619 0.77719956  1.1307479
## bmi_recOverweight           0.5977971 0.41537538  0.8417957
## bmi_recUnderweight          1.4589025 1.24433133  1.7082867
## family_size                 0.9876280 0.94870421  1.0277013
## no_children2                1.2291620 1.03612295  1.4595345
## no_children3+               0.8964415 0.71338655  1.1257540
## diahrea2Yes                 2.0989556 1.73325717  2.5379516
## regionAfar                  2.4970711 0.83681418  8.6265314
## regionAmhara                2.7523119 1.11237195  8.3828888
## regionBenishangul           3.0335690 1.04362636 10.2997661
## regionDire Dawa             2.1455694 0.48851145  9.4411479
## regionGambela               1.0238825 0.09657653  6.5720848
## regionHarari                1.9101044 0.24613815 11.3760961
## regionOromia                1.8097734 0.73233305  5.5068510
## regionSNNPR                 2.0021346 0.80978895  6.0945451
## regionSomali                2.0733852 0.81538766  6.4281890
## regionTigray                1.9105045 0.75805933  5.8872863

Confusion Matrix - Underweight - Hierarchical Model (GLMER)

library(ROCR)
test_arr_predict <- predict(fit.mix.u, test.u, type="response")
#create a data frame of out-sample(test set) reference value and predicted value

out_sample_arr <- data.frame(labels = test.u$underweight, predictions = test_arr_predict)

#get the performance of the in-sample prediction
pred.out_sample_arr  <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")

#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y-1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1-x[[ind]], 
      cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
##                  [,1]
## sensitivity 0.6727273
## specificity 0.5428003
## cutoff      0.2512379
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass  = ifelse(out_sample_arr$predictions >0.2492607,1,0)

# Confusion matrix 
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
## 
##    
##       0   1
##   0 812 695
##   1 178 372
##                                           
##                Accuracy : 0.5756          
##                  95% CI : (0.5539, 0.5971)
##     No Information Rate : 0.5187          
##     P-Value [Acc > NIR] : 1.279e-07       
##                                           
##                   Kappa : 0.1657          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8202          
##             Specificity : 0.3486          
##          Pos Pred Value : 0.5388          
##          Neg Pred Value : 0.6764          
##              Prevalence : 0.4813          
##          Detection Rate : 0.3947          
##    Detection Prevalence : 0.7326          
##       Balanced Accuracy : 0.5844          
##                                           
##        'Positive' Class : 0               
## 

Confusion Matrix - Underweight - Logistic Regression Model (GLM)

library(ROCR)
test_arr_predict <- predict(fit.u, test.u, type="response")
#create a data frame of out-sample(test set) reference value and predicted value

out_sample_arr <- data.frame(labels = test.u$underweight, predictions = test_arr_predict)

#get the performance of the in-sample prediction
pred.out_sample_arr  <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")

#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y-1)^2
    ind = which(d == min(d))
    c(sensitivity = y[[ind]], specificity = 1-x[[ind]], 
      cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
##                  [,1]
## sensitivity 0.6563636
## specificity 0.5739881
## cutoff      0.2616910
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass  = ifelse(out_sample_arr$predictions >0.2492607,1,0)

# Confusion matrix 
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
## 
##    
##       0   1
##   0 829 678
##   1 177 373
##                                           
##                Accuracy : 0.5843          
##                  95% CI : (0.5627, 0.6058)
##     No Information Rate : 0.5109          
##     P-Value [Acc > NIR] : 1.398e-11       
##                                           
##                   Kappa : 0.1771          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8241          
##             Specificity : 0.3549          
##          Pos Pred Value : 0.5501          
##          Neg Pred Value : 0.6782          
##              Prevalence : 0.4891          
##          Detection Rate : 0.4030          
##    Detection Prevalence : 0.7326          
##       Balanced Accuracy : 0.5895          
##                                           
##        'Positive' Class : 0               
## 

Random Forest Model -Underweight

library(party)
library(randomForest)
controlDTree <- trainControl(method="cv", 5)
train.u= na.omit(train.u)
train.u$underweight <- factor(train.u$underweight)
rf.u <-randomForest(underweight~v717+m15+v501+hv201+hc1+hv205+hc61+hv025+v212+hc63+hc64+v013+hc27+hc63+hc64+hv014+hv024+hv204+hv270+h11+hv009+v445, data=train.u) 
                     
print(rf.u) 
## 
## Call:
##  randomForest(formula = underweight ~ v717 + m15 + v501 + hv201 +      hc1 + hv205 + hc61 + hv025 + v212 + hc63 + hc64 + v013 +      hc27 + hc63 + hc64 + hv014 + hv024 + hv204 + hv270 + h11 +      hv009 + v445, data = train.u) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 25.97%
## Confusion matrix:
##      0   1 class.error
## 0 3352 194  0.05470953
## 1 1053 203  0.83837580
print(importance(rf.u,type = 2)) 
##       MeanDecreaseGini
## v717          70.65841
## m15           54.32829
## v501          18.13290
## hv201        153.64004
## hc1          202.98580
## hv205         49.22173
## hc61          31.36421
## hv025         10.37982
## v212         117.52020
## hc63         173.83141
## hc64          99.94728
## v013          97.40069
## hc27          33.57844
## hv014         51.13770
## hv024        151.56507
## hv204        111.30138
## hv270         83.61458
## h11           24.61102
## hv009         93.00967
## v445         206.79133
pred = predict(rf.u, newdata=test.u)
accuracy <- table(pred, test.u[,"underweight"])
sum(diag(accuracy))/sum(accuracy)
## [1] 0.7311619
confusionMatrix(data=pred, as.factor(test.u$underweight))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1424  470
##          1   83   80
##                                           
##                Accuracy : 0.7312          
##                  95% CI : (0.7114, 0.7502)
##     No Information Rate : 0.7326          
##     P-Value [Acc > NIR] : 0.5707          
##                                           
##                   Kappa : 0.1164          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9449          
##             Specificity : 0.1455          
##          Pos Pred Value : 0.7518          
##          Neg Pred Value : 0.4908          
##              Prevalence : 0.7326          
##          Detection Rate : 0.6923          
##    Detection Prevalence : 0.9208          
##       Balanced Accuracy : 0.5452          
##                                           
##        'Positive' Class : 0               
## 

ROC curves - Underweight

test_prob = predict(fit.mix.u, newdata = test.u, type = "response")
test_roc = roc(test.u$underweight ~ test_prob, plot = TRUE, print.auc = TRUE,percent=TRUE)

test_prob2 = predict(fit.u, newdata = test.u, type = "response")
test_roc2 = roc(test.u$underweight ~ test_prob2, plot = TRUE, print.auc = TRUE,percent=TRUE)

test5 = predict(rf.u, newdata=test.u, type = "prob")
test5= roc(test.u$underweight, test5[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)

plot(test_roc,    print.auc = TRUE,  col='blue',percent=TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test5 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=2, lwd=3, col="black")
legend("topleft", legend = c("Hierarchical Model","Random Forest Model","Logistic Regression Model") , pch = 15, bty = 'n', col = c("blue","darkgoldenrod4", "red"))