# Load EDHS 2011
library(gridExtra)
library(maps)
library(ggplot2)
library(viridis)
library(rvest)
library(foreign)
library(haven)
library(survival)
library(car)
library(survey)
library(muhaz)
library(eha)
library(survminer)
library(car)
library(rgdal)
library(ggplot2)
library(sf)
library(dplyr)
library(lme4)
library(lmerTest)
require(MASS)
library(pROC)
pdhs_2016<-read.dta("~/Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etpr70dt/ETPR70FL.DTA", convert.factors = TRUE)
#wdhs_2016<-read.dta( "~//Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etir70dt/ETIR70FL.DTA", convert.factors = TRUE)
kdhs_2016<-read.dta( "~//Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etkr70dt/ETKR70FL.DTA", convert.factors = TRUE)
#cdhs_2016<-read.dta( "~//Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etcr70dt/ETCR70FL.DTA", convert.factors = TRUE)
attach(kdhs_2016)
kdhs_2016$hv001<-kdhs_2016$v001
kdhs_2016$hv002<-kdhs_2016$v002
kdhs_2016$hvidx<-kdhs_2016$b16
attach(pdhs_2016)
pdhs_2016 <- pdhs_2016[order(hv001, hv002,hvidx),]
df2 = left_join(pdhs_2016, kdhs_2016, by.x=c("hv001", "hv001", "hvidx"), by.y=c("hv001", "hv001", "hvidx"), no.dups = TRUE)
pedhs16<-df2
ethsp4<-readOGR(dsn="~//Google Drive/fikre/ethiopia_dhs/sdr_subnational_data_2017-03-28/shps",layer = "sdr_subnational_data_dhs_2016")
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/fikrewoldbitew/Google Drive/fikre/ethiopia_dhs/sdr_subnational_data_2017-03-28/shps", layer: "sdr_subnational_data_dhs_2016"
## with 11 features
## It has 55 fields
Used data from the Ethiopan Demographic and Health Surveys (EDHS 2000 - 2016).
pedhs16<-pedhs16[pedhs16$hv103=='yes'&pedhs16$hc70<=9990,]
pedhs16<-pedhs16[!is.na(pedhs16$hc70),]
pedhs16$stunt<-ifelse(pedhs16$hc70/100<=-2&pedhs16$hc70/100!=-2,1,0)
pedhs16$stunting<-ifelse(pedhs16$hc70/100<=-2&pedhs16$hc70/100!=-2,'yes','no')
pedhs16$stunt.r<-as.factor(pedhs16$stunt)
pedhs16$wasted<-ifelse(pedhs16$hc72/100<=-2&pedhs16$hc72/100!=-2,1,0)
pedhs16$wasting<-ifelse(pedhs16$hc72/100<=-2&pedhs16$hc72/100!=-2,'yes','no')
pedhs16$underweight<-ifelse(pedhs16$hc71/100<=-2&pedhs16$hc71/100!=-2,1,0)
pedhs16$combined<-ifelse(pedhs16$wasted==1|pedhs16$stunt==1|pedhs16$underweight==1, 1, 0)
pedhs16$tot_ch<-1
pedhs16$age<-pedhs16$hc1
pedhs16$res<-pedhs16$hv025
pedhs16$sex<-pedhs16$hc27
pedhs16$time_water<-pedhs16$hv204
#pedhs16$any_anemia<-Recode(pedhs16$hc57, recodes="'severe' ='Yes';
#'moderate'='Yes'; 'mild'='Yes';'not anemic' = 'No'; else=NA", as.factor = F)
#pedhs16$toilet<-pedhs16$hv024
pedhs16$watersource<-Recode(pedhs16$hv201, recodes="'piped water' ='Improved'; 'piped into dwelling'='Improved'; 'piped to yard/plot'='Improved'; 'piped to neighbor'='Improved'; 'public tap/standpipe'='Improved';
'tube well water'='Unimproved'; 'tube well or borehole'= 'Improved';
'dug well (open/protected)'='Unimproved';
'surface from spring'='Unimproved';
'protected well' ='Improved';
'unprotected well' ='Unimproved';
'surface from spring' ='Unimproved';
'protected spring' ='Improved';
'unprotected spring'='Unimproved';
'river/dam/lake/ponds/stream/canal/irrigation channel'='Unimproved';
'rainwater'='Improved';
'tanker truck'='Unimproved';
'cart with small tank'='Unimproved';
'bottled water'='Improved';
'other' = 'Unimproved' ; else=NA", as.factor = F)
pedhs16$toilet_facility<-Recode(pedhs16$hv205,
recodes='"flush toilet" = "Improved";
"flush to piped sewer system" = "Improved";
"flush to septic tank" = "Improved";
"flush to pit latrine" = "Improved";
"flush to somewhere else" = "Improved";
"pit toilet latrine" = "Improved";
"ventilated improved pit latrine (vip)" = "Improved";
"pit latrine with slab" = "Improved";
"pit latrine without slab/open pit" = "Unimproved";
"no facility" = "Unimproved";
"no facility/bush/field"= "Unimproved";
"composting toilet" = "Unimproved";
"bucket toilet" = "Unimproved";
"hanging toilet/latrine"= "Unimproved";
"other" = "Unimproved"; else="Improved"', as.factor = F)
#Education
pedhs16$meduc<-Recode(pedhs16$hc61, recodes="'no education' ='0Noeducation'; 'primary'='1Primary'; 'secondary'='2Secodary/Higher'; 'technical/ vocational'='2Secodary/Higher'; 'higher'='2Secodary/Higher';else=NA", as.factor=T)
#pedhs16$pno_educ<-Recode(pedhs16$v106, recodes="'no education'=0;else=1", as.factor=T)
#Residence
pedhs16$res<-pedhs16$hv025
pedhs16$prec_b_interval<-pedhs16$hc63
pedhs16$prec_b_order<-pedhs16$hc64
pedhs16$mother_age<-Recode(pedhs16$v013,recodes="'15-19'='15-24'; '20-24'='15-24'; '25-29'='25-34';
'30-34'='25-34'; '35-39'='35-49'; '40-44'='35-49';'45-49'='35-49'; else=NA", as.factor=T)
pedhs16$Age<-Recode(pedhs16$hc1,recodes="0:29='< 30 months'; 30:60='> 30 months';else=NA", as.factor=T)
pedhs16$b_order<-Recode(pedhs16$hc64, recodes="1='1st-2nd'; 2='1st-2nd'; 3:14='3rd or later';else=NA", as.factor=T)
pedhs16$b_interval<-Recode(pedhs16$hc63, recodes="8:23='< 2 years'; 24:48='2-4 yrs'; 48:215='> 4 yrs';else=NA", as.factor=T)
pedhs16$sex<-pedhs16$hc27
#write.csv(pedhs16, '~//Google Drive/pedhs10.csv')
#time_to_water_source
pedhs16$time_water<-pedhs16$hv204
#wealth Index
pedhs16$mar_status<-Recode(pedhs16$v501, recodes="'married'='1in union';
'living with partner'='1in union';
'widowed'='1Not in union';
'divorced'='1Not in union';
'no longer living together/separated'='1Not in union';
'never in union '='1Not in union'; else=NA", as.factor=T)
#Wealth Index
pedhs16$wealth_index<-Recode(pedhs16$hv270, recodes="'poorest'='1Poor'; 'poorer'='1Poor';'middle'='2middle'; 'richer'='3rich'; 'richest'='3rich'; else=NA", as.factor=T)
#anemia-level
pedhs16$anemia<-Recode(pedhs16$hc57, recodes="'severe'='Yes'; 'moderate'='Yes'; 'mild'='Yes'; 'not anemic'='No'; else=NA", as.factor=F)
#anemia-level
pedhs16$diahrea<-Recode(pedhs16$h11, recodes='"no"="1No"; "yes, last two weeks"="2Yes";
"don\'t know"= "1No"; else=NA', as.factor=F)
#anemia-level
pedhs16$anemia<-Recode(pedhs16$hc57, recodes="'severe'='Yes'; 'moderate'='Yes'; 'mild'='Yes'; 'not anemic'='No'; else=NA", as.factor=F)
#Children Ever Born
pedhs16$parity<-Recode(pedhs16$v201, recodes="0='0'; 1:2='1-2'; 3:4='3-4'; 5:18='5+'; else=NA", as.factor=T)
pedhs16$fam_size<-Recode(pedhs16$hv009, recodes="0:3='1_Small(<4)'; 4:8='2_Medium (4-8) members) '; 9:19='3_>8 members';else=NA", as.factor=T)
pedhs16$family_size<-pedhs16$hv009
pedhs16$no_children<-Recode(pedhs16$hv014, recodes="1='1'; 2='2'; 3:10='3+'; else=NA", as.factor=T)
pedhs16$ceb<-pedhs16$v201
#Age at first Birth
pedhs16$age_fb<-Recode(pedhs16$v212, recodes="10:19='<20 years'; 20:49='20+years'; else=NA", as.factor=T)
#BMI
pedhs16$bmi<-ifelse(pedhs16$v445==9999 | pedhs16$v445==9998 , NA, pedhs16$v445/100)
pedhs16$bmi_rec<-Recode(pedhs16$bmi, recodes="0:18.49='Underweight'; 18.5:24.99='Normal';
25:49='Overweight'; else=NA", as.factor=T)
#Occupation
pedhs16$occup<-Recode(pedhs16$v717, recodes="'not working'='1Unemployed';
'professional/technical/managerial'='2Non-manual/Professional';
'clerical' = '2Non-manual/Professional';
'sales' = '2Non-manual/Professional';
'agricultural - employee'='3Agricultural/Manual(skilled/unskilled)';
'agricultural - self employed'='3Agricultural/Manual(skilled/unskilled)';
'skilled manual'='3Agricultural/Manual(skilled/unskilled)';
'unskilled manual'='3Agricultural/Manual(skilled/unskilled)';
'others'='3Agricultural/Manual(skilled/unskilled)';
else=NA", as.factor=T)
pedhs16$breastfed_1sthr<-Recode(pedhs16$m34, recodes="0='1Yes'; 100='1Yes'; 101:223='2No'; else=NA", as.factor=T)
#Partner"s/Husband's education level
pedhs16$pl_delivery<-Recode(pedhs16$m15, recodes='"home"="0Home"; "other home"="0Home";
"respondent\'s home"="0Home"; else="1Health facility"', as.factor=F)
pedhs16$region<-Recode(pedhs16$hv024, recodes="'tigray' ='Tigray'; 'afar'='Afar'; 'amhara'='Amhara'; 'oromia'='Oromia'; 'somali'='Somali';
'benishangul'='Benishangul';'dire dawa'='Dire Dawa';'addis adaba'='Addis Adaba';'snnpr'='SNNPR';'gambela'='Gambela';
'harari'='Harari';else=NA", as.factor=F)
write.csv(pedhs16, '~//Google Drive/pedhs10.csv')
pedhs16<-merge(pedhs16, ethsp4@data[,-37], by.x="region", by.y="REGNAME" ,all.x=T)
## identify the working variables and the all the edhs datasets
pedhs16$wt<-pedhs16$hv005/1000000
options(survey.lonely.psu = "adjust")
pedhs16$yrstratum<-paste(pedhs16$SVYYEAR, pedhs16$hv022, sep="-")
pedhs16$yrpsu<-paste(pedhs16$SVYYEAR, pedhs16$hv021, sep="-")
#table(pedhs16$yrstratum)
des<-svydesign(ids=~yrpsu, strata=~yrstratum, weights=~wt, data = pedhs16[is.na(pedhs16$wt)==F,], nest=T)
reg.est.s<-svyby(~stunt,~region+SVYYEAR,FUN = svymean, na.rm=T, design =des)
reg.est.stunt<-reshape(reg.est.s, idvar = "region", timevar = "SVYYEAR", direction = "wide")
reg.est.stunt
## region stunt.2016 se.2016
## Addis Adaba.2016 Addis Adaba 0.1459285 0.01925328
## Afar.2016 Afar 0.4110693 0.02464632
## Amhara.2016 Amhara 0.4632034 0.01941072
## Benishangul.2016 Benishangul 0.4274297 0.02203513
## Dire Dawa.2016 Dire Dawa 0.4019738 0.02895576
## Gambela.2016 Gambela 0.2352716 0.01826578
## Harari.2016 Harari 0.3195869 0.02520432
## Oromia.2016 Oromia 0.3647153 0.01700376
## SNNPR.2016 SNNPR 0.3863328 0.01982741
## Somali.2016 Somali 0.2743531 0.01738827
## Tigray.2016 Tigray 0.3927101 0.01806748
reg.est.w<-svyby(~wasted,~region+SVYYEAR,FUN = svymean, na.rm=T, design =des)
reg.est.wasted<-reshape(reg.est.w, idvar = "region", timevar = "SVYYEAR", direction = "wide")
reg.est.wasted
## region wasted.2016 se.2016
## Addis Adaba.2016 Addis Adaba 0.03472510 0.008311278
## Afar.2016 Afar 0.17760728 0.019688532
## Amhara.2016 Amhara 0.09718662 0.009674930
## Benishangul.2016 Benishangul 0.11151593 0.017453261
## Dire Dawa.2016 Dire Dawa 0.09888908 0.017358085
## Gambela.2016 Gambela 0.14006701 0.019506216
## Harari.2016 Harari 0.10744200 0.017425028
## Oromia.2016 Oromia 0.10434400 0.008757773
## SNNPR.2016 SNNPR 0.05966669 0.007108052
## Somali.2016 Somali 0.22522663 0.025358474
## Tigray.2016 Tigray 0.11013005 0.010672763
reg.est.u<-svyby(~underweight,~region+SVYYEAR,FUN = svymean, na.rm=T, design =des)
reg.est.uwgt<-reshape(reg.est.u, idvar = "region", timevar = "SVYYEAR", direction = "wide")
reg.est.uwgt
## region underweight.2016 se.2016
## Addis Adaba.2016 Addis Adaba 0.05034569 0.01539083
## Afar.2016 Afar 0.35547650 0.02909223
## Amhara.2016 Amhara 0.28210266 0.01696283
## Benishangul.2016 Benishangul 0.34069793 0.03049209
## Dire Dawa.2016 Dire Dawa 0.25680589 0.02431406
## Gambela.2016 Gambela 0.19200296 0.01981707
## Harari.2016 Harari 0.20501442 0.02110583
## Oromia.2016 Oromia 0.22238856 0.01307474
## SNNPR.2016 SNNPR 0.20644331 0.01691886
## Somali.2016 Somali 0.28968000 0.02088827
## Tigray.2016 Tigray 0.23039544 0.01463603
reg.est.c<-svyby(~combined,~region+SVYYEAR,FUN = svymean, na.rm=T, design =des)
reg.est.comb<-reshape(reg.est.c, idvar = "region", timevar = "SVYYEAR", direction = "wide")
reg.est.comb
## region combined.2016 se.2016
## Addis Adaba.2016 Addis Adaba 0.1833118 0.02340984
## Afar.2016 Afar 0.5434065 0.02718943
## Amhara.2016 Amhara 0.5402633 0.01781279
## Benishangul.2016 Benishangul 0.5147896 0.02794263
## Dire Dawa.2016 Dire Dawa 0.4839289 0.03138024
## Gambela.2016 Gambela 0.3563432 0.02476112
## Harari.2016 Harari 0.4136740 0.02949159
## Oromia.2016 Oromia 0.4546517 0.01840233
## SNNPR.2016 SNNPR 0.4367614 0.02022106
## Somali.2016 Somali 0.4662297 0.02492501
## Tigray.2016 Tigray 0.4813284 0.01854586
ethsp.w<-ethsp4
ethsp.u<-ethsp4
ethsp.c<-ethsp4
m.geo.stunt<-merge(ethsp4@data, reg.est.stunt, by.x="REGNAME", by.y="region" , sort=F)
m.geo.wasted<-merge(ethsp4@data, reg.est.wasted, by.x="REGNAME", by.y="region" , sort=F)
m.geo.uwgt<-merge(ethsp4@data, reg.est.uwgt, by.x="REGNAME", by.y="region" , sort=F)
m.geo.combined<-merge(ethsp4@data, reg.est.comb, by.x="REGNAME", by.y="region" , sort=F)
ethsp4@data<-m.geo.stunt
ethsp.w@data<-m.geo.wasted
ethsp.u@data<-m.geo.uwgt
ethsp.c@data<-m.geo.combined
library(sp)
brk.s<-quantile(m.geo.stunt$stunt.2016, probs = seq(0,1,length.out = 6), na.rm = T)
brk.w<-quantile(m.geo.wasted$wasted.2016, probs = seq(0,1,length.out = 6), na.rm = T)
brk.u<-quantile(m.geo.uwgt$underweight.2016, probs = seq(0,1,length.out = 6), na.rm = T)
brk.c<-quantile(m.geo.combined$combined.2016, probs = seq(0,1,length.out = 6), na.rm = T)
eth.geo.stunt<-st_as_sf(ethsp4)
eth.geo.wasted<-st_as_sf(ethsp.w)
eth.geo.uwgt<-st_as_sf(ethsp.u)
eth.geo.comb<-st_as_sf(ethsp.c)
eth.geo.stunt$stunt_q_16<-cut(eth.geo.stunt$stunt.2016, brk.s, include.lowest = T)
eth.geo.wasted$wasted_q_16<-cut(eth.geo.wasted$wasted.2016, brk.w, include.lowest = T)
eth.geo.uwgt$uwgt_q_16<-cut(eth.geo.uwgt$underweight.2016, brk.u, include.lowest = T)
eth.geo.comb$comb_q_16<-cut(eth.geo.comb$combined.2016, brk.c, include.lowest = T)
centroids<-coordinates(ethsp4)
centroids<-coordinates(ethsp.w)
centroids<-coordinates(ethsp.u)
centroids<-coordinates(ethsp.c)
eth.geo.stunt<-cbind(eth.geo.stunt, centroids)
eth.geo.wasted<-cbind(eth.geo.wasted, centroids)
eth.geo.uwgt<-cbind(eth.geo.uwgt, centroids)
eth.geo.comb<-cbind(eth.geo.comb, centroids)
p1<-eth.geo.stunt%>%
ggplot()+geom_sf(aes(fill=stunt_q_16))+ scale_colour_brewer(palette = "YlOrRd" )+scale_fill_brewer(palette = "YlOrRd", na.value="grey")+guides(fill=guide_legend(title="U5-Stunting"))+geom_text(aes(label = DHSREGEN, x = X1, y = X2), cex=5)+ylab(label = "")+xlab(label = "")+ggtitle(label = "Spatial Distribution of Stunting by Regions of Ethiopia, EDHS 2016", subtitle = "")+theme(legend.text = element_text(size =10))+theme(plot.title = element_text(color="Blue", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"), strip.text.x = element_text(size = 10))+theme(plot.title = element_text(size=10))
p1
p2<-eth.geo.wasted%>%
ggplot()+geom_sf(aes(fill=wasted_q_16))+ scale_colour_brewer(palette = "YlOrRd" )+scale_fill_brewer(palette = "YlOrRd", na.value="grey")+guides(fill=guide_legend(title="U5-Wasting"))+geom_text(aes(label = DHSREGEN, x = X1, y = X2), cex=5)+ylab(label = "")+xlab(label = "")+ggtitle(label = "Spatial Distribution of Wasting by Regions of Ethiopia, EDHS 2016", subtitle = "")+theme(legend.text = element_text(size =10))+theme(plot.title = element_text(color="Blue", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"), strip.text.x = element_text(size = 10))+theme(plot.title = element_text(size=10))
p2
p3<-eth.geo.uwgt%>%
ggplot()+geom_sf(aes(fill=uwgt_q_16))+ scale_colour_brewer(palette = "YlOrRd" )+scale_fill_brewer(palette = "YlOrRd", na.value="grey")+guides(fill=guide_legend(title="U5-Underweight"))+geom_text(aes(label = DHSREGEN, x = X1, y = X2), cex=5)+ylab(label = "")+xlab(label = "")+ggtitle(label = "Spatial Distribution of Underweight by Regions of Ethiopia, EDHS 2016", subtitle = "")+theme(legend.text = element_text(size =10))+theme(plot.title = element_text(color="Blue", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"), strip.text.x = element_text(size = 10))+theme(plot.title = element_text(size=10))
p3
p4<-eth.geo.comb%>%
ggplot()+geom_sf(aes(fill=comb_q_16))+ scale_colour_brewer(palette = "YlOrRd" )+scale_fill_brewer(palette = "YlOrRd", na.value="grey")+guides(fill=guide_legend(title="Combined Risk"))+geom_text(aes(label = DHSREGEN, x = X1, y = X2), cex=5)+ylab(label = "")+xlab(label = "")+ggtitle(label = "Spatial Distribution of Combined Malnutrition Risk by Regions of Ethiopia, EDHS 2016", subtitle = "")+theme(legend.text = element_text(size =10))+theme(plot.title = element_text(color="Blue", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"), strip.text.x = element_text(size = 10))+theme(plot.title = element_text(size=10))
p4
#ggsave(plot=p1,filename = "~/Google Drive/PhD_Courses/Spring 2019/PublicHealth/FinalProject/Under_5_Mortality.png", width = 18, height = 12,units = "in", dpi=100)
library(leaflet)
library(sf)
library(sp)
library(raster)
library(rgeos)
popch = paste0("<strong>", eth.geo.stunt$DHSREGEN, "</strong>", "</br>", eth.geo.stunt$CNNUTSCHA2)
bins <- c(30,40, 50,60, 70,80, 90,100, 110, 130)
bin <- c(3, 5, 10, 15, 20, 25,30,40, 50,60,70, 100, Inf)
pall <- colorBin("YlOrRd", domain = eth.geo.stunt$CNNUTSCHA2, bins = bins)
mypal <- colorQuantile(palette = "YlOrRd", domain = eth.geo.stunt$CNNUTSCHA2, n = 6, reverse = FALSE)
eth.geo.stunt$pch_mort<-eth.geo.stunt$CNNUTSCHA2*100
U5S <- leaflet(eth.geo.stunt) %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
#setView(lat = 9, lng = 41, zoom = 6.2) %>%
setView(lat = 9.292, lng = 40.71 , zoom = 6.2) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(data = eth.geo.stunt, fillColor = ~mypal(eth.geo.stunt$CNNUTSCHA2),
fillOpacity = 1,
weight = 1,
color = "white",
popup = popch)%>%
addLabelOnlyMarkers(data = eth.geo.stunt, lng = ~X1, lat = ~X2, label = ~DHSREGEN,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE)) %>%
addLegend(pal = mypal, values = ~CNNUTSCHA2, title = "U5 Stunting",opacity = 0.5, position = "bottomright")
U5S
library(leaflet)
library(sf)
library(sp)
library(raster)
library(rgeos)
popch = paste0("<strong>", eth.geo.wasted$DHSREGEN, "</strong>", "</br>", eth.geo.wasted$CNNUTSCWH2)
bins <- c(30,40, 50,60, 70,80, 90,100, 110, 130)
bin <- c(3, 5, 10, 15, 20, 25,30,40, 50,60,70, 100, Inf)
pall <- colorBin("YlOrRd", domain = eth.geo.wasted$CNNUTSCWH2, bins = bins)
mypal <- colorQuantile(palette = "YlOrRd", domain = eth.geo.wasted$CNNUTSCWH2, n = 6, reverse = FALSE)
eth.geo.wasted$pch_mort<-eth.geo.wasted$CNNUTSCWH2*100
U5W<- leaflet(eth.geo.wasted) %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
#setView(lat = 9, lng = 41, zoom = 6.2) %>%
setView(lat = 9.292, lng = 40.71 , zoom = 6.2) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(data = eth.geo.wasted, fillColor = ~mypal(eth.geo.wasted$CNNUTSCWH2),
fillOpacity = 1,
weight = 1,
color = "white",
popup = popch)%>%
addLabelOnlyMarkers(data = eth.geo.wasted, lng = ~X1, lat = ~X2, label = ~DHSREGEN,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE)) %>%
addLegend(pal = mypal, values = ~CNNUTSCWH2, title = "U5 Wasted",opacity = 0.5, position = "bottomright")
U5W
library(leaflet)
library(sf)
library(sp)
library(raster)
library(rgeos)
popch = paste0("<strong>", eth.geo.uwgt$DHSREGEN, "</strong>", "</br>", eth.geo.stunt$CNNUTSCWA2)
bins <- c(30,40, 50,60, 70,80, 90,100, 110, 130)
bin <- c(3, 5, 10, 15, 20, 25,30,40, 50,60,70, 100, Inf)
pall <- colorBin("YlOrRd", domain = eth.geo.stunt$CNNUTSCWA2, bins = bins)
mypal <- colorQuantile(palette = "YlOrRd", domain = eth.geo.uwgt$CNNUTSCWA2, n = 6, reverse = FALSE)
eth.geo.uwgt$pch_mort<-eth.geo.uwgt$CNNUTSCWA2*100
U5U<- leaflet(eth.geo.uwgt) %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
#setView(lat = 9, lng = 41, zoom = 6.2) %>%
setView(lat = 9.292, lng = 40.71 , zoom = 6.2) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(data = eth.geo.uwgt, fillColor = ~mypal(eth.geo.uwgt$CNNUTSCWA2),
fillOpacity = 1,
weight = 1,
color = "white",
popup = popch)%>%
addLabelOnlyMarkers(data = eth.geo.uwgt, lng = ~X1, lat = ~X2, label = ~DHSREGEN,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE)) %>%
addLegend(pal = mypal, values = ~CNNUTSCWA2, title = "U5 Underweight",opacity = 0.5, position = "bottomright")
U5U
library(leaflet)
library(sf)
library(sp)
library(raster)
library(rgeos)
bins <- c(30,40, 50,60, 70,80, 90,100, 110, 130)
bin <- c(3, 5, 10, 15, 20, 25,30,40, 50,60,70, 100, Inf)
pall <- colorBin("YlOrRd", domain = eth.geo.comb$combined.2016, bins = bins)
eth.geo.comb$combined<-eth.geo.comb$combined.2016*100
popch = paste0("<strong>", eth.geo.comb$DHSREGEN, "</strong>", "</br>", eth.geo.comb$combined)
mypal <- colorQuantile(palette = "YlOrRd", domain = eth.geo.comb$combined, n = 6, reverse = FALSE)
U5C<- leaflet(eth.geo.comb) %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
#setView(lat = 9, lng = 41, zoom = 6.2) %>%
setView(lat = 9.292, lng = 40.71 , zoom = 6.2) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(data = eth.geo.comb, fillColor = ~mypal(eth.geo.comb$combined),
fillOpacity = 1,
weight = 1,
color = "white",
popup = popch)%>%
addLabelOnlyMarkers(data = eth.geo.comb, lng = ~X1, lat = ~X2, label = ~DHSREGEN,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE)) %>%
addLegend(pal = mypal, values = ~combined, title = "U5 Combined Malnutrition",opacity = 0.5, position = "bottomright")
U5C
#Calculate percentage by background characterstics for all years
prop.table(cat<-svytable(~Age+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## Age 2016
## < 30 months 50.35385
## > 30 months 49.64615
prop.table(cat<-svytable(~age_fb+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## age_fb 2016
## <20 years 64.63254
## 20+years 35.36746
prop.table(cat<-svytable(~ stunt+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## stunt 2016
## 0 61.6386
## 1 38.3614
prop.table(cat<-svytable(~ wasted+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## wasted 2016
## 0 90.186707
## 1 9.813293
prop.table(cat<-svytable(~ underweight+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## underweight 2016
## 0 76.67906
## 1 23.32094
prop.table(cat<-svytable(~ combined+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## combined 2016
## 0 53.40146
## 1 46.59854
prop.table(cat<-svytable(~sex+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## sex 2016
## male 51.12845
## female 48.87155
prop.table(cat<-svytable(~meduc+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## meduc 2016
## 0Noeducation 65.878324
## 1Primary 27.099239
## 2Secodary/Higher 7.022437
prop.table(cat<-svytable(~res+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## res 2016
## urban 10.90268
## rural 89.09732
prop.table(cat<-svytable(~watersource+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## watersource 2016
## Improved 56.16005
## Unimproved 43.83995
prop.table(cat<-svytable(~toilet_facility+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## toilet_facility 2016
## Improved 9.140233
## Unimproved 90.859767
prop.table(cat<-svytable(~b_order+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## b_order 2016
## 1st-2nd 34.6108
## 3rd or later 65.3892
prop.table(cat<-svytable(~b_interval+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## b_interval 2016
## < 2 years 19.09225
## > 4 yrs 25.08676
## 2-4 yrs 55.82099
prop.table(cat<-svytable(~wealth_index+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## wealth_index 2016
## 1Poor 46.31847
## 2middle 20.82901
## 3rich 32.85252
prop.table(cat<-svytable(~pl_delivery+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## pl_delivery 2016
## 0Home 67.22107
## 1Health facility 32.77893
prop.table(cat<-svytable(~bmi_rec+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## bmi_rec 2016
## Normal 74.249211
## Overweight 6.140153
## Underweight 19.610636
prop.table(cat<-svytable(~family_size+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## family_size 2016
## 2 0.87570691
## 3 8.68116119
## 4 16.24540971
## 5 17.23706530
## 6 17.80270857
## 7 14.73567613
## 8 11.41752444
## 9 7.13908670
## 10 2.88538122
## 11 1.32671733
## 12 0.86168118
## 13 0.31801751
## 14 0.17535449
## 15 0.18026642
## 16 0.08720861
## 17 0.01581034
## 19 0.01522393
prop.table(cat<-svytable(~no_children+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## no_children 2016
## 1 37.76292
## 2 44.88307
## 3+ 17.35401
prop.table(cat<-svytable(~diahrea+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## diahrea 2016
## 1No 87.98408
## 2Yes 12.01592
prop.table(cat<-svytable(~region+SVYYEAR,design=des), margin = 2)*100
## SVYYEAR
## region 2016
## Addis Adaba 2.0787686
## Afar 0.9402692
## Amhara 20.1131706
## Benishangul 1.0223200
## Dire Dawa 0.3866153
## Gambela 0.2214537
## Harari 0.1919811
## Oromia 43.2798556
## SNNPR 21.0838728
## Somali 4.0201735
## Tigray 6.6615196
pedhs16_c<-subset(pedhs16,select=c("age","Age","res","sex", "stunt","stunt.r", "b_interval", "stunting","wasted","underweight", "b_order", "combined", "meduc","wt", "wealth_index","ceb", "family_size", "mother_age", "occup","age_fb", "watersource","time_water", "wasting", "toilet_facility","region", "pl_delivery", "diahrea", "bmi_rec","fam_size", "no_children", "age_fb", "v717", "m15","v501", "hv201", "hc1", "hv205", "hc61", "hv025", "v212", "hc63", "hc64", "v013", "hc27", "hc63", "hc64", "hv014", "hv024", "hv204", "hv270", "h11", "v201", "hv009", "v445"))
pedhs16_c<-na.omit(pedhs16_c)
library(caret)
library(randomForest)
set.seed(22222)
intrain.s<-createDataPartition(y=pedhs16_c$stunt, p=.70, list=F)
train.s <- pedhs16_c[intrain.s,]
test.s<- pedhs16_c[-intrain.s,]
intrain.w<-createDataPartition(y=pedhs16_c$wasted, p=.60, list=F)
train.w <- pedhs16_c[intrain.w,]
test.w<- pedhs16_c[-intrain.w,]
intrain.u<-createDataPartition(y=pedhs16_c$underweight, p=.70, list=F)
train.u <- pedhs16_c[intrain.u,]
test.u<- pedhs16_c[-intrain.u,]
intrain.c<-createDataPartition(y=pedhs16_c$combined, p=.70, list=F)
train.c <- pedhs16_c[intrain.c,]
test.c<- pedhs16_c[-intrain.c,]
library(lme4)
library(arm)
library(lmerTest)
fit.mix.s<-glmer(stunt~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+(1|region), train.s, weights=train.s$wt,family=binomial)
display(fit.mix.s, detail=T)
## glmer(formula = stunt ~ Age + sex + age_fb + res + meduc + watersource +
## time_water + b_order + b_interval + wealth_index + toilet_facility +
## pl_delivery + bmi_rec + family_size + no_children + diahrea +
## (1 | region), data = train.s, family = binomial, weights = train.s$wt)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) 0.11 0.27 0.41 0.68
## Age> 30 months 0.63 0.06 9.87 0.00
## sexfemale -0.16 0.06 -2.69 0.01
## age_fb20+years -0.06 0.07 -0.92 0.36
## resrural -0.19 0.16 -1.25 0.21
## meduc1Primary -0.21 0.08 -2.68 0.01
## meduc2Secodary/Higher -0.60 0.21 -2.90 0.00
## watersourceUnimproved -0.07 0.07 -1.15 0.25
## time_water 0.00 0.00 -2.21 0.03
## b_order3rd or later 0.02 0.09 0.21 0.84
## b_interval> 4 yrs -0.43 0.11 -4.12 0.00
## b_interval2-4 yrs -0.38 0.08 -4.67 0.00
## wealth_index2middle -0.29 0.08 -3.63 0.00
## wealth_index3rich -0.42 0.08 -5.14 0.00
## toilet_facilityUnimproved 0.14 0.15 0.98 0.33
## pl_delivery1Health facility -0.16 0.09 -1.89 0.06
## bmi_recOverweight -0.33 0.16 -2.09 0.04
## bmi_recUnderweight 0.25 0.08 3.21 0.00
## family_size -0.03 0.02 -1.64 0.10
## no_children2 0.04 0.08 0.46 0.65
## no_children3+ -0.21 0.11 -1.89 0.06
## diahrea2Yes 0.14 0.10 1.49 0.14
##
## Error terms:
## Groups Name Std.Dev.
## region (Intercept) 0.26
## Residual 1.00
## ---
## number of obs: 4802, groups: region, 11
## AIC = 6304.9, DIC = 7079.4
## deviance = 6669.2
exp(fixef(fit.mix.s)[-1])
## Age> 30 months sexfemale
## 1.8797997 0.8491836
## age_fb20+years resrural
## 0.9383437 0.8231073
## meduc1Primary meduc2Secodary/Higher
## 0.8120705 0.5475864
## watersourceUnimproved time_water
## 0.9278117 0.9996689
## b_order3rd or later b_interval> 4 yrs
## 1.0184773 0.6472918
## b_interval2-4 yrs wealth_index2middle
## 0.6817012 0.7516247
## wealth_index3rich toilet_facilityUnimproved
## 0.6581978 1.1522166
## pl_delivery1Health facility bmi_recOverweight
## 0.8491396 0.7202198
## bmi_recUnderweight family_size
## 1.2801233 0.9702389
## no_children2 no_children3+
## 1.0373795 0.8145929
## diahrea2Yes
## 1.1533205
exp(confint(fit.mix.s, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## (Intercept) 0.6562476 1.9069293
## Age> 30 months 1.6583195 2.1308602
## sexfemale 0.7537858 0.9566548
## age_fb20+years 0.8198654 1.0739433
## resrural 0.6058534 1.1182665
## meduc1Primary 0.6973680 0.9456393
## meduc2Secodary/Higher 0.3644332 0.8227869
## watersourceUnimproved 0.8161335 1.0547717
## time_water 0.9993754 0.9999625
## b_order3rd or later 0.8555829 1.2123853
## b_interval> 4 yrs 0.5263724 0.7959890
## b_interval2-4 yrs 0.5804882 0.8005615
## wealth_index2middle 0.6441328 0.8770546
## wealth_index3rich 0.5611353 0.7720497
## toilet_facilityUnimproved 0.8667017 1.5317878
## pl_delivery1Health facility 0.7167588 1.0059702
## bmi_recOverweight 0.5294697 0.9796906
## bmi_recUnderweight 1.1008494 1.4885922
## family_size 0.9357517 1.0059971
## no_children2 0.8867672 1.2135724
## no_children3+ 0.6587067 1.0073704
## diahrea2Yes 0.9559023 1.3915107
fit.s<-glm(stunt~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+region, train.s, weights=train.s$wt,family=binomial)
summary(fit.s)
##
## Call:
## glm(formula = stunt ~ Age + sex + age_fb + res + meduc + watersource +
## time_water + b_order + b_interval + wealth_index + toilet_facility +
## pl_delivery + bmi_rec + family_size + no_children + diahrea +
## region, family = binomial, data = train.s, weights = train.s$wt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5041 -0.8562 -0.2713 0.5454 3.6701
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2840555 0.4281200 -0.663 0.50701
## Age> 30 months 0.6272541 0.0618239 10.146 < 2e-16 ***
## sexfemale -0.1640445 0.0588894 -2.786 0.00534 **
## age_fb20+years -0.0500792 0.0663729 -0.755 0.45054
## resrural -0.2372241 0.1472015 -1.612 0.10706
## meduc1Primary -0.2066572 0.0759290 -2.722 0.00649 **
## meduc2Secodary/Higher -0.6324261 0.1974690 -3.203 0.00136 **
## watersourceUnimproved -0.0575911 0.0636065 -0.905 0.36524
## time_water -0.0003030 0.0001422 -2.130 0.03316 *
## b_order3rd or later 0.0212012 0.0858925 0.247 0.80504
## b_interval> 4 yrs -0.4580140 0.1019896 -4.491 7.10e-06 ***
## b_interval2-4 yrs -0.3759730 0.0786132 -4.783 1.73e-06 ***
## wealth_index2middle -0.2918836 0.0776376 -3.760 0.00017 ***
## wealth_index3rich -0.4170731 0.0799339 -5.218 1.81e-07 ***
## toilet_facilityUnimproved 0.0709825 0.1343625 0.528 0.59730
## pl_delivery1Health facility -0.1528801 0.0840597 -1.819 0.06896 .
## bmi_recOverweight -0.2933491 0.1463661 -2.004 0.04505 *
## bmi_recUnderweight 0.2497476 0.0740959 3.371 0.00075 ***
## family_size -0.0279581 0.0178042 -1.570 0.11634
## no_children2 0.0263130 0.0777530 0.338 0.73505
## no_children3+ -0.2310168 0.1043433 -2.214 0.02683 *
## diahrea2Yes 0.1356023 0.0931520 1.456 0.14547
## regionAfar 0.3470527 0.4899757 0.708 0.47876
## regionAmhara 0.9330390 0.3951521 2.361 0.01822 *
## regionBenishangul 0.6432586 0.4756408 1.352 0.17625
## regionDire Dawa 0.8808982 0.6318780 1.394 0.16329
## regionGambela -0.3332688 0.8698620 -0.383 0.70162
## regionHarari 0.4032397 0.8211255 0.491 0.62337
## regionOromia 0.4961953 0.3944231 1.258 0.20838
## regionSNNPR 0.4744647 0.3951094 1.201 0.22981
## regionSomali -0.1782729 0.4141975 -0.430 0.66690
## regionTigray 0.6912256 0.4039281 1.711 0.08703 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7061.4 on 4801 degrees of freedom
## Residual deviance: 6664.8 on 4770 degrees of freedom
## AIC: 6483
##
## Number of Fisher Scoring iterations: 4
exp(cbind(coef(fit.s), confint(fit.s)))
## 2.5 % 97.5 %
## (Intercept) 0.7527249 0.3095606 1.6821724
## Age> 30 months 1.8724620 1.6590902 2.1141288
## sexfemale 0.8487042 0.7561411 0.9525033
## age_fb20+years 0.9511541 0.8349433 1.0830992
## resrural 0.7888145 0.5912901 1.0534148
## meduc1Primary 0.8132984 0.7004936 0.9433995
## meduc2Secodary/Higher 0.5313012 0.3577036 0.7768312
## watersourceUnimproved 0.9440358 0.8333227 1.0693252
## time_water 0.9996971 0.9994165 0.9999743
## b_order3rd or later 1.0214275 0.8633776 1.2090849
## b_interval> 4 yrs 0.6325386 0.5178054 0.7723743
## b_interval2-4 yrs 0.6866209 0.5885506 0.8010186
## wealth_index2middle 0.7468555 0.6411743 0.8693023
## wealth_index3rich 0.6589728 0.5631654 0.7704432
## toilet_facilityUnimproved 1.0735624 0.8265019 1.4001711
## pl_delivery1Health facility 0.8582326 0.7274207 1.0114179
## bmi_recOverweight 0.7457617 0.5573056 0.9898774
## bmi_recUnderweight 1.2837014 1.1099707 1.4841539
## family_size 0.9724291 0.9389669 1.0068575
## no_children2 1.0266622 0.8815801 1.1957857
## no_children3+ 0.7937261 0.6467368 0.9736370
## diahrea2Yes 1.1452264 0.9534223 1.3738346
## regionAfar 1.4148914 0.5527415 3.8261917
## regionAmhara 2.5422233 1.2210671 5.8419810
## regionBenishangul 1.9026708 0.7681820 5.0235480
## regionDire Dawa 2.4130661 0.6981855 8.5079493
## regionGambela 0.7165775 0.1019823 3.5482558
## regionHarari 1.4966657 0.2626331 7.1699873
## regionOromia 1.6424603 0.7900992 3.7695488
## regionSNNPR 1.6071537 0.7719843 3.6928372
## regionSomali 0.8367140 0.3853086 1.9848902
## regionTigray 1.9961605 0.9405167 4.6552403
library(ROCR)
test_arr_predict <- predict(fit.mix.s, test.s, type="response")
#create a data frame of out-sample(test set) reference value and predicted value
out_sample_arr <- data.frame(labels = test.s$stunt, predictions = test_arr_predict)
#get the performance of the in-sample prediction
pred.out_sample_arr <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")
#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(sensitivity = y[[ind]], specificity = 1-x[[ind]],
cutoff = p[[ind]])
}, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
## [,1]
## sensitivity 0.7078788
## specificity 0.5154221
## cutoff 0.3430460
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass = ifelse(out_sample_arr$predictions >0.6365748,1,0)
# Confusion matrix
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 1224 8
## 1 808 17
##
## Accuracy : 0.6033
## 95% CI : (0.5818, 0.6245)
## No Information Rate : 0.9878
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0168
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.60236
## Specificity : 0.68000
## Pos Pred Value : 0.99351
## Neg Pred Value : 0.02061
## Prevalence : 0.98785
## Detection Rate : 0.59504
## Detection Prevalence : 0.59893
## Balanced Accuracy : 0.64118
##
## 'Positive' Class : 0
##
library(ROCR)
test_arr_predict <- predict(fit.s, test.s, type="response")
#create a data frame of out-sample(test set) reference value and predicted value
out_sample_arr <- data.frame(labels = test.s$stunt, predictions = test_arr_predict)
#get the performance of the in-sample prediction
pred.out_sample_arr <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")
#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(sensitivity = y[[ind]], specificity = 1-x[[ind]],
cutoff = p[[ind]])
}, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
## [,1]
## sensitivity 0.6703030
## specificity 0.5478896
## cutoff 0.3419350
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass = ifelse(out_sample_arr$predictions >0.6365748,1,0)
# Confusion matrix
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 1218 14
## 1 800 25
##
## Accuracy : 0.6043
## 95% CI : (0.5828, 0.6255)
## No Information Rate : 0.981
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0225
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6036
## Specificity : 0.6410
## Pos Pred Value : 0.9886
## Neg Pred Value : 0.0303
## Prevalence : 0.9810
## Detection Rate : 0.5921
## Detection Prevalence : 0.5989
## Balanced Accuracy : 0.6223
##
## 'Positive' Class : 0
##
library(party)
library(randomForest)
controlDTree <- trainControl(method="cv", 5)
train.s= na.omit(train.s)
train.s$stunting <- factor(train.s$stunting)
rf2 <-randomForest(stunting~v717+m15+v501+hv201+hc1+hv205+hc61+hv025+v212+hc63+hc64+v013+hc27+hc63+hc64+hv014+hv024+hv204+hv270+h11+v201+hv009+v445, data=train.s)
print(rf2)
##
## Call:
## randomForest(formula = stunting ~ v717 + m15 + v501 + hv201 + hc1 + hv205 + hc61 + hv025 + v212 + hc63 + hc64 + v013 + hc27 + hc63 + hc64 + hv014 + hv024 + hv204 + hv270 + h11 + v201 + hv009 + v445, data = train.s)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 33.59%
## Confusion matrix:
## no yes class.error
## no 2486 584 0.1902280
## yes 1029 703 0.5941109
print(importance(rf2,type = 2))
## MeanDecreaseGini
## v717 74.07013
## m15 61.76096
## v501 19.58124
## hv201 168.94126
## hc1 320.66027
## hv205 58.80136
## hc61 39.55445
## hv025 11.32081
## v212 136.34160
## hc63 197.66523
## hc64 92.18071
## v013 98.41926
## hc27 34.54762
## hv014 58.50836
## hv024 177.45771
## hv204 127.27707
## hv270 85.92757
## h11 20.94379
## v201 91.05105
## hv009 102.17918
## v445 219.44327
pred = predict(rf2, newdata=test.s)
accuracy <- table(pred, test.s[,"stunting"])
sum(diag(accuracy))/sum(accuracy)
## [1] 0.6349052
confusionMatrix(data=pred, as.factor(test.s$stunting))
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1002 521
## yes 230 304
##
## Accuracy : 0.6349
## 95% CI : (0.6137, 0.6557)
## No Information Rate : 0.5989
## P-Value [Acc > NIR] : 0.0004455
##
## Kappa : 0.193
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8133
## Specificity : 0.3685
## Pos Pred Value : 0.6579
## Neg Pred Value : 0.5693
## Prevalence : 0.5989
## Detection Rate : 0.4871
## Detection Prevalence : 0.7404
## Balanced Accuracy : 0.5909
##
## 'Positive' Class : no
##
test_prob = predict(fit.mix.s, newdata = test.s, type = "response")
test_roc = roc(test.s$stunt ~ test_prob, plot = TRUE, print.auc = TRUE,percent=TRUE)
test_prob2 = predict(fit.s, newdata = test.s, type = "response")
test_roc2 = roc(test.s$stunt ~ test_prob2, plot = TRUE, print.auc = TRUE,percent=TRUE)
test_prob5 = predict(rf2, newdata=test.s, type = "prob")
test_roc5= roc(test.s$stunting, test_prob5[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
plot(test_roc, print.auc = TRUE, col='blue',percent=TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=2, lwd=3, col="black")
legend("topleft", legend = c("Hierarchical Model","Random Forest Model","Logistic Regression Model") , pch = 15, bty = 'n', col = c("blue","darkgoldenrod4", "red"))
library(lme4)
library(arm)
library(lmerTest)
fit.mix.w<-glmer(wasted~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+(1|region), train.w, weights=train.w$wt,family=binomial)
display(fit.mix.w, detail=T)
## glmer(formula = wasted ~ Age + sex + age_fb + res + meduc + watersource +
## time_water + b_order + b_interval + wealth_index + toilet_facility +
## pl_delivery + bmi_rec + family_size + no_children + diahrea +
## (1 | region), data = train.w, family = binomial, weights = train.w$wt)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) -1.90 0.46 -4.15 0.00
## Age> 30 months -0.65 0.11 -5.91 0.00
## sexfemale 0.09 0.10 0.90 0.37
## age_fb20+years 0.06 0.11 0.55 0.58
## resrural -1.07 0.26 -4.10 0.00
## meduc1Primary -0.06 0.14 -0.44 0.66
## meduc2Secodary/Higher 0.33 0.35 0.93 0.35
## watersourceUnimproved 0.13 0.11 1.15 0.25
## time_water 0.00 0.00 -0.74 0.46
## b_order3rd or later 0.44 0.17 2.54 0.01
## b_interval> 4 yrs -0.36 0.19 -1.88 0.06
## b_interval2-4 yrs -0.07 0.13 -0.52 0.61
## wealth_index2middle -0.08 0.13 -0.64 0.52
## wealth_index3rich -0.61 0.15 -4.03 0.00
## toilet_facilityUnimproved 0.34 0.28 1.22 0.22
## pl_delivery1Health facility -0.29 0.15 -1.95 0.05
## bmi_recOverweight -0.95 0.36 -2.65 0.01
## bmi_recUnderweight 0.46 0.12 3.79 0.00
## family_size 0.06 0.03 1.94 0.05
## no_children2 0.17 0.15 1.14 0.25
## no_children3+ 0.39 0.18 2.17 0.03
## diahrea2Yes -0.03 0.16 -0.19 0.85
##
## Error terms:
## Groups Name Std.Dev.
## region (Intercept) 0.25
## Residual 1.00
## ---
## number of obs: 4116, groups: region, 11
## AIC = 2771.3, DIC = 3230.5
## deviance = 2977.9
exp(fixef(fit.mix.w)[-1])
## Age> 30 months sexfemale
## 0.5223870 1.0976926
## age_fb20+years resrural
## 1.0647018 0.3442388
## meduc1Primary meduc2Secodary/Higher
## 0.9415679 1.3856082
## watersourceUnimproved time_water
## 1.1336268 0.9997998
## b_order3rd or later b_interval> 4 yrs
## 1.5550297 0.6981660
## b_interval2-4 yrs wealth_index2middle
## 0.9335114 0.9214214
## wealth_index3rich toilet_facilityUnimproved
## 0.5459582 1.4096182
## pl_delivery1Health facility bmi_recOverweight
## 0.7478579 0.3852672
## bmi_recUnderweight family_size
## 1.5874810 1.0597323
## no_children2 no_children3+
## 1.1856966 1.4814105
## diahrea2Yes
## 0.9699378
exp(confint(fit.mix.w, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## (Intercept) 0.06109223 0.3675539
## Age> 30 months 0.42112487 0.6479983
## sexfemale 0.89697313 1.3433280
## age_fb20+years 0.85205993 1.3304110
## resrural 0.20674733 0.5731652
## meduc1Primary 0.71951496 1.2321498
## meduc2Secodary/Higher 0.69848775 2.7486667
## watersourceUnimproved 0.91484469 1.4047299
## time_water 0.99926869 1.0003312
## b_order3rd or later 1.10597760 2.1864073
## b_interval> 4 yrs 0.47959947 1.0163394
## b_interval2-4 yrs 0.71878444 1.2123850
## wealth_index2middle 0.71719718 1.1837992
## wealth_index3rich 0.40669384 0.7329110
## toilet_facilityUnimproved 0.81284626 2.4445255
## pl_delivery1Health facility 0.55804355 1.0022362
## bmi_recOverweight 0.19039903 0.7795776
## bmi_recUnderweight 1.24965274 2.0166370
## family_size 0.99953621 1.1235536
## no_children2 0.88514621 1.5882986
## no_children3+ 1.03904056 2.1121189
## diahrea2Yes 0.70510420 1.3342417
fit.w<-glm(wasted~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+region, train.w, weights=train.w$wt,family=binomial)
summary(fit.w)
##
## Call:
## glm(formula = wasted ~ Age + sex + age_fb + res + meduc + watersource +
## time_water + b_order + b_interval + wealth_index + toilet_facility +
## pl_delivery + bmi_rec + family_size + no_children + diahrea +
## region, family = binomial, data = train.w, weights = train.w$wt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7037 -0.5345 -0.3086 -0.1230 5.0066
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0638494 0.7498783 -2.752 0.005919 **
## Age> 30 months -0.6067746 0.1042004 -5.823 5.77e-09 ***
## sexfemale 0.0431081 0.0978459 0.441 0.659523
## age_fb20+years 0.0701305 0.1075497 0.652 0.514352
## resrural -0.9580582 0.2344433 -4.087 4.38e-05 ***
## meduc1Primary -0.0460254 0.1318208 -0.349 0.726976
## meduc2Secodary/Higher 0.3037539 0.3264259 0.931 0.352089
## watersourceUnimproved 0.1199635 0.1045641 1.147 0.251269
## time_water -0.0002752 0.0002476 -1.112 0.266342
## b_order3rd or later 0.3811509 0.1617866 2.356 0.018479 *
## b_interval> 4 yrs -0.3196613 0.1830781 -1.746 0.080804 .
## b_interval2-4 yrs -0.0403607 0.1259107 -0.321 0.748551
## wealth_index2middle -0.0387915 0.1250856 -0.310 0.756470
## wealth_index3rich -0.5540636 0.1452054 -3.816 0.000136 ***
## toilet_facilityUnimproved 0.2571811 0.2393162 1.075 0.282532
## pl_delivery1Health facility -0.2541685 0.1425645 -1.783 0.074614 .
## bmi_recOverweight -0.8028960 0.3007523 -2.670 0.007594 **
## bmi_recUnderweight 0.4376308 0.1152202 3.798 0.000146 ***
## family_size 0.0509635 0.0283920 1.795 0.072654 .
## no_children2 0.1753368 0.1428609 1.227 0.219700
## no_children3+ 0.3486659 0.1724537 2.022 0.043198 *
## diahrea2Yes 0.0189803 0.1545471 0.123 0.902256
## regionAfar 0.5892031 0.7895075 0.746 0.455491
## regionAmhara 0.2905587 0.6911064 0.420 0.674175
## regionBenishangul 0.2132111 0.8204631 0.260 0.794967
## regionDire Dawa 0.0152304 1.1598419 0.013 0.989523
## regionGambela 0.1853093 1.1974548 0.155 0.877016
## regionHarari 0.5700444 1.2101069 0.471 0.637591
## regionOromia 0.2002930 0.6904335 0.290 0.771742
## regionSNNPR -0.3118742 0.6947639 -0.449 0.653509
## regionSomali 0.8315152 0.6963894 1.194 0.232463
## regionTigray 0.2818949 0.7037805 0.401 0.688756
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3168.0 on 4115 degrees of freedom
## Residual deviance: 2973.1 on 4084 degrees of freedom
## AIC: 2849.4
##
## Number of Fisher Scoring iterations: 5
exp(cbind(coef(fit.w), confint(fit.w)))
## 2.5 % 97.5 %
## (Intercept) 0.1269643 0.02355019 0.4873491
## Age> 30 months 0.5451062 0.44380928 0.6678502
## sexfemale 1.0440508 0.86192891 1.2651049
## age_fb20+years 1.0726482 0.86713583 1.3221742
## resrural 0.3836371 0.24396980 0.6125594
## meduc1Primary 0.9550177 0.73436695 1.2318303
## meduc2Secodary/Higher 1.3549355 0.69471575 2.5122666
## watersourceUnimproved 1.1274557 0.91878629 1.3845514
## time_water 0.9997248 0.99922665 1.0001984
## b_order3rd or later 1.4639685 1.07242401 2.0237463
## b_interval> 4 yrs 0.7263950 0.50650846 1.0387609
## b_interval2-4 yrs 0.9604430 0.75240787 1.2330221
## wealth_index2middle 0.9619512 0.75065285 1.2262007
## wealth_index3rich 0.5746101 0.43019297 0.7605241
## toilet_facilityUnimproved 1.2932793 0.82164227 2.1043788
## pl_delivery1Health facility 0.7755611 0.58339977 1.0207169
## bmi_recOverweight 0.4480296 0.23752080 0.7788119
## bmi_recUnderweight 1.5490329 1.23289146 1.9373669
## family_size 1.0522845 0.99480573 1.1119712
## no_children2 1.1916475 0.90295959 1.5814797
## no_children3+ 1.4171756 1.01215552 1.9907945
## diahrea2Yes 1.0191615 0.74657052 1.3696391
## regionAfar 1.8025514 0.41781167 10.2090817
## regionAmhara 1.3371744 0.39832463 6.6176777
## regionBenishangul 1.2376459 0.26201933 7.2948347
## regionDire Dawa 1.0153470 0.06668781 9.0823698
## regionGambela 1.2035906 0.07617710 11.7132129
## regionHarari 1.7683455 0.10439310 17.3553864
## regionOromia 1.2217607 0.36436279 6.0399047
## regionSNNPR 0.7320736 0.21603796 3.6407777
## regionSomali 2.2967962 0.67531335 11.4493154
## regionTigray 1.3256394 0.38225432 6.6730234
library(ROCR)
test_arr_predict <- predict(fit.mix.w, test.w, type="response")
#create a data frame of out-sample(test set) reference value and predicted value
out_sample_arr <- data.frame(labels = test.w$wasted, predictions = test_arr_predict)
#get the performance of the in-sample prediction
pred.out_sample_arr <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")
#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(sensitivity = y[[ind]], specificity = 1-x[[ind]],
cutoff = p[[ind]])
}, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
## [,1]
## sensitivity 0.6776119
## specificity 0.5859635
## cutoff 0.1132143
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass = ifelse(out_sample_arr$predictions >0.1179763,1,0)
# Confusion matrix
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 1464 944
## 1 121 214
##
## Accuracy : 0.6117
## 95% CI : (0.5932, 0.63)
## No Information Rate : 0.5778
## P-Value [Acc > NIR] : 0.0001659
##
## Kappa : 0.1199
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9237
## Specificity : 0.1848
## Pos Pred Value : 0.6080
## Neg Pred Value : 0.6388
## Prevalence : 0.5778
## Detection Rate : 0.5337
## Detection Prevalence : 0.8779
## Balanced Accuracy : 0.5542
##
## 'Positive' Class : 0
##
library(ROCR)
test_arr_predict <- predict(fit.w, test.w, type="response")
#create a data frame of out-sample(test set) reference value and predicted value
out_sample_arr <- data.frame(labels = test.w$wasted, predictions = test_arr_predict)
#get the performance of the in-sample prediction
pred.out_sample_arr <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")
#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(sensitivity = y[[ind]], specificity = 1-x[[ind]],
cutoff = p[[ind]])
}, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
## [,1]
## sensitivity 0.6656716
## specificity 0.6058970
## cutoff 0.1231141
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass = ifelse(out_sample_arr$predictions >0.1179763,1,0)
# Confusion matrix
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 1406 1002
## 1 108 227
##
## Accuracy : 0.5953
## 95% CI : (0.5767, 0.6138)
## No Information Rate : 0.552
## P-Value [Acc > NIR] : 2.488e-06
##
## Kappa : 0.1217
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9287
## Specificity : 0.1847
## Pos Pred Value : 0.5839
## Neg Pred Value : 0.6776
## Prevalence : 0.5520
## Detection Rate : 0.5126
## Detection Prevalence : 0.8779
## Balanced Accuracy : 0.5567
##
## 'Positive' Class : 0
##
library(party)
library(randomForest)
controlDTree <- trainControl(method="cv", 5)
train.w= na.omit(train.w)
train.w$wasting <- factor(train.w$wasting)
rf.w <-randomForest(wasting~v717+m15+v501+hv201+hc1+hv205+hc61+hv025+v212+hc63+hc64+v013+hc27+hc63+hc64+hv014+hv024+hv204+hv270+h11+hv009+v445, data=train.w)
print(rf.w)
##
## Call:
## randomForest(formula = wasting ~ v717 + m15 + v501 + hv201 + hc1 + hv205 + hc61 + hv025 + v212 + hc63 + hc64 + v013 + hc27 + hc63 + hc64 + hv014 + hv024 + hv204 + hv270 + h11 + hv009 + v445, data = train.w)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 12.71%
## Confusion matrix:
## no yes class.error
## no 3583 12 0.003337969
## yes 511 10 0.980806142
print(importance(rf.w,type = 2))
## MeanDecreaseGini
## v717 28.796891
## m15 25.068397
## v501 9.316115
## hv201 72.753768
## hc1 92.058257
## hv205 22.506606
## hc61 15.904783
## hv025 7.036378
## v212 63.430490
## hc63 87.667776
## hc64 52.371618
## v013 46.006533
## hc27 16.226933
## hv014 28.003404
## hv024 69.962590
## hv204 61.078038
## hv270 32.935905
## h11 11.358217
## hv009 50.360687
## v445 108.266386
pred = predict(rf.w, newdata=test.w)
accuracy <- table(pred, test.w[,"wasting"])
sum(diag(accuracy))/sum(accuracy)
## [1] 0.8771418
confusionMatrix(data=pred, as.factor(test.w$wasting))
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2402 331
## yes 6 4
##
## Accuracy : 0.8771
## 95% CI : (0.8643, 0.8892)
## No Information Rate : 0.8779
## P-Value [Acc > NIR] : 0.5608
##
## Kappa : 0.0162
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99751
## Specificity : 0.01194
## Pos Pred Value : 0.87889
## Neg Pred Value : 0.40000
## Prevalence : 0.87787
## Detection Rate : 0.87568
## Detection Prevalence : 0.99635
## Balanced Accuracy : 0.50472
##
## 'Positive' Class : no
##
test_prob = predict(fit.mix.w, newdata = test.w, type = "response")
test_roc = roc(test.w$wasted ~ test_prob, plot = TRUE, print.auc = TRUE,percent=TRUE)
test_prob2 = predict(fit.w, newdata = test.w, type = "response")
test_roc2 = roc(test.w$wasted ~ test_prob2, plot = TRUE, print.auc = TRUE,percent=TRUE)
test5 = predict(rf.w, newdata=test.w, type = "prob")
test5= roc(test.w$wasting, test5[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
plot(test_roc, print.auc = TRUE, col='blue',percent=TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test5 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=2, lwd=3, col="black")
legend("topleft", legend = c("Hierarchical Model","Random Forest Model","Logistic Regression Model") , pch = 15, bty = 'n', col = c("blue","darkgoldenrod4", "red"))
library(lme4)
library(arm)
library(lmerTest)
fit.mix.u<-glmer(underweight~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+(1|region), train.u, weights=train.u$wt,family=binomial)
display(fit.mix.u, detail=T)
## glmer(formula = underweight ~ Age + sex + age_fb + res + meduc +
## watersource + time_water + b_order + b_interval + wealth_index +
## toilet_facility + pl_delivery + bmi_rec + family_size + no_children +
## diahrea + (1 | region), data = train.u, family = binomial,
## weights = train.u$wt)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) -1.22 0.30 -4.09 0.00
## Age> 30 months 0.52 0.07 7.10 0.00
## sexfemale -0.02 0.07 -0.25 0.80
## age_fb20+years 0.17 0.07 2.25 0.02
## resrural 0.14 0.18 0.75 0.45
## meduc1Primary -0.35 0.09 -3.90 0.00
## meduc2Secodary/Higher -0.49 0.25 -1.96 0.05
## watersourceUnimproved -0.16 0.07 -2.19 0.03
## time_water 0.00 0.00 1.66 0.10
## b_order3rd or later 0.06 0.10 0.64 0.52
## b_interval> 4 yrs -0.39 0.12 -3.22 0.00
## b_interval2-4 yrs -0.21 0.09 -2.29 0.02
## wealth_index2middle -0.40 0.09 -4.57 0.00
## wealth_index3rich -0.60 0.09 -6.37 0.00
## toilet_facilityUnimproved 0.14 0.16 0.85 0.39
## pl_delivery1Health facility -0.07 0.10 -0.69 0.49
## bmi_recOverweight -0.60 0.20 -3.04 0.00
## bmi_recUnderweight 0.37 0.08 4.35 0.00
## family_size -0.02 0.02 -0.86 0.39
## no_children2 0.23 0.09 2.50 0.01
## no_children3+ -0.10 0.12 -0.82 0.41
## diahrea2Yes 0.76 0.10 7.60 0.00
##
## Error terms:
## Groups Name Std.Dev.
## region (Intercept) 0.12
## Residual 1.00
## ---
## number of obs: 4802, groups: region, 11
## AIC = 5335.5, DIC = 6056.5
## deviance = 5673.0
exp(fixef(fit.mix.u)[-1])
## Age> 30 months sexfemale
## 1.6781744 0.9829806
## age_fb20+years resrural
## 1.1816876 1.1475434
## meduc1Primary meduc2Secodary/Higher
## 0.7030765 0.6143373
## watersourceUnimproved time_water
## 0.8517313 1.0002911
## b_order3rd or later b_interval> 4 yrs
## 1.0652979 0.6789157
## b_interval2-4 yrs wealth_index2middle
## 0.8129021 0.6717911
## wealth_index3rich toilet_facilityUnimproved
## 0.5480417 1.1500954
## pl_delivery1Health facility bmi_recOverweight
## 0.9342114 0.5482468
## bmi_recUnderweight family_size
## 1.4417899 0.9818509
## no_children2 no_children3+
## 1.2533401 0.9053286
## diahrea2Yes
## 2.1398450
exp(confint(fit.mix.u, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## (Intercept) 0.1641458 0.5290363
## Age> 30 months 1.4547664 1.9358913
## sexfemale 0.8600561 1.1234742
## age_fb20+years 1.0215752 1.3668945
## resrural 0.8010511 1.6439100
## meduc1Primary 0.5889721 0.8392869
## meduc2Secodary/Higher 0.3777872 0.9990026
## watersourceUnimproved 0.7378834 0.9831449
## time_water 0.9999475 1.0006349
## b_order3rd or later 0.8780485 1.2924794
## b_interval> 4 yrs 0.5365251 0.8590959
## b_interval2-4 yrs 0.6806690 0.9708241
## wealth_index2middle 0.5664063 0.7967835
## wealth_index3rich 0.4554335 0.6594810
## toilet_facilityUnimproved 0.8344374 1.5851631
## pl_delivery1Health facility 0.7706928 1.1324240
## bmi_recOverweight 0.3721098 0.8077577
## bmi_recUnderweight 1.2224876 1.7004328
## family_size 0.9415622 1.0238636
## no_children2 1.0502368 1.4957213
## no_children3+ 0.7140887 1.1477842
## diahrea2Yes 1.7585241 2.6038521
fit.u<-glm(underweight~Age+sex+age_fb+res+meduc+watersource+time_water+b_order+b_interval+wealth_index+toilet_facility+pl_delivery+bmi_rec+family_size+no_children+diahrea+region, train.u, weights=train.u$wt,family=binomial)
summary(fit.u)
##
## Call:
## glm(formula = underweight ~ Age + sex + age_fb + res + meduc +
## watersource + time_water + b_order + b_interval + wealth_index +
## toilet_facility + pl_delivery + bmi_rec + family_size + no_children +
## diahrea + region, family = binomial, data = train.u, weights = train.u$wt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9149 -0.8031 -0.3153 0.2558 4.6619
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8347376 0.5387383 -3.406 0.000660 ***
## Age> 30 months 0.5149306 0.0701933 7.336 2.20e-13 ***
## sexfemale -0.0298697 0.0657615 -0.454 0.649676
## age_fb20+years 0.1678109 0.0715927 2.344 0.019080 *
## resrural 0.0209434 0.1681171 0.125 0.900859
## meduc1Primary -0.3255210 0.0880248 -3.698 0.000217 ***
## meduc2Secodary/Higher -0.4818543 0.2360587 -2.041 0.041226 *
## watersourceUnimproved -0.1562623 0.0709329 -2.203 0.027598 *
## time_water 0.0002221 0.0001644 1.351 0.176718
## b_order3rd or later 0.0662753 0.0951990 0.696 0.486318
## b_interval> 4 yrs -0.4141767 0.1156535 -3.581 0.000342 ***
## b_interval2-4 yrs -0.2157393 0.0863296 -2.499 0.012454 *
## wealth_index2middle -0.4005207 0.0860246 -4.656 3.23e-06 ***
## wealth_index3rich -0.6008441 0.0928079 -6.474 9.54e-11 ***
## toilet_facilityUnimproved 0.1058933 0.1543293 0.686 0.492617
## pl_delivery1Health facility -0.0633000 0.0956106 -0.662 0.507933
## bmi_recOverweight -0.5145038 0.1797880 -2.862 0.004213 **
## bmi_recUnderweight 0.3776844 0.0808199 4.673 2.97e-06 ***
## family_size -0.0124492 0.0203985 -0.610 0.541663
## no_children2 0.2063326 0.0873877 2.361 0.018220 *
## no_children3+ -0.1093222 0.1163449 -0.940 0.347403
## diahrea2Yes 0.7414399 0.0972428 7.625 2.45e-14 ***
## regionAfar 0.9151185 0.5857512 1.562 0.118218
## regionAmhara 1.0124412 0.5051151 2.004 0.045030 *
## regionBenishangul 1.1097398 0.5746714 1.931 0.053472 .
## regionDire Dawa 0.7634050 0.7408881 1.030 0.302826
## regionGambela 0.0236018 1.0159028 0.023 0.981465
## regionHarari 0.6471579 0.9409270 0.688 0.491587
## regionOromia 0.5932016 0.5045315 1.176 0.239696
## regionSNNPR 0.6942139 0.5047661 1.375 0.169033
## regionSomali 0.7291826 0.5167284 1.411 0.158200
## regionTigray 0.6473674 0.5128288 1.262 0.206824
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6006.0 on 4801 degrees of freedom
## Residual deviance: 5666.5 on 4770 degrees of freedom
## AIC: 5527.5
##
## Number of Fisher Scoring iterations: 5
exp(cbind(coef(fit.u), confint(fit.u)))
## 2.5 % 97.5 %
## (Intercept) 0.1596554 0.04968521 0.4258009
## Age> 30 months 1.6735224 1.45889823 1.9210682
## sexfemale 0.9705720 0.85314975 1.1040648
## age_fb20+years 1.1827129 1.02746694 1.3604129
## resrural 1.0211642 0.73753571 1.4265700
## meduc1Primary 0.7221510 0.60683047 0.8569938
## meduc2Secodary/Higher 0.6176371 0.38277206 0.9682594
## watersourceUnimproved 0.8553348 0.74421480 0.9828188
## time_water 1.0002222 0.99989703 1.0005423
## b_order3rd or later 1.0685209 0.88737113 1.2888986
## b_interval> 4 yrs 0.6608842 0.52668744 0.8288889
## b_interval2-4 yrs 0.8059454 0.68089560 0.9551879
## wealth_index2middle 0.6699711 0.56542979 0.7922616
## wealth_index3rich 0.5483486 0.45661743 0.6570442
## toilet_facilityUnimproved 1.1117033 0.82553608 1.5128362
## pl_delivery1Health facility 0.9386619 0.77719956 1.1307479
## bmi_recOverweight 0.5977971 0.41537538 0.8417957
## bmi_recUnderweight 1.4589025 1.24433133 1.7082867
## family_size 0.9876280 0.94870421 1.0277013
## no_children2 1.2291620 1.03612295 1.4595345
## no_children3+ 0.8964415 0.71338655 1.1257540
## diahrea2Yes 2.0989556 1.73325717 2.5379516
## regionAfar 2.4970711 0.83681418 8.6265314
## regionAmhara 2.7523119 1.11237195 8.3828888
## regionBenishangul 3.0335690 1.04362636 10.2997661
## regionDire Dawa 2.1455694 0.48851145 9.4411479
## regionGambela 1.0238825 0.09657653 6.5720848
## regionHarari 1.9101044 0.24613815 11.3760961
## regionOromia 1.8097734 0.73233305 5.5068510
## regionSNNPR 2.0021346 0.80978895 6.0945451
## regionSomali 2.0733852 0.81538766 6.4281890
## regionTigray 1.9105045 0.75805933 5.8872863
library(ROCR)
test_arr_predict <- predict(fit.mix.u, test.u, type="response")
#create a data frame of out-sample(test set) reference value and predicted value
out_sample_arr <- data.frame(labels = test.u$underweight, predictions = test_arr_predict)
#get the performance of the in-sample prediction
pred.out_sample_arr <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")
#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(sensitivity = y[[ind]], specificity = 1-x[[ind]],
cutoff = p[[ind]])
}, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
## [,1]
## sensitivity 0.6727273
## specificity 0.5428003
## cutoff 0.2512379
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass = ifelse(out_sample_arr$predictions >0.2492607,1,0)
# Confusion matrix
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 812 695
## 1 178 372
##
## Accuracy : 0.5756
## 95% CI : (0.5539, 0.5971)
## No Information Rate : 0.5187
## P-Value [Acc > NIR] : 1.279e-07
##
## Kappa : 0.1657
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8202
## Specificity : 0.3486
## Pos Pred Value : 0.5388
## Neg Pred Value : 0.6764
## Prevalence : 0.4813
## Detection Rate : 0.3947
## Detection Prevalence : 0.7326
## Balanced Accuracy : 0.5844
##
## 'Positive' Class : 0
##
library(ROCR)
test_arr_predict <- predict(fit.u, test.u, type="response")
#create a data frame of out-sample(test set) reference value and predicted value
out_sample_arr <- data.frame(labels = test.u$underweight, predictions = test_arr_predict)
#get the performance of the in-sample prediction
pred.out_sample_arr <- prediction(out_sample_arr$predictions, out_sample_arr$labels)
perf.out_sample_arr = performance(pred.out_sample_arr, measure = "tpr", x.measure="fpr")
#This function generates the optimal cutoff value that maximizes the sensitivity and specificity simutaneously
opt.cut = function(perf, pred){
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(sensitivity = y[[ind]], specificity = 1-x[[ind]],
cutoff = p[[ind]])
}, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf.out_sample_arr, pred.out_sample_arr))
## [,1]
## sensitivity 0.6563636
## specificity 0.5739881
## cutoff 0.2616910
#classify the predicted values below the cutoff values to 0, above to 1
out_sample_arr$predClass = ifelse(out_sample_arr$predictions >0.2492607,1,0)
# Confusion matrix
confusionMatrix(table(out_sample_arr$labels ,out_sample_arr$predClass))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 829 678
## 1 177 373
##
## Accuracy : 0.5843
## 95% CI : (0.5627, 0.6058)
## No Information Rate : 0.5109
## P-Value [Acc > NIR] : 1.398e-11
##
## Kappa : 0.1771
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8241
## Specificity : 0.3549
## Pos Pred Value : 0.5501
## Neg Pred Value : 0.6782
## Prevalence : 0.4891
## Detection Rate : 0.4030
## Detection Prevalence : 0.7326
## Balanced Accuracy : 0.5895
##
## 'Positive' Class : 0
##
library(party)
library(randomForest)
controlDTree <- trainControl(method="cv", 5)
train.u= na.omit(train.u)
train.u$underweight <- factor(train.u$underweight)
rf.u <-randomForest(underweight~v717+m15+v501+hv201+hc1+hv205+hc61+hv025+v212+hc63+hc64+v013+hc27+hc63+hc64+hv014+hv024+hv204+hv270+h11+hv009+v445, data=train.u)
print(rf.u)
##
## Call:
## randomForest(formula = underweight ~ v717 + m15 + v501 + hv201 + hc1 + hv205 + hc61 + hv025 + v212 + hc63 + hc64 + v013 + hc27 + hc63 + hc64 + hv014 + hv024 + hv204 + hv270 + h11 + hv009 + v445, data = train.u)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 25.97%
## Confusion matrix:
## 0 1 class.error
## 0 3352 194 0.05470953
## 1 1053 203 0.83837580
print(importance(rf.u,type = 2))
## MeanDecreaseGini
## v717 70.65841
## m15 54.32829
## v501 18.13290
## hv201 153.64004
## hc1 202.98580
## hv205 49.22173
## hc61 31.36421
## hv025 10.37982
## v212 117.52020
## hc63 173.83141
## hc64 99.94728
## v013 97.40069
## hc27 33.57844
## hv014 51.13770
## hv024 151.56507
## hv204 111.30138
## hv270 83.61458
## h11 24.61102
## hv009 93.00967
## v445 206.79133
pred = predict(rf.u, newdata=test.u)
accuracy <- table(pred, test.u[,"underweight"])
sum(diag(accuracy))/sum(accuracy)
## [1] 0.7311619
confusionMatrix(data=pred, as.factor(test.u$underweight))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1424 470
## 1 83 80
##
## Accuracy : 0.7312
## 95% CI : (0.7114, 0.7502)
## No Information Rate : 0.7326
## P-Value [Acc > NIR] : 0.5707
##
## Kappa : 0.1164
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9449
## Specificity : 0.1455
## Pos Pred Value : 0.7518
## Neg Pred Value : 0.4908
## Prevalence : 0.7326
## Detection Rate : 0.6923
## Detection Prevalence : 0.9208
## Balanced Accuracy : 0.5452
##
## 'Positive' Class : 0
##
test_prob = predict(fit.mix.u, newdata = test.u, type = "response")
test_roc = roc(test.u$underweight ~ test_prob, plot = TRUE, print.auc = TRUE,percent=TRUE)
test_prob2 = predict(fit.u, newdata = test.u, type = "response")
test_roc2 = roc(test.u$underweight ~ test_prob2, plot = TRUE, print.auc = TRUE,percent=TRUE)
test5 = predict(rf.u, newdata=test.u, type = "prob")
test5= roc(test.u$underweight, test5[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
plot(test_roc, print.auc = TRUE, col='blue',percent=TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test5 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=2, lwd=3, col="black")
legend("topleft", legend = c("Hierarchical Model","Random Forest Model","Logistic Regression Model") , pch = 15, bty = 'n', col = c("blue","darkgoldenrod4", "red"))