For this homework, I used the 2016 Ethiopian Demographic and Health Survey(EDHS). The spatial data for Ethipia is downloaded from the Spatial Data Repository, DHS website (https://spatialdata.dhsprogram.com/home/).
The purpose of these homework to examine the association between (avg height for age standard deviation) which is a continous measure and other explanatory variables (proportion of poor mothers, proportion of non educated mothers, average children evern born and mean age of mothers), using various spatial Autoregression models to determine the best model.
library(sf)
library(sp)
library(RQGIS)
library(mapview)
library(raster)
library(rgdal)
library(ggplot2)
require(ggplot2)
library(dplyr)
library(gridExtra)
library(maps)
library(tibble)
library(viridis)
library(rvest)
library(ggsn)
library(ggmap)
library(survey)
library(car)
library(nortest)
library(lmtest)
library(classInt)
library(sandwich)
library(spdep)
library(foreign)
library(maptools)
library(RColorBrewer)
eth_pts<-readOGR("~/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL",layer = "ETGE71FL")
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/fikrewoldbitew/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL", layer: "ETGE71FL"
## with 643 features
## It has 20 fields
dhs_2016<- read.dta( "/Users/fikrewoldbitew/Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etcr70dt/ETCR70FL.DTA", convert.factors = TRUE)
#dhs_2016<-read.dta("~/Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etir70dt/ETIR70FL.dta", convert.factors = F)
edhs16<-subset(dhs_2016,select=c("v445","v001", "v440","v002", "v003", "v005", "v012", "v013", "v021","v022", "v024", "v025","v102", "v106", "v113", "v115","v116", "v119", "v120", "v121", "v122", "v123", "v124", "v125", "v127", "v129", "v153", "v190" , "v201","v213", "v217", "v302", "v440", "v445", "v501", "v511", "v525", "v623", "v701","v705", "v717"))
edhs16$SVYYEAR<-2016
library(rgdal)
mdat16<- edhs16[edhs16$v440!=9999&edhs16$v440!=9998,]
mdat16$stunt<-ifelse(mdat16$v440==9999 | mdat16$v440==9998 , NA, mdat16$v440/100)
mdat16$stunt_bin <- ifelse(mdat16$v106=='0',1,0)
#Women who are currently pregnant or pospartum/amenorrheic-Not Included
#Education
#mdat16$educ<-Recode(mdat16$v106, recodes="0='0Noeducation'; 1='1Primary'; 2='2Secodary'; 3='3Higher'; else=NA", as.factor=T)
mdat16$pno_educ<-Recode(mdat16$v106, recodes="'no education'=0;else=1", as.factor=T)
#Residence
mdat16$rural<-Recode(mdat16$v025, recodes="'rural'=1; else=0", as.factor = T)
#mdat16$rur<-Recode(mdat16$v025, recodes='1=1; else=0', as.factor = F)
#mdat16$count<-1
#Age
mdat16$age<-Recode(mdat16$v013, recodes="1='15-19'; 2:3='20-29'; 4:5='30-39'; 6:7='40-49'; else='NA'", as.factor=T)
#Children Ever Born
mdat16$parity<-Recode(mdat16$v201, recodes="0='0'; 1:2='1-2'; 3:4='3-4'; 5:18='5+'; else=NA", as.factor=T)
#wealth Index
mdat16$wealthindex<-Recode(mdat16$v190, recodes="1:2='1Poor'; 3='2Middle'; 4:5='3Rich'; else=NA", as.factor=T)
mdat16$ppoor<-Recode(mdat16$v190, recodes="'poorest'=1;'poorer'=1;else=0", as.factor=T)
mdat16$pwt <-mdat16$v005/1000000
mdat16$weight<-mdat16$v005/1000000
options(survey.lonely.psu = "adjust")
mdat16$yrstratum<-paste(mdat16$SVYYEAR, mdat16$v022, sep="-")
mdat16$yrpsu<-paste(mdat16$SVYYEAR, mdat16$v021, sep="-")
table(mdat16$yrstratum)
##
## 2016-1 2016-10 2016-11 2016-12 2016-13 2016-14 2016-15 2016-16 2016-17
## 100 317 252 99 380 55 486 75 366
## 2016-18 2016-19 2016-2 2016-20 2016-21 2016-22 2016-23 2016-24 2016-25
## 399 89 484 233 111 200 398 198 151
## 2016-3 2016-4 2016-5 2016-6 2016-7 2016-8 2016-9 NA-NA
## 62 285 93 420 363 64 298 156
des<-svydesign(ids=~yrpsu, strata=~yrstratum, weights=~weight, data = mdat16[is.na(mdat16$weight)==F,], nest=T)
no_educ<-svyby(~pno_educ, by=~v021, design = des, FUN = svymean, na.rm=T)
poor<-svyby(~ppoor, by=~v021, design = des, FUN = svymean, na.rm=T)
library(dplyr)
mdat_1<-merge(mdat16, no_educ, by.x="v021", by.y="v021", sort=F)
mdat_new2<-merge(mdat_1, poor, by.x="v021", by.y="v021", sort=F)
datagroup<-mdat_new2%>%group_by(v021,pno_educ1, ppoor1) %>%
summarize(mean_stunt = mean(stunt, na.rm = TRUE),
wt = mean(pwt, na.rm = TRUE),
mean_age= mean(v012, na.rm=TRUE),
avg_ceb= mean(v201, na.rm=TRUE))
datagr_new<-datagroup[!is.na(datagroup$v021),]
eth_pts<-eth_pts[eth_pts$DHSCLUST!=149ð_pts$DHSCLUST!=324ð_pts$DHSCLUST!=500,]
write.csv(eth_pts@data, file = "~/Downloads/eth_new7.csv")
#write.csv(datagr_new, file = "~/Downloads/datagr_2.csv")
m.geo<-merge(eth_pts@data, datagr_new, by.x="DHSCLUST", by.y="v021", sort=F)
m.geo<- na.omit(m.geo)
eth_pts@data<-m.geo
library(ggmap)
#project the data into a meter based system
#mapview(eth_pts["DHSCLUST"])
eths_border=sf::st_read("~/Google Drive/fikre/ETH_adm_DIVA_GIS/ETH_adm1.shp")
## Reading layer `ETH_adm1' from data source `/Users/fikrewoldbitew/Google Drive/fikre/ETH_adm_DIVA_GIS/ETH_adm1.shp' using driver `ESRI Shapefile'
## Simple feature collection with 11 features and 9 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 32.99994 ymin: 3.322099 xmax: 47.98618 ymax: 14.89996
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
eths_border<-st_transform(eths_border, crs=20136)
#Fit
fit.ols<-lm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts)
fit.ols.wt<-lm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts, weights = wt)
summary(fit.ols)
##
## Call:
## lm(formula = mean_stunt ~ mean_age + avg_ceb + ppoor1 + pno_educ1,
## data = eth_pts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.03161 -0.42020 -0.06428 0.34888 2.50088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.096678 0.256816 -4.270 2.25e-05 ***
## mean_age -0.019366 0.008809 -2.199 0.028266 *
## avg_ceb 0.104259 0.027561 3.783 0.000170 ***
## ppoor1 0.311025 0.095573 3.254 0.001197 **
## pno_educ1 0.436899 0.113805 3.839 0.000136 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6255 on 635 degrees of freedom
## Multiple R-squared: 0.06594, Adjusted R-squared: 0.06006
## F-statistic: 11.21 on 4 and 635 DF, p-value: 8.608e-09
summary(fit.ols.wt)
##
## Call:
## lm(formula = mean_stunt ~ mean_age + avg_ceb + ppoor1 + pno_educ1,
## data = eth_pts, weights = wt)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -1.55711 -0.19224 0.01128 0.22829 2.03638
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.897318 0.210111 -4.271 2.25e-05 ***
## mean_age -0.017157 0.007176 -2.391 0.017102 *
## avg_ceb 0.076210 0.020043 3.802 0.000157 ***
## ppoor1 -0.124867 0.073718 -1.694 0.090784 .
## pno_educ1 0.200008 0.083614 2.392 0.017045 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4288 on 635 degrees of freedom
## Multiple R-squared: 0.02655, Adjusted R-squared: 0.02042
## F-statistic: 4.33 on 4 and 635 DF, p-value: 0.001835
options(tigris_class = "sf")
#eths_pts<-as(eths_pts, "Spatial")
#nbs<-poly2nb(eths_pts, queen = T)
#wt<-nb2listw(nbs, style = "W")
et.nb6<-knearneigh(coordinates(eth_pts), k=6)
et.nb6<-knn2nb(et.nb6)
et.wt6<-nb2listw(et.nb6, style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt6)
##
## Moran I test under randomisation
##
## data: eth_pts$mean_stunt
## weights: et.wt6
##
## Moran I statistic standard deviate = 23.69, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.4946457718 -0.0015649452 0.0004387478
et.nb5<-knearneigh(coordinates(eth_pts), k=5)
et.nb5<-knn2nb(et.nb5)
et.wt5<-nb2listw(et.nb5, style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt5)
##
## Moran I test under randomisation
##
## data: eth_pts$mean_stunt
## weights: et.wt5
##
## Moran I statistic standard deviate = 21.178, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.483606739 -0.001564945 0.000524850
et.nb4<-knearneigh(coordinates(eth_pts), k=4)
et.nb4<-knn2nb(et.nb4)
et.wt4<-nb2listw(et.nb4, style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt4)
##
## Moran I test under randomisation
##
## data: eth_pts$mean_stunt
## weights: et.wt4
##
## Moran I statistic standard deviate = 18.874, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.481400822 -0.001564945 0.000654826
et.nb3<-knearneigh(coordinates(eth_pts), k=3)
et.nb3<-knn2nb(et.nb3)
et.wt3<-nb2listw(et.nb3,style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt3)
##
## Moran I test under randomisation
##
## data: eth_pts$mean_stunt
## weights: et.wt3
##
## Moran I statistic standard deviate = 15.986, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.4690059481 -0.0015649452 0.0008664981
et.nb2<-knearneigh(coordinates(eth_pts), k=2)
et.nb2<-knn2nb(et.nb2)
et.wt2<-nb2listw(et.nb2,style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt2)
##
## Moran I test under randomisation
##
## data: eth_pts$mean_stunt
## weights: et.wt2
##
## Moran I statistic standard deviate = 12.794, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.459402538 -0.001564945 0.001298163
The above results depicts that a positive autocorrelation in the residuals of mean height for age standard deviations is observed in every neighborhood specifications employed.
resi<-c(lm.morantest(fit.ols, listw=et.wt2)$estimate[1],
lm.morantest(fit.ols, listw=et.wt3)$estimate[1],
lm.morantest(fit.ols, listw=et.wt4)$estimate[1],
lm.morantest(fit.ols, listw=et.wt5)$estimate[1],
lm.morantest(fit.ols, listw=et.wt6)$estimate[1])
plot(resi, type="l")
eth_pts$avg_stunt<-as.numeric(scale(eth_pts$mean_stunt, center=T, scale=T))
library(sf)
qplot(eth_pts$avg_stunt,geom="histogram" )
According to WHO recommendation z-score less than -2 is most common criterion i.e 2 standard deviations (SD) below the median in reference population. , Height/Age z-score < -2 SD are considered as “stunted”
eths_pts<-st_as_sf(eth_pts)
remove<-which(eths_pts$LATNUM==0|is.na(eths_pts$LATNUM))
eths_pts<-eths_pts[-remove,]
eths_pts%>%
mutate(stuntquant=cut(mean_stunt, breaks = quantile(eths_pts$mean_stunt, p=seq(0,1,length.out = 8)), include.lowest = T))%>%
ggplot(aes(color=stuntquant, fill=stuntquant))+geom_sf()+
scale_fill_viridis_d()+
scale_color_viridis_d()+
ggtitle(label = "Spatial Distribution of Children's Avg.Height for Age SD,
Ethiopia 2016")+geom_sf(data=eths_border, fill=NA, color="black")
fit<-lm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts)
summary(fit)
##
## Call:
## lm(formula = mean_stunt ~ mean_age + avg_ceb + ppoor1 + pno_educ1,
## data = eth_pts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.03161 -0.42020 -0.06428 0.34888 2.50088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.096678 0.256816 -4.270 2.25e-05 ***
## mean_age -0.019366 0.008809 -2.199 0.028266 *
## avg_ceb 0.104259 0.027561 3.783 0.000170 ***
## ppoor1 0.311025 0.095573 3.254 0.001197 **
## pno_educ1 0.436899 0.113805 3.839 0.000136 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6255 on 635 degrees of freedom
## Multiple R-squared: 0.06594, Adjusted R-squared: 0.06006
## F-statistic: 11.21 on 4 and 635 DF, p-value: 8.608e-09
eth_pts$olsresid<-rstudent(fit)
eths_pts<-st_as_sf(eth_pts)
remove<-which(eths_pts$LATNUM==0|is.na(eths_pts$LATNUM))
eths_pts<-eths_pts[-remove,]
eths_pts%>%
mutate(rquant=cut(olsresid, breaks = quantile(eths_pts$olsresid, p=seq(0,1,length.out = 8)), include.lowest = T))%>%
ggplot(aes(color=rquant, fill=rquant))+geom_sf()+
scale_fill_viridis_d()+
scale_color_viridis_d()+geom_sf(data=eths_border, fill=NA, color="black")+ggtitle("OLS Model Residuals- Avg.Height for Age SD")
moran.test(eth_pts$olsresid, listw=et.wt6)
##
## Moran I test under randomisation
##
## data: eth_pts$olsresid
## weights: et.wt6
##
## Moran I statistic standard deviate = 20.287, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.4235942590 -0.0015649452 0.0004392009
moran.mc(eth_pts$olsresid, listw=et.wt6, nsim=999)
##
## Monte-Carlo simulation of Moran I
##
## data: eth_pts$olsresid
## weights: et.wt6
## number of simulations + 1: 1000
##
## statistic = 0.42359, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
locali<-localmoran(eth_pts$olsresid, listw=et.wt3, p.adjust.method="fdr")
eth_pts$locali<-locali[,1]
eth_pts$localp<-locali[,5]
#Plots of the results
remove<-which(eth_pts$LATNUM==0|is.na(eths_pts$LATNUM))
eths_pts<-eth_pts[-remove,]
spplot(eths_pts, "locali", main="Local Moran's I for Residuals",
at=quantile(eth_pts$locali),
col.regions=brewer.pal(n=4, "RdBu"))
# Spatial Autoregressive Error Model
fit.err<-errorsarlm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts, listw=et.wt6, method = "MC")
#summary(fit.err,Nagelkerke=T)
#Spatial Autoregressive Lag Model
fit.lag<-lagsarlm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts, listw=et.wt6, type="lag", method = "MC")
#summary(fit.lag, Nagelkerke=T)
#Spatial Autoregression Model
fit.spauto.w<-spautolm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1,data = eth_pts, listw=et.wt6, family="SAR", weights=wt, method = "MC")
#summary(fit.spauto.w, Nagelkerke=T)
#Spatial Autoregressive Lag Model
fit.lag.d<-lagsarlm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1,data = eth_pts, listw=et.wt6, type="mixed", method = "MC")
#Spatial Autoregressive Error Model
fit.err.d<-errorsarlm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1,data = eth_pts, listw=et.wt6, etype = "emixed", method = "MC")
#summary(fit.err.d,Nagelkerke=T)
#As a pretty good indicator of which model is best, look at the AIC of each
AIC(fit.ols)
## [1] 1222.729
AIC(fit.ols.wt)
## [1] 1369.384
AIC(fit.lag)
## [1] 979.4364
AIC(fit.err)
## [1] 980.648
AIC(fit.spauto.w)
## [1] 1266.157
AIC(fit.lag.d)
## [1] 970.194
AIC(fit.err.d)
## [1] 974.6474
The above results clearly depicts that all spacial auregression models have lower AIC values when compared to the OLS models. However, the Spatial Autoregressive Lag Model was found to have a minimum AIC compared to other models. So, I chose Spatial Autoregressive Lag Model based on minimum AIC criteria.
summary(fit.lag.d, Nagelkerke=T)
##
## Call:lagsarlm(formula = mean_stunt ~ mean_age + avg_ceb + ppoor1 +
## pno_educ1, data = eth_pts, listw = et.wt6, type = "mixed",
## method = "MC")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.637656 -0.324833 -0.018796 0.304459 1.974924
##
## Type: mixed
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.22818347 0.43392611 -0.5259 0.598987
## mean_age 0.00036656 0.00745271 0.0492 0.960772
## avg_ceb 0.03861938 0.02386925 1.6180 0.105672
## ppoor1 -0.03296004 0.08323525 -0.3960 0.692115
## pno_educ1 0.26674306 0.10012847 2.6640 0.007722
## lag.mean_age -0.01996222 0.01457689 -1.3694 0.170861
## lag.avg_ceb 0.01446676 0.04333226 0.3339 0.738488
## lag.ppoor1 0.33509331 0.14174222 2.3641 0.018074
## lag.pno_educ1 0.06423593 0.16848744 0.3813 0.703017
##
## Rho: 0.62615, LR test value: 204.76, p-value: < 2.22e-16
## Asymptotic standard error: 0.038042
## z-value: 16.46, p-value: < 2.22e-16
## Wald statistic: 270.92, p-value: < 2.22e-16
##
## Log likelihood: -474.097 for mixed model
## ML residual variance (sigma squared): 0.24104, (sigma: 0.49096)
## Nagelkerke pseudo-R-squared: 0.38024
## Number of observations: 640
## Number of parameters estimated: 11
## AIC: 970.19, (AIC for lm: 1173)
## LM test for residual autocorrelation
## test value: 17.443, p-value: 2.9603e-05
The results above Spatial Autoregressive Lag Model therefore clearly depicts that: - The percentage of poor and percentage of non-educated mothers are positvely significantly associated with stunting (avg height for age standard deviations). The increament of proportion of poor and non educated mothers are significant contributing factors for the expansion of stunting in Ethiopia.