For this homework, I used the 2016 Ethiopian Demographic and Health Survey(EDHS). The spatial data for Ethipia is downloaded from the Spatial Data Repository, DHS website (https://spatialdata.dhsprogram.com/home/).

The purpose of these homework to examine the association between (avg height for age standard deviation) which is a continous measure and other explanatory variables (proportion of poor mothers, proportion of non educated mothers, average children evern born and mean age of mothers), using various spatial Autoregression models to determine the best model.

For this specific assignment, I used the

eth_pts<-readOGR("~/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL",layer = "ETGE71FL")

## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/fikrewoldbitew/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL", layer: "ETGE71FL"
## with 643 features
## It has 20 fields

dhs_2016<- read.dta( "/Users/fikrewoldbitew/Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etcr70dt/ETCR70FL.DTA", convert.factors = TRUE)


#dhs_2016<-read.dta("~/Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etir70dt/ETIR70FL.dta", convert.factors = F)

edhs16<-subset(dhs_2016,select=c("v445","v001", "v440","v002", "v003", "v005", "v012", "v013", "v021","v022", "v024", "v025","v102", "v106", "v113", "v115","v116", "v119", "v120", "v121", "v122", "v123", "v124", "v125", "v127", "v129", "v153", "v190" , "v201","v213", "v217", "v302", "v440", "v445", "v501", "v511", "v525", "v623", "v701","v705", "v717"))
 
edhs16$SVYYEAR<-2016

library(rgdal)
mdat16<- edhs16[edhs16$v440!=9999&edhs16$v440!=9998,]

mdat16$stunt<-ifelse(mdat16$v440==9999 | mdat16$v440==9998 , NA, mdat16$v440/100)

mdat16$stunt_bin <- ifelse(mdat16$v106=='0',1,0)

#Women who are currently pregnant or pospartum/amenorrheic-Not Included  

#Education 
#mdat16$educ<-Recode(mdat16$v106, recodes="0='0Noeducation'; 1='1Primary'; 2='2Secodary'; 3='3Higher'; else=NA", as.factor=T)



mdat16$pno_educ<-Recode(mdat16$v106, recodes="'no education'=0;else=1", as.factor=T)


#Residence
mdat16$rural<-Recode(mdat16$v025, recodes="'rural'=1; else=0",  as.factor = T)


#mdat16$rur<-Recode(mdat16$v025, recodes='1=1;  else=0',  as.factor = F)
#mdat16$count<-1

#Age
mdat16$age<-Recode(mdat16$v013, recodes="1='15-19'; 2:3='20-29'; 4:5='30-39'; 6:7='40-49'; else='NA'", as.factor=T)

#Children Ever Born 
mdat16$parity<-Recode(mdat16$v201, recodes="0='0'; 1:2='1-2'; 3:4='3-4'; 5:18='5+'; else=NA", as.factor=T)

#wealth Index
mdat16$wealthindex<-Recode(mdat16$v190, recodes="1:2='1Poor'; 3='2Middle'; 4:5='3Rich'; else=NA", as.factor=T)

mdat16$ppoor<-Recode(mdat16$v190, recodes="'poorest'=1;'poorer'=1;else=0", as.factor=T)

mdat16$pwt <-mdat16$v005/1000000

mdat16$weight<-mdat16$v005/1000000
options(survey.lonely.psu = "adjust")
mdat16$yrstratum<-paste(mdat16$SVYYEAR, mdat16$v022, sep="-")

mdat16$yrpsu<-paste(mdat16$SVYYEAR, mdat16$v021, sep="-")

table(mdat16$yrstratum)

## 
##  2016-1 2016-10 2016-11 2016-12 2016-13 2016-14 2016-15 2016-16 2016-17 
##     100     317     252      99     380      55     486      75     366 
## 2016-18 2016-19  2016-2 2016-20 2016-21 2016-22 2016-23 2016-24 2016-25 
##     399      89     484     233     111     200     398     198     151 
##  2016-3  2016-4  2016-5  2016-6  2016-7  2016-8  2016-9   NA-NA 
##      62     285      93     420     363      64     298     156

des<-svydesign(ids=~yrpsu, strata=~yrstratum, weights=~weight, data = mdat16[is.na(mdat16$weight)==F,], nest=T)

no_educ<-svyby(~pno_educ, by=~v021, design = des, FUN = svymean, na.rm=T)

poor<-svyby(~ppoor, by=~v021, design = des, FUN = svymean, na.rm=T)

Depedent Variable: Height for Age standard deviations (mean_stunt) Independent variables: mean_stunt= mean_stunt, wt = wt,mean_age, Avg childen ever born=avg_ceb, %noeduc= pno_educ, %poor = ppoor

library(dplyr)
mdat_1<-merge(mdat16, no_educ, by.x="v021", by.y="v021", sort=F)

mdat_new2<-merge(mdat_1, poor, by.x="v021", by.y="v021", sort=F)

datagroup<-mdat_new2%>%group_by(v021,pno_educ1, ppoor1) %>% 
summarize(mean_stunt = mean(stunt, na.rm = TRUE), 
          wt = mean(pwt, na.rm = TRUE),
          mean_age= mean(v012, na.rm=TRUE), 
          avg_ceb= mean(v201, na.rm=TRUE))

datagr_new<-datagroup[!is.na(datagroup$v021),]

eth_pts<-eth_pts[eth_pts$DHSCLUST!=149&eth_pts$DHSCLUST!=324&eth_pts$DHSCLUST!=500,]

write.csv(eth_pts@data, file = "~/Downloads/eth_new7.csv")

#write.csv(datagr_new, file = "~/Downloads/datagr_2.csv")
m.geo<-merge(eth_pts@data, datagr_new, by.x="DHSCLUST", by.y="v021",  sort=F)


m.geo<- na.omit(m.geo)   

eth_pts@data<-m.geo

library(ggmap)
#project the data into a meter based system
#mapview(eth_pts["DHSCLUST"])

eths_border=sf::st_read("~/Google Drive/fikre/ETH_adm_DIVA_GIS/ETH_adm1.shp")

## Reading layer `ETH_adm1' from data source `/Users/fikrewoldbitew/Google Drive/fikre/ETH_adm_DIVA_GIS/ETH_adm1.shp' using driver `ESRI Shapefile'
## Simple feature collection with 11 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 32.99994 ymin: 3.322099 xmax: 47.98618 ymax: 14.89996
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs

eths_border<-st_transform(eths_border, crs=20136)

Ordinary OLS model for Mean Height for Age Standard deviations and a serious of weights using knearneigh weights(k=2 through k=6)

#Fit  
fit.ols<-lm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts)
fit.ols.wt<-lm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts, weights = wt)
summary(fit.ols)

## 
## Call:
## lm(formula = mean_stunt ~ mean_age + avg_ceb + ppoor1 + pno_educ1, 
##     data = eth_pts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.03161 -0.42020 -0.06428  0.34888  2.50088 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.096678   0.256816  -4.270 2.25e-05 ***
## mean_age    -0.019366   0.008809  -2.199 0.028266 *  
## avg_ceb      0.104259   0.027561   3.783 0.000170 ***
## ppoor1       0.311025   0.095573   3.254 0.001197 ** 
## pno_educ1    0.436899   0.113805   3.839 0.000136 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6255 on 635 degrees of freedom
## Multiple R-squared:  0.06594,    Adjusted R-squared:  0.06006 
## F-statistic: 11.21 on 4 and 635 DF,  p-value: 8.608e-09

summary(fit.ols.wt)

## 
## Call:
## lm(formula = mean_stunt ~ mean_age + avg_ceb + ppoor1 + pno_educ1, 
##     data = eth_pts, weights = wt)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.55711 -0.19224  0.01128  0.22829  2.03638 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.897318   0.210111  -4.271 2.25e-05 ***
## mean_age    -0.017157   0.007176  -2.391 0.017102 *  
## avg_ceb      0.076210   0.020043   3.802 0.000157 ***
## ppoor1      -0.124867   0.073718  -1.694 0.090784 .  
## pno_educ1    0.200008   0.083614   2.392 0.017045 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4288 on 635 degrees of freedom
## Multiple R-squared:  0.02655,    Adjusted R-squared:  0.02042 
## F-statistic:  4.33 on 4 and 635 DF,  p-value: 0.001835

options(tigris_class = "sf")

#eths_pts<-as(eths_pts, "Spatial")

#nbs<-poly2nb(eths_pts, queen = T)
#wt<-nb2listw(nbs, style = "W")

et.nb6<-knearneigh(coordinates(eth_pts), k=6)
et.nb6<-knn2nb(et.nb6)
et.wt6<-nb2listw(et.nb6, style="W")

moran.test(eth_pts$mean_stunt, listw=et.wt6)

## 
##  Moran I test under randomisation
## 
## data:  eth_pts$mean_stunt  
## weights: et.wt6    
## 
## Moran I statistic standard deviate = 23.69, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.4946457718     -0.0015649452      0.0004387478

et.nb5<-knearneigh(coordinates(eth_pts), k=5)
et.nb5<-knn2nb(et.nb5)
et.wt5<-nb2listw(et.nb5, style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt5)

## 
##  Moran I test under randomisation
## 
## data:  eth_pts$mean_stunt  
## weights: et.wt5    
## 
## Moran I statistic standard deviate = 21.178, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.483606739      -0.001564945       0.000524850

et.nb4<-knearneigh(coordinates(eth_pts), k=4)
et.nb4<-knn2nb(et.nb4)
et.wt4<-nb2listw(et.nb4, style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt4)

## 
##  Moran I test under randomisation
## 
## data:  eth_pts$mean_stunt  
## weights: et.wt4    
## 
## Moran I statistic standard deviate = 18.874, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.481400822      -0.001564945       0.000654826

et.nb3<-knearneigh(coordinates(eth_pts), k=3)
et.nb3<-knn2nb(et.nb3)
et.wt3<-nb2listw(et.nb3,style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt3)

## 
##  Moran I test under randomisation
## 
## data:  eth_pts$mean_stunt  
## weights: et.wt3    
## 
## Moran I statistic standard deviate = 15.986, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.4690059481     -0.0015649452      0.0008664981

et.nb2<-knearneigh(coordinates(eth_pts), k=2)
et.nb2<-knn2nb(et.nb2)
et.wt2<-nb2listw(et.nb2,style="W")
moran.test(eth_pts$mean_stunt, listw=et.wt2)

## 
##  Moran I test under randomisation
## 
## data:  eth_pts$mean_stunt  
## weights: et.wt2    
## 
## Moran I statistic standard deviate = 12.794, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.459402538      -0.001564945       0.001298163

The above results depicts that a positive autocorrelation in the residuals of mean height for age standard deviations is observed in every neighborhood specifications employed.

resi<-c(lm.morantest(fit.ols, listw=et.wt2)$estimate[1],
        lm.morantest(fit.ols, listw=et.wt3)$estimate[1],
        lm.morantest(fit.ols, listw=et.wt4)$estimate[1],
        lm.morantest(fit.ols, listw=et.wt5)$estimate[1],
        lm.morantest(fit.ols, listw=et.wt6)$estimate[1])
plot(resi, type="l")

After testing for autocorrelation in the residual using each neighborhood specification, the k=6 and k=5 has the largest redidual Moran’s I value, so we use k=6.

eth_pts$avg_stunt<-as.numeric(scale(eth_pts$mean_stunt, center=T, scale=T))

library(sf)

qplot(eth_pts$avg_stunt,geom="histogram" )

According to WHO recommendation z-score less than -2 is most common criterion i.e 2 standard deviations (SD) below the median in reference population. , Height/Age z-score < -2 SD are considered as “stunted”

Map for the standardized residuals from the model fit and a map of the local Moran statistic for the residuals

eths_pts<-st_as_sf(eth_pts)
remove<-which(eths_pts$LATNUM==0|is.na(eths_pts$LATNUM))
eths_pts<-eths_pts[-remove,]

eths_pts%>%
  mutate(stuntquant=cut(mean_stunt, breaks = quantile(eths_pts$mean_stunt, p=seq(0,1,length.out = 8)), include.lowest = T))%>%
  ggplot(aes(color=stuntquant, fill=stuntquant))+geom_sf()+
  scale_fill_viridis_d()+
  scale_color_viridis_d()+
  ggtitle(label = "Spatial Distribution of Children's Avg.Height for Age SD, 
          Ethiopia 2016")+geom_sf(data=eths_border, fill=NA, color="black")

fit<-lm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts)
summary(fit)

## 
## Call:
## lm(formula = mean_stunt ~ mean_age + avg_ceb + ppoor1 + pno_educ1, 
##     data = eth_pts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.03161 -0.42020 -0.06428  0.34888  2.50088 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.096678   0.256816  -4.270 2.25e-05 ***
## mean_age    -0.019366   0.008809  -2.199 0.028266 *  
## avg_ceb      0.104259   0.027561   3.783 0.000170 ***
## ppoor1       0.311025   0.095573   3.254 0.001197 ** 
## pno_educ1    0.436899   0.113805   3.839 0.000136 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6255 on 635 degrees of freedom
## Multiple R-squared:  0.06594,    Adjusted R-squared:  0.06006 
## F-statistic: 11.21 on 4 and 635 DF,  p-value: 8.608e-09

eth_pts$olsresid<-rstudent(fit)
eths_pts<-st_as_sf(eth_pts)
remove<-which(eths_pts$LATNUM==0|is.na(eths_pts$LATNUM))
eths_pts<-eths_pts[-remove,]

eths_pts%>%
  mutate(rquant=cut(olsresid, breaks = quantile(eths_pts$olsresid, p=seq(0,1,length.out = 8)), include.lowest = T))%>%
  ggplot(aes(color=rquant, fill=rquant))+geom_sf()+
  scale_fill_viridis_d()+
  scale_color_viridis_d()+geom_sf(data=eths_border, fill=NA, color="black")+ggtitle("OLS Model Residuals- Avg.Height for Age SD")

moran.test(eth_pts$olsresid, listw=et.wt6)

## 
##  Moran I test under randomisation
## 
## data:  eth_pts$olsresid  
## weights: et.wt6    
## 
## Moran I statistic standard deviate = 20.287, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.4235942590     -0.0015649452      0.0004392009

moran.mc(eth_pts$olsresid, listw=et.wt6, nsim=999)

## 
##  Monte-Carlo simulation of Moran I
## 
## data:  eth_pts$olsresid 
## weights: et.wt6  
## number of simulations + 1: 1000 
## 
## statistic = 0.42359, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

locali<-localmoran(eth_pts$olsresid,  listw=et.wt3, p.adjust.method="fdr")
eth_pts$locali<-locali[,1]
eth_pts$localp<-locali[,5]


#Plots of the results
remove<-which(eth_pts$LATNUM==0|is.na(eths_pts$LATNUM))
eths_pts<-eth_pts[-remove,]

spplot(eths_pts, "locali", main="Local Moran's I for Residuals", 
       at=quantile(eth_pts$locali), 
       col.regions=brewer.pal(n=4, "RdBu"))

In the below section alternative Special Autoregressive Models (SAM) have been examined using k=6 neighborhood specification.

# Spatial Autoregressive Error Model

fit.err<-errorsarlm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts, listw=et.wt6, method = "MC")

#summary(fit.err,Nagelkerke=T)

#Spatial Autoregressive Lag Model
fit.lag<-lagsarlm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1, data = eth_pts, listw=et.wt6, type="lag", method = "MC")
#summary(fit.lag, Nagelkerke=T)


#Spatial Autoregression Model 

fit.spauto.w<-spautolm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1,data = eth_pts, listw=et.wt6, family="SAR", weights=wt, method = "MC")
#summary(fit.spauto.w, Nagelkerke=T)


#Spatial Autoregressive Lag Model 

fit.lag.d<-lagsarlm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1,data = eth_pts, listw=et.wt6, type="mixed", method = "MC")

#Spatial Autoregressive Error Model

fit.err.d<-errorsarlm(mean_stunt~mean_age+avg_ceb+ppoor1+pno_educ1,data = eth_pts, listw=et.wt6, etype = "emixed", method = "MC")

#summary(fit.err.d,Nagelkerke=T)

#As a pretty good indicator of which model is best, look at the AIC of each
AIC(fit.ols)

## [1] 1222.729

AIC(fit.ols.wt)

## [1] 1369.384

AIC(fit.lag)

## [1] 979.4364

AIC(fit.err)

## [1] 980.648

AIC(fit.spauto.w)

## [1] 1266.157

AIC(fit.lag.d)

## [1] 970.194

AIC(fit.err.d)

## [1] 974.6474

The above results clearly depicts that all spacial auregression models have lower AIC values when compared to the OLS models. However, the Spatial Autoregressive Lag Model was found to have a minimum AIC compared to other models. So, I chose Spatial Autoregressive Lag Model based on minimum AIC criteria.

summary(fit.lag.d, Nagelkerke=T)

## 
## Call:lagsarlm(formula = mean_stunt ~ mean_age + avg_ceb + ppoor1 + 
##     pno_educ1, data = eth_pts, listw = et.wt6, type = "mixed", 
##     method = "MC")
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -1.637656 -0.324833 -0.018796  0.304459  1.974924 
## 
## Type: mixed 
## Coefficients: (asymptotic standard errors) 
##                  Estimate  Std. Error z value Pr(>|z|)
## (Intercept)   -0.22818347  0.43392611 -0.5259 0.598987
## mean_age       0.00036656  0.00745271  0.0492 0.960772
## avg_ceb        0.03861938  0.02386925  1.6180 0.105672
## ppoor1        -0.03296004  0.08323525 -0.3960 0.692115
## pno_educ1      0.26674306  0.10012847  2.6640 0.007722
## lag.mean_age  -0.01996222  0.01457689 -1.3694 0.170861
## lag.avg_ceb    0.01446676  0.04333226  0.3339 0.738488
## lag.ppoor1     0.33509331  0.14174222  2.3641 0.018074
## lag.pno_educ1  0.06423593  0.16848744  0.3813 0.703017
## 
## Rho: 0.62615, LR test value: 204.76, p-value: < 2.22e-16
## Asymptotic standard error: 0.038042
##     z-value: 16.46, p-value: < 2.22e-16
## Wald statistic: 270.92, p-value: < 2.22e-16
## 
## Log likelihood: -474.097 for mixed model
## ML residual variance (sigma squared): 0.24104, (sigma: 0.49096)
## Nagelkerke pseudo-R-squared: 0.38024 
## Number of observations: 640 
## Number of parameters estimated: 11 
## AIC: 970.19, (AIC for lm: 1173)
## LM test for residual autocorrelation
## test value: 17.443, p-value: 2.9603e-05

The results above Spatial Autoregressive Lag Model therefore clearly depicts that: - The percentage of poor and percentage of non-educated mothers are positvely significantly associated with stunting (avg height for age standard deviations). The increament of proportion of poor and non educated mothers are significant contributing factors for the expansion of stunting in Ethiopia.

HW2: DEM 7263 Spring 2017 - Spatially Autoregressive Models

Fikre Bitew

February 9, 2019

For this specific assignment, I used the