HW3: DEM 7263 Spring 2019-Generalized Linear Models for Spatial Count Data

Data for this analysis is obtained from the 2016 Ethiopian Demographic and Health Surveys (2016 -EDHS). Further analysis in this homework is conducted by considering the the number of children who are wasted in each adminstrative units. The offest variable (exposure) are those under-5 children who slept in their house in 12 hours before the survey and who had weight and height measres. Further analysis on wasting is found under https://userforum.dhsprogram.com/index.php?t=msg&goto=11335&S=Google

library(maptools)
library(sf)

library(reticulate)
library(sp)
library(spdep)
library(RQGIS) #you need Qgis 2.18 installed, not 3!
library(mapview)
library(rgdal)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(maps)
library(tibble)
library(knitr)
library(viridis)
library(rvest)
library(ggsn)
library(ggmap)
library(RColorBrewer)
library(haven)
library(survey)
library(dplyr)
library(foreign)
library(lme4)
library(car)
library(dplyr)
library(nortest)
library(lmtest)
library(classInt)
library(sandwich)

eth_16_points<-st_read("~/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL",layer = "ETGE71FL")

## Reading layer `ETGE71FL' from data source `/Users/fikrewoldbitew/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL' using driver `ESRI Shapefile'
## Simple feature collection with 643 features and 20 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 5.684342e-14 ymin: 5.684342e-14 xmax: 47.00792 ymax: 14.50213
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs

eths_16_points<-st_transform(eth_16_points, crs=20136 )
 
eths_border<-st_read("~/Google Drive/fikre/ETH_adm_DIVA_GIS", layer= "ETH_adm0")

## Reading layer `ETH_adm0' from data source `/Users/fikrewoldbitew/Google Drive/fikre/ETH_adm_DIVA_GIS' using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 70 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 32.99994 ymin: 3.322099 xmax: 47.98618 ymax: 14.89996
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs

eths_border<-st_transform(eths_border, crs=20136 )

eths_16_points<-st_crop(eths_16_points,eths_border)

 
 
#eths_von_16<-st_voronoi(st_union(eths_16_points))
#eth_vonmap_16<-st_intersection(st_cast(eths_von_16), st_union(eths_border))
 
 
#eth_vonmap_16<-st_intersection(eths_border,eths_von_16)
 
#eth_vonmap_16<-st_collection_extract(eth_vonmap_16,type="POLYGON")
 

#pv<-st_sf(eths_16_points, eth_vonmap_16)
library(SDraw)
library(rgeos)
out<-voronoi.polygons(as(eths_16_points, "Spatial"), bounding.polygon = as(eths_border, "Spatial"))
out@data<-as.data.frame(eths_16_points)
clip<- raster::intersect(out, eths_border)

eth_vonmap_16<-st_as_sf(clip)
#plot(eth_vonmap_16["DHSCLUST"])

If we have the 2005 DHS data, we could generate estimates of some quality, say proportion of women underweight in Ethiopia

dhs_2016<- read.dta( "~/Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etpr70dt/ETPR70FL.DTA", convert.factors = TRUE)

#dhs_2016<-read.dta("~/Google Drive/DHS Resources/ET_2016_DHS_08152017_2025_103714/etir70dt/ETIR70FL.dta", convert.factors = F)
pedhs16<-subset(dhs_2016,select=c("hc1","hc1","hc2", "hc70", "hv024", "hc61","hc63", "hc64", "hc68", "hc70", "hc71", "hc72","hc73", "hv024","hv025", "hv021", "hv005", "hv022", "hv103","hv106","hv270"))
pedhs16$SVYYEAR<-2016
eth16_pts<-st_read("~/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL",layer = "ETGE71FL")

## Reading layer `ETGE71FL' from data source `/Users/fikrewoldbitew/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL' using driver `ESRI Shapefile'
## Simple feature collection with 643 features and 20 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 5.684342e-14 ymin: 5.684342e-14 xmax: 47.00792 ymax: 14.50213
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs

pdat16<-merge(pedhs16, eth16_pts, by.x="hv021", by.y="DHSCLUST" ,all.x=T)
#tail(names(edhs11$pdat1), n=20)

pdat16<- pdat16[pdat16$hv103=='yes'&pdat16$hc71<=9990,]
pdat16<-pdat16[!is.na(pdat16$hc71),]

pdat16$wasted<-ifelse(pdat16$hc71/100<=-2,1,0)
pdat16$tot_ch<-1 
#Residence
pdat16$res<-pdat16$hv025
pdat16$age<-pdat16$ha1
pdat16$region<-pdat16$hv024
pdat16$pno_educ<-Recode(pdat16$hc61, recodes="'no education'=1;else=0", as.factor=T)
pdat16$ppoor<-Recode(pdat16$hv270, recodes="'poorest'=1; 'poorer'=1;else=0", as.factor=T)

pdat16$pwt<-pdat16$hv005/1000000
options(survey.lonely.psu = "adjust")
pdat16$yrstratum<-paste(pdat16$SVYYEAR, pdat16$hv022, sep="-")

pdat16$yrpsu<-paste(pdat16$SVYYEAR, pdat16$hv021, sep="-")

#table(pdat16$yrstratum)

des<-svydesign(ids=~yrpsu, strata=~yrstratum, weights=~pwt, data = pdat16[is.na(pdat16$pwt)==F,], nest=T)

new_data<-svyby(~ppoor+pno_educ, by=~hv021, design = des, FUN = svymean, na.rm=T)

pdat<-merge(pdat16,new_data, by.x="hv021", by.y="hv021", sort=F)

 datagroup<-pdat%>%group_by(hv021,res,region) %>% 
   summarize(tot_wasted= sum(wasted, na.rm = TRUE), 
            tot_ch= sum(tot_ch),
            pno_educ=mean(pno_educ1, na.rm=TRUE), 
            ppoor=mean(ppoor1, na.rm=TRUE),
            wt = mean(pwt, na.rm = TRUE))

datagroup<-na.omit(datagroup)

datagroup$liberate<-datagroup$tot_wasted/datagroup$tot_ch

#datagroup$wasted_ests <- fitted(fit.ols)

place_ests_16<-aggregate(liberate~hv021, data=datagroup, FUN=mean)

Here, we do a basic map of the outcome variable, and see the highest rates of wasted under-five children in Ethiopia, 2016.

library(maptools)
eth_border=sf::st_read("~/Google Drive/fikre/ETH_adm_DIVA_GIS/ETH_adm1.shp")

## Reading layer `ETH_adm1' from data source `/Users/fikrewoldbitew/Google Drive/fikre/ETH_adm_DIVA_GIS/ETH_adm1.shp' using driver `ESRI Shapefile'
## Simple feature collection with 11 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 32.99994 ymin: 3.322099 xmax: 47.98618 ymax: 14.89996
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs

##Map some estimates:

dhs_est_16<-left_join(eth_vonmap_16, place_ests_16, by=c("DHSCLUST"= "hv021"))


#plot(dhs_est_16["wasted_ests"])
brks<-quantile(c(dhs_est_16$liberate), probs = seq(0,1,length.out = 5), na.rm = T)


dhs_est_16$wasted_quart_2016<-cut(dhs_est_16$liberate, brks, include.lowest = T)

eth.geo.16<-st_as_sf(dhs_est_16)


p1<-eth.geo.16%>%ggplot()+geom_sf(aes(fill=wasted_quart_2016))+geom_sf(data=eth_border,fill=NA, color="black",  size = 1)+
  scale_colour_brewer(palette = "YlOrRd",direction = -1 )+scale_fill_brewer(palette = "YlOrRd", na.value="grey", direction = -1)+guides(fill=guide_legend(title="Quartile"))+ylab(label = "")+xlab(label = "")+ggtitle(label = "Proportion of Children that were wasted, Ethiopia 2016", subtitle = "EDHS,2016")+theme(legend.text = element_text(size = 10))+theme(plot.title = element_text(color="Red", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"))


p1

#Applications of count data models Now we use the two model, the Poisson to estimate regression models for our ‘wasting’ outcome.

Our offset (denominator) include children under the age of 5 years (HC1 < 60 months) who slept in the household the previous night (HV103 = 1) and who have anthropometric Z-scores calculated in each adminstrative units.

In this model, we use the rural/urban, region, residence(rural/urban), region, % poor, %no education as predictor variable predicting the wasted individuals in the country.

Rural-urban disparities are highest in the sub-Saharan Africa, where children living in rural areas are more likely to be undernourished than children living in urban areas.

The model indicates that admin units that are rural have higher wasteding rates.

library(spdep) 

eths_pts<-readOGR("~/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL",layer = "ETGE71FL")

## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/fikrewoldbitew/Google Drive/DHS Resources/ GPS data from DHS/ ET_2016_DHS_ETGE71FL", layer: "ETGE71FL"
## with 643 features
## It has 20 fields

#eths_pts$DHSCLUST!=642

eth_pts<-eths_pts[eths_pts$DHSCLUST!=324&eths_pts$DHSCLUST!=614,]



#write.csv(eths_pts, "/Users/fikrewoldbitew/Google Drive/Data/eths_pts.csv")
# dta<-eth_pts@data

m.geo<-merge(eth_pts@data, datagroup, by.x="DHSCLUST", by.y="hv021",  sort=F)

m.geo<- na.omit(m.geo)  

eth_pts@data<-m.geo


#write.csv(eth_pts@data, "/Users/fikrewoldbitew/Google Drive/Data/eth_pts.csv")


#mapview(eth_pts["DHSCLUST"])

datagroup<-filter(eth_pts@data, is.na(tot_wasted)==F)

fit_pois<- glm(tot_wasted ~ offset(log(tot_ch)) + res+region+ppoor+pno_educ,
               family=poisson, 
               data=eth_pts@data)
summary(fit_pois)

## 
## Call:
## glm(formula = tot_wasted ~ offset(log(tot_ch)) + res + region + 
##     ppoor + pno_educ, family = poisson, data = eth_pts@data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2010  -0.9744  -0.2178   0.6213   3.0067  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.91596    0.09536 -20.093  < 2e-16 ***
## resrural           0.07557    0.09030   0.837  0.40264    
## regionafar         0.21743    0.08898   2.444  0.01454 *  
## regionamhara       0.13903    0.09074   1.532  0.12548    
## regionoromia      -0.07592    0.08645  -0.878  0.37986    
## regionsomali      -0.04164    0.08787  -0.474  0.63556    
## regionbenishangul  0.27826    0.08903   3.126  0.00177 ** 
## regionsnnpr       -0.10133    0.09071  -1.117  0.26394    
## regiongambela     -0.18743    0.11268  -1.663  0.09624 .  
## regionharari      -0.07558    0.11975  -0.631  0.52794    
## regionaddis adaba -1.28014    0.25322  -5.055 4.30e-07 ***
## regiondire dawa    0.06191    0.11340   0.546  0.58513    
## ppoor              0.49296    0.09119   5.406 6.45e-08 ***
## pno_educ           0.28822    0.11186   2.577  0.00998 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1129.14  on 640  degrees of freedom
## Residual deviance:  794.71  on 627  degrees of freedom
## AIC: 2456.6
## 
## Number of Fisher Scoring iterations: 5

If we exponentiate the coefficients from the model, we can get the risk ratios:

exp(coef(fit_pois))

##       (Intercept)          resrural        regionafar      regionamhara 
##         0.1472005         1.0785013         1.2428788         1.1491640 
##      regionoromia      regionsomali regionbenishangul       regionsnnpr 
##         0.9268907         0.9592123         1.3208325         0.9036328 
##     regiongambela      regionharari regionaddis adaba   regiondire dawa 
##         0.8290894         0.9272013         0.2779991         1.0638628 
##             ppoor          pno_educ 
##         1.6371543         1.3340546

The above model clearly indicates that, residing region and proportion of poor and proportion no education are significanly associated with the wasting under the age of 5.

children residing in Afar are 21% more likely to be wasted than those children residing in Tigray region.Simlarly children residing in Benshangsul-gimuz and gambella are more likely to be wasted than those in Tigray and those residing in Addis Ababa are less likely to be wasted than their correspoding counterparts residing in Tigray.

_ The other interesting finding found from the above model is that, administrative units who have higher prortion of non-educated and poor mothers are more likely to be have wasted than adminstrative units who have lower proportion of wasted children.

datagroup$est_pois<- fitted(fit_pois)
datagroup$est_rate<- datagroup$est_pois/datagroup$tot_ch
aggregate(est_rate ~ res, data=datagroup, FUN = mean)

##     res  est_rate
## 1 urban 0.1318970
## 2 rural 0.2743334

##Issues with the Poisson model

scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale

## [1] 1.125822

The above poisson model clearly indicate that the variance is 1.25 times as large as the mean, which indicate that there is a little bit of over-dispersion in the poisson model.

While testing goodness of fit statistic

1-pchisq(fit_pois$deviance, df = fit_pois$df.residual)

## [1] 5.726142e-06

The statistic has a p-value of 0, indicating the model does not fit the data well at all.

##quasi-despersion model.

fit_qpois<- glm(tot_wasted ~ offset(log(tot_ch)) + res+region+ppoor+pno_educ, 
               family=quasipoisson, 
               data=eth_pts@data)
summary(fit_qpois)

## 
## Call:
## glm(formula = tot_wasted ~ offset(log(tot_ch)) + res + region + 
##     ppoor + pno_educ, family = quasipoisson, data = eth_pts@data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2010  -0.9744  -0.2178   0.6213   3.0067  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.91596    0.10297 -18.606  < 2e-16 ***
## resrural           0.07557    0.09751   0.775  0.43863    
## regionafar         0.21743    0.09609   2.263  0.02399 *  
## regionamhara       0.13903    0.09799   1.419  0.15644    
## regionoromia      -0.07592    0.09336  -0.813  0.41642    
## regionsomali      -0.04164    0.09489  -0.439  0.66092    
## regionbenishangul  0.27826    0.09614   2.894  0.00393 ** 
## regionsnnpr       -0.10133    0.09795  -1.034  0.30131    
## regiongambela     -0.18743    0.12168  -1.540  0.12399    
## regionharari      -0.07558    0.12932  -0.584  0.55911    
## regionaddis adaba -1.28014    0.27345  -4.681 3.49e-06 ***
## regiondire dawa    0.06191    0.12246   0.506  0.61337    
## ppoor              0.49296    0.09847   5.006 7.23e-07 ***
## pno_educ           0.28822    0.12079   2.386  0.01733 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 1.166155)
## 
##     Null deviance: 1129.14  on 640  degrees of freedom
## Residual deviance:  794.71  on 627  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

scale<-sqrt(fit_qpois$deviance/fit_qpois$df.residual)
scale

## [1] 1.125822

The quasipoisson model also showed similar result, indicating that the variance is 1.25 times as large as the mean. i.e. there is also over-dispersion in the quasipoisson model.

1-pchisq(fit_qpois$deviance, df = fit_qpois$df.residual)

## [1] 5.726142e-06

Similar result has been observed using quasipoisson regression model, indicating the quasi-poisson model also does not fit the data well.

Interpretation Here:

Negative Binomial distribution

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

fit_nb<- glm.nb(tot_wasted ~ offset(log(tot_ch))+res+region+ppoor+pno_educ,data=eth_pts@data)
summary(fit_nb)

## 
## Call:
## glm.nb(formula = tot_wasted ~ offset(log(tot_ch)) + res + region + 
##     ppoor + pno_educ, data = eth_pts@data, init.theta = 21.09133283, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0342  -0.9291  -0.1983   0.5533   2.7258  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.94103    0.10340 -18.772  < 2e-16 ***
## resrural           0.07896    0.09798   0.806  0.42033    
## regionafar         0.22581    0.10065   2.244  0.02486 *  
## regionamhara       0.13752    0.10085   1.364  0.17269    
## regionoromia      -0.07831    0.09657  -0.811  0.41743    
## regionsomali      -0.04982    0.09839  -0.506  0.61261    
## regionbenishangul  0.26481    0.10072   2.629  0.00856 ** 
## regionsnnpr       -0.11220    0.10011  -1.121  0.26242    
## regiongambela     -0.17353    0.12233  -1.419  0.15604    
## regionharari      -0.07936    0.13146  -0.604  0.54608    
## regionaddis adaba -1.25824    0.25819  -4.873 1.10e-06 ***
## regiondire dawa    0.02554    0.12825   0.199  0.84216    
## ppoor              0.49575    0.10111   4.903 9.43e-07 ***
## pno_educ           0.32361    0.12352   2.620  0.00879 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(21.0913) family taken to be 1)
## 
##     Null deviance: 975.23  on 640  degrees of freedom
## Residual deviance: 681.06  on 627  degrees of freedom
## AIC: 2446.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  21.09 
##           Std. Err.:  7.10 
## 
##  2 x log-likelihood:  -2416.285

which showed higher risk in the partially under-served areas.

##Relative risk model

Here, I used the epidemiological concept of relative risk using the concept of the expected number of cases. To calculate the expected number of wasted childrens, I calculated the national wasting rate and multiply it by the number of under-five children in each administrative units:

lbrate<-(sum(datagroup$tot_wasted, na.rm=T)/sum(datagroup$tot_ch, na.rm=T)) 
lbrate

## [1] 0.2540126

Which estimates that on average, 25% of children should be wasted. We can apply this rate to each adminstrative units under-5 children:

datagroup$E <- lbrate* datagroup$tot_ch

head(datagroup[, c("DHSCLUST", "tot_ch", "tot_wasted", "E")])

##   DHSCLUST tot_ch tot_wasted        E
## 1        1     22          7 5.588278
## 2        2     19          8 4.826240
## 3        3     19          7 4.826240
## 4        4     18         16 4.572227
## 5        5      6          1 1.524076
## 6        6     24         11 6.096303

This shows us the observed number of wasted under-5 children, and the expected number.

datagroup%>%
  ggplot(aes(tot_wasted))+geom_histogram()+ggtitle("Distribution of wasted children in Ethiopia", "y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

datagroup%>%
  ggplot(aes(tot_wasted/E))+geom_histogram()+ggtitle("Distribution of Standardized Wasting in Ethiopia", "Y/E")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##Binomial count model

Here I also tried to fit the model using Binomial count model. However, inorder to apply the binomial count model, the offsetset variable should be based on E.

fit_pois_E<-glm(tot_wasted~ offset(log(E))+res+region+ppoor+pno_educ, 
               family=poisson, 
               data=datagroup)
summary(fit_pois_E)

## 
## Call:
## glm(formula = tot_wasted ~ offset(log(E)) + res + region + ppoor + 
##     pno_educ, family = poisson, data = datagroup)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2010  -0.9744  -0.2178   0.6213   3.0067  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.54559    0.09536  -5.722 1.05e-08 ***
## resrural           0.07557    0.09030   0.837  0.40264    
## regionafar         0.21743    0.08898   2.444  0.01454 *  
## regionamhara       0.13903    0.09074   1.532  0.12548    
## regionoromia      -0.07592    0.08645  -0.878  0.37986    
## regionsomali      -0.04164    0.08787  -0.474  0.63556    
## regionbenishangul  0.27826    0.08903   3.126  0.00177 ** 
## regionsnnpr       -0.10133    0.09071  -1.117  0.26394    
## regiongambela     -0.18743    0.11268  -1.663  0.09624 .  
## regionharari      -0.07558    0.11975  -0.631  0.52794    
## regionaddis adaba -1.28014    0.25322  -5.055 4.30e-07 ***
## regiondire dawa    0.06191    0.11340   0.546  0.58513    
## ppoor              0.49296    0.09119   5.406 6.45e-08 ***
## pno_educ           0.28822    0.11186   2.577  0.00998 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1129.14  on 640  degrees of freedom
## Residual deviance:  794.71  on 627  degrees of freedom
## AIC: 2456.6
## 
## Number of Fisher Scoring iterations: 5

In fact, these results are identical to those from the Poisson model with the under-5 children as the offset term.

##Filtering GLMs for spatial autocorrelation

###Autocorrelation in GLMs

library(spdep) 

nbs<-knearneigh(coordinates(eth_pts), k=4)

## Warning in knearneigh(coordinates(eth_pts), k = 4): knearneigh: identical
## points found

et.nb4<-knn2nb(nbs)
et.wt4<-nb2listw(et.nb4, style="W")

#moran.test(eth_pts$, listw=et.wt6)

#nbs<-knearneigh(coordinates(as(eth_pts, "Spatial")), k = 4, longlat = T)

lm.morantest(fit_pois, listw = et.wt4)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm(formula = tot_wasted ~ offset(log(tot_ch)) + res +
## region + ppoor + pno_educ, family = poisson, data = eth_pts@data)
## weights: et.wt4
## 
## Moran I statistic standard deviate = 4.9866, p-value = 3.072e-07
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.1202870738    -0.0094885542     0.0006772895

lm.morantest(fit_nb, listw = et.wt4)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm.nb(formula = tot_wasted ~ offset(log(tot_ch)) + res +
## region + ppoor + pno_educ, data = eth_pts@data, init.theta =
## 21.09133283, link = log)
## weights: et.wt4
## 
## Moran I statistic standard deviate = 5.2156, p-value = 9.16e-08
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.1247402709    -0.0105980067     0.0006733296

###Spatial nuisance and GLMs

me.fit<-ME(tot_wasted ~ offset(tot_ch)+res+pno_educ+ppoor, family=negative.binomial(40.75),
           data=eth_pts@data, listw = et.wt4, verbose=T
           ,alpha=.2
           )

## eV[,54], I: 2.541698e-09 ZI: NA, pr(ZI): 0.14
## eV[,297], I: 6.192899e-11 ZI: NA, pr(ZI): 0.39

me.fit

##   Eigenvector ZI pr(ZI)
## 0          NA NA   0.05
## 1          54 NA   0.14
## 2         297 NA   0.39

fits<-data.frame(fitted(me.fit))
eth_pts@data$me54<-fits$vec54
eth_pts@data$me297<-fits$vec297
place_ests_me54<-aggregate(me54~DHSCLUST, data=eth_pts@data, FUN=mean)
place_ests_me297<-aggregate(me297~DHSCLUST, data=eth_pts@data, FUN=mean)

dhs_est_me54<-left_join(eth_vonmap_16, place_ests_me54, by=c("DHSCLUST"= "DHSCLUST"))
dhs_est_me297<-left_join(eth_vonmap_16, place_ests_me297, by=c("DHSCLUST"= "DHSCLUST"))
 

#brk_me54<-quantile(c(dhs_est_me54$me54), probs = seq(0,1,length.out = 5), na.rm = T)


#dhs_est_mlocal$loc_quart<-cut(dhs_est_mlocal$locali, brk_lmor, include.lowest = T)

eth.geo.me54<- st_as_sf(dhs_est_me54)
eth.geo.me297<- st_as_sf(dhs_est_me297)



p1<-eth.geo.me54%>%ggplot()+geom_sf(aes(fill=me54, color=me54))+
  geom_sf(data=eth_border,fill=NA,color="black", size = 1)+
  #scale_colour_brewer(palette = "RdBu" )+scale_fill_brewer(palette = "RdBu", na.value="grey")+
    scale_color_viridis_c()+
  scale_fill_viridis_c(na.value = "grey50")+
  ggtitle(label = "Spatial Distribution Moran Eigenvector 54", subtitle = "EDHS 2016")+
  theme(plot.title = element_text(color="Red", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"))
 

p1

 p2<-eth.geo.me297%>%ggplot()+geom_sf(aes(fill=me297, color=me297))+
   geom_sf(data=eth_border,fill=NA, color="black",  size = 1)+
  #scale_colour_brewer(palette = "RdBu" )+scale_fill_brewer(palette = "RdBu", na.value="grey")+
    scale_color_viridis_c()+
  scale_fill_viridis_c(na.value = "grey50")+
  ggtitle(label = "Spatial Distribution Moran Eigenvector 297", subtitle = "EDHS 2016")+
  theme(plot.title = element_text(color="Red", size=10, face="bold"), plot.subtitle = element_text(color="blue", size=10, face="bold"))+theme(legend.title = element_text(color="blue", size=10, face="bold"))

p2

Then I plug the scores of each adminstrative units on these Moran eigenvectors into the model to “clean” it:

clean.nb<-glm.nb(tot_wasted~ offset(log(tot_ch)) + res+region+ppoor+pno_educ+fitted(me.fit), eth_pts@data)
summary(clean.nb)

## 
## Call:
## glm.nb(formula = tot_wasted ~ offset(log(tot_ch)) + res + region + 
##     ppoor + pno_educ + fitted(me.fit), data = eth_pts@data, init.theta = 22.22761043, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0375  -0.9289  -0.2157   0.5714   2.6973  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.92139    0.10315 -18.627  < 2e-16 ***
## resrural              0.05682    0.09799   0.580  0.56197    
## regionafar            0.23375    0.10017   2.334  0.01961 *  
## regionamhara          0.12999    0.10053   1.293  0.19597    
## regionoromia         -0.05891    0.09636  -0.611  0.54099    
## regionsomali         -0.04971    0.09795  -0.508  0.61180    
## regionbenishangul     0.27561    0.10027   2.749  0.00598 ** 
## regionsnnpr          -0.10440    0.09971  -1.047  0.29508    
## regiongambela        -0.17036    0.12194  -1.397  0.16239    
## regionharari         -0.07214    0.13091  -0.551  0.58159    
## regionaddis adaba    -1.29337    0.25862  -5.001  5.7e-07 ***
## regiondire dawa      -0.03818    0.13093  -0.292  0.77060    
## ppoor                 0.49618    0.10073   4.926  8.4e-07 ***
## pno_educ              0.31782    0.12328   2.578  0.00994 ** 
## fitted(me.fit)vec54  -1.51093    0.70973  -2.129  0.03326 *  
## fitted(me.fit)vec297  0.82396    0.65769   1.253  0.21027    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(22.2276) family taken to be 1)
## 
##     Null deviance: 981.58  on 640  degrees of freedom
## Residual deviance: 679.83  on 625  degrees of freedom
## AIC: 2444.5
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  22.23 
##           Std. Err.:  7.82 
## 
##  2 x log-likelihood:  -2410.451

lm.morantest(clean.nb, listw=et.wt4)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm.nb(formula = tot_wasted ~ offset(log(tot_ch)) + res +
## region + ppoor + pno_educ + fitted(me.fit), data = eth_pts@data,
## init.theta = 22.22761043, link = log)
## weights: et.wt4
## 
## Moran I statistic standard deviate = 5.0182, p-value = 2.607e-07
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.1191239933    -0.0112227452     0.0006746807

AIC(clean.nb)

## [1] 2444.451

AIC(fit_nb)

## [1] 2446.285

As we can see we couldn’t even decrease the autocorrelation in the model with a larger AIC units. So,these approach didn’t help to reduce the autocorrelation in my model.

HW3: DEM 7263 Spring 2019-Generalized Linear Models for Spatial Count Data

Fikre Bitew

Feb 23, 2019