SSA Poverty - Spatial Regression

library(rgdal)
library(foreign)
library(data.table)
library(spdep)
library(tmap)

# Load districts and attributes
g2 <- readOGR("../out/r16.05/shapefiles", "svyMaps_2016.06.22_sara")
dt2 <- read.dta("../temp/2016.05/SSApoverty_Dist_forGWR.12.dta")

# Keep STATA labels for re-use
dt2.lbl <- data.table(varCode=names(dt2), varLabel=attr(dt2, "var.labels"))
setkey(dt2.lbl, varCode)
dt2.lbl[is.na(varLabel), varLabel := varCode]
dt2 <- data.table(dt2)

# Make unique shape IDs explicit
g2.dt <- data.table(g2@data)
g2.dt[, rn := row.names(g2)]
g2$rn <- row.names(g2)

# Recode Ethiopia woredas
dt2[svyCode=="eth2010", svyL2Cd := svyL1Cd * 10000 + svyL2Cd]

# Merge shapes and attributes
setkey(g2.dt, ISO3, svyCode, svyL1Cd, svyL2Cd)
setkey(dt2, ISO3, svyCode, svyL1Cd, svyL2Cd)

# Look for possible duplicates
g2.dt[duplicated(dt2), .(ISO3, svyCode, svyL1Cd, svyL2Cd)]

dt2[duplicated(dt2), .(ISO3, svyCode, svyL1Cd, svyL2Cd)]

# Drop duplicated vars from Sara's file before merging
dt2[, `:=`(
  rn=NULL, svyL1Nm=NULL, svyL2Nm=NULL, prttyNm=NULL, areakm=NULL, X=NULL, Y=NULL)]

# Any unmatched obs?
dt2[!g2.dt][, .N, by=svyCode]

## Empty data.table (0 rows) of 2 cols: svyCode,N

# Seems okay, so merge
g2.dt <- dt2[g2.dt]

# Re-attach Sara's attributes to shapes
g2 <- SpatialPolygonsDataFrame(g2, data.frame(g2.dt), match.ID="rn")

# Visually check a few vars
tm_shape(g2) + tm_polygons("ndvi_ave", title=dt2.lbl["ndvi_ave", varLabel],
  borders.lwd=0.1, borders.col="white", style="pretty", n=9, palette="RdYlGn", auto=F)

tm_shape(g2) + tm_fill("pcexp_ppp_m", title=dt2.lbl["pcexp_ppp_m", varLabel],
  borders.lwd=0.1, borders.col="white", style="jenks", n=9, auot=F)

tm_shape(g2) + tm_fill("spei_lt", title=dt2.lbl["spei_lt", varLabel],
  borders.lwd=0.1, borders.col="white", style="jenks", n=9, palette="RdYlGn", auto=F)

Model Specifications

Y <- "pcexp_ppp_m" 
#Y <- "foodexp_ppp_m" 
#Y <- "foodshare" 
#Y <- "wealth_index_all"

X <- c("spei_lt", "L1_speihishock", "L1_speiloshock")
# X <- c("spei_lt", "L1_speidif", "L1_speidif2")
# X <- c("pre_lt", "temp_lt", "L1_prehishock", "L1_preloshock", "L1_temphishock")


# Drop shapes with missing outcome values (not included in model)
g2.nb <- g2[!is.na(g2$pcexp_ppp_m),]


# Impute missing X values with regional median
g2.nb.dt <- data.table(g2.nb@data)
g2.nb.dt[, spei_lt_imp := median(spei_lt, na.rm=T), by=.(svyCode, svyL1Cd)]
g2.nb.dt[is.na(spei_lt), spei_lt := spei_lt_imp]
g2.nb.dt[is.na(spei_lt), .N, by=svyCode]

# Impute still missing X values with national median
g2.nb.dt[, spei_lt_imp := median(spei_lt, na.rm=T), by=svyCode]
g2.nb.dt[is.na(spei_lt), spei_lt := spei_lt_imp]

# Re-attach imputed attributes to shapes
g2.nb <- SpatialPolygonsDataFrame(g2.nb, data.frame(g2.nb.dt), match.ID=F)

Spatial Weights

Need to choose between QUEEN contiguity or we can also choose the k-nearest points/shapes as neighbors using knn2nb() instead of poly2nb(). Can also assign neighbors based on a specified distance using dnearneigh().

There’s also the issue of chossing a method for the spatial weights (row-standardized, binary). Typically Row standardization is used to create proportional weights in cases where features have an unequal number of neighbors. Use Binary when you want comparable spatial parameters across different data sets with different connectivity structures.

# Generate spatial neighbour list for SSA
nb2 <- poly2nb(g2.nb, row.names=paste(g2.nb$ISO3, g2.nb$rn, sep="."))
summary(nb2)

## Neighbour list object:
## Number of regions: 1650 
## Number of nonzero links: 7598 
## Percentage nonzero weights: 0.2790817 
## Average number of links: 4.604848 
## 5 regions with no links:
## SEN.477 AGO.1052 ETH.1343 ETH.1746 ETH.1947
## Link number distribution:
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  14  15 
##   5  48 124 239 379 386 263 132  51  12   7   2   1   1 
## 48 least connected regions:
## GHA.114 NGA.187 TZA.212 TZA.216 ZMB.597 ZMB.609 KEN.901 AGO.1033 AGO.1051 AGO.1053 AGO.1080 AGO.1087 AGO.1099 AGO.1101 AGO.1104 AGO.1106 AGO.1109 BFA.1115 BFA.1119 COD.1161 ETH.1299 ETH.1345 ETH.1438 ETH.1441 ETH.1507 ETH.1583 ETH.1626 ETH.1674 ETH.1685 ETH.1692 ETH.1696 ETH.1716 ETH.1721 ETH.1759 ETH.1766 ETH.1815 ETH.1882 ETH.1934 ETH.1936 ETH.1941 ETH.1945 ETH.1953 ETH.1955 ETH.1957 ETH.1958 UGA.1968 UGA.2006 UGA.2061 with 1 link
## 1 most connected region:
## TZA.214 with 15 links

# Verify the 5 discontiguous districts
bad <- c("477", "1052", "1343", "1746", "1947")
g2.nb.dt[rn %in% bad, .(rn, ISO3, svyL1Nm, svyL2Nm, pcexp_ppp_m)]

##      rn ISO3     svyL1Nm        svyL2Nm pcexp_ppp_m
## 1:  477  SEN       Dakar         Pikine   135.95871
## 2: 1052  AGO Lunda Norte       Caungula    74.23630
## 3: 1343  ETH    gambella Gambela zuriya    71.91619
## 4: 1746  ETH     oromiya           Meyo    48.48015
## 5: 1947  ETH     oromiya         Guliso    54.08664

for(i in c("AGO", "ETH", "SEN")) {
  bb <- bbox(g2.nb[g2.nb$ISO3==i & g2.nb$rn %in% bad,])
  bb <- bb + c(-.5, -.5, .5, .5)
  print(
    tm_shape(g2, bbox=bb) + tm_borders() +
      tm_shape(g2.nb) + tm_fill("pcexp_ppp_m") + 
      tm_shape(g2.nb[g2.nb$ISO3==i & g2.nb$rn %in% bad,]) +
      tm_borders(col="red") + tm_text("svyL2Nm", col="red") +
      tm_layout(legend.outside=T)
  )}

They’re not islands, but surrounding districts have no data. Need to check a little more what’s going on, fix in QGIS if needed. Also refer to Bivand:

I did look at this 15 years ago with Boris Portnov, in the context of ESDA:

@incollection{bivand+portnov:04,
author = {R. S. Bivand and B. A. Portnov}, editor = {L. Anselin and R. J. G. M. Florax and S. J. Rey}, title = {Exploring spatial data analysis techniques using {}: the case of observations with no neighbours}, booktitle = {Advances in Spatial Econometrics: Methodology, Tools, Applications},
year = {2004},
publisher = {Springer},
address = {Berlin},
pages = {121–142} }

There are oddities in the Moran scatterplot, and also in mapping the graph-based neighbour representation into matrix form, say with the spatial lag of a no-neighbour observation’s value being zero (for zero.policy=TRUE). That paper was the basis for the zero.policy= framework. There are other consequences that you’ve found with respect to the number of subgraphs, which may or may not break formal assumptions of analysis methods. In addition, we don’t know how far the broken assumptions actually matter. This would probably be a good candidate for proper study including simulation.

Plot Contiguities

# Plot contiguities in a few countries
for (i in c("GHA", "ETH", "AGO", "SEN")) {
  
  tmp <- g2.nb[g2.nb$ISO3==i,]
  coords.tmp <- coordinates(tmp)
  nb2.tmp <- poly2nb(tmp)
  nb2.tmp
  
  plot(g2[g2$ISO3==i,], col="red", lwd=0.1)
  plot(tmp, col="grey90", lwd=0.1, add=T)
  plot(nb2.tmp, coords.tmp, col="blue", add=T)
  title(main=paste("Contiguity -", i), font.main=1)
}

# Save distance matrix for SSA (W=row standardized) to STATA for re-use
w2 <- nb2mat(nb2, style="W", zero.policy=T)
w2 <- as.data.frame(w2)
attr(w2, "var.labels") <- paste(g2$svyCode, g2$svyL1Cd, g2$svyL2Cd, sep=".")
write.dta(w2, "../out/r16.05/poverty_continguity.dta", version=12)

# Check population weights
summary(g2.nb$pop)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       79    65250   120500   291700   238700 10250000

# Spatial weights for SSA (check doco for how to include pop weights)
# Note that If zero.policy is set to TRUE, weights vectors of zero length are inserted
# for regions without neighbour in the neighbours list. 
w <- nb2listw(nb2, zero.policy=T)

Moran’s I Statistic

moran.plot(g2.nb@data[, Y], w, zero.policy=T,
  xlim=c(0, 200), ylim=c(0, 200),
  xlab=dt2.lbl[Y, varLabel], 
  ylab=paste("Spatially Lagged", Y))

#plot(
#  variogram(as.formula(paste(Y, "~1")), 
#    locations=coordinates(g2.nb), data=g2.nb, cloud=F), 
#  type="b", pch=16, main=paste("Variogram of", dt2.lbl[Y, varLabel]))

moran.mc(g2.nb@data[, Y], w, zero.policy=T, nsim=999)

## 
##  Monte-Carlo simulation of Moran I
## 
## data:  g2.nb@data[, Y] 
## weights: w  
## number of simulations + 1: 1000 
## 
## statistic = 0.61505, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

moran.plot(g2.nb@data[, X[1]], w, zero.policy=T,
  xlab=dt2.lbl[X[1], varLabel], 
  ylab=paste("Spatially Lagged", X[1]))

moran.mc(g2.nb@data[, X[1]], w, zero.policy=T, nsim=999)

## 
##  Monte-Carlo simulation of Moran I
## 
## data:  g2.nb@data[, X[1]] 
## weights: w  
## number of simulations + 1: 1000 
## 
## statistic = 0.89247, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Spatial Regressions (OLS, LAG, SAC)

# Compare models
fm <- as.formula(paste(Y, "~", paste(X, collapse="+")))
fm

## pcexp_ppp_m ~ spei_lt + L1_speihishock + L1_speiloshock

# Model 1: simple OLS
m <- lm(fm, data=g2.nb.dt, weights=1/pop)
summary(m)

## 
## Call:
## lm(formula = fm, data = g2.nb.dt, weights = 1/pop)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5946 -0.4230 -0.2702 -0.1223 10.9019 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     166.721      2.258  73.841  < 2e-16 ***
## spei_lt        -497.745     25.323 -19.656  < 2e-16 ***
## L1_speihishock -135.113     37.282  -3.624 0.000299 ***
## L1_speiloshock  -90.942     44.292  -2.053 0.040207 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6781 on 1646 degrees of freedom
## Multiple R-squared:  0.195,  Adjusted R-squared:  0.1936 
## F-statistic: 132.9 on 3 and 1646 DF,  p-value: < 2.2e-16

# Examine spatial autocorrelation among the residuals
lm.morantest(m, listw=w, zero.policy=T)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = fm, data = g2.nb.dt, weights = 1/pop)
## weights: w
## 
## Moran I statistic standard deviate = 20.147, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3460501431    -0.0008697837     0.0002965048

# Model 2: LAG model
mlagsar <- lagsarlm(fm, w, zero.policy=T, data=g2.nb.dt)
summary(mlagsar)

## 
## Call:lagsarlm(formula = fm, data = g2.nb.dt, listw = w, zero.policy = T)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -151.6546  -15.7585   -6.3551    5.7782  582.5194 
## 
## Type: lag 
## Regions with no neighbours included:
##  SEN.477 AGO.1052 ETH.1343 ETH.1746 ETH.1947 
## Coefficients: (asymptotic standard errors) 
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)     23.8100     1.7151 13.8825   <2e-16
## spei_lt          3.3433     9.5494  0.3501   0.7263
## L1_speihishock  -5.6525     6.4926 -0.8706   0.3840
## L1_speiloshock  -5.8244     6.8429 -0.8512   0.3947
## 
## Rho: 0.70984, LR test value: 995.26, p-value: < 2.22e-16
## Asymptotic standard error: 0.018093
##     z-value: 39.232, p-value: < 2.22e-16
## Wald statistic: 1539.2, p-value: < 2.22e-16
## 
## Log likelihood: -8315.603 for lag model
## ML residual variance (sigma squared): 1195.9, (sigma: 34.582)
## Number of observations: 1650 
## Number of parameters estimated: 6 
## AIC: 16643, (AIC for lm: 17636)
## LM test for residual autocorrelation
## test value: 100.13, p-value: < 2.22e-16

# Model 3: SAC model
msacsar <- sacsarlm(fm, w, zero.policy=T, data=g2.nb.dt)
summary(msacsar)

## 
## Call:sacsarlm(formula = fm, data = g2.nb.dt, listw = w, zero.policy = T)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -127.6475  -11.2404   -3.8969    6.3728  498.9994 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    122.1725     6.4924 18.8179   <2e-16
## spei_lt         19.8338    25.9401  0.7646   0.4445
## L1_speihishock  -5.4229     7.9648 -0.6809   0.4960
## L1_speiloshock  -8.8760     7.8254 -1.1343   0.2567
## 
## Rho: -0.59865
## Asymptotic standard error: 0.043201
##     z-value: -13.857, p-value: < 2.22e-16
## Lambda: 0.88784
## Asymptotic standard error: 0.012031
##     z-value: 73.794, p-value: < 2.22e-16
## 
## LR test value: 1115.8, p-value: < 2.22e-16
## 
## Log likelihood: -8255.331 for sac model
## ML residual variance (sigma squared): 888.38, (sigma: 29.806)
## Number of observations: 1650 
## Number of parameters estimated: 7 
## AIC: 16525, (AIC for lm: 17636)

# Also show impact effects of spatial models
# To understand the direct (local), indirect(spill-over), and total effect of a unit 
# change in each of the predictor variables
W <- as(w, "CsparseMatrix")
trMatc <- trW(W, type="mult")

impacts(mlagsar, tr=trMatc, R=2000)

## Impact measures (lag, trace):
##                   Direct   Indirect     Total
## spei_lt         4.021767   7.500132  11.52190
## L1_speihishock -6.799543 -12.680365 -19.47991
## L1_speiloshock -7.006400 -13.066130 -20.07253

impacts(msacsar, tr=trMatc, R=2000)

## Impact measures (sac, trace):
##                   Direct  Indirect     Total
## spei_lt        21.423800 -9.017233 12.406567
## L1_speihishock -5.857673  2.465482 -3.392190
## L1_speiloshock -9.587609  4.035405 -5.552204

summary(impacts(mlagsar, tr=trMatc, R=2000), zstats=T, short=T)

## Impact measures (lag, trace):
##                   Direct   Indirect     Total
## spei_lt         4.021767   7.500132  11.52190
## L1_speihishock -6.799543 -12.680365 -19.47991
## L1_speiloshock -7.006400 -13.066130 -20.07253
## ========================================================
## Simulation results (asymptotic variance matrix):
## ========================================================
## Simulated z-values:
##                    Direct   Indirect      Total
## spei_lt         0.3566697  0.3553799  0.3560692
## L1_speihishock -0.8520510 -0.8466720 -0.8495536
## L1_speiloshock -0.8482487 -0.8419022 -0.8450586
## 
## Simulated p-values:
##                Direct  Indirect Total  
## spei_lt        0.72134 0.72231  0.72179
## L1_speihishock 0.39419 0.39718  0.39557
## L1_speiloshock 0.39630 0.39984  0.39808

summary(impacts(msacsar, tr=trMatc, R=2000), zstats=T, short=T)

## Impact measures (sac, trace):
##                   Direct  Indirect     Total
## spei_lt        21.423800 -9.017233 12.406567
## L1_speihishock -5.857673  2.465482 -3.392190
## L1_speiloshock -9.587609  4.035405 -5.552204
## ========================================================
## Simulation results (asymptotic variance matrix):
## ========================================================
## Simulated z-values:
##                    Direct   Indirect      Total
## spei_lt         0.7512922 -0.7499719  0.7502616
## L1_speihishock -0.6760830  0.6746123 -0.6754650
## L1_speiloshock -1.1259427  1.1209913 -1.1250902
## 
## Simulated p-values:
##                Direct  Indirect Total  
## spei_lt        0.45248 0.45327  0.45310
## L1_speihishock 0.49899 0.49992  0.49938
## L1_speiloshock 0.26019 0.26229  0.26055

The output from impacts() in the LAG model says that a 1 point increase in long-term SPEI leads to an increase in expenditure of PPP $11/month. A 1 sd increase in drought leads to a reduction in expenditure of PPP $20/month.

# Label 20 districts at random
rnd <- sample(1:nrow(g2.nb.dt), 20)

# Plot OLS
plot(m$model$pcexp_ppp_m, m$fitted.values,
  main=m$call, xlab=fm, cex=.5, pch=16,
  xlim=c(0, 300), ylim=c(0, 300))

text(m$model$pcexp_ppp_m[rnd], m$fitted.values[rnd], 
  labels=g2.nb.dt[rnd, svyL2Nm],
  cex=.6, pos=4)

# And residuals x fitted values
plot(m, which=3)

# Plot LAG
plot(mlagsar$y, mlagsar$fitted.values,
  main=mlagsar$call, xlab=fm, cex=.5, pch=16,
  xlim=c(0, 300), ylim=c(0, 300))

text(mlagsar$y[rnd], mlagsar$fitted.values[rnd], 
  labels=g2.nb.dt[rnd, svyL2Nm],
  cex=.6, pos=4)

# Plot SAC
plot(msacsar$y, msacsar$fitted.values,
  main=msacsar$call, xlab=fm, cex=.5, pch=16,
  xlim=c(0, 300), ylim=c(0, 300))

text(msacsar$y[rnd], msacsar$fitted.values[rnd], 
  labels=g2.nb.dt[rnd, svyL2Nm],
  cex=.6, pos=4)

Geographically Weighted Regression

See Bivand at https://cran.r-project.org/web/packages/spgwr/vignettes/GWR.pdf and Anselin http://www.csiss.org/gispopsci/workshops/2011/PSU/readings/W15_Anselin2007.pdf. Also Brunsdon http://rpubs.com/chrisbrunsdon/101305.

Note that sampling weights are not implemented. Choosing a method to estimate optimal bandwidth is unclear (check doco), also not clear how to choose a kernel function (default Gaussian).

# Load package
library(spgwr)

# Try GWR/LM on same model as above (pass shapes, will return shapes with coeff)
bwG <- gwr.sel(fm, data=g2.nb, gweight=gwr.Gauss, verbose=FALSE)
gwrG <- gwr(fm, data=g2.nb, bandwidth=bwG, gweight=gwr.Gauss, hatmatrix=TRUE)
gwrG

## Call:
## gwr(formula = fm, data = g2.nb, bandwidth = bwG, gweight = gwr.Gauss, 
##     hatmatrix = TRUE)
## Kernel function: gwr.Gauss 
## Fixed bandwidth: 171.1847 
## Summary of GWR coefficient estimates at data points:
##                     Min.   1st Qu.    Median   3rd Qu.      Max.   Global
## X.Intercept.      28.450    52.690    72.370    78.360   429.000  78.8598
## spei_lt        -2120.000   -14.840    62.750   153.200   760.500  15.4804
## L1_speihishock  -237.000   -11.390    30.190    46.070   192.200 -12.0761
## L1_speiloshock  -813.000   -14.870    -5.089    18.520    58.010  -5.2121
## Number of data points: 1650 
## Effective number of parameters (residual: 2traceS - traceS'S): 189.859 
## Effective degrees of freedom (residual: 2traceS - traceS'S): 1460.141 
## Sigma (residual: 2traceS - traceS'S): 33.86525 
## Effective number of parameters (model: traceS): 143.1904 
## Effective degrees of freedom (model: traceS): 1506.81 
## Sigma (model: traceS): 33.33669 
## Sigma (ML): 31.85736 
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 16420.89 
## AIC (GWR p. 96, eq. 4.22): 16247.87 
## Residual sum of squares: 1674570 
## Quasi-global R2: 0.6031909

# Map coefficients
data(World)

for (i in X) {print(
    tm_shape(gwrG$SDF, is.master=T) + 
      tm_fill(i, palette="RdYlGn", style="jenks", n=9,
        title=stringr::str_wrap(dt2.lbl[Y, varLabel], 30)) +
      tm_shape(World) + tm_borders(lwd=0.1) +
      tm_layout(
        title=dt2.lbl[i, varLabel],
        title.snap.to.legend=T, legend.outside=T)
  )}

# Could also try a GWR/GLM on same model as above
bwG <- ggwr.sel(fm, data=g2.nb, gweight=gwr.Gauss, verbose=FALSE)
gwrG <- ggwr(fm, data=g2.nb, bandwidth=bwG, gweight=gwr.Gauss, hatmatrix=TRUE,
  family="poisson")