library(rgdal)
library(foreign)
library(data.table)
library(spdep)
library(tmap)
# Load districts and attributes
g2 <- readOGR("../out/r16.05/shapefiles", "svyMaps_2016.06.22_sara")
dt2 <- read.dta("../temp/2016.05/SSApoverty_Dist_forGWR.12.dta")
# Keep STATA labels for re-use
dt2.lbl <- data.table(varCode=names(dt2), varLabel=attr(dt2, "var.labels"))
setkey(dt2.lbl, varCode)
dt2.lbl[is.na(varLabel), varLabel := varCode]
dt2 <- data.table(dt2)
# Make unique shape IDs explicit
g2.dt <- data.table(g2@data)
g2.dt[, rn := row.names(g2)]
g2$rn <- row.names(g2)
# Recode Ethiopia woredas
dt2[svyCode=="eth2010", svyL2Cd := svyL1Cd * 10000 + svyL2Cd]
# Merge shapes and attributes
setkey(g2.dt, ISO3, svyCode, svyL1Cd, svyL2Cd)
setkey(dt2, ISO3, svyCode, svyL1Cd, svyL2Cd)
# Look for possible duplicates
g2.dt[duplicated(dt2), .(ISO3, svyCode, svyL1Cd, svyL2Cd)]
dt2[duplicated(dt2), .(ISO3, svyCode, svyL1Cd, svyL2Cd)]
# Drop duplicated vars from Sara's file before merging
dt2[, `:=`(
rn=NULL, svyL1Nm=NULL, svyL2Nm=NULL, prttyNm=NULL, areakm=NULL, X=NULL, Y=NULL)]
# Any unmatched obs?
dt2[!g2.dt][, .N, by=svyCode]
## Empty data.table (0 rows) of 2 cols: svyCode,N
# Seems okay, so merge
g2.dt <- dt2[g2.dt]
# Re-attach Sara's attributes to shapes
g2 <- SpatialPolygonsDataFrame(g2, data.frame(g2.dt), match.ID="rn")
# Visually check a few vars
tm_shape(g2) + tm_polygons("ndvi_ave", title=dt2.lbl["ndvi_ave", varLabel],
borders.lwd=0.1, borders.col="white", style="pretty", n=9, palette="RdYlGn", auto=F)
tm_shape(g2) + tm_fill("pcexp_ppp_m", title=dt2.lbl["pcexp_ppp_m", varLabel],
borders.lwd=0.1, borders.col="white", style="jenks", n=9, auot=F)
tm_shape(g2) + tm_fill("spei_lt", title=dt2.lbl["spei_lt", varLabel],
borders.lwd=0.1, borders.col="white", style="jenks", n=9, palette="RdYlGn", auto=F)
Y <- "pcexp_ppp_m"
#Y <- "foodexp_ppp_m"
#Y <- "foodshare"
#Y <- "wealth_index_all"
X <- c("spei_lt", "L1_speihishock", "L1_speiloshock")
# X <- c("spei_lt", "L1_speidif", "L1_speidif2")
# X <- c("pre_lt", "temp_lt", "L1_prehishock", "L1_preloshock", "L1_temphishock")
# Drop shapes with missing outcome values (not included in model)
g2.nb <- g2[!is.na(g2$pcexp_ppp_m),]
# Impute missing X values with regional median
g2.nb.dt <- data.table(g2.nb@data)
g2.nb.dt[, spei_lt_imp := median(spei_lt, na.rm=T), by=.(svyCode, svyL1Cd)]
g2.nb.dt[is.na(spei_lt), spei_lt := spei_lt_imp]
g2.nb.dt[is.na(spei_lt), .N, by=svyCode]
# Impute still missing X values with national median
g2.nb.dt[, spei_lt_imp := median(spei_lt, na.rm=T), by=svyCode]
g2.nb.dt[is.na(spei_lt), spei_lt := spei_lt_imp]
# Re-attach imputed attributes to shapes
g2.nb <- SpatialPolygonsDataFrame(g2.nb, data.frame(g2.nb.dt), match.ID=F)
Need to choose between QUEEN contiguity or we can also choose the k-nearest points/shapes as neighbors using knn2nb() instead of poly2nb(). Can also assign neighbors based on a specified distance using dnearneigh().
There’s also the issue of chossing a method for the spatial weights (row-standardized, binary). Typically Row standardization is used to create proportional weights in cases where features have an unequal number of neighbors. Use Binary when you want comparable spatial parameters across different data sets with different connectivity structures.
# Generate spatial neighbour list for SSA
nb2 <- poly2nb(g2.nb, row.names=paste(g2.nb$ISO3, g2.nb$rn, sep="."))
summary(nb2)
## Neighbour list object:
## Number of regions: 1650
## Number of nonzero links: 7598
## Percentage nonzero weights: 0.2790817
## Average number of links: 4.604848
## 5 regions with no links:
## SEN.477 AGO.1052 ETH.1343 ETH.1746 ETH.1947
## Link number distribution:
##
## 0 1 2 3 4 5 6 7 8 9 10 11 14 15
## 5 48 124 239 379 386 263 132 51 12 7 2 1 1
## 48 least connected regions:
## GHA.114 NGA.187 TZA.212 TZA.216 ZMB.597 ZMB.609 KEN.901 AGO.1033 AGO.1051 AGO.1053 AGO.1080 AGO.1087 AGO.1099 AGO.1101 AGO.1104 AGO.1106 AGO.1109 BFA.1115 BFA.1119 COD.1161 ETH.1299 ETH.1345 ETH.1438 ETH.1441 ETH.1507 ETH.1583 ETH.1626 ETH.1674 ETH.1685 ETH.1692 ETH.1696 ETH.1716 ETH.1721 ETH.1759 ETH.1766 ETH.1815 ETH.1882 ETH.1934 ETH.1936 ETH.1941 ETH.1945 ETH.1953 ETH.1955 ETH.1957 ETH.1958 UGA.1968 UGA.2006 UGA.2061 with 1 link
## 1 most connected region:
## TZA.214 with 15 links
# Verify the 5 discontiguous districts
bad <- c("477", "1052", "1343", "1746", "1947")
g2.nb.dt[rn %in% bad, .(rn, ISO3, svyL1Nm, svyL2Nm, pcexp_ppp_m)]
## rn ISO3 svyL1Nm svyL2Nm pcexp_ppp_m
## 1: 477 SEN Dakar Pikine 135.95871
## 2: 1052 AGO Lunda Norte Caungula 74.23630
## 3: 1343 ETH gambella Gambela zuriya 71.91619
## 4: 1746 ETH oromiya Meyo 48.48015
## 5: 1947 ETH oromiya Guliso 54.08664
for(i in c("AGO", "ETH", "SEN")) {
bb <- bbox(g2.nb[g2.nb$ISO3==i & g2.nb$rn %in% bad,])
bb <- bb + c(-.5, -.5, .5, .5)
print(
tm_shape(g2, bbox=bb) + tm_borders() +
tm_shape(g2.nb) + tm_fill("pcexp_ppp_m") +
tm_shape(g2.nb[g2.nb$ISO3==i & g2.nb$rn %in% bad,]) +
tm_borders(col="red") + tm_text("svyL2Nm", col="red") +
tm_layout(legend.outside=T)
)}
They’re not islands, but surrounding districts have no data. Need to check a little more what’s going on, fix in QGIS if needed. Also refer to Bivand:
I did look at this 15 years ago with Boris Portnov, in the context of ESDA:
@incollection{bivand+portnov:04,
author = {R. S. Bivand and B. A. Portnov}, editor = {L. Anselin and R. J. G. M. Florax and S. J. Rey}, title = {Exploring spatial data analysis techniques using {}: the case of observations with no neighbours}, booktitle = {Advances in Spatial Econometrics: Methodology, Tools, Applications},
year = {2004},
publisher = {Springer},
address = {Berlin},
pages = {121–142} }There are oddities in the Moran scatterplot, and also in mapping the graph-based neighbour representation into matrix form, say with the spatial lag of a no-neighbour observation’s value being zero (for zero.policy=TRUE). That paper was the basis for the zero.policy= framework. There are other consequences that you’ve found with respect to the number of subgraphs, which may or may not break formal assumptions of analysis methods. In addition, we don’t know how far the broken assumptions actually matter. This would probably be a good candidate for proper study including simulation.
# Plot contiguities in a few countries
for (i in c("GHA", "ETH", "AGO", "SEN")) {
tmp <- g2.nb[g2.nb$ISO3==i,]
coords.tmp <- coordinates(tmp)
nb2.tmp <- poly2nb(tmp)
nb2.tmp
plot(g2[g2$ISO3==i,], col="red", lwd=0.1)
plot(tmp, col="grey90", lwd=0.1, add=T)
plot(nb2.tmp, coords.tmp, col="blue", add=T)
title(main=paste("Contiguity -", i), font.main=1)
}
# Save distance matrix for SSA (W=row standardized) to STATA for re-use
w2 <- nb2mat(nb2, style="W", zero.policy=T)
w2 <- as.data.frame(w2)
attr(w2, "var.labels") <- paste(g2$svyCode, g2$svyL1Cd, g2$svyL2Cd, sep=".")
write.dta(w2, "../out/r16.05/poverty_continguity.dta", version=12)
# Check population weights
summary(g2.nb$pop)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 79 65250 120500 291700 238700 10250000
# Spatial weights for SSA (check doco for how to include pop weights)
# Note that If zero.policy is set to TRUE, weights vectors of zero length are inserted
# for regions without neighbour in the neighbours list.
w <- nb2listw(nb2, zero.policy=T)
moran.plot(g2.nb@data[, Y], w, zero.policy=T,
xlim=c(0, 200), ylim=c(0, 200),
xlab=dt2.lbl[Y, varLabel],
ylab=paste("Spatially Lagged", Y))
#plot(
# variogram(as.formula(paste(Y, "~1")),
# locations=coordinates(g2.nb), data=g2.nb, cloud=F),
# type="b", pch=16, main=paste("Variogram of", dt2.lbl[Y, varLabel]))
moran.mc(g2.nb@data[, Y], w, zero.policy=T, nsim=999)
##
## Monte-Carlo simulation of Moran I
##
## data: g2.nb@data[, Y]
## weights: w
## number of simulations + 1: 1000
##
## statistic = 0.61505, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
moran.plot(g2.nb@data[, X[1]], w, zero.policy=T,
xlab=dt2.lbl[X[1], varLabel],
ylab=paste("Spatially Lagged", X[1]))
moran.mc(g2.nb@data[, X[1]], w, zero.policy=T, nsim=999)
##
## Monte-Carlo simulation of Moran I
##
## data: g2.nb@data[, X[1]]
## weights: w
## number of simulations + 1: 1000
##
## statistic = 0.89247, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
# Compare models
fm <- as.formula(paste(Y, "~", paste(X, collapse="+")))
fm
## pcexp_ppp_m ~ spei_lt + L1_speihishock + L1_speiloshock
# Model 1: simple OLS
m <- lm(fm, data=g2.nb.dt, weights=1/pop)
summary(m)
##
## Call:
## lm(formula = fm, data = g2.nb.dt, weights = 1/pop)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -4.5946 -0.4230 -0.2702 -0.1223 10.9019
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 166.721 2.258 73.841 < 2e-16 ***
## spei_lt -497.745 25.323 -19.656 < 2e-16 ***
## L1_speihishock -135.113 37.282 -3.624 0.000299 ***
## L1_speiloshock -90.942 44.292 -2.053 0.040207 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6781 on 1646 degrees of freedom
## Multiple R-squared: 0.195, Adjusted R-squared: 0.1936
## F-statistic: 132.9 on 3 and 1646 DF, p-value: < 2.2e-16
# Examine spatial autocorrelation among the residuals
lm.morantest(m, listw=w, zero.policy=T)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = fm, data = g2.nb.dt, weights = 1/pop)
## weights: w
##
## Moran I statistic standard deviate = 20.147, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.3460501431 -0.0008697837 0.0002965048
# Model 2: LAG model
mlagsar <- lagsarlm(fm, w, zero.policy=T, data=g2.nb.dt)
summary(mlagsar)
##
## Call:lagsarlm(formula = fm, data = g2.nb.dt, listw = w, zero.policy = T)
##
## Residuals:
## Min 1Q Median 3Q Max
## -151.6546 -15.7585 -6.3551 5.7782 582.5194
##
## Type: lag
## Regions with no neighbours included:
## SEN.477 AGO.1052 ETH.1343 ETH.1746 ETH.1947
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 23.8100 1.7151 13.8825 <2e-16
## spei_lt 3.3433 9.5494 0.3501 0.7263
## L1_speihishock -5.6525 6.4926 -0.8706 0.3840
## L1_speiloshock -5.8244 6.8429 -0.8512 0.3947
##
## Rho: 0.70984, LR test value: 995.26, p-value: < 2.22e-16
## Asymptotic standard error: 0.018093
## z-value: 39.232, p-value: < 2.22e-16
## Wald statistic: 1539.2, p-value: < 2.22e-16
##
## Log likelihood: -8315.603 for lag model
## ML residual variance (sigma squared): 1195.9, (sigma: 34.582)
## Number of observations: 1650
## Number of parameters estimated: 6
## AIC: 16643, (AIC for lm: 17636)
## LM test for residual autocorrelation
## test value: 100.13, p-value: < 2.22e-16
# Model 3: SAC model
msacsar <- sacsarlm(fm, w, zero.policy=T, data=g2.nb.dt)
summary(msacsar)
##
## Call:sacsarlm(formula = fm, data = g2.nb.dt, listw = w, zero.policy = T)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127.6475 -11.2404 -3.8969 6.3728 498.9994
##
## Type: sac
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 122.1725 6.4924 18.8179 <2e-16
## spei_lt 19.8338 25.9401 0.7646 0.4445
## L1_speihishock -5.4229 7.9648 -0.6809 0.4960
## L1_speiloshock -8.8760 7.8254 -1.1343 0.2567
##
## Rho: -0.59865
## Asymptotic standard error: 0.043201
## z-value: -13.857, p-value: < 2.22e-16
## Lambda: 0.88784
## Asymptotic standard error: 0.012031
## z-value: 73.794, p-value: < 2.22e-16
##
## LR test value: 1115.8, p-value: < 2.22e-16
##
## Log likelihood: -8255.331 for sac model
## ML residual variance (sigma squared): 888.38, (sigma: 29.806)
## Number of observations: 1650
## Number of parameters estimated: 7
## AIC: 16525, (AIC for lm: 17636)
# Also show impact effects of spatial models
# To understand the direct (local), indirect(spill-over), and total effect of a unit
# change in each of the predictor variables
W <- as(w, "CsparseMatrix")
trMatc <- trW(W, type="mult")
impacts(mlagsar, tr=trMatc, R=2000)
## Impact measures (lag, trace):
## Direct Indirect Total
## spei_lt 4.021767 7.500132 11.52190
## L1_speihishock -6.799543 -12.680365 -19.47991
## L1_speiloshock -7.006400 -13.066130 -20.07253
impacts(msacsar, tr=trMatc, R=2000)
## Impact measures (sac, trace):
## Direct Indirect Total
## spei_lt 21.423800 -9.017233 12.406567
## L1_speihishock -5.857673 2.465482 -3.392190
## L1_speiloshock -9.587609 4.035405 -5.552204
summary(impacts(mlagsar, tr=trMatc, R=2000), zstats=T, short=T)
## Impact measures (lag, trace):
## Direct Indirect Total
## spei_lt 4.021767 7.500132 11.52190
## L1_speihishock -6.799543 -12.680365 -19.47991
## L1_speiloshock -7.006400 -13.066130 -20.07253
## ========================================================
## Simulation results (asymptotic variance matrix):
## ========================================================
## Simulated z-values:
## Direct Indirect Total
## spei_lt 0.3566697 0.3553799 0.3560692
## L1_speihishock -0.8520510 -0.8466720 -0.8495536
## L1_speiloshock -0.8482487 -0.8419022 -0.8450586
##
## Simulated p-values:
## Direct Indirect Total
## spei_lt 0.72134 0.72231 0.72179
## L1_speihishock 0.39419 0.39718 0.39557
## L1_speiloshock 0.39630 0.39984 0.39808
summary(impacts(msacsar, tr=trMatc, R=2000), zstats=T, short=T)
## Impact measures (sac, trace):
## Direct Indirect Total
## spei_lt 21.423800 -9.017233 12.406567
## L1_speihishock -5.857673 2.465482 -3.392190
## L1_speiloshock -9.587609 4.035405 -5.552204
## ========================================================
## Simulation results (asymptotic variance matrix):
## ========================================================
## Simulated z-values:
## Direct Indirect Total
## spei_lt 0.7512922 -0.7499719 0.7502616
## L1_speihishock -0.6760830 0.6746123 -0.6754650
## L1_speiloshock -1.1259427 1.1209913 -1.1250902
##
## Simulated p-values:
## Direct Indirect Total
## spei_lt 0.45248 0.45327 0.45310
## L1_speihishock 0.49899 0.49992 0.49938
## L1_speiloshock 0.26019 0.26229 0.26055
The output from impacts() in the LAG model says that a 1 point increase in long-term SPEI leads to an increase in expenditure of PPP $11/month. A 1 sd increase in drought leads to a reduction in expenditure of PPP $20/month.
# Label 20 districts at random
rnd <- sample(1:nrow(g2.nb.dt), 20)
# Plot OLS
plot(m$model$pcexp_ppp_m, m$fitted.values,
main=m$call, xlab=fm, cex=.5, pch=16,
xlim=c(0, 300), ylim=c(0, 300))
text(m$model$pcexp_ppp_m[rnd], m$fitted.values[rnd],
labels=g2.nb.dt[rnd, svyL2Nm],
cex=.6, pos=4)
# And residuals x fitted values
plot(m, which=3)
# Plot LAG
plot(mlagsar$y, mlagsar$fitted.values,
main=mlagsar$call, xlab=fm, cex=.5, pch=16,
xlim=c(0, 300), ylim=c(0, 300))
text(mlagsar$y[rnd], mlagsar$fitted.values[rnd],
labels=g2.nb.dt[rnd, svyL2Nm],
cex=.6, pos=4)
# Plot SAC
plot(msacsar$y, msacsar$fitted.values,
main=msacsar$call, xlab=fm, cex=.5, pch=16,
xlim=c(0, 300), ylim=c(0, 300))
text(msacsar$y[rnd], msacsar$fitted.values[rnd],
labels=g2.nb.dt[rnd, svyL2Nm],
cex=.6, pos=4)
See Bivand at https://cran.r-project.org/web/packages/spgwr/vignettes/GWR.pdf and Anselin http://www.csiss.org/gispopsci/workshops/2011/PSU/readings/W15_Anselin2007.pdf. Also Brunsdon http://rpubs.com/chrisbrunsdon/101305.
Note that sampling weights are not implemented. Choosing a method to estimate optimal bandwidth is unclear (check doco), also not clear how to choose a kernel function (default Gaussian).
# Load package
library(spgwr)
# Try GWR/LM on same model as above (pass shapes, will return shapes with coeff)
bwG <- gwr.sel(fm, data=g2.nb, gweight=gwr.Gauss, verbose=FALSE)
gwrG <- gwr(fm, data=g2.nb, bandwidth=bwG, gweight=gwr.Gauss, hatmatrix=TRUE)
gwrG
## Call:
## gwr(formula = fm, data = g2.nb, bandwidth = bwG, gweight = gwr.Gauss,
## hatmatrix = TRUE)
## Kernel function: gwr.Gauss
## Fixed bandwidth: 171.1847
## Summary of GWR coefficient estimates at data points:
## Min. 1st Qu. Median 3rd Qu. Max. Global
## X.Intercept. 28.450 52.690 72.370 78.360 429.000 78.8598
## spei_lt -2120.000 -14.840 62.750 153.200 760.500 15.4804
## L1_speihishock -237.000 -11.390 30.190 46.070 192.200 -12.0761
## L1_speiloshock -813.000 -14.870 -5.089 18.520 58.010 -5.2121
## Number of data points: 1650
## Effective number of parameters (residual: 2traceS - traceS'S): 189.859
## Effective degrees of freedom (residual: 2traceS - traceS'S): 1460.141
## Sigma (residual: 2traceS - traceS'S): 33.86525
## Effective number of parameters (model: traceS): 143.1904
## Effective degrees of freedom (model: traceS): 1506.81
## Sigma (model: traceS): 33.33669
## Sigma (ML): 31.85736
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 16420.89
## AIC (GWR p. 96, eq. 4.22): 16247.87
## Residual sum of squares: 1674570
## Quasi-global R2: 0.6031909
# Map coefficients
data(World)
for (i in X) {print(
tm_shape(gwrG$SDF, is.master=T) +
tm_fill(i, palette="RdYlGn", style="jenks", n=9,
title=stringr::str_wrap(dt2.lbl[Y, varLabel], 30)) +
tm_shape(World) + tm_borders(lwd=0.1) +
tm_layout(
title=dt2.lbl[i, varLabel],
title.snap.to.legend=T, legend.outside=T)
)}
# Could also try a GWR/GLM on same model as above
bwG <- ggwr.sel(fm, data=g2.nb, gweight=gwr.Gauss, verbose=FALSE)
gwrG <- ggwr(fm, data=g2.nb, bandwidth=bwG, gweight=gwr.Gauss, hatmatrix=TRUE,
family="poisson")