This is a combination of the exercise in the lecture slides: Spatial Analysis and Regression I.pdf and the rpub http://rpubs.com/Seth/LagEquibEff
# install.packages('spdep') install.packages('gstat')
# install.packages('classInt')
library(RColorBrewer)
library(spdep)
## Loading required package: sp
## Loading required package: boot
## Loading required package: Matrix
## Loading required package: lattice
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: MASS
## Loading required package: nlme
## Loading required package: maptools
## Loading required package: foreign
## Loading required package: grid
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
## Loading required package: deldir
## deldir 0.0-21
## Loading required package: coda
## Loading required package: splines
library(gstat)
library(classInt)
## Loading required package: class
## Loading required package: e1071
load("/Users/dgetman/WorkRelated/GradSchool/Current_Courses/GEOG_5023/Data/soco.rda")
# Generate the matrix of binary neighbors (Queen’s)
soco_nbq <- poly2nb(soco)
# Use the Neighbors to generate the row standardized weights matrix
soco_nbq_w <- nb2listw(soco_nbq)
## THESE TWO ABOVE STEPS ARE SEPRATE FOR A REASON, WHY? Because you need
## the binary neighbors to get the total number of neighbors in order to
## calculate the standardized rows
## GENERALLY YOU HAVE TO PREFORM BOTH BEFORE BEFORE ANY SPATIAL ANALYSIS.
# This calculates a simulation of 999 randomly generated 'maps' within
# which our data are ranked.
MyMoran <- moran.mc(soco$PPOV, listw = soco_nbq_w, nsim = 999)
MyMoran
##
## Monte-Carlo simulation of Moran's I
##
## data: soco$PPOV
## weights: soco_nbq_w
## number of simulations + 1: 1000
##
## statistic = 0.5893, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
hist(MyMoran$res, breaks = 50)
# GUESS WHICH OBSERVATION IS OUTLIER The outlier is showing the value of
# the moran's I from our data. It is obviously a rare event.
# This plots the percent poverty against the lag of percent poverty to
# help spot relationships between neighbors. Looking at Washington DC is
# telling because the high poverty and low lag show it is surrounded by
# less poverty
mp <- moran.plot(soco$PPOV, soco_nbq_w, labels = as.character(soco$CNTY_ST),
xlab = "Percent Poverty", ylab = "Lag of Percent Poverty")
# Calculate the local Moran for each point in the dataset using the
# weights matrix
locm <- localmoran(soco$PPOV, soco_nbq_w)
summary(locm)
## Ii E.Ii Var.Ii Z.Ii
## Min. :-1.778 Min. :-0.000722 Min. :0.0901 Min. :-3.983
## 1st Qu.: 0.012 1st Qu.:-0.000722 1st Qu.:0.1420 1st Qu.: 0.029
## Median : 0.219 Median :-0.000722 Median :0.1658 Median : 0.517
## Mean : 0.589 Mean :-0.000722 Mean :0.1861 Mean : 1.408
## 3rd Qu.: 0.746 3rd Qu.:-0.000722 3rd Qu.:0.1990 3rd Qu.: 1.761
## Max. : 9.335 Max. :-0.000722 Max. :0.9981 Max. :18.977
## Pr(z > 0)
## Min. :0.0000
## 1st Qu.:0.0391
## Median :0.3025
## Mean :0.2900
## 3rd Qu.:0.4886
## Max. :1.0000
## Manually make a moran plot Need to standardize variables
soco$sPPOV <- scale(soco$PPOV) #save to a new column
head(soco@data[, c("sPPOV", "PPOV")])
## sPPOV PPOV
## 0 -0.9591 0.1372
## 1 -0.9929 0.1341
## 2 1.6199 0.3725
## 3 0.6180 0.2811
## 4 -0.9803 0.1353
## 5 2.4691 0.4500
# create a lagged variable
soco$lag_sPPOV <- lag.listw(soco_nbq_w, soco$sPPOV)
summary(soco$sPPOV)
## V1
## Min. :-2.151
## 1st Qu.:-0.700
## Median :-0.123
## Mean : 0.000
## 3rd Qu.: 0.591
## Max. : 4.062
summary(soco$lag_sPPOV)
## V1
## Min. :-1.9925
## 1st Qu.:-0.5298
## Median :-0.0762
## Mean : 0.0085
## 3rd Qu.: 0.4514
## Max. : 2.6478
plot(x = soco$sPPOV, y = soco$lag_sPPOV, main = "Moran Scatterplot PPOV")
abline(h = 0, v = 0)
abline(lm(soco$lag_sPPOV ~ soco$sPPOV), lty = 3, lwd = 4, col = "red")
## Check Out the outliers click on one or two and then hit escape (or
## click finish)
identify(soco$sPPOV, soco$lag_sPPOV, soco$CNTY_ST, cex = 0.8)
## integer(0)
## Identify the Moran plot quadrant for each observation This is some
## serious slicing and illustrate the power of the bracket...
## soco$quad_sig <- NA
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV >= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 1
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV <= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 2
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV <= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 3
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV <= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 4
# This might be wrong somehow
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV >= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 5 #WE ASSIGN A 5 TO ALL NON-SIGNIFICANT OBSERVATIONS
##### MAP THE RESULTS (courtesy of Paul Voss) ############# # Set the
##### breaks for the thematic map classes
breaks <- seq(1, 5, 1)
# Set the corresponding labels for the thematic map classes
labels <- c("high-High", "low-Low", "High-Low", "Low-High", "Not Signif.")
# see ?findInterval - This is necessary for making a map
np <- findInterval(soco$quad_sig, breaks)
# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")
plot(soco, col = colors[np]) #colors[np] manually sets the color for each county
mtext("Local Moran's I", cex = 1.5, side = 3, line = 1)
legend("topleft", legend = labels, fill = colors, bty = "n")
cor10 <- sp.correlogram(soco_nbq, soco$PPOV, order = 10, method = "I")
cor10
## Spatial correlogram for soco$PPOV
## method: Moran's I
## estimate expectation variance standard deviate Pr(I) two sided
## 1 (1387) 5.89e-01 -7.22e-04 2.61e-04 36.54 < 2e-16
## 2 (1387) 4.58e-01 -7.22e-04 1.30e-04 40.20 < 2e-16
## 3 (1387) 3.34e-01 -7.22e-04 8.85e-05 35.60 < 2e-16
## 4 (1387) 2.40e-01 -7.22e-04 7.03e-05 28.69 < 2e-16
## 5 (1387) 1.48e-01 -7.22e-04 5.90e-05 19.39 < 2e-16
## 6 (1387) 8.35e-02 -7.22e-04 5.03e-05 11.88 < 2e-16
## 7 (1387) 3.09e-02 -7.22e-04 4.33e-05 4.81 1.5e-06
## 8 (1387) -1.46e-02 -7.22e-04 3.83e-05 -2.24 0.025
## 9 (1387) -5.41e-02 -7.22e-04 3.47e-05 -9.06 < 2e-16
## 10 (1387) -7.38e-02 -7.22e-04 3.23e-05 -12.85 < 2e-16
##
## 1 (1387) ***
## 2 (1387) ***
## 3 (1387) ***
## 4 (1387) ***
## 5 (1387) ***
## 6 (1387) ***
## 7 (1387) ***
## 8 (1387) *
## 9 (1387) ***
## 10 (1387) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(cor10)
plot(variogram(soco$PPOV ~ 1, locations = coordinates(soco), data = soco, cloud = F),
type = "b")
sp.correlogram(soco_nbq, soco$PPOV, order = 10, method = "corr")
## Spatial correlogram for soco$PPOV
## method: Spatial autocorrelation
## 1 2 3 4 5 6 7 8
## 0.75938 0.69002 0.58062 0.47335 0.33112 0.20307 0.08188 -0.04172
## 9 10
## -0.16401 -0.22568
soco_OLS <- lm(PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco)
summary(soco_OLS)
##
## Call:
## lm(formula = PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26570 -0.03711 -0.00785 0.03294 0.28733
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.05544 0.00891 -6.22 6.6e-10 ***
## PFHH 0.45748 0.04941 9.26 < 2e-16 ***
## PUNEM 2.18109 0.07334 29.74 < 2e-16 ***
## PBLK -0.05919 0.01878 -3.15 0.0017 **
## P65UP 0.38023 0.04276 8.89 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0576 on 1382 degrees of freedom
## Multiple R-squared: 0.603, Adjusted R-squared: 0.601
## F-statistic: 524 on 4 and 1382 DF, p-value: <2e-16
plot(soco_OLS) #Notice non-normal residuals, right skew
soco$resid <- resid(soco_OLS) #save the residuals
moran.mc(soco$resid, soco_nbq_w, nsim = 999)
##
## Monte-Carlo simulation of Moran's I
##
## data: soco$resid
## weights: soco_nbq_w
## number of simulations + 1: 1000
##
## statistic = 0.3609, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
soco_LAG <- lagsarlm(PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco, soco_nbq_w)
summary(soco_LAG)
##
## Call:lagsarlm(formula = PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco,
## listw = soco_nbq_w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2457653 -0.0284360 -0.0028762 0.0262169 0.2374894
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.100260 0.007375 -13.5946 < 2.2e-16
## PFHH 0.429404 0.040246 10.6695 < 2.2e-16
## PUNEM 1.354637 0.065959 20.5374 < 2.2e-16
## PBLK -0.069046 0.015335 -4.5025 6.716e-06
## P65UP 0.291192 0.035210 8.2701 2.220e-16
##
## Rho: 0.5172, LR test value: 491.5, p-value: < 2.22e-16
## Asymptotic standard error: 0.02118
## z-value: 24.42, p-value: < 2.22e-16
## Wald statistic: 596.3, p-value: < 2.22e-16
##
## Log likelihood: 2239 for lag model
## ML residual variance (sigma squared): 0.002193, (sigma: 0.04682)
## Number of observations: 1387
## Number of parameters estimated: 7
## AIC: -4464, (AIC for lm: -3974)
## LM test for residual autocorrelation
## test value: 4.85, p-value: 0.027651
#### This is where we switch from the lecture to the rpub
# Our goal will be to model the percent of children living in poverty in
# the southeastern USA. This variable shows significant spatial
# auto-correlation, but the nugget effect is also substantial.
plot(variogram(soco$PPOV ~ 1, locations = coordinates(soco), data = soco, cloud = F),
type = "b", pch = 16, main = "Variogram of PPOV")
# The Moran's I also indicates significant spatial autocorrelation in
# poverty: By doing this, we can see where our samplemwould be in a group
# of 1000 samples
moran.mc(soco$PPOV, soco_nbq_w, nsim = 999)
##
## Monte-Carlo simulation of Moran's I
##
## data: soco$PPOV
## weights: soco_nbq_w
## number of simulations + 1: 1000
##
## statistic = 0.5893, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
# This variogram and Moran's I indicate that there is spatial
# autocorrelation in childhood poverty (PPOV) but this spatial pattern
# could be cause by the spatial patterning of the covariates of poverty. A
# full analysis would involve fitting an OLS model (weighted by
# population) and comparing that model to spatial models. Here, we'll
# assume that spatial lag model is appropriate (I'll fit an unweighted
# model for simplicity).
soco_LAG <- lagsarlm(PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco, soco_nbq_w)
summary(soco_LAG)
##
## Call:lagsarlm(formula = PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco,
## listw = soco_nbq_w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2457653 -0.0284360 -0.0028762 0.0262169 0.2374894
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.100260 0.007375 -13.5946 < 2.2e-16
## PFHH 0.429404 0.040246 10.6695 < 2.2e-16
## PUNEM 1.354637 0.065959 20.5374 < 2.2e-16
## PBLK -0.069046 0.015335 -4.5025 6.716e-06
## P65UP 0.291192 0.035210 8.2701 2.220e-16
##
## Rho: 0.5172, LR test value: 491.5, p-value: < 2.22e-16
## Asymptotic standard error: 0.02118
## z-value: 24.42, p-value: < 2.22e-16
## Wald statistic: 596.3, p-value: < 2.22e-16
##
## Log likelihood: 2239 for lag model
## ML residual variance (sigma squared): 0.002193, (sigma: 0.04682)
## Number of observations: 1387
## Number of parameters estimated: 7
## AIC: -4464, (AIC for lm: -3974)
## LM test for residual autocorrelation
## test value: 4.85, p-value: 0.027651
# The results of the model indicate that each of the predictors and the
# spatial lag are significant. We can evaluate the significance of the
# spatial lag a variety of ways. The output includes a z-test (“z-value”
# in the output) based on the standard error of Moran's I
# 0.51719/.02118=24.41879. It also includes a Likelihood Ratio Test (“LR
# test”, discussed in class) which is a test of the model with and without
# the spatial lag. The reported “LR test” suggests that the addition of
# the lag is an improvement, it is formally equivalent to:
lm1 <- lm(PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco)
anova(soco_LAG, lm1)
## Model df AIC logLik Test L.Ratio p-value
## soco_LAG 1 7 -4464 2239 1
## lm1 2 6 -3974 1993 2 491 0
# Tests suggest that the lag is a useful addition but it makes reading the
# rest of the model output difficult. In a spatial lag model a change in
# one place cascades through the entire map. This can be easily seen by
# changing the values in one location and examining the changes elsewhere.
# We'll do this by altering one independent (predictor) variable, in one
# county and then using the fitted spatial model to predict the values
# (PPOV) elsewhere on the map. For example, what would happen to Poverty
# in the Southeast if the unemployment rate rose from 6% to 75% in
# Jefferson County, Alabama.
soco.new <- soco #copy the data frame (so we don't mess up the original)
# Change the unemployment rate
soco.new@data[soco.new@data$CNTY_ST == "Jefferson County AL", "PUNEM"] <- 0.75
# The original predicted values
orig.pred <- as.data.frame(predict(soco_LAG))
# The predicted values with the new unemployment rate in Alabama
new.pred <- as.data.frame(predict(soco_LAG, newdata = soco.new, listw = soco_nbq_w))
# the difference between the predicted values
jCoAL_effect <- new.pred$fit - orig.pred$fit
el <- data.frame(name = soco$CNTY_ST, diff_in_pred_PPOV = jCoAL_effect)
soco.new$jee <- el$diff_in_pred_PPOV
# sort counties by absolute value of the change in predicted PPOV
el <- el[rev(order(abs(el$diff_in_pred_PPOV))), ]
el[1:10, ] #show the top 10 counties
## name diff_in_pred_PPOV
## 37 Jefferson County AL 0.99079
## 4 Bibb County AL 0.11637
## 58 St. Clair County AL 0.11165
## 5 Blount County AL 0.10881
## 64 Walker County AL 0.10738
## 59 Shelby County AL 0.10516
## 63 Tuscaloosa County AL 0.09641
## 1050 El Paso County TX -0.08629
## 1197 Sutton County TX -0.08108
## 437 Lee County KY -0.07285
# We see that the biggest change was in the count we messed with (no
# surprise), this is the direct effect of our change. However, when we
# look at the 10 counties that experiences the biggest change el[1:10,] we
# also see that counties as far away as Kentucky experienced substantial
# change in their predicted poverty rate. It's also interesting to note
# that these changes in predicted PPOV are both positive and negative.
# Mapping these changes is also instructive. We can do this several ways:
breaks <- c(min(soco.new$jee), -0.03, 0.03, max(soco.new$jee))
labels <- c("Negative effect (greater than .03)", "No Effect (effect -.03 to .03)",
"Positive effect (greater than .03)")
np <- findInterval(soco.new$jee, breaks)
colors <- c("red", "yellow", "blue")
# Draw Map
plot(soco.new, col = colors[np])
mtext("Effects of a change in Jefferson County, AL (set PUNEM = .75)\n on predicted values in a spatial lag model",
side = 3, line = 1)
legend("topleft", legend = labels, fill = colors, bty = "n")
# We could also map the magnitude of the changes caused by messing with
# Jefferson County, AL.
pal5 <- brewer.pal(6, "Spectral")
cats5 <- classIntervals(soco.new$jee, n = 5, style = "jenks")
colors5 <- findColours(cats5, pal5)
plot(soco.new, col = colors5)
legend("topleft", legend = round(cats5$brks, 2), fill = pal5, bty = "n")
mtext("Effects of a change in Jefferson County, AL (set PUNEM = .75)\n on predicted values in a spatial lag model",
side = 3, line = 1)
# While these maps and the effects for individual counties may be useful
# they only tell us about a change in one place. The impacts() function
# gives us something like OLS regression coefficients for a spatial lag
# model. The logic of the impacts() function is similar to the code above,
# it tells you the direct (local), indirect (spill-over), and total effect
# of a unit change in each of the predictor variables. The changes
# reported by impacts are the global average impact:
impacts(soco_LAG, listw = soco_nbq_w) #takes a minute or two
## Impact measures (lag, exact):
## Direct Indirect Total
## PFHH 0.45695 0.43243 0.8894
## PUNEM 1.44154 1.36419 2.8057
## PBLK -0.07348 -0.06953 -0.1430
## P65UP 0.30987 0.29325 0.6031
# Our model deals with rates, so a unit change is big! The output from
# impacts tells us that a 100% increase in unemployment leads to a 280%
# (on average) increase in the childhood poverty rate.