This is a combination of the exercise in the lecture slides: Spatial Analysis and Regression I.pdf and the rpub http://rpubs.com/Seth/LagEquibEff

# install.packages('spdep') install.packages('gstat')
# install.packages('classInt')

library(RColorBrewer)
library(spdep)
## Loading required package: sp
## Loading required package: boot
## Loading required package: Matrix
## Loading required package: lattice
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
## melanoma
## Loading required package: MASS
## Loading required package: nlme
## Loading required package: maptools
## Loading required package: foreign
## Loading required package: grid
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
## Loading required package: deldir
## deldir 0.0-21
## Loading required package: coda
## Loading required package: splines
library(gstat)
library(classInt)
## Loading required package: class
## Loading required package: e1071

load("/Users/dgetman/WorkRelated/GradSchool/Current_Courses/GEOG_5023/Data/soco.rda")
# Generate the matrix of binary neighbors (Queen’s)
soco_nbq <- poly2nb(soco)

# Use the Neighbors to generate the row standardized weights matrix
soco_nbq_w <- nb2listw(soco_nbq)

## THESE TWO ABOVE STEPS ARE SEPRATE FOR A REASON, WHY?  Because you need
## the binary neighbors to get the total number of neighbors in order to
## calculate the standardized rows

## GENERALLY YOU HAVE TO PREFORM BOTH BEFORE BEFORE ANY SPATIAL ANALYSIS.

# This calculates a simulation of 999 randomly generated 'maps' within
# which our data are ranked.
MyMoran <- moran.mc(soco$PPOV, listw = soco_nbq_w, nsim = 999)
MyMoran
## 
##  Monte-Carlo simulation of Moran's I
## 
## data:  soco$PPOV 
## weights: soco_nbq_w  
## number of simulations + 1: 1000 
## 
## statistic = 0.5893, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
hist(MyMoran$res, breaks = 50)

plot of chunk unnamed-chunk-1


# GUESS WHICH OBSERVATION IS OUTLIER The outlier is showing the value of
# the moran's I from our data. It is obviously a rare event.

# This plots the percent poverty against the lag of percent poverty to
# help spot relationships between neighbors.  Looking at Washington DC is
# telling because the high poverty and low lag show it is surrounded by
# less poverty
mp <- moran.plot(soco$PPOV, soco_nbq_w, labels = as.character(soco$CNTY_ST), 
    xlab = "Percent Poverty", ylab = "Lag of Percent Poverty")

plot of chunk unnamed-chunk-1



# Calculate the local Moran for each point in the dataset using the
# weights matrix
locm <- localmoran(soco$PPOV, soco_nbq_w)
summary(locm)
##        Ii              E.Ii               Var.Ii            Z.Ii       
##  Min.   :-1.778   Min.   :-0.000722   Min.   :0.0901   Min.   :-3.983  
##  1st Qu.: 0.012   1st Qu.:-0.000722   1st Qu.:0.1420   1st Qu.: 0.029  
##  Median : 0.219   Median :-0.000722   Median :0.1658   Median : 0.517  
##  Mean   : 0.589   Mean   :-0.000722   Mean   :0.1861   Mean   : 1.408  
##  3rd Qu.: 0.746   3rd Qu.:-0.000722   3rd Qu.:0.1990   3rd Qu.: 1.761  
##  Max.   : 9.335   Max.   :-0.000722   Max.   :0.9981   Max.   :18.977  
##    Pr(z > 0)     
##  Min.   :0.0000  
##  1st Qu.:0.0391  
##  Median :0.3025  
##  Mean   :0.2900  
##  3rd Qu.:0.4886  
##  Max.   :1.0000

## Manually make a moran plot Need to standardize variables
soco$sPPOV <- scale(soco$PPOV)  #save to a new column
head(soco@data[, c("sPPOV", "PPOV")])
##     sPPOV   PPOV
## 0 -0.9591 0.1372
## 1 -0.9929 0.1341
## 2  1.6199 0.3725
## 3  0.6180 0.2811
## 4 -0.9803 0.1353
## 5  2.4691 0.4500

# create a lagged variable
soco$lag_sPPOV <- lag.listw(soco_nbq_w, soco$sPPOV)

summary(soco$sPPOV)
##        V1        
##  Min.   :-2.151  
##  1st Qu.:-0.700  
##  Median :-0.123  
##  Mean   : 0.000  
##  3rd Qu.: 0.591  
##  Max.   : 4.062
summary(soco$lag_sPPOV)
##        V1         
##  Min.   :-1.9925  
##  1st Qu.:-0.5298  
##  Median :-0.0762  
##  Mean   : 0.0085  
##  3rd Qu.: 0.4514  
##  Max.   : 2.6478
plot(x = soco$sPPOV, y = soco$lag_sPPOV, main = "Moran Scatterplot PPOV")
abline(h = 0, v = 0)
abline(lm(soco$lag_sPPOV ~ soco$sPPOV), lty = 3, lwd = 4, col = "red")

plot of chunk unnamed-chunk-1



## Check Out the outliers click on one or two and then hit escape (or
## click finish)
identify(soco$sPPOV, soco$lag_sPPOV, soco$CNTY_ST, cex = 0.8)
## integer(0)

## Identify the Moran plot quadrant for each observation This is some
## serious slicing and illustrate the power of the bracket...
## soco$quad_sig <- NA
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV >= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 1
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV <= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 2
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV <= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 3
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV <= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 4
# This might be wrong somehow
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV >= 0) & (locm[, 5] <= 0.05), "quad_sig"] <- 5  #WE ASSIGN A 5 TO ALL NON-SIGNIFICANT OBSERVATIONS

##### MAP THE RESULTS (courtesy of Paul Voss) ############# # Set the
##### breaks for the thematic map classes
breaks <- seq(1, 5, 1)

# Set the corresponding labels for the thematic map classes
labels <- c("high-High", "low-Low", "High-Low", "Low-High", "Not Signif.")

# see ?findInterval - This is necessary for making a map
np <- findInterval(soco$quad_sig, breaks)


# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")
plot(soco, col = colors[np])  #colors[np] manually sets the color for each county
mtext("Local Moran's I", cex = 1.5, side = 3, line = 1)
legend("topleft", legend = labels, fill = colors, bty = "n")

plot of chunk unnamed-chunk-1




cor10 <- sp.correlogram(soco_nbq, soco$PPOV, order = 10, method = "I")
cor10
## Spatial correlogram for soco$PPOV 
## method: Moran's I
##            estimate expectation  variance standard deviate Pr(I) two sided
## 1 (1387)   5.89e-01   -7.22e-04  2.61e-04            36.54         < 2e-16
## 2 (1387)   4.58e-01   -7.22e-04  1.30e-04            40.20         < 2e-16
## 3 (1387)   3.34e-01   -7.22e-04  8.85e-05            35.60         < 2e-16
## 4 (1387)   2.40e-01   -7.22e-04  7.03e-05            28.69         < 2e-16
## 5 (1387)   1.48e-01   -7.22e-04  5.90e-05            19.39         < 2e-16
## 6 (1387)   8.35e-02   -7.22e-04  5.03e-05            11.88         < 2e-16
## 7 (1387)   3.09e-02   -7.22e-04  4.33e-05             4.81         1.5e-06
## 8 (1387)  -1.46e-02   -7.22e-04  3.83e-05            -2.24           0.025
## 9 (1387)  -5.41e-02   -7.22e-04  3.47e-05            -9.06         < 2e-16
## 10 (1387) -7.38e-02   -7.22e-04  3.23e-05           -12.85         < 2e-16
##              
## 1 (1387)  ***
## 2 (1387)  ***
## 3 (1387)  ***
## 4 (1387)  ***
## 5 (1387)  ***
## 6 (1387)  ***
## 7 (1387)  ***
## 8 (1387)  *  
## 9 (1387)  ***
## 10 (1387) ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(cor10)

plot of chunk unnamed-chunk-1


plot(variogram(soco$PPOV ~ 1, locations = coordinates(soco), data = soco, cloud = F), 
    type = "b")

plot of chunk unnamed-chunk-1


sp.correlogram(soco_nbq, soco$PPOV, order = 10, method = "corr")
## Spatial correlogram for soco$PPOV 
## method: Spatial autocorrelation
##        1        2        3        4        5        6        7        8 
##  0.75938  0.69002  0.58062  0.47335  0.33112  0.20307  0.08188 -0.04172 
##        9       10 
## -0.16401 -0.22568


soco_OLS <- lm(PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco)
summary(soco_OLS)
## 
## Call:
## lm(formula = PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26570 -0.03711 -0.00785  0.03294  0.28733 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.05544    0.00891   -6.22  6.6e-10 ***
## PFHH         0.45748    0.04941    9.26  < 2e-16 ***
## PUNEM        2.18109    0.07334   29.74  < 2e-16 ***
## PBLK        -0.05919    0.01878   -3.15   0.0017 ** 
## P65UP        0.38023    0.04276    8.89  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0576 on 1382 degrees of freedom
## Multiple R-squared:  0.603,  Adjusted R-squared:  0.601 
## F-statistic:  524 on 4 and 1382 DF,  p-value: <2e-16


plot(soco_OLS)  #Notice non-normal residuals, right skew

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

soco$resid <- resid(soco_OLS)  #save the residuals
moran.mc(soco$resid, soco_nbq_w, nsim = 999)
## 
##  Monte-Carlo simulation of Moran's I
## 
## data:  soco$resid 
## weights: soco_nbq_w  
## number of simulations + 1: 1000 
## 
## statistic = 0.3609, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater


soco_LAG <- lagsarlm(PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco, soco_nbq_w)
summary(soco_LAG)
## 
## Call:lagsarlm(formula = PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco, 
##     listw = soco_nbq_w)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.2457653 -0.0284360 -0.0028762  0.0262169  0.2374894 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error  z value  Pr(>|z|)
## (Intercept) -0.100260   0.007375 -13.5946 < 2.2e-16
## PFHH         0.429404   0.040246  10.6695 < 2.2e-16
## PUNEM        1.354637   0.065959  20.5374 < 2.2e-16
## PBLK        -0.069046   0.015335  -4.5025 6.716e-06
## P65UP        0.291192   0.035210   8.2701 2.220e-16
## 
## Rho: 0.5172, LR test value: 491.5, p-value: < 2.22e-16
## Asymptotic standard error: 0.02118
##     z-value: 24.42, p-value: < 2.22e-16
## Wald statistic: 596.3, p-value: < 2.22e-16
## 
## Log likelihood: 2239 for lag model
## ML residual variance (sigma squared): 0.002193, (sigma: 0.04682)
## Number of observations: 1387 
## Number of parameters estimated: 7 
## AIC: -4464, (AIC for lm: -3974)
## LM test for residual autocorrelation
## test value: 4.85, p-value: 0.027651


#### This is where we switch from the lecture to the rpub

# Our goal will be to model the percent of children living in poverty in
# the southeastern USA. This variable shows significant spatial
# auto-correlation, but the nugget effect is also substantial.
plot(variogram(soco$PPOV ~ 1, locations = coordinates(soco), data = soco, cloud = F), 
    type = "b", pch = 16, main = "Variogram of PPOV")

plot of chunk unnamed-chunk-1



# The Moran's I also indicates significant spatial autocorrelation in
# poverty: By doing this, we can see where our samplemwould be in a group
# of 1000 samples
moran.mc(soco$PPOV, soco_nbq_w, nsim = 999)
## 
##  Monte-Carlo simulation of Moran's I
## 
## data:  soco$PPOV 
## weights: soco_nbq_w  
## number of simulations + 1: 1000 
## 
## statistic = 0.5893, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

# This variogram and Moran's I indicate that there is spatial
# autocorrelation in childhood poverty (PPOV) but this spatial pattern
# could be cause by the spatial patterning of the covariates of poverty. A
# full analysis would involve fitting an OLS model (weighted by
# population) and comparing that model to spatial models. Here, we'll
# assume that spatial lag model is appropriate (I'll fit an unweighted
# model for simplicity).
soco_LAG <- lagsarlm(PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco, soco_nbq_w)
summary(soco_LAG)
## 
## Call:lagsarlm(formula = PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco, 
##     listw = soco_nbq_w)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.2457653 -0.0284360 -0.0028762  0.0262169  0.2374894 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error  z value  Pr(>|z|)
## (Intercept) -0.100260   0.007375 -13.5946 < 2.2e-16
## PFHH         0.429404   0.040246  10.6695 < 2.2e-16
## PUNEM        1.354637   0.065959  20.5374 < 2.2e-16
## PBLK        -0.069046   0.015335  -4.5025 6.716e-06
## P65UP        0.291192   0.035210   8.2701 2.220e-16
## 
## Rho: 0.5172, LR test value: 491.5, p-value: < 2.22e-16
## Asymptotic standard error: 0.02118
##     z-value: 24.42, p-value: < 2.22e-16
## Wald statistic: 596.3, p-value: < 2.22e-16
## 
## Log likelihood: 2239 for lag model
## ML residual variance (sigma squared): 0.002193, (sigma: 0.04682)
## Number of observations: 1387 
## Number of parameters estimated: 7 
## AIC: -4464, (AIC for lm: -3974)
## LM test for residual autocorrelation
## test value: 4.85, p-value: 0.027651

# The results of the model indicate that each of the predictors and the
# spatial lag are significant. We can evaluate the significance of the
# spatial lag a variety of ways. The output includes a z-test (“z-value”
# in the output) based on the standard error of Moran's I
# 0.51719/.02118=24.41879. It also includes a Likelihood Ratio Test (“LR
# test”, discussed in class) which is a test of the model with and without
# the spatial lag. The reported “LR test” suggests that the addition of
# the lag is an improvement, it is formally equivalent to:
lm1 <- lm(PPOV ~ PFHH + PUNEM + PBLK + P65UP, data = soco)
anova(soco_LAG, lm1)
##          Model df   AIC logLik Test L.Ratio p-value
## soco_LAG     1  7 -4464   2239    1                
## lm1          2  6 -3974   1993    2     491       0

# Tests suggest that the lag is a useful addition but it makes reading the
# rest of the model output difficult. In a spatial lag model a change in
# one place cascades through the entire map. This can be easily seen by
# changing the values in one location and examining the changes elsewhere.
# We'll do this by altering one independent (predictor) variable, in one
# county and then using the fitted spatial model to predict the values
# (PPOV) elsewhere on the map. For example, what would happen to Poverty
# in the Southeast if the unemployment rate rose from 6% to 75% in
# Jefferson County, Alabama.
soco.new <- soco  #copy the data frame (so we don't mess up the original)

# Change the unemployment rate
soco.new@data[soco.new@data$CNTY_ST == "Jefferson County  AL", "PUNEM"] <- 0.75

# The original predicted values
orig.pred <- as.data.frame(predict(soco_LAG))

# The predicted values with the new unemployment rate in Alabama
new.pred <- as.data.frame(predict(soco_LAG, newdata = soco.new, listw = soco_nbq_w))

# the difference between the predicted values
jCoAL_effect <- new.pred$fit - orig.pred$fit
el <- data.frame(name = soco$CNTY_ST, diff_in_pred_PPOV = jCoAL_effect)
soco.new$jee <- el$diff_in_pred_PPOV

# sort counties by absolute value of the change in predicted PPOV
el <- el[rev(order(abs(el$diff_in_pred_PPOV))), ]
el[1:10, ]  #show the top 10 counties
##                       name diff_in_pred_PPOV
## 37    Jefferson County  AL           0.99079
## 4          Bibb County  AL           0.11637
## 58    St. Clair County  AL           0.11165
## 5        Blount County  AL           0.10881
## 64       Walker County  AL           0.10738
## 59       Shelby County  AL           0.10516
## 63   Tuscaloosa County  AL           0.09641
## 1050    El Paso County  TX          -0.08629
## 1197     Sutton County  TX          -0.08108
## 437         Lee County  KY          -0.07285

# We see that the biggest change was in the count we messed with (no
# surprise), this is the direct effect of our change. However, when we
# look at the 10 counties that experiences the biggest change el[1:10,] we
# also see that counties as far away as Kentucky experienced substantial
# change in their predicted poverty rate. It's also interesting to note
# that these changes in predicted PPOV are both positive and negative.
# Mapping these changes is also instructive. We can do this several ways:
breaks <- c(min(soco.new$jee), -0.03, 0.03, max(soco.new$jee))
labels <- c("Negative effect (greater than .03)", "No Effect (effect -.03 to .03)", 
    "Positive effect (greater than .03)")


np <- findInterval(soco.new$jee, breaks)
colors <- c("red", "yellow", "blue")

# Draw Map
plot(soco.new, col = colors[np])
mtext("Effects of a change in Jefferson County, AL (set PUNEM = .75)\n on predicted values in a spatial lag model", 
    side = 3, line = 1)
legend("topleft", legend = labels, fill = colors, bty = "n")

plot of chunk unnamed-chunk-1



# We could also map the magnitude of the changes caused by messing with
# Jefferson County, AL.
pal5 <- brewer.pal(6, "Spectral")
cats5 <- classIntervals(soco.new$jee, n = 5, style = "jenks")
colors5 <- findColours(cats5, pal5)
plot(soco.new, col = colors5)
legend("topleft", legend = round(cats5$brks, 2), fill = pal5, bty = "n")
mtext("Effects of a change in Jefferson County, AL (set PUNEM = .75)\n on predicted values in a spatial lag model", 
    side = 3, line = 1)

plot of chunk unnamed-chunk-1



# While these maps and the effects for individual counties may be useful
# they only tell us about a change in one place. The impacts() function
# gives us something like OLS regression coefficients for a spatial lag
# model. The logic of the impacts() function is similar to the code above,
# it tells you the direct (local), indirect (spill-over), and total effect
# of a unit change in each of the predictor variables. The changes
# reported by impacts are the global average impact:
impacts(soco_LAG, listw = soco_nbq_w)  #takes a minute or two
## Impact measures (lag, exact):
##         Direct Indirect   Total
## PFHH   0.45695  0.43243  0.8894
## PUNEM  1.44154  1.36419  2.8057
## PBLK  -0.07348 -0.06953 -0.1430
## P65UP  0.30987  0.29325  0.6031

# Our model deals with rates, so a unit change is big! The output from
# impacts tells us that a 100% increase in unemployment leads to a 280%
# (on average) increase in the childhood poverty rate.