GEOG5023 – Exercise 8B&C – Alex Crawford

Exercise 8B

Load Data

# Load libraries

library(spdep)
## Loading required package: sp
## Loading required package: boot
## Loading required package: Matrix
## Loading required package: lattice
## Attaching package: 'lattice'
## The following object(s) are masked from 'package:boot':
## 
## melanoma
## Loading required package: MASS
## Loading required package: nlme
## Loading required package: maptools
## Loading required package: foreign
## Loading required package: grid
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
## Loading required package: deldir
## deldir 0.0-21
## Loading required package: coda
## Loading required package: splines
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(gstat)

# Load soco dataset with load() function Automatically creates an object
# called soco
load("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/soco.rda")

Moran's I - Global

# Find neighbors for each location and save them to an object.
soco_nbq <- poly2nb(soco)  # Queen’s neighborhood method
# Create the weight matrix for the neighbors
soco_nbq_w <- nb2listw(soco_nbq)

# Moran's I Test
moran.test(soco$PERPOV, listw = soco_nbq_w, zero.policy = T)  # Value of 0.435
## 
##  Moran's I test under randomisation
## 
## data:  soco$PERPOV  
## weights: soco_nbq_w  
##  
## Moran I statistic standard deviate = 26.96, p-value < 2.2e-16
## alternative hypothesis: greater 
## sample estimates:
## Moran I statistic       Expectation          Variance 
##         0.4346902        -0.0007215         0.0002609
# Null Hypothesis: There is no spatial autocorrelation for this variable.
# Alt Hypothesis: There is spatial autocorrelation for this variable.

Testing Significance of Moran's I Statistic

# Simulations of Moran's I statistic using moran.mc listw= is for the
# weight matrix object nsim= is for the # of simulations to include.
# Always make it 99, 999, 9999, etc. because we're adding in the real
# world Moran's I to the list as well.  We want a round number in the end
# to make the p-value intuitive.
socoMoranSim <- moran.mc(soco$PERPOV, listw = soco_nbq_w, nsim = 999)
socoMoranSim
## 
##  Monte-Carlo simulation of Moran's I
## 
## data:  soco$PERPOV 
## weights: soco_nbq_w  
## number of simulations + 1: 1000 
##  
## statistic = 0.4347, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
names(socoMoranSim)  # socoMoranSim$res is the Moran's I result
## [1] "statistic"   "parameter"   "p.value"     "alternative" "method"     
## [6] "data.name"   "res"

# Plot the Moran's I Simulation Results
hist(socoMoranSim$res, breaks = 50, main = "Histogram of Moran's I Simulations", 
    xlab = "Moran's I Stat", ylab = "Frequency")

plot of chunk unnamed-chunk-3

# So we can confidently say that this Moran's I statistic is the real
# deal. There is spatial autocorrelation!

Moran's I Plot Quadrants

# Use moran.plot(x variable, lag x variable, labels=as.character(label))
# This plot will automatically create labels, so you might as well assign
# something intuitive.
socoMoranPlot <- moran.plot(soco$PPOV, soco_nbq_w, labels = as.character(soco$CNTY_ST), 
    xlab = "Percent Poverty", ylab = "Lag of Percent Poverty")

plot of chunk unnamed-chunk-4

Local Moran's I

# Calculate the local Moran's I statistic using localmoran(DATA, WEIGHTS)
# where WEIGHTS is the list of weighted neighbors.
soco_locm <- localmoran(soco$PPOV, soco_nbq_w)
summary(soco_locm)
##        Ii              E.Ii               Var.Ii            Z.Ii       
##  Min.   :-1.778   Min.   :-0.000722   Min.   :0.0901   Min.   :-3.983  
##  1st Qu.: 0.012   1st Qu.:-0.000722   1st Qu.:0.1420   1st Qu.: 0.029  
##  Median : 0.219   Median :-0.000722   Median :0.1658   Median : 0.517  
##  Mean   : 0.589   Mean   :-0.000722   Mean   :0.1861   Mean   : 1.408  
##  3rd Qu.: 0.746   3rd Qu.:-0.000722   3rd Qu.:0.1990   3rd Qu.: 1.761  
##  Max.   : 9.335   Max.   :-0.000722   Max.   :0.9981   Max.   :18.977  
##    Pr(z > 0)     
##  Min.   :0.0000  
##  1st Qu.:0.0391  
##  Median :0.3025  
##  Mean   :0.2900  
##  3rd Qu.:0.4886  
##  Max.   :1.0000

## Manually make a moran plot Standardize variables using the scale()
## function; save to a new column
soco$sPPOV <- scale(soco$PPOV)
# Create a lagged variable using the lag.listw(WEIGHTS, SCALED_DATA)
# function.  Note: 'lag' variable means the values of the neighbors of the
# variable under consideration.
soco$lag_sPPOV <- lag.listw(soco_nbq_w, soco$sPPOV)
summary(soco$sPPOV)
##        V1        
##  Min.   :-2.151  
##  1st Qu.:-0.700  
##  Median :-0.123  
##  Mean   : 0.000  
##  3rd Qu.: 0.591  
##  Max.   : 4.062
summary(soco$lag_sPPOV)
##        V1         
##  Min.   :-1.9925  
##  1st Qu.:-0.5298  
##  Median :-0.0762  
##  Mean   : 0.0085  
##  3rd Qu.: 0.4514  
##  Max.   : 2.6478
# Plot the lag v. the scaled variable.
plot(x = soco$sPPOV, y = soco$lag_sPPOV, main = "Moran Scatterplot PPOV", xlab = "Scaled % Poverty", 
    ylab = "Scaled Lag of % Poverty")
abline(h = 0, v = 0)  # Divides the plot into quadrants
abline(lm(soco$lag_sPPOV ~ soco$sPPOV), lty = 3, lwd = 4, col = "red")  # linear model used to find line of best fit.

# Investigate outliers by clicking on one or two and then hitting escape
# (or click finish)
identify(soco$sPPOV, soco$lag_sPPOV, soco$CNTY_ST, cex = 0.8)

plot of chunk unnamed-chunk-5

## integer(0)
## Quadrant I: high poverty counties surrounded by high poverty counties.
## Quadrant II: low poverty counties surrounded by high poverty counties.
## Quadrant III: low poverty counties surrounded by low poverty counties.
## Quadrant IV: high poverty counties surrounded by low poverty counties.

# Identify the Moran plot quadrant for each observation
soco$quad_sig <- NA  # Start by creating a new column in the original data frame.
# Assign the appropriate quadrant based on whether the scaled and lag
# values are positive or negative. If not significant, assign a 5.
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV >= 0) & (soco_locm[, 5] <= 0.05), 
    "quad_sig"] <- 1
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV <= 0) & (soco_locm[, 5] <= 0.05), 
    "quad_sig"] <- 2
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV <= 0) & (soco_locm[, 5] <= 0.05), 
    "quad_sig"] <- 3
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV >= 0) & (soco_locm[, 5] <= 0.05), 
    "quad_sig"] <- 4
soco@data[(soco_locm[, 5] > 0.05), "quad_sig"] <- 5

Plotting Results

# Presenting the results (from Paul Voss) Set the breaks for the thematic
# map classes
breaks <- seq(1, 5, 1)
# Set the corresponding labels for the thematic map classes
labels <- c("High-High", "Low-Low", "High-Low", "Low-High", "Not Signif.")
# see ?findInterval - This is necessary for making a map
np <- findInterval(soco$quad_sig, breaks)
# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")
plot(soco, col = colors[np])  #colors[np] manually sets the color for each county
mtext("Local Moran's I\n% Poverty in Southern Counties", cex = 1.5, side = 3, 
    line = 1)
legend("topleft", legend = labels, fill = colors, bty = "n")

plot of chunk unnamed-chunk-6

# Correlelogram
cor10 <- sp.correlogram(soco_nbq, soco$PPOV, order = 10, method = "I")
cor10  # Notice how the values for Moran's I go from significantly positive to significantly negative, crossing 0 around a lag of 8.
## Spatial correlogram for soco$PPOV 
## method: Moran's I
##            estimate expectation  variance standard deviate Pr(I) two sided
## 1 (1387)   5.89e-01   -7.22e-04  2.61e-04            36.54         < 2e-16
## 2 (1387)   4.58e-01   -7.22e-04  1.30e-04            40.20         < 2e-16
## 3 (1387)   3.34e-01   -7.22e-04  8.85e-05            35.60         < 2e-16
## 4 (1387)   2.40e-01   -7.22e-04  7.03e-05            28.69         < 2e-16
## 5 (1387)   1.48e-01   -7.22e-04  5.90e-05            19.39         < 2e-16
## 6 (1387)   8.35e-02   -7.22e-04  5.03e-05            11.88         < 2e-16
## 7 (1387)   3.09e-02   -7.22e-04  4.33e-05             4.81         1.5e-06
## 8 (1387)  -1.46e-02   -7.22e-04  3.83e-05            -2.24           0.025
## 9 (1387)  -5.41e-02   -7.22e-04  3.47e-05            -9.06         < 2e-16
## 10 (1387) -7.38e-02   -7.22e-04  3.23e-05           -12.85         < 2e-16
##              
## 1 (1387)  ***
## 2 (1387)  ***
## 3 (1387)  ***
## 4 (1387)  ***
## 5 (1387)  ***
## 6 (1387)  ***
## 7 (1387)  ***
## 8 (1387)  *  
## 9 (1387)  ***
## 10 (1387) ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(cor10, main = "Correlelogram of % Poverty")

plot of chunk unnamed-chunk-7


# Variogram shows how the variance changes with lag distance from location
# of focus.  variogram(soco$PPOV~1, locations=coordinates(soco),
# data=soco, cloud=F,type='b”)

Exercise 8C

Create a map of PPOV

# Create a Palette
library(classInt)
library(RColorBrewer)
pal5 <- brewer.pal(5, "Reds")  # Makes a 7-color spectral palette
display.brewer.pal(5, "Reds")  # This displays the colors

plot of chunk unnamed-chunk-8

# Classifying Data using classIntervals() and findColours()
cats5 <- classIntervals(soco$PPOV, n = 5, style = "quantile")
cats5  # Returns the bins for each classification.
## style: quantile
## [0.02846,0.1488)  [0.1488,0.1926)  [0.1926,0.2365)  [0.2365,0.2936) 
##              278              277              277              277 
##  [0.2936,0.5953] 
##              278
# Link the color pallette to the categories using
# findColours(CATEGORIES,PALETTE)
FiveColors <- findColours(cats5, pal5)
# Draw a map using specificed data and colors
plot(soco, col = FiveColors)  # The col= argument provides the coloring scheme.
mtext("Percent Poverty in Southern Counties", cex = 1.5, side = 3, line = 1)
legend("topleft", legend = c("2.8% - 14.9%", "14.9% - 19.3%", "19.3% - 23.6%", 
    "23.6% - 29.4%", "29.4% - 59.5%"), fill = pal5, bty = "n", cex = 0.8)

plot of chunk unnamed-chunk-9

OLS Model

names(soco)
##  [1] "CNTY_ST"   "STUSAB"    "FIPS"      "YCOORD"    "XCOORD"   
##  [6] "SQYCORD"   "SQXCORD"   "XYCOORD"   "PPOV"      "PHSP"     
## [11] "PFHH"      "PWKCO"     "PHSLS"     "PUNEM"     "PUDEM"    
## [16] "PEXTR"     "PPSRV"     "PMSRV"     "PNDMFG"    "PNHSPW"   
## [21] "PMNRTY"    "LO_POV"    "PFRN"      "PNAT"      "PBLK"     
## [26] "P65UP"     "PDSABL"    "METRO"     "PERPOV"    "OTMIG"    
## [31] "SQOTMIG"   "BINMIG"    "INCARC"    "BINCARC"   "SQRTPPOV" 
## [36] "SQRTUNEM"  "SQRTPFHH"  "LOGHSPLS"  "PHSPLUS"   "sPPOV"    
## [41] "lag_sPPOV" "quad_sig"
library(HH)
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Attaching package: 'survival'
## The following object(s) are masked from 'package:boot':
## 
## aml
## Loading required package: leaps
## Loading required package: latticeExtra
## Attaching package: 'HH'
## The following object(s) are masked from 'package:coda':
## 
## acfplot
## The following object(s) are masked from 'package:boot':
## 
## logit
library(leaps)
subsetsmodel <- regsubsets(PPOV ~ PMNRTY + PFHH + PWKCO + PHSLS + PUNEM + PUDEM + 
    PEXTR + PPSRV + PMSRV + PNDMFG + PNHSPW + PFRN + PNAT + P65UP + PHSP + PBLK + 
    PDSABL + METRO + OTMIG, data = soco, nvmax = 10)
## Warning: 1 linear dependencies found
## Reordering variables and trying again:
subsetsum <- summary(subsetsmodel)
plot(subsetsmodel, main = "Regsubsets Results by BIC")

plot of chunk unnamed-chunk-10

plot(1:11, y = subsetsum$adjr2, type = "o", xlab = "Number of Parameters", ylab = "Adj. r^2", 
    main = "Adj. r^2 by Model Size")

plot of chunk unnamed-chunk-10

plot(1:11, y = subsetsum$bic, type = "o", xlab = "Number of Parameters", ylab = "BIC", 
    main = "BIC by Model Size")

plot of chunk unnamed-chunk-10


# There seems to be an asymptote in improved explanatory power and BIC at
# 7 variables.
soco.lm7 <- lm(PPOV ~ PFHH + PHSLS + PUNEM + PUDEM + PEXTR + PBLK + METRO, data = soco)

Map the Residuals

breaks.lm <- c(min(soco.lm7$resid), 0, max(soco.lm7$resid))
labels.lm <- c("negative residuals", "positive residuals")
np <- findInterval(soco.lm7$resid, breaks.lm)
colors.lm <- c("red", "blue")
dev.off()
## null device 
##           1
plot(soco, col = colors[np])  #colors[np] manually sets the color for each county
mtext("OLS Residuals", cex = 1.5, side = 3, line = 1)
legend("topleft", legend = labels.lm, fill = colors.lm, bty = "n")

Calculate Moran's I For the Residuals

soco.lm7resid <- residuals(soco.lm7)

# Find distance-based nearest neighbors:
coords <- coordinates(soco)
soco_kn1 <- knn2nb(knearneigh(coords, k = 1))
dist <- unlist(nbdists(soco_kn1, coords))
max_k1 <- max(dist)  # Store the max dist as an object.
soco_kd <- dnearneigh(coords, d1 = 0, d2 = 1 * max_k1)

# Assign IDW Weights:
soco_idw <- lapply(dist, function(x) 1/(x/1000))
soco_kdw <- nb2listw(soco_kn1, glist = soco_idw, style = "B")

# Moran's I
moran.test(soco.lm7$resid, listw = soco_kdw, zero.policy = T)  # 0.296
## 
##  Moran's I test under randomisation
## 
## data:  soco.lm7$resid  
## weights: soco_kdw  
##  
## Moran I statistic standard deviate = 7.655, p-value = 9.638e-15
## alternative hypothesis: greater 
## sample estimates:
## Moran I statistic       Expectation          Variance 
##         0.2959646        -0.0007215         0.0015020
# Null Hypothesis: There is no spatial autocorrelation for this variable.
# Alt Hypothesis: There is spatial autocorrelation for this variable.

Model with a Spatial Lag Term

# The lagsarlm model only differs from the OLS code in two ways.  1) The
# fucntion in lagsarlm() instead of lm() 2) Include the argument listw =
# NEIGHBOR.WEIGHTS Optional: Include the argument: zero.policy=T
soco.lm7lag <- lagsarlm(PPOV ~ PFHH + PHSLS + PUNEM + PUDEM + PEXTR + PBLK + 
    METRO, data = soco, listw = soco_kdw, zero.policy = T)

# Run Moran's I for the lag model to see how much spatial autocorrelation
# remains: Note: best to use the same weighted neighbors as before.
moran.test(soco.lm7lag$resid, listw = soco_kdw, zero.policy = T)  # 0.196
## 
##  Moran's I test under randomisation
## 
## data:  soco.lm7lag$resid  
## weights: soco_kdw  
##  
## Moran I statistic standard deviate = 5.08, p-value = 1.888e-07
## alternative hypothesis: greater 
## sample estimates:
## Moran I statistic       Expectation          Variance 
##         0.1961457        -0.0007215         0.0015019
# Null Hypothesis: There is no spatial autocorrelation for this variable.
# Alt Hypothesis: There is spatial autocorrelation for this variable.

# The spatial autocorrelation has been reduced from 0.296 to 0.196, which
# means its in that gray area between spatial dependence and noise.  This
# indicates that the weights of the neighbors I selected have accounted
# for some of the spatial dependence.

# Comparison of results:
summary(soco.lm7)
## 
## Call:
## lm(formula = PPOV ~ PFHH + PHSLS + PUNEM + PUDEM + PEXTR + PBLK + 
##     METRO, data = soco)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18763 -0.02516 -0.00118  0.02414  0.20958 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.23668    0.01195  -19.81  < 2e-16 ***
## PFHH         0.72210    0.03690   19.57  < 2e-16 ***
## PHSLS        0.26748    0.01326   20.17  < 2e-16 ***
## PUNEM        1.27693    0.06146   20.78  < 2e-16 ***
## PUDEM        0.23699    0.03097    7.65  3.7e-14 ***
## PEXTR        0.41578    0.02203   18.87  < 2e-16 ***
## PBLK        -0.10990    0.01354   -8.12  1.0e-15 ***
## METRO       -0.00307    0.00286   -1.07     0.28    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.0421 on 1379 degrees of freedom
## Multiple R-squared: 0.788,   Adjusted R-squared: 0.787 
## F-statistic:  733 on 7 and 1379 DF,  p-value: <2e-16
summary(soco.lm7lag)
## 
## Call:lagsarlm(formula = PPOV ~ PFHH + PHSLS + PUNEM + PUDEM + PEXTR + 
##     PBLK + METRO, data = soco, listw = soco_kdw, zero.policy = T)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.2038179 -0.0239862 -0.0008581  0.0228565  0.1894518 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##               Estimate Std. Error  z value  Pr(>|z|)
## (Intercept) -0.2376510  0.0116638 -20.3752 < 2.2e-16
## PFHH         0.7215938  0.0360197  20.0333 < 2.2e-16
## PHSLS        0.2528998  0.0131156  19.2824 < 2.2e-16
## PUNEM        1.2110850  0.0602474  20.1019 < 2.2e-16
## PUDEM        0.2304071  0.0302502   7.6167 2.598e-14
## PEXTR        0.4206466  0.0215299  19.5378 < 2.2e-16
## PBLK        -0.1163250  0.0132391  -8.7865 < 2.2e-16
## METRO       -0.0017139  0.0027949  -0.6132    0.5397
## 
## Rho: 2.138, LR test value: 51.77, p-value: 6.2239e-13
## Asymptotic standard error: 0.2329
##     z-value: 9.181, p-value: < 2.22e-16
## Wald statistic: 84.29, p-value: < 2.22e-16
## 
## Log likelihood: 2456 for lag model
## ML residual variance (sigma squared): 0.001688, (sigma: 0.04109)
## Number of observations: 1387 
## Number of parameters estimated: 10 
## AIC: -4891, (AIC for lm: -4842)
## LM test for residual autocorrelation
## test value: 36.41, p-value: 1.6031e-09
anova(soco.lm7lag, soco.lm7)
##             Model df   AIC logLik Test L.Ratio  p-value
## soco.lm7lag     1 10 -4891   2456    1                 
## soco.lm7        2  9 -4842   2430    2    51.8 6.22e-13

soco.lm7coefs <- coef(soco.lm7)
soco.lm7lagcoefs <- coef(soco.lm7lag)
soco.lm7lagcoefs <- soco.lm7lagcoefs[2:9]  # Remove rho in order to compare coefs.
soco.lm7lagcoefs
## (Intercept)        PFHH       PHSLS       PUNEM       PUDEM       PEXTR 
##   -0.237651    0.721594    0.252900    1.211085    0.230407    0.420647 
##        PBLK       METRO 
##   -0.116325   -0.001714
abs(soco.lm7coefs) - abs(soco.lm7lagcoefs)
## (Intercept)        PFHH       PHSLS       PUNEM       PUDEM       PEXTR 
##  -0.0009700   0.0005034   0.0145792   0.0658421   0.0065873  -0.0048638 
##        PBLK       METRO 
##  -0.0064294   0.0013553
# Observing the above differences in the magnitude of the coefficients, in
# most cases, the coefficients decreased in magnitude.  However, most
# differences were also less than a magntiude of 0.01.  PUNEM decreased by
# 0.658, indicating that it is the most spatially dependent variable.  All
# variables that were significant in the OLS are still significant in the
# spatial lag model.  The Moran's I tests show that some spatial
# autocorrelation was present, but that much of it was accounted for by
# the weights list.