Spatial Regression I

## Set working directory

setwd("~/Desktop/University of Utah PhD /Course Work/Fall 2022 Semester/GEOG6000_Data Analysis/lab09")

library(RColorBrewer) #color palettes
library(sf) #simple features for spatial data
library(spdep)
library(spatialreg)
library(tmap) #mapping package
library(kableExtra) #table modification
library(dbscan) # density based clustering

Reading the Data

##Boston Data Read - shapefile

boston = st_read("../datafiles/boston.tr/boston.shp", 
                 quiet = TRUE)

##Boston Plot with sf
plot(st_geometry(boston))

##Boston Plot with tmap
tm_shape(boston) +
  tm_borders()

##NY Data Read

NY8 = st_read("../datafiles/NY_data/NY8_utm18.shp", 
              quiet = TRUE)

plot(st_geometry(NY8))

tm_shape(NY8) + 
  tm_borders() +
  tm_fill("POP8",
          title = "Population") + 
  tm_layout(legend.outside = TRUE,
            legend.title.size = 1,
            legend.text.size = 0.6,
            legend.position = c("right","top"),
            legend.bg.alpha = 1) 

head(NY8)
## Simple feature collection with 6 features and 17 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 421423.4 ymin: 4661351 xmax: 427091.6 ymax: 4664822
## Projected CRS: WGS 84 / UTM zone 18N
##          AREANAME     AREAKEY        X        Y POP8 TRACTCAS  PROPCAS
## 1 Binghamton city 36007000100 4.069397 -67.3533 3540     3.08 0.000870
## 2 Binghamton city 36007000200 4.639371 -66.8619 3560     4.08 0.001146
## 3 Binghamton city 36007000300 5.709063 -66.9775 3739     1.09 0.000292
## 4 Binghamton city 36007000400 7.613831 -65.9958 2784     1.07 0.000384
## 5 Binghamton city 36007000500 7.315968 -67.3183 2571     3.06 0.001190
## 6 Binghamton city 36007000600 8.558753 -66.9344 2729     1.06 0.000388
##   PCTOWNHOME PCTAGE65P        Z  AVGIDIST PEXPOSURE   Cases       Xm       Ym
## 1  0.3277311 0.1466102  0.14197 0.2373852  3.167099 3.08284 4069.397 -67353.3
## 2  0.4268293 0.2351124  0.35555 0.2087413  3.038511 4.08331 4639.371 -66861.9
## 3  0.3377396 0.1380048 -0.58165 0.1708548  2.838229 1.08750 5709.063 -66977.5
## 4  0.4616048 0.1188937 -0.29634 0.1406045  2.643366 1.06515 7613.831 -65995.8
## 5  0.1924370 0.1415791  0.45689 0.1577753  2.758587 3.06017 7315.968 -67318.3
## 6  0.3651786 0.1410773 -0.28123 0.1726033  2.848411 1.06386 8558.753 -66934.4
##     Xshift  Yshift                       geometry
## 1 423391.0 4661502 POLYGON ((421840.4 4662874,...
## 2 423961.0 4661993 POLYGON ((421826.4 4663108,...
## 3 425030.6 4661878 POLYGON ((423159 4664759, 4...
## 4 426935.4 4662859 POLYGON ((425261.8 4663493,...
## 5 426637.5 4661537 POLYGON ((425033.3 4662995,...
## 6 427880.3 4661921 POLYGON ((426178.8 4664216,...
##Simple histogram

hist(NY8$TRACTCAS,
     xlab = "Cases per Tract")

##Build a thematic map by specifying the column (attribute) that you wish to plot

## Create a color palette
my.pal = brewer.pal(n = 9, "YlOrRd")

## Create the thematic map

plot(NY8["Cases"],
     main = "New Yok Leukemia Cases",
     col = my.pal)

Building the Spatial Weight Matrix

##Extract the polygons for Syracuse
syracuse = NY8[NY8$AREANAME == "Syracuse city", ]

## Store the geometry of the polygons
syracuse.geom = st_geometry(syracuse)

## Store the centroids of the polygons
syracuse.coords = st_centroid(syracuse.geom)

## Plot the geometry. The reset=FALSE syntax allows you to keep the plot and add more layers. The add=TRUE syntax adds it to the current plot
plot(syracuse.geom, reset = FALSE)
plot(syracuse.coords, pch = 16, col = 2, add = TRUE)

Neighborhood Functions

## This method connects polygons that touch boundaries or corners

sy1_nb = poly2nb(syracuse)

## Visualize the neighborhood structure

plot(syracuse.geom, 
     main = "Syracuse Neighborhood Structure (Queen)",
     reset = FALSE)
plot(sy1_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

## This method connects polygons that touch boundaries only

sy2_nb = poly2nb(syracuse, queen = FALSE)

## Visualize the neighborhood structure

plot(syracuse.geom, 
     main = "Syracuse Neighborhood Structure (Rook)",
     reset = FALSE)
plot(sy2_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

par(mfrow = c(1,2))

plot(syracuse.geom, 
     main = "Queen",
     reset = FALSE)
plot(sy1_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)


plot(syracuse.geom, 
     main = "Rook",
     reset = FALSE)
plot(sy2_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

Centroid Methods

Delaunay Triangulation

## Triangulation is built between sets of three points. It is joined as long as no other points are found in a circle fit to the three points:

sy3_nb = tri2nb(syracuse.coords)


## Visualize

plot(syracuse.geom, 
     main = "Delaunay Triangulation",
     reset = FALSE)
plot(sy3_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

Sphere of Influence Method

## Restricts the links formed by Delauney triangulation to a certain length.

sy4_nb = graph2nb(soi.graph(sy3_nb, syracuse.coords))

plot(syracuse.geom, 
     main = "Sphere of Influence",
     reset = FALSE)
plot(sy4_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

k nearest neighbors

## Here each region is linked to its k nearest neighbors irrespective of the distance between them

sy5_nb = knn2nb(knearneigh(syracuse.coords, k = 1))
sy6_nb = knn2nb(knearneigh(syracuse.coords, k = 2))

par(mfrow = c(1,2))

plot(syracuse.geom, 
     main = "k = 1",
     reset = FALSE)
plot(sy5_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

plot(syracuse.geom, 
     main = "k = 2",
     reset = FALSE)
plot(sy6_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

Distance Functions

## Distance functions link together two regions if their centroids are within a certain distance of each other. This requires a minimum distance (d1) and a maximum distance (d2).

## Here we will set the maximum to 75% of the largest distance obtained when using 2 nearest neighbors (sy6_nb). We obtain all the distances between nearest neighbor pairs in the first line of code (this extracts distances as a list with nbdists() and converts them into a vector with unlist()). Then we find the maximum of these distances. Finally we set d2 to this value ∗0.75.

dists = nbdists(sy6_nb, syracuse.coords)

## Create a vector for the distance values
dists = unlist(dists)

## Find the maximum distance
max_1nn = max(dists)

## Create the neighborhood structure using the distance method
sy7_nb = dnearneigh(syracuse.coords, d1 = 0, d2 = 0.75*max_1nn)

plot(syracuse.geom, 
     main = "Maximum Distance (0.75)",
     reset = FALSE)
plot(sy7_nb, syracuse.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

Editing Neighborhood Structures

## While these function provide a quick way to establish a neighborhood structure, they will likely include some connections that are unrealistic or exclude some real connections. It is possible to edit the neighborhood structure directly - this is simply a list of n vectors, where each vector contains the neighbors linked to a single polygon. 

## See the contents of the neighborhood structure vector

str(sy1_nb)
## List of 63
##  $ : int [1:5] 2 5 11 21 22
##  $ : int [1:5] 1 3 4 5 6
##  $ : int [1:2] 2 4
##  $ : int [1:7] 2 3 6 7 8 9 10
##  $ : int [1:7] 1 2 6 11 12 13 14
##  $ : int [1:5] 2 4 5 7 14
##  $ : int [1:4] 4 6 8 14
##  $ : int [1:5] 4 7 9 14 15
##  $ : int [1:7] 4 8 10 15 16 17 18
##  $ : int [1:5] 4 9 18 19 20
##  $ : int [1:5] 1 5 12 22 23
##  $ : int [1:7] 5 11 13 22 23 24 30
##  $ : int [1:5] 5 12 14 15 24
##  $ : int [1:7] 5 6 7 8 13 15 24
##  $ : int [1:6] 8 9 13 14 16 24
##  $ : int [1:6] 9 15 17 24 25 32
##  $ : int [1:6] 9 16 18 25 33 34
##  $ : int [1:6] 9 10 17 19 34 35
##  $ : int [1:6] 10 18 20 34 35 36
##  $ : int [1:4] 10 19 35 36
##  $ : int [1:4] 1 22 26 28
##  $ : int [1:7] 1 11 12 21 23 28 29
##  $ : int [1:6] 11 12 22 28 29 30
##  $ : int [1:8] 12 13 14 15 16 30 31 32
##  $ : int [1:4] 16 17 32 33
##  $ : int [1:3] 21 27 28
##  $ : int [1:3] 26 28 37
##  $ : int [1:9] 21 22 23 26 27 29 37 38 40
##  $ : int [1:6] 22 23 28 30 38 40
##  $ : int [1:8] 12 23 24 29 31 40 41 42
##  $ : int [1:6] 24 30 32 41 42 43
##  $ : int [1:6] 16 24 25 31 33 43
##  $ : int [1:6] 17 25 32 34 43 44
##  $ : int [1:8] 17 18 19 33 35 44 45 46
##  $ : int [1:6] 18 19 20 34 36 46
##  $ : int [1:4] 19 20 35 46
##  $ : int [1:4] 27 28 38 47
##  $ : int [1:8] 28 29 37 39 40 47 48 49
##  $ : int [1:6] 38 40 41 48 49 51
##  $ : int [1:6] 28 29 30 38 39 41
##  $ : int [1:7] 30 31 39 40 42 51 52
##  $ : int [1:5] 30 31 41 43 52
##  $ : int [1:7] 31 32 33 42 44 52 54
##  $ : int [1:6] 33 34 43 45 54 55
##  $ : int [1:4] 34 44 46 55
##  $ : int [1:5] 34 35 36 45 55
##  $ : int [1:3] 37 38 48
##  $ : int [1:5] 38 39 47 49 57
##  $ : int [1:6] 38 39 48 50 51 57
##  $ : int [1:4] 49 51 57 58
##  $ : int [1:6] 39 41 49 50 52 58
##  $ : int [1:7] 41 42 43 51 53 54 58
##  $ : int [1:4] 52 54 58 59
##  $ : int [1:8] 43 44 52 53 55 59 61 62
##  $ : int [1:5] 44 45 46 54 56
##  $ : int 55
##  $ : int [1:5] 48 49 50 58 60
##  $ : int [1:8] 50 51 52 53 57 59 60 61
##  $ : int [1:5] 53 54 58 60 61
##  $ : int [1:5] 57 58 59 61 63
##  $ : int [1:6] 54 58 59 60 62 63
##  $ : int [1:3] 54 61 63
##  $ : int [1:3] 60 61 62
##  - attr(*, "class")= chr "nb"
##  - attr(*, "region.id")= chr [1:63] "110" "111" "112" "113" ...
##  - attr(*, "call")= language poly2nb(pl = syracuse)
##  - attr(*, "type")= chr "queen"
##  - attr(*, "sym")= logi TRUE
## Access the first polygon's neighbors

sy1_nb[[1]]
## [1]  2  5 11 21 22
## See appendix for editing structures using the edit.nb() function

Spatial Weights

## First method - binary (0 = no connection, 1 = connection)

sy1_lw_b = nb2listw(sy1_nb, style = 'B')


## Second method - row standardization (0 = no connection, weight = 1/total # of connections)

sy1_lw_w = nb2listw(sy1_nb, style = 'W')
## Weights could be assigned by an inverse distance method, where closer neighbors have higher weights. To do this, we need the list of distances along each join for the neighborhood structure that we will use. 

## First, we extract the distances from the neighborhood list (nbdists()), to obtain a list of distances. 

## We then generate inverse distances by dividing by 1000 (to make these a little more manageable) and taking the reciprocal. This is a little complex as we obtain a list output from nbdists(). 

## We create a new function, invd(), which calculates the inverse distance in kilometers (the coordinates are in meters). Then we apply this function to each item in the list using lapply(). This is the list version of the function apply() that we have used earlier.

## Extract distances

dists = nbdists(sy1_nb, syracuse.coords)

## Generate inverse distances. Dividing by 1000 makes the distances more manageable

inverse_distance = function(x) {1/(x/1000)}

## Apply this function (lapply) to our distance extraction

idw = lapply(dists, inverse_distance)

## Generate the weights

sy1_lw_idwB = nb2listw(sy1_nb, glist = idw, style = "B")

Checking for Autocorrelation

Visual Checks

## Create a column of log house prices

boston$logCMEDV = log(boston$CMEDV)

## View the histogram

hist(boston$logCMEDV)

cmedv.pal = brewer.pal(5, "Accent")
ellie.pal = brewer.pal(5, "PiYG")

tm_shape(boston) +
  tm_fill("logCMEDV",
          palette = cmedv.pal)

## Build the neighborhood structure using the queen's method
boston.nb = poly2nb(boston, queen = TRUE)

## Create the centroids

boston.coords = st_centroid(st_geometry(boston))

## Visualize them

plot(st_geometry(boston), 
     main = "Neighborhood Structure (Queen's)",
     reset = FALSE)
plot(boston.nb, boston.coords, 
     add = TRUE, 
     col = 2, 
     lwd = 1.5)

boston.listw = nb2listw(boston.nb)

Global Moran’s I

## Run the Global Moran's I test to look for auto correlation. Randomization assumtion is set to TRUE as we don't know about any larger spatial trends in the data. Two sided meaning postive or negative autocorrelation. 

moran.test(boston$logCMEDV,
           listw = boston.listw,
           alternative = "two.sided",
           randomisation = TRUE)
## 
##  Moran I test under randomisation
## 
## data:  boston$logCMEDV  
## weights: boston.listw    
## 
## Moran I statistic standard deviate = 27.06, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.7270399706     -0.0019801980      0.0007258269
  • Evidence here for strong spatial autocorrelation given the standard deviate, I statistic, and p-value.
## Purely for the sake of comparison, we’ll also calculate Moran’s I under the normality assumption, stating that the pattern of house prices in Boston is a subset of a larger spatial trend. You should see a slight change in the variance calculated, but not enough to affect our conclusion that the data are strongly autocorrelated.

moran.test(boston$logCMEDV,
           listw = boston.listw,
           alternative = "two.sided",
           randomisation = FALSE)
## 
##  Moran I test under normality
## 
## data:  boston$logCMEDV  
## weights: boston.listw    
## 
## Moran I statistic standard deviate = 27.038, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.7270399706     -0.0019801980      0.0007270156
## Run the test multiple times to compare our observed statistic compared to a random sampling of values across the dataset. 

## Once done, we look at the rank of the observed version of Moran’s I against those obtained from random resampling. If we are either at the high or low end of all these random realizations, it is highly likely that the observed distribution is significantly autocorrelated. As we are using a rank, we cannot use a two-sided test, but must specify if we believe the autocorrelation to be positive (‘greater’) or negative (‘less’). The number of simulations to run is given by the parameter nsim. Increasing this, will increase the precision of the p-value obtained (but take longer to run):


moran.mc(boston$logCMEDV,
         listw = boston.listw,
         alternative = "greater",
         nsim = 999)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  boston$logCMEDV 
## weights: boston.listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.72704, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
  • In this case, our observed value is the highest value after running the test 999 times. Based on this and the p-value we reject the null hypothesis that there is no autocorrelation.
## Now make the Moran scatterplot. This shows the relationship between the value of any given area and its neighbors. The slope of the fitted line is the value of Moran’s I:

moran.plot(boston$logCMEDV,
           boston.listw,
           labels = as.character(boston$ID),
           xlab = "Log Median Value",
           ylab = "Lagged Log Median Value")

Local Moran’s I

## We will now calculate the local Moran’s I to examine the spatial pattern of autocorrelation. This returns a statistic (and z-score) for each area considered.

lm1 = localmoran(boston$logCMEDV,
                 listw = boston.listw,
                 alternative = "two.sided")


## View the header of the local moran's value by polygon
head(lm1)
##             Ii          E.Ii      Var.Ii        Z.Ii Pr(z != E(Ii))
## 1 -0.224510839 -2.873020e-04 0.017914316 -1.67525561    0.093884090
## 2 -0.001798936 -2.175813e-05 0.002735961 -0.03397629    0.972896060
## 3  0.137484323 -9.177414e-05 0.005723568  1.81848429    0.068990146
## 4  0.033966647 -8.277718e-05 0.010408122  0.33375177    0.738566878
## 5  0.491120158 -4.043873e-04 0.022365621  3.28665963    0.001013833
## 6  0.002533256 -2.287544e-05 0.001909980  0.05848826    0.953359715
## Extract the z scores in the fourth column

boston$zscore = lm1[ , 4]

## Extract the z scores in the fifth column

boston$pval = lm1[ , 5]

## Visualize them

## Note: style  
## method to process the color scale when col is a numeric variable. Discrete gradient options are "cat", "fixed", "sd", "equal", "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks", "dpih", "headtails", and "log10_pretty"

tm_shape(boston) +
  tm_fill("zscore",
          palette = "Blues",
          style = "jenks",
          n = 6) +
  tm_borders() +
  tm_layout(main.title = "Local Morans I (Z-Scores)",
            legend.outside = TRUE,
            legend.outside.position = "left")

## Turn the pvalue into a binary vector to separate statistically significant values (>0.05) and those that are not.

boston$pval.bin = as.factor(ifelse(boston$pval < 0.05, "Signifcant", "Not Significant"))



tm_shape(boston) +
  tm_fill("pval.bin") +
  tm_borders() +
  tm_layout(main.title = "Local Morans I (P-values)",
            legend.position = c("left", "bottom"))

## Produce weights to include the weight of a location with itself (include.self) syntax.

boston.listwGs = nb2listw(include.self(boston.nb), style = "B")

## Caluclate the statistc - localG()

boston$lG = as.numeric(localG(boston$logCMEDV, boston.listwGs))

## Plot the results

tm_shape(boston) +
  tm_fill("lG",
          palette = "-RdYlBu",
          style = "jenks",
          n = 6,
          title = "Getis-Ord Score") +
  tm_borders() +
  tm_layout(main.title = "Local Getis-Ord (Z-Scores)",
            main.title.size = 1,
            legend.outside = TRUE,
            legend.outside.position = "left")

  • Observe that we have spatial structure with clusters of low prices in the center and higher prices in the suburbs.
## Significant factor build

## Question on the if else statment. Number bound choice??

boston$lG.sig = ifelse(boston$lG < -1.96, -1,
                       ifelse(boston$lG > 1.96, 1, 0))

boston$lG.sig = factor(boston$lG.sig, labels = c("Cold", "Not Significant", "Hot"))


## Visualization

tm_shape(boston) + 
  tm_fill("lG.sig", 
          palette = "-RdYlBu", 
          style = "jenks",
          n = 6) +
  tm_borders() +
  tm_layout(main.title = "Local Getis-Ord G(*) hot/cold spots",
            main.title.size = 1,
            legend.position = c("left", "bottom"))

Spatial Regression Models

## Read the Data

col = st_read("../datafiles/columbus/columbus.shp", quiet = TRUE)

## Check CRS

st_crs(col)
## Coordinate Reference System: NA
## Set CRS

st_crs(col) = 4326

## Skewed (to the left according to the histogram...error in the lab instructions?)
## Log transformation

col$lINC = log(col$INC)

col$lHOVAL = log(col$HOVAL)


## Visualize the Crime Rate


tm_shape(col) +
  tm_fill("CRIME",
          palette = "-RdYlGn") +
  tm_borders(col = "gray")

Build the Spatial Weight Matrix

## Get the Geometry

col.geom = st_geometry(col)

## Get the centroids

col.coords = st_centroid(col.geom)

## Build the neighborhood structure (Queens Case)

col.nbq = poly2nb(col, queen = TRUE)


## Convert to a Spatial Weight Matrix

col.listw = nb2listw(col.nbq)


## Plot the Structure

plot(col.geom, 
     main = "Neighborhood Structure (Queen's)",
     reset = FALSE)
plot(col.nbq, col.coords, 
     add = TRUE, 
     col = 1, 
     lwd = 1.5)

Checking For Autocorrelation for Crime

## Run the Global Moran's I test to look for auto correlation. Randomization assumtion is set to TRUE as we don't know about any larger spatial trends in the data. Two sided meaning postive or negative autocorrelation. 

moran.test(col$CRIME,
           listw = col.listw,
           alternative = "two.sided",
           randomisation = TRUE)
## 
##  Moran I test under randomisation
## 
## data:  col$CRIME  
## weights: col.listw    
## 
## Moran I statistic standard deviate = 5.5894, p-value = 2.279e-08
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.500188557      -0.020833333       0.008689289
## Monte Carlo Check

moran.mc(col$CRIME,
         listw = col.listw,
         alternative = "greater",
         nsim = 999)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  col$CRIME 
## weights: col.listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.50019, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
  • Suggests a strong level of autocorrelation
col.lm1 = localmoran(col$CRIME,
                 listw = col.listw,
                 alternative = "two.sided")


## View the header of the local moran's value by polygon
head(col.lm1)
##             Ii          E.Ii      Var.Ii        Z.Ii Pr(z != E(Ii))
## 1  0.736818491 -0.0285985420 0.666144891  0.93780765      0.3483433
## 2  0.528777013 -0.0202502140 0.310266063  0.98565912      0.3243004
## 3  0.093850742 -0.0015396867 0.017630072  0.71841891      0.4724990
## 4  0.004820967 -0.0005707572 0.006541757  0.06666232      0.9468505
## 5  0.303606125 -0.0184931910 0.094617920  1.04713603      0.2950368
## 6 -0.181570517 -0.0062384562 0.148657003 -0.45474575      0.6492922
## Extract the z scores in the fourth column

col$zscore = col.lm1[ , 4]

## Extract the z scores in the fifth column

col$pval = col.lm1[ , 5]

## Visualize them

## Note: style  
## method to process the color scale when col is a numeric variable. Discrete gradient options are "cat", "fixed", "sd", "equal", "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks", "dpih", "headtails", and "log10_pretty"

tm_shape(col) +
  tm_fill("zscore",
          palette = "Blues",
          style = "jenks",
          n = 6) +
  tm_borders() +
  tm_layout(main.title = " Columbus - Local Morans I (Z-Scores)",
            legend.outside = TRUE,
            legend.outside.position = "left")

## Turn the pvalue into a binary vector to separate statistically significant values (>0.05) and those that are not.

col$pval.bin = as.factor(ifelse(col$pval < 0.05, "Signifcant", "Not Significant"))


tm_shape(col) +
  tm_fill("pval.bin",
          title = "p-value Significance") +
  tm_borders() +
  tm_layout(main.title = "Columbus - Local Morans I (P-values)",
            legend.position = c("left", "top"))

  • The local test also provides evidence for significance

Spatial Regression

## Basic linear model - Crime as a function of the log transformed income and home value

col.fit1 = lm(CRIME ~ lINC + lHOVAL, data = col)

summary(col.fit1)
## 
## Call:
## lm(formula = CRIME ~ lINC + lHOVAL, data = col)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.361  -5.997  -1.521   7.432  28.013 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  138.074     14.542   9.495 2.06e-12 ***
## lINC         -19.272      5.024  -3.836 0.000379 ***
## lHOVAL       -14.924      4.568  -3.267 0.002059 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.6 on 46 degrees of freedom
## Multiple R-squared:  0.5392, Adjusted R-squared:  0.5191 
## F-statistic: 26.91 on 2 and 46 DF,  p-value: 1.827e-08
  • This model performs well, but we know based on the map and Moran’s test that there is some level of autocorrelation
## Use morans I Monte Carlo simulations

moran.mc(residuals(col.fit1),
         nsim = 999,
         listw = col.listw,
         alternative = "greater")
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  residuals(col.fit1) 
## weights: col.listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.21308, observed rank = 992, p-value = 0.008
## alternative hypothesis: greater
  • Statistically significant that there is spatial autocorrelation among our residuals (errors)
## The Lagrange multiplier test is used to assess whether the autocorrelation is in the values of the dependent variable or in its errors, and helps in the choice of which spatial regression model to use. 

## Build the Test
## LMerr is testing the errors
## LMlag is testing the dependent variable and a "lag" effect (spillover)

lmt = lm.LMtests(col.fit1, col.listw, test = c("LMerr", "LMlag"))

summary(lmt)
##  Lagrange multiplier diagnostics for spatial dependence
## data:  
## model: lm(formula = CRIME ~ lINC + lHOVAL, data = col)
## weights: col.listw
##  
##       statistic parameter  p.value   
## LMerr    4.7913         1 0.028603 * 
## LMlag    9.1694         1 0.002461 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Indicates that there is spatial autocorrelation among both the errors and dependent variable, however it is stronger in the dependent variable.
## Use the robust test to decide which of two is the more likely source for autocorrelation

lmt_robust = lm.LMtests(col.fit1, col.listw, test = c("RLMerr", "RLMlag"))

summary(lmt_robust)
##  Lagrange multiplier diagnostics for spatial dependence
## data:  
## model: lm(formula = CRIME ~ lINC + lHOVAL, data = col)
## weights: col.listw
##  
##        statistic parameter p.value  
## RLMerr  0.024805         1 0.87485  
## RLMlag  4.402809         1 0.03588 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Spatial Lag Model

## Build the model using the lagsarlm() function

col.fit2 = lagsarlm(CRIME ~ lINC + lHOVAL,
                    data = col,
                    listw = col.listw)

summary(col.fit2)
## 
## Call:lagsarlm(formula = CRIME ~ lINC + lHOVAL, data = col, listw = col.listw)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -36.1856  -5.4709  -0.5156   6.4413  22.5983 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##             Estimate Std. Error z value  Pr(>|z|)
## (Intercept)  99.5700    16.1538  6.1639 7.099e-10
## lINC        -11.9221     4.5544 -2.6177 0.0088524
## lHOVAL      -13.5984     3.9965 -3.4026 0.0006676
## 
## Rho: 0.42092, LR test value: 8.9447, p-value: 0.0027828
## Asymptotic standard error: 0.12671
##     z-value: 3.3219, p-value: 0.00089398
## Wald statistic: 11.035, p-value: 0.00089398
## 
## Log likelihood: -183.6196 for lag model
## ML residual variance (sigma squared): 100.73, (sigma: 10.036)
## Number of observations: 49 
## Number of parameters estimated: 5 
## AIC: 377.24, (AIC for lm: 384.18)
## LM test for residual autocorrelation
## test value: 0.028921, p-value: 0.86496

Note: - Estimates of the coefficients associated with the independent variables - The coefficient rho value, showing the strength and significance of the autoregressive spatial component - The LM test on residuals to look for remaining autocorrelation - The AIC and log-likelihood giving an estimate of the goodness-of-fit of the model

moran.mc(residuals(col.fit2),
         nsim = 999,
         listw = col.listw,
         alternative = "greater")
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  residuals(col.fit2) 
## weights: col.listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.010517, observed rank = 656, p-value = 0.344
## alternative hypothesis: greater
  • This model does well at taking into account the spatial autocorrelation

Spatial Error Model

col.fit3 = errorsarlm(CRIME ~ lINC + lHOVAL,
                    data = col,
                    listw = col.listw)

summary(col.fit3)
## 
## Call:errorsarlm(formula = CRIME ~ lINC + lHOVAL, data = col, listw = col.listw)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -33.3722  -6.3766  -0.1172   6.9844  21.9846 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##             Estimate Std. Error z value  Pr(>|z|)
## (Intercept) 117.7761    14.7937  7.9612 1.776e-15
## lINC         -9.9844     4.9172 -2.0305  0.042305
## lHOVAL      -16.0041     4.1761 -3.8323  0.000127
## 
## Lambda: 0.53456, LR test value: 6.9048, p-value: 0.0085966
## Asymptotic standard error: 0.14043
##     z-value: 3.8066, p-value: 0.00014091
## Wald statistic: 14.49, p-value: 0.00014091
## 
## Log likelihood: -184.6396 for error model
## ML residual variance (sigma squared): 101.71, (sigma: 10.085)
## Number of observations: 49 
## Number of parameters estimated: 5 
## AIC: 379.28, (AIC for lm: 384.18)
  • In the output of the function, note the value of lambda, the autoregressive coefficient representing the strength of autocorrelation in the residuals of a linear model.

Spatial Durbin Lag Model

## Correlation between the dependent variable and the neighboring independent variables

## This also uses the lagsarlm() function, but with the parameter type set to ‘mixed’, to specify a Spatial Durbin lag model:

col.fit4 = lagsarlm(CRIME ~ lINC + lHOVAL,
                    data = col,
                    listw = col.listw,
                    type = "mixed")

summary(col.fit4)
## 
## Call:lagsarlm(formula = CRIME ~ lINC + lHOVAL, data = col, listw = col.listw, 
##     type = "mixed")
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -36.42025  -5.66457   0.10208   5.61973  21.82351 
## 
## Type: mixed 
## Coefficients: (asymptotic standard errors) 
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  95.4310    31.8319  2.9980 0.002718
## lINC         -9.6490     4.9167 -1.9625 0.049702
## lHOVAL      -15.2748     4.1030 -3.7229 0.000197
## lag.lINC    -13.6313     8.4749 -1.6084 0.107743
## lag.lHOVAL   11.8547     8.2710  1.4333 0.151776
## 
## Rho: 0.35363, LR test value: 3.4569, p-value: 0.062987
## Asymptotic standard error: 0.16701
##     z-value: 2.1174, p-value: 0.034225
## Wald statistic: 4.4834, p-value: 0.034225
## 
## Log likelihood: -182.0123 for mixed model
## ML residual variance (sigma squared): 95.663, (sigma: 9.7807)
## Number of observations: 49 
## Number of parameters estimated: 7 
## AIC: 378.02, (AIC for lm: 379.48)
## LM test for residual autocorrelation
## test value: 0.14333, p-value: 0.705
  • Not significant with the lagged versions. There is a low Rho value at the p-value is above the critical threshold.
aic.tbl = AIC(col.fit2, col.fit3, col.fit4)

rownames(aic.tbl) = c("Lag Model", "Error Model", "Durbin Model")

aic.tbl %>%
  kbl(caption = "AIC Comparison") %>%
  kable_classic_2(full_width = F, html_font = "arial")
AIC Comparison
df AIC
Lag Model 5 377.2393
Error Model 5 379.2792
Durbin Model 7 378.0246

Exercise

1.1 US Car Data Neighbor Structure

## Read the data

cars = st_read("../datafiles/usedcars/usa48_usedcars.shp", 
                 quiet = TRUE)

## Check

st_crs(cars)
## Coordinate Reference System: NA
## Check if the geometry is valid

st_is_valid(cars)
##  [1]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [25]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [37]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## Make the geometry valid

cars2 = st_make_valid(cars)

## Check again

st_is_valid(cars2)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE
## Set CRS

# Note Converting to WGS84 on the original data - cars - produces a polygon with overlapping lines which produces an error when building the centroids 

st_crs(cars2) = 4326

## Alternative code to deal with invalid geometry

##sf_use_s2(FALSE)
##cars <- st_read("../datafiles/usedcars/usa48_usedcars.shp", crs = 4326)
##tm_shape(cars) + tm_fill("price_1960")

## Extract the geometry
cars.geom = st_geometry(cars)

## Extract the centroids

cars.coord = st_centroid(cars.geom)

## Build the neighborhood structure

cars.nb = poly2nb(cars, queen = TRUE)

## Visualize the Structure

plot(cars.geom,
     reset = FALSE,
     border = "black")
plot(cars.nb, cars.coord,
     col = "red",
     cex = 0.5,
     add = TRUE)

Explanation:

  • I chose the queen’s method for constructing my neighbor function to account for states that are in close proximity, yet only share a point along their border, for example, the four corners region - Utah, Colorado, New Mexico, and Arizona. Utilizing the queen’s method allows us to account for Colorado and Arizona being neighbors along with New Mexico and Utah. This approach is ideal as the region shares several characteristics with each other.

1.2 Spatial Weight Matrix

## Utilizing a weighted approach to building our matrix

cars.listw = nb2listw(cars.nb, style = 'W')

## Summary of our weight matrix

summary(cars.listw)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 48 
## Number of nonzero links: 214 
## Percentage nonzero weights: 9.288194 
## Average number of links: 4.458333 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8 
##  1  4  9 11 10  9  2  2 
## 1 least connected region:
## 17 with 1 link
## 2 most connected regions:
## 23 40 with 8 links
## 
## Weights style: W 
## Weights constants summary:
##    n   nn S0       S1       S2
## W 48 2304 48 23.94514 197.3707

1.3 Moran’s Test for Autocorrelation

## Moran's Monte Carlo

cars.mc = 
  moran.mc(cars$price_1960,
         listw = cars.listw,
         alternative = "greater",
         nsim = 999)

cars.mc
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  cars$price_1960 
## weights: cars.listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.78356, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Interpretation:

  • Based on the high Moran’s I statistic of 0.7835615, its observed rank of 1000 , and a p-value of 0.001 we can reject the null hypothesis and accept the alternative hypothesis that there is spatial autocorrelation for used car prices in the United States.

1.4 Simple Linear Model

car.lm1 = lm(price_1960 ~ tax_charge, data = cars)

car.lm1.sum = summary(car.lm1)

car.lm1.sum
## 
## Call:
## lm(formula = price_1960 ~ tax_charge, data = cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -116.701  -45.053   -1.461   43.400  107.807 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1435.7506    27.5796  52.058  < 2e-16 ***
## tax_charge     0.6872     0.1754   3.918 0.000294 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.01 on 46 degrees of freedom
## Multiple R-squared:  0.2503, Adjusted R-squared:  0.234 
## F-statistic: 15.35 on 1 and 46 DF,  p-value: 0.0002939
car.lm1.mc = 
  moran.mc(residuals(car.lm1),
         nsim = 999,
         listw = cars.listw,
         alternative = "greater")

car.lm1.mc
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  residuals(car.lm1) 
## weights: cars.listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.57482, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Coefficients:

  • The intercept is 1435.7506037 and the slope is 0.6871577.

  • The \(r^2\) is 0.2339528.

  • The p-value is 2.9387325^{-4}.

Residual Moran’s Monte Carlo:

  • The Moran’s I statistic is 0.5748178 with an observed rank of 1000 and p-value of 0.001. Given these values we reject the null hypothesis and accept the alternative hypothesis that there is spatial autocorrelation among the residuals.

1.5 Lagrange Multiplier Test

## Non-robust test

cars.lmt = lm.LMtests(car.lm1, listw = cars.listw, test = c("LMerr", "LMlag"))

cars.lmt.sum = summary(cars.lmt)

cars.lmt.sum
##  Lagrange multiplier diagnostics for spatial dependence
## data:  
## model: lm(formula = price_1960 ~ tax_charge, data = cars)
## weights: cars.listw
##  
##       statistic parameter   p.value    
## LMerr    31.793         1 1.715e-08 ***
## LMlag    40.664         1 1.808e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Robust Test
## RLM syntax for the test

cars.lmt.robust = lm.LMtests(car.lm1, listw = cars.listw, test = c("RLMerr", "RLMlag"))

cars.lmt.robust.sum = summary(cars.lmt.robust)

cars.lmt.robust.sum
##  Lagrange multiplier diagnostics for spatial dependence
## data:  
## model: lm(formula = price_1960 ~ tax_charge, data = cars)
## weights: cars.listw
##  
##        statistic parameter  p.value   
## RLMerr  0.051751         1 0.820044   
## RLMlag  8.922943         1 0.002816 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Explanation:

  • The non-robust Lagrange multiplier test results in significant values for both the error and lag models. The statistic and p-value for the error model are 31.7925591 and 1.715487^{-8} respectively. The statistic and p-value for the lag model are 40.6637513 and 1.8081037^{-10} respectively.

  • The robust test returns a significant p-value, 0.0028161, for the lag model. As a result, we will build a spatial lag model for this data set.

1.6 Spatial Model Build

## Model Build
cars.lagmod = lagsarlm(price_1960 ~ tax_charge, 
                       data = cars,
                       listw = cars.listw)

## Summary Variable
cars.lagmod.sum = summary(cars.lagmod)

## Output
cars.lagmod.sum
## 
## Call:
## lagsarlm(formula = price_1960 ~ tax_charge, data = cars, listw = cars.listw)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -77.6781 -16.9504   4.2497  19.5484  58.9811 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) 309.42546  123.03167  2.5150   0.0119
## tax_charge    0.16711    0.10212  1.6364   0.1018
## 
## Rho: 0.78302, LR test value: 42.681, p-value: 6.4426e-11
## Asymptotic standard error: 0.081636
##     z-value: 9.5916, p-value: < 2.22e-16
## Wald statistic: 91.998, p-value: < 2.22e-16
## 
## Log likelihood: -239.8252 for lag model
## ML residual variance (sigma squared): 1036.7, (sigma: 32.197)
## Number of observations: 48 
## Number of parameters estimated: 4 
## AIC: 487.65, (AIC for lm: 528.33)
## LM test for residual autocorrelation
## test value: 2.1141, p-value: 0.14594

Explanation:

  • The intercept for this model is 309.4254609 with a p-value of 0.011903.

  • The slope for this model is 0.1671123 with a p-value of 0.1017542

  • The value of Rho for this model is 0.7830191 which indicates there is a strong spatial component for our dependent variable.

  • The AIC for this model is 487.65. This indicates that the spatial lag model is better than the simple linear model which has an AIC of 528.3316544.

1.7 Spatial Lag Model - Residuals Autocorrelation Test

cars.resid.moran =
  moran.mc(residuals(cars.lagmod),
         listw = cars.listw,
         alternative = "greater",
         nsim = 999)

cars.resid.moran
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  residuals(cars.lagmod) 
## weights: cars.listw  
## number of simulations + 1: 1000 
## 
## statistic = -0.077924, observed rank = 311, p-value = 0.689
## alternative hypothesis: greater

Explanation:

  • For the model’s residuals, the Moran’s I statistic is -0.0779243, with an observed rank of 311 and p-value of 0.689. Based on these values, we accept the null hypothesis that there is no spatial autocorrelation among this model’s residuals. Our spatial lag model adequately addresses the spatial component of our data.

1.8 Coefficient Interpretation

Explanation:

  • The slope for this model is 0.1671123 with a p-value of 0.1017542 which is not statistically significant. This is likely due to the nature of used cars. They are typically bought and sold within the same city and state which minimizes the delivery aspect of used car prices. Additionally, tax rates and delivery charges vary by state which may not reflect the cost of living and, subsequently, the price of a used car. In many cases, the tax and delivery charge only represent a small percentage (approximately 5% - 15%) of the overall cost and is not a great predictor of overall price.
tm_shape(cars2) +
  tm_fill(col = "price_1960",
          palette = "GnBu",
          title = "Price ($)") +
  tm_borders(col = "gray") +
  tm_layout(main.title = "Used Car Prices in the US - 1960",
            main.title.position = "center")

tm_shape(cars2) +
  tm_fill(col = "tax_charge",
          palette = "Reds",
          title = "Charges ($)") +
  tm_borders(col = "gray") +
  tm_layout(main.title = "Tax and Delivery Charge in the US - 1960",
            main.title.position = "center")