Spatial Regression I
## Set working directory
setwd("~/Desktop/University of Utah PhD /Course Work/Fall 2022 Semester/GEOG6000_Data Analysis/lab09")
library(RColorBrewer) #color palettes
library(sf) #simple features for spatial data
library(spdep)
library(spatialreg)
library(tmap) #mapping package
library(kableExtra) #table modification
library(dbscan) # density based clustering
##Boston Data Read - shapefile
boston = st_read("../datafiles/boston.tr/boston.shp",
quiet = TRUE)
##Boston Plot with sf
plot(st_geometry(boston))
##Boston Plot with tmap
tm_shape(boston) +
tm_borders()
##NY Data Read
NY8 = st_read("../datafiles/NY_data/NY8_utm18.shp",
quiet = TRUE)
plot(st_geometry(NY8))
tm_shape(NY8) +
tm_borders() +
tm_fill("POP8",
title = "Population") +
tm_layout(legend.outside = TRUE,
legend.title.size = 1,
legend.text.size = 0.6,
legend.position = c("right","top"),
legend.bg.alpha = 1)
head(NY8)
## Simple feature collection with 6 features and 17 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 421423.4 ymin: 4661351 xmax: 427091.6 ymax: 4664822
## Projected CRS: WGS 84 / UTM zone 18N
## AREANAME AREAKEY X Y POP8 TRACTCAS PROPCAS
## 1 Binghamton city 36007000100 4.069397 -67.3533 3540 3.08 0.000870
## 2 Binghamton city 36007000200 4.639371 -66.8619 3560 4.08 0.001146
## 3 Binghamton city 36007000300 5.709063 -66.9775 3739 1.09 0.000292
## 4 Binghamton city 36007000400 7.613831 -65.9958 2784 1.07 0.000384
## 5 Binghamton city 36007000500 7.315968 -67.3183 2571 3.06 0.001190
## 6 Binghamton city 36007000600 8.558753 -66.9344 2729 1.06 0.000388
## PCTOWNHOME PCTAGE65P Z AVGIDIST PEXPOSURE Cases Xm Ym
## 1 0.3277311 0.1466102 0.14197 0.2373852 3.167099 3.08284 4069.397 -67353.3
## 2 0.4268293 0.2351124 0.35555 0.2087413 3.038511 4.08331 4639.371 -66861.9
## 3 0.3377396 0.1380048 -0.58165 0.1708548 2.838229 1.08750 5709.063 -66977.5
## 4 0.4616048 0.1188937 -0.29634 0.1406045 2.643366 1.06515 7613.831 -65995.8
## 5 0.1924370 0.1415791 0.45689 0.1577753 2.758587 3.06017 7315.968 -67318.3
## 6 0.3651786 0.1410773 -0.28123 0.1726033 2.848411 1.06386 8558.753 -66934.4
## Xshift Yshift geometry
## 1 423391.0 4661502 POLYGON ((421840.4 4662874,...
## 2 423961.0 4661993 POLYGON ((421826.4 4663108,...
## 3 425030.6 4661878 POLYGON ((423159 4664759, 4...
## 4 426935.4 4662859 POLYGON ((425261.8 4663493,...
## 5 426637.5 4661537 POLYGON ((425033.3 4662995,...
## 6 427880.3 4661921 POLYGON ((426178.8 4664216,...
##Simple histogram
hist(NY8$TRACTCAS,
xlab = "Cases per Tract")
##Build a thematic map by specifying the column (attribute) that you wish to plot
## Create a color palette
my.pal = brewer.pal(n = 9, "YlOrRd")
## Create the thematic map
plot(NY8["Cases"],
main = "New Yok Leukemia Cases",
col = my.pal)
##Extract the polygons for Syracuse
syracuse = NY8[NY8$AREANAME == "Syracuse city", ]
## Store the geometry of the polygons
syracuse.geom = st_geometry(syracuse)
## Store the centroids of the polygons
syracuse.coords = st_centroid(syracuse.geom)
## Plot the geometry. The reset=FALSE syntax allows you to keep the plot and add more layers. The add=TRUE syntax adds it to the current plot
plot(syracuse.geom, reset = FALSE)
plot(syracuse.coords, pch = 16, col = 2, add = TRUE)
## This method connects polygons that touch boundaries or corners
sy1_nb = poly2nb(syracuse)
## Visualize the neighborhood structure
plot(syracuse.geom,
main = "Syracuse Neighborhood Structure (Queen)",
reset = FALSE)
plot(sy1_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
## This method connects polygons that touch boundaries only
sy2_nb = poly2nb(syracuse, queen = FALSE)
## Visualize the neighborhood structure
plot(syracuse.geom,
main = "Syracuse Neighborhood Structure (Rook)",
reset = FALSE)
plot(sy2_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
par(mfrow = c(1,2))
plot(syracuse.geom,
main = "Queen",
reset = FALSE)
plot(sy1_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
plot(syracuse.geom,
main = "Rook",
reset = FALSE)
plot(sy2_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
## Triangulation is built between sets of three points. It is joined as long as no other points are found in a circle fit to the three points:
sy3_nb = tri2nb(syracuse.coords)
## Visualize
plot(syracuse.geom,
main = "Delaunay Triangulation",
reset = FALSE)
plot(sy3_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
## Restricts the links formed by Delauney triangulation to a certain length.
sy4_nb = graph2nb(soi.graph(sy3_nb, syracuse.coords))
plot(syracuse.geom,
main = "Sphere of Influence",
reset = FALSE)
plot(sy4_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
## Here each region is linked to its k nearest neighbors irrespective of the distance between them
sy5_nb = knn2nb(knearneigh(syracuse.coords, k = 1))
sy6_nb = knn2nb(knearneigh(syracuse.coords, k = 2))
par(mfrow = c(1,2))
plot(syracuse.geom,
main = "k = 1",
reset = FALSE)
plot(sy5_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
plot(syracuse.geom,
main = "k = 2",
reset = FALSE)
plot(sy6_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
## Distance functions link together two regions if their centroids are within a certain distance of each other. This requires a minimum distance (d1) and a maximum distance (d2).
## Here we will set the maximum to 75% of the largest distance obtained when using 2 nearest neighbors (sy6_nb). We obtain all the distances between nearest neighbor pairs in the first line of code (this extracts distances as a list with nbdists() and converts them into a vector with unlist()). Then we find the maximum of these distances. Finally we set d2 to this value ∗0.75.
dists = nbdists(sy6_nb, syracuse.coords)
## Create a vector for the distance values
dists = unlist(dists)
## Find the maximum distance
max_1nn = max(dists)
## Create the neighborhood structure using the distance method
sy7_nb = dnearneigh(syracuse.coords, d1 = 0, d2 = 0.75*max_1nn)
plot(syracuse.geom,
main = "Maximum Distance (0.75)",
reset = FALSE)
plot(sy7_nb, syracuse.coords,
add = TRUE,
col = 2,
lwd = 1.5)
## While these function provide a quick way to establish a neighborhood structure, they will likely include some connections that are unrealistic or exclude some real connections. It is possible to edit the neighborhood structure directly - this is simply a list of n vectors, where each vector contains the neighbors linked to a single polygon.
## See the contents of the neighborhood structure vector
str(sy1_nb)
## List of 63
## $ : int [1:5] 2 5 11 21 22
## $ : int [1:5] 1 3 4 5 6
## $ : int [1:2] 2 4
## $ : int [1:7] 2 3 6 7 8 9 10
## $ : int [1:7] 1 2 6 11 12 13 14
## $ : int [1:5] 2 4 5 7 14
## $ : int [1:4] 4 6 8 14
## $ : int [1:5] 4 7 9 14 15
## $ : int [1:7] 4 8 10 15 16 17 18
## $ : int [1:5] 4 9 18 19 20
## $ : int [1:5] 1 5 12 22 23
## $ : int [1:7] 5 11 13 22 23 24 30
## $ : int [1:5] 5 12 14 15 24
## $ : int [1:7] 5 6 7 8 13 15 24
## $ : int [1:6] 8 9 13 14 16 24
## $ : int [1:6] 9 15 17 24 25 32
## $ : int [1:6] 9 16 18 25 33 34
## $ : int [1:6] 9 10 17 19 34 35
## $ : int [1:6] 10 18 20 34 35 36
## $ : int [1:4] 10 19 35 36
## $ : int [1:4] 1 22 26 28
## $ : int [1:7] 1 11 12 21 23 28 29
## $ : int [1:6] 11 12 22 28 29 30
## $ : int [1:8] 12 13 14 15 16 30 31 32
## $ : int [1:4] 16 17 32 33
## $ : int [1:3] 21 27 28
## $ : int [1:3] 26 28 37
## $ : int [1:9] 21 22 23 26 27 29 37 38 40
## $ : int [1:6] 22 23 28 30 38 40
## $ : int [1:8] 12 23 24 29 31 40 41 42
## $ : int [1:6] 24 30 32 41 42 43
## $ : int [1:6] 16 24 25 31 33 43
## $ : int [1:6] 17 25 32 34 43 44
## $ : int [1:8] 17 18 19 33 35 44 45 46
## $ : int [1:6] 18 19 20 34 36 46
## $ : int [1:4] 19 20 35 46
## $ : int [1:4] 27 28 38 47
## $ : int [1:8] 28 29 37 39 40 47 48 49
## $ : int [1:6] 38 40 41 48 49 51
## $ : int [1:6] 28 29 30 38 39 41
## $ : int [1:7] 30 31 39 40 42 51 52
## $ : int [1:5] 30 31 41 43 52
## $ : int [1:7] 31 32 33 42 44 52 54
## $ : int [1:6] 33 34 43 45 54 55
## $ : int [1:4] 34 44 46 55
## $ : int [1:5] 34 35 36 45 55
## $ : int [1:3] 37 38 48
## $ : int [1:5] 38 39 47 49 57
## $ : int [1:6] 38 39 48 50 51 57
## $ : int [1:4] 49 51 57 58
## $ : int [1:6] 39 41 49 50 52 58
## $ : int [1:7] 41 42 43 51 53 54 58
## $ : int [1:4] 52 54 58 59
## $ : int [1:8] 43 44 52 53 55 59 61 62
## $ : int [1:5] 44 45 46 54 56
## $ : int 55
## $ : int [1:5] 48 49 50 58 60
## $ : int [1:8] 50 51 52 53 57 59 60 61
## $ : int [1:5] 53 54 58 60 61
## $ : int [1:5] 57 58 59 61 63
## $ : int [1:6] 54 58 59 60 62 63
## $ : int [1:3] 54 61 63
## $ : int [1:3] 60 61 62
## - attr(*, "class")= chr "nb"
## - attr(*, "region.id")= chr [1:63] "110" "111" "112" "113" ...
## - attr(*, "call")= language poly2nb(pl = syracuse)
## - attr(*, "type")= chr "queen"
## - attr(*, "sym")= logi TRUE
## Access the first polygon's neighbors
sy1_nb[[1]]
## [1] 2 5 11 21 22
## See appendix for editing structures using the edit.nb() function
## First method - binary (0 = no connection, 1 = connection)
sy1_lw_b = nb2listw(sy1_nb, style = 'B')
## Second method - row standardization (0 = no connection, weight = 1/total # of connections)
sy1_lw_w = nb2listw(sy1_nb, style = 'W')
## Weights could be assigned by an inverse distance method, where closer neighbors have higher weights. To do this, we need the list of distances along each join for the neighborhood structure that we will use.
## First, we extract the distances from the neighborhood list (nbdists()), to obtain a list of distances.
## We then generate inverse distances by dividing by 1000 (to make these a little more manageable) and taking the reciprocal. This is a little complex as we obtain a list output from nbdists().
## We create a new function, invd(), which calculates the inverse distance in kilometers (the coordinates are in meters). Then we apply this function to each item in the list using lapply(). This is the list version of the function apply() that we have used earlier.
## Extract distances
dists = nbdists(sy1_nb, syracuse.coords)
## Generate inverse distances. Dividing by 1000 makes the distances more manageable
inverse_distance = function(x) {1/(x/1000)}
## Apply this function (lapply) to our distance extraction
idw = lapply(dists, inverse_distance)
## Generate the weights
sy1_lw_idwB = nb2listw(sy1_nb, glist = idw, style = "B")
## Create a column of log house prices
boston$logCMEDV = log(boston$CMEDV)
## View the histogram
hist(boston$logCMEDV)
cmedv.pal = brewer.pal(5, "Accent")
ellie.pal = brewer.pal(5, "PiYG")
tm_shape(boston) +
tm_fill("logCMEDV",
palette = cmedv.pal)
## Build the neighborhood structure using the queen's method
boston.nb = poly2nb(boston, queen = TRUE)
## Create the centroids
boston.coords = st_centroid(st_geometry(boston))
## Visualize them
plot(st_geometry(boston),
main = "Neighborhood Structure (Queen's)",
reset = FALSE)
plot(boston.nb, boston.coords,
add = TRUE,
col = 2,
lwd = 1.5)
boston.listw = nb2listw(boston.nb)
## Run the Global Moran's I test to look for auto correlation. Randomization assumtion is set to TRUE as we don't know about any larger spatial trends in the data. Two sided meaning postive or negative autocorrelation.
moran.test(boston$logCMEDV,
listw = boston.listw,
alternative = "two.sided",
randomisation = TRUE)
##
## Moran I test under randomisation
##
## data: boston$logCMEDV
## weights: boston.listw
##
## Moran I statistic standard deviate = 27.06, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic Expectation Variance
## 0.7270399706 -0.0019801980 0.0007258269
## Purely for the sake of comparison, we’ll also calculate Moran’s I under the normality assumption, stating that the pattern of house prices in Boston is a subset of a larger spatial trend. You should see a slight change in the variance calculated, but not enough to affect our conclusion that the data are strongly autocorrelated.
moran.test(boston$logCMEDV,
listw = boston.listw,
alternative = "two.sided",
randomisation = FALSE)
##
## Moran I test under normality
##
## data: boston$logCMEDV
## weights: boston.listw
##
## Moran I statistic standard deviate = 27.038, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic Expectation Variance
## 0.7270399706 -0.0019801980 0.0007270156
## Run the test multiple times to compare our observed statistic compared to a random sampling of values across the dataset.
## Once done, we look at the rank of the observed version of Moran’s I against those obtained from random resampling. If we are either at the high or low end of all these random realizations, it is highly likely that the observed distribution is significantly autocorrelated. As we are using a rank, we cannot use a two-sided test, but must specify if we believe the autocorrelation to be positive (‘greater’) or negative (‘less’). The number of simulations to run is given by the parameter nsim. Increasing this, will increase the precision of the p-value obtained (but take longer to run):
moran.mc(boston$logCMEDV,
listw = boston.listw,
alternative = "greater",
nsim = 999)
##
## Monte-Carlo simulation of Moran I
##
## data: boston$logCMEDV
## weights: boston.listw
## number of simulations + 1: 1000
##
## statistic = 0.72704, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
## Now make the Moran scatterplot. This shows the relationship between the value of any given area and its neighbors. The slope of the fitted line is the value of Moran’s I:
moran.plot(boston$logCMEDV,
boston.listw,
labels = as.character(boston$ID),
xlab = "Log Median Value",
ylab = "Lagged Log Median Value")
## We will now calculate the local Moran’s I to examine the spatial pattern of autocorrelation. This returns a statistic (and z-score) for each area considered.
lm1 = localmoran(boston$logCMEDV,
listw = boston.listw,
alternative = "two.sided")
## View the header of the local moran's value by polygon
head(lm1)
## Ii E.Ii Var.Ii Z.Ii Pr(z != E(Ii))
## 1 -0.224510839 -2.873020e-04 0.017914316 -1.67525561 0.093884090
## 2 -0.001798936 -2.175813e-05 0.002735961 -0.03397629 0.972896060
## 3 0.137484323 -9.177414e-05 0.005723568 1.81848429 0.068990146
## 4 0.033966647 -8.277718e-05 0.010408122 0.33375177 0.738566878
## 5 0.491120158 -4.043873e-04 0.022365621 3.28665963 0.001013833
## 6 0.002533256 -2.287544e-05 0.001909980 0.05848826 0.953359715
## Extract the z scores in the fourth column
boston$zscore = lm1[ , 4]
## Extract the z scores in the fifth column
boston$pval = lm1[ , 5]
## Visualize them
## Note: style
## method to process the color scale when col is a numeric variable. Discrete gradient options are "cat", "fixed", "sd", "equal", "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks", "dpih", "headtails", and "log10_pretty"
tm_shape(boston) +
tm_fill("zscore",
palette = "Blues",
style = "jenks",
n = 6) +
tm_borders() +
tm_layout(main.title = "Local Morans I (Z-Scores)",
legend.outside = TRUE,
legend.outside.position = "left")
## Turn the pvalue into a binary vector to separate statistically significant values (>0.05) and those that are not.
boston$pval.bin = as.factor(ifelse(boston$pval < 0.05, "Signifcant", "Not Significant"))
tm_shape(boston) +
tm_fill("pval.bin") +
tm_borders() +
tm_layout(main.title = "Local Morans I (P-values)",
legend.position = c("left", "bottom"))
## Produce weights to include the weight of a location with itself (include.self) syntax.
boston.listwGs = nb2listw(include.self(boston.nb), style = "B")
## Caluclate the statistc - localG()
boston$lG = as.numeric(localG(boston$logCMEDV, boston.listwGs))
## Plot the results
tm_shape(boston) +
tm_fill("lG",
palette = "-RdYlBu",
style = "jenks",
n = 6,
title = "Getis-Ord Score") +
tm_borders() +
tm_layout(main.title = "Local Getis-Ord (Z-Scores)",
main.title.size = 1,
legend.outside = TRUE,
legend.outside.position = "left")
## Significant factor build
## Question on the if else statment. Number bound choice??
boston$lG.sig = ifelse(boston$lG < -1.96, -1,
ifelse(boston$lG > 1.96, 1, 0))
boston$lG.sig = factor(boston$lG.sig, labels = c("Cold", "Not Significant", "Hot"))
## Visualization
tm_shape(boston) +
tm_fill("lG.sig",
palette = "-RdYlBu",
style = "jenks",
n = 6) +
tm_borders() +
tm_layout(main.title = "Local Getis-Ord G(*) hot/cold spots",
main.title.size = 1,
legend.position = c("left", "bottom"))
## Read the Data
col = st_read("../datafiles/columbus/columbus.shp", quiet = TRUE)
## Check CRS
st_crs(col)
## Coordinate Reference System: NA
## Set CRS
st_crs(col) = 4326
## Skewed (to the left according to the histogram...error in the lab instructions?)
## Log transformation
col$lINC = log(col$INC)
col$lHOVAL = log(col$HOVAL)
## Visualize the Crime Rate
tm_shape(col) +
tm_fill("CRIME",
palette = "-RdYlGn") +
tm_borders(col = "gray")
## Get the Geometry
col.geom = st_geometry(col)
## Get the centroids
col.coords = st_centroid(col.geom)
## Build the neighborhood structure (Queens Case)
col.nbq = poly2nb(col, queen = TRUE)
## Convert to a Spatial Weight Matrix
col.listw = nb2listw(col.nbq)
## Plot the Structure
plot(col.geom,
main = "Neighborhood Structure (Queen's)",
reset = FALSE)
plot(col.nbq, col.coords,
add = TRUE,
col = 1,
lwd = 1.5)
## Run the Global Moran's I test to look for auto correlation. Randomization assumtion is set to TRUE as we don't know about any larger spatial trends in the data. Two sided meaning postive or negative autocorrelation.
moran.test(col$CRIME,
listw = col.listw,
alternative = "two.sided",
randomisation = TRUE)
##
## Moran I test under randomisation
##
## data: col$CRIME
## weights: col.listw
##
## Moran I statistic standard deviate = 5.5894, p-value = 2.279e-08
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic Expectation Variance
## 0.500188557 -0.020833333 0.008689289
## Monte Carlo Check
moran.mc(col$CRIME,
listw = col.listw,
alternative = "greater",
nsim = 999)
##
## Monte-Carlo simulation of Moran I
##
## data: col$CRIME
## weights: col.listw
## number of simulations + 1: 1000
##
## statistic = 0.50019, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
col.lm1 = localmoran(col$CRIME,
listw = col.listw,
alternative = "two.sided")
## View the header of the local moran's value by polygon
head(col.lm1)
## Ii E.Ii Var.Ii Z.Ii Pr(z != E(Ii))
## 1 0.736818491 -0.0285985420 0.666144891 0.93780765 0.3483433
## 2 0.528777013 -0.0202502140 0.310266063 0.98565912 0.3243004
## 3 0.093850742 -0.0015396867 0.017630072 0.71841891 0.4724990
## 4 0.004820967 -0.0005707572 0.006541757 0.06666232 0.9468505
## 5 0.303606125 -0.0184931910 0.094617920 1.04713603 0.2950368
## 6 -0.181570517 -0.0062384562 0.148657003 -0.45474575 0.6492922
## Extract the z scores in the fourth column
col$zscore = col.lm1[ , 4]
## Extract the z scores in the fifth column
col$pval = col.lm1[ , 5]
## Visualize them
## Note: style
## method to process the color scale when col is a numeric variable. Discrete gradient options are "cat", "fixed", "sd", "equal", "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks", "dpih", "headtails", and "log10_pretty"
tm_shape(col) +
tm_fill("zscore",
palette = "Blues",
style = "jenks",
n = 6) +
tm_borders() +
tm_layout(main.title = " Columbus - Local Morans I (Z-Scores)",
legend.outside = TRUE,
legend.outside.position = "left")
## Turn the pvalue into a binary vector to separate statistically significant values (>0.05) and those that are not.
col$pval.bin = as.factor(ifelse(col$pval < 0.05, "Signifcant", "Not Significant"))
tm_shape(col) +
tm_fill("pval.bin",
title = "p-value Significance") +
tm_borders() +
tm_layout(main.title = "Columbus - Local Morans I (P-values)",
legend.position = c("left", "top"))
## Basic linear model - Crime as a function of the log transformed income and home value
col.fit1 = lm(CRIME ~ lINC + lHOVAL, data = col)
summary(col.fit1)
##
## Call:
## lm(formula = CRIME ~ lINC + lHOVAL, data = col)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.361 -5.997 -1.521 7.432 28.013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 138.074 14.542 9.495 2.06e-12 ***
## lINC -19.272 5.024 -3.836 0.000379 ***
## lHOVAL -14.924 4.568 -3.267 0.002059 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.6 on 46 degrees of freedom
## Multiple R-squared: 0.5392, Adjusted R-squared: 0.5191
## F-statistic: 26.91 on 2 and 46 DF, p-value: 1.827e-08
## Use morans I Monte Carlo simulations
moran.mc(residuals(col.fit1),
nsim = 999,
listw = col.listw,
alternative = "greater")
##
## Monte-Carlo simulation of Moran I
##
## data: residuals(col.fit1)
## weights: col.listw
## number of simulations + 1: 1000
##
## statistic = 0.21308, observed rank = 992, p-value = 0.008
## alternative hypothesis: greater
## The Lagrange multiplier test is used to assess whether the autocorrelation is in the values of the dependent variable or in its errors, and helps in the choice of which spatial regression model to use.
## Build the Test
## LMerr is testing the errors
## LMlag is testing the dependent variable and a "lag" effect (spillover)
lmt = lm.LMtests(col.fit1, col.listw, test = c("LMerr", "LMlag"))
summary(lmt)
## Lagrange multiplier diagnostics for spatial dependence
## data:
## model: lm(formula = CRIME ~ lINC + lHOVAL, data = col)
## weights: col.listw
##
## statistic parameter p.value
## LMerr 4.7913 1 0.028603 *
## LMlag 9.1694 1 0.002461 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Use the robust test to decide which of two is the more likely source for autocorrelation
lmt_robust = lm.LMtests(col.fit1, col.listw, test = c("RLMerr", "RLMlag"))
summary(lmt_robust)
## Lagrange multiplier diagnostics for spatial dependence
## data:
## model: lm(formula = CRIME ~ lINC + lHOVAL, data = col)
## weights: col.listw
##
## statistic parameter p.value
## RLMerr 0.024805 1 0.87485
## RLMlag 4.402809 1 0.03588 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Build the model using the lagsarlm() function
col.fit2 = lagsarlm(CRIME ~ lINC + lHOVAL,
data = col,
listw = col.listw)
summary(col.fit2)
##
## Call:lagsarlm(formula = CRIME ~ lINC + lHOVAL, data = col, listw = col.listw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.1856 -5.4709 -0.5156 6.4413 22.5983
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 99.5700 16.1538 6.1639 7.099e-10
## lINC -11.9221 4.5544 -2.6177 0.0088524
## lHOVAL -13.5984 3.9965 -3.4026 0.0006676
##
## Rho: 0.42092, LR test value: 8.9447, p-value: 0.0027828
## Asymptotic standard error: 0.12671
## z-value: 3.3219, p-value: 0.00089398
## Wald statistic: 11.035, p-value: 0.00089398
##
## Log likelihood: -183.6196 for lag model
## ML residual variance (sigma squared): 100.73, (sigma: 10.036)
## Number of observations: 49
## Number of parameters estimated: 5
## AIC: 377.24, (AIC for lm: 384.18)
## LM test for residual autocorrelation
## test value: 0.028921, p-value: 0.86496
Note: - Estimates of the coefficients associated with the independent variables - The coefficient rho value, showing the strength and significance of the autoregressive spatial component - The LM test on residuals to look for remaining autocorrelation - The AIC and log-likelihood giving an estimate of the goodness-of-fit of the model
moran.mc(residuals(col.fit2),
nsim = 999,
listw = col.listw,
alternative = "greater")
##
## Monte-Carlo simulation of Moran I
##
## data: residuals(col.fit2)
## weights: col.listw
## number of simulations + 1: 1000
##
## statistic = 0.010517, observed rank = 656, p-value = 0.344
## alternative hypothesis: greater
col.fit3 = errorsarlm(CRIME ~ lINC + lHOVAL,
data = col,
listw = col.listw)
summary(col.fit3)
##
## Call:errorsarlm(formula = CRIME ~ lINC + lHOVAL, data = col, listw = col.listw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.3722 -6.3766 -0.1172 6.9844 21.9846
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 117.7761 14.7937 7.9612 1.776e-15
## lINC -9.9844 4.9172 -2.0305 0.042305
## lHOVAL -16.0041 4.1761 -3.8323 0.000127
##
## Lambda: 0.53456, LR test value: 6.9048, p-value: 0.0085966
## Asymptotic standard error: 0.14043
## z-value: 3.8066, p-value: 0.00014091
## Wald statistic: 14.49, p-value: 0.00014091
##
## Log likelihood: -184.6396 for error model
## ML residual variance (sigma squared): 101.71, (sigma: 10.085)
## Number of observations: 49
## Number of parameters estimated: 5
## AIC: 379.28, (AIC for lm: 384.18)
## Correlation between the dependent variable and the neighboring independent variables
## This also uses the lagsarlm() function, but with the parameter type set to ‘mixed’, to specify a Spatial Durbin lag model:
col.fit4 = lagsarlm(CRIME ~ lINC + lHOVAL,
data = col,
listw = col.listw,
type = "mixed")
summary(col.fit4)
##
## Call:lagsarlm(formula = CRIME ~ lINC + lHOVAL, data = col, listw = col.listw,
## type = "mixed")
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.42025 -5.66457 0.10208 5.61973 21.82351
##
## Type: mixed
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 95.4310 31.8319 2.9980 0.002718
## lINC -9.6490 4.9167 -1.9625 0.049702
## lHOVAL -15.2748 4.1030 -3.7229 0.000197
## lag.lINC -13.6313 8.4749 -1.6084 0.107743
## lag.lHOVAL 11.8547 8.2710 1.4333 0.151776
##
## Rho: 0.35363, LR test value: 3.4569, p-value: 0.062987
## Asymptotic standard error: 0.16701
## z-value: 2.1174, p-value: 0.034225
## Wald statistic: 4.4834, p-value: 0.034225
##
## Log likelihood: -182.0123 for mixed model
## ML residual variance (sigma squared): 95.663, (sigma: 9.7807)
## Number of observations: 49
## Number of parameters estimated: 7
## AIC: 378.02, (AIC for lm: 379.48)
## LM test for residual autocorrelation
## test value: 0.14333, p-value: 0.705
aic.tbl = AIC(col.fit2, col.fit3, col.fit4)
rownames(aic.tbl) = c("Lag Model", "Error Model", "Durbin Model")
aic.tbl %>%
kbl(caption = "AIC Comparison") %>%
kable_classic_2(full_width = F, html_font = "arial")
df | AIC | |
---|---|---|
Lag Model | 5 | 377.2393 |
Error Model | 5 | 379.2792 |
Durbin Model | 7 | 378.0246 |
## Read the data
cars = st_read("../datafiles/usedcars/usa48_usedcars.shp",
quiet = TRUE)
## Check
st_crs(cars)
## Coordinate Reference System: NA
## Check if the geometry is valid
st_is_valid(cars)
## [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE
## Make the geometry valid
cars2 = st_make_valid(cars)
## Check again
st_is_valid(cars2)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE
## Set CRS
# Note Converting to WGS84 on the original data - cars - produces a polygon with overlapping lines which produces an error when building the centroids
st_crs(cars2) = 4326
## Alternative code to deal with invalid geometry
##sf_use_s2(FALSE)
##cars <- st_read("../datafiles/usedcars/usa48_usedcars.shp", crs = 4326)
##tm_shape(cars) + tm_fill("price_1960")
## Extract the geometry
cars.geom = st_geometry(cars)
## Extract the centroids
cars.coord = st_centroid(cars.geom)
## Build the neighborhood structure
cars.nb = poly2nb(cars, queen = TRUE)
## Visualize the Structure
plot(cars.geom,
reset = FALSE,
border = "black")
plot(cars.nb, cars.coord,
col = "red",
cex = 0.5,
add = TRUE)
Explanation:
## Utilizing a weighted approach to building our matrix
cars.listw = nb2listw(cars.nb, style = 'W')
## Summary of our weight matrix
summary(cars.listw)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 48
## Number of nonzero links: 214
## Percentage nonzero weights: 9.288194
## Average number of links: 4.458333
## Link number distribution:
##
## 1 2 3 4 5 6 7 8
## 1 4 9 11 10 9 2 2
## 1 least connected region:
## 17 with 1 link
## 2 most connected regions:
## 23 40 with 8 links
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 48 2304 48 23.94514 197.3707
## Moran's Monte Carlo
cars.mc =
moran.mc(cars$price_1960,
listw = cars.listw,
alternative = "greater",
nsim = 999)
cars.mc
##
## Monte-Carlo simulation of Moran I
##
## data: cars$price_1960
## weights: cars.listw
## number of simulations + 1: 1000
##
## statistic = 0.78356, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
Interpretation:
car.lm1 = lm(price_1960 ~ tax_charge, data = cars)
car.lm1.sum = summary(car.lm1)
car.lm1.sum
##
## Call:
## lm(formula = price_1960 ~ tax_charge, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -116.701 -45.053 -1.461 43.400 107.807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1435.7506 27.5796 52.058 < 2e-16 ***
## tax_charge 0.6872 0.1754 3.918 0.000294 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.01 on 46 degrees of freedom
## Multiple R-squared: 0.2503, Adjusted R-squared: 0.234
## F-statistic: 15.35 on 1 and 46 DF, p-value: 0.0002939
car.lm1.mc =
moran.mc(residuals(car.lm1),
nsim = 999,
listw = cars.listw,
alternative = "greater")
car.lm1.mc
##
## Monte-Carlo simulation of Moran I
##
## data: residuals(car.lm1)
## weights: cars.listw
## number of simulations + 1: 1000
##
## statistic = 0.57482, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
Coefficients:
The intercept is 1435.7506037 and the slope is 0.6871577.
The \(r^2\) is 0.2339528.
The p-value is 2.9387325^{-4}.
Residual Moran’s Monte Carlo:
## Non-robust test
cars.lmt = lm.LMtests(car.lm1, listw = cars.listw, test = c("LMerr", "LMlag"))
cars.lmt.sum = summary(cars.lmt)
cars.lmt.sum
## Lagrange multiplier diagnostics for spatial dependence
## data:
## model: lm(formula = price_1960 ~ tax_charge, data = cars)
## weights: cars.listw
##
## statistic parameter p.value
## LMerr 31.793 1 1.715e-08 ***
## LMlag 40.664 1 1.808e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Robust Test
## RLM syntax for the test
cars.lmt.robust = lm.LMtests(car.lm1, listw = cars.listw, test = c("RLMerr", "RLMlag"))
cars.lmt.robust.sum = summary(cars.lmt.robust)
cars.lmt.robust.sum
## Lagrange multiplier diagnostics for spatial dependence
## data:
## model: lm(formula = price_1960 ~ tax_charge, data = cars)
## weights: cars.listw
##
## statistic parameter p.value
## RLMerr 0.051751 1 0.820044
## RLMlag 8.922943 1 0.002816 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Explanation:
The non-robust Lagrange multiplier test results in significant values for both the error and lag models. The statistic and p-value for the error model are 31.7925591 and 1.715487^{-8} respectively. The statistic and p-value for the lag model are 40.6637513 and 1.8081037^{-10} respectively.
The robust test returns a significant p-value, 0.0028161, for the lag model. As a result, we will build a spatial lag model for this data set.
## Model Build
cars.lagmod = lagsarlm(price_1960 ~ tax_charge,
data = cars,
listw = cars.listw)
## Summary Variable
cars.lagmod.sum = summary(cars.lagmod)
## Output
cars.lagmod.sum
##
## Call:
## lagsarlm(formula = price_1960 ~ tax_charge, data = cars, listw = cars.listw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77.6781 -16.9504 4.2497 19.5484 58.9811
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 309.42546 123.03167 2.5150 0.0119
## tax_charge 0.16711 0.10212 1.6364 0.1018
##
## Rho: 0.78302, LR test value: 42.681, p-value: 6.4426e-11
## Asymptotic standard error: 0.081636
## z-value: 9.5916, p-value: < 2.22e-16
## Wald statistic: 91.998, p-value: < 2.22e-16
##
## Log likelihood: -239.8252 for lag model
## ML residual variance (sigma squared): 1036.7, (sigma: 32.197)
## Number of observations: 48
## Number of parameters estimated: 4
## AIC: 487.65, (AIC for lm: 528.33)
## LM test for residual autocorrelation
## test value: 2.1141, p-value: 0.14594
Explanation:
The intercept for this model is 309.4254609 with a p-value of 0.011903.
The slope for this model is 0.1671123 with a p-value of 0.1017542
The value of Rho for this model is 0.7830191 which indicates there is a strong spatial component for our dependent variable.
The AIC for this model is 487.65. This indicates that the spatial lag model is better than the simple linear model which has an AIC of 528.3316544.
cars.resid.moran =
moran.mc(residuals(cars.lagmod),
listw = cars.listw,
alternative = "greater",
nsim = 999)
cars.resid.moran
##
## Monte-Carlo simulation of Moran I
##
## data: residuals(cars.lagmod)
## weights: cars.listw
## number of simulations + 1: 1000
##
## statistic = -0.077924, observed rank = 311, p-value = 0.689
## alternative hypothesis: greater
Explanation:
Explanation:
tm_shape(cars2) +
tm_fill(col = "price_1960",
palette = "GnBu",
title = "Price ($)") +
tm_borders(col = "gray") +
tm_layout(main.title = "Used Car Prices in the US - 1960",
main.title.position = "center")
tm_shape(cars2) +
tm_fill(col = "tax_charge",
palette = "Reds",
title = "Charges ($)") +
tm_borders(col = "gray") +
tm_layout(main.title = "Tax and Delivery Charge in the US - 1960",
main.title.position = "center")