library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(sp)
library(spData)
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(spdep)
library(ggplot2)
library(spatialreg)
## Loading required package: Matrix
## 
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
## 
##     as_dgRMatrix_listw, as_dsCMatrix_I, as_dsCMatrix_IrW,
##     as_dsTMatrix_listw, as.spam.listw, can.be.simmed, cheb_setup,
##     create_WX, do_ldet, eigen_pre_setup, eigen_setup, eigenw,
##     errorsarlm, get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, GMargminImage, GMerrorsar,
##     griffith_sone, gstsls, Hausman.test, impacts, intImpacts,
##     Jacobian_W, jacobianSetup, l_max, lagmess, lagsarlm, lextrB,
##     lextrS, lextrW, lmSLX, LU_prepermutate_setup, LU_setup,
##     Matrix_J_setup, Matrix_setup, mcdet_setup, MCMCsamp, ME, mom_calc,
##     mom_calc_int2, moments_setup, powerWeights, sacsarlm,
##     SE_classic_setup, SE_interp_setup, SE_whichMin_setup,
##     set.ClusterOption, set.coresOption, set.mcOption,
##     set.VerboseOption, set.ZeroPolicyOption, similar.listw, spam_setup,
##     spam_update_setup, SpatialFiltering, spautolm, spBreg_err,
##     spBreg_lag, spBreg_sac, stsls, subgraph_eigenw, trW
library(tmap)
## Registered S3 methods overwritten by 'stars':
##   method             from
##   st_bbox.SpatRaster sf  
##   st_crs.SpatRaster  sf

Instructions: The file usa48_usedcars.shp in directory ‘usedcars’ contains information on tax rates and delivery charges for new cars (tax_charge) in the 48 conterminous U.S. states for the period 1955-1959, as well as the average used car price for 1960 (price_1960). Use this dataset to build a spatial model linking used car prices to the tax and delivery costs. You will need the spdep library. * Build a neighborhood function linking the 48 states. You will need to choose one of the set of neighborhood functions available in R. Explain your choice of neighborhood function

cars <- st_read("./usedcars/usa48_usedcars.shp", quiet = TRUE)
cars <- st_set_crs(cars, 5070)


#spatial weight matrix
cars.geom <- st_geometry(cars)
cars.coords <- st_centroid(cars.geom)
cars.nbq <- poly2nb(cars, queen = FALSE)
cars.nbq.q <- poly2nb(cars)
cars.nb <- tri2nb(cars.coords)
cars.nb.sph <- graph2nb(soi.graph(cars.nb, cars.coords))
cars.nb.k4 <- knn2nb(knearneigh(cars.coords, k = 4))

#plot difference between queen's (red) and rook's (green)
plot(cars.geom, reset = FALSE, main = "Queen's Case vs. Rook's Case")
plot(cars.nbq.q, cars.coords, add = TRUE, col = rgb(red = 1, green = 0, blue = 0, alpha = 0.5), lwd = 2.5)
plot(cars.nbq, cars.coords, add = TRUE, col = rgb(red = 0, green = 1, blue = 0, alpha = 0.5), lwd = 2.5)

#difference between sphere of influence (blue) and k-nearest neighbors (purple)
plot(cars.geom, reset = FALSE, main = "Sphere of Influence vs. K-Nearest Neighbors (4)")
plot(cars.nb.sph, cars.coords, add = TRUE, rgb(red = 0.11, green = 0.455, blue = 0.81, alpha = 0.7), lwd = 2.5)
plot(cars.nb.k4, cars.coords, add = TRUE, rgb(red = 0.84, green = .09, blue = 0.87, alpha = 0.4), lwd = 2.5)

#distance function
cars.nb.k4 <- knn2nb(knearneigh(cars.coords, k = 4))
dists <- nbdists(cars.nb.k4, cars.coords)
dists <- unlist(dists)
max_lnn <- max(dists)
cars.nb.d = dnearneigh(cars.coords, d1 = 0, d2 = 0.75 * max_lnn)

#distance plot
plot(cars.geom, reset = FALSE, main = "Distance Function (3-Nearest Neighbors)")
plot(cars.nb.d, cars.coords, add = TRUE, rgb(red = 0.25, green = .25, blue = 0.9, alpha = 0.3), lwd = 2.5)

Answer: I tried most of our methods when choosing a neighborhood structure and ultimately decided to use a distance function.

The boundary methods seemed ill-suited to the odd polygon shapes. Some contiguous boundaries may not be very informative connections. A good example can be seen in Oklahoma, which is connected to New Mexico in both the queen’s and rook’s cases. The only difference between the two is at the Four Corners. I didn’t select either of these methods. On the East Coast, I believe people would be willing to cross multiple borders, and in the West, I think a centroid method may better capture the connections between states.

I did not select sphere of influence or k-nearest neighbors. I think k-nearest neighbors can be advantageous with polygons of different sizes, but I couldn’t find a happy medium that I was satisfied with — I wanted more states to be connected in the East, as I expected, but also connect western states that shared boundaries. Sphere of influence seemed too restrictive.

I adjusted the parameters on the distance function for awhile, eventually choosing three nearest neighbors at 75% maximum distance. While the East Coast looks quite busy, I think this more accurately captures what movement we might expect between these states. Connecticuters likely wouldn’t have qualms with moving a car to New Jersey, and Rhode Islanders would likely travel to New York. Because the movement of cars is likely related to the locations of people, I would prefer to use population-weighted centroids.

cars.nb <- tri2nb(cars.coords)
dists <- nbdists(cars.nb, cars.coords)

inv_dist <- function(x) {1/(x/1000)}

idw <- lapply(dists, inv_dist)

cars_idw <- nb2listw(cars.nb, glist = idw, style = "B")

summary(cars_idw)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 48 
## Number of nonzero links: 264 
## Percentage nonzero weights: 11.45833 
## Average number of links: 5.5 
## Link number distribution:
## 
##  3  4  5  6  7  8  9 
##  1  8 16 16  4  2  1 
## 1 least connected region:
## 4 with 3 links
## 1 most connected region:
## 20 with 9 links
## 
## Weights style: B 
## Weights constants summary:
##    n   nn       S0       S1        S2
## B 48 2304 69060.47 56072684 493877040
cars.listw = nb2listw(cars.nb.d)

moran.test(cars$price_1960,
           listw = cars.listw,
           alternative = "two.sided",
           randomisation = TRUE)
## 
##  Moran I test under randomisation
## 
## data:  cars$price_1960  
## weights: cars.listw    
## 
## Moran I statistic standard deviate = 11.178, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.813250656      -0.021276596       0.005573973

Answer: We reject the null hypothesis and conclude that there is spatial autocorrelation (z = 11.178, p < 2.2e-16), with a Moran’s I is 0.8132.

cars.lm <- lm(price_1960 ~ tax_charge,
              data = cars)

#summary(cars.lm)

moran.mc(cars.lm$residuals,
         listw = cars.listw,
         nsim = 999,
         alternative = "greater")
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  cars.lm$residuals 
## weights: cars.listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.56181, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Answer: The intercept is 1425.75 (p < 2e-16), the coefficient for tax and delivery costs is 0.6872 (p = 0.000294), and our R-squared is 0.2503. Our Monte-Carlo simulation results suggest that there is autocorrelation present in the residuals.

lmt <- lm.LMtests(cars.lm,
                  cars.listw,
                  test = c("LMerr", "LMlag"))
summary(lmt)
##  Lagrange multiplier diagnostics for spatial dependence
## data:  
## model: lm(formula = price_1960 ~ tax_charge, data = cars)
## weights: cars.listw
##  
##       statistic parameter   p.value    
## LMerr    49.596         1 1.889e-12 ***
## LMlag    62.759         1 2.331e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmt_robust <- lm.LMtests(cars.lm,
                  cars.listw,
                  test = c("RLMerr", "RLMlag"))
summary(lmt_robust)
##  Lagrange multiplier diagnostics for spatial dependence
## data:  
## model: lm(formula = price_1960 ~ tax_charge, data = cars)
## weights: cars.listw
##  
##        statistic parameter   p.value    
## RLMerr  0.087019         1 0.7680019    
## RLMlag 13.250522         1 0.0002725 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Answer: Because both were significant, I conducted a robust test. The error model was not significant, so I determined that a spatial lag model should be used.

cars.fit1 <- lagsarlm(price_1960 ~ tax_charge,
                      data = cars,
                      cars_idw)
## Warning in lagsarlm(price_1960 ~ tax_charge, data = cars, cars_idw): inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 2.68723e-17 - using numerical Hessian.
summary(cars.fit1)
## 
## Call:lagsarlm(formula = price_1960 ~ tax_charge, data = cars, listw = cars_idw)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -115.9181  -43.2632   -1.9739   42.5999  106.6735 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##               Estimate Std. Error z value  Pr(>|z|)
## (Intercept) 1445.55879   39.63073 36.4757 < 2.2e-16
## tax_charge     0.66194    0.18704  3.5389 0.0004018
## 
## Rho: -2.7364e-06, LR test value: 0.11158, p-value: 0.73835
## Approximate (numerical Hessian) standard error: 8.0986e-06
##     z-value: -0.33789, p-value: 0.73544
## Wald statistic: 0.11417, p-value: 0.73544
## 
## Log likelihood: -261.11 for lag model
## ML residual variance (sigma squared): 3107.9, (sigma: 55.749)
## Number of observations: 48 
## Number of parameters estimated: 4 
## AIC: 530.22, (AIC for lm: 528.33)
  1. The intercept is 1445.56 (p < 2.2e-16), and the coefficient for tax and delivery charges is 0.66194 (p = 0.0004018).
  2. Rho is -2.7364e-0 — very small and not statistically significant (p = 0.73835). This suggests that the autoregressive spatial component is not very strong.
  3. Without the spatial weight matrix, the AIC is slightly lower at 528.33. With the SWM, the AIC is 530.22.
moran.mc(cars.fit1$residuals,
         listw = cars.listw,
         nsim = 999)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  cars.fit1$residuals 
## weights: cars.listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.5627, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
moran.test(cars.fit1$residuals,
           listw = cars.listw,
           alternative = "two.sided",
           randomisation = TRUE)
## 
##  Moran I test under randomisation
## 
## data:  cars.fit1$residuals  
## weights: cars.listw    
## 
## Moran I statistic standard deviate = 7.8377, p-value = 4.588e-15
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.562698223      -0.021276596       0.005551494

Answer: The Moran’s I statistics were both about 0.5627. While this is an improvement from earlier (a difference of about 0.25, down from 0.8132 previously), I suspect that I haven’t adequately accounted for autocorrelation in my model.

Answer: Having not reduced my spatial autocorrelation below significance, I wouldn’t be confident stating that the tax and delivery cost were significant. I would want to revisit my choices, but also consider the automobile industry at the time. I suspect that tax and delivery was not as important as my results might suggest.

There is still a spatial trend to the residuals in my map, and when I compare it to the price, they seem to be related. My residuals are low where the price is low, and the residuals are high where the price is high.

cars$res <- cars.fit1$residuals

tm_shape(cars) +
  tm_fill("res") +
  tm_borders()
## Variable(s) "res" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

tm_shape(cars) +
  tm_fill("price_1960") +
  tm_borders()

range(cars$price_1960)
## [1] 1418 1659
sd(cars$price_1960)
## [1] 65.14084
hist(cars$price_1960, breaks = 10, col = "powderblue", border = "white")