Spatial Regression Practice

library(sf)

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(spData)

## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`

library(ggplot2)
library(ggthemes)
library(spdep)
library(spatialreg)

## Loading required package: Matrix

## 
## Attaching package: 'spatialreg'

## The following objects are masked from 'package:spdep':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption

library(GWmodel)

## Loading required package: robustbase

## Loading required package: sp

## Loading required package: Rcpp

## Welcome to GWmodel version 2.3-2.

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack

setwd("/Users/erlisakabashi/Desktop/dida 370/homework/chicago_airbnb")
airbnb <- st_read("airbnb_Chicago 2015.shp")

## Reading layer `airbnb_Chicago 2015' from data source 
##   `/Users/erlisakabashi/Desktop/dida 370/homework/chicago_airbnb/airbnb_Chicago 2015.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 77 features and 20 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -87.94011 ymin: 41.64454 xmax: -87.52414 ymax: 42.02304
## Geodetic CRS:  WGS 84

#percent unemployed over 16 years old
ggplot()+
  geom_sf(data = airbnb, aes(fill=unemployed))+
  scale_fill_steps(
    name = "Unemployment",
    low = "lightsteelblue1",
    high = "tomato1",
    n.breaks = 5,
    show.limits = T)

RESEARCH QUESTION: Does hardship, crime, or without high school rates effect unemployment rates in Chicago?

summary(lm1 <- lm(unemployed~ harship_in + num_crimes + without_hs, data = airbnb))

## 
## Call:
## lm(formula = unemployed ~ harship_in + num_crimes + without_hs, 
##     data = airbnb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2956 -1.8252  0.0558  2.1518  8.7022 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.024e+00  7.921e-01   8.867 3.31e-13 ***
## harship_in   3.741e-01  2.071e-02  18.060  < 2e-16 ***
## num_crimes   5.310e-06  6.032e-05   0.088     0.93    
## without_hs  -5.018e-01  4.958e-02 -10.121 1.53e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.995 on 73 degrees of freedom
## Multiple R-squared:  0.8486, Adjusted R-squared:  0.8424 
## F-statistic: 136.4 on 3 and 73 DF,  p-value: < 2.2e-16

airbnb_list <- airbnb %>% 
  poly2nb(st_geometry(airbnb)) %>% 
  nb2listw(zero.policy = TRUE) 

lm.morantest(lm1, airbnb_list)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = unemployed ~ harship_in + num_crimes + without_hs,
## data = airbnb)
## weights: airbnb_list
## 
## Moran I statistic standard deviate = 1.4754, p-value = 0.07005
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.068772397     -0.033672615      0.004821243

The p-value in this Moran’s test is weakly significant, meaning that there is evidence of some spatial autocorrelation in the residuals.

lm_error <- errorsarlm(unemployed ~ harship_in + num_crimes + without_hs, data = airbnb,
              listw = airbnb_list,
              zero.policy = TRUE, 
              na.action = na.omit)

summary(lm_error)

## 
## Call:errorsarlm(formula = unemployed ~ harship_in + num_crimes + without_hs, 
##     data = airbnb, listw = airbnb_list, na.action = na.omit, 
##     zero.policy = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -8.29031 -1.95061  0.12897  1.98196  8.72179 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error z value Pr(>|z|)
## (Intercept)  7.1103e+00  8.5477e-01  8.3184   <2e-16
## harship_in   3.6582e-01  2.2205e-02 16.4743   <2e-16
## num_crimes   2.0412e-05  6.1780e-05  0.3304   0.7411
## without_hs  -4.9218e-01  5.3272e-02 -9.2389   <2e-16
## 
## Lambda: 0.18754, LR test value: 1.0091, p-value: 0.31511
## Asymptotic standard error: 0.16167
##     z-value: 1.1601, p-value: 0.24602
## Wald statistic: 1.3458, p-value: 0.24602
## 
## Log likelihood: -191.1616 for error model
## ML residual variance (sigma squared): 8.3313, (sigma: 2.8864)
## Number of observations: 77 
## Number of parameters estimated: 6 
## AIC: 394.32, (AIC for lm: 393.33)

Lamda is insignificant, we dont use this model! We move on.

seResiduals <- rep(0, length(airbnb$unemployed))
#create an index based on the order of the residuals in the lm_error object
resIndex <- lm_error$residuals %>% names() %>% as.integer();
#use the index to fill in the seResiduals object with corresponding residual values
seResiduals[resIndex] <- lm_error$residuals

#perform the Moran Test
airbnb_list %>%
  moran.test(seResiduals, ., zero.policy = TRUE)

## 
##  Moran I test under randomisation
## 
## data:  seResiduals  
## weights: .    
## 
## Moran I statistic standard deviate = 0.17189, p-value = 0.4318
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##     -0.0008557562     -0.0131578947      0.0051222529

The linear error model is not significant, so now we look at the lag model.

lm_lag <- lagsarlm(unemployed~ harship_in + num_crimes + without_hs, data = airbnb,
              listw = airbnb_list,
              zero.policy = TRUE, 
              na.action = na.omit)

summary(lm_lag)

## 
## Call:lagsarlm(formula = unemployed ~ harship_in + num_crimes + without_hs, 
##     data = airbnb, listw = airbnb_list, na.action = na.omit, 
##     zero.policy = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -7.77334 -1.76733  0.40202  1.91087  8.59803 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)  4.5771e+00  1.1495e+00  3.9817 6.843e-05
## harship_in   3.2293e-01  2.6957e-02 11.9795 < 2.2e-16
## num_crimes   1.6666e-05  5.6040e-05  0.2974    0.7662
## without_hs  -4.3007e-01  5.3134e-02 -8.0941 6.661e-16
## 
## Rho: 0.22034, LR test value: 6.5236, p-value: 0.010645
## Asymptotic standard error: 0.081665
##     z-value: 2.6981, p-value: 0.0069731
## Wald statistic: 7.2799, p-value: 0.0069731
## 
## Log likelihood: -188.4044 for lag model
## ML residual variance (sigma squared): 7.7331, (sigma: 2.7809)
## Number of observations: 77 
## Number of parameters estimated: 6 
## AIC: 388.81, (AIC for lm: 393.33)
## LM test for residual autocorrelation
## test value: 0.16343, p-value: 0.68602

This model already looks promising with a p-value of 0.01

seResiduals <- rep(0, length(airbnb$unemployed))
resIndex <- lm_lag$residuals %>% names() %>% as.integer();
seResiduals[resIndex] <- lm_lag$residuals

#perform the Moran Test
airbnb_list %>%
  moran.test(seResiduals, ., zero.policy = TRUE)

## 
##  Moran I test under randomisation
## 
## data:  seResiduals  
## weights: .    
## 
## Moran I statistic standard deviate = -0.17724, p-value = 0.5703
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       -0.02583888       -0.01315789        0.00511869

The p-value for the lag model is obviously not significant, so there is no evidence here of spatial autocorrelation in the residuals. We continue.

LM <- lm.LMtests(lm1, airbnb_list, test = "all")

## Please update scripts to use lm.RStests in place of lm.LMtests

LM

## 
##  Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
##  dependence
## 
## data:  
## model: lm(formula = unemployed ~ harship_in + num_crimes + without_hs,
## data = airbnb)
## test weights: listw
## 
## RSerr = 0.8545, df = 1, p-value = 0.3553
## 
## 
##  Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
##  dependence
## 
## data:  
## model: lm(formula = unemployed ~ harship_in + num_crimes + without_hs,
## data = airbnb)
## test weights: listw
## 
## RSlag = 6.3731, df = 1, p-value = 0.01159
## 
## 
##  Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
##  dependence
## 
## data:  
## model: lm(formula = unemployed ~ harship_in + num_crimes + without_hs,
## data = airbnb)
## test weights: listw
## 
## adjRSerr = 0.18143, df = 1, p-value = 0.6701
## 
## 
##  Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
##  dependence
## 
## data:  
## model: lm(formula = unemployed ~ harship_in + num_crimes + without_hs,
## data = airbnb)
## test weights: listw
## 
## adjRSlag = 5.7, df = 1, p-value = 0.01696
## 
## 
##  Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
##  dependence
## 
## data:  
## model: lm(formula = unemployed ~ harship_in + num_crimes + without_hs,
## data = airbnb)
## test weights: listw
## 
## SARMA = 6.5545, df = 2, p-value = 0.03773

The RSlag and the adjRSlag tests are the only ones that are significant here, so this reinforced the idea that the lag model is the best fit for this data and these variables.