Load Libraries

library(tidyverse)
library(tidycensus)
library(sf)
library(units)
library(scales)
library(flextable)
library(sfdep)
library(spdep)
library(adespatial) # This demo's focus
library(lmtest)

# ACS_variables <- load_variables(2022, dataset = "acs5")

Retrieve ACS Data

ACS_Data <- get_acs(
    geography = "county",
    variables = c(
        "median_monthly_housing" = "B25105_001",
        "pop_over_16" = "B23025_001",
        "median_income" = "B19013A_001"
    ),
    state = "Tennessee",
    geometry = TRUE,
    output = "wide"
)

# Calculate population density
ACS_Data$area <- set_units(st_area(ACS_Data), mi^2)
ACS_Data$area_unitless <- as.vector(ACS_Data$area)
ACS_Data$pop_dense <- ACS_Data$pop_over_16E / ACS_Data$area_unitless

Map variables

ACS_Data %>% 
    ggplot(aes(fill = median_monthly_housingE)) +
    geom_sf() +
    scale_fill_continuous(name = "housing costs",
                          breaks = breaks_pretty())

ACS_Data %>% 
    ggplot(aes(fill = pop_dense)) +
    geom_sf() +
    scale_fill_continuous(name = "population\ndensity",
                          breaks = breaks_pretty())

ACS_Data %>% 
    ggplot(aes(fill = median_incomeE)) +
    geom_sf() +
    scale_fill_continuous(name = "household\nincome",
                          breaks = breaks_pretty())

Create the linear model

formula <- median_monthly_housingE ~ pop_dense + median_incomeE

model <- lm(formula, ACS_Data)

as_flextable(model)

Estimate

Standard Error

t value

Pr(>|t|)

(Intercept)

-72.618

39.692

-1.830

0.0706

.

pop_dense

0.227

0.057

3.942

0.0002

***

median_incomeE

0.014

0.001

19.544

0.0000

***

Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05

Residual standard error: 82.02 on 92 degrees of freedom

Multiple R-squared: 0.8821, Adjusted R-squared: 0.8795

F-statistic: 344.2 on 92 and 2 DF, p-value: 0.0000

You have your linear model, and then you go to test the model assumptions: normality, homoskedasticity, and independence.

The Independence Assumption

Model residuals should not be correlated with each other. The presence of spatial autocorrelation would mean this assumption is violated. To test it, Moran’s I test is used using the moran.test function from the spdep package. First, a neighbors list, a weights list, and a listw object must be created. I created the them using the sfdep package.

## Create Weights
nb <- st_contiguity(ACS_Data, queen = TRUE)
weights <- st_weights(nb, style = "W")
my_listw <- recreate_listw(nb, weights)  

## Moran Test
moran.test(model$residuals, my_listw)
## 
##  Moran I test under randomisation
## 
## data:  model$residuals  
## weights: my_listw    
## 
## Moran I statistic standard deviate = 3.7688, p-value = 8.201e-05
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.231378679      -0.010638298       0.004123656

This Moran’s I score shows that a decent amount of spatial autocorrelation exists within the model’s residuals, meaning the independence assumption is not met. How do you fix this?

Adespatial for Eigenvector Spatial Filtering

Eigenvector Spatial Filtering creates eigenvectors (think of as rows) that maximize spatial autocorrelation, so that very little spatial autocorrelation remains in the residuals.

Several packages and functions exist that can create eigenvectors for spatial filtering, including the SpatialFiltering function from the spatialreg package. However, the mem.select function from the adespatial package is my favorite because it provides more options for eigenvector selection, such as allowing eigenvectors that control for negative autocorrelation to be created.

Eigenvector Selection

Depending on the method argument, the mem.select function uses step-wise regression to determine the eigenvectors that most effectively reduce autocorrelation (“MIR”) or improve the models R2 value (“FWD”). You can also control whether it includes all kinds of spatial autocorrelation or just positive or negative spatial autocorrelation using the MEM.autocor argument. Also the alpha argument allows you to control the significance threshold for eigenvector selection. The code below is designed to minimize the autocorrelation, since the R2 value is already high for this model.

# Create and select eigenvectors from model residuals using a weighting matrix
Evecs <- mem.select(model$residuals, listw = my_listw, MEM.autocor = "all", method = "MIR", alpha = 0.05, verbose = TRUE)

To get it into an easy to use form, a few extra steps are needed, including joining the eigenvectors back to the original data and creating a new formula

# Create a data frame for your eigenvectors
EvecsDF <- Evecs$MEM.select

# Join the original data with the Eigenvector Data
MEMdata <- cbind(ACS_Data, EvecsDF)

glimpse(MEMdata)
## Rows: 95
## Columns: 15
## $ GEOID                   <chr> "47065", "47163", "47009", "47019", "47035", "…
## $ NAME                    <chr> "Hamilton County, Tennessee", "Sullivan County…
## $ median_monthly_housingE <dbl> 1067, 742, 948, 602, 724, 1170, 627, 771, 627,…
## $ median_monthly_housingM <dbl> 14, 17, 27, 44, 29, 61, 46, 63, 47, 77, 41, 35…
## $ pop_over_16E            <dbl> 299839, 132231, 112546, 47403, 52487, 33387, 1…
## $ pop_over_16M            <dbl> 464, 255, 285, 210, 173, 136, 138, 107, 116, 1…
## $ median_incomeE          <dbl> 79190, 54620, 71520, 48575, 57569, 77585, 4810…
## $ median_incomeM          <dbl> 2127, 2224, 2269, 2881, 2582, 3758, 4451, 7303…
## $ area                    [mi^2] 575.6760 [mi^2], 429.5125 [mi^2], 566.5556 [m…
## $ area_unitless           <dbl> 575.6760, 429.5125, 566.5556, 347.5120, 684.73…
## $ pop_dense               <dbl> 520.84678, 307.86297, 198.64954, 136.40680, 76…
## $ MEM3                    <dbl> 1.23800983, -1.74873061, 0.96575416, -1.894543…
## $ MEM2                    <dbl> -0.8509133, 2.0450302, 0.3258265, 2.1235327, -…
## $ MEM5                    <dbl> 1.50250840, 0.90830472, -0.22340110, 1.0576729…
## $ geometry                <MULTIPOLYGON [°]> MULTIPOLYGON (((-85.47489 3..., M…

Once you have it rejoined with the original data, you could even map an eigenvector to see how and where it is remediating spatial autocorrelation. The use of multiple selected eigenvectors will ultimately remove the most SA.

MEMdata %>% 
    ggplot(aes(fill = MEM3)) +
    geom_sf()

MEMdata %>% 
    ggplot(aes(fill = MEM2)) +
    geom_sf()

MEMdata %>% 
    ggplot(aes(fill = MEM5)) +
    geom_sf()

Then you can create the new formula and use it in the regression model.

# Create the new formula

## Retrieve the names of the eigenvectors and chain them together for the formula
varchain <- paste(Evecs$summary$variables, collapse = " + ")

## Combine with the base formula and convert to regular language
ESFformula <- paste("median_monthly_housingE ~ pop_dense + median_incomeE", varchain, sep = " + ")
ESFforumla <- str2lang(ESFformula)

# Linear Model

ESFmodel <- lm(ESFformula, MEMdata)

as_flextable(ESFmodel)

Estimate

Standard Error

t value

Pr(>|t|)

(Intercept)

-57.831

37.332

-1.549

0.1249

pop_dense

0.236

0.052

4.519

0.0000

***

median_incomeE

0.014

0.001

20.338

0.0000

***

MEM3

-23.907

7.603

-3.145

0.0023

**

MEM2

-20.849

7.786

-2.678

0.0088

**

MEM5

-20.638

7.692

-2.683

0.0087

**

Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05

Residual standard error: 73.74 on 89 degrees of freedom

Multiple R-squared: 0.9078, Adjusted R-squared: 0.9027

F-statistic: 175.3 on 89 and 5 DF, p-value: 0.0000

One last step is to check you spatial autocorrelation again…

moran.test(ESFmodel$residuals, my_listw)
## 
##  Moran I test under randomisation
## 
## data:  ESFmodel$residuals  
## weights: my_listw    
## 
## Moran I statistic standard deviate = 0.85506, p-value = 0.1963
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.044369449      -0.010638298       0.004138621

As you can see, the spatial autocorrelation was substantially reduced and is the p-value is no longer statistically significant. The independence assumption is met. Now, you can test the other assumptions like normality and homoskedasticity.

shapiro.test(ESFmodel$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  ESFmodel$residuals
## W = 0.98374, p-value = 0.2889
bptest(ESFmodel)
## 
##  studentized Breusch-Pagan test
## 
## data:  ESFmodel
## BP = 4.8737, df = 5, p-value = 0.4315