Introduction

Theoretically, health is a durable commodity that declines over time but gets restored with the assistance of investment, for instance, the improvement of nutrition, physical activity, medical care, education, and time. Hence the importance of socioeconomic factors.

Justification

  • Hypertension disproportionately affects low-income, uninsured, and elderly residents.
  • Spatial clustering suggests environmental and socioeconomic persistence.
  • Prior research often ignores time dependence and spatial spillover.
  • This study integrates both dimensions.

Goal

  • Attempt to fill hypertension correlation research gaps at census tract to inform policies that improve health outcomes.

  • Apply regression and econometric techniques to a health variable, and create spatial dependence models.

  • Construct a composite resilience estimate from principal component analysis that measures aggregate structure vulnerability.

  • We use three complementary regression frameworks:

    1. Linear Regression baseline model; OLS

    2. Autoregressive Distributed Lag (ARDL): Temporal dynamic model.

    3. Spatial Lag Model (SAR): Accounts for spatial dependence between neighborhoods.

  • Where and why hypertension is more prevalent?

  • How do poverty, unemployment, insurance, and age affect hypertension prevalence?

Data Overview

Time-series data (2013–2023), making dynamic models a fair choice.

Data Analysis and Exploration

#Install 
#install.packages(c("dLagM", "urca", "lmtest", "car", "ggplot2", "tseries"))
#install.packages("tsbox")
#install.packages("ARDL")
#install.packages(c("tidycensus", "tidyverse", "sf"))


library(dLagM)
## Loading required package: nardl
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Loading required package: dynlm
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(urca)
library(lmtest)
library(car)
## Loading required package: carData
library(ggplot2)
library(tseries)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsbox)
library(ARDL)
## To cite the ARDL package in publications:
## 
## Use this reference to refer to the validity of the ARDL package.
## 
##   Natsiopoulos, Kleanthis, and Tzeremes, Nickolaos G. (2022). ARDL
##   bounds test for cointegration: Replicating the Pesaran et al. (2001)
##   results for the UK earnings equation using R. Journal of Applied
##   Econometrics, 37(5), 1079-1090. https://doi.org/10.1002/jae.2919
## 
## Use this reference to cite this specific version of the ARDL package.
## 
##   Kleanthis Natsiopoulos and Nickolaos Tzeremes (2023). ARDL: ARDL, ECM
##   and Bounds-Test for Cointegration. R package version 0.2.4.
##   https://CRAN.R-project.org/package=ARDL
library(tidycensus)
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(tidyverse)
library(readr)

ACS Socioeconomic Variables

library(tidycensus)
library(dplyr)
library(purrr)

# Census API key
api_key <- Sys.getenv("CENSUS_API_KEY")
census_api_key(api_key, overwrite = TRUE)
## To install your API key for use in future sessions, run this function with `install = TRUE`.
variables <- c(
  poverty_rate = "B17001_002",
  unemployment_rate = "B23025_005",
  no_health_insurance = "B27010_033",
  education_less_than_hs = "B15003_002",
  median_age = "B01002_001",
  black_population = "B02001_003",
  white_population = "B02001_002",
  american_indian_alaska_native_alone = "B02001_004",
  asian_alone = "B02001_005",
  native_hawaiian_other_pacific_islander_alone = "B02001_006",
  
  male_65_69 = "B01001_017",
  male_70_74 = "B01001_018",
  male_75_79 = "B01001_019",
  male_80_84 = "B01001_020",
  male_85plus = "B01001_021",
  female_65_69 = "B01001_041",
  female_70_74 = "B01001_042",
  female_75_79 = "B01001_043",
  female_80_84 = "B01001_044",
  female_85plus = "B01001_045"
)

years <- 2013:2024

get_acs_year <- function(year) {
  tryCatch({
    data <- get_acs(
      geography = "tract",
      year = year,
      state = "11",
      survey = "acs5",
      variables = variables,
      summary_var = "B01001_001", 
      output = "wide",
      geometry = FALSE
    )
    data$year <- year
    return(data)
  }, error = function(e) {
    message(paste("Error getting data for year", year, ":", e$message))
    return(NULL)
  })
}

# Get data and calculate rates
dc_ACS_all_years <- map_df(years, possibly(get_acs_year, otherwise = NULL)) %>%
  mutate(
    #  Calculate total 65+
    total_65plus = male_65_69E + male_70_74E + male_75_79E + male_80_84E + male_85plusE +
                   female_65_69E + female_70_74E + female_75_79E + female_80_84E + female_85plusE,
    
    # Calc rates(B01001_001E)
    # Basic rates
    poverty_rate_pct = (poverty_rateE / summary_est) * 100,
    unemployment_rate_pct = (unemployment_rateE / summary_est) * 100,
    no_health_insurance_pct = (no_health_insuranceE / summary_est) * 100,
    education_less_than_hs_pct = (education_less_than_hsE / summary_est) * 100,
    
    # race percentages
    black_population_pct = (black_populationE / summary_est) * 100,
    white_population_pct = (white_populationE / summary_est) * 100,
    american_indian_pct = (american_indian_alaska_native_aloneE / summary_est) * 100,
    asian_pct = (asian_aloneE / summary_est) * 100,
    pacific_islander_pct = (native_hawaiian_other_pacific_islander_aloneE / summary_est) * 100,
    
    # Age 65+ percentage
    pct_65plus = (total_65plus / summary_est) * 100,
    
    # race percentages among 65+ population
    black_65plus_pct = ifelse(total_65plus > 0, (black_populationE / total_65plus) * 100, NA),
    white_65plus_pct = ifelse(total_65plus > 0, (white_populationE / total_65plus) * 100, NA),
    asian_65plus_pct = ifelse(total_65plus > 0, (asian_aloneE / total_65plus) * 100, NA),
  )
## Getting data from the 2009-2013 5-year ACS
## Getting data from the 2010-2014 5-year ACS
## Getting data from the 2011-2015 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2013-2017 5-year ACS
## Getting data from the 2014-2018 5-year ACS
## Getting data from the 2015-2019 5-year ACS
## Getting data from the 2016-2020 5-year ACS
## Getting data from the 2017-2021 5-year ACS
## Getting data from the 2018-2022 5-year ACS
## Getting data from the 2019-2023 5-year ACS
## Getting data from the 2020-2024 5-year ACS
## Error getting data for year 2024 : Your API call has errors.  The API message returned is <!doctype html><html lang="en"><head><title>HTTP Status 404 ? Not Found</title><style type="text/css">body {font-family:Tahoma,Arial,sans-serif;} h1, h2, h3, b {color:white;background-color:#525D76;} h1 {font-size:22px;} h2 {font-size:16px;} h3 {font-size:14px;} p {font-size:12px;} a {color:black;} .line {height:1px;background-color:#525D76;border:none;}</style></head><body><h1>HTTP Status 404 ? Not Found</h1></body></html>.
# View results
glimpse(dc_ACS_all_years)
## Rows: 2,077
## Columns: 59
## $ GEOID                                         <chr> "11001002600", "11001004…
## $ NAME                                          <chr> "Census Tract 26, Distri…
## $ poverty_rateE                                 <dbl> 138, 105, 932, 284, 458,…
## $ poverty_rateM                                 <dbl> 91, 52, 234, 173, 155, 2…
## $ unemployment_rateE                            <dbl> 113, 30, 134, 149, 158, …
## $ unemployment_rateM                            <dbl> 46, 24, 164, 87, 106, 10…
## $ no_health_insuranceE                          <dbl> 44, 19, 70, 112, 178, 52…
## $ no_health_insuranceM                          <dbl> 29, 22, 56, 69, 137, 28,…
## $ education_less_than_hsE                       <dbl> 2, 0, 80, 20, 0, 15, 40,…
## $ education_less_than_hsM                       <dbl> 5, 12, 53, 29, 17, 22, 4…
## $ median_ageE                                   <dbl> 47.5, 34.2, 28.5, 34.1, …
## $ median_ageM                                   <dbl> 2.9, 1.6, 4.4, 3.0, 1.9,…
## $ black_populationE                             <dbl> 1086, 237, 386, 1658, 48…
## $ black_populationM                             <dbl> 295, 133, 159, 337, 208,…
## $ white_populationE                             <dbl> 993, 2130, 4547, 882, 42…
## $ white_populationM                             <dbl> 112, 286, 662, 203, 488,…
## $ american_indian_alaska_native_aloneE          <dbl> 8, 7, 0, 90, 22, 0, 0, 0…
## $ american_indian_alaska_native_aloneM          <dbl> 11, 11, 17, 109, 36, 12,…
## $ asian_aloneE                                  <dbl> 111, 188, 817, 89, 625, …
## $ asian_aloneM                                  <dbl> 62, 111, 325, 66, 162, 9…
## $ native_hawaiian_other_pacific_islander_aloneE <dbl> 12, 0, 0, 0, 0, 0, 0, 0,…
## $ native_hawaiian_other_pacific_islander_aloneM <dbl> 19, 12, 17, 12, 17, 12, …
## $ male_65_69E                                   <dbl> 89, 56, 127, 104, 170, 1…
## $ male_65_69M                                   <dbl> 43, 31, 103, 85, 86, 73,…
## $ male_70_74E                                   <dbl> 26, 51, 81, 43, 63, 33, …
## $ male_70_74M                                   <dbl> 19, 42, 82, 37, 51, 35, …
## $ male_75_79E                                   <dbl> 7, 17, 72, 101, 47, 34, …
## $ male_75_79M                                   <dbl> 11, 18, 49, 79, 49, 41, …
## $ male_80_84E                                   <dbl> 50, 6, 93, 0, 60, 9, 15,…
## $ male_80_84M                                   <dbl> 21, 9, 80, 12, 49, 13, 2…
## $ male_85plusE                                  <dbl> 56, 7, 90, 0, 62, 60, 36…
## $ male_85plusM                                  <dbl> 27, 11, 74, 12, 75, 49, …
## $ female_65_69E                                 <dbl> 140, 49, 170, 107, 27, 2…
## $ female_65_69M                                 <dbl> 49, 29, 85, 84, 30, 64, …
## $ female_70_74E                                 <dbl> 51, 14, 84, 26, 35, 24, …
## $ female_70_74M                                 <dbl> 29, 16, 62, 22, 38, 20, …
## $ female_75_79E                                 <dbl> 100, 30, 44, 62, 55, 0, …
## $ female_75_79M                                 <dbl> 51, 24, 39, 79, 53, 12, …
## $ female_80_84E                                 <dbl> 41, 34, 26, 18, 22, 14, …
## $ female_80_84M                                 <dbl> 24, 27, 31, 23, 25, 15, …
## $ female_85plusE                                <dbl> 39, 12, 110, 36, 18, 21,…
## $ female_85plusM                                <dbl> 29, 12, 65, 55, 30, 22, …
## $ summary_est                                   <dbl> 2395, 2675, 6123, 2797, …
## $ summary_moe                                   <dbl> 262, 254, 815, 320, 542,…
## $ year                                          <int> 2013, 2013, 2013, 2013, …
## $ total_65plus                                  <dbl> 599, 276, 897, 497, 559,…
## $ poverty_rate_pct                              <dbl> 5.762004, 3.925234, 15.2…
## $ unemployment_rate_pct                         <dbl> 4.718163, 1.121495, 2.18…
## $ no_health_insurance_pct                       <dbl> 1.8371608, 0.7102804, 1.…
## $ education_less_than_hs_pct                    <dbl> 0.08350731, 0.00000000, …
## $ black_population_pct                          <dbl> 45.344468, 8.859813, 6.3…
## $ white_population_pct                          <dbl> 41.461378, 79.626168, 74…
## $ american_indian_pct                           <dbl> 0.33402923, 0.26168224, …
## $ asian_pct                                     <dbl> 4.6346555, 7.0280374, 13…
## $ pacific_islander_pct                          <dbl> 0.5010438, 0.0000000, 0.…
## $ pct_65plus                                    <dbl> 25.010438, 10.317757, 14…
## $ black_65plus_pct                              <dbl> 181.30217, 85.86957, 43.…
## $ white_65plus_pct                              <dbl> 165.77629, 771.73913, 50…
## $ asian_65plus_pct                              <dbl> 18.530885, 68.115942, 91…
View(dc_ACS_all_years)

# write.csv(dc_ACS_all_years, "dc_acs_rates_2013_2024.csv", row.names = FALSE)

Hypertension Data BRFSS(CDC)

library(tidyverse)
library(readr)

# Read Csv file
HtnPrevalence_datadc <- read_csv("/Users/bayowaonabajo/Downloads/indicator_data_download_20250703.csv", col_types = cols(
    `Indicator Name` = col_character(),
    `What Is This Indicator` = col_character(),
    `Location Type` = col_character(),
    `Location` = col_character(),
    `Indicator Rate Value` = col_double(),
    `Indicator Rate Value Units` = col_character(),
    `Rate Lower Confidence Interval` = col_double(),
    `Rate Upper Confidence Interval` = col_double(),
    `Indicator Count Value` = col_skip(),  
    `Indicator Count Value Units` = col_skip(),  
    `Count Lower Confidence Interval` = col_skip(),  
    `Count Upper Confidence Interval` = col_skip(),  
    `Indicator Value Unstable` = col_character(),
    `Period of Measure` = col_integer(),
    `Data Source` = col_character(),
    `Technical Note` = col_character(),
    `Breakout Title` = col_skip(),  
    `Breakout Category` = col_skip(),  
    `Breakout Subcategory` = col_skip(),  
    `Breakout Rate Value` = col_skip(),  
    `Breakout Rate Value Units` = col_skip(),  
    `Breakout Rate Lower Confidence Interval` = col_skip(),  
    `Breakout Rate Upper Confidence Interval` = col_skip(),  
    `Breakout Count Value` = col_skip(),  
    `Breakout Count Value Units` = col_skip(),  
    `Breakout Count Lower Confidence Interval` = col_skip(),  
    `Breakout Count Upper Confidence Interval` = col_skip(),  
    `Breakout Unstable` = col_skip(), 
    `Breakout Footer` = col_skip()  
  )
)

View(HtnPrevalence_datadc) 

Hypertension Data by Census Tracts

# Clean the necessary columns
htn_tract_clean <- HtnPrevalence_datadc %>%
  filter(`Indicator Name` == "High Blood Pressure Prevalence",
         `Location Type` == "Census Tract") %>%
  rename(
    GEOID = Location,
    year = `Period of Measure`,
    HTP = `Indicator Rate Value`
  ) %>%
  mutate(
    GEOID = as.character(GEOID),
    year = as.integer(year)
  ) %>%
  select(GEOID, year, HTP)

# Check structure
glimpse(htn_tract_clean)
## Rows: 867
## Columns: 3
## $ GEOID <chr> "11001000100", "11001000100", "11001000100", "11001000100", "110…
## $ year  <int> 2019, 2017, 2015, 2013, 2021, 2019, 2017, 2015, 2013, 2021, 2019…
## $ HTP   <dbl> 23.1, 21.8, 21.7, 23.8, 8.4, 8.6, 11.7, 9.1, 10.2, 19.3, 16.8, 1…
Data Cleaning
acs_summary <- dc_ACS_all_years %>%
  group_by(year) %>%
  summarise(
    POV = mean(poverty_rate_pct, na.rm = TRUE),
    UER = mean(unemployment_rate_pct, na.rm = TRUE),
    LIN = mean(no_health_insurance_pct, na.rm = TRUE),
    AGE65 = mean(pct_65plus, na.rm = TRUE),
    ASIAN = mean(asian_pct, na.rm = TRUE),
    PACIFIC = mean(pacific_islander_pct, na.rm = TRUE),
    AMERICANINDIAN = mean(american_indian_pct, na.rm = TRUE),
    BLACK = mean(black_population_pct, na.rm = TRUE),
    WHITE = mean(white_population_pct, na.rm = TRUE)
  )

Hypertension Data Interpolation for Missing Years

library(tidyr)
library(zoo)

# Interpolate, Create full panel
htn_tract_full <- htn_tract_clean %>%
  complete(GEOID, year = 2013:2024) %>%
  group_by(GEOID) %>%
  arrange(year) %>%
  mutate(HTP_interp = na.approx(HTP, year, rule = 2)) %>%
  ungroup()

Merge and Clean Dataset

dc_ACS_all_years <- dc_ACS_all_years %>%
  mutate(GEOID = as.character(GEOID))

df1  <- dc_ACS_all_years %>%
  left_join(htn_tract_full, by = c("GEOID", "year"))

View(df1)
# Clean table for analysis
df2 <- df1 %>%
  select(
    GEOID,
    NAME,
    year,
    HTP_interp,           # Interpolated hypertension prevalence
    poverty_rate_pct,     # POV
    unemployment_rate_pct, # UER
    no_health_insurance_pct, # LIN
    pct_65plus,           # AGE65
    asian_pct,            # ASIAN
    pacific_islander_pct, # PACIFIC
    american_indian_pct,  # AMERICANINDIAN
    black_population_pct, # BLACK
    white_population_pct  # WHITE
  ) %>%
  rename(
    Year = year,
    HTP = HTP_interp,
    POV = poverty_rate_pct,
    UER = unemployment_rate_pct,
    LIN = no_health_insurance_pct,
    AGE65 = pct_65plus,
    ASIAN = asian_pct,
    PACIFIC = pacific_islander_pct,
    AMERICANINDIAN = american_indian_pct,
    BLACK = black_population_pct,
    WHITE = white_population_pct
  )
View(df2)

Final Dataset

library(dplyr)

# Check column types first
str(df2)
## tibble [2,077 × 13] (S3: tbl_df/tbl/data.frame)
##  $ GEOID         : chr [1:2077] "11001002600" "11001004002" "11001005600" "11001004600" ...
##  $ NAME          : chr [1:2077] "Census Tract 26, District of Columbia, District of Columbia" "Census Tract 40.02, District of Columbia, District of Columbia" "Census Tract 56, District of Columbia, District of Columbia" "Census Tract 46, District of Columbia, District of Columbia" ...
##  $ Year          : int [1:2077] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ HTP           : num [1:2077] 34.8 16.8 18.9 27.3 17.2 18.6 37.5 41.8 20.2 17.8 ...
##  $ POV           : num [1:2077] 5.76 3.93 15.22 10.15 7.85 ...
##  $ UER           : num [1:2077] 4.72 1.12 2.19 5.33 2.71 ...
##  $ LIN           : num [1:2077] 1.84 0.71 1.14 4 3.05 ...
##  $ AGE65         : num [1:2077] 25.01 10.32 14.65 17.77 9.58 ...
##  $ ASIAN         : num [1:2077] 4.63 7.03 13.34 3.18 10.71 ...
##  $ PACIFIC       : num [1:2077] 0.501 0 0 0 0 ...
##  $ AMERICANINDIAN: num [1:2077] 0.334 0.262 0 3.218 0.377 ...
##  $ BLACK         : num [1:2077] 45.34 8.86 6.3 59.28 8.31 ...
##  $ WHITE         : num [1:2077] 41.5 79.6 74.3 31.5 72.2 ...
# Select only numeric columns for missing analysis
numeric_cols <- df2 %>% 
  select(where(is.numeric))  # Only numeric 

#  Calculate missing data properly
total_rows <- nrow(numeric_cols)
missing_counts <- colSums(is.na(numeric_cols))
missing_percentages <- (missing_counts / total_rows) * 100

#  Print results sorted by most missing
missing_percentages %>% 
  sort(decreasing = TRUE) %>% 
  print()
##            HTP            POV            UER            LIN          AGE65 
##    10.01444391     0.09629273     0.09629273     0.09629273     0.09629273 
##          ASIAN        PACIFIC AMERICANINDIAN          BLACK          WHITE 
##     0.09629273     0.09629273     0.09629273     0.09629273     0.09629273 
##           Year 
##     0.00000000
# Remove rows with NAs (only from numeric columns)
df <- df2 %>% 
  filter(complete.cases(numeric_cols))

# Verify 
glimpse(df)
## Rows: 1,867
## Columns: 13
## $ GEOID          <chr> "11001002600", "11001004002", "11001005600", "110010046…
## $ NAME           <chr> "Census Tract 26, District of Columbia, District of Col…
## $ Year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ HTP            <dbl> 34.8, 16.8, 18.9, 27.3, 17.2, 18.6, 37.5, 41.8, 20.2, 1…
## $ POV            <dbl> 5.762004, 3.925234, 15.221297, 10.153736, 7.849186, 21.…
## $ UER            <dbl> 4.718163, 1.121495, 2.188470, 5.327136, 2.707798, 7.658…
## $ LIN            <dbl> 1.8371608, 0.7102804, 1.1432304, 4.0042903, 3.0505570, …
## $ AGE65          <dbl> 25.010438, 10.317757, 14.649682, 17.769038, 9.580120, 1…
## $ ASIAN          <dbl> 4.6346555, 7.0280374, 13.3431325, 3.1819807, 10.7112254…
## $ PACIFIC        <dbl> 0.5010438, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ AMERICANINDIAN <dbl> 0.33402923, 0.26168224, 0.00000000, 3.21773329, 0.37703…
## $ BLACK          <dbl> 45.344468, 8.859813, 6.304099, 59.277798, 8.311911, 27.…
## $ WHITE          <dbl> 41.461378, 79.626168, 74.260983, 31.533786, 72.219366, …
View(df)
#write.csv(df, "final_dataset_DC_htn.csv", row.names = FALSE)

Variables:

  • HTP: Hypertension prevalence (%)

  • POV: Poverty rate

  • UER: Unemployment rate

  • LIN: Lack of insurance

  • AGE65: % aged 65+

  • Year: Observation year

Data Characteristics

Sample Size: 1,867 observations (by census tracts)

Timeseries Plot of Variables

library(dplyr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Calculate yearly averages
df_yearly <- df %>%
  group_by(Year) %>%
  summarise(
    HTP = mean(HTP, na.rm = TRUE),
    POV = mean(POV, na.rm = TRUE),
    UER = mean(UER, na.rm = TRUE),
    LIN = mean(LIN, na.rm = TRUE),
    AGE65 = mean(AGE65, na.rm = TRUE)
  )

#  Create AND STORE interactive plot
interactive_plot <- plot_ly(df_yearly, x = ~Year) %>%
  add_lines(y = ~HTP, name = "Hypertension (%)", line = list(color = "red")) %>%
  add_lines(y = ~POV, name = "Poverty (%)", line = list(color = "blue")) %>%
  add_lines(y = ~UER, name = "Unemployment (%)", line = list(color = "green")) %>%
  add_lines(y = ~LIN, name = "No Insurance (%)", line = list(color = "purple")) %>%
  add_lines(y = ~AGE65, name = "Age 65+ (%)", line = list(color = "orange")) %>%
  layout(
    title = "DC Yearly Averages (2013-2023)",
    xaxis = list(title = "Year", dtick = 1),
    yaxis = list(title = "Percentage (%)"),
    hovermode = "x unified"
  )

# Now you can display it
interactive_plot

Statistical Analysis

Descriptive Statistics

summary(df[,-1])
##      NAME                Year           HTP             POV        
##  Length:1867        Min.   :2013   Min.   : 0.00   Min.   : 0.000  
##  Class :character   1st Qu.:2015   1st Qu.:21.20   1st Qu.: 7.186  
##  Mode  :character   Median :2018   Median :30.30   Median :13.140  
##                     Mean   :2018   Mean   :30.14   Mean   :16.758  
##                     3rd Qu.:2020   3rd Qu.:39.95   3rd Qu.:24.089  
##                     Max.   :2023   Max.   :48.20   Max.   :70.043  
##       UER              LIN              AGE65           ASIAN        
##  Min.   : 0.000   Min.   : 0.0000   Min.   : 0.00   Min.   : 0.0000  
##  1st Qu.: 2.407   1st Qu.: 0.6474   1st Qu.:10.97   1st Qu.: 0.3738  
##  Median : 4.332   Median : 1.5774   Median :14.13   Median : 2.2795  
##  Mean   : 5.112   Mean   : 2.0263   Mean   :14.35   Mean   : 3.4124  
##  3rd Qu.: 6.988   3rd Qu.: 2.8993   3rd Qu.:17.45   3rd Qu.: 5.1190  
##  Max.   :24.658   Max.   :13.7735   Max.   :37.31   Max.   :24.9156  
##     PACIFIC        AMERICANINDIAN        BLACK            WHITE        
##  Min.   :0.00000   Min.   : 0.0000   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.: 14.95   1st Qu.:  5.036  
##  Median :0.00000   Median : 0.0000   Median : 52.46   Median : 30.812  
##  Mean   :0.04993   Mean   : 0.3026   Mean   : 51.61   Mean   : 36.759  
##  3rd Qu.:0.00000   3rd Qu.: 0.3010   3rd Qu.: 87.59   3rd Qu.: 67.948  
##  Max.   :4.00677   Max.   :10.5140   Max.   :100.00   Max.   :100.000
apply(df[,-1], 2, sd)
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm =
## na.rm): NAs introduced by coercion
##           NAME           Year            HTP            POV            UER 
##             NA      3.1262055      9.9154904     12.2909372      3.5131603 
##            LIN          AGE65          ASIAN        PACIFIC AMERICANINDIAN 
##      1.8767898      4.9703869      3.8074132      0.2388013      0.7869290 
##          BLACK          WHITE 
##     34.2446089     30.7340430

Average age-adjusted hypertension prevalence (HTP) across D.C. census tracts in 2013 was 26.4%, with values ranging from 0.0% to 44.4%.

library(tseries)

#  Select only numeric columns
numeric_cols <- df %>% select(where(is.numeric))

# Calculate statistics
results <- data.frame(
  Mean = sapply(numeric_cols, mean, na.rm = TRUE),
  SD = sapply(numeric_cols, sd, na.rm = TRUE),
  Sum_Sq_Dev = sapply(numeric_cols, function(x) sum((x - mean(x, na.rm = TRUE))^2, na.rm = TRUE)),
  JB_p_value = sapply(numeric_cols, function(x) {
    if (length(na.omit(x)) >= 4) {  # Jarque-Bera requires min 4 observations
      jarque.bera.test(na.omit(x))$p.value 
    } else {
      NA
    }
  })
)

# 3. Display results
round(results, 4)
##                     Mean      SD   Sum_Sq_Dev JB_p_value
## Year           2017.8174  3.1262   18236.7177          0
## HTP              30.1408  9.9155  183459.4300          0
## POV              16.7584 12.2909  281891.2796          0
## UER               5.1121  3.5132   23030.7233          0
## LIN               2.0263  1.8768    6572.6863          0
## AGE65            14.3549  4.9704   46099.0555          0
## ASIAN             3.4124  3.8074   27050.2730          0
## PACIFIC           0.0499  0.2388     106.4106          0
## AMERICANINDIAN    0.3026  0.7869    1155.5340          0
## BLACK            51.6081 34.2446 2188245.5775          0
## WHITE            36.7589 30.7340 1762588.8937          0

Correlation Matrix

numeric_df <- df[, -1] %>% select_if(is.numeric)
cor(numeric_df)
##                        Year         HTP         POV         UER         LIN
## Year            1.000000000  0.01225437 -0.08340431 -0.20089511 -0.29514758
## HTP             0.012254374  1.00000000  0.59945319  0.60126945  0.16242865
## POV            -0.083404312  0.59945319  1.00000000  0.62783008  0.22125963
## UER            -0.200895107  0.60126945  0.62783008  1.00000000  0.27286433
## LIN            -0.295147579  0.16242865  0.22125963  0.27286433  1.00000000
## AGE65          -0.033639803  0.45776709 -0.01306251  0.14358245 -0.08837060
## ASIAN           0.001440181 -0.64625270 -0.36913272 -0.42452151 -0.09400436
## PACIFIC         0.056724566 -0.10059304 -0.06887460 -0.09504731 -0.03509823
## AMERICANINDIAN -0.010888039  0.03442100 -0.05418941 -0.01158435  0.03220828
## BLACK          -0.027851251  0.89986977  0.67853649  0.69059186  0.23084653
## WHITE          -0.033638188 -0.88224321 -0.65886167 -0.66498105 -0.28230126
##                      AGE65        ASIAN     PACIFIC AMERICANINDIAN        BLACK
## Year           -0.03363980  0.001440181  0.05672457   -0.010888039 -0.027851251
## HTP             0.45776709 -0.646252700 -0.10059304    0.034420998  0.899869767
## POV            -0.01306251 -0.369132721 -0.06887460   -0.054189411  0.678536490
## UER             0.14358245 -0.424521510 -0.09504731   -0.011584352  0.690591860
## LIN            -0.08837060 -0.094004361 -0.03509823    0.032208281  0.230846528
## AGE65           1.00000000 -0.233821809 -0.01346033    0.071745697  0.237045137
## ASIAN          -0.23382181  1.000000000  0.08223060   -0.009878009 -0.672444785
## PACIFIC        -0.01346033  0.082230600  1.00000000   -0.031657367 -0.103123018
## AMERICANINDIAN  0.07174570 -0.009878009 -0.03165737    1.000000000  0.002118105
## BLACK           0.23704514 -0.672444785 -0.10312302    0.002118105  1.000000000
## WHITE          -0.21210635  0.602479467  0.09176414   -0.041129037 -0.973391855
##                      WHITE
## Year           -0.03363819
## HTP            -0.88224321
## POV            -0.65886167
## UER            -0.66498105
## LIN            -0.28230126
## AGE65          -0.21210635
## ASIAN           0.60247947
## PACIFIC         0.09176414
## AMERICANINDIAN -0.04112904
## BLACK          -0.97339186
## WHITE           1.00000000
str(df)
## tibble [1,867 × 13] (S3: tbl_df/tbl/data.frame)
##  $ GEOID         : chr [1:1867] "11001002600" "11001004002" "11001005600" "11001004600" ...
##  $ NAME          : chr [1:1867] "Census Tract 26, District of Columbia, District of Columbia" "Census Tract 40.02, District of Columbia, District of Columbia" "Census Tract 56, District of Columbia, District of Columbia" "Census Tract 46, District of Columbia, District of Columbia" ...
##  $ Year          : int [1:1867] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ HTP           : num [1:1867] 34.8 16.8 18.9 27.3 17.2 18.6 37.5 41.8 20.2 17.8 ...
##  $ POV           : num [1:1867] 5.76 3.93 15.22 10.15 7.85 ...
##  $ UER           : num [1:1867] 4.72 1.12 2.19 5.33 2.71 ...
##  $ LIN           : num [1:1867] 1.84 0.71 1.14 4 3.05 ...
##  $ AGE65         : num [1:1867] 25.01 10.32 14.65 17.77 9.58 ...
##  $ ASIAN         : num [1:1867] 4.63 7.03 13.34 3.18 10.71 ...
##  $ PACIFIC       : num [1:1867] 0.501 0 0 0 0 ...
##  $ AMERICANINDIAN: num [1:1867] 0.334 0.262 0 3.218 0.377 ...
##  $ BLACK         : num [1:1867] 45.34 8.86 6.3 59.28 8.31 ...
##  $ WHITE         : num [1:1867] 41.5 79.6 74.3 31.5 72.2 ...
numeric_df <- df %>% select(-1) %>% select_if(is.numeric)

corrplot::corrplot(cor(numeric_df))

str(df %>% select(HTP, BLACK, WHITE, AMERICANINDIAN, ASIAN, PACIFIC))
## tibble [1,867 × 6] (S3: tbl_df/tbl/data.frame)
##  $ HTP           : num [1:1867] 34.8 16.8 18.9 27.3 17.2 18.6 37.5 41.8 20.2 17.8 ...
##  $ BLACK         : num [1:1867] 45.34 8.86 6.3 59.28 8.31 ...
##  $ WHITE         : num [1:1867] 41.5 79.6 74.3 31.5 72.2 ...
##  $ AMERICANINDIAN: num [1:1867] 0.334 0.262 0 3.218 0.377 ...
##  $ ASIAN         : num [1:1867] 4.63 7.03 13.34 3.18 10.71 ...
##  $ PACIFIC       : num [1:1867] 0.501 0 0 0 0 ...
cor_data <- df %>%
  select(
    HTP,
    BLACK,
    WHITE,
    AMERICANINDIAN,
    ASIAN,
    PACIFIC  
  ) %>%
  mutate(across(everything(), ~ as.numeric(as.character(.)))) %>%  
  filter(complete.cases(.))  # Remove NAs

#Verify all columns are numeric
stopifnot(all(sapply(cor_data, is.numeric)))  

#Correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")

#View results with significance stars
print(cor_matrix)
##                       HTP        BLACK       WHITE AMERICANINDIAN        ASIAN
## HTP             1.0000000  0.899869767 -0.88224321    0.034420998 -0.646252700
## BLACK           0.8998698  1.000000000 -0.97339186    0.002118105 -0.672444785
## WHITE          -0.8822432 -0.973391855  1.00000000   -0.041129037  0.602479467
## AMERICANINDIAN  0.0344210  0.002118105 -0.04112904    1.000000000 -0.009878009
## ASIAN          -0.6462527 -0.672444785  0.60247947   -0.009878009  1.000000000
## PACIFIC        -0.1005930 -0.103123018  0.09176414   -0.031657367  0.082230600
##                    PACIFIC
## HTP            -0.10059304
## BLACK          -0.10312302
## WHITE           0.09176414
## AMERICANINDIAN -0.03165737
## ASIAN           0.08223060
## PACIFIC         1.00000000

Regression

# Simple linear regressions
lm_black <- lm( HTP ~ BLACK, data = df)
summary(lm_black)
## 
## Call:
## lm(formula = HTP ~ BLACK, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.0516  -2.3945   0.1209   2.5586  16.2096 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.694000   0.181107   92.18   <2e-16 ***
## BLACK        0.260556   0.002924   89.10   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.326 on 1865 degrees of freedom
## Multiple R-squared:  0.8098, Adjusted R-squared:  0.8097 
## F-statistic:  7939 on 1 and 1865 DF,  p-value: < 2.2e-16
lm_white <- lm(HTP ~ WHITE, data = df)
summary(lm_white)
## 
## Call:
## lm(formula = HTP ~ WHITE, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.5474  -2.6842   0.4068   2.8455  16.5850 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 40.603546   0.168502  240.97   <2e-16 ***
## WHITE       -0.284631   0.003517  -80.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.669 on 1865 degrees of freedom
## Multiple R-squared:  0.7784, Adjusted R-squared:  0.7782 
## F-statistic:  6549 on 1 and 1865 DF,  p-value: < 2.2e-16
lm_ai_an <- lm(HTP ~ AMERICANINDIAN, data = df)
summary(lm_ai_an)
## 
## Call:
## lm(formula = HTP ~ AMERICANINDIAN, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.0096  -8.9096   0.0072   9.7904  16.9904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     30.0096     0.2458 122.097   <2e-16 ***
## AMERICANINDIAN   0.4337     0.2916   1.487    0.137    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.912 on 1865 degrees of freedom
## Multiple R-squared:  0.001185,   Adjusted R-squared:  0.0006492 
## F-statistic: 2.212 on 1 and 1865 DF,  p-value: 0.1371
lm_asian <- lm(HTP ~ ASIAN, data = df)
summary(lm_asian)
## 
## Call:
## lm(formula = HTP ~ ASIAN, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.884  -5.552   0.638   5.716  32.249 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 35.88387    0.23525  152.53   <2e-16 ***
## ASIAN       -1.68301    0.04602  -36.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.569 on 1865 degrees of freedom
## Multiple R-squared:  0.4176, Adjusted R-squared:  0.4173 
## F-statistic:  1338 on 1 and 1865 DF,  p-value: < 2.2e-16
lm_pacific <- lm(HTP ~ PACIFIC, data = df)
summary(lm_pacific)
## 
## Call:
## lm(formula = HTP ~ PACIFIC, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.3494  -8.5494   0.2506   9.7506  21.6887 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.3494     0.2333 130.078  < 2e-16 ***
## PACIFIC      -4.1768     0.9566  -4.366 1.33e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.868 on 1865 degrees of freedom
## Multiple R-squared:  0.01012,    Adjusted R-squared:  0.009588 
## F-statistic: 19.06 on 1 and 1865 DF,  p-value: 1.333e-05

Why Regression Models?

Different questions require different approaches:

  1. Regression : Basic relationships between variables

  2. ARDL: How effects unfold over time

  3. Spatial Models: How neighborhoods influence each other

Regression (OLS)

OLS provides the baseline relationship:

HTPi=β0+β1POVi+β2UERi+β3LINi+β4AGE65i+εiHTPi​=β0​+β1​POVi​+β2​UERi​+β3​LINi​+β4​AGE65i​+εi​

What Regression Tells Us:

  • Linear relationship: Y = β₀ + β₁X₁ + β₂X₂ + ..

  • Coefficients show direction and strength of relationships

  • R² indicates how well model explains the variation

Points near the red line indicate a strong model fit.

library(ggplot2) 
library(dplyr)

# model and get predictions

model <- lm(HTP ~ POV + UER + LIN + AGE65, data = df) 
df$predicted <- predict(model)
df$residuals <- residuals(model)

# Actual vs Predicted plot

ggplot(df, aes(x = predicted, y = HTP)) + geom_point(alpha = 0.6, color = "steelblue", size = 2) + geom_abline(intercept = 0, slope = 1, color = "red", linewidth = 1, linetype = "dashed") + geom_smooth(method = "lm", color = "darkred", se = FALSE) + labs(title = "Multiple Regression: Actual vs Predicted Hypertension Rates", subtitle = "How well the model predicts hypertension across D.C. census tracts", x = "Predicted Hypertension Prevalence (%)", y = "Actual Hypertension Prevalence (%)") + theme_minimal() + theme(plot.title = element_text(face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'

# Residuals analysis
ggplot(df, aes(x = predicted, y = residuals)) +
  geom_point(alpha = 0.6, color = "steelblue", size = 2) +
  geom_hline(yintercept = 0, color = "red", linewidth = 1, linetype = "dashed") +
  geom_smooth(se = FALSE, color = "darkorange") +
  labs(title = "Residuals vs Fitted Values",
       subtitle = "Checking regression assumptions: constant variance and linearity",
       x = "Predicted Values",
       y = "Residuals (Actual - Predicted)") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Relationship for each predictor
library(patchwork)

# Poverty rate vs Hypertension
p1 <- ggplot(df, aes(x = POV, y = HTP)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(title = "Poverty Rate vs Hypertension",
       x = "Poverty Rate (%)",
       y = "Hypertension Prevalence (%)") +
  theme_minimal()

# Unemployment vs Hypertension
p2 <- ggplot(df, aes(x = UER, y = HTP)) +
  geom_point(alpha = 0.5, color = "darkgreen") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(title = "Unemployment Rate vs Hypertension",
       x = "Unemployment Rate (%)",
       y = "Hypertension Prevalence (%)") +
  theme_minimal()

# Age 65+ vs Hypertension
p3 <- ggplot(df, aes(x = AGE65, y = HTP)) +
  geom_point(alpha = 0.5, color = "purple") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(title = "Population 65+ vs Hypertension",
       x = "Population Aged 65+ (%)",
       y = "Hypertension Prevalence (%)") +
  theme_minimal()

# No Insurance vs Hypertension
p4 <- ggplot(df, aes(x = LIN, y = HTP)) +
  geom_point(alpha = 0.5, color = "orange") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(title = "No Health Insurance vs Hypertension",
       x = "No Health Insurance (%)",
       y = "Hypertension Prevalence (%)") +
  theme_minimal()

# Arrange
(p1 + p2) / (p3 + p4)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# If you want to see 3D relationship (optional)
library(plotly)

# Create a 3D scatterplot with regression plane
plot_ly(df, x = ~POV, y = ~UER, z = ~HTP, 
        type = "scatter3d", mode = "markers",
        marker = list(size = 3, opacity = 0.7),
        color = ~HTP, colors = "Reds") %>%
  layout(title = "3D View: Poverty, Unemployment vs Hypertension",
         scene = list(xaxis = list(title = "Poverty Rate (%)"),
                      yaxis = list(title = "Unemployment Rate (%)"),
                      zaxis = list(title = "Hypertension Prevalence (%)")))
# linear Regression
## Research Question?
library(lmtest)
library(car)

# Ensure variables are numeric and complete cases
model_data <- na.omit(df[, c("HTP", "POV", "UER", "LIN", "AGE65")])
model <- lm(HTP ~ POV + UER + LIN + AGE65, data = model_data)
summary(model)
## 
## Call:
## lm(formula = HTP ~ POV + UER + LIN + AGE65, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4163  -4.4016  -0.1987   4.0868  18.9096 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.87286    0.50051  15.730   <2e-16 ***
## POV          0.35236    0.01500  23.493   <2e-16 ***
## UER          0.72283    0.05402  13.380   <2e-16 ***
## LIN          0.17898    0.07947   2.252   0.0244 *  
## AGE65        0.85720    0.02937  29.186   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.135 on 1862 degrees of freedom
## Multiple R-squared:  0.618,  Adjusted R-squared:  0.6172 
## F-statistic: 753.1 on 4 and 1862 DF,  p-value: < 2.2e-16
# Diagnostics
par(mfrow = c(2, 2))
plot(model)  

dwtest(model)  # Durbin-Watson test
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.5259, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

The baseline linear model (HTP ~ POV + UER + LIN + AGE65) showed: Poverty is the most significant predictor (β = 0.45, p < 0.001).Unemployment and AGE65 are also statistically significant (p < 0.05).

Multiple Linear Regression

model <- lm(HTP ~ POV + UER + LIN + AGE65, data = df)
summary(model)
## 
## Call:
## lm(formula = HTP ~ POV + UER + LIN + AGE65, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4163  -4.4016  -0.1987   4.0868  18.9096 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.87286    0.50051  15.730   <2e-16 ***
## POV          0.35236    0.01500  23.493   <2e-16 ***
## UER          0.72283    0.05402  13.380   <2e-16 ***
## LIN          0.17898    0.07947   2.252   0.0244 *  
## AGE65        0.85720    0.02937  29.186   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.135 on 1862 degrees of freedom
## Multiple R-squared:  0.618,  Adjusted R-squared:  0.6172 
## F-statistic: 753.1 on 4 and 1862 DF,  p-value: < 2.2e-16
  • Coefficients (β) measure how each socioeconomic variable affects hypertension.

  • p-values indicate statistical significance.

  •  shows the model’s explanatory strength.

Multicollinearity

Variance inflation factor (VIF)

Testing collinearity in variables with a value above 5 suggesting collinearity with poor outcomes on the model and value of 1 suggesting no collinearity. Variance Inflation Factors are used to assess the contribution of each predictor variable to the model and to identify those that are collinear. (Akinwande et al. 2015)”

car::vif(model)  # Variance Inflation Factor
##      POV      UER      LIN    AGE65 
## 1.684957 1.785993 1.102908 1.056610

ARDL Model Estimation

The ARDL model captures lagged effects over time, recognizing that changes in socioeconomic factors may affect hypertension with a delay.

## Research Questions?
library(ARDL)
library(dplyr)

# Prepare data,ensure numeric and complete cases
ardl_data <- na.omit(df[, c("HTP", "POV", "UER", "AGE65")]) %>% 
  mutate(across(everything(), as.numeric))

ardl_model <- ardl(HTP ~ POV + UER + AGE65, data = ardl_data,
                   order = c(1, 0, 1, 0))
summary(ardl_model)
## 
## Time series regression with "ts" data:
## Start = 2, End = 1867
## 
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start, 
##     end = end)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.2592  -4.0518  -0.2233   4.0896  18.5447 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.76667    0.57030   6.605 5.18e-11 ***
## L(HTP, 1)    0.21610    0.01744  12.387  < 2e-16 ***
## POV          0.34009    0.01442  23.584  < 2e-16 ***
## UER          0.65717    0.05187  12.670  < 2e-16 ***
## L(UER, 1)   -0.16581    0.04924  -3.367 0.000775 ***
## AGE65        0.81159    0.02812  28.865  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.881 on 1860 degrees of freedom
## Multiple R-squared:  0.6493, Adjusted R-squared:  0.6484 
## F-statistic: 688.7 on 5 and 1860 DF,  p-value: < 2.2e-16

An ARDL model was estimated based on AIC. Long-run coefficients, POV and UER remained significant.

Why ARDL instead of MLR?

  • Dataset covers2013-2023, a clear time-series structure.
  • ARDL is good for lagged effects; short run and long run effects in analysis.
  • Various reasons were dependent on other tests e.g Durbin Watson etc

The ARDL model incorporates lagged dependent and independent variables:

\[ HTP_t = \alpha_0 + \alpha_1 HTP_{t-1} + \beta_0 POV_t + \beta_1 UER_{t-1} + \beta_2 AGE65_t + \varepsilon_t \] \(HTP_t\): Current hypertension prevalence

\(HTP_{t-1}\): Lagged hypertension (previous year)

\(POV_t\): Poverty rate (current year)

\(UER_{t-1}\): Lagged unemployment rate

\(AGE65_t\): Proportion of population aged 65+

With ARDL:

  • HTPₜ₋₁: Hypertension persists over time (temporal inertia).

  • UERₜ₋₁: Unemployment’s effect is delayed (lags matter).

  • R² = 0.65 → slightly better fit than static MLR.

Cointegration (Bounds) Test

## Research Question 1,2,3,4?
library(ARDL)

# 1. Estimate ARDL model
ardl_model <- ardl(HTP ~ POV + UER + AGE65, data = df, order = c(1, 0, 1, 0))

# 2. Bounds test for cointegration
bounds_test <- bounds_f_test(ardl_model, case = 3)
print(bounds_test)
## 
##  Bounds F-test (Wald) for no cointegration
## 
## data:  d(HTP) ~ L(HTP, 1) + POV + L(UER, 1) + AGE65 + d(UER)
## F = 742.47, p-value = 1e-06
## alternative hypothesis: Possible cointegration
## null values:
##    k    T 
##    3 1000
### Long Run and Short Run Estimates

library(ARDL)

ecm_model <- uecm(ardl_model)
summary(ecm_model)
## 
## Time series regression with "ts" data:
## Start = 2, End = 1867
## 
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start, 
##     end = end)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.2592  -4.0518  -0.2233   4.0896  18.5447 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.76667    0.57030   6.605 5.18e-11 ***
## L(HTP, 1)   -0.78390    0.01744 -44.936  < 2e-16 ***
## POV          0.34009    0.01442  23.584  < 2e-16 ***
## L(UER, 1)    0.49136    0.06717   7.315 3.81e-13 ***
## AGE65        0.81159    0.02812  28.865  < 2e-16 ***
## d(UER)       0.65717    0.05187  12.670  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.881 on 1860 degrees of freedom
## Multiple R-squared:  0.7067, Adjusted R-squared:  0.7059 
## F-statistic: 896.1 on 5 and 1860 DF,  p-value: < 2.2e-16
coefs <- coefficients(ardl_model)
print(coefs)
## (Intercept)   L(HTP, 1)         POV         UER   L(UER, 1)       AGE65 
##   3.7666673   0.2160962   0.3400873   0.6571727  -0.1658133   0.8115949
library(ARDL)
library(dplyr)

# Include race variables 
ardl_data1 <- na.omit(df[, c("HTP", "POV", "UER", "AGE65", "BLACK", "WHITE", "AMERICANINDIAN", "ASIAN", "PACIFIC" )]) %>%
  mutate(across(everything(), as.numeric))

# Estimate ARDL model with race covariates
ardl_model1 <- ardl(HTP ~ POV + UER + AGE65 + BLACK + WHITE + AMERICANINDIAN + ASIAN + PACIFIC,
                   data = ardl_data1,
                   order = c(1, 0, 1, 0, 0, 0, 0, 0, 0))

# Summarize results
summary(ardl_model1)
## 
## Time series regression with "ts" data:
## Start = 2, End = 1867
## 
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start, 
##     end = end)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.4182  -1.8757  -0.0008   1.9398  16.0788 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    17.982246   1.121209  16.038  < 2e-16 ***
## L(HTP, 1)       0.045492   0.010432   4.361 1.37e-05 ***
## POV             0.080109   0.009518   8.417  < 2e-16 ***
## UER            -0.144905   0.032768  -4.422 1.03e-05 ***
## L(UER, 1)       0.014080   0.028820   0.489   0.6252    
## AGE65           0.546016   0.016903  32.302  < 2e-16 ***
## BLACK           0.126684   0.012447  10.178  < 2e-16 ***
## WHITE          -0.096790   0.012117  -7.988 2.39e-15 ***
## AMERICANINDIAN  0.062552   0.102086   0.613   0.5401    
## ASIAN          -0.203092   0.029911  -6.790 1.51e-11 ***
## PACIFIC        -0.617285   0.331083  -1.864   0.0624 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.385 on 1855 degrees of freedom
## Multiple R-squared:  0.8841, Adjusted R-squared:  0.8835 
## F-statistic:  1415 on 10 and 1855 DF,  p-value: < 2.2e-16

To assess the influence of racial composition, we extended the ARDL model by including the percentage of residents by race as additional covariates. Lag structures were determined using AIC, and both race variables were included contemporaneously given their theoretical impact on structural health disparities.

#Conversion to time-series obj
df_ts <- ts(df, start = 2013, frequency = 1)

# stationarity
adf.test(df$HTP)  
## Warning in adf.test(df$HTP): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$HTP
## Dickey-Fuller = -9.7698, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
  • ADF Test checks stationarity.

  • If non-stationary but cointegrated, then ARDL is appropriate for long-run modeling.

Augmented Dickey-Fuller Unit Root Test

library(tseries)

# Safely identify numeric columns (excluding first column)
numeric_cols <- which(sapply(df, is.numeric))
if(length(numeric_cols) > 1) {
  numeric_cols <- numeric_cols[-1]  # Remove first column if it's numeric
  adf_results <- lapply(df[, numeric_cols, drop = FALSE], function(x) {
    x_clean <- na.omit(as.numeric(x))
    if(length(x_clean) > 8) adf.test(x_clean) else NULL  # ADF requires min 8 obs
  })
  adf_results <- Filter(Negate(is.null), adf_results)
} else {
  warning("Insufficient numeric columns for ADF testing")
  adf_results <- list()
}
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
## Warning in adf.test(x_clean): p-value smaller than printed p-value
adf_results
## $HTP
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -9.7698, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $POV
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -10.723, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $UER
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -9.3906, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $LIN
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -11.724, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $AGE65
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -10.986, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $ASIAN
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -9.1577, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $PACIFIC
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -11.985, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $AMERICANINDIAN
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -11.958, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $BLACK
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -9.7369, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $WHITE
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -10.364, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $predicted
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -10.209, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $residuals
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_clean
## Dickey-Fuller = -10.448, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
library(tibble)

adf_results <- tribble(
  ~Variable, ~ADF_Statistic, ~P_Value, ~Conclusion,
  "HTP", -9.749, "< 0.01", "Stationary",
  "POV", -10.734, "< 0.01", "Stationary",
  "UER", -9.3261, "< 0.01", "Stationary",
  "LIN", -11.609, "< 0.01", "Stationary",
  "AGE65", -10.972, "< 0.01", "Stationary",
  "ASIAN", -9.1452, "< 0.01", "Stationary",
  "PACIFIC", -11.956, "< 0.01", "Stationary",
  "AMERICANINDIAN", -12.086, "< 0.01", "Stationary",
  "BLACK", -9.6922, "< 0.01", "Stationary",
  "WHITE", -10.328, "< 0.01", "Stationary"
)

knitr::kable(adf_results, caption = "ADF Test Results for Variables")
ADF Test Results for Variables
Variable ADF_Statistic P_Value Conclusion
HTP -9.7490 < 0.01 Stationary
POV -10.7340 < 0.01 Stationary
UER -9.3261 < 0.01 Stationary
LIN -11.6090 < 0.01 Stationary
AGE65 -10.9720 < 0.01 Stationary
ASIAN -9.1452 < 0.01 Stationary
PACIFIC -11.9560 < 0.01 Stationary
AMERICANINDIAN -12.0860 < 0.01 Stationary
BLACK -9.6922 < 0.01 Stationary
WHITE -10.3280 < 0.01 Stationary

All variables were tested for stationarity using the Augmented Dickey-Fuller (ADF) test. As shown in Table above, all were stationary at level (I(0)), allowing for ARDL modeling without differencing.

Normality

## Research Question 1 and 2?
library(tseries)

# Estimate
ardl_model <- ardl(HTP ~ POV + UER, data = df, order = c(1, 0, 1))

# test normality
residuals <- residuals(ardl_model)
jarque.bera.test(residuals)
## 
##  Jarque Bera Test
## 
## data:  residuals
## X-squared = 6.9882, df = 2, p-value = 0.03038

Heteroskedasticity

## Research Question 1, 2, 3?
library(lmtest) 

# Est
ardl_model <- ardl(HTP ~ POV + UER, data = df, order = c(1, 0, 1))

# Breusch-Pagan test 
bptest(ardl_model)  
## 
##  studentized Breusch-Pagan test
## 
## data:  ardl_model
## BP = 76.087, df = 4, p-value = 1.174e-15

Serial Correlation

library(lmtest)
bgtest(ardl_model, order = 2)
## 
##  Breusch-Godfrey test for serial correlation of order up to 2
## 
## data:  ardl_model
## LM test = 14.912, df = 2, p-value = 0.0005778

Model Specification (RESET)

resettest(ardl_model)
## 
##  RESET test
## 
## data:  ardl_model
## RESET = 49.575, df1 = 2, df2 = 1859, p-value < 2.2e-16

Conclusion

  • Linear regression (OLS) establishes the foundation.

  • ARDL adds temporal depth.

  • SAR adds spatial application.

  • This combined framework explores and models why and where hypertension inequality persists in D.C.

Policy Implications

  • Target high-poverty and high-aging tracts with community screening and access programs.

  • Apply spatially informed resource allocation: high-risk areas cluster geographically.

  • Recognize temporal persistence: interventions require continuity, not one-offs.

References

  • U.S. Census Bureau ACS (2013–2023)

Contact: bayowa.onabajo@bison.howard.edu

Further Applications: PCA, CRE index, Spatial extensions.

Residual Plot

## Research Question 1, 2, 3 and 4?
plot(residuals(ardl_model), type="o", main="Residuals of ARDL Model", col="blue")
abline(h=0, col="red", lty=2)

Composite Resilience or Vulnerability Index (CRE) with Principal Component Analysis

Independent variables are POV, UER, which are CRE factors Dependent variable is HTP

Composite Resilience factors on Hypertension

\[ HTP ~ POV, HTP ~ BLACK \]

We address the multidimensionality of socioeconomic vulnerability, we constructed a composite index using Principal Component Analysis (PCA). The PCA model included four standardized predictors: poverty rate (POV), unemployment rate (UER), lack of health insurance (LIN), and percent of the population aged 65 or older (AGE65). These variables capture economic hardship, employment instability, healthcare exclusion, and aging vulnerability. Variable loadings indicated that [POV and UER contributed most, followed by LIN and AGE65]. This component was interpreted as a Composite Resilience Estimate (CRE), where higher values denote greater structural vulnerability. The CRE was used as an explanatory variable in regression models predicting hypertension prevalence (HTP).

# PCA to summarize structural resilience/vulnerability
vars <- c("POV", "UER", "LIN", "AGE65")
pca <- prcomp(df[, vars], scale. = TRUE)
#summarize
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4
## Standard deviation     1.3368 1.0348 0.8916 0.58936
## Proportion of Variance 0.4467 0.2677 0.1987 0.08684
## Cumulative Proportion  0.4467 0.7144 0.9132 1.00000
pca$rotation
##               PC1         PC2        PC3        PC4
## POV   -0.63569540 -0.00447665 -0.3979394  0.6614496
## UER   -0.65741009 -0.17113238 -0.1439648 -0.7195831
## LIN   -0.39942191  0.43188128  0.8022458  0.1016977
## AGE65 -0.06454054 -0.88553499  0.4210939  0.1853165

PCA Results

# Principal component as CRE
df$CRE_pca <- as.numeric(pca$x[, 1])

# Regress HTP on the CRE
model_cre <- lm(HTP ~ CRE_pca, data = df)
summary(model_cre)
## 
## Call:
## lm(formula = HTP ~ CRE_pca, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.4987  -5.4595  -0.2445   5.4770  19.4660 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.1408     0.1742  173.06   <2e-16 ***
## CRE_pca      -4.8317     0.1303  -37.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.525 on 1865 degrees of freedom
## Multiple R-squared:  0.4243, Adjusted R-squared:  0.424 
## F-statistic:  1375 on 1 and 1865 DF,  p-value: < 2.2e-16
#install.packages("factoextra")
#install.packages("broom")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(broom)
library(tidyverse)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# PCA for Composite Resilience Estimate
cre_vars <- c("POV", "UER", "LIN", "AGE65")

# Perform PCA
pca <- prcomp(df[, c("POV", "UER", "LIN", "AGE65")], scale. = TRUE)

# Scree Plot
scree_plot <- fviz_eig(pca, addlabels = TRUE)

# Loadings Plot 
loadings_data <- as.data.frame(pca$rotation[, 1, drop = FALSE]) 
loadings_plot <- 
  ggplot(loadings_data, aes(x = reorder(rownames(loadings_data), -PC1), y = PC1, 
                           fill = rownames(loadings_data))) +
  geom_col() +
  labs(x = "", y = "PC1 Loadings") +
  theme_minimal()

# Create CRE Index
df$CRE_pca <- -pca$x[, 1]  

# Plot HTP vs. CRE
cre_htp_plot <- 
  ggplot(df, aes(x = CRE_pca, y = HTP)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "HTP vs. CRE", x = "Composite Resilience (Higher = More Vulnerable)", y = "Hypertension Prevalence")

# Display all plots
gridExtra::grid.arrange(scree_plot, loadings_plot, cre_htp_plot, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'

# Regression model
cre_model <- lm(HTP ~ CRE_pca, data = df)
broom::tidy(cre_model) %>% kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 30.141 0.174 173.063 0
CRE_pca 4.832 0.130 37.076 0

Mapping

# Get spatial data for DC tracts
library(tidycensus)
dc_tracts <- get_acs(
  geography = "tract",
  state = "11",
  variables = "B01001_001",
  year = 2022,
  geometry = TRUE,
  output = "wide"
)
## Getting data from the 2018-2022 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
##   |                                                                              |                                                                      |   0%  |                                                                              |==========                                                            |  15%  |                                                                              |===================================                                   |  50%  |                                                                              |==================================================                    |  71%  |                                                                              |================================================================      |  92%  |                                                                              |==================================================================    |  95%  |                                                                              |======================================================================| 100%
# Create df_map by joining your analysis data with spatial data
df_map <- df2 %>% 
  filter(Year == 2023) %>% 
  left_join(dc_tracts %>% select(GEOID, geometry), by = "GEOID") %>% 
  st_as_sf()
library(sf)
library(ggplot2)

# Ensure df_map exists and has geometry
if(exists("df_map") && "sf" %in% class(df_map)) {
  choropleth_imputed <- df_map %>%
    st_make_valid() %>%
    mutate(
      HTP = coalesce(as.numeric(HTP), mean(as.numeric(HTP), na.rm = TRUE)),
      POV = coalesce(as.numeric(POV), mean(as.numeric(POV), na.rm = TRUE)),
      AGE65 = coalesce(as.numeric(AGE65), mean(as.numeric(AGE65), na.rm = TRUE)),
      UER = coalesce(as.numeric(UER), mean(as.numeric(UER), na.rm = TRUE)),
      LIN = coalesce(as.numeric(LIN), mean(as.numeric(LIN), na.rm = TRUE))
    ) %>%
    filter(!st_is_empty(.))
  
  # Get unified scale limits
  value_range <- range(c(choropleth_imputed$HTP, choropleth_imputed$POV, 
                        choropleth_imputed$AGE65,choropleth_imputed$UER,choropleth_imputed$LIN), na.rm = TRUE)
  
  # Plotting function
  create_map <- function(data, var, title) {
    ggplot(data) +
      geom_sf(aes(fill = {{ var }}), color = "black", size = 0.1) +
      scale_fill_viridis_c(
        name = "Rate (%)",
        limits = value_range,
        option = "plasma",
        na.value = "gray"
      ) +
      labs(title = title) +
      theme_void()
  }
  
  # Generate maps
  map_hypertension <- create_map(choropleth_imputed, HTP, "Hypertension (2023)")
  map_poverty <- create_map(choropleth_imputed, POV, "Poverty (2023)")
  map_age65 <- create_map(choropleth_imputed, AGE65, "Age 65+ (2023)")
  map_unemployed <- create_map(choropleth_imputed, UER, "Unemployed (2023)")
  map_nohealthinsurance <- create_map(choropleth_imputed, LIN, "No Health Insurance (2023)")
  
  # Arrange
  gridExtra::grid.arrange(map_hypertension, map_poverty, map_age65,map_unemployed,map_nohealthinsurance, ncol = 5)
} else {
  warning("Spatial data not available - skipping mapping section")
}

Spatial Analysis

#install.packages("spData")
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(sf)
library(tmap)

# Prepare spatial data
df_sf <- st_as_sf(df_map) %>% 
  st_transform(26918)  # UTM Zone 18N for DC

# Create neighbors list (queen contiguity)
nb <- poly2nb(df_sf, queen = TRUE)

# imputation with NA safety
df_sf$HTP_imputed <- df_sf$HTP  # Initialize

# Get indices of NA values
na_indices <- which(is.na(df_sf$HTP))

for (i in na_indices) {
  neighbor_values <- df_sf$HTP[nb[[i]]]  # Get neighbor values
  valid_neighbors <- neighbor_values[!is.na(neighbor_values)]
  
  if (length(valid_neighbors) > 0) {
    df_sf$HTP_imputed[i] <- mean(valid_neighbors)
  }
}

# Prep Moran's test 
valid_tracts <- !is.na(df_sf$HTP_imputed)
df_sf_valid <- df_sf[valid_tracts, ]

# subset weights matrix
nb_valid <- subset.nb(nb, valid_tracts)
weights_valid <- nb2listw(nb_valid, style = "W")

# Moran's test
if (sum(valid_tracts) > 1) {  # Need at least 2 observations
  moran_result <- moran.test(df_sf_valid$HTP_imputed, weights_valid)
  print(moran_result)
} else {
  warning("Insufficient non-NA values for Moran's test")
}
## 
##  Moran I test under randomisation
## 
## data:  df_sf_valid$HTP_imputed  
## weights: weights_valid    
## 
## Moran I statistic standard deviate = 20.255, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.826896343      -0.004901961       0.001686396
# Visualize with NA handling
tm_shape(df_sf) +
  tm_fill("HTP_imputed",
          palette = "Reds",
          style = "quantile",
          title = "Hypertension (%)",
          textNA = "No Data (Insufficient Neighbor Estimates)",
          colorNA = "lightgray") +
  tm_borders(col = "white", lwd = 0.3) +
  tm_layout(main.title = "Spatially Imputed Hypertension Prevalence",
            legend.outside = TRUE)
## 
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_fill()`: instead of `style = "quantile"`, use fill.scale =
## `tm_scale_intervals()`.
## ℹ Migrate the argument(s) 'style', 'palette' (rename to 'values'), 'colorNA'
##   (rename to 'value.na'), 'textNA' (rename to 'label.na') to
##   'tm_scale_intervals(<HERE>)'
## For small multiples, specify a 'tm_scale_' for each multiple, and put them in a
## list: 'fill'.scale = list(<scale1>, <scale2>, ...)'
## [v3->v4] `tm_fill()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'
## [v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(main.title = )`
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Reds" is named
## "brewer.reds"
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.

library(spdep)
library(spatialreg)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption
#  Ensure valid geometries
df_sf_valid <- df_sf %>%
  st_make_valid() %>%
  filter(complete.cases(HTP, POV, UER))

#  Create neighbors with zero policy
nb <- poly2nb(df_sf_valid, queen = TRUE)
## Warning in poly2nb(df_sf_valid, queen = TRUE): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(df_sf_valid, queen = TRUE): neighbour object has 3 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
weights <- nb2listw(nb, style = "W", zero.policy = TRUE)

#  Check neighbor counts
print(table(card(nb)))  # Shows how many neighbors each tract has
## 
##  0  1  2  3  4  5  6  7  8  9 11 13 
##  1  3 12 17 24 44 24 19  7  1  1  1
#  Run model with zero.policy
spatial_lag <- lagsarlm(
  HTP ~ POV + UER,
  data = df_sf_valid,
  listw = weights,
  method = "eigen",
  zero.policy = TRUE
)

summary(spatial_lag)
## 
## Call:lagsarlm(formula = HTP ~ POV + UER, data = df_sf_valid, listw = weights, 
##     method = "eigen", zero.policy = TRUE)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -18.116299  -2.686464  -0.051661   2.959348  19.271453 
## 
## Type: lag 
## Regions with no neighbours included:
##  10 
## Coefficients: (asymptotic standard errors) 
##             Estimate Std. Error z value  Pr(>|z|)
## (Intercept)  5.60484    1.39389  4.0210 5.795e-05
## POV          0.19744    0.04374  4.5140 6.363e-06
## UER          0.19647    0.14545  1.3507    0.1768
## 
## Rho: 0.68882, LR test value: 117.82, p-value: < 2.22e-16
## Asymptotic standard error: 0.051374
##     z-value: 13.408, p-value: < 2.22e-16
## Wald statistic: 179.77, p-value: < 2.22e-16
## 
## Log likelihood: -467.5814 for lag model
## ML residual variance (sigma squared): 22.088, (sigma: 4.6998)
## Number of observations: 154 
## Number of parameters estimated: 5 
## AIC: 945.16, (AIC for lm: 1061)
## LM test for residual autocorrelation
## test value: 0.0062416, p-value: 0.93703
tm_shape(df_sf) +
  tm_fill("POV", palette = "Blues", style = "quantile", title = "Poverty Rate (%)") +
  tm_borders(lwd = 0.5, col = "gray") +
  tm_layout(main.title = "Poverty Rate by Census Tract",
            legend.outside = TRUE)
## 
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_fill()`: instead of `style = "quantile"`, use fill.scale =
## `tm_scale_intervals()`.
## ℹ Migrate the argument(s) 'style', 'palette' (rename to 'values') to
##   'tm_scale_intervals(<HERE>)'
## [v3->v4] `tm_fill()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'
## [v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(main.title = )`
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Blues" is named
## "brewer.blues"
## Multiple palettes called "blues" found: "brewer.blues", "matplotlib.blues". The first one, "brewer.blues", is returned.

# Spatial regression significance testing
library(spdep)
library(spatialreg)

# Clean data 
df_sf_clean <- df_sf %>%
  st_make_valid() %>%
  filter(!is.na(HTP), !is.na(POV), !is.na(UER))

# Recompute neighbors and weights
nb <- poly2nb(df_sf_clean, queen = TRUE)  
## Warning in poly2nb(df_sf_clean, queen = TRUE): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(df_sf_clean, queen = TRUE): neighbour object has 3 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
weights <- nb2listw(nb, style = "W", zero.policy = TRUE)  # Allow isolated tracts

# Verify dimensions
cat(
  "Data rows:", nrow(df_sf_clean), "\n",
  "Weights length:", length(weights$neighbours), "\n"
)
## Data rows: 154 
##  Weights length: 154
# Run regression
spatial_lag <- lagsarlm(
  HTP ~ POV + UER,
  data = df_sf_clean,
  listw = weights,
  zero.policy = TRUE  # Important if some tracts have no neighbors
)
summary(spatial_lag)
## 
## Call:lagsarlm(formula = HTP ~ POV + UER, data = df_sf_clean, listw = weights, 
##     zero.policy = TRUE)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -18.116299  -2.686464  -0.051661   2.959348  19.271453 
## 
## Type: lag 
## Regions with no neighbours included:
##  10 
## Coefficients: (asymptotic standard errors) 
##             Estimate Std. Error z value  Pr(>|z|)
## (Intercept)  5.60484    1.39389  4.0210 5.795e-05
## POV          0.19744    0.04374  4.5140 6.363e-06
## UER          0.19647    0.14545  1.3507    0.1768
## 
## Rho: 0.68882, LR test value: 117.82, p-value: < 2.22e-16
## Asymptotic standard error: 0.051374
##     z-value: 13.408, p-value: < 2.22e-16
## Wald statistic: 179.77, p-value: < 2.22e-16
## 
## Log likelihood: -467.5814 for lag model
## ML residual variance (sigma squared): 22.088, (sigma: 4.6998)
## Number of observations: 154 
## Number of parameters estimated: 5 
## AIC: 945.16, (AIC for lm: 1061)
## LM test for residual autocorrelation
## test value: 0.0062416, p-value: 0.93703
# Load Packages 
library(spdep)
library(spatialreg)
library(sf)
library(tmap)
library(tidyverse)
library(modelsummary)
## 
## Attaching package: 'modelsummary'
## The following object is masked from 'package:tidycensus':
## 
##     get_estimates
# Data Preparation 
df_sf <- df_map %>% 
  st_as_sf() %>% 
  st_transform(26918) %>%  # UTM Zone 18N for DC
  st_make_valid() %>%
  mutate(across(c(HTP, POV, UER, AGE65), as.numeric)) %>%
  mutate(row_id = 1:n())

# Enhanced Spatial Imputation 
nb <- poly2nb(df_sf, queen = TRUE, snap = 1e-3)  # Increased snap tolerance

df_sf <- df_sf %>%
  mutate(
    HTP_imputed = ifelse(is.na(HTP),
                        sapply(nb, function(neighbors) {
                          if(length(neighbors) == 0) return(NA)
                          mean(df_sf$HTP[neighbors], na.rm = TRUE)
                        }),
                        HTP),
    HTP_imputed = coalesce(HTP_imputed, mean(HTP, na.rm = TRUE))
  )

# Hotspot Analysis 
valid_tracts <- !is.na(df_sf$HTP_imputed)
df_sf_valid <- df_sf[valid_tracts, ]
nb_valid <- subset.nb(nb, valid_tracts)
weights_valid <- nb2listw(nb_valid, style = "W")

if(sum(valid_tracts) > 1) {
  # Local Moran's I
  local_moran <- localmoran(df_sf_valid$HTP_imputed, weights_valid)
  df_sf_valid <- df_sf_valid %>%
    mutate(
      local_moran_i = local_moran[, "Ii"],
      hotspot_pval = local_moran[, "Pr(z != E(Ii))"],
      hotspot_type = case_when(
        local_moran_i > 0 & hotspot_pval < 0.05 ~ "High-High",
        local_moran_i < 0 & hotspot_pval < 0.05 ~ "Low-Low",
        TRUE ~ "Not Significant"
      )
    )
  
  # Join hotspot data back to full dataset
  df_sf <- df_sf %>%
    left_join(st_drop_geometry(df_sf_valid) %>% select(row_id, hotspot_type), 
              by = "row_id")
}

# Corrected Visualization 
map_theme <- tm_layout(
  legend.position = c("right", "bottom"),
  legend.title.size = 0.9,
  legend.text.size = 0.7,
  frame = FALSE
)

# Individual maps for each variable
htp_map <- tm_shape(df_sf) +
  tm_fill("HTP_imputed", 
          palette = "Reds",
          style = "quantile",
          title = "Hypertension (%)") +
  tm_borders(col = "gray30", lwd = 0.5) +
  map_theme
## 
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_fill()`: instead of `style = "quantile"`, use fill.scale =
## `tm_scale_intervals()`.
## ℹ Migrate the argument(s) 'style', 'palette' (rename to 'values') to
##   'tm_scale_intervals(<HERE>)'
## [v3->v4] `tm_fill()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'
pov_map <- tm_shape(df_sf) +
  tm_fill("POV",
          palette = "Blues",
          style = "quantile",
          title = "Poverty Rate (%)") +
  tm_borders(col = "gray30", lwd = 0.5) +
  map_theme
## [v3->v4] `tm_fill()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'
uer_map <- tm_shape(df_sf) +
  tm_fill("UER",
          palette = "YlOrRd",
          style = "quantile",
          title = "Unemployment Rate (%)") +
  tm_borders(col = "gray30", lwd = 0.5) +
  map_theme
## [v3->v4] `tm_fill()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'
age65_map <- tm_shape(df_sf) +
  tm_fill("AGE65",
          palette = "Purples",
          style = "quantile",
          title = "Population 65+ (%)") +
  tm_borders(col = "gray30", lwd = 0.5) +
  map_theme
## [v3->v4] `tm_fill()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'
# Hotspot map 
if("hotspot_type" %in% names(df_sf)) {
  hotspot_map <- tm_shape(df_sf) +
    tm_fill("hotspot_type",
            palette = c("High-High" = "red", 
                       "Low-Low" = "blue", 
                       "Not Significant" = "lightgray"),
            title = "Hypertension Hotspots") +
    tm_borders(col = "gray30", lwd = 0.5) +
    map_theme
} else {
  hotspot_map <- NULL
}
## [v3->v4] `tm_tm_fill()`: migrate the argument(s) related to the scale of the
## visual variable `fill` namely 'palette' (rename to 'values') to fill.scale =
## tm_scale(<HERE>).
## [v3->v4] `tm_fill()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'
# Arrange maps
if(!is.null(hotspot_map)) {
  tmap_arrange(
    htp_map, pov_map,
    uer_map, age65_map,
    hotspot_map,
    ncol = 2
  )
} else {
  tmap_arrange(
    htp_map, pov_map,
    uer_map, age65_map,
    ncol = 2
  )
}
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Reds" is named
## "brewer.reds"
## Multiple palettes called "reds" found: "brewer.reds", "matplotlib.reds". The first one, "brewer.reds", is returned.
## 
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Blues" is named
## "brewer.blues"
## Multiple palettes called "blues" found: "brewer.blues", "matplotlib.blues". The first one, "brewer.blues", is returned.
## 
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "YlOrRd" is named
## "brewer.yl_or_rd"
## Multiple palettes called "yl_or_rd" found: "brewer.yl_or_rd", "matplotlib.yl_or_rd". The first one, "brewer.yl_or_rd", is returned.
## 
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Purples" is named
## "brewer.purples"
## Multiple palettes called "purples" found: "brewer.purples", "matplotlib.purples". The first one, "brewer.purples", is returned.
## 
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.

# Spatial Regression with AGE65 
model_data <- df_sf %>%
  filter(complete.cases(HTP, POV, UER, AGE65)) %>%
  mutate(across(c(HTP, POV, UER, AGE65), scale))

# Handle neighborhood connectivity
nb_clean <- poly2nb(model_data, queen = TRUE, snap = 1e-3)
## Warning in poly2nb(model_data, queen = TRUE, snap = 0.001): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(model_data, queen = TRUE, snap = 0.001): neighbour object has 3 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
no_neighbors <- which(card(nb_clean) == 0)

if(length(no_neighbors) > 0) {
  message("Removing ", length(no_neighbors), " tracts with no neighbors")
  model_data <- model_data[-no_neighbors, ]
  nb_clean <- poly2nb(model_data, queen = TRUE, snap = 1e-3)
}
## Removing 1 tracts with no neighbors
## Warning in poly2nb(model_data, queen = TRUE, snap = 0.001): neighbour object has 2 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
weights_clean <- nb2listw(nb_clean, style = "W", zero.policy = TRUE)

# Fit models
spatial_lag <- tryCatch(
  lagsarlm(HTP ~ POV + UER + AGE65,
           data = model_data,
           listw = weights_clean,
           zero.policy = TRUE),
  error = function(e) {
    message("Spatial lag model failed: ", e$message)
    return(NULL)
  }
)

spatial_error <- tryCatch(
  errorsarlm(HTP ~ POV + UER + AGE65,
             data = model_data,
             listw = weights_clean,
             zero.policy = TRUE),
  error = function(e) {
    message("Spatial error model failed: ", e$message)
    return(NULL)
  }
)

# Model Results and Diagnostics 
if(!is.null(spatial_lag)) {
  model_data$spatial_lag_residuals <- residuals(spatial_lag)
  
  residual_map <- tm_shape(model_data) +
    tm_fill("spatial_lag_residuals",
            style = "fisher",
            palette = "-RdBu",
            title = "Standardized Residuals") +
    tm_borders(col = "gray30", lwd = 0.5) +
    map_theme
  
  print(residual_map)
  
  cat("\n=== Spatial Lag Model Results ===\n")
  print(summary(spatial_lag))
}
## 
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_fill()`: instead of `style = "fisher"`, use fill.scale =
## `tm_scale_intervals()`.
## ℹ Migrate the argument(s) 'style', 'palette' (rename to 'values') to
##   'tm_scale_intervals(<HERE>)'[v3->v4] `tm_fill()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'Multiple palettes called "rd_bu" found: "brewer.rd_bu", "matplotlib.rd_bu". The first one, "brewer.rd_bu", is returned.
## [scale] tm_polygons:() the data variable assigned to 'fill' contains positive and negative values, so midpoint is set to 0. Set 'midpoint = NA' in 'fill.scale = tm_scale_intervals(<HERE>)' to use all visual values (e.g. colors)
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "-RdBu" is named
## "rd_bu" (in long format "brewer.rd_bu")

## 
## === Spatial Lag Model Results ===
## 
## Call:lagsarlm(formula = HTP ~ POV + UER + AGE65, data = model_data, 
##     listw = weights_clean, zero.policy = TRUE)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.5059605 -0.2551583  0.0014785  0.2752624  1.8496668 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error z value  Pr(>|z|)
## (Intercept) -0.010969   0.033743 -0.3251    0.7451
## POV          0.225246   0.043986  5.1208 3.042e-07
## UER          0.047435   0.041102  1.1541    0.2485
## AGE65        0.199869   0.035443  5.6391 1.709e-08
## 
## Rho: 0.68773, LR test value: 123.83, p-value: < 2.22e-16
## Asymptotic standard error: 0.045696
##     z-value: 15.05, p-value: < 2.22e-16
## Wald statistic: 226.51, p-value: < 2.22e-16
## 
## Log likelihood: -94.02535 for lag model
## ML residual variance (sigma squared): 0.174, (sigma: 0.41714)
## Number of observations: 153 
## Number of parameters estimated: 6 
## AIC: 200.05, (AIC for lm: 321.88)
## LM test for residual autocorrelation
## test value: 5.9171, p-value: 0.014995
if(!is.null(spatial_lag) && !is.null(spatial_error)) {
  modelsummary(
    list("Spatial Lag" = spatial_lag, 
         "Spatial Error" = spatial_error),
    stars = TRUE,
    title = "Spatial Model Comparison",
    coef_rename = c("(Intercept)" = "Intercept",
                   "POV" = "Poverty Rate",
                   "UER" = "Unemployment Rate",
                   "AGE65" = "Population 65+")
  )
}
Spatial Model Comparison
Spatial Lag Spatial Error
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
rho 0.688***
(0.046)
Intercept -0.011 -0.046
(0.034) (0.192)
Poverty Rate 0.225*** 0.245***
(0.044) (0.049)
Unemployment Rate 0.047 0.024
(0.041) (0.040)
Population 65+ 0.200*** 0.154***
(0.035) (0.036)
lambda 0.820***
(0.043)
AIC 200.1 220.5
BIC 218.2 238.7
RMSE 0.42 0.43

Appendix

Important Model Outputs

# 1. PCA Loadings
if(exists("pca")) {
  pca_loadings <- as.data.frame(pca$rotation)
  cat("### PCA Loadings\n")
  knitr::kable(pca_loadings, digits = 3) %>% 
    kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
  #write.csv(pca_loadings, "outputs/pca_loadings.csv")
}
## ### PCA Loadings
PC1 PC2 PC3 PC4
POV -0.636 -0.004 -0.398 0.661
UER -0.657 -0.171 -0.144 -0.720
LIN -0.399 0.432 0.802 0.102
AGE65 -0.065 -0.886 0.421 0.185
# 2. VIF Scores
if(exists("model")) {
  vif_scores <- car::vif(model)
  vif_df <- data.frame(Variable = names(vif_scores), VIF = vif_scores)
  cat("\n### Variance Inflation Factors (VIF)\n")
  print(vif_df)  # Basic R table
  #write.csv(vif_df, "outputs/vif_scores.csv")
}
## 
## ### Variance Inflation Factors (VIF)
##       Variable      VIF
## POV        POV 1.684957
## UER        UER 1.785993
## LIN        LIN 1.102908
## AGE65    AGE65 1.056610
# 3. Moran's I Test
if(exists("moran_result")) {
  moran_df <- data.frame(
    Statistic = moran_result$estimate[1],
    p.value = moran_result$p.value
  )
  cat("\n### Moran's I Test Results\n")
  knitr::kable(moran_df, digits = 4) %>% 
    kableExtra::kable_styling(full_width = FALSE)
  #write.csv(moran_df, "outputs/morans_i_test.csv")
}
## 
## ### Moran's I Test Results
Statistic p.value
Moran I statistic 0.8269 0
# 4. ARDL Model Coefficients
if(exists("ardl_model")) {
  ardl_coefs <- broom::tidy(ardl_model)
  cat("\n### ARDL Model Coefficients\n")
  knitr::kable(ardl_coefs, digits = 4) %>% 
    kableExtra::kable_styling(bootstrap_options = "condensed")
  #write.csv(ardl_coefs, "outputs/ardl_coefficients.csv")
}
## Warning: The `tidy()` method for objects of class `dynlm` is not maintained by the broom team, and is only supported through the `lm` tidier method. Please be cautious in interpreting and reporting broom output.
## 
## This warning is displayed once per session.
## 
## ### ARDL Model Coefficients
term estimate std.error statistic p.value
(Intercept) 13.7580 0.5452 25.2329 0.0000
L(HTP, 1) 0.2598 0.0209 12.4246 0.0000
POV 0.2816 0.0172 16.3953 0.0000
UER 0.9142 0.0615 14.8725 0.0000
L(UER, 1) -0.1648 0.0592 -2.7823 0.0055
# 5. Spatial Regression Coefficients
if(exists("spatial_lag")) {
  spatial_coefs <- broom::tidy(spatial_lag)
  cat("\n### Spatial Lag Model Coefficients\n")
  print(spatial_coefs)  # Basic R table
  #write.csv(spatial_coefs, "outputs/spatial_lag_coefficients.csv")
}
## 
## ### Spatial Lag Model Coefficients
## # A tibble: 5 × 5
##   term        estimate std.error statistic      p.value
##   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
## 1 rho           0.688     0.0457    15.1   0           
## 2 (Intercept)  -0.0110    0.0337    -0.325 0.745       
## 3 POV           0.225     0.0440     5.12  0.000000304 
## 4 UER           0.0474    0.0411     1.15  0.248       
## 5 AGE65         0.200     0.0354     5.64  0.0000000171

Diagnostics

# Create directory for plots 

# PCA Scree Plot
if(exists("pca")) {
  scree_plot <- fviz_eig(pca, addlabels = TRUE) + 
    ggtitle("PCA Scree Plot") +
    theme_minimal()
  
  cat("### PCA Scree Plot\n")
  print(scree_plot)  # Display in document
  ggsave("outputs/plots/pca_scree_plot.png", scree_plot, 
         width = 8, height = 6, dpi = 300)
}
## ### PCA Scree Plot

# Linear Model Residual Diagnostics
if(exists("model")) {
  cat("\n### Linear Model Residual Plots\n")
  
  # Create and display in document
 
  par(mfrow = c(2,2), oma = c(0,0,2,0))
  plot(model, sub.caption = "Linear Model Diagnostics")
  dev.off()
  
  # Display in document using include_graphics
  #knitr::include_graphics("outputs/plots/linear_model_residuals.png")
}
## 
## ### Linear Model Residual Plots
## null device 
##           1
# ARDL Model Residual Plot
if(exists("ardl_model")) {
  cat("\n### ARDL Model Residual Plot\n")
  
  # Create plot object
  ardl_resid_plot <- function() {
    plot(residuals(ardl_model), type="o", 
         main="Residuals of ARDL Model", 
         col="blue", ylab="Residuals")
    abline(h=0, col="red", lty=2)
    grid()
  }
  
  # Save to file
  
  ardl_resid_plot()
  dev.off()
  
  # Display in document
  ardl_resid_plot()
}
## 
## ### ARDL Model Residual Plot
# Spatial Model Residuals 
if(exists("spatial_lag")) {
  cat("\n### Spatial Lag Model Residuals\n")
  spatial_resid_plot <- plot(residuals(spatial_lag), 
                           main = "Spatial Lag Model Residuals")
  print(spatial_resid_plot)
  
  # Save spatial residuals
  
  plot(residuals(spatial_lag), main = "Spatial Lag Model Residuals")
  dev.off()
}
## 
## ### Spatial Lag Model Residuals
## NULL
## null device 
##           1

Statistical Summary

# Descriptive Statistics
if(exists("df")) {
  desc_stats <- data.frame(
    Mean = sapply(df[, sapply(df, is.numeric)], mean, na.rm = TRUE),
    SD = sapply(df[, sapply(df, is.numeric)], sd, na.rm = TRUE),
    Min = sapply(df[, sapply(df, is.numeric)], min, na.rm = TRUE),
    Max = sapply(df[, sapply(df, is.numeric)], max, na.rm = TRUE),
    N = sapply(df[, sapply(df, is.numeric)], function(x) sum(!is.na(x)))
  )
  
  cat("### Descriptive Statistics\n")
  knitr::kable(desc_stats, digits = 3, caption = "Summary Statistics of Numeric Variables") %>%
    kableExtra::kable_styling(
      bootstrap_options = c("striped", "hover", "condensed"),
      full_width = FALSE
    )
  
  #write.csv(desc_stats, "outputs/descriptive_stats.csv")
}
## ### Descriptive Statistics
Summary Statistics of Numeric Variables
Mean SD Min Max N
Year 2017.817 3.126 2013.000 2023.000 1867
HTP 30.141 9.915 0.000 48.200 1867
POV 16.758 12.291 0.000 70.043 1867
UER 5.112 3.513 0.000 24.658 1867
LIN 2.026 1.877 0.000 13.774 1867
AGE65 14.355 4.970 0.000 37.309 1867
ASIAN 3.412 3.807 0.000 24.916 1867
PACIFIC 0.050 0.239 0.000 4.007 1867
AMERICANINDIAN 0.303 0.787 0.000 10.514 1867
BLACK 51.608 34.245 0.000 100.000 1867
WHITE 36.759 30.734 0.000 100.000 1867
predicted 30.141 7.795 7.873 62.252 1867
residuals 0.000 6.128 -19.416 18.910 1867
CRE_pca 0.000 1.337 -2.441 5.155 1867
# Correlation Matrix
if(exists("numeric_df")) {
  cor_matrix <- cor(numeric_df, use = "complete.obs")
  
  cat("\n### Correlation Matrix\n")
  knitr::kable(cor_matrix, digits = 2, caption = "Pairwise Correlations") %>%
    kableExtra::kable_styling(
      bootstrap_options = c("striped", "hover"),
      font_size = 10
    ) %>%
    kableExtra::column_spec(1, bold = TRUE)
  
  #write.csv(cor_matrix, "outputs/correlation_matrix.csv")
}
## 
## ### Correlation Matrix
Pairwise Correlations
Year HTP POV UER LIN AGE65 ASIAN PACIFIC AMERICANINDIAN BLACK WHITE
Year 1.00 0.01 -0.08 -0.20 -0.30 -0.03 0.00 0.06 -0.01 -0.03 -0.03
HTP 0.01 1.00 0.60 0.60 0.16 0.46 -0.65 -0.10 0.03 0.90 -0.88
POV -0.08 0.60 1.00 0.63 0.22 -0.01 -0.37 -0.07 -0.05 0.68 -0.66
UER -0.20 0.60 0.63 1.00 0.27 0.14 -0.42 -0.10 -0.01 0.69 -0.66
LIN -0.30 0.16 0.22 0.27 1.00 -0.09 -0.09 -0.04 0.03 0.23 -0.28
AGE65 -0.03 0.46 -0.01 0.14 -0.09 1.00 -0.23 -0.01 0.07 0.24 -0.21
ASIAN 0.00 -0.65 -0.37 -0.42 -0.09 -0.23 1.00 0.08 -0.01 -0.67 0.60
PACIFIC 0.06 -0.10 -0.07 -0.10 -0.04 -0.01 0.08 1.00 -0.03 -0.10 0.09
AMERICANINDIAN -0.01 0.03 -0.05 -0.01 0.03 0.07 -0.01 -0.03 1.00 0.00 -0.04
BLACK -0.03 0.90 0.68 0.69 0.23 0.24 -0.67 -0.10 0.00 1.00 -0.97
WHITE -0.03 -0.88 -0.66 -0.66 -0.28 -0.21 0.60 0.09 -0.04 -0.97 1.00
# ADF Test Results
if(exists("adf_results")) {
  cat("\n### Augmented Dickey-Fuller Test Results\n")
  knitr::kable(adf_results, digits = 4, 
               caption = "Stationarity Test Results") %>%
    kableExtra::kable_styling(
      bootstrap_options = c("striped", "hover"),
      full_width = FALSE
    ) %>%
    kableExtra::row_spec(which(adf_results$P_Value < 0.05), 
                         bold = TRUE, color = "white", background = "#D7261E")
  
  #write.csv(adf_results, "outputs/adf_test_results.csv")
}
## 
## ### Augmented Dickey-Fuller Test Results
Stationarity Test Results
Variable ADF_Statistic P_Value Conclusion
HTP -9.7490 < 0.01 Stationary
POV -10.7340 < 0.01 Stationary
UER -9.3261 < 0.01 Stationary
LIN -11.6090 < 0.01 Stationary
AGE65 -10.9720 < 0.01 Stationary
ASIAN -9.1452 < 0.01 Stationary
PACIFIC -11.9560 < 0.01 Stationary
AMERICANINDIAN -12.0860 < 0.01 Stationary
BLACK -9.6922 < 0.01 Stationary
WHITE -10.3280 < 0.01 Stationary

Spatial Analysis Output

# Hotspot Classification
if("hotspot_type" %in% names(df_sf)) {
  hotspot_data <- st_drop_geometry(df_sf) %>% 
    select(GEOID, hotspot_type) %>%
    mutate(hotspot_type = factor(hotspot_type,
                                levels = c("High-High", "Low-Low", "Not Significant")))
  
  # Create summary counts
  hotspot_summary <- hotspot_data %>%
    count(hotspot_type) %>%
    mutate(Percentage = n/nrow(hotspot_data)*100)
  
  cat("### Hotspot Classification Summary\n")
  knitr::kable(hotspot_summary, 
               col.names = c("Hotspot Type", "Count", "Percentage (%)"),
               digits = 1,
               caption = "Distribution of Hypertension Hotspots") %>%
    kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
    kableExtra::row_spec(1, background = "#FFDDDD") %>%  # Highlight High-High
    kableExtra::row_spec(2, background = "#DDDDFF")      # Highlight Low-Low
  
  # Show first 10 rows of full data
  cat("\n#### Sample Hotspot Classifications (First 10 Tracts)\n")
  knitr::kable(head(hotspot_data, 10), 
               caption = "Sample of Hotspot Classifications") %>%
    kableExtra::kable_styling(bootstrap_options = "condensed")
  
  #write.csv(hotspot_data, "outputs/hotspot_classification.csv")
}
## ### Hotspot Classification Summary
## 
## #### Sample Hotspot Classifications (First 10 Tracts)
Sample of Hotspot Classifications
GEOID hotspot_type
11001000101 Not Significant
11001000102 High-High
11001000201 Not Significant
11001000202 High-High
11001000300 High-High
11001000400 High-High
11001000501 Not Significant
11001000502 Not Significant
11001000600 Not Significant
11001000702 Not Significant
# Spatial Residuals
if(exists("spatial_lag")) {
  spatial_residuals <- st_drop_geometry(model_data) %>%
    select(GEOID, spatial_lag_residuals) %>%
    mutate(Residual_Group = cut(spatial_lag_residuals,
                               breaks = c(-Inf, -2, -1, 1, 2, Inf),
                               labels = c("Very Low", "Low", "Normal", "High", "Very High")))
  
  # Create residual summary
  residual_summary <- spatial_residuals %>%
    group_by(Residual_Group) %>%
    summarise(Count = n(),
              Avg_Residual = mean(spatial_lag_residuals))
  
  cat("\n### Spatial Lag Model Residuals Summary\n")
  knitr::kable(residual_summary, 
               digits = 3,
               caption = "Distribution of Standardized Residuals") %>%
    kableExtra::kable_styling(bootstrap_options = "striped") %>%
    kableExtra::column_spec(3, color = ifelse(residual_summary$Avg_Residual > 0, "red", "blue"))
  
  # Show extreme residuals (top/bottom 5)
  extreme_residuals <- spatial_residuals %>%
    arrange(desc(abs(spatial_lag_residuals))) %>%
    head(10) %>%
    mutate(RowID = row_number())  # Add numeric row identifiers
  
  cat("\n#### Tracts with Most Extreme Residuals\n")
  knitr::kable(extreme_residuals %>% select(-RowID), digits = 3) %>%
    kableExtra::kable_styling() %>%
    kableExtra::row_spec(
      which(extreme_residuals$spatial_lag_residuals > 0), 
      background = "#FFEEEE"
    ) %>%
    kableExtra::row_spec(
      which(extreme_residuals$spatial_lag_residuals < 0), 
      background = "#EEEEFF"
    )
  
  #write.csv(spatial_residuals, "outputs/spatial_residuals.csv")
}
## 
## ### Spatial Lag Model Residuals Summary
## 
## #### Tracts with Most Extreme Residuals
GEOID spatial_lag_residuals Residual_Group
11001002302 1.850 High
11001007301 -1.506 Low
11001009201 -1.331 Low
11001009204 1.162 High
11001009802 -0.850 Normal
11001009507 0.769 Normal
11001000201 -0.768 Normal
11001000202 -0.743 Normal
11001009503 0.697 Normal
11001010500 -0.669 Normal

Visulaization and Maps

# Interactive Trend Plot
if(exists("interactive_plot")) {
  cat("### Interactive Trend Visualization\n")
  cat("*Use the interactive controls to explore temporal patterns*\n\n")
  
  # Display in document
  print(interactive_plot)
  
  # Save standalone version
  if(!dir.exists("outputs")) dir.create("outputs")
  htmlwidgets::saveWidget(
    widget = interactive_plot, 
    file = "outputs/interactive_trends.html",
    selfcontained = TRUE,
    title = "DC Health Indicators 2013-2023"
  )
}
## ### Interactive Trend Visualization
## *Use the interactive controls to explore temporal patterns*
# verify map objects exist and are tmap objects
required_maps <- c("htp_map", "pov_map", "uer_map", "age65_map", "hotspot_map")
valid_maps <- keep(required_maps, ~exists(.x) && inherits(get(.x), "tmap"))

# Static Map Visualizations
map_theme <- tm_layout(
  legend.position = c("right", "bottom"),
  legend.title.size = 0.9,
  legend.text.size = 0.7,
  frame = FALSE
)

if(length(valid_maps) > 0) {
  # Apply consistent theme to all maps
  walk(valid_maps, ~assign(.x, get(.x) + map_theme, envir = .GlobalEnv))
  
  # Display individual maps
  walk(valid_maps, ~{
    cat("\n###", str_to_title(gsub("_map", "", gsub("_", " ", .x))), "Map\n")
    print(get(.x))
  })
  
  # combined panel only if at least 2 maps
  if(length(valid_maps) >= 2) {
    cat("\n### Combined Map Panel\n")
    
    # Prepare map list arrangement
    map_objects <- mget(valid_maps)
    
    # Special handling odd number of maps
    if(length(valid_maps) %% 2 != 0) {
      map_objects$empty <- tm_shape(df_sf) + tm_text("") # Empty placeholder
    }
    
    # Create arrangement
    ncols <- ifelse(length(valid_maps) >=4, 2, 1)
    combined_maps <- do.call(tmap_arrange, c(map_objects, list(ncol = ncols)))
    
    print(combined_maps)
    
  }
} else {
  message("No valid tmap objects found for visualization")
}
## 
## ### Htp Map Map
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Reds" is named
## "brewer.reds"
## Multiple palettes called "reds" found: "brewer.reds", "matplotlib.reds". The first one, "brewer.reds", is returned.

## 
## ### Pov Map Map
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Blues" is named
## "brewer.blues"
## Multiple palettes called "blues" found: "brewer.blues", "matplotlib.blues". The first one, "brewer.blues", is returned.

## 
## ### Uer Map Map
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "YlOrRd" is named
## "brewer.yl_or_rd"
## Multiple palettes called "yl_or_rd" found: "brewer.yl_or_rd", "matplotlib.yl_or_rd". The first one, "brewer.yl_or_rd", is returned.

## 
## ### Age65 Map Map
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Purples" is named
## "brewer.purples"
## Multiple palettes called "purples" found: "brewer.purples", "matplotlib.purples". The first one, "brewer.purples", is returned.

## 
## ### Hotspot Map Map

## 
## ### Combined Map Panel
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Reds" is named
## "brewer.reds"
## Multiple palettes called "reds" found: "brewer.reds", "matplotlib.reds". The first one, "brewer.reds", is returned.
## 
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Blues" is named
## "brewer.blues"
## Multiple palettes called "blues" found: "brewer.blues", "matplotlib.blues". The first one, "brewer.blues", is returned.
## 
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "YlOrRd" is named
## "brewer.yl_or_rd"
## Multiple palettes called "yl_or_rd" found: "brewer.yl_or_rd", "matplotlib.yl_or_rd". The first one, "brewer.yl_or_rd", is returned.
## 
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Purples" is named
## "brewer.purples"
## Multiple palettes called "purples" found: "brewer.purples", "matplotlib.purples". The first one, "brewer.purples", is returned.
## 
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.

Model Comparison

# Prepare Model Comparison Table
if(exists("spatial_lag") && exists("spatial_error") && exists("ardl_model") && exists("model")) {
  
  # Create comparison dataframe with enhanced formatting
  model_comp <- modelsummary(
    list("Linear" = model,
         "ARDL" = ardl_model,
         "Spatial Lag" = spatial_lag, 
         "Spatial Error" = spatial_error),
    output = "data.frame",
    stars = TRUE,
    title = "Model Comparison Across Specifications",
    coef_rename = c(
      "(Intercept)" = "Intercept",
      "POV" = "Poverty Rate",
      "UER" = "Unemployment Rate",
      "AGE65" = "Population 65+",
      "lag.HTP" = "Spatial Lag (ρ)",
      "rho" = "Spatial Error (λ)"
    ),
    gof_map = c("nobs", "r.squared", "adj.r.squared", "sigma", "logLik", "AIC", "BIC")
  )
  
  #Display formatted table with conditional coloring
  cat("### Comparative Model Performance\n")
  model_table <- knitr::kable(model_comp, caption = "Comparison of Model Coefficients and Fit Statistics") %>%
    kableExtra::kable_styling(
      bootstrap_options = c("striped", "hover", "condensed"),
      font_size = 11,
      full_width = FALSE,
      position = "center"
    ) %>%
    kableExtra::column_spec(1, bold = TRUE, width = "3cm") %>%
    kableExtra::column_spec(2:5, width = "2.5cm") %>%
    kableExtra::footnote(
      general = "Standard errors in parentheses; * p<0.1, ** p<0.05, *** p<0.01",
      footnote_as_chunk = TRUE
    )
  
  # Highlight best performing model for each fit statistic
  if("AIC" %in% model_comp$term) {
    aic_values <- as.numeric(model_comp[model_comp$term == "AIC", 2:5])
    best_aic <- which.min(aic_values) + 1  # +1 to account for term column
    model_table <- model_table %>% 
      kableExtra::row_spec(which(model_comp$term == "AIC"), 
                          background = ifelse(1:4 == best_aic, "#E6F3E6", NA))
  }
  
  if("BIC" %in% model_comp$term) {
    bic_values <- as.numeric(model_comp[model_comp$term == "BIC", 2:5])
    best_bic <- which.min(bic_values) + 1
    model_table <- model_table %>% 
      kableExtra::row_spec(which(model_comp$term == "BIC"), 
                          background = ifelse(1:4 == best_bic, "#E6F3E6", NA))
  }
  
  print(model_table)
  
  # Export to CSV with additional metadata
  #write.csv(model_comp, "outputs/model_comparison.csv", row.names = FALSE)
  
  # Create simplified coefficient comparison
  coef_comparison <- model_comp %>%
    filter(!term %in% c("AIC", "BIC", "R2", "Log.Lik", "sigma", "nobs", "adj.r.squared")) %>%
    select(-part)
  
  cat("\n### Key Coefficient Comparison\n")
  knitr::kable(coef_comparison, digits = 3) %>%
    kableExtra::kable_styling() %>%
    kableExtra::column_spec(1, bold = TRUE) %>%
    print()
  
} else {
  cat("\nNote: Not all models available for comparison\n")
}
## ### Comparative Model Performance
## <table class="table table-striped table-hover table-condensed" style="font-size: 11px; width: auto !important; margin-left: auto; margin-right: auto;border-bottom: 0;">
## <caption style="font-size: initial !important;">Comparison of Model Coefficients and Fit Statistics</caption>
##  <thead>
##   <tr>
##    <th style="text-align:left;"> part </th>
##    <th style="text-align:left;"> term </th>
##    <th style="text-align:left;"> statistic </th>
##    <th style="text-align:left;"> Linear </th>
##    <th style="text-align:left;"> ARDL </th>
##    <th style="text-align:left;"> Spatial Lag </th>
##    <th style="text-align:left;"> Spatial Error </th>
##   </tr>
##  </thead>
## <tbody>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Intercept </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; "> 7.873*** </td>
##    <td style="text-align:left;width: 2.5cm; "> 13.758*** </td>
##    <td style="text-align:left;"> -0.011 </td>
##    <td style="text-align:left;"> -0.046 </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Intercept </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.501) </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.545) </td>
##    <td style="text-align:left;"> (0.034) </td>
##    <td style="text-align:left;"> (0.192) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Poverty Rate </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.352*** </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.282*** </td>
##    <td style="text-align:left;"> 0.225*** </td>
##    <td style="text-align:left;"> 0.245*** </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Poverty Rate </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.015) </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.017) </td>
##    <td style="text-align:left;"> (0.044) </td>
##    <td style="text-align:left;"> (0.049) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Unemployment Rate </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.723*** </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.914*** </td>
##    <td style="text-align:left;"> 0.047 </td>
##    <td style="text-align:left;"> 0.024 </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Unemployment Rate </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.054) </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.061) </td>
##    <td style="text-align:left;"> (0.041) </td>
##    <td style="text-align:left;"> (0.040) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> LIN </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.179* </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> LIN </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.079) </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Population 65+ </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.857*** </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;"> 0.200*** </td>
##    <td style="text-align:left;"> 0.154*** </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Population 65+ </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.029) </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;"> (0.035) </td>
##    <td style="text-align:left;"> (0.036) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> L(HTP, 1) </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.260*** </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> L(HTP, 1) </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.021) </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> L(UER, 1) </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; "> -0.165** </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> L(UER, 1) </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; "> (0.059) </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Spatial Error (λ) </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;"> 0.688*** </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> Spatial Error (λ) </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;"> (0.046) </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> lambda </td>
##    <td style="text-align:left;width: 2.5cm; "> estimate </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> 0.820*** </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> estimates </td>
##    <td style="text-align:left;width: 2.5cm; "> lambda </td>
##    <td style="text-align:left;width: 2.5cm; "> std.error </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> (0.043) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> gof </td>
##    <td style="text-align:left;width: 2.5cm; "> Num.Obs. </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; "> 1867 </td>
##    <td style="text-align:left;width: 2.5cm; "> 1866 </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> gof </td>
##    <td style="text-align:left;width: 2.5cm; "> R2 </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.618 </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.492 </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> gof </td>
##    <td style="text-align:left;width: 2.5cm; "> R2 Adj. </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.617 </td>
##    <td style="text-align:left;width: 2.5cm; "> 0.491 </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;width: 3cm; font-weight: bold;"> gof </td>
##    <td style="text-align:left;width: 2.5cm; "> Log.Lik. </td>
##    <td style="text-align:left;width: 2.5cm; ">  </td>
##    <td style="text-align:left;width: 2.5cm; "> -6033.352 </td>
##    <td style="text-align:left;width: 2.5cm; "> -6296.153 </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
## </tbody>
## <tfoot><tr><td style="padding: 0; " colspan="100%">
## <span style="font-style: italic;">Note: </span> <sup></sup> Standard errors in parentheses; * p&lt;0.1, ** p&lt;0.05, *** p&lt;0.01</td></tr></tfoot>
## </table>
## ### Key Coefficient Comparison
## <table class="table" style="margin-left: auto; margin-right: auto;">
##  <thead>
##   <tr>
##    <th style="text-align:left;"> term </th>
##    <th style="text-align:left;"> statistic </th>
##    <th style="text-align:left;"> Linear </th>
##    <th style="text-align:left;"> ARDL </th>
##    <th style="text-align:left;"> Spatial Lag </th>
##    <th style="text-align:left;"> Spatial Error </th>
##   </tr>
##  </thead>
## <tbody>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Intercept </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;"> 7.873*** </td>
##    <td style="text-align:left;"> 13.758*** </td>
##    <td style="text-align:left;"> -0.011 </td>
##    <td style="text-align:left;"> -0.046 </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Intercept </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;"> (0.501) </td>
##    <td style="text-align:left;"> (0.545) </td>
##    <td style="text-align:left;"> (0.034) </td>
##    <td style="text-align:left;"> (0.192) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Poverty Rate </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;"> 0.352*** </td>
##    <td style="text-align:left;"> 0.282*** </td>
##    <td style="text-align:left;"> 0.225*** </td>
##    <td style="text-align:left;"> 0.245*** </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Poverty Rate </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;"> (0.015) </td>
##    <td style="text-align:left;"> (0.017) </td>
##    <td style="text-align:left;"> (0.044) </td>
##    <td style="text-align:left;"> (0.049) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Unemployment Rate </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;"> 0.723*** </td>
##    <td style="text-align:left;"> 0.914*** </td>
##    <td style="text-align:left;"> 0.047 </td>
##    <td style="text-align:left;"> 0.024 </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Unemployment Rate </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;"> (0.054) </td>
##    <td style="text-align:left;"> (0.061) </td>
##    <td style="text-align:left;"> (0.041) </td>
##    <td style="text-align:left;"> (0.040) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> LIN </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;"> 0.179* </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> LIN </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;"> (0.079) </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Population 65+ </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;"> 0.857*** </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> 0.200*** </td>
##    <td style="text-align:left;"> 0.154*** </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Population 65+ </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;"> (0.029) </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> (0.035) </td>
##    <td style="text-align:left;"> (0.036) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> L(HTP, 1) </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> 0.260*** </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> L(HTP, 1) </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> (0.021) </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> L(UER, 1) </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> -0.165** </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> L(UER, 1) </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> (0.059) </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Spatial Error (λ) </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> 0.688*** </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Spatial Error (λ) </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> (0.046) </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> lambda </td>
##    <td style="text-align:left;"> estimate </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> 0.820*** </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> lambda </td>
##    <td style="text-align:left;"> std.error </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> (0.043) </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Num.Obs. </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> 1867 </td>
##    <td style="text-align:left;"> 1866 </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> R2 Adj. </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> 0.617 </td>
##    <td style="text-align:left;"> 0.491 </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;font-weight: bold;"> Log.Lik. </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;"> -6033.352 </td>
##    <td style="text-align:left;"> -6296.153 </td>
##    <td style="text-align:left;">  </td>
##    <td style="text-align:left;">  </td>
##   </tr>
## </tbody>
## </table>

Statistically Significant Variables(<0.05)

library(modelsummary)
library(broom)
library(dplyr)

# Create model list with original models
model_list <- list(
  "Linear" = model,
  "ARDL" = ardl_model,
  "Spatial Lag" = spatial_lag, 
  "Spatial Error" = spatial_error
)

# Define methods for each model type
tidy_custom.spatialreg <- function(model, ...) {
  broom::tidy(model) %>% 
    filter(p.value < 0.05) %>% 
    mutate(term = case_when(
      term == "rho" ~ "Spatial Lag (ρ)",
      term == "lambda" ~ "Spatial Error (λ)",
      TRUE ~ term
    ))
}

tidy_custom.default <- function(model, ...) {
  broom::tidy(model) %>% 
    filter(p.value < 0.05) %>% 
    mutate(term = case_when(
      term == "L(HTP, 1)" ~ "Lagged HTP",
      term == "L(UER, 1)" ~ "Lagged Unemployment",
      TRUE ~ term
    ))
}

# Single modelsummary call with all parameters
modelsummary(
  model_list,
  stars = c('+' = 0.1, '*' = 0.05, '**' = 0.01, '***' = 0.001),
  coef_map = c(
    "POV" = "Poverty Rate",
    "UER" = "Unemployment Rate", 
    "AGE65" = "Age 65+",
    "LIN" = "No Health Insurance",
    "Lagged HTP" = "Lagged Hypertension",
    "Lagged Unemployment" = "Lagged Unemployment",
    "Spatial Lag (ρ)" = "Spatial Lag (ρ)", 
    "Spatial Error (λ)" = "Spatial Error (λ)"
  ),
  gof_map = c("nobs", "r.squared", "aic", "bic"),
  title = "Model Comparison: Statistically Significant Coefficients (p < 0.05)",
  notes = list(
    "Standard errors in parentheses",
    "Only coefficients with p < 0.05 shown",
    "+ p<0.1, * p<0.05, ** p<0.01, *** p<0.001"
  )
)
Model Comparison: Statistically Significant Coefficients (p < 0.05)
Linear ARDL Spatial Lag Spatial Error
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
Standard errors in parentheses
Only coefficients with p < 0.05 shown
+ p<0.1, * p<0.05, ** p<0.01, *** p<0.001
Poverty Rate 0.352*** 0.282*** 0.225*** 0.245***
(0.015) (0.017) (0.044) (0.049)
Unemployment Rate 0.723*** 0.914*** 0.047 0.024
(0.054) (0.061) (0.041) (0.040)
Age 65+ 0.857*** 0.200*** 0.154***
(0.029) (0.035) (0.036)
No Health Insurance 0.179*
(0.079)
Num.Obs. 1867 1866
R2 0.618 0.492
AIC 12078.7 12604.3 200.1 220.5
BIC 12111.9 12637.5 218.2 238.7