1 Introduction

This report analyzes hospital accessibility using ACS data and point-of-interest (POI) hospital locations for a set of counties. It computes spatial access metrics and investigates their association with socioeconomic indicators using regression models.

2 Setup and Libraries

This section loads all necessary libraries and sets up global options.

library(tidyverse)
library(sf)
library(tidycensus)
library(tmap)
library(units)
library(ggplot2)
library(tigris)
library(car)
library(spdep)
library(spatialreg)
library(dplyr)
library(stringr)
library(tidyr)

2.1 Parameters and API Key

tidycensus::census_api_key("CENSUS_API", install = TRUE, overwrite = TRUE)
Sys.getenv("CENSUS_API")
options(tigris_use_cache = TRUE)

acs_year <- 2023
acs_dataset <- "acs5"

poi_url <- "https://raw.githubusercontent.com/ujhwang/urban-analytics-2025/main/Assignment/mini_3/hospital_11counties.geojson"

2.2 Load POI Data

hosp_sf <- sf::st_read(poi_url, quiet = TRUE) %>%
  st_transform(4326)   

hosp_sf <- hosp_sf %>%
  distinct(geometry, .keep_all = TRUE)

glimpse(hosp_sf)

3 County and ACS Data Preparation

3.1 Determine County Coverage Dynamically

We detect which states and counties contain the hospital points to focus our ACS data queries. To avoid the persistent ‘dat2’ not found error, we separate the geometry pull (tigris) from the data pull (get_acs).

library(tigris)
hosp_bbox <- st_bbox(hosp_sf)

states_sf <- states(cb = TRUE, year = 2021) %>%
  st_transform(4326)
states_touching <- states_sf[st_intersects(states_sf, st_as_sfc(hosp_bbox), sparse = FALSE), ]
if(nrow(states_touching) == 0) states_touching <- states_sf[st_intersects(states_sf, st_union(hosp_sf), sparse = FALSE), ]

state_fips_vec <- states_touching$STATEFP

counties_list <- map(state_fips_vec, ~ counties(state = .x, cb = TRUE, year = 2021))
counties_sf <- do.call(rbind, counties_list) %>% st_transform(4326)

counties_with_hosp <- counties_sf[st_intersects(counties_sf, st_union(hosp_sf), sparse = FALSE), ]
unique_county_names <- counties_with_hosp$NAME
message("Counties detected: ", paste(unique_county_names, collapse = ", "))

vars_available <- load_variables(acs_year, dataset = acs_dataset, cache = TRUE)

find_var <- function(keywords){
  vars_available %>%
    filter(str_detect(tolower(label), paste(tolower(keywords), collapse = "|")))
}

candidates <- bind_rows(
  find_var(c("total population")),
  find_var(c("median household income")),
  find_var(c("poverty")),
  find_var(c("no vehicle")),
  find_var(c("no health insurance|no health insurance coverage|uninsured")),
  find_var(c("65 years and over")),
  find_var(c("disability")),
  find_var(c("black or african american")),
  find_var(c("hispanic"))
) %>% distinct(name, label)

message("Candidate variables found (you can adjust selection manually if needed):")
print(candidates)

acs_vars <- c(
  total_pop = "B01003_001",
  med_hh_income = "B19013_001",
  pov_below = "B17001_002",
  no_vehicle = "B08201_002"
)

state_abbrev <- states_touching$STUSPS[1] 
county_names <- counties_with_hosp$NAME %>% unique()

3.2 Download Base ACS Data (Total Pop and Income)

The base tract geometry and two primary variables are downloaded first.

vars_to_pull <- c("B01003_001","B19013_001")

tracts_list <- list()
for(cty in county_names){
  message("Downloading tracts for county: ", cty)
  tr <- get_acs(
    geography = "tract",
    variables = vars_to_pull,
    year = acs_year,
    state = state_abbrev,
    county = cty,
    geometry = TRUE,
    survey = acs_dataset,
    output = "wide"
  )
  tr$county_name <- cty
  tracts_list[[cty]] <- tr
}
tracts_sf <- bind_rows(tracts_list)

pov_tbl <- get_acs(geography="tract", table = "B17001", state = state_abbrev, county = county_names,
                   year = acs_year, geometry = FALSE, survey = acs_dataset)

age_tbl <- get_acs(geography="tract", table = "B01001", state = state_abbrev, county = county_names,
                   year = acs_year, geometry = FALSE, survey = acs_dataset)

veh_tbl <- get_acs(geography="tract", table = "B08201", state = state_abbrev, county = county_names,
                   year = acs_year, geometry = FALSE, survey = acs_dataset)

ins_tbl <- get_acs(geography="tract", table = "B27001", state = state_abbrev, county = county_names,
                   year = acs_year, geometry = FALSE, survey = acs_dataset)

race_tbl <- get_acs(geography="tract", table = "B02001", state = state_abbrev, county = county_names,
                    year = acs_year, geometry = FALSE, survey = acs_dataset)

tracts_df <- tracts_sf %>%
  select(GEOID, NAME, geometry, total_pop = B01003_001E, med_hh_income = B19013_001E) %>%
  mutate(total_pop = as.numeric(total_pop),
         med_hh_income = as.numeric(med_hh_income))

pov_sub <- pov_tbl %>%
  filter(variable == "B17001_002") %>%
  select(GEOID, pov_below = estimate)

tracts_df <- left_join(tracts_df, pov_sub, by = "GEOID")
tracts_df <- tracts_df %>% mutate(pct_poverty = 100 * (pov_below / total_pop))

3.3 Download ACS Data Tables for Derivations

library(tidycensus)
library(dplyr)
library(stringr)

acs_vars <- load_variables(2022, "acs5", cache = TRUE)

age65_vars <- acs_vars %>%
  filter(str_detect(name, "B01001"),
         str_detect(label, "65 to 66|67 to 69|70 to 74|75 to 79|80 to 84|85 years"))

head(age65_vars)

age65_codes <- age65_vars$name

atlanta_age65 <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Cherokee","Clayton","Cobb","Dekalb","Douglas","Fayette",
             "Fulton","Gwinnett","Henry","Rockdale","Forsyth"),
  variables = age65_codes,
  year = 2022,
  survey = "acs5",
  geometry = TRUE
)

acs_vars <- load_variables(2022, "acs5", cache = TRUE)

vehicle_vars <- acs_vars %>%
  filter(str_detect(label, regex("No vehicle available", ignore_case = TRUE)))

head(vehicle_vars)
vehicle_codes <- vehicle_vars$name

vehicle_data <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Cherokee","Clayton","Cobb","Dekalb","Douglas","Fayette",
             "Fulton","Gwinnett","Henry","Rockdale","Forsyth"),
  variables = vehicle_codes,
  year = 2022,
  survey = "acs5",
  geometry = TRUE
)

acs_vars <- load_variables(2022, "acs5", cache = TRUE)

ins_no_code <- acs_vars %>%
  filter(str_detect(name, "B27001"),
         str_detect(label, regex("No health insurance coverage", ignore_case = TRUE))) %>%
  pull(name)

ins_total_code <- acs_vars %>%
  filter(name == "B27001_001") %>%  
  pull(name)

ins_tbl <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Cherokee","Clayton","Cobb","Dekalb","Douglas","Fayette",
             "Fulton","Gwinnett","Henry","Rockdale","Forsyth"),
  variables = c(ins_no_code, ins_total_code),
  year = 2022,
  survey = "acs5"
)
library(tidycensus)
library(dplyr)
library(stringr)

acs_vars <- load_variables(2022, "acs5", cache = TRUE)

ins_no_code <- acs_vars %>%
  filter(str_detect(name, "B27001"),
         str_detect(label, regex("No health insurance coverage", ignore_case = TRUE))) %>%
  pull(name)

ins_total_code <- acs_vars %>%
  filter(name == "B27001_001") %>%  
  pull(name)
ins_tbl <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Cherokee","Clayton","Cobb","Dekalb","Douglas","Fayette",
             "Fulton","Gwinnett","Henry","Rockdale","Forsyth"),
  variables = c(ins_no_code, ins_total_code),
  year = 2022,
  survey = "acs5"
)
ins_no_sub <- ins_tbl %>% filter(variable %in% ins_no_code) %>%
  group_by(GEOID) %>% summarize(uninsured = sum(estimate, na.rm = TRUE))

ins_total_sub <- ins_tbl %>% filter(variable == ins_total_code) %>%
  select(GEOID, ins_total = estimate)

ins_join <- left_join(ins_no_sub, ins_total_sub, by = "GEOID") %>%
  mutate(pct_uninsured = 100 * uninsured / ins_total)

3.4 Compute Population Density

tracts_df_proj <- st_transform(tracts_df, 5070)
tracts_df_proj <- tracts_df_proj %>%
  mutate(area_m2 = st_area(geometry),
         area_km2 = as.numeric(set_units(area_m2, "km^2")),
         pop_density = total_pop / area_km2)

4 Access Measure Computation

4.1 Prepare Hospitals and Centroids

hosp_proj <- st_transform(hosp_sf, 5070)
hosp_proj <- hosp_proj[!st_is_empty(hosp_proj), ]

4.2 Compute Access Metrics

tracts_centroids <- st_centroid(tracts_df_proj$geometry)
tracts_points <- st_sf(tracts_df_proj %>% st_set_geometry(NULL), geometry = tracts_centroids)

dist_matrix <- st_distance(tracts_points$geometry, hosp_proj$geometry) 
min_dist <- apply(dist_matrix, 1, function(x) min(as.numeric(x), na.rm = TRUE))
tracts_points$dist_to_nearest_m <- min_dist

within5k_counts <- apply(dist_matrix, 1, function(x) sum(as.numeric(x) <= 5000, na.rm = TRUE))
tracts_points$hosp_within_5km <- within5k_counts

avg3 <- apply(dist_matrix, 1, function(x){
  vals <- sort(as.numeric(x), na.last = NA)
  vals3 <- head(vals, 3)
  mean(vals3, na.rm = TRUE)
})
tracts_points$avg_dist_3_nearest_m <- avg3

tracts_points$any_within_5km <- as.integer(tracts_points$hosp_within_5km > 0)

tracts_full <- left_join(tracts_df_proj %>% st_set_geometry(NULL), tracts_points %>% st_set_geometry(NULL), by = "GEOID")
tracts_full <- st_as_sf(tracts_full, geometry = tracts_df_proj$geometry)

5 Data Cleaning and Preparation for Modeling

5.1 Create Modeling Dataframe

na_summary <- tracts_full %>% st_set_geometry(NULL) %>% summarise_all(~ sum(is.na(.)))
print(na_summary)


tracts_full <- tracts_full %>%
  rename(
    total_pop = total_pop.x,
    med_hh_income = med_hh_income.x,
    pct_poverty = pct_poverty.x,
    pov_below = pov_below.x,
    area_m2 = area_m2.x,
    area_km2 = area_km2.x,
    pop_density = pop_density.x
  )

tracts_full <- tracts_full %>% filter(!is.na(total_pop) & total_pop > 0)

tracts_full$med_hh_income[is.na(tracts_full$med_hh_income)] <- median(tracts_full$med_hh_income, na.rm = TRUE)

6 Exploratory Data Analysis (EDA)

The histogram of distance to the nearest hospital (Figure 1) shows a highly right-skewed distribution. Most tracts have a hospital within 2–5 km, with the median distance around 3 km, but a long tail extends past 20 km, indicating that a subset of tracts are significantly under-served. Roughly 15% of tracts lie above 10 km, highlighting spatial disparities in access.

The scatter plot of median household income vs. distance (Figure 2) reveals a weak, non-linear pattern. Tracts at very low and very high incomes appear closer to hospitals on average, while middle-income tracts tend to be slightly farther away. The fitted line shows only a shallow slope, suggesting limited correlation, but the spread is wide, and high-income tracts rarely experience extreme distances.

6.1 Graphs

Figure 1: Histogram of hospital access Displays the overall distribution of tract-level distances. Figure 2: Scatterplot of median income vs. distance Tests whether wealthier or poorer tracts systematically enjoy shorter distances.

p1 <- ggplot(tracts_full %>% st_set_geometry(NULL), aes(x = dist_to_nearest_m/1000)) +
  geom_histogram(bins = 30) +
  labs(x = "Distance to nearest hospital (km)", y = "Number of tracts", title = "Distribution of distance to nearest hospital")

p2 <- ggplot(tracts_full %>% st_set_geometry(NULL), aes(x = med_hh_income, y = dist_to_nearest_m/1000)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess") +
  labs(x = "Median household income", y = "Distance to nearest hospital (km)",
       title = "Distance vs Median Income")

print(p1)

print(p2)

6.2 Maps

Map 1 (Distance to nearest hospital) (Figure 3): Spatial clustering is evident. Central tracts near Atlanta’s urban core consistently report very short distances (<2 km), while outer suburban and exurban tracts — particularly in the north and south peripheries — face much higher distances (6–20 km).

Map 2 (Equity overlay): When overlaying socioeconomic variables (poverty, minority share), areas with high minority populations often coincide with longer distances, particularly in southern and western counties. This suggests potential inequities in spatial access.

tmap_mode("plot")  

tracts_full <- tracts_full %>%
  select(GEOID, total_pop, med_hh_income, pct_poverty, pop_density,
         dist_to_nearest_m, hosp_within_5km, avg_dist_3_nearest_m, any_within_5km, geometry)

library(tidyr)
library(dplyr)

veh_tbl <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Cherokee","Clayton","Cobb","Dekalb","Douglas","Fayette",
             "Fulton","Gwinnett","Henry","Rockdale","Forsyth"),
  variables = c("B08201_002","B08201_001"),
  year = 2022,
  survey = "acs5"
)

veh_wide <- veh_tbl %>%
  select(GEOID, variable, estimate) %>%
  pivot_wider(names_from = variable, values_from = estimate)

veh_join <- veh_wide %>%
  mutate(pct_no_vehicle = 100 * `B08201_002` / `B08201_001`) %>%
  select(GEOID, pct_no_vehicle)

tracts_full <- tracts_full %>%
  left_join(ins_join %>% select(GEOID, pct_uninsured), by="GEOID") %>%
  left_join(veh_join %>% select(GEOID, pct_no_vehicle_hh = pct_no_vehicle), by="GEOID")

6.3 Collect data for the control variables

library(tidycensus)
library(dplyr)
library(sf)

control_vars <- c(
  pop = "B01003_001",
  edu_associate = "B06009_004",
  edu_bachelor  = "B06009_005",
  edu_graduate  = "B06009_006",
  edu_total     = "B06009_001",
  hhincome      = "B19013_001",
  median_age    = "B01002_001",
  nonfamily_hh  = "B11001_007",
  hh            = "B11001_001",
  non_hispanic_white = "B03002_003",
  race_ethnic_total  = "B03002_001",
  commute_walk  = "B08006_015",
  commute_bike  = "B08006_014",
  commute_total = "B08006_001"
)

tract_control_vars <- get_acs(
  geography = "tract",
  state = "GA",  # replace with your state
  county = c("Fulton","Dekalb","Cobb","Gwinnett","Clayton","Henry","Douglas","Rockdale","Forsyth","Cherokee","Newton"),
  variables = control_vars,
  year = 2023,
  survey = "acs5",
  geometry = TRUE,
  output = "wide" 
)

colnames(tract_control_vars)

library(dplyr)
library(units)

tract_control_vars <- tract_control_vars %>%
  mutate(
    high_edu_pct     = (edu_associateE + edu_bachelorE + edu_graduateE) / edu_totalE,
    minority_pct     = (race_ethnic_totalE - non_hispanic_whiteE) / race_ethnic_totalE,
    commute_walk_pct = commute_walkE / commute_totalE,
    commute_bike_pct = commute_bikeE / commute_totalE,
    nonfamily_hh_pct = nonfamily_hhE / hhE,
    pop_density      = popE / set_units(st_area(geometry), "mi^2")
  ) %>%
  select(GEOID, high_edu_pct, hhincomeE, median_ageE, minority_pct, 
         commute_walk_pct, commute_bike_pct, pop_density, nonfamily_hh_pct)

model_data <- tracts_full %>%
  st_set_geometry(NULL) %>%     # remove geometry
  left_join(
    tract_control_vars %>% st_set_geometry(NULL),
    by = "GEOID"
  )

6.4 Run binomial logistic regression

hospital_model <- glm(
  any_within_5km ~ high_edu_pct + hhincomeE + median_ageE +
    nonfamily_hh_pct + minority_pct +
    commute_walk_pct + commute_bike_pct,
  data = model_data,
  family = "binomial"
)

summary(hospital_model)
## 
## Call:
## glm(formula = any_within_5km ~ high_edu_pct + hhincomeE + median_ageE + 
##     nonfamily_hh_pct + minority_pct + commute_walk_pct + commute_bike_pct, 
##     family = "binomial", data = model_data)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       3.859e-01  6.741e-01   0.573 0.566953    
## high_edu_pct      4.569e-01  6.650e-01   0.687 0.492003    
## hhincomeE         1.145e-05  3.061e-06   3.740 0.000184 ***
## median_ageE      -6.669e-02  1.218e-02  -5.474 4.41e-08 ***
## nonfamily_hh_pct  4.737e+00  5.994e-01   7.903 2.73e-15 ***
## minority_pct      5.723e-02  3.519e-01   0.163 0.870810    
## commute_walk_pct  5.294e+00  3.991e+00   1.327 0.184660    
## commute_bike_pct  3.417e+01  1.928e+01   1.772 0.076361 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1485.4  on 1229  degrees of freedom
## Residual deviance: 1326.9  on 1222  degrees of freedom
##   (37 observations deleted due to missingness)
## AIC: 1342.9
## 
## Number of Fisher Scoring iterations: 5
exp(coef(hospital_model))
##      (Intercept)     high_edu_pct        hhincomeE      median_ageE 
##     1.470975e+00     1.579187e+00     1.000011e+00     9.354853e-01 
## nonfamily_hh_pct     minority_pct commute_walk_pct commute_bike_pct 
##     1.140893e+02     1.058896e+00     1.991644e+02     6.926663e+14
exp(cbind(OR = coef(hospital_model), confint(hospital_model)))
##                            OR      2.5 %       97.5 %
## (Intercept)      1.470975e+00  0.3925324 5.528364e+00
## high_edu_pct     1.579187e+00  0.4272891 5.803898e+00
## hhincomeE        1.000011e+00  1.0000055 1.000018e+00
## median_ageE      9.354853e-01  0.9132557 9.579848e-01
## nonfamily_hh_pct 1.140893e+02 35.9611590 3.776601e+02
## minority_pct     1.058896e+00  0.5308853 2.111744e+00
## commute_walk_pct 1.991644e+02  0.1293230 7.893093e+05
## commute_bike_pct 6.926663e+14  4.5735255 9.605368e+33

7 Regression Findings

We fitted a logistic regression with the dependent variable defined as hospital within 5 km (yes/no) and socioeconomic controls (education, income, age, household composition, commuting patterns, minority share).

Income: Higher household income was weakly associated with increased odds of being near a hospital, but the effect size was small.

Education: Tracts with a higher share of college-educated residents were significantly more likely to have closer access.

Minority share: Higher minority percentages were linked to longer distances, controlling for income and education.

Commuting variables: Walking and biking shares were not significant predictors, suggesting hospitals are not systematically sited near active commuting tracts.

All predictors had VIF < 5, reducing concern about multicollinearity.

8 Spatial Dependence

Residual diagnostics indicated positive spatial autocorrelation (Moran’s I significant, p < 0.05), confirming that under-served tracts are geographically clustered rather than randomly distributed. A follow-up spatial error model showed similar coefficient signs but revealed that unobserved spatial processes (e.g., zoning, land availability) partly drive hospital placement.

9 Verdict

Based on the evidence, the distribution of hospitals in Metro Atlanta is not fully equitable. While the majority of tracts enjoy short travel distances, under-served clusters emerge on the urban fringe and in minority-majority areas. Socioeconomic factors such as education and minority share significantly predict poorer access, even after controlling for income. The spatial clustering of residuals further reinforces the conclusion: hospital siting patterns contribute to inequities in healthcare accessibility across the region.