#1. Import the hospital POI dataand make sure it is tidy and ready for analysis.
#load JSON files 
json <- fromJSON("https://raw.githubusercontent.com/ujhwang/urban-analytics-2025/main/Assignment/mini_3/hospital_11counties.geojson", simplifyVector = F)
str(json)

not_flat <- fromJSON("https://raw.githubusercontent.com/ujhwang/urban-analytics-2025/main/Assignment/mini_3/hospital_11counties.geojson", flatten = F)
glimpse(not_flat)

flat <- fromJSON("https://raw.githubusercontent.com/ujhwang/urban-analytics-2025/main/Assignment/mini_3/hospital_11counties.geojson", flatten = T)
glimpse(flat)
#bring JSON data in a readable format for data analysis 
#put the features list into a tibble
df <- tibble(features = not_flat$features)

#unnest features into columns
unnested_long_wide <- df %>%
  unnest_wider(features) %>%
  unnest_wider(properties)   

kable(unnested_long_wide)
#clean data 
poi <- unnested_long_wide %>%
  filter(!is.na(name) & !sapply(geometry$coordinates, function(x) any(is.na(x)))) 

poi_unique <- poi %>%
  distinct(name, .keep_all = TRUE)
glue("Before dropping duplicated rows, there were {nrow(poi)} rows. After dropping them, there are {nrow(poi_unique)} rows.")
## Before dropping duplicated rows, there were 119 rows. After dropping them, there are 104 rows.
#2. Explore variables in the Census ACS 5-year estimates, carefully select those that are relevant for an equity analysis, and provide justification for each variable you select.
#Think about how each variable captures different circumstances and needs for healthcare.

### Get Census data
# Define API key
census_api_key(Sys.getenv("MY_API_KEY"))
## To install your API key for use in future sessions, run this function with `install = TRUE`.
#census_api_key("698568bc0852158fccf6044f4cd5ec586b74b2e0", install = TRUE, overwrite = TRUE)

# Pull data from ACS
acs_variables <- load_variables(year = 2023, dataset = "acs5", cache = TRUE)

#3. Download ACS data for the selected variables at the Census Tract level for the 11 counties using the tidycensus.
#Pull variable 
race_eth <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Cherokee", "Clayton", "Cobb", "DeKalb", "Douglas", "Fayette", 
             "Forsyth", "Fulton", "Gwinnett", "Henry", "Rockdale"),
  variables = c(total = "B03002_001",
    hisp = "B03002_012",
    non_hisp_white = "B03002_003",
    non_hisp_black = "B03002_004",
    non_hisp_asian = "B03002_006"),
  year = 2023,
  survey = "acs5",
  geometry = TRUE,
  output = "wide"
)
## Getting data from the 2019-2023 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
race_eth <- race_eth %>%
  mutate(
    tot_indv = totalE,
    hisp = hispE,
    non_hisp_white = non_hisp_whiteE,
    non_hisp_black = non_hisp_blackE,
    non_hisp_asian = non_hisp_asianE,
    non_hisp_other = tot_indv - (hisp + non_hisp_white + non_hisp_black + non_hisp_asian)
  ) %>%
  select(GEOID, NAME, tot_indv, hisp, non_hisp_white, non_hisp_black, non_hisp_asian, non_hisp_other, geometry)


hh_inc <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Cherokee", "Clayton", "Cobb", "DeKalb", "Douglas", "Fayette", 
             "Forsyth", "Fulton", "Gwinnett", "Henry", "Rockdale"),
  variables = c(median_hhinc = "B19013_001E"),
  year = 2023,
  survey = "acs5",
  geometry = TRUE,
  output = "wide"
)
## Getting data from the 2019-2023 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
edu <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Cherokee", "Clayton", "Cobb", "DeKalb", "Douglas", "Fayette", 
             "Forsyth", "Fulton", "Gwinnett", "Henry", "Rockdale"),
  variables = c(
    edu_associate = "B06009_004E",
    edu_bachelor  = "B06009_005E",
    edu_graduate  = "B06009_006E",
    edu_total     = "B06009_001E"
  ),
  year = 2023,
  survey = "acs5",
  geometry = TRUE,
  output = "wide"
)
## Getting data from the 2019-2023 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
edu <- edu %>%
  mutate(
    high_edu_pct = round((edu_associate + edu_bachelor + edu_graduate) / edu_total * 100)
  )

acs_joined <- race_eth %>%
  left_join(hh_inc %>% st_drop_geometry(), by = "GEOID") %>%
  left_join(edu %>% st_drop_geometry() %>% select(GEOID, high_edu_pct), by = "GEOID")

***I selected race/ethnicity and household income as key measures of equity. Household income is a particularly strong indicator, as it reflects the distribution of economic resources and the resulting purchasing power that influences access to and demand for quality services — in this case, hospitals and general healthcare. Lower-income households often face barriers, such as limited insurance coverage, higher transportation costs, and fewer healthcare options in their neighborhoods.

In the United States, significant disparities persist across racial and ethnic groups, which are often intertwined with income inequality. For instance, Black and Hispanic populations tend to have lower average household incomes and disproportionately higher rates of being uninsured or underinsured compared to White and Asian populations.

Lastly, I picked educational attainment as the highly educated individuals tend to have higher paying jobs and hence can afford to live in areas that have better access to services such as healthcare. ***

#3. Prepare the data for analysis
acs_joined_clean_na <- acs_joined %>%
  filter(!is.na(`NAME.x`)) 

acs_joined_clean <- acs_joined_clean_na %>%
  distinct(`NAME.x`, .keep_all = TRUE)
glue("Before dropping duplicated rows, there were {nrow(acs_joined_clean_na)} rows. After dropping them, there are {nrow(acs_joined_clean)} rows.")
## Before dropping duplicated rows, there were 1248 rows. After dropping them, there are 1248 rows.
acs_joined_final <- acs_joined_clean %>%
  mutate(
    pct_hisp = round(100 * (hisp /tot_indv), 1),
    pct_non_hisp_white = round(100 * (non_hisp_white /tot_indv), 1),
    pct_non_hisp_black = round(100 * (non_hisp_black/tot_indv), 1),
    pct_non_hisp_asian = round(100 * (non_hisp_asian/tot_indv), 1),
    pct_non_hisp_other = round(100 * (non_hisp_other/tot_indv), 1)
  )

#join poi data with census data 
poi_unique_clean <- unnested_long_wide %>%
  rowwise() %>%
  mutate(
    lon = as.numeric(geometry$coordinates[[1]][1]),
    lat = as.numeric(geometry$coordinates[[1]][2])
  ) %>%
  ungroup() %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  st_transform(2240)

## Make sure the CRS of the two sf objects are in the same PCS
acs_joined_final %<>% st_transform(2240)

# Join hospital poi with tracts
poi_acs_joined_final <- st_join(poi_unique_clean, acs_joined_final)

# Get the presence of hospitals at the tract level
geoid_tracts_with_hospitals <- poi_acs_joined_final %>% 
  filter(!is.na(name)) %>% 
  pull(GEOID) %>% 
  unique()

# Join the presence of hospitals to `tract`
hospital_in_acs_tracts <- acs_joined_final %>% mutate(hospitals = GEOID %in% geoid_tracts_with_hospitals)
#Visualize hospitals in ACS tracts 
tmap_mode("view")

tm_shape(hospital_in_acs_tracts) +
  tm_fill(
    col = "hospitals",
    palette = c("lightgrey", "purple"),  
    title = "Hospital Presence",
    alpha = 0.5                            
  ) +
  tm_borders(col = "darkgray", lwd = 1) +   
tm_shape(poi_unique_clean) +
  tm_dots(
    size = 0.6,
    col = "red",                             
    border.col = "white",                    
    popup.vars = c("name")             
  ) +
  tm_layout(
    title = "Hospitals in Georgia (Census Tracts)",
    legend.outside = TRUE,
    legend.position = c("left", "bottom")
  )

*** I chose to calculate two metrics for this analysis: the presence of a hospital within 5-miles and the distance to the nearest hospital. The first metric provides a simple measure of local access, indicating whether residents have a hospital in close proximity, while the second metric captures finer-grained variation in accessibility by quantifying how far residents must travel to reach the nearest facility. These metrics are particularly useful for examining disparities, as tracts with lower income or lower educational attainment may have fewer hospitals nearby, reflecting inequities in service provision. ***

#5. Define and compute your operational measure of healthcare access (the dependent variable).
#Identify suitable metric(s) and justify your choices. Example metrics include:
#The presence of a hospital within X miles.
#The distance to the nearest hospital.
##Compute your chosen measure(s).

tract_centroids <- st_centroid(hospital_in_acs_tracts)
## Warning: st_centroid assumes attributes are constant over geometries
# Compute distance matrix (in meters)
dist_matrix <- st_distance(tract_centroids, poi_unique_clean)
hospital_in_acs_tracts$dist_to_nearest_hospital_m <- apply(dist_matrix, 1, min)

# Convert to miles 
hospital_in_acs_tracts$dist_to_nearest_hospital_mi <- as.numeric(hospital_in_acs_tracts$dist_to_nearest_hospital_m) / 1609.34
tm_shape(hospital_in_acs_tracts) +
  tm_fill(
    col = "dist_to_nearest_hospital_mi",
    palette = "viridis",
    style = "quantile",
    title = "Distance to Nearest Hospital (miles)"
  ) +
  tm_shape(poi_unique_clean) +
  tm_dots(col = "red", size = 0.3, border.col = "white")
# Distance in meters
five_miles_m <- 5 * 1609.34

# Create circular buffers around each hospital
hospital_buffers <- st_buffer(poi_unique_clean, dist = five_miles_m)

# Align CRS (just in case)
hospital_buffers <- st_transform(hospital_buffers, st_crs(hospital_in_acs_tracts))

hospital_in_acs_tracts <- hospital_in_acs_tracts %>%
  mutate(
    within_5mi_hospital = lengths(st_intersects(geometry, hospital_buffers)) > 0
  )

#Visualize results
tmap_mode("plot")

tm_shape(hospital_in_acs_tracts) +
  tm_fill(
    col = "within_5mi_hospital",
    palette = c("gray90", "skyblue"),
    title = "Within 5 Miles of Hospital",
    labels = c("No", "Yes"),
    legend.is.portrait = TRUE
  ) +
tm_shape(hospital_buffers) +
  tm_borders(col = "red", lwd = 1.5, lty = "dashed") +
tm_shape(poi_unique_clean) +
  tm_dots(
    col = "red",
    size = 0.3,
    border.col = "white"
  ) +
tm_layout(
  title = "Healthcare Access: 5-Mile Hospital Catchment Areas",
  legend.outside = TRUE,
  frame = FALSE,
  bg.color = "white"
)

#6. Analyze the spatial distribution of hospitals from an equity perspective:

#Conduct exploratory data analysis and present at least two graphs and at least two maps.
#Fit appropriate regression model(s) to test associations between healthcare access and socioeconomic factors.
#Conclude with your verdict: “Is the spatial distribution of hospitals in Metro Atlanta equitable?”

hospital_in_acs_tracts <- hospital_in_acs_tracts %>%
  mutate(
    dominant_race = case_when(
      pct_non_hisp_white >= pct_non_hisp_black & pct_non_hisp_white >= pct_non_hisp_asian & pct_non_hisp_white >= pct_hisp ~ "White",
      pct_non_hisp_black >= pct_non_hisp_white & pct_non_hisp_black >= pct_non_hisp_asian & pct_non_hisp_black >= pct_hisp ~ "Black",
      pct_non_hisp_asian >= pct_non_hisp_white & pct_non_hisp_asian >= pct_non_hisp_black & pct_non_hisp_asian >= pct_hisp ~ "Asian",
      pct_hisp >= pct_non_hisp_white & pct_hisp >= pct_non_hisp_black & pct_hisp >= pct_non_hisp_asian ~ "Hispanic",
      TRUE ~ "Other"
    )
  )

hospitals_summary <- hospital_in_acs_tracts %>%
  st_drop_geometry() %>%
  group_by(dominant_race, within_5mi_hospital) %>%
  summarize(n = n(), .groups = "drop") %>%
  group_by(dominant_race) %>%
  mutate(
    proportion = n / sum(n)
  )

#stacked bar plot 
hospitals_summary %>%
  ggplot(aes(x = dominant_race, y = proportion, fill = within_5mi_hospital)) +
  geom_bar(stat = 'identity', position = 'fill') +
  scale_fill_manual(values = c("gray80", "skyblue"), labels = c("No", "Yes")) +
  labs(
    title = "Hospital Access by Dominant Racial Composition of Census Tracts",
    x = "Dominant Racial Group",
    y = "Proportion of Tracts",
    fill = "Within 5 Miles of Hospital"
  ) +
  theme_minimal()

#box plot 
hospital_in_acs_tracts %>%
  st_drop_geometry() %>%
  count(dominant_race)
##   dominant_race   n
## 1         Asian  31
## 2         Black 512
## 3      Hispanic  89
## 4         Other   3
## 5         White 613
hospital_in_acs_tracts %>%
  st_drop_geometry() %>%
  mutate(dominant_race = factor(dominant_race,
                                levels = c("White", "Black", "Asian", "Hispanic", "Other"))) %>%
  ggplot(aes(x = dominant_race, y = median_hhinc, fill = dominant_race)) +
  geom_boxplot(outlier.shape = NA) +  
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Distribution of Median Household Income by Dominant Racial Group",
    x = "Dominant Race in Tract",
    y = "Median Household Income"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Drop geometry for modeling
df_model <- hospital_in_acs_tracts %>%
  st_drop_geometry() %>%
  mutate(
    access = as.numeric(within_5mi_hospital) 
  )

# Fit logistic regression
model <- glm(
  access ~ median_hhinc + pct_non_hisp_white + pct_non_hisp_black + pct_non_hisp_asian + pct_hisp + high_edu_pct,
  data = df_model,
  family = binomial(link = "logit")
)

summary(model)
## 
## Call:
## glm(formula = access ~ median_hhinc + pct_non_hisp_white + pct_non_hisp_black + 
##     pct_non_hisp_asian + pct_hisp + high_edu_pct, family = binomial(link = "logit"), 
##     data = df_model)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.172e+00  1.633e+00  -1.943   0.0520 .  
## median_hhinc       -1.216e-05  2.177e-06  -5.583 2.37e-08 ***
## pct_non_hisp_white  2.681e-02  1.650e-02   1.625   0.1042    
## pct_non_hisp_black  2.390e-02  1.621e-02   1.475   0.1403    
## pct_non_hisp_asian  7.484e-02  1.814e-02   4.125 3.70e-05 ***
## pct_hisp            3.964e-02  1.682e-02   2.357   0.0184 *  
## high_edu_pct        2.517e-02  5.878e-03   4.282 1.85e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1674.6  on 1231  degrees of freedom
## Residual deviance: 1576.1  on 1225  degrees of freedom
##   (16 observations deleted due to missingness)
## AIC: 1590.1
## 
## Number of Fisher Scoring iterations: 4

*** Based on the regression results, we see that when we control for other variables, median_hhinc has a significant coefficient which means that as the % of median household income increases, the probability of access to hospital decreases. While this is different from our hypothesis (better access in areas with higher median income), some of this can be explained from the rich suburban lifestyle of the US where resources are a lot more sprawled. We also see that tracts with higher percentage of Asians as well as Hispanic have a higher probability of hospital access. Lastly, tracts with a higher proportion of highly educated residents are more likely to have hospital access within 5 miles ***