The goal: To quickly determine which WVU facilities are the closest to where a patient lives
The perfect solution: Take the patient’s address, geocode it to get coordinates (the origin), then calculate the route to each of our facilities (the destinations) and select whichever is closest
Why that won’t work:
HIPAA (probably): Addresses are PHI, and I’m not about to create a tool that people can use to disclose PHI
Addresses are too detailed: I don’t need turn-by-turn directions; I just want to know where to tell the patient to get labs, and to get a reasonable estimate of how far their drive to in-person appointments will be
Money: I’m not paying for everyone to use my personal MapBox API token! Also, there might be some value for QI in being able to map zip codes to the nearest facility
What will we do?
…
Important
Todo
Import data
Hospitals
The first bit to import is the names & locations of the hospitals. This is stored on a Google Sheet, which also holds the coordinates of each facility
Show the code
library(googlesheets4)

options(gargle_oauth_email = TRUE)
gs4_auth(cache = TRUE)

hospitals <- read_sheet("https://docs.google.com/spreadsheets/d/1MQG2mWaKPmqM7cbKUbg3BQvElWV4-_F1nINZPg2b5XQ") %>%
  select(abbv, name_short, ID_access, city, state, hosp_type, Address, Zip,
         CMS_ID, name_full, lat, long) %>%
  st_as_sf(coords = c("long", "lat"), crs = params$CRS) #%>%
  # geocode(city=city, state=state) %>%
  # mutate(ID_consult = case_when(is.na(ID_consult) ~ "No ID available",
  #                               ID_consult == F ~ "No ID available",
  #                               ID_consult == T ~ "ID consults available")) %>%
  # filter(include_on_map)
If the Google Sheet needs to be updated in the future, you’ll need to manually update the coordinates of the (new) facilities
Show the code
## To get the actual coordinates of the hospitals (i.e. what to run when updating it)
# Make note of what column is being joined on (in this case `abbv`)

# Geocode them
hospitals_geocoded <- read_sheet("https://docs.google.com/spreadsheets/d/1MQG2mWaKPmqM7cbKUbg3BQvElWV4-_F1nINZPg2b5XQ") %>%
  select(abbv, name_short, city, state, Address, Zip, name_full) %>%
  mutate(search_query = str_glue("{name_full}, {city}, {state}")) %>%
  geocode_combine(
    queries = list(
      list(method = "census", mode = "batch", street = "Address",
           city = "city", state = "state", postalcode = "Zip"),
      # list(
      #   method = "osm", street = "Address",
      #   city = "city", state = "state", postalcode = "Zip"
      # ),
      list(method = "mapbox", address = "search_query")
    ),
    query_names = c("census batch", "mapbox")
    # global_params = list(return_addresses = TRUE)
  )

# Write to file
hospitals_geocoded %>%
  select(abbv, name_short, lat, long) %>%
  readr::write_csv(glue::glue("{rstudioapi::selectDirectory()}/Hosp_geocoded.csv"))

rm(hospitals_geocoded)
Show the code
mapview(hospitals, label="name_full")
Defining our area of interest
Before we gather all of our data, we should first define the area that we will…
County shapes
One approach is to build a “buffer” around the state of West Virginia, and use that buffer to define our search space
Show the code
### Get the starting shape files
wv_counties <- tigris::counties(state = "WV", cb = T) %>%
  st_transform(crs = params$CRS)

wv_outline <- st_union(wv_counties)

all_conuties_raw <- tigris::counties(state = c("WV", "PA", "MD", "VA", "OH", "KY"), cb = T) %>%
  st_transform(crs = params$CRS)

### Define the buffer zone for counties
wv_buffer <- st_union(wv_counties) %>%
  st_buffer(params$buffer * 1000)

all_conuties <- all_conuties_raw %>%
  st_filter(wv_buffer, .predicate = st_intersects)

### Visual of selection criteria
all_conuties %>%
  ggplot() +
  geom_sf() +
  geom_sf(data = st_union(wv_counties), fill = 'darkgrey', alpha = .5) +
  geom_sf(data = wv_buffer, color = 'red', fill = NA) +
  labs(title = str_glue("Counties included using a {params$buffer}km buffer"))
This isn’t a bad starting place, but it doesn’t actually reflect our health system: we have multiple facilities outside of West Virginia, and stronger representation along the northern & western edges (compared to the south/east)
Another approach is to use ZIP Code Tabulation Areas (ZCTAs), which are areas defined by the US Census Bureau. They are rough approximations of ZIP code areas, built up from census blocks
Show the code
# c("WV", "PA", "MD", "VA", "OH", "KY")# This is NOT the high res shapefileZCTA_full <- tigris::zctas(year=2020, cb=T)ZCTA <- ZCTA_full %>%st_filter(wv_buffer, .predicate = st_intersects) rm(ZCTA_full)
Population weighted centroids
Our next step is to calculate a point (within each zip code) that will serve as our “origin” point
The easy solution is to calculate the geographic/geometric centroid of each ZCTA, as shown below
The issue with this method is that the population may not be spread evenly across the zip code, as shown for the same area below
Show the code
blocks <- c(041, 097, 083, 007, 101, 075) %>%
  # as.character() %>%
  map(~get_decennial(geography = "block",
                     variables = "P1_001N", # Total population
                     year = 2020,
                     state = "WV",
                     county = .x,
                     geometry = TRUE)) %>%
  bind_rows()

blocks %>%
  st_filter(st_union(subset_ZCTA), .predicate = st_intersects) %>%
  ggplot() +
  geom_sf(aes(fill = value)) +
  geom_sf(data = subset_ZCTA, fill = NA, color = "red", alpha = .5, linetype = "dashed") +
  geom_sf(data = st_centroid(subset_ZCTA), color = "red") +
  scale_fill_viridis_c() +
  labs(fill = "Population",
       title = "Select ZCTA, with population",
       subtitle = "Populated area is Webster Springs, WV") +
  ggthemes::theme_map()
Calculate the population weighted centroids
Unfortunately, the only way to calculate population weighted centroids at the ZCTA level is with block data (the most granular level of census data), so this section pulls in over a million rows of data (each with its own geometry, using >2 GB of RAM).
Since this takes so long, we will only run it once, then save the results in the ZCTA_coords tab of the same Google Sheet we used before
Show the code
## This could be another option: Using the census look up table to map [blocks] --> [ZCTA]
# https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.2020.html#zcta

x <- read_delim("https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_tabblock20_natl.txt",
                delim = "|", escape_double = FALSE, trim_ws = TRUE)

crosswalk_block_ZCTA <- x %>%
  select(GEOID_ZCTA5_20, NAMELSAD_ZCTA5_20, GEOID_TABBLOCK_20, NAMELSAD_TABBLOCK_20) %>%
  # Limit to blocks that have associated ZCTA
  filter(!is.na(GEOID_ZCTA5_20)) %>%
  # Limit to the ZCTA's we are using
  filter(GEOID_ZCTA5_20 %in% ZCTA$GEOID20) %>%
  # Extract some helpful info
  mutate(GEOID_county = str_sub(GEOID_TABBLOCK_20, 1, 5),
         GEOID_tract = str_sub(GEOID_TABBLOCK_20, 6, 11),
         GEOID_blkgrp = str_sub(GEOID_TABBLOCK_20, 12, 12),
         GEOID_block = str_sub(GEOID_TABBLOCK_20, 12, 16))

rm(x)

crosswalk_block_ZCTA %>%
  group_by(GEOID_ZCTA5_20, GEOID_county, GEOID_tract) %>%
  summarise(n_blocks = n(), .groups = "drop_last") %>%
  summarise(n_blocks = sum(n_blocks), n_tracts = n(), .groups = "drop_last") %>%
  summarise(n_blocks = sum(n_blocks), n_tracts = sum(n_tracts), n_counties = n(), .groups = "drop_last")

crosswalk_block_ZCTA %>%
  select(GEOID_ZCTA5_20, GEOID_county, GEOID_tract) %>%
  unique() %>%
  group_by(GEOID_county, GEOID_tract) %>%
  summarise(n_ZCTA_per_tract = n()) %>%
  arrange(desc(n_ZCTA_per_tract))

rm(crosswalk_block_ZCTA)
Show the code
###################################################-
### [part 1] Get census block data
tictoc::tic("Entire process") # 449 sec (7:30)
projected_complete(c(449), name = "ENTIRE THING")
###################################################-

tic("Call census block data") # 152, 161, 197 sec
projected_complete(c(152, 161, 197, 311, 218, 256))

pop_block_ls <- c("WV", "PA", "MD", "VA", "OH", "KY") %>%
  map(~get_decennial(geography = "block",
                     variables = "P1_001N", # Total population
                     year = 2020,
                     state = .x,
                     geometry = TRUE))
tictoc::toc()

tic("Join lists to a single sf object of block data") # 35 sec
projected_complete(c(35, 29, 27))

pop_block <- bind_rows(pop_block_ls) %>%
  filter(value != 0)

rm(pop_block_ls)
toc()

###################################################-
### [part 2a] The **NEW** way to join to ZCTA
tic("Join blocks to ZCTA (type a)")
projected_complete(c(168, 156))
###################################################-

## Grab the census's look up table to map [blocks] --> [ZCTA]
# https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.2020.html#zcta
temp_df1 <- read_delim("https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_tabblock20_natl.txt",
                       delim = "|", escape_double = FALSE, trim_ws = TRUE) %>%
  select(GEOID_ZCTA5_20, NAMELSAD_ZCTA5_20, GEOID_TABBLOCK_20, NAMELSAD_TABBLOCK_20) %>%
  # Limit to blocks that have associated ZCTA
  filter(!is.na(GEOID_ZCTA5_20)) %>%
  # Limit to the ZCTA's we are using
  filter(GEOID_ZCTA5_20 %in% ZCTA$GEOID20) %>%
  # Extract some helpful info
  mutate(GEOID_county = str_sub(GEOID_TABBLOCK_20, 1, 5),
         GEOID_tract = str_sub(GEOID_TABBLOCK_20, 6, 11),
         GEOID_blkgrp = str_sub(GEOID_TABBLOCK_20, 12, 12),
         GEOID_block = str_sub(GEOID_TABBLOCK_20, 12, 16)) %>%
  select(-starts_with("NAME"))

# Limit the geo_df of blocks to the ones we care about, then find their centroids
temp_df2 <- pop_block %>%
  filter(GEOID %in% temp_df1$GEOID_TABBLOCK_20) %>%
  st_centroid()

crosswalk_block_ZCTA <- temp_df1 %>%
  # Join to block data
  left_join(temp_df2, by = c("GEOID_TABBLOCK_20" = "GEOID")) %>%
  # Ensure no missing data (ie blocks with no population)
  filter(!is.na(value)) %>%
  # Group by ZCTA
  group_by(GEOID_ZCTA5_20) %>%
  summarize(
    pw_lon = weighted.mean(st_coordinates(geometry)[, 1], value),
    pw_lat = weighted.mean(st_coordinates(geometry)[, 2], value),
    population = sum(value, na.rm = T),
    blocks_list = str_flatten_comma(GEOID_TABBLOCK_20),
    .groups = "drop"
  )

rm(temp_df1, temp_df2)
toc()

# ###################################################-
# ### [part 2b.1] original way to join to ZCTA
# tic("Join blocks to ZCTA (type b)")
# projected_complete(c(136, 210, 273))
# ###################################################-
# blocks_zcta <- st_join(st_transform(pop_block, crs = params$CRS),
#                        st_transform(ZCTA, crs = params$CRS),
#                        join = st_within, # st_intersects is another option
#                        left = FALSE)
# toc()

rm(pop_block)

# tic()
# x <- crosswalk_block_ZCTA %>%
#   # Get the centroids of each block
#   st_centroid() %>%
#
#   # Ensure no missing data (ie blocks with no population)
#   filter(!is.na(value)) %>%
#
#   # Group by ZCTA
#   group_by(GEOID_ZCTA5_20) %>%
#
#   # Summarize to population weighted centroid coordinates
#   summarize(
#     pw_lon = weighted.mean(st_coordinates(geometry)[, 1], value),
#     pw_lat = weighted.mean(st_coordinates(geometry)[, 2], value),
#     population = sum(value, na.rm = T),
#     blocks_list = str_flatten_comma(GEOID_TABBLOCK_20),
#     .groups = "drop"
#   ) %>%
#
#   # Convert back to sf object
#   st_as_sf(coords = c("pw_lon", "pw_lat"), crs = params$CRS)
# toc()
#
# ###################################################-
# ### [part 2b.2] identify pop wt centroid
# tic("Get weighted centroids by ZCTA") # 56-126 sec
# projected_complete(c(56, 126, 85))
# ###################################################-
# ZCTA_wt_centroids <- blocks_zcta %>%
#   # Get the centroids of each block
#   st_centroid() %>%
#
#   # Ensure no missing data (ie blocks with no population)
#   filter(!is.na(value)) %>%
#
#   # Group by ZCTA
#   group_by(ZCTA5CE20) %>%
#
#   # Summarize to population weighted centroid coordinates
#   summarize(
#     pw_lon = weighted.mean(st_coordinates(geometry)[, 1], value),
#     pw_lat = weighted.mean(st_coordinates(geometry)[, 2], value),
#     .groups = "drop"
#   ) %>%
#
#   # Convert back to sf object
#   st_as_sf(coords = c("pw_lon", "pw_lat"), crs = params$CRS)
# toc() # end: weighted centroids
#
# rm(blocks_zcta)

###################################################-
### [part 3] Find ZCTA centroids
### Find the centroid of each ZCTA. This will be done both using a
### population weighted method (pw_lon / pw_lat) and using simple
### centroids
###
### The goal here is to build out a data frame with
### 1. **`cntr_lon/lat`**: Geometric coordinates of centroids accomplished
###    using the extract_coords function
### 2. **`pw_lon/lat`**: Population weighted centroids, pulled from the
###    crosswalk_block_ZCTA dataframe above. This will also include two
###    additional variables (from crosswalk_block_ZCTA), which are included
###    for assistance in troubleshooting
###    - **`population`** which is the total population used from the
###      blocks when calculating the population weighted centroid
###    - **`blocks_list`** which is a comma separated list of the GEOID's
###      of the blocks used in the calculation
### 3. **`lon/lat`**: As some of the ZCTA's couldn't calculate the population
###    weighted centroids, this is included for convenience. In cases where the
###    population weighted centroid could be calculated, it will use that.
###    If not, it falls back on the geometric/geographic centroid
###    - **`centroid_method`** defines which method is used for lon/lat
### 4. **`diff_dist_km`**: The distance between the two calculations
### 5. **`mid_lon/lat`**: The midpoint between the two calculations
###
tic("Find ZCTA centroids & merge various calculations")
projected_complete(c(168))
######################################################-

ZCTA_centroids <- list() # a list for the results

# Helper function that extracts the coordinates & keeps the geoid
extract_coords <- function(sf_geo){
  #' @description
  #' Extracts the coordinates from a SF object
  #'
  #' @param sf_geo sf object of points, with the first column being an ID

  # Test to make sure the provided geography is correct
  n_row_correctType <- sum(st_is(sf_geo, "POINT") | st_is(sf_geo, "MULTIPOINT"))

  if(n_row_correctType != nrow(sf_geo)){
    warning("Make sure correct geography type is provided! ")
    message("Did you forget to do `st_centroid()`?")
  }

  df <- sf_geo %>%
    st_coordinates() %>%
    as_tibble() %>%
    magrittr::set_names(c("lon", "lat"))

  df$geo_id <- st_drop_geometry(sf_geo)[,1]

  df %>%
    select(geo_id, lon, lat)
}

# Extract the centroid of unweighted ZCTA
ZCTA_centroids$cntr_unwt_ZCTA <- ZCTA %>%
  st_centroid() %>%
  extract_coords() %>%
  rename(cntr_lon = lon, cntr_lat = lat)

# Join the two. If you were able to find the population weighted centroid,
# use that one. If not, fall back on the regular version
ZCTA_centroids$part1 <- ZCTA_centroids$cntr_unwt_ZCTA %>%
  left_join(crosswalk_block_ZCTA, by = c("geo_id" = "GEOID_ZCTA5_20")) %>%
  rename(ZCTA_geoid = geo_id) %>%
  mutate(centroid_method = case_when(
    !is.na(pw_lon) ~ "pop_wt",
    is.na(pw_lon) & !is.na(cntr_lon) ~ "geo",
    is.na(pw_lon) & is.na(cntr_lon) ~ "error"
  )) %>%
  mutate(lon = if_else(!is.na(pw_lon), pw_lon, cntr_lon),
         lat = if_else(!is.na(pw_lat), pw_lat, cntr_lat))

# ### Used if you are running the old way of calculating it
# ZCTA_centroids <- cntr_unwt_ZCTA %>%
#   left_join(st_drop_geometry(ZCTA_wt_centroids), by=c("geo_id"="ZCTA5CE20")) %>%
#   rename(ZCTA_geoid = geo_id) %>%
#
#   mutate(centroid_method = case_when(
#     !is.na(pw_lon) ~ "pop_wt",
#     is.na(pw_lon) & !is.na(cntr_lon) ~ "geo",
#     is.na(pw_lon) & is.na(cntr_lon) ~ "error"
#   )) %>%
#   mutate(lon = if_else(!is.na(pw_lon), pw_lon, cntr_lon),
#          lat = if_else(!is.na(pw_lat), pw_lat, cntr_lat))

rm(ZCTA_wt_centroids, crosswalk_block_ZCTA, cntr_unwt_ZCTA, extract_coords)

### Calculate the difference between the two, in km
tictoc::tic("Calculate the difference in km") # 26 sec
projected_complete(c(26, 26, 23, 31, 28, 24))

ZCTA_centroids$ZCTA_method_diff <- ZCTA_centroids$part1 %>%
  filter(!is.na(pw_lon)) %>%
  mutate(
    cntr = st_sfc(mapply(function(lon, lat) st_point(c(lon, lat)),
                         cntr_lon, cntr_lat, SIMPLIFY = FALSE), crs = params$CRS),
    pw = st_sfc(mapply(function(lon, lat) st_point(c(lon, lat)),
                       pw_lon, pw_lat, SIMPLIFY = FALSE), crs = params$CRS)
  ) %>%
  st_as_sf() %>%
  rowwise() %>%
  mutate(
    dist_m = st_distance(cntr, pw, by_element = TRUE),
    diff_dist_km = as.numeric(dist_m) / 1000
  ) %>%
  ungroup() %>%
  select(ZCTA_geoid, diff_dist_km)

# Save distances to prior df
ZCTA_centroids$part2 <- ZCTA_centroids$part1 %>%
  left_join(st_drop_geometry(ZCTA_centroids$ZCTA_method_diff), by = "ZCTA_geoid")
toc() # End: calc the diff

# Calculate the midpoint between the two methods
ZCTA_centroids$final <- ZCTA_centroids$part2 %>%
  rowwise() %>%
  mutate(
    mid_lon = if_else(centroid_method == "pop_wt", mean(c(cntr_lon, pw_lon)), lon),
    mid_lat = if_else(centroid_method == "pop_wt", mean(c(cntr_lat, pw_lat)), lat)
  ) %>%
  ungroup()

# Rearrange columns
ZCTA_centroids <- ZCTA_centroids$final %>%
  select(ZCTA_geoid, lon, lat, centroid_method, mid_lon, mid_lat, diff_dist_km, everything())

toc() # End: Overall

### -------------- WRITE TO GOOGLE SHEET
library(googlesheets4)
options(gargle_oauth_email = TRUE)
gs4_auth(cache = TRUE)

ss <- gs4_get("https://docs.google.com/spreadsheets/d/1MQG2mWaKPmqM7cbKUbg3BQvElWV4-_F1nINZPg2b5XQ")

# Delete the old values
range_delete(ss, sheet = "ZCTA_coords", range = "A:O")

# Write to sheet
ZCTA_centroids %>%
  write_sheet(ss, sheet = "ZCTA_coords")

rm(ZCTA_centroids, ss, crosswalk_block_ZCTA, pop_block)
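On later runs, the cached centroids can presumably be pulled straight from that tab rather than recomputed from block data; a minimal sketch (same sheet URL and tab name as above):

# Sketch (not part of the original script): read the cached centroids back
# from the ZCTA_coords tab instead of recomputing them from block data
library(googlesheets4)
ZCTA_centroids <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1MQG2mWaKPmqM7cbKUbg3BQvElWV4-_F1nINZPg2b5XQ",
  sheet = "ZCTA_coords"
)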
For some reason, I was not able to calculate the population weighted centroid for every ZCTA. It looks like most of these are in Tennessee or North Carolina (and maybe the Washington DC area?). The ZCTAs that are missing population weighted centroids (n_missing = 70) represent only 2.80% of the total (n_total = 2497).
Calculating the population weighted centroid for this small number of ZCTAs would require pulling even more of the extremely detailed block level data (substantially increasing the script’s run time) for unclear benefit; I find it very unlikely that patients who live close to Washington DC are driving to the WVUM system. I’m still not sure why the others failed. Regardless, wherever the population weighted centroid couldn’t be calculated, I used the geographic/geometric centroid instead.
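The counts quoted above can be tallied from the centroid_method flag built in the centroid code; the exact summary isn’t shown in the original, so this is just one way to do it:

# Sketch: count how many ZCTAs fell back to the geometric centroid
# (centroid_method comes from the case_when() in the centroid code above)
ZCTA_centroids %>%
  count(centroid_method) %>%
  mutate(pct = round(100 * n / sum(n), 2))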
This approach does have some edge cases, as you can see for the zip code 26298 in Webster County, WV. For this zip code, the population weighted centroid lands far in the northern corner.
Show the code
## Example using 26298
tmp <- list()

tmp$within <- blocks %>%
  st_filter(filter(subset_ZCTA, GEOID20 == "26298"), .predicate = st_within)

tmp$intersect <- blocks %>%
  st_filter(filter(subset_ZCTA, GEOID20 == "26298"), .predicate = st_intersects)

tmp$lookupTable_block_IDs <- ZCTA_centroids %>%
  # Look up the blocks used in the lookup table
  mutate(blocks_list = str_split(blocks_list, ",\\s*")) %>%
  # Limit to ZCTA(s) of interest
  filter(ZCTA_geoid == "26298") %>%
  # Convert to char vector of the block IDs
  pull(blocks_list) %>%
  unlist()

tmp$lookupTable_block_geos <- blocks %>%
  filter(GEOID %in% tmp$lookupTable_block_IDs)

tmp$zip <- filter(subset_ZCTA, GEOID20 == "26298")

tmp$pop_cent <- ZCTA_centroids %>%
  filter(ZCTA_geoid == "26298") %>%
  st_as_sf(coords = c("pw_lon", "pw_lat")) %>%
  st_set_crs(params$CRS)

tmp$geo_cent <- filter(st_centroid(ZCTA), GEOID20 == "26298")

# ggplot() +
#   # geom_sf(data=tmp$intersect, fill="grey70", color="grey50") +
#   geom_sf(data=tmp$within, aes(fill=value)) +
#   geom_sf(data=tmp$zip, fill=NA, color="red", alpha=.5, linetype="dashed") +
#   geom_sf(data=tmp$geo_cent, color="red") +
#   # geom_sf(data=tmp$pop_cent, color="blue") +
#   labs(fill="Population",
#        title="Zipcode 26298 (red outline)", subtitle = "In Webster County, WV") +
#   scale_fill_viridis_c() +
#   ggthemes::theme_map() +
#   theme(legend.position = c(.9,0))

ggplot() +
  # geom_sf(data=tmp$intersect, fill="grey70", color="grey50") +
  geom_sf(data = tmp$lookupTable_block_geos, aes(fill = value)) +
  geom_sf(data = tmp$zip, fill = NA, color = "red", alpha = .5, linetype = "dashed") +
  geom_sf(data = tmp$geo_cent, color = "red") +
  geom_sf(data = tmp$pop_cent, color = "blue") +
  labs(fill = "Population",
       title = "Zipcode 26298 (red outline)",
       subtitle = "In Webster County, WV") +
  # scale_fill_viridis_c() +
  scale_fill_binned(type = "gradient",
                    low = RColorBrewer::brewer.pal(3, "Greens")[1],
                    high = RColorBrewer::brewer.pal(3, "Greens")[3]) +
  ggthemes::theme_map() +
  theme(legend.position = c(.9, 0))
At first I thought this was because the blocks don’t line up perfectly with the ZCTA, but that might not actually be the problem. When I looked at the granular (block level) data for the census blocks that are within the ZCTA or adjacent to it (st_intersects, the most conservative estimate), it turns out that many of the blocks have a population of zero. These blocks are shown in grey below
Compare this with the zip code 26288 in Webster County, WV, which contains Webster Springs, WV. Here there are fewer areas without any population, so the population weighted centroid isn’t skewed as drastically
Show the code
## Example using 26288
tmp <- list()

tmp$within <- blocks %>%
  st_filter(filter(subset_ZCTA, GEOID20 == "26288"), .predicate = st_within) %>%
  mutate(value = na_if(value, 0))

tmp$intersect <- blocks %>%
  st_filter(filter(subset_ZCTA, GEOID20 == "26288"), .predicate = st_intersects) %>%
  mutate(value = na_if(value, 0))

tmp$zip <- filter(subset_ZCTA, GEOID20 == "26288")

tmp$lookupTable_block_IDs <- ZCTA_centroids %>%
  # Look up the blocks used in the lookup table
  mutate(blocks_list = str_split(blocks_list, ",\\s*")) %>%
  # Limit to ZCTA(s) of interest
  filter(ZCTA_geoid == "26288") %>%
  # Convert to char vector of the block IDs
  pull(blocks_list) %>%
  unlist()

tmp$lookupTable_block_geos <- blocks %>%
  filter(GEOID %in% tmp$lookupTable_block_IDs)

tmp$pop_cent <- ZCTA_centroids %>%
  filter(ZCTA_geoid == "26288") %>%
  st_as_sf(coords = c("pw_lon", "pw_lat")) %>%
  st_set_crs(params$CRS)

tmp$geo_cent <- filter(st_centroid(ZCTA), GEOID20 == "26288")

ggplot() +
  # geom_sf(data=tmp$intersect, fill="grey70", color="grey50") +
  geom_sf(data = tmp$lookupTable_block_geos, aes(fill = value)) +
  geom_sf(data = tmp$zip, fill = NA, color = "red", linetype = "dashed") +
  geom_sf(data = tmp$geo_cent, color = "red") +
  geom_sf(data = tmp$pop_cent, color = "blue") +
  labs(fill = "Population",
       title = "Zipcode 26288 (Webster Springs, WV)",
       subtitle = "Dots are centroid (Red = geographic, Blue = population weighted)") +
  scale_fill_binned(type = "gradient",
                    low = RColorBrewer::brewer.pal(3, "Greens")[1],
                    high = RColorBrewer::brewer.pal(3, "Greens")[3]) +
  ggthemes::theme_map() +
  theme(legend.position = c(.9, 0))
Show the code
rm(tmp)
Multiple ZCTAs together
Show the code
tmp <- list()

tmp$lookupTable_block_IDs <- ZCTA_centroids %>%
  # Look up the blocks used in the lookup table
  mutate(blocks_list = str_split(blocks_list, ",\\s*")) %>%
  # Limit to ZCTA(s) of interest
  filter(ZCTA_geoid %in% subset_ZCTA$ZCTA5CE20) %>%
  # drop some of the extra columns
  select(ZCTA_geoid, population, blocks_list) %>%
  # Unnest the list longer
  unnest_longer(blocks_list)

tmp$lookupTable_block_geos <- blocks %>%
  # Join blocks to the lookup table to find their ZCTA
  inner_join(tmp$lookupTable_block_IDs, by = c("GEOID" = "blocks_list")) %>%
  # Rearrange the df
  select(ZCTA = ZCTA_geoid, GEOID, value, ZCTA_pop = population, geometry) %>%
  # Calculate the proportion that each block makes up (relative to ZCTA's
  # total population)
  ### Method 1
  mutate(pct_of_pop = value / ZCTA_pop)
  ### Method 2
  # group_by(ZCTA) %>%
  # mutate(pct_of_pop = value / sum(value, na.rm = T))

tmp$pop_cent <- ZCTA_centroids %>%
  filter(ZCTA_geoid %in% subset_ZCTA$ZCTA5CE20) %>%
  st_as_sf(coords = c("pw_lon", "pw_lat")) %>%
  st_set_crs(params$CRS)

tmp$geo_cent <- filter(st_centroid(ZCTA), GEOID20 %in% subset_ZCTA$ZCTA5CE20)

subset_ZCTA %>%
  rename(ZCTA = ZCTA5CE20) %>%
  ggplot() +
  # geom_sf(aes(fill=ZCTA), color="black", alpha=.1) +
  geom_sf(fill = "grey", color = "black", alpha = .1) +
  geom_sf(data = tmp$lookupTable_block_geos,
          # aes(alpha=value, fill=ZCTA),
          aes(alpha = pct_of_pop, fill = ZCTA),
          color = NA) +
  geom_sf(data = tmp$geo_cent, shape = 1) +
  geom_sf(data = tmp$pop_cent, shape = 16) +
  # scale_alpha_binned(range = c(.1, .9)) +
  labs(title = "Centroids for select zip codes",
       caption = "Grey = population of zero",
       subtitle = "Geometric (empty circle) & population weighted (filled circles)",
       alpha = "Proportion of zipcode's population") +
  ggthemes::theme_map() +
  theme(legend.position = c(1, 0))
Show the code
rm(blocks, subset_ZCTA, tmp)
Meeting in the middle
In reality, there isn’t a huge difference between the two methods, as shown in the map below:
how_many_by_cutoff <- function(df, cutoff) {
  n_cut <- df %>%
    filter(diff_dist_km > cutoff) %>%
    nrow()
  n_tot <- nrow(df)
  pct <- signif(n_cut / n_tot, 2) * 100

  str_glue("{n_cut} zipcodes ({pct}%) have a discrepancy of >{cutoff} km ({round(cutoff/1.609, 1)} mi)")
}

# how_many_by_cutoff(ZCTA_centroids, 3)
Only 254 zipcodes (10%) have a discrepancy of >3 km (1.9 mi), 62 zipcodes (2.5%) have a discrepancy of >5 km (3.1 mi), and 3 zipcodes (0.12%) have a discrepancy of >10 km (6.2 mi).
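Those figures come from the how_many_by_cutoff() helper above; the calls presumably looked something like this:

how_many_by_cutoff(ZCTA_centroids, 3)
how_many_by_cutoff(ZCTA_centroids, 5)
how_many_by_cutoff(ZCTA_centroids, 10)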
The map below shows the ZCTAs that have a discrepancy of at least 5 km. The ones around New Martinsville, Beckley, and Princeton all seem like appropriate uses of the population weighted centroids, but the area just south of Elkins looks a little problematic (as does the 28675 zipcode, the one in North Carolina).
Note
The map below uses Census tract data, which is not nearly as granular as the block level data (which is used in the static figures above & the calculations)
As a compromise, we can split the difference between the population weighted centroids and the geometric centroids
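This is what the mid_lon / mid_lat columns computed earlier hold; a toy illustration with made-up coordinates:

# Toy illustration (hypothetical coordinates): the compromise point is just the
# unweighted mean of the geometric and population weighted centroids
geo_cent <- c(lon = -80.50, lat = 38.60)  # hypothetical geometric centroid
pop_cent <- c(lon = -80.46, lat = 38.64)  # hypothetical population weighted centroid
mid_point <- (geo_cent + pop_cent) / 2    # c(lon = -80.48, lat = 38.62)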
Identify the closest facilities
Now that we have our zip code centroids (which serve as a surrogate for where the patient lives), we need to identify the facilities closest to each one
Reducing the search space
One approach would be to calculate the distance from every zipcode (n=2497) to every facility (n=23). This is computationally demanding (57,431 queries) and not necessary.
Instead, we find the 3 closest hospitals for each zip code (using a straight-line nearest-neighbour search via RANN::nn2(), shown below)
Show the code
# If you want to show code, use code-fold: show

find_closest_nn <- function(zcta, hosp){
  #' @description
  #' Finds the 3 closest hospitals
  #'
  #' @param zcta a plain df where the first column is the ZCTA. There should
  #'   also be two columns named "lon" and "lat"
  #' @param hosp a data frame where the first column is the ID for the hospital.
  #'   there should also be a column of "geometry"

  library(RANN)

  # Extract the first column of `zcta` & `hosp`
  list_of_zcta <- zcta[[1]]
  list_of_hosp <- hosp[[1]]

  # Temp list for the results
  results <- list()

  results$hosp_index_lookup <- tibble(abbv = list_of_hosp) %>%
    mutate(hosp_index = row_number())

  ##################################
  # Testing assumptions
  ################################

  # Test that this column is indeed ZCTAs
  testthat::expect_equal(nrow(zcta), {
    list_of_zcta %>%
      str_detect("^\\d{5}$") %>%
      sum()
  })
  testthat::expect_equal(nrow(zcta), n_distinct(list_of_zcta))

  # Test that `zcta` contains columns named lon & lat
  testthat::expect_in("lon", names(zcta))
  testthat::expect_in("lat", names(zcta))

  # Make sure the list of hospitals is unique
  testthat::expect_equal(nrow(hosp), n_distinct(list_of_hosp))

  # Make sure `hosp` is an SF object
  testthat::expect_s3_class(hosp, "sf")

  ##################################
  # Do the nearest neighbour search
  ################################

  ## Matrix of ZCTA centroid coordinates
  mat_ZCTA <- zcta %>%
    # Make the correct geometry
    st_as_sf(coords = c("lon", "lat"), crs = params$CRS) %>%
    st_transform(3857) %>% # must be projected
    # Extract coords
    st_coordinates()

  ## Matrix of hospital coordinates (use the `hosp` argument, not the global object)
  mat_hosp <- hosp %>%
    select(abbv, geometry) %>%
    # Make the correct geometry
    st_as_sf(crs = params$CRS) %>%
    st_transform(3857) %>% # must be projected
    # Extract coords
    st_coordinates()

  nn_results <- RANN::nn2(data = mat_hosp,
                          query = mat_ZCTA,
                          k = 3)

  ##################################
  # Make a table of the closest hospitals for each ZCTA
  ################################

  # {nn.idx} is a matrix, with each row corresponding to each ZCTA. The first
  # column is the index of the closest hospital, the second column is the index
  # of the second closest hospital, and so on
  results$closest_hosp <- nn_results$nn.idx %>%
    # Convert matrix to tibble, and append the ZCTAs
    as_tibble() %>%
    mutate(ZCTA = list_of_zcta) %>%
    # this is now a table with columns named V1, V2, V3, and ZCTA. Let's
    # convert this to long format
    pivot_longer(cols = starts_with("V"),
                 names_to = "order",
                 values_to = "hosp_index") %>%
    # And replace the index with the supplied abbreviation
    left_join(results$hosp_index_lookup, by = "hosp_index") %>%
    select(-hosp_index) # drop the index column

  ##################################
  # Similar idea, but with their distances
  ################################

  # {nn.dists} is a matrix, with each row corresponding to each ZCTA. Instead of
  # having the index of the hospital, the values of the matrix represent the
  # distance
  results$closest_dist <- nn_results$nn.dists %>%
    # Convert matrix to tibble, and append the ZCTAs
    as_tibble() %>%
    mutate(ZCTA = list_of_zcta) %>%
    # this is now a table with columns named V1, V2, V3, and ZCTA. Let's
    # convert this to long format
    pivot_longer(cols = starts_with("V"),
                 names_to = "order",
                 values_to = "nn_dist") %>%
    mutate(nn_dist = round(nn_dist))

  ##################################
  # Return results
  ################################

  results$closest_hosp %>%
    left_join(results$closest_dist, by = c("ZCTA", "order"))
}

ZCTA_n3 <- ZCTA_centroids %>%
  select(ZCTA_geoid, lon = mid_lon, lat = mid_lat) %>%
  find_closest_nn(hospitals)
With this, we can map each facility’s service area (without accounting for drive time)
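A minimal sketch of such a map (not the original figure) could join the nearest-facility table back to the ZCTA polygons and shade each ZCTA by its single closest facility (order == "V1"):

# Sketch (assumes ZCTA_n3 and the ZCTA shapes from above): shade each ZCTA by
# its closest facility, ignoring drive time
ZCTA %>%
  left_join(filter(ZCTA_n3, order == "V1"), by = c("GEOID20" = "ZCTA")) %>%
  ggplot() +
  geom_sf(aes(fill = abbv), color = NA) +
  geom_sf(data = hospitals, shape = 17) +
  labs(fill = "Closest facility",
       title = "Approximate service areas (straight-line distance)") +
  ggthemes::theme_map()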