knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(here)
library(janitor)
library(sf)
library(tidycensus)
library(plotly)
library(mapview)
# census_api_key("YOUR KEY GOES HERE", install = TRUE)

Introduction

This R Markdown report contains the results from two unrelated analyses. Both utilize data from the U.S. Census Bureau (obtained using the “tidycensus” package).

Part A

Data Preparation

The code chunk below uses get_acs() to obtain the estimated percentage of the age-25+ population that has a graduate degree within each county in Michigan, according to the 2017-2021 5-year American Community Survey. Then it creates a static point plot that also shows the margine of error for each estimate. The plot will be displayed in the Analysis section that follows.

adv_deg <- get_acs(
  geography = "county",
  variables = c(percent_grad = "DP02_0066P"),
  year = 2021,
  state = "Michigan"
)

# Create point plot 
adv_deg_cnty_plot <- adv_deg %>%
  ggplot(aes(x = estimate, y = reorder(NAME, estimate))) + #sorts y-axis descending
  geom_errorbar(aes(xmin = estimate - moe, xmax = estimate + moe),
                    width = 0.5, linewidth = 0.5) +
  geom_point(color = 'darkblue', size = 2) +
  scale_y_discrete(labels = function(x) str_remove(x, "County, Michigan")) + #clean up y-axis labels
  labs(title = "Percentage of Population Age 25+ with Graduate Degree",
       caption = "Data acquired with R and tidycensus",
       x = "ACS estimate (percent)",
       y = "Counties in Michigan") +
  theme_minimal()

Analysis

The seven counties in Michigan in which more than 15 percent of the adult population has a graduate degree are Washtenaw, Oakland, Leelanau, Ingham, Keweenaw (though with a wide margin of error), Kalamazoo, and Emmet. These counties are the home of major universities or the (second?) home of affluent residents.

The eleven counties in which less than 5 percent of the adult population has a graduate degree are Clare, Arenac, Oscoda, Ogemaw, Branch, Tuscola, Ontonagon, Missaukee, Ionia, Montcalm, and Lake. All of these counties (except Branch and Ontonagon) are in the north/central rural areas of the state.

# Display point plot with error bars created above
adv_deg_cnty_plot

# Generate interactive plot using static plot
ggplotly(adv_deg_cnty_plot, tooltip = 'x')

Part B

Data Preparation

The raw data file contains all public libraries in the U.S. in 2022. It is a preliminary version of Public Libraries Survey, and older vintages of these data are available from the Institute of Museum and Library Services.

The code chunk that follows accomplishes the following tasks:

  1. Append geographic variables (latitude, longitude, and locale) from published 2021 data
  2. Filter the records to only public libraries in Illinois
  3. Keep and rename only the five variables of interest for this analysis
  4. Parse the geography_code variable into two separate variables (to make it tidy)
    • geography type (e.g., place, county, school district)
    • precision level (e.g., exact, overlap, remainder)
  5. Recode the values of geography_type to match what tidycensus is expecting
    • [Above step is in preparation for anticipated processing for final project]
  6. Calculate the proportion of cardholders to service area population for each library
  7. Collapse/recode the locale variable and convert to a 4-level factor
  8. Convert the dataframe to an sf object with point geometry for mapping
  9. Filter sf data to only libraries with geography_type of ‘place’ (in preparation for spatial join)
  10. Use get_decennial() to obtain 2020 Census counts for all places in Illinois with their polygon geometries (and the state level total with its polygon)
  11. Spatially inner-join the Illinois places with the place-based libraries to end up with only the municipalities that contain a place-based library
  12. Calculate the ratio that compares the 2020 Census count with the reported library service area population for each record
# Load the raw data
publiclibraries22 <- read_csv(here("data","publiclibraries22.csv"))

# Load the processed data from last cycle that contains point locations
publiclibraries21 <- read_csv(here("data","publiclibraries21.csv"))
publiclibraries21_geo <- publiclibraries21 %>%
  select(FSCSKEY, LONGITUD, LATITUDE, LOCALE_ADD)

# 1. Merge the point location and locale variables from 21 to 22 data
all_public_libraries <- publiclibraries22 %>%
  left_join(publiclibraries21_geo,
            by="FSCSKEY")

# 2./3. Clean the data
illinois_libraries <- all_public_libraries %>%
  clean_names() %>%
  filter(stabr == 'IL') %>% # select only Illinois libraries
  filter(fscskey != "-3") %>% # remove closed libraries
  filter(fscskey != "IL8034") %>% #drop record that doesn't exist in 2021 data
  rename(library_id = fscskey, # rename variables to be easier to remember
         library_name = libname,
         legal_basis = c_legbas,
         geography_code = geocode,
         service_area_pop = popu_lsa,
         cardholders = regbor,
         long = longitud,
         lat = latitude,
         locale12 = locale_add
         ) %>%
  select(library_id, library_name, legal_basis, geography_code, 
         service_area_pop, cardholders, long, lat, locale12) %>%
  # 4. Separate the geographic_code variable into parts
  separate(geography_code, 
           c("geography_type", "geography_precision"), # name the new variables
           2, # split after 2nd character
           remove = FALSE) %>% #keep the original variable
  # 5. Recode geography_type values to be human readable
  mutate(geography_type = case_match(geography_type,
                                     'CO' ~ 'county',
                                     'CD' ~ 'county subdivision',
                                     'MD' ~ 'multi-county subdivision',
                                     'PL' ~ 'place',
                                     'MP' ~ 'multi-place',
                                     'SU' ~ 'school district (unified)',
                                     'OT' ~ 'other'
                                     )) %>%
  # 6. Calculate proportion of cardholders to service area population
  mutate(users_pop_proportion = cardholders / service_area_pop) %>%
  # 7. Collapse 12 locale levels to 4
  mutate(locale4 = case_match(locale12,
                              11:13 ~ 1,
                              21:23 ~ 2,
                              31:33 ~ 3,
                              41:43 ~ 4
                              )) %>%
  # Convert locale4 to a factor with labels
  mutate(locale4 = factor(locale4, #use factor() NOT as.factor()!
                          labels = c("city", "suburb", "town", "rural"),
                          ordered = TRUE))

# 8. Convert dataframe to sf object with point geometry
illinois_libraries_geo <- illinois_libraries %>%
  st_as_sf(coords = c("long", "lat"),
           crs = 4269) # important to specify here to enable spatial join later

# 9. Filter only libraries with reported census geo type of 'place'
illinois_libraries_geo_place <- illinois_libraries_geo %>%
  filter(geography_type == "place")

# 10. Get 2020 population counts for places in Illinois
illinois_2020pop_places <- get_decennial(
  geography = 'place',
  variables = 'P1_001N',
  year = 2020,
  state = 'illinois',
  geometry = TRUE,
  progress_bar = FALSE
)

# 10b. Get 2020 Illinois state total and state polygon
illinois_2020pop_total <- get_decennial(
  geography = 'state',
  variables = 'P1_001N',
  year = 2020,
  state = 'illinois',
  geometry = TRUE,
  progress_bar = FALSE
)

# 11. Spatial join the place polygons with the library point locations 
illinois_2020pop_places_libraries <- illinois_2020pop_places %>%
  st_join(illinois_libraries_geo_place,
          left = FALSE) # default to inner join

# 12. Calculate the ratio of 2020 Census count to reported library service area pop
illinois_2020pop_places_libraries <- illinois_2020pop_places_libraries %>%
  mutate(pop_align = value/service_area_pop) # 'value' is 2020 count
  # this direction had more manageable scale than service_area_pop/value

##Alternative join not used but saved for later
# Spatial join the library point locations with the place polygons
  #illinois_libraries_geo_place_join <- illinois_libraries_geo_place %>%
  #  st_join(illinois_2020pop_places,
  #          left = TRUE)

Analysis

The static and interactive maps below show that most records have a comparison ratio around 1 (colored aquamarine). The interactive map is necessary to zoom in on different areas of the state to identify places were the 2020 Census count is smaller than the reported LSA population (pop_align < 1; blues) and places where the 2020 Census count is larger than the reported LSA population (pop_align > 1; yellows).

# Create static map plot of population comparison ratio
illinois_2020pop_places_libraries %>%
  ggplot(aes(fill = pop_align)) +
    geom_sf(data = illinois_2020pop_total, fill = NA) + # add Illinois outline
    geom_sf(data = illinois_2020pop_places_libraries) +
    scale_fill_viridis_c() +
    theme_void() +
    labs(title = str_wrap("Ratio of 2020 Decennial Census Count to Reported Library Service Area Population", width = 40),
         subtitle = "For Illinois public libraries with municipal service areas",
         fill = str_wrap("2020 Census Count / Reported LSA Population", width = 20),
         caption = str_wrap("2020 Decennial Census Counts obtained with tidycensus R package; Public library locations from IMLS's PLS, FY 2021", width = 70)
         )

# Static plot above is really hard to see, so this provides interactive version
mapview(illinois_2020pop_places_libraries, zcol = 'pop_align')