# Load packages
library(tidyverse)
library(tidycensus)
library(gt)
library(scales)
library(plotly)
library(sf)
library(tigris)
library(mapview)
library(RColorBrewer)
library(viridis)
library(santoku)

Introduction

The US Census Bureau collects and shares immense amounts of data on the US population to better inform policy decisions and ensure equal representation when drawing legislative districts. Much of the data is collected through the American Community Survey (ACS), which surveys households across the country on everything from demographics to home characteristics. Using those survey answers, the Census Bureau creates estimates for the various enumeration areas, e.g., block groups, tracts, counties, states.

While the estimates are open to the public, navigating through the Census Bureau’s website or using the API can be tricky. However, the tidycensus package in R makes it easy to retrieve census data in a tidy format, making it ready for many forms of analysis. In this report, I’ll explore how tidycensus can be used to create informative visuals out of ACS data. Part A will explore the percentage of people with graduate degrees for each Tennessee county. Part B will map the median year structures were built by census block groups in Davidson County.

Part A

Data Preparation

The first step before beginning any analysis is get US Census data from the 5-year American Community Survey. For this analysis, data on the percent of people with graduate degrees is needed for every county in Tennessee. The tidycensus package makes getting this data easy.

# Load in census data using the tidycensus get_acs function.
tn_county_grad <- get_acs(
    geography = "county",
    variables = c(
        grad_degree_percent = "DP02_0066P"
    ),
    state = "TN"
)

# Split the NAME column into county and state
tn_county_grad <- separate(tn_county_grad, NAME, c("county", "state"), sep = ", ")

tn_county_grad

## # A tibble: 95 × 6
##    GEOID county          state     variable            estimate   moe
##    <chr> <chr>           <chr>     <chr>                  <dbl> <dbl>
##  1 47001 Anderson County Tennessee grad_degree_percent     11     0.9
##  2 47003 Bedford County  Tennessee grad_degree_percent      5.1   1.1
##  3 47005 Benton County   Tennessee grad_degree_percent      4.8   1.3
##  4 47007 Bledsoe County  Tennessee grad_degree_percent      4     1.1
##  5 47009 Blount County   Tennessee grad_degree_percent      9.5   0.7
##  6 47011 Bradley County  Tennessee grad_degree_percent      9.1   1  
##  7 47013 Campbell County Tennessee grad_degree_percent      5.4   1.7
##  8 47015 Cannon County   Tennessee grad_degree_percent      5.8   1.7
##  9 47017 Carroll County  Tennessee grad_degree_percent      8     1.5
## 10 47019 Carter County   Tennessee grad_degree_percent      7.6   1  
## # ℹ 85 more rows

Analysis

Which counties have the largest percentages of graduate school graduates?

# arrange the rows so the highest values are at the top
top10 <- tn_county_grad %>% 
    arrange(desc(estimate))

# Create a table out of the 10 rows at the top of the data frame
gt(head(top10, 10))

GEOID	county	state	variable	estimate	moe
47187	Williamson County	Tennessee	grad_degree_percent	22.7	0.9
47037	Davidson County	Tennessee	grad_degree_percent	17.5	0.5
47093	Knox County	Tennessee	grad_degree_percent	15.2	0.6
47179	Washington County	Tennessee	grad_degree_percent	14.2	1.0
47157	Shelby County	Tennessee	grad_degree_percent	13.6	0.4
47065	Hamilton County	Tennessee	grad_degree_percent	13.1	0.6
47189	Wilson County	Tennessee	grad_degree_percent	12.3	0.9
47001	Anderson County	Tennessee	grad_degree_percent	11.0	0.9
47125	Montgomery County	Tennessee	grad_degree_percent	11.0	1.0
47105	Loudon County	Tennessee	grad_degree_percent	10.5	1.5

Williamson, Davidson, and Knox County have the highest percentages for the state. Most of the counties in the top 10 are either large cities or neighboring large cities that offer many “white-collar” jobs.

Which counties have the lowest percentages of graduate school graduates?

# arrange the rows so the lowest values are at the top
bottom10 <- tn_county_grad %>% 
    arrange(estimate)

# Create a table out of the 10 rows at the top of the data frame
gt(head(bottom10, 10))

GEOID	county	state	variable	estimate	moe
47087	Jackson County	Tennessee	grad_degree_percent	2.7	1.0
47101	Lewis County	Tennessee	grad_degree_percent	2.8	1.5
47039	Decatur County	Tennessee	grad_degree_percent	3.0	1.1
47069	Hardeman County	Tennessee	grad_degree_percent	3.4	0.9
47111	Macon County	Tennessee	grad_degree_percent	3.4	1.1
47081	Hickman County	Tennessee	grad_degree_percent	3.6	1.0
47135	Perry County	Tennessee	grad_degree_percent	3.6	1.7
47181	Wayne County	Tennessee	grad_degree_percent	3.8	1.3
47007	Bledsoe County	Tennessee	grad_degree_percent	4.0	1.1
47029	Cocke County	Tennessee	grad_degree_percent	4.0	1.1

Jackson, Lewis, and Decatur Counties have the three lowest percentage of graduate school graduates. Most of the counties in the bottom 10 are rural.

How do counties compare when the margin of error is considered?

tn_grad_plot <- tn_county_grad %>% 
    ggplot(aes(x = estimate, y = reorder(county, estimate))) +
    # Add the error bar
    geom_errorbar(aes(xmin = estimate - moe, xmax = estimate + moe),
                  width = 0.5, linewidth = 0.5) +
    # Add the point
    geom_point(color = "blue") +
    # Convert x labels to percentages
    scale_x_continuous(labels = label_percent(scale = 1, suffix = "%")) +
    # Remove "County" from each name so that it is less repetitive and more reader friendly
    scale_y_discrete(labels = function(x) str_remove(x, " County")) +
    labs(title = "Percentage of People with Graduate Degrees, ACS 2018-2022",
         subtitle = "Counties in Tennessee",
         caption = "Data acquired using tidycensus and R",
         x = "ACS estimated Percentage",
         y = "") +
    theme_minimal(base_size = 12)

tn_grad_plot

Williamson, Davidson, and Knox county lead the state in population percentages for graduate educated individuals with relatively small margins of error. The more rural counties, like Pickett county, experience much higher error margins.

Many of the counties toward the top of the list have smaller margin of errors, likely because they are a few of the larger counties in the state. However, further down the list, the margin of error for many counties is relatively high, with the highest error being plus or minus 2.9 percent in Pickett County.

Overall, the plot is interesting, but Tennessee’s 95 counties may be too much content for a static plot. To make studying the data more comprehensible, the plot below allows you to interact with the data.

# Create an interactive chart from the original
ggplotly(tn_grad_plot, tooltip = "x")

Part B

Data Prepation

Once again, data will be loaded in from the Census Bureau using the tidycensus package. However, the median year structures were built (“B25035_001”) will be explored with a map at the block group level for Davidson County, Tennessee.

# Load in census data using the tidycensus get_acs function.
davidson_year_built <- get_acs(
    geography = "block group",
    variables = c(
        median_year_built = "B25035_001"
    ),
    state = "TN",
    county = "Davidson",
    geometry = TRUE     # joins attribute data with TIGER files when loading in
)

Erasing areas that overlap with water will allow some additional context on when and where structures were built.

sf::sf_use_s2(FALSE) # necessary for erase_water to function

davidson_no_water <- erase_water(davidson_year_built, area_threshold = .75, year = 2022)

Mapping and Analysis

A static map is a simple way to explore the data and get a quick idea of building patterns in Davidson County (Nashville).

# Retrieve the minimum and maximum of the data set
minYear <- min(davidson_no_water$estimate, na.rm = TRUE)
maxYear <- max(davidson_no_water$estimate, na.rm = TRUE)

# create the breaks
legendBreaks <- seq(minYear, maxYear+1, by = 10)

# define a color palette
col <- cividis(n = 9, alpha = 1, direction = 1)

# put the estimates into bins based on the breaks
davidson_no_water$year_range <- chop(davidson_no_water$estimate,
                                 breaks = legendBreaks,
                                 label = lbl_glue("{l} - {r}") # Removes brackets
                                 )

# Cast geometry to MULTIPOLYGON for use with ggplotly
davidson_no_water <- st_cast(davidson_no_water, "MULTIPOLYGON")

# Plot the map using ggplot
year_plot <- ggplot(davidson_no_water, aes(fill = year_range)) +
    geom_sf(color = NA) + # Color = NA removes lines
    theme_void() +
    # add a custom color scale that used the predefined cividis colors
    scale_fill_manual(name = "Median Year Built",
                      values = col,
                      na.value = "#e0e0e0"
                      ) +
    labs(title = "The Median Year Structures were Built by Census Block Group",
         subtitle = "Davidson County, Tennessee",
         caption = "ACS 5-year data obtained through Tidycensus in R"
    )

year_plot

The area around downtown is dominated by newer structures. The structures in the area around downtown are much older, but the structures near the edges of the county are relatively newer.

While this map is informative, it does not allow viewers to easily identify values for individual tracts. Making the map interactive with plotly will help with that. Click an item in the legend to filter it from the view.

ggplotly(year_plot, tooltip = "fill")

While using plotly is great, it is missing the geographic context provided by a base map. Fortunately, the mapview package provides that added functionality.

# Create an interactive map using mapview
mapview(x = davidson_no_water,
        layer.name = "Median Year Built", # does not use default name
        zcol = "estimate",
        col.regions = col, 
        alpha.regions = .8, # changes opacity
        at = legendBreaks,
        na.color = "white"
        )

Thanks to the context provided by mapview, it’s easier to tell that the downtown area around Broadway and the Capitol are still older than the surrounding areas which have seen much development in the recent years. It’s also easier to tell what areas are missing values: the two airports, the main train yard, Vanderbilt University, and some farmland being developed into new subdivisions.

Lab 5: Working with Census data using Tidycensus

Benjamin Browning

2024-03-31