This report uses the tidycensus package to obtain and manipulate American Community Survey (ACS) data from the US Census Bureau for spatial analysis. Data were pulled with tidycensus for counties in Washington state (Part A) and for census tracts in Snohomish County, Washington (Part B). The focus of this report is using tidycensus to obtain US Census Bureau data in a “tidy” format that is ready for analysis. The first section looks at the percentage of graduate degree holders in Washington counties, and the second section looks at median retail trade earnings in census tracts in Snohomish County, Washington. This report is part of Lab 5 of GEOG588.
library(tidycensus) # For getting tidy US Census data
library(tidyverse) # For data analysis and visualization
library(plotly) # For interactive graphs
library(ggiraph) # For interactive graphs as well
library(mapview) # For interactive map viewing
library(scales) # To control axis/legend labels
# County-level query for Washington state: the share of the population holding
# a graduate degree, from the 2017-2021 5-year ACS.
grad_deg_percent <- get_acs(
  state = "WA",
  geography = "county",
  variables = "DP02_0066P", # percentage of population with a graduate degree
  year = 2021 # end year of the 2017-2021 5-year ACS
)
## Getting data from the 2017-2021 5-year ACS
## Using the ACS Data Profile
# Inspect the downloaded table. The spreadsheet-style viewer is opened only in
# an interactive session, because View() needs a GUI and is not usable while
# the report is being rendered; glimpse() prints a compact per-column summary
# in either case.
if (interactive()) {
  View(grad_deg_percent)
}
glimpse(grad_deg_percent)
## Rows: 39
## Columns: 5
## $ GEOID <chr> "53001", "53003", "53005", "53007", "53009", "53011", "53013"…
## $ NAME <chr> "Adams County, Washington", "Asotin County, Washington", "Ben…
## $ variable <chr> "DP02_0066P", "DP02_0066P", "DP02_0066P", "DP02_0066P", "DP02…
## $ estimate <dbl> 5.8, 8.5, 12.6, 10.3, 11.9, 11.4, 10.9, 5.7, 7.0, 6.4, 7.3, 1…
## $ moe <dbl> 1.4, 1.3, 0.8, 1.3, 1.0, 0.4, 3.3, 0.6, 1.2, 2.4, 0.9, 4.0, 0…
# Counties ranked from the highest to the lowest estimate. arrange() has no
# `na.rm` argument (an extra named value would only be taken as one more sort
# expression); rows with a missing estimate are sorted to the end, not removed.
# Use tidyr::drop_na(estimate) first if such rows must actually be excluded.
arrange(grad_deg_percent, desc(estimate))
## # A tibble: 39 × 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 53075 Whitman County, Washington DP02_0066P 22.6 1.9
## 2 53033 King County, Washington DP02_0066P 22.1 0.3
## 3 53055 San Juan County, Washington DP02_0066P 22.1 1.1
## 4 53031 Jefferson County, Washington DP02_0066P 18.8 1.4
## 5 53029 Island County, Washington DP02_0066P 14.1 1.1
## 6 53073 Whatcom County, Washington DP02_0066P 14 0.7
## 7 53067 Thurston County, Washington DP02_0066P 13.8 0.7
## 8 53039 Klickitat County, Washington DP02_0066P 12.7 2.2
## 9 53005 Benton County, Washington DP02_0066P 12.6 0.8
## 10 53035 Kitsap County, Washington DP02_0066P 12.2 0.5
## # ℹ 29 more rows
# Counties ranked from the lowest to the highest estimate (ascending is the
# default order). arrange() has no `na.rm` argument; rows with a missing
# estimate are sorted to the end rather than dropped.
arrange(grad_deg_percent, estimate)
## # A tibble: 39 × 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 53025 Grant County, Washington DP02_0066P 5.1 0.7
## 2 53015 Cowlitz County, Washington DP02_0066P 5.7 0.6
## 3 53001 Adams County, Washington DP02_0066P 5.8 1.4
## 4 53027 Grays Harbor County, Washington DP02_0066P 6.1 0.8
## 5 53045 Mason County, Washington DP02_0066P 6.1 0.9
## 6 53019 Ferry County, Washington DP02_0066P 6.4 2.4
## 7 53077 Yakima County, Washington DP02_0066P 6.6 0.6
## 8 53041 Lewis County, Washington DP02_0066P 6.9 0.8
## 9 53017 Douglas County, Washington DP02_0066P 7 1.2
## 10 53021 Franklin County, Washington DP02_0066P 7.3 0.9
## # ℹ 29 more rows
# Dot-and-whisker chart of the county estimates: one row per county, ordered by
# estimate, with a horizontal bar spanning estimate +/- margin of error.
grad_plot <- ggplot(
  data = grad_deg_percent,
  mapping = aes(x = estimate, y = reorder(NAME, estimate))
) +
  # Error bars are added before the points so the points are drawn on top.
  geom_errorbar(
    mapping = aes(xmin = estimate - moe, xmax = estimate + moe),
    width = 0.5,
    linewidth = 0.5
  ) +
  geom_point(color = "darkred", size = 2) +
  # Estimates are already on a 0-100 scale, so only a "%" suffix is appended.
  scale_x_continuous(labels = label_percent(scale = 1)) +
  # Strip the county/state suffix so only the county name labels each row.
  scale_y_discrete(
    labels = function(x) str_remove(x, " County, Washington|, Washington")
  ) +
  labs(
    title = "Percent graduate degrees, 2021 ACS",
    subtitle = "Counties in Washington state",
    caption = "Data acquired with R and tidycensus. Error bars represent margin of error around estimates",
    x = "2017-2021 ACS estimate",
    y = ""
  ) +
  theme_minimal(base_size = 12)

# Print the static chart, then the plotly version whose tooltip is limited to
# the x aesthetic (the estimate).
grad_plot
ggplotly(grad_plot, tooltip = "x")
# The same margin-of-error chart rebuilt for ggiraph: each point carries the
# estimate as its tooltip and the county GEOID as its interactive id.
grad_plot_ggiraph <- ggplot(
  data = grad_deg_percent,
  mapping = aes(
    x = estimate,
    y = reorder(NAME, estimate),
    tooltip = estimate,
    data_id = GEOID
  )
) +
  geom_errorbar(
    mapping = aes(xmin = estimate - moe, xmax = estimate + moe),
    width = 0.5,
    linewidth = 0.5
  ) +
  geom_point_interactive(color = "darkred", size = 2) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_y_discrete(
    labels = function(x) str_remove(x, " County, Washington|, Washington")
  ) +
  labs(
    title = "Percent graduate degrees, 2021 ACS",
    subtitle = "By counties in Washington state",
    caption = "Data acquired with R and tidycensus. Error bars represent margin of error around estimates",
    x = "ACS estimate",
    y = ""
  ) +
  theme_minimal(base_size = 12)

# Turn the ggplot object into a girafe widget and set the fill colour applied
# to a point while it is hovered over.
girafe_options(
  girafe(ggobj = grad_plot_ggiraph),
  opts_hover(css = "fill:cyan;")
)
# Repeat the Washington county query for the graduate-degree percentage, this
# time asking for geometry so the estimates come back ready to map.
grad_deg_percent_geo <- get_acs(
  state = "WA",
  geography = "county",
  variables = "DP02_0066P",
  year = 2021, # end year of the 2017-2021 5-year ACS
  geometry = TRUE, # attach geometry information to the data
  progress_bar = FALSE
)

# Interactive map of the counties, coloured by the estimate column.
mapview(grad_deg_percent_geo, zcol = "estimate")
# Load the variable catalogue for the 2017-2021 5-year ACS.
vars <- load_variables(2021, "acs5")
# Browse the catalogue in the viewer only when running interactively; View()
# needs a GUI and is not usable while the report is being rendered.
if (interactive()) {
  View(vars)
}
# Summarise the catalogue itself: `vars`, not `var`, which is the stats
# variance function and would only print that function's signature.
glimpse(vars)
# Tract-level query for Snohomish County, Washington: median earnings in the
# retail trade industry, returned with geometry already joined to the data.
sno_retail_income <- get_acs(
  geography = "tract", # census tracts
  variables = "B24031_008", # median earnings in retail trade
  state = "WA",
  county = "Snohomish",
  # Pin the 2017-2021 5-year ACS explicitly. Without `year`, get_acs() falls
  # back to the package default, which changes between tidycensus releases and
  # would silently disagree with the "2017-2021 ACS" caption used on the map.
  year = 2021,
  geometry = TRUE,
  progress_bar = FALSE
)
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
# Interactive map of the Snohomish County tracts, coloured by the estimate.
mapview(sno_retail_income, zcol = "estimate")

# Static choropleth of the same data: tracts are filled by the estimate using
# the viridis "inferno" palette with four legend breaks, on a blank theme.
ggplot(data = sno_retail_income, mapping = aes(fill = estimate)) +
  geom_sf() +
  theme_void() +
  scale_fill_viridis_c(option = "inferno", n.breaks = 4) +
  labs(
    title = "Median Earnings in Retail Trade Industry",
    subtitle = "Census tracts in Snohomish County",
    fill = "Median Retail Earnings",
    caption = "2017-2021 ACS | tidycensus R package"
  )