Setup

Load Packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidycensus)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(pander)
library(knitr)
library(pander)
library(rmarkdown)

Access Census API Key

census_api_key("75836456df67458839c36492f8b736a89591e1c1", install = TRUE, overwrite = TRUE)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY"). 
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "75836456df67458839c36492f8b736a89591e1c1"

Load census data dictionaries

acs_variable_list.2020 <- load_variables(2020, #year
                                         "acs5", #five year ACS estimates
                                         cache = TRUE)

acs_variable_list.2016 <- load_variables(2016, #year
                                         "acs5", #five year ACS estimates
                                         cache = TRUE)

Downloading Data from Tidycensus

Create a vector of census variables

acs_vars <- c("B01001_001E", # ACS total Pop estimate
              "B25002_001E", # Estimate of total housing units
              "B25002_003E", # Number of vacant housing units
              "B19013_001E", # Median HH Income ($)
              "B02001_002E", # People describing themselves as "white alone"
              "B06009_006E") # Total graduate or professional degree

Call the Census API to get tract level data for 2020 for all of Philadelphia

acsTractsPHL.2020 <- get_acs(geography = "tract",
                             year = 2020, 
                             variables = acs_vars, 
                             geometry = FALSE, 
                             state = "PA", 
                             county = "Philadelphia", 
                             output = "wide") 
## Getting data from the 2016-2020 5-year ACS

Wrangling Data with dplyr

Mutating, selecting and renaming variables

acsTractsPHL.2020 <- acsTractsPHL.2020 %>%
  dplyr::select (GEOID, NAME, all_of(acs_vars))

acsTractsPHL.2020 <- acsTractsPHL.2020 %>%
  rename (total_pop.2020 = B01001_001E,
          total_HU.2020 = B25002_001E,
          total_vacant.2020 = B25002_003E,
          med_HH_Income.2020 = B19013_001E,
          total_White.2020 = B02001_002E,
          total_GradDeg.2020 = B06009_006E)

acsTractsPHL.2020 <- acsTractsPHL.2020 %>%
  mutate(vacancyPct.2020 = total_vacant.2020/total_HU.2020,
         pctWhite.2020   = total_White.2020/total_pop.2020)
acsTractsPHL.2016 <- get_acs(geography = "tract",
                             year = 2016, 
                             variables = acs_vars,
                             geometry = FALSE,
                             state = "PA", 
                             county = "Philadelphia",
                             output = "wide") %>%
  dplyr::select (GEOID, NAME, all_of(acs_vars)) %>% 
  rename (total_pop.2016 = B01001_001E,
          total_HU.2016 = B25002_001E,
          total_vacant.2016 = B25002_003E,
          med_HH_Income.2016 = B19013_001E,
          total_White.2016 = B02001_002E,
          total_GradDeg.2016 = B06009_006E) %>%
  mutate(vacancyPct.2016 = total_vacant.2016/total_HU.2016,
         pctWhite.2016 = total_White.2016/total_pop.2016)
## Getting data from the 2012-2016 5-year ACS

Joining data

allACS <- left_join(acsTractsPHL.2016, acsTractsPHL.2020,
                    by= c("GEOID"))

Doing column math using mutate

allACS <- allACS %>%
  mutate(change_med_HH_Income = med_HH_Income.2020 - (med_HH_Income.2016 * 1.08), 
         change_Grad_Degree_Pct = (total_GradDeg.2020/total_pop.2020)-(total_GradDeg.2016/total_pop.2016))

Exercise - Creating Three New Variables With the Mutate Function

allACS <- allACS %>%
  mutate(change_vacancyPct = (vacancyPct.2020 - vacancyPct.2016),
         pct_change_pop = ((total_pop.2020 - total_pop.2016) / total_pop.2016) * 100,
         pct_GradDeg_2020 = (total_GradDeg.2020 / total_pop.2020) * 100)

print(allACS)
## # A tibble: 384 × 24
##    GEOID       NAME.x             total_pop.2016 total_HU.2016 total_vacant.2016
##    <chr>       <chr>                       <dbl>         <dbl>             <dbl>
##  1 42101024900 Census Tract 249,…           3171          1381               286
##  2 42101025800 Census Tract 258,…           1718           800                72
##  3 42101026500 Census Tract 265,…           5106          1907               337
##  4 42101026700 Census Tract 267,…           6147          2834               380
##  5 42101026800 Census Tract 268,…           4554          2086               209
##  6 42101031900 Census Tract 319,…           4938          1890               214
##  7 42101033000 Census Tract 330,…           7231          2855               238
##  8 42101034000 Census Tract 340,…           2957          1114                74
##  9 42101034600 Census Tract 346,…           2664          1302                87
## 10 42101034801 Census Tract 348.…           4465          1978               114
## # ℹ 374 more rows
## # ℹ 19 more variables: med_HH_Income.2016 <dbl>, total_White.2016 <dbl>,
## #   total_GradDeg.2016 <dbl>, vacancyPct.2016 <dbl>, pctWhite.2016 <dbl>,
## #   NAME.y <chr>, total_pop.2020 <dbl>, total_HU.2020 <dbl>,
## #   total_vacant.2020 <dbl>, med_HH_Income.2020 <dbl>, total_White.2020 <dbl>,
## #   total_GradDeg.2020 <dbl>, vacancyPct.2020 <dbl>, pctWhite.2020 <dbl>,
## #   change_med_HH_Income <dbl>, change_Grad_Degree_Pct <dbl>, …

Summarizing Census Data

Exploring central tendancies

mean(allACS$change_med_HH_Income, na.rm = TRUE)
## [1] 6012.472
median(allACS$change_med_HH_Income, na.rm = TRUE)
## [1] 4873.76

Exploring distributions

hist(allACS$change_med_HH_Income)

ggplot(allACS) +
  geom_histogram(aes(change_med_HH_Income)) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 31 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(allACS)+
  geom_histogram(aes(change_med_HH_Income), binwidth = 5000)+
  labs(
    title = "Change in Philadelphia HH median income by tract, 2016-2020",
    caption = "Data: US Census Bureau, ACS 5-year estimates",
    x="Change in Med HH Income (2020 dollars)", 
       y="Number of tracts") +
  theme_minimal()

Making a summary table

summaryTable <- allACS %>%
  summarize(mean_change_HH_Income = mean(change_med_HH_Income, na.rm = TRUE),
            med_change_HH_Income = median(change_med_HH_Income, na.rm = TRUE))

pander(summaryTable)
mean_change_HH_Income med_change_HH_Income
6012 4874

Comparing geographies

myTracts <- c("42101023500", 
              "42101023600", 
              "42101023700", 
              "42101025300", 
              "42101025400",
              "42101025500", 
              "42101025600", 
              "42101038800")

allACS <- allACS %>%
  mutate(mtAiry = ifelse(GEOID %in% myTracts, "MT AIRY", "REST OF PHILADELPHIA"))
summaryTable2 <- allACS %>%
  group_by(mtAiry) %>%
  summarize(mean_change_HH_Income = mean(change_med_HH_Income, na.rm = TRUE),
            med_change_HH_Income = median(change_med_HH_Income, na.rm = TRUE))

pander(summaryTable2)
mtAiry mean_change_HH_Income med_change_HH_Income
MT AIRY 6945 3115
REST OF PHILADELPHIA 5991 5073

Graphic comparisons Using ggplot2

ggplot(allACS)+
  geom_histogram(aes(change_med_HH_Income),
                 binwidth = 5000)+
  labs(
    title = "Change in Philadelphia HH median income by tract, 2016-2020",
    caption = "Data: US Census Bureau, ACS 5-year estimates",
    x="Change in Med HH Income (2020 dollars)", 
       y="Number of tracts")+
  facet_wrap(~mtAiry, scales = "free") + 
  theme_minimal()

ggplot(allACS)+
  geom_point(aes(x =med_HH_Income.2016 * 1.08, 
                 y = med_HH_Income.2020,
                 color = mtAiry))+
  geom_abline(intercept = 0, slope = 1)+
  labs(
    title = "2020 Median HH Income as a Function of 2016 Median HH Income",
    subtitle = "All figures in 2020 dollars",
    caption = "Data: US Census Bureau, ACS 5-year estimates",
    x="Med HH Income 2016 ($)", 
    y="Med HH Income 2020 ($)") +
  theme_minimal()
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(allACS)+
  geom_point(aes(x = 100* pctWhite.2020, 
                 y = med_HH_Income.2020,
                 color = mtAiry))+
  geom_smooth(aes(x = 100* pctWhite.2020, 
                  y = med_HH_Income.2020), 
              method = "lm", se = FALSE)+
  labs(
    title = "2020 Median HH Income as a Function of Pct White",
    subtitle = "All figures in 2020 dollars",
    caption = "Data: US Census Bureau, ACS 5-year estimates",
    x="Pct. Residents Identifying as 'White Only'", 
    y="Med HH Income 2020 ($)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 31 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_point()`).

Spatial Data and Tidycensus

acsTractsPHL.2020.sf <- get_acs(geography = "tract",
                             year = 2020, 
                             variables = acs_vars, 
                             geometry = TRUE, 
                             state = "PA", 
                             county = "Philadelphia", 
                             output = "wide") %>% 
  dplyr::select (GEOID, NAME, all_of(acs_vars)) %>%
  rename (total_pop.2020 = B01001_001E,
          total_HU.2020 = B25002_001E,
          total_vacant.2020 = B25002_003E,
          med_HH_Income.2020 = B19013_001E,
          total_White.2020 = B02001_002E,
          total_GradDeg.2020 = B06009_006E) %>%
  mutate(vacancyPct.2020 = total_vacant.2020/total_HU.2020,
         pctWhite.2020 = total_White.2020/total_pop.2020) %>%
  mutate(mtAiry = ifelse(GEOID %in% myTracts, "MT AIRY", "REST OF PHILADELPHIA")) %>%
  st_as_sf(crs = 4326) # Turn shp into sf object and project as WGS84
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
##   |                                                                              |                                                                      |   0%  |                                                                              |=                                                                     |   1%  |                                                                              |=                                                                     |   2%  |                                                                              |==                                                                    |   3%  |                                                                              |===                                                                   |   4%  |                                                                              |====                                                                  |   5%  |                                                                              |====                                                                  |   6%  |                                                                              |=====                                                                 |   7%  |                                                                              |=====                                                                 |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |  10%  |                                                                              |=========                                                             |  13%  |                                                                              |===========                                                           |  15%  |                                                                              |===========                                                           |  16%  |                                                                              |============                                                          |  18%  |                                                                              |==============                                                        |  20%  |                                                                              |===============                                                       |  22%  |                                                                              |================                                                      |  23%  |                                                                              |===================                                                   |  28%  |                                                                              |======================                                                |  32%  |                                                                              |==========================                                            |  37%  |                                                                              |===========================                                           |  39%  |                                                                              |==============================                                        |  43%  |                                                                              |===============================                                       |  44%  |                                                                              |=================================                                     |  47%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================                                  |  51%  |                                                                              |======================================                                |  54%  |                                                                              |======================================                                |  55%  |                                                                              |========================================                              |  57%  |                                                                              |===========================================                           |  62%  |                                                                              |=============================================                         |  64%  |                                                                              |==============================================                        |  65%  |                                                                              |================================================                      |  68%  |                                                                              |===================================================                   |  73%  |                                                                              |=====================================================                 |  75%  |                                                                              |=======================================================               |  79%  |                                                                              |=========================================================             |  82%  |                                                                              |============================================================          |  85%  |                                                                              |=============================================================         |  87%  |                                                                              |===============================================================       |  91%  |                                                                              |===================================================================   |  96%  |                                                                              |===================================================================== |  99%  |                                                                              |======================================================================| 100%

represent the boundary of Mt. Airy

ggplot()+
  geom_sf(data = acsTractsPHL.2020.sf, aes(fill = pctWhite.2020),
          color = "transparent")+
  geom_sf(data = acsTractsPHL.2020.sf %>%
            filter(mtAiry == "MT AIRY") %>%
            st_union(),
          color = "white",
          fill = "transparent")+
  labs(
    title = "Percentage of those identifying as 'white only' by tract",
    subtitle = "",
    caption = "Data: US Census Bureau, ACS 5-year estimates") +
  theme_void()

Homework Assignment

Q1: Create a ggplot: Change in Vacant Housing Units (2016-2020)

ggplot(allACS, aes(x = total_vacant.2016, y = total_vacant.2020, color = mtAiry)) +
  geom_point(size = 2) +
  scale_color_manual(values = c("#6495ED", "#8B4513"), name = "Mt. Airy") +
  labs(
    x = "Vacant Housing Units in 2016",
    y = "Vacant Housing Units in 2020",
    title = "Change in Vacant Housing Units from 2016 to 2020"
  ) +
  theme_minimal()
## Warning: Removed 21 rows containing missing values or values outside the scale range
## (`geom_point()`).

Q2: Create a map: ousing Vacancy Rates by Tract in Philadelphia (2016)

# Calculate vacancy percentage for 2016
acsTractsPHL.2016 <- acsTractsPHL.2016 %>%
  mutate(vacancyPct.2016 = total_vacant.2016 / total_HU.2016 * 100)
acsTractsPHL.2016.sf <- get_acs(geography = "tract",
                             year = 2016, 
                             variables = acs_vars, 
                             geometry = TRUE, 
                             state = "PA", 
                             county = "Philadelphia", 
                             output = "wide") %>% 
  dplyr::select (GEOID, NAME, all_of(acs_vars)) %>%
  rename (total_pop.2016 = B01001_001E,
          total_HU.2016 = B25002_001E,
          total_vacant.2016 = B25002_003E,
          med_HH_Income.2016 = B19013_001E,
          total_White.2016 = B02001_002E,
          total_GradDeg.2016 = B06009_006E) %>%
  mutate(vacancyPct.2016 = total_vacant.2016/total_HU.2016,
         pctWhite.2016 = total_White.2016/total_pop.2016) %>%
  mutate(mtAiry = ifelse(GEOID %in% myTracts, "MT AIRY", "REST OF PHILADELPHIA")) %>%
  st_as_sf(crs = 4326) # Turn shp into sf object and project as WGS84
## Getting data from the 2012-2016 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   2%  |                                                                              |=====                                                                 |   6%  |                                                                              |========                                                              |  12%  |                                                                              |===========                                                           |  16%  |                                                                              |===============                                                       |  21%  |                                                                              |===================                                                   |  27%  |                                                                              |======================                                                |  31%  |                                                                              |==========================                                            |  36%  |                                                                              |============================                                          |  40%  |                                                                              |=============================                                         |  42%  |                                                                              |=================================                                     |  47%  |                                                                              |====================================                                  |  52%  |                                                                              |=======================================                               |  56%  |                                                                              |========================================                              |  57%  |                                                                              |=========================================                             |  59%  |                                                                              |==========================================                            |  60%  |                                                                              |===========================================                           |  61%  |                                                                              |=============================================                         |  64%  |                                                                              |=================================================                     |  70%  |                                                                              |====================================================                  |  74%  |                                                                              |========================================================              |  80%  |                                                                              |========================================================              |  81%  |                                                                              |===========================================================           |  85%  |                                                                              |==============================================================        |  88%  |                                                                              |==============================================================        |  89%  |                                                                              |===============================================================       |  90%  |                                                                              |==================================================================    |  95%  |                                                                              |======================================================================| 100%
ggplot() +
  geom_sf(data = acsTractsPHL.2016.sf, 
          aes(fill = vacancyPct.2016), color = "transparent") +
  geom_sf(data = acsTractsPHL.2016.sf %>%
            filter(mtAiry == "MT AIRY") %>%
            st_union(),
          color = "yellow", 
          fill = "transparent") +
  scale_fill_viridis_c(option = "viridis", 
                       name = "Percentage of Vacant Homes") +  
  labs(
    title = "Housing Vacancy Rates by Tract in Philadelphia (2016)",
    subtitle = "",
    caption = "Data Sources: US Census Bureau, ACS 5-year estimates"
  ) +
  theme_void()

### The above map highlights the vacancy rates across Philadelphia tracts in the year of 2016. Mt. Airy is outlined in yellow. The analysis implies that Mt. Airy has lower vacancy rates compared to other areas in the city, which could reflect its relatively stable housing market.

Q3: a summary of mean number of vacant housing units per tract for Mt. Airy vs. the rest Philadelphia as a whole in 2020

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(dplyr)
summary_table <- acsTractsPHL.2020.sf %>%
  st_drop_geometry() %>%
  group_by(mtAiry) %>%
  summarise(mean_vacant_units = mean(total_vacant.2020, na.rm = TRUE))

# Summarize with a kable table
kable(summary_table, caption = "Mean Number of Vacant Housing Units per Tract (2020)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Mean Number of Vacant Housing Units per Tract (2020)
mtAiry mean_vacant_units
MT AIRY 156.6250
REST OF PHILADELPHIA 186.8575