# Clean environment
rm(list = ls())
# Load required packages
library(tidyverse) # Data wrangling, visualization
library(tidycensus) # Pull ACS data
library(lubridate) # Clean dates
library(skimr) # Conduct general data quality checksData Cleaning: Eviction Filings in Boston
Overview
This work sample demonstrates my approach to preparing datasets for quantitative analysis. The workflow integrates three data sources:
- Monthly eviction filing data from Princeton’s Eviction Lab (2020–2023)
- American Community Survey (ACS) 5-year estimates providing demographic and housing characteristics
- City of Boston 2020 census tract identifiers
The cleaning process involves pulling data via API, joining datasets across sources, reshaping and aggregating at multiple levels, creating derived variables, and conducting data quality checks.
The final analysis explores the relationship between neighborhood racial composition, median income, and eviction rates. The analysis is available here: https://rpubs.com/anavasconcelos/1367326
Step 1: Load Boston Census Tract Identifiers
This file identifies the 2020 census tracts that fall within Boston city boundaries.
boston_tracts <- read.csv("2020-census-tracts-in-boston (1).csv") %>%
mutate(GEOID = as.character(geoid20))
# Check # of observations
nrow(boston_tracts)[1] 207
Step 2: Pull Demographic Data from American Community Survey
I use the tidycensus package to retrieve tract-level estimates from the ACS 5-year data for Suffolk County, where Boston is located. I also create city-level aggregates and append them to the tract-level dataset for later comparisons.
# Load ACS variable names to identify correct codes
acs_vars <- load_variables(year = 2022, dataset = "acs5", cache = TRUE)
# Retrieve ACS demographic variables for Suffolk County tracts
demographic_by_tract <- get_acs(
geography = "tract",
variables = c(
pop_total = "B02001_001",
pop_black = "B02001_003",
median_income = "B19013_001",
unit_renter = "B25003_003",
median_rent = "B25064_001"
),
state = "MA",
county = "Suffolk",
output = "wide",
geometry = FALSE,
year = 2022
) %>%
select(-matches("M$")) %>% # Remove margin of error columns
mutate(pct_black = round((pop_blackE / pop_totalE) * 100, 2)) %>%
right_join(boston_tracts, by = "GEOID") %>% # Subset to Boston tracts only
select(
GEOID,
unit_renter = unit_renterE,
median_income = median_incomeE,
median_rent = median_rentE,
pct_black,
pop_total = pop_totalE,
pop_black = pop_blackE
)
# Create city-level aggregates for tract-to-city comparisons
demographic_boston <- demographic_by_tract %>%
summarize(
median_income = mean(median_income, na.rm = TRUE),
median_rent = mean(median_rent, na.rm = TRUE),
pop_total = sum(pop_total),
pop_black = sum(pop_black),
unit_renter = sum(unit_renter),
pct_black = sum(pop_black) / sum(pop_total)
) %>%
mutate(GEOID = "Boston")
# Append city-level row to tract-level data
demographic_final <- rbind(demographic_boston, demographic_by_tract)
# Check # of observations
nrow(demographic_final)[1] 208
Step 3: Load and Wrangle Eviction Lab Data
The Princeton Eviction Lab provides monthly filing counts by census tract. I aggregate to yearly and total counts, then reshape to calculate change over time. I also calculate city-level aggregates for tract-to-city comparisons.
# Load monthly eviction data
eviction <- read.csv("boston_monthly_2020_2023.csv") %>%
filter(GEOID != "sealed") %>% # Remove sealed/redacted tracts
right_join(boston_tracts, by = "GEOID") %>% # Subset to Boston tracts
mutate(
month = mdy(paste("01", month)), # Convert "mm-yyyy" to date
year = year(month)
) %>%
select(GEOID, year, filings = filings_2020)
# Aggregate filings across entire city (for city-level comparisons)
eviction_boston <- eviction %>%
group_by(year) %>%
summarize(filings = sum(filings, na.rm = TRUE), .groups = "drop") %>%
mutate(GEOID = "Boston")
# Combine tract and city-level data
eviction_all <- rbind(eviction, eviction_boston)
# Calculate total filings by tract (2020–2023)
eviction_by_GEOID <- eviction_all %>%
group_by(GEOID) %>%
summarize(total_filings = sum(filings, na.rm = TRUE), .groups = "drop")
# Reshape to wide format to calculate change in filings
eviction_wide <- eviction_all %>%
group_by(GEOID, year) %>%
summarize(filings = sum(filings, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(
names_from = year,
values_from = filings,
names_prefix = "filings_"
) %>%
mutate(change = filings_2023 - filings_2020) %>%
select(GEOID, change)
# Merge total filings with change variable
eviction_final <- inner_join(eviction_by_GEOID, eviction_wide, by = "GEOID")
# Check # of observations
nrow(eviction_final)[1] 208
Step 4: Merge Datasets and Create Outcome Variable
merged_df <- eviction_final %>%
inner_join(demographic_final, by = "GEOID") %>%
filter(unit_renter != 0) %>% # Remove tracts with no renters
mutate(
filings_pct = round((total_filings / unit_renter) * 100, 2)
)
# Ensure there are duplicate tracts after merging, check # of observations
merged_df %>% count(GEOID) %>% filter(n > 1)# A tibble: 0 × 2
# ℹ 2 variables: GEOID <chr>, n <int>
nrow(merged_df)[1] 195
Step 5: Data Quality Checks
# Summary statistics
skim(merged_df)| Name | merged_df |
| Number of rows | 195 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| GEOID | 0 | 1 | 6 | 11 | 0 | 195 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| total_filings | 0 | 1.00 | 120.64 | 840.16 | 0 | 17.00 | 40.00 | 89.50 | 11765.00 | ▇▁▁▁▁ |
| change | 0 | 1.00 | 23.63 | 164.92 | -17 | 2.00 | 6.00 | 17.00 | 2304.00 | ▇▁▁▁▁ |
| median_income | 4 | 0.98 | 95378.01 | 47159.81 | 18125 | 60611.00 | 86250.00 | 124984.50 | 237188.00 | ▆▇▆▂▁ |
| median_rent | 6 | 0.97 | 2018.06 | 657.52 | 486 | 1627.00 | 2059.00 | 2382.00 | 3501.00 | ▂▃▇▅▂ |
| pop_total | 0 | 1.00 | 6810.44 | 47470.06 | 30 | 2351.50 | 3230.00 | 4337.50 | 665945.00 | ▇▁▁▁▁ |
| pop_black | 0 | 1.00 | 1536.14 | 10744.44 | 0 | 70.50 | 258.00 | 1026.00 | 150002.00 | ▇▁▁▁▁ |
| unit_renter | 0 | 1.00 | 1844.79 | 12822.94 | 4 | 583.50 | 874.00 | 1225.50 | 179867.00 | ▇▁▁▁▁ |
| pct_black | 0 | 1.00 | 19.37 | 21.99 | 0 | 3.02 | 8.83 | 32.70 | 79.75 | ▇▂▂▁▁ |
| filings_pct | 0 | 1.00 | 8.67 | 32.16 | 0 | 2.27 | 4.92 | 9.91 | 450.00 | ▇▁▁▁▁ |
# Visualize distribution to identify outliers
ggplot(merged_df, aes(x = filings_pct)) +
geom_histogram()ggplot(merged_df, aes(x = median_income)) +
geom_histogram()ggplot(merged_df, aes(x = pct_black)) +
geom_histogram()# Remove tracts with missing median income (small-population tracts suppressed by ACS)
merged_df <- merged_df %>%
filter(!is.na(median_income))
# Remove outlier - one tract has 450% filing rate due to small tract population
merged_df <- merged_df %>%
filter(filings_pct != 450)
# Check # of observations
nrow(merged_df)[1] 191
Step 6: Create Categorical Variables for Analysis
I create indicator variables to rank census tracts by tertiles relative to each other, as well as binary indicators comparing each tract to the city average.
# Extract city-level values for comparison
boston_filings_pct <- merged_df %>%
filter(GEOID == "Boston") %>%
pull(filings_pct)
boston_median_income <- merged_df %>%
filter(GEOID == "Boston") %>%
pull(median_income)
boston_pct_black <- merged_df %>%
filter(GEOID == "Boston") %>%
pull(pct_black)
# Create final analysis dataset with categorical variables
final_df <- merged_df %>%
filter(GEOID != "Boston") %>%
mutate(
# Tertile rankings
black_tertile = factor(ntile(pct_black, 3),
labels = c("Low", "Middle", "High")),
income_tertile = factor(ntile(median_income, 3),
labels = c("Low", "Middle", "High")),
filings_tertile = factor(ntile(filings_pct, 3),
labels = c("Low", "Middle", "High")),
# Above/below city average indicators
filings_pct_rank = ifelse(filings_pct > boston_filings_pct,
"Above City Average", "Below City Average"),
median_income_rank = ifelse(median_income > boston_median_income,
"Above City Average", "Below City Average"),
pct_black_rank = ifelse(pct_black > boston_pct_black,
"Above City Average", "Below City Average")
)Step 7: Final Inspection of New Variables
To validate that the derived variables were created correctly, I check whether eviction filing rates vary in the expected directions. (i.e., are eviction filings rate higher in poorer communities?)
# Tertile variables -
#Race
final_df %>%
group_by(black_tertile) %>%
summarize(mean = mean(filings_pct))# A tibble: 3 × 2
black_tertile mean
<fct> <dbl>
1 Low 3.13
2 Middle 4.50
3 High 11.8
#Income
final_df %>%
group_by(income_tertile) %>%
summarize(mean = mean(filings_pct))# A tibble: 3 × 2
income_tertile mean
<fct> <dbl>
1 Low 9.27
2 Middle 6.42
3 High 3.63
#City ranking variables
#Race
final_df %>%
group_by(pct_black_rank) %>%
summarize(mean(filings_pct))# A tibble: 2 × 2
pct_black_rank `mean(filings_pct)`
<chr> <dbl>
1 Above City Average 6.74
2 Below City Average 1.80
#Income
final_df %>%
group_by(median_income_rank) %>%
summarize(mean(filings_pct))# A tibble: 2 × 2
median_income_rank `mean(filings_pct)`
<chr> <dbl>
1 Above City Average 4.06
2 Below City Average 8.35
Final Dataset
The cleaned dataset contains 190 census tracts with the following variables ready for analysis:
filings_pct: Eviction filings per 100 renter-occupied units (2020–2023)change: Change in annual filings between 2020 and 2023median_income,median_rent,pct_black: Tract demographic characteristics- Tertile and city-comparison indicators for stratified analysis