Data Cleaning: Eviction Filings in Boston

Author

Ana Vasconcelos

Published

December 6, 2025

Overview

This work sample demonstrates my approach to preparing datasets for quantitative analysis. The workflow integrates three data sources:

Monthly eviction filing data from Princeton’s Eviction Lab (2020–2023)
American Community Survey (ACS) 5-year estimates providing demographic and housing characteristics
City of Boston 2020 census tract identifiers

The cleaning process involves pulling data via API, joining datasets across sources, reshaping and aggregating at multiple levels, creating derived variables, and conducting data quality checks.

The final analysis explores the relationship between neighborhood racial composition, median income, and eviction rates. The analysis is available here: https://rpubs.com/anavasconcelos/1367326

# Clean environment
rm(list = ls())

# Load required packages
library(tidyverse)  # Data wrangling, visualization
library(tidycensus) # Pull ACS data
library(lubridate) # Clean dates
library(skimr) # Conduct general data quality checks

Step 1: Load Boston Census Tract Identifiers

This file identifies the 2020 census tracts that fall within Boston city boundaries.

boston_tracts <- read.csv("2020-census-tracts-in-boston (1).csv") %>%
  mutate(GEOID = as.character(geoid20))

# Check # of observations
nrow(boston_tracts)

[1] 207

Step 2: Pull Demographic Data from American Community Survey

I use the tidycensus package to retrieve tract-level estimates from the ACS 5-year data for Suffolk County, where Boston is located. I also create city-level aggregates and append them to the tract-level dataset for later comparisons.

# Load ACS variable names to identify correct codes
acs_vars <- load_variables(year = 2022, dataset = "acs5", cache = TRUE)

# Retrieve ACS demographic variables for Suffolk County tracts
demographic_by_tract <- get_acs(
  geography = "tract",
  variables = c(
    pop_total     = "B02001_001",
    pop_black     = "B02001_003",
    median_income = "B19013_001",
    unit_renter   = "B25003_003",
    median_rent   = "B25064_001"
  ),
  state = "MA",
  county = "Suffolk",
  output = "wide",
  geometry = FALSE,
  year = 2022
) %>%
  select(-matches("M$")) %>%                     # Remove margin of error columns
  mutate(pct_black = round((pop_blackE / pop_totalE) * 100, 2)) %>%
  right_join(boston_tracts, by = "GEOID") %>%   # Subset to Boston tracts only
  select(
    GEOID,
    unit_renter   = unit_renterE,
    median_income = median_incomeE,
    median_rent   = median_rentE,
    pct_black,
    pop_total     = pop_totalE,
    pop_black     = pop_blackE
  )

# Create city-level aggregates for tract-to-city comparisons
demographic_boston <- demographic_by_tract %>%
  summarize(
    median_income = mean(median_income, na.rm = TRUE),
    median_rent   = mean(median_rent, na.rm = TRUE),
    pop_total     = sum(pop_total),
    pop_black     = sum(pop_black),
    unit_renter   = sum(unit_renter),
    pct_black     = sum(pop_black) / sum(pop_total)
  ) %>%
  mutate(GEOID = "Boston")

# Append city-level row to tract-level data
demographic_final <- rbind(demographic_boston, demographic_by_tract)

# Check # of observations
nrow(demographic_final)

[1] 208

Step 3: Load and Wrangle Eviction Lab Data

The Princeton Eviction Lab provides monthly filing counts by census tract. I aggregate to yearly and total counts, then reshape to calculate change over time. I also calculate city-level aggregates for tract-to-city comparisons.

# Load monthly eviction data
eviction <- read.csv("boston_monthly_2020_2023.csv") %>%
  filter(GEOID != "sealed") %>%                 # Remove sealed/redacted tracts
  right_join(boston_tracts, by = "GEOID") %>%    # Subset to Boston tracts
  mutate(
    month = mdy(paste("01", month)),            # Convert "mm-yyyy" to date
    year  = year(month)
  ) %>%
  select(GEOID, year, filings = filings_2020)

# Aggregate filings across entire city (for city-level comparisons)
eviction_boston <- eviction %>%
  group_by(year) %>%
  summarize(filings = sum(filings, na.rm = TRUE), .groups = "drop") %>%
  mutate(GEOID = "Boston")

# Combine tract and city-level data
eviction_all <- rbind(eviction, eviction_boston)

# Calculate total filings by tract (2020–2023)
eviction_by_GEOID <- eviction_all %>%
  group_by(GEOID) %>%
  summarize(total_filings = sum(filings, na.rm = TRUE), .groups = "drop")

# Reshape to wide format to calculate change in filings
eviction_wide <- eviction_all %>%
  group_by(GEOID, year) %>%
  summarize(filings = sum(filings, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from   = year,
    values_from  = filings,
    names_prefix = "filings_"
  ) %>%
  mutate(change = filings_2023 - filings_2020) %>%
  select(GEOID, change)

# Merge total filings with change variable
eviction_final <- inner_join(eviction_by_GEOID, eviction_wide, by = "GEOID")

# Check # of observations
nrow(eviction_final)

[1] 208

Step 4: Merge Datasets and Create Outcome Variable

merged_df <- eviction_final %>%
  inner_join(demographic_final, by = "GEOID") %>%
  filter(unit_renter != 0) %>%                 # Remove tracts with no renters
  mutate(
    filings_pct = round((total_filings / unit_renter) * 100, 2)
  )

# Ensure there are duplicate tracts after merging, check # of observations
merged_df %>% count(GEOID) %>% filter(n > 1)

# A tibble: 0 × 2
# ℹ 2 variables: GEOID <chr>, n <int>

nrow(merged_df)

[1] 195

Step 5: Data Quality Checks

# Summary statistics
skim(merged_df)

Data summary
Name	merged_df
Number of rows	195
Number of columns	10
_______________________
Column type frequency:
character	1
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
GEOID	0	1	6	11	0	195	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
total_filings	0	1.00	120.64	840.16	0	17.00	40.00	89.50	11765.00	▇▁▁▁▁
change	0	1.00	23.63	164.92	-17	2.00	6.00	17.00	2304.00	▇▁▁▁▁
median_income	4	0.98	95378.01	47159.81	18125	60611.00	86250.00	124984.50	237188.00	▆▇▆▂▁
median_rent	6	0.97	2018.06	657.52	486	1627.00	2059.00	2382.00	3501.00	▂▃▇▅▂
pop_total	0	1.00	6810.44	47470.06	30	2351.50	3230.00	4337.50	665945.00	▇▁▁▁▁
pop_black	0	1.00	1536.14	10744.44	0	70.50	258.00	1026.00	150002.00	▇▁▁▁▁
unit_renter	0	1.00	1844.79	12822.94	4	583.50	874.00	1225.50	179867.00	▇▁▁▁▁
pct_black	0	1.00	19.37	21.99	0	3.02	8.83	32.70	79.75	▇▂▂▁▁
filings_pct	0	1.00	8.67	32.16	0	2.27	4.92	9.91	450.00	▇▁▁▁▁

# Visualize distribution to identify outliers
ggplot(merged_df, aes(x = filings_pct)) +
  geom_histogram()

ggplot(merged_df, aes(x = median_income)) +
  geom_histogram()

ggplot(merged_df, aes(x = pct_black)) +
  geom_histogram()

# Remove tracts with missing median income (small-population tracts suppressed by ACS)
merged_df <- merged_df %>%
  filter(!is.na(median_income))

# Remove outlier - one tract has 450% filing rate due to small tract population
merged_df <- merged_df %>%
  filter(filings_pct != 450)

# Check # of observations
nrow(merged_df)

[1] 191

Step 6: Create Categorical Variables for Analysis

I create indicator variables to rank census tracts by tertiles relative to each other, as well as binary indicators comparing each tract to the city average.

# Extract city-level values for comparison
boston_filings_pct   <- merged_df %>% 
                         filter(GEOID == "Boston") %>% 
                        pull(filings_pct)

boston_median_income <- merged_df %>%
                         filter(GEOID == "Boston") %>%
                         pull(median_income)

boston_pct_black <- merged_df %>% 
                      filter(GEOID == "Boston") %>% 
                      pull(pct_black)

# Create final analysis dataset with categorical variables
final_df <- merged_df %>%
  filter(GEOID != "Boston") %>%
  mutate(
    # Tertile rankings
    black_tertile   = factor(ntile(pct_black, 3), 
                             labels = c("Low", "Middle", "High")),
    income_tertile  = factor(ntile(median_income, 3), 
                             labels = c("Low", "Middle", "High")),
    filings_tertile = factor(ntile(filings_pct, 3), 
                             labels = c("Low", "Middle", "High")),
    
    # Above/below city average indicators
    filings_pct_rank   = ifelse(filings_pct > boston_filings_pct, 
                                "Above City Average", "Below City Average"),
    median_income_rank = ifelse(median_income > boston_median_income,
                                "Above City Average", "Below City Average"),
    pct_black_rank     = ifelse(pct_black > boston_pct_black,
                                "Above City Average", "Below City Average")
  )

Step 7: Final Inspection of New Variables

To validate that the derived variables were created correctly, I check whether eviction filing rates vary in the expected directions. (i.e., are eviction filings rate higher in poorer communities?)

# Tertile variables -
#Race
  final_df %>%
    group_by(black_tertile) %>%
   summarize(mean = mean(filings_pct))

# A tibble: 3 × 2
  black_tertile  mean
  <fct>         <dbl>
1 Low            3.13
2 Middle         4.50
3 High          11.8

#Income
  final_df %>%
    group_by(income_tertile) %>%
  summarize(mean = mean(filings_pct))

# A tibble: 3 × 2
  income_tertile  mean
  <fct>          <dbl>
1 Low             9.27
2 Middle          6.42
3 High            3.63

#City ranking variables
  
#Race
final_df %>%
  group_by(pct_black_rank) %>%
  summarize(mean(filings_pct))

# A tibble: 2 × 2
  pct_black_rank     `mean(filings_pct)`
  <chr>                            <dbl>
1 Above City Average                6.74
2 Below City Average                1.80

#Income
final_df %>%
  group_by(median_income_rank) %>%
  summarize(mean(filings_pct))

# A tibble: 2 × 2
  median_income_rank `mean(filings_pct)`
  <chr>                            <dbl>
1 Above City Average                4.06
2 Below City Average                8.35

Final Dataset

The cleaned dataset contains 190 census tracts with the following variables ready for analysis:

filings_pct: Eviction filings per 100 renter-occupied units (2020–2023)
change: Change in annual filings between 2020 and 2023
median_income, median_rent, pct_black: Tract demographic characteristics
Tertile and city-comparison indicators for stratified analysis