This analysis examines fatal encounters in the United States between 2000 to 2021, exploring patterns across temporal, demographic, and geographic dimensions.
library(tidycensus)
library(sf)
library(tmap)
library(tidyverse)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(classInt)
library(lubridate)
library(readxl)
library(forcats)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
fe_data<-read.csv("FatalEncounters.csv")
names(fe_data)
fe_data$County <- str_to_lower(fe_data$Location.of.death..county.)
fe_data$State <- str_to_lower(fe_data$State)
head(fe_data$Date.of.injury.resulting.in.death..month.day.year.)
fe_data <- fe_data %>%
mutate(Year = year(as.Date(Date.of.injury.resulting.in.death..month.day.year.,
format = "%m/%d/%Y")))
fe_data <- fe_data %>%
filter(!is.na(County), !is.na(State), !is.na(Year), !is.na(Race), !is.na(Gender),
!str_detect(Race, "Race unspecified|Christopher Anthony Alexander")) %>%
select('Unique.ID','Name','Age','Gender','Race','Date.of.injury.resulting.in.death..month.day.year.',
'Full.Address','Location.of.death..city.','State','Location.of.death..zip.code.',
'Location.of.death..county.','Latitude','Longitude','Year')
3.Plot 1: Time Series Graph - We categorize races
into broader groups, aggregate fatalities per year, and visualize trends
over time.
Insight: White, Black and Hispanic populations stand out having
experienced highest fatality counts.
fe_data <- fe_data %>%
mutate(
Race_category = case_when(
str_detect(Race, "African") ~ "Black",
str_detect(Race, "Asian") ~ "Asian",
str_detect(Race, "European|White") ~ "White",
str_detect(Race, "Hispanic|Latino") ~ "Hispanic/Latino",
str_detect(Race, "Native") ~ "Native American/Alaskan",
str_detect(Race, "Middle") ~ "Middle Eastern",
TRUE ~ "Other"
)
)
names(fe_data)
fe_race <- fe_data %>%
group_by(Year) %>%
summarise(Total = n(),
Black = sum(Race_category == "Black"),
White = sum(Race_category == "White"),
Hispanic_Latino = sum(Race_category =="Hispanic/Latino"),
Asian = sum(Race_category== "Asian"),
Native_American_Alaskan= sum(Race_category== "Native American/Alaskan"),
Middle_Eastern = sum(Race_category== "Middle Eastern"),
.groups = "drop"
)
fe_race_long <- fe_race %>%
pivot_longer(
cols = c(Total, Black:Middle_Eastern),
names_to = "Race",
values_to = "Count"
)
category_order <- fe_race_long %>%
group_by(Race) %>%
summarise(total = sum(Count, na.rm = TRUE)) %>%
arrange(desc(total)) %>%
pull(Race)
fe_race_long$Race <- factor(fe_race_long$Race, levels = category_order)
# Plot
ggplot(fe_race_long, aes(x = Year, y = Count, color = Race)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "Fatal Encounters by Race (2000 to 2015)",
x = "Year",
y = "Number of Fatalities",
color = "Race"
)+
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
)
4.Plot 2: Stacked Bar Chart - We aggregate
fatalities by gender within the top three racial groups. Cleaning gender
entries ensures accurate classification. The stacked bar chart shows the
relative contribution of male and female fatalities in each race.
Insight: Males dominate the fatality counts across all top racial
groups, with Black and White males showing the highest numbers
# Gender distribution aggregated across all years for top 3 races
fe_gender_race_total <- fe_data %>%
mutate(
Gender = str_trim(Gender),
Gender = na_if(Gender, ""),
Gender = na_if(Gender, "NULL"),
Gender = str_to_title(Gender)
) %>%
filter(Race_category %in% c("White", "Black", "Hispanic/Latino"),
!is.na(Gender)) %>%
group_by(Race_category, Gender) %>%
summarise(Fatalities = n(), .groups = "drop")
# Create stacked bar chart
ggplot(fe_gender_race_total, aes(x = Race_category, y = Fatalities, fill = Gender)) +
geom_bar(stat = "identity", position = "stack") +
theme_minimal(base_size = 13) +
labs(
title = "Fatal Encounters by Gender and Race",
x = "Race Category",
y = "Number of Fatalities",
fill = "Gender"
) +
scale_fill_brewer(palette = "Pastel1") +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
)
5.Plot 3: Facetted histogram - We plot age distribution for fatalities in the top three races. Insight:Most fatalities occur between ages 20–40. Median age varies slightly by race
fe_age_hist <- fe_data %>%
filter(Race_category %in% c("White", "Black", "Hispanic/Latino"), !is.na(Age)) %>%
mutate(Age = as.numeric(Age))
# Calculate median age per race category
medians <- fe_age_hist %>%
group_by(Race_category) %>%
summarise(median_age = median(Age, na.rm = TRUE), .groups = "drop")
# Plot histogram with median lines
ggplot(fe_age_hist, aes(x = Age, fill = Race_category)) +
geom_histogram(binwidth = 5, color = "white", alpha = 0.7) +
geom_vline(data = medians, aes(xintercept = median_age),
color = "red", linetype = "solid", size = 1) +
facet_wrap(~Race_category, nrow = 1) +
theme_minimal(base_size = 13) +
labs(
title = "Fatal Encounters by Age and Race",
x = "Age",
y = "Number of Fatalities",
) +
scale_fill_brewer(palette = "Accent") +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
)
6.Getting population data from file - We read population data from the given data set to further calculate fatality rates across differents states in the country.
# Load state populations
excel_sheets("FatalEncounters.xlsx")
state_pop <- read_excel(
path = "FatalEncounters.xlsx",
sheet = "State Abbreviations and Populat"
)
names(state_pop)
state_pop <- state_pop %>%
select('Abbreviation','Name','Population (2015 est.)',
'Population (2010)','Population (2005)', 'Population (2000)') %>%
rename(State = Abbreviation) %>%
mutate(State = str_to_lower(State))
names(state_pop)
state_pop <- state_pop %>%
rename(
"2015" = "Population (2015 est.)",
"2010" = "Population (2010)",
"2005" = "Population (2005)",
"2000" = "Population (2000)"
)
7.Plot 4: Choropleth Map - We calculate fatalities per million population for the year 2015 as that is the latest year for which the dataset includes state-wise population figures. Then we map the rates to visualize state-level differences.
# reshape population data
state_pop_long<- state_pop %>%
pivot_longer(
cols = c("2000", "2005", "2010", "2015"),
names_to = "Year",
values_to = "total_population"
) %>%
select(State, Name, Year, total_population)
# Filter for 2015
fe_demo <- fe_data %>%
group_by(Year, State)%>%
summarise(Total = n())%>%
filter(Year %in% c(2015))
# Make sure columns are the same type
fe_demo <- fe_demo %>%
mutate(Year = as.numeric(Year))
state_pop_long <- state_pop_long %>%
mutate(Year = as.numeric(Year))
combined <- fe_demo %>%
inner_join(state_pop_long, by = c("State", "Year"))
combined <- combined %>%
mutate(
fatality_rate = (Total/ total_population) * 1000000
) %>%
filter(!is.na(total_population),!is.na(Year))
# Get U.S. states shapefile
us_states <- tigris::states(cb = TRUE) %>%
st_as_sf() %>%
mutate(State = tolower(STUSPS))
us_combined_map <- us_states %>%
left_join(combined, by = "State") %>%
filter(!is.na(Year), Year == 2015)
# Visualize
tmap_mode("plot")
tm_shape(us_combined_map) +
tm_polygons(
col = "fatality_rate",
palette = "reds",
style = "quantile",
title = "Fatalities per million population",
colorNA = "grey90",
textNA = "",
showNA = FALSE,
border.col = "white",
border.alpha = 0.5,
lwd = 0.5
) +
tm_layout(
title = "Fatality Rates by States in 2015",
title.size = 1,
title.position = c("bottom", "left"),
legend.outside = TRUE,
legend.outside.position = "right",
legend.outside.size = 0.2,
legend.title.size = 0.75,
legend.text.size = 0.5,
inner.margins = c(0.05, 0.05, 0.1, 0.05),
bg.color = "white"
)+
tm_crs("ESRI:102003")
6.Plot 5: Dot density map - We drill down to the top state in 2015 and map individual fatalities as red dots over the cities. Insight: Fatality hotspots are concentrated mainly in cities
# Top state by fatalities in 2015
top_state <- us_combined_map %>%
arrange(desc(fatality_rate)) %>%
slice_head(n = 1) %>%
pull(State)
top_state
top_state_name <- us_combined_map %>%
arrange(desc(fatality_rate)) %>%
slice_head(n = 1) %>%
pull(NAME)
top_state_name
# Filter data for that state
top_state_data <- fe_data %>%
filter(State == top_state, !is.na(Latitude), !is.na(Longitude))
# Convert to sf points
top_state_points <- st_as_sf(top_state_data, coords = c("Longitude", "Latitude"), crs = 4326)
state_boundary <- us_states %>%
filter(State == top_state)
# Get city/place boundaries for the state
state_cities <- tigris::places(state = toupper(top_state), cb = TRUE) %>%
st_as_sf()
# Plot dot density map with city boundaries
tmap_mode("plot")
tm_shape(state_boundary) +
tm_polygons(col = "white", border.col = "black", lwd = 1.5) +
tm_shape(state_cities) +
tm_polygons(col = "grey", border.col = "white", lwd = 1) +
tm_shape(top_state_points) +
tm_dots(size = 0.05, col = "red", alpha = 0.7, title = "Fatalities") +
tm_layout(
title = paste("Dot-density Map of Fatalities in 2015 in",top_state_name),
title.size = 1,
title.position = c("bottom"),
)
-Temporal Trends: In Plot 1, the data shows a concerning upward trend in reported fatal encounters, particularly after 2010. Fatalities have fluctuated over the years, with Black and White populations experiencing the highest numbers.
-Demographic Patterns: From Plots 1 and 2 we can infer that fatal encounters disproportionately affect young men in the top three affected race categories (White, Black and Hispanics) particularly in the 20-40 age range. Males account for the majority of fatalities in all high-fatality racial groups, indicating gendered risk differences.
-Geographic Distribution: In Plot 4, state-level analysis reveals that fatality rates vary dramatically across the United States, even after controlling for population differences. Western and Southern states show elevated rates. A clustering of similar rates among neighboring states implies that regional policy approaches or cultural factors may be at work.
-Urban Concentration: In Plot 5, the dot density
analysis confirms that fatal encounters predominantly occur in urban
environments where police-civilian interactions are most frequent.
Ultimately, these patterns show that fatal encounters are not random occurrences but follow discernible trends influenced by demographics, geography, and policy contexts. Recognizing these patterns offers hope, as it highlights opportunities for targeted interventions guided by spatial and demographic insights.