Introduction
This assignment aims to analyze fatal encounters data from this source https://fatalencounters.org/ by providing visualization to support the analysis. I’ve broken down the analysis into 3 parts including
- Demographic Analysis
- Temporal Analysis
- Geographical Analysis
Load Packages
library(tidyverse)
library(knitr)
library(skimr)
library(units)
library(ggplot2)
library(ggpubr)
library(lubridate)
library(tmap)
library(sf)
Load Data
data <- read.csv("fatal_encountors_data.csv")
glimpse(data)
## Rows: 31,498
## Columns: 35
## $ Unique.ID <int> 31495, 31496, 3…
## $ Name <chr> "Ashley McClend…
## $ Age <chr> "28", "", "", "…
## $ Gender <chr> "Female", "Fema…
## $ Race <chr> "African-Americ…
## $ Race.with.imputations <chr> "African-Americ…
## $ Imputation.probability <chr> "Not imputed", …
## $ URL.of.image..PLS.NO.HOTLINKS. <chr> "https://fatale…
## $ Date.of.injury.resulting.in.death..month.day.year. <chr> "12/31/2021", "…
## $ Location.of.injury..address. <chr> "South Pearl St…
## $ Location.of.death..city. <chr> "Pageland", "Me…
## $ State <chr> "SC", "MS", "MS…
## $ Location.of.death..zip.code. <int> 29728, 39301, 3…
## $ Location.of.death..county. <chr> "Chesterfield",…
## $ Full.Address <chr> "South Pearl St…
## $ Latitude <chr> "34.7452955", "…
## $ Longitude <dbl> -80.39306, -88.…
## $ Agency.or.agencies.involved <chr> "Pageland Polic…
## $ Highest.level.of.force <chr> "Vehicle", "Gun…
## $ UID.Temporary <int> NA, NA, NA, NA,…
## $ Name.Temporary <chr> "", "", "", "",…
## $ Armed.Unarmed <chr> "", "", "", "",…
## $ Alleged.weapon <chr> "", "", "", "",…
## $ Aggressive.physical.movement <chr> "", "", "", "",…
## $ Fleeing.Not.fleeing <chr> "", "", "", "",…
## $ Description.Temp <chr> "", "", "", "",…
## $ URL.Temp <chr> "", "", "", "",…
## $ Brief.description <chr> "Ashley McClend…
## $ Dispositions.Exclusions.INTERNAL.USE..NOT.FOR.ANALYSIS <chr> "Criminal", "Pe…
## $ Intended.use.of.force..Developing. <chr> "Pursuit", "Dea…
## $ Supporting.document.link <chr> "https://www.ws…
## $ X <lgl> NA, NA, NA, NA,…
## $ X.1 <int> NA, NA, NA, NA,…
## $ Unique.ID.formula <int> NA, NA, NA, NA,…
## $ Unique.identifier..redundant. <int> 31495, 31496, 3…
Convert the column names to camel cases
library(stringr)
# Function to convert to Camel Case
toCamelCase <- function(x) {
x <- gsub("\\.", " ", x)
x <- tools::toTitleCase(x)
x <- gsub(" ", "", x)
x <- tolower(substr(x, 1, 1)) %>%
paste0(substr(x, 2, nchar(x)))
return(x)
}
colnames(data) <- sapply(colnames(data), toCamelCase)
# View the new cleaned column names
colnames(data)
## [1] "uniqueID"
## [2] "name"
## [3] "age"
## [4] "gender"
## [5] "race"
## [6] "racewithImputations"
## [7] "imputationProbability"
## [8] "uRLofImagePLSNOHOTLINKS"
## [9] "dateofInjuryResultinginDeathMonthDayYear"
## [10] "locationofInjuryAddress"
## [11] "locationofDeathCity"
## [12] "state"
## [13] "locationofDeathZipCode"
## [14] "locationofDeathCounty"
## [15] "fullAddress"
## [16] "latitude"
## [17] "longitude"
## [18] "agencyorAgenciesInvolved"
## [19] "highestLevelofForce"
## [20] "uIDTemporary"
## [21] "nameTemporary"
## [22] "armedUnarmed"
## [23] "allegedWeapon"
## [24] "aggressivePhysicalMovement"
## [25] "fleeingnotFleeing"
## [26] "descriptionTemp"
## [27] "uRLTemp"
## [28] "briefDescription"
## [29] "dispositionsExclusionsINTERNALUSEnotforANALYSIS"
## [30] "intendedUseofForceDeveloping"
## [31] "supportingDocumentLink"
## [32] "x"
## [33] "x1"
## [34] "uniqueIDFormula"
## [35] "uniqueIdentifierRedundant"
Demographic Analysis
# Make the age column to be number
data$age <- as.numeric(data$age)
## Warning: NAs introduced by coercion
# Impute missing race or empty string with unspecified and make them lower (to avoid case inconsistency)
data <- data %>%
mutate(racewithImputations = ifelse(is.na(racewithImputations) | racewithImputations == '', "Race unspecified", racewithImputations)) %>%
mutate(racewithImputations = tolower(racewithImputations))
# Remove rows with missing Age or Gender
data_clean <- data %>%
filter(!is.na(age) & gender %in% c("Male", "Female"))
# Create a boxplot of Age distribution by Gender
ggplot(data_clean, aes(x = gender, y = age)) +
geom_boxplot(outlier.size = 0.2) +
theme_minimal() +
labs(title = "Age Distribution by Gender", x = "Gender", y = "Age") +
theme()
# Calculate the race distribution and sort by count in descending order
race_distribution <- data %>%
group_by(racewithImputations) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
mutate(Percentage = round((Count / sum(Count)) * 100, 1))
# Plot horizontal bar chart of counts by races in descending order
ggplot(race_distribution, aes(x = reorder(racewithImputations, Count), y = Count)) +
geom_bar(stat = "identity", color = "#555555") +
geom_text(aes(label = paste0(Count, " (", Percentage, "%)")), hjust = -0.1, color = "black") +
scale_y_continuous(expand = c(0, 0), limits = c(0, max(race_distribution$Count) * 1.3)) +
theme_minimal() +
labs(title = "Race Distribution of Fatal Encounters", x = "Race", y = "Count") +
theme(axis.text.x = element_text(hjust = 1), legend.position = "none") +
coord_flip()
# Create a box plot of age by race
ggplot(data_clean, aes(x = racewithImputations, y = age)) +
geom_boxplot(outlier.size = 0.2) +
theme_minimal() +
labs(title = "Age Distribution by Race", x = "Race", y = "Age") +
theme() +
coord_flip()
Temporal Analysis
Trend of Incident
# Cast date columns to date type
data$dateofInjuryResultinginDeathMonthDayYear <- as.Date(data$dateofInjuryResultinginDeathMonthDayYear, format = "%m/%d/%Y")
# Extract year and month
data$Year <- year(data$dateofInjuryResultinginDeathMonthDayYear)
data$Month <- month(data$dateofInjuryResultinginDeathMonthDayYear, label = TRUE)
# Plot trends over time
state_trend <- data %>%
group_by(Year) %>%
summarise(Count = n())
ggplot(state_trend, aes(x = as.numeric(Year), y = Count)) +
geom_line() +
theme_minimal() +
labs(title = "Trends of Fatal Encounters by State Over Time", x = "Year", y = "Number of Incidents")
Monthly Patterns
# Group by year and month, count number of cases per month
monthly_trend <- data %>%
group_by(Year, Month) %>%
summarise(Count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# Plot the monthly trend, compare by year to see if there is any trend
ggplot(monthly_trend, aes(x = Month, y = Count, color = factor(Year), group = Year)) +
geom_line(size = 1) +
theme_minimal() +
labs(title = "Monthly Trend of Fatal Encounters by Year", x = "Month", y = "Number of Incidents", color = "Year") +
theme(axis.text.x = element_text(hjust = 1)) # Tilt the month labels for better readability
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Top 10 States with Increasing Rate of Incident
# Analyze trend by state, selecting only top 10 states having the most increasing rate
# Group by State and Year, count the number of incidents per state per year
state_year_trend <- data %>%
group_by(state, Year) %>%
summarise(Count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
# Calculate the slope (trend) for each state using linear regression
slopes <- state_year_trend %>%
group_by(state) %>%
do(model = lm(Count ~ Year, data = .)) %>%
mutate(Slope = coef(model)[2]) %>%
ungroup() %>%
select(state, Slope)
# Get the top 10 states with the highest rates
top_10_states <- slopes %>%
arrange(desc(Slope)) %>%
top_n(10, Slope)
# Filter the top 10 states
top_state_year_trend <- state_year_trend %>%
filter(state %in% top_10_states$state)
# Plot the trends for the top 10 states
ggplot(top_state_year_trend, aes(x = Year, y = Count, color = state, group = state)) +
geom_point(size = 1) +
geom_smooth(method = "lm", se = FALSE, size = 0.75) +
theme_minimal() +
labs(title = "Trend of Fatal Encounters in Top 10 States with Highest Rates",
x = "Year", y = "Number of Incidents", color = "State") +
theme(axis.text.x = element_text(hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'
Geographical Analysis
Visualize Location of Incidents
data$latitude <- as.numeric(data$latitude)
## Warning: NAs introduced by coercion
data$longitude <- as.numeric(data$longitude)
map_data_clean <- data %>%
filter(!is.na(latitude)) %>%
filter(!is.na(longitude))
map_data_sf <- st_as_sf(map_data_clean, coords = c("longitude", "latitude"), crs = 4326)
tmap_mode("view")
## tmap mode set to interactive viewing
# Plot the map with tmap
tm_shape(map_data_sf) +
tm_dots(col = "highestLevelofForce",
size = 0.01,
palette = "Set1",
title = "Highest Level of Force",
alpha = 0.7) +
tm_basemap(server = "OpenStreetMap") +
tm_layout(title = "Map of Fatal Encounters",
legend.outside = TRUE)