library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(stringr)
# read + clean names
df0 <- readr::read_csv("/home/rstudio/data/Form Responses.csv", show_col_types = FALSE) |>
janitor::clean_names()
## New names:
## • `` -> `...32`
## • `` -> `...33`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
coalesce_cols <- function(.data, cols, fill = NA_character_) {
have <- dplyr::select(.data, dplyr::any_of(cols))
if (ncol(have) == 0) return(rep(fill, nrow(.data)))
if (ncol(have) == 1) return(have[[1]])
dplyr::coalesce(!!!have)
}
# build unified fields safely
df <- df0 |>
mutate(
# raw date string from whichever column exists
date_raw = coalesce_cols(cur_data(), c(
"date","incident_date","date_of_incident","datetime","occurred_date","occured_date"
)),
# parse many common formats
date = suppressWarnings(lubridate::parse_date_time(
date_raw,
orders = c("Ymd","Y-m-d","mdY","m/d/Y","m/d/y","Ymd HMS","Y-m-d H:M:S","m/d/Y H:M:S","mdy HMS")
)),
year = lubridate::year(date),
race = coalesce_cols(cur_data(), c("race","race_ethnicity","ethnicity")) |> stringr::str_to_title(),
gender = coalesce_cols(cur_data(), c("gender","sex")),
state = coalesce_cols(cur_data(), c("state","state_abbrev","state_code")) |> toupper(),
city = coalesce_cols(cur_data(), c("city","location","city_town")),
lat = suppressWarnings(as.numeric(coalesce_cols(cur_data(), c("latitude","lat")))),
lon = suppressWarnings(as.numeric(coalesce_cols(cur_data(), c("longitude","lng","long")))),
armed = coalesce_cols(cur_data(), c("armed","weapon","armed_status")),
mental_health = coalesce_cols(cur_data(), c("mental_health","mental_illness","behavioral_crisis"))
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_raw = coalesce_cols(...)`.
## Caused by warning:
## ! `cur_data()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
library(lubridate)
library(dplyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(stringr)
pick_date <- function(dat) {
# candidates: any column name containing "date" or "time" or "timestamp"
cand <- names(dat)[str_detect(names(dat), "(date|time|stamp)")]
if (length(cand) == 0) return(NULL)
scores <- sapply(cand, function(x){
v <- suppressWarnings(parse_date_time(dat[[x]],
orders = c("Ymd HMS","Y-m-d H:M:S",
"mdy HMS","mdY","m/d/Y","m/d/y","Y-m-d","Ymd")))
mean(!is.na(v))
})
best <- cand[which.max(scores)]
if (length(best) == 0 || is.na(scores[best]) || scores[best] == 0) return(NULL)
tibble(col = best,
parsed = suppressWarnings(parse_date_time(dat[[best]],
orders = c("Ymd HMS","Y-m-d H:M:S",
"mdy HMS","mdY","m/d/Y","m/d/y","Y-m-d","Ymd"))))
}
picked <- pick_date(df0)
if (!is.null(picked)) {
df <- df0 |> mutate(date = picked$parsed, year = year(date))
p_trend <- df |>
filter(!is.na(year)) |>
count(year) |>
ggplot(aes(year, n)) +
geom_line(linewidth = 1) + geom_point() +
scale_y_continuous(labels = comma) +
labs(title = "Police killings by year", x = NULL, y = "Deaths") +
theme_minimal(base_size = 14)
p_trend
}
p_race <- df |>
filter(!is.na(race)) |>
mutate(race = fct_lump_n(str_to_title(race), n = 6)) |>
count(race) |>
mutate(pct = n / sum(n)) |>
ggplot(aes(fct_reorder(race, pct), pct)) +
geom_col(fill="#5B8DEF") +
coord_flip() +
scale_y_continuous(labels=percent) +
labs(title="Share of deaths by race", x=NULL, y="Percent of cases") +
theme_minimal(base_size = 14)
p_race
# collapse text columns per row for keyword search
text_cols <- df0 |> select(where(is.character))
row_txt <- if (ncol(text_cols) > 0) {
text_cols |> mutate(.row = row_number()) |>
tidyr::unite("_txt", tidyselect::everything(), sep = " | ", na.rm = TRUE) |> pull(`_txt`) |> tolower()
} else rep("", nrow(df0))
df <- df |> mutate( #I try to label data into unarmed or mental-related incidents
unarmed_flag = str_detect(row_txt, "\\bunarmed\\b|no weapon|none reported"),
mental_flag = str_detect(row_txt, "mental|behavior|psychiatr|crisis|autism|suicid")
)
circ_tab <- tibble(
status = c("Unarmed (keyword)", "Mental-health flag (keyword)"),
n = c(sum(df$unarmed_flag, na.rm = TRUE),
sum(df$mental_flag, na.rm = TRUE))
) |> mutate(pct = n/sum(n))
p_circ <- ggplot(circ_tab, aes(fct_reorder(status, pct), pct)) +
geom_col(fill = "#F28E2B") +
coord_flip() +
scale_y_continuous(labels = percent) +
labs(title = "Reported circumstances (keyword scan across all text fields)",
x = NULL, y = "Percent of cases") +
theme_minimal(base_size = 14)
p_circ
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Questions that I am investigating:
Are some demographic/context factors associated with a victim being unarmed? Are the same factors associated with a mental-health flag?
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## # A tibble: 112 × 6
## model term estimate conf.low conf.high p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Pr(MH flag) stateNE 3.10 1.77 5.69 1.37e- 4
## 2 Pr(MH flag) stateVT 2.75 1.20 6.26 1.57e- 2
## 3 Pr(MH flag) stateDC 0.385 0.107 1.09 9.71e- 2
## 4 Pr(MH flag) race_catRace Unspecified 2.56 2.32 2.83 1.88e-78
## 5 Pr(MH flag) stateNH 2.46 1.17 5.18 1.72e- 2
## 6 Pr(MH flag) statePA 2.27 1.37 3.95 2.27e- 3
## 7 Pr(MH flag) race_catEuropean-American/W… 1.99 1.81 2.20 2.68e-44
## 8 Pr(MH flag) stateVA 1.95 1.16 3.43 1.51e- 2
## 9 Pr(MH flag) stateTX 1.94 1.19 3.35 1.16e- 2
## 10 Pr(MH flag) stateME 1.89 0.972 3.73 6.32e- 2
## # ℹ 102 more rows
Questions that I am investigating: Which factors help explain where, and when more incidents occur?
This builds a state–year panel and fits a Poisson (or quasi-Poisson if over dispersed). Even without population offsets, it’s still useful for relative patterns.
## # A tibble: 6 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -74.7 2.70 -27.7 9.96e-127
## 2 stateAL 1.69 0.142 11.9 8.08e- 31
## 3 stateAR 1.11 0.150 7.38 3.26e- 13
## 4 stateAZ 1.96 0.139 14.1 2.96e- 41
## 5 stateCA 3.65 0.132 27.7 1.50e-126
## 6 stateCO 1.68 0.142 11.8 1.88e- 30
I explored “Form Responses” to ask who, where, and when police involved deaths occur, and whether patterns persist after adjusting for place and time.
When: A weekday–month heat map shows a clear summer bulge and weekend lift, peaking on Fridays/Saturdays. That timing is consistent with higher contact volumes rather than random noise.
Where: A state choropleth highlights large raw counts in California, Texas, and Florida, with additional Sun Belt clusters and lighter counts in parts of New England and the upper Midwest. These are counts, not per-capita rates, but they point to where prevention capacity should be concentrated.
Who: By composition, White victims are the largest share by count, followed by Unspecified, Black, and Hispanic/Latino. Because the plot is adjusted, it should be read alongside modeling and population context.
Circumstances: A keyword scan across all text fields suggests a substantial share flagged “unarmed” and another sizable share with mental-health cues precisely the situations where alternative response (co-response or behavioral-health teams) is most plausible.
Modeling: A state–year Poisson confirms big, persistent between-state differences after holding time constant. At the incident level, two logistic models clarify who is involved in sensitive contexts: - Mental-health flag (odds ratios): higher in several states (e.g., NE, VT, NH, PA, VA, TX), and higher for men and with age. Relative to the reference race (Black), odds are higher for White and Race Unspecified. - Unarmed (odds ratios): several states show >2× odds (e.g., ND, DE, CT, HI, MS, LA, MD). Men have lower odds of being unarmed, odds decline with age, and are lower for White and Hispanic/Latino than the Black reference.
Bottom line: The data point to place-specific concentration, summer/weekend surges, and many incidents in unarmed or mental-health contexts. If I could move one lever, I’d prioritize alternative crisis response and supervision in geographic hot spots, paired with transparent tracking to see if those interventions reduce both the counts and the most preventable circumstances.