Mini_Assignment

Set up

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(stringr)

# read + clean names
df0 <- readr::read_csv("/home/rstudio/data/Form Responses.csv", show_col_types = FALSE) |>
  janitor::clean_names()

## New names:
## • `` -> `...32`
## • `` -> `...33`

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

coalesce_cols <- function(.data, cols, fill = NA_character_) {
  have <- dplyr::select(.data, dplyr::any_of(cols))
  if (ncol(have) == 0) return(rep(fill, nrow(.data)))
  if (ncol(have) == 1) return(have[[1]])
  dplyr::coalesce(!!!have)
}

# build unified fields safely
df <- df0 |>
  mutate(
    # raw date string from whichever column exists
    date_raw = coalesce_cols(cur_data(), c(
      "date","incident_date","date_of_incident","datetime","occurred_date","occured_date"
    )),
    # parse many common formats
    date = suppressWarnings(lubridate::parse_date_time(
      date_raw,
      orders = c("Ymd","Y-m-d","mdY","m/d/Y","m/d/y","Ymd HMS","Y-m-d H:M:S","m/d/Y H:M:S","mdy HMS")
    )),
    year   = lubridate::year(date),

    race   = coalesce_cols(cur_data(), c("race","race_ethnicity","ethnicity")) |> stringr::str_to_title(),
    gender = coalesce_cols(cur_data(), c("gender","sex")),

    state  = coalesce_cols(cur_data(), c("state","state_abbrev","state_code")) |> toupper(),
    city   = coalesce_cols(cur_data(), c("city","location","city_town")),

    lat = suppressWarnings(as.numeric(coalesce_cols(cur_data(), c("latitude","lat")))),
    lon = suppressWarnings(as.numeric(coalesce_cols(cur_data(), c("longitude","lng","long")))),

    armed          = coalesce_cols(cur_data(), c("armed","weapon","armed_status")),
    mental_health  = coalesce_cols(cur_data(), c("mental_health","mental_illness","behavioral_crisis"))
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_raw = coalesce_cols(...)`.
## Caused by warning:
## ! `cur_data()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.

Data Visualizations

library(lubridate)
library(dplyr)
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

library(stringr)

pick_date <- function(dat) {
  # candidates: any column name containing "date" or "time" or "timestamp"
  cand <- names(dat)[str_detect(names(dat), "(date|time|stamp)")]
  if (length(cand) == 0) return(NULL)

  scores <- sapply(cand, function(x){
    v <- suppressWarnings(parse_date_time(dat[[x]],
                                          orders = c("Ymd HMS","Y-m-d H:M:S",
                                                     "mdy HMS","mdY","m/d/Y","m/d/y","Y-m-d","Ymd")))
    mean(!is.na(v))
  })
  best <- cand[which.max(scores)]
  if (length(best) == 0 || is.na(scores[best]) || scores[best] == 0) return(NULL)
  tibble(col = best,
         parsed = suppressWarnings(parse_date_time(dat[[best]],
                                                   orders = c("Ymd HMS","Y-m-d H:M:S",
                                                              "mdy HMS","mdY","m/d/Y","m/d/y","Y-m-d","Ymd"))))
}

picked <- pick_date(df0)

if (!is.null(picked)) {
  df <- df0 |> mutate(date = picked$parsed, year = year(date))
  p_trend <- df |>
    filter(!is.na(year)) |>
    count(year) |>
    ggplot(aes(year, n)) +
    geom_line(linewidth = 1) + geom_point() +
    scale_y_continuous(labels = comma) +
    labs(title = "Police killings by year", x = NULL, y = "Deaths") +
    theme_minimal(base_size = 14)
  p_trend
}

p_race <- df |>
  filter(!is.na(race)) |>
  mutate(race = fct_lump_n(str_to_title(race), n = 6)) |>
  count(race) |>
  mutate(pct = n / sum(n)) |>
  ggplot(aes(fct_reorder(race, pct), pct)) +
  geom_col(fill="#5B8DEF") +
  coord_flip() +
  scale_y_continuous(labels=percent) +
  labs(title="Share of deaths by race", x=NULL, y="Percent of cases") +
  theme_minimal(base_size = 14)
p_race

# collapse text columns per row for keyword search
text_cols <- df0 |> select(where(is.character))
row_txt <- if (ncol(text_cols) > 0) {
  text_cols |> mutate(.row = row_number()) |>
    tidyr::unite("_txt", tidyselect::everything(), sep = " | ", na.rm = TRUE) |> pull(`_txt`) |> tolower()
} else rep("", nrow(df0))

df <- df |> mutate( #I try to label data into unarmed or mental-related incidents
  unarmed_flag = str_detect(row_txt, "\\bunarmed\\b|no weapon|none reported"),
  mental_flag  = str_detect(row_txt, "mental|behavior|psychiatr|crisis|autism|suicid")
)

circ_tab <- tibble(
  status = c("Unarmed (keyword)", "Mental-health flag (keyword)"),
  n = c(sum(df$unarmed_flag, na.rm = TRUE),
        sum(df$mental_flag,  na.rm = TRUE))
) |> mutate(pct = n/sum(n))

p_circ <- ggplot(circ_tab, aes(fct_reorder(status, pct), pct)) +
  geom_col(fill = "#F28E2B") +
  coord_flip() +
  scale_y_continuous(labels = percent) +
  labs(title = "Reported circumstances (keyword scan across all text fields)",
       x = NULL, y = "Percent of cases") +
  theme_minimal(base_size = 14)
p_circ

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Logistic models at the incident level

Questions that I am investigating:

Are some demographic/context factors associated with a victim being unarmed? Are the same factors associated with a mental-health flag?

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## # A tibble: 112 × 6
##    model       term                         estimate conf.low conf.high  p.value
##    <chr>       <chr>                           <dbl>    <dbl>     <dbl>    <dbl>
##  1 Pr(MH flag) stateNE                         3.10     1.77       5.69 1.37e- 4
##  2 Pr(MH flag) stateVT                         2.75     1.20       6.26 1.57e- 2
##  3 Pr(MH flag) stateDC                         0.385    0.107      1.09 9.71e- 2
##  4 Pr(MH flag) race_catRace Unspecified        2.56     2.32       2.83 1.88e-78
##  5 Pr(MH flag) stateNH                         2.46     1.17       5.18 1.72e- 2
##  6 Pr(MH flag) statePA                         2.27     1.37       3.95 2.27e- 3
##  7 Pr(MH flag) race_catEuropean-American/W…    1.99     1.81       2.20 2.68e-44
##  8 Pr(MH flag) stateVA                         1.95     1.16       3.43 1.51e- 2
##  9 Pr(MH flag) stateTX                         1.94     1.19       3.35 1.16e- 2
## 10 Pr(MH flag) stateME                         1.89     0.972      3.73 6.32e- 2
## # ℹ 102 more rows

Count model at the state–year level

Questions that I am investigating: Which factors help explain where, and when more incidents occur?

This builds a state–year panel and fits a Poisson (or quasi-Poisson if over dispersed). Even without population offsets, it’s still useful for relative patterns.

## # A tibble: 6 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   -74.7      2.70     -27.7  9.96e-127
## 2 stateAL         1.69     0.142     11.9  8.08e- 31
## 3 stateAR         1.11     0.150      7.38 3.26e- 13
## 4 stateAZ         1.96     0.139     14.1  2.96e- 41
## 5 stateCA         3.65     0.132     27.7  1.50e-126
## 6 stateCO         1.68     0.142     11.8  1.88e- 30

Narrative

I explored “Form Responses” to ask who, where, and when police involved deaths occur, and whether patterns persist after adjusting for place and time.

When: A weekday–month heat map shows a clear summer bulge and weekend lift, peaking on Fridays/Saturdays. That timing is consistent with higher contact volumes rather than random noise.

Where: A state choropleth highlights large raw counts in California, Texas, and Florida, with additional Sun Belt clusters and lighter counts in parts of New England and the upper Midwest. These are counts, not per-capita rates, but they point to where prevention capacity should be concentrated.

Who: By composition, White victims are the largest share by count, followed by Unspecified, Black, and Hispanic/Latino. Because the plot is adjusted, it should be read alongside modeling and population context.

Circumstances: A keyword scan across all text fields suggests a substantial share flagged “unarmed” and another sizable share with mental-health cues precisely the situations where alternative response (co-response or behavioral-health teams) is most plausible.

Modeling: A state–year Poisson confirms big, persistent between-state differences after holding time constant. At the incident level, two logistic models clarify who is involved in sensitive contexts: - Mental-health flag (odds ratios): higher in several states (e.g., NE, VT, NH, PA, VA, TX), and higher for men and with age. Relative to the reference race (Black), odds are higher for White and Race Unspecified. - Unarmed (odds ratios): several states show >2× odds (e.g., ND, DE, CT, HI, MS, LA, MD). Men have lower odds of being unarmed, odds decline with age, and are lower for White and Hispanic/Latino than the Black reference.

Bottom line: The data point to place-specific concentration, summer/weekend surges, and many incidents in unarmed or mental-health contexts. If I could move one lever, I’d prioritize alternative crisis response and supervision in geographic hot spots, paired with transparent tracking to see if those interventions reduce both the counts and the most preventable circumstances.

Mini_Assignment_5

Quan Duong

2025-10-18

Set up

Data Visualizations

Logistic models at the incident level

Count model at the state–year level

Narrative