Introduction

We will analyzes New York City shooting incidents, to understand where, when and who.

Data Import & Overview

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
nypd <- read_csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD") %>%
  mutate(occur_date = mdy(OCCUR_DATE),
         occur_hour = hour(as.POSIXct(OCCUR_TIME, format="%H:%M")))
## Rows: 29744 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (12): OCCUR_DATE, BORO, LOC_OF_OCCUR_DESC, LOC_CLASSFCTN_DESC, LOCATION...
## dbl   (5): INCIDENT_KEY, PRECINCT, JURISDICTION_CODE, Latitude, Longitude
## num   (2): X_COORD_CD, Y_COORD_CD
## lgl   (1): STATISTICAL_MURDER_FLAG
## time  (1): OCCUR_TIME
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(nypd)
## Rows: 29,744
## Columns: 23
## $ INCIDENT_KEY            <dbl> 231974218, 177934247, 255028563, 25384540, 726…
## $ OCCUR_DATE              <chr> "08/09/2021", "04/07/2018", "12/02/2022", "11/…
## $ OCCUR_TIME              <time> 01:06:00, 19:48:00, 22:57:00, 01:50:00, 01:58…
## $ BORO                    <chr> "BRONX", "BROOKLYN", "BRONX", "BROOKLYN", "BRO…
## $ LOC_OF_OCCUR_DESC       <chr> NA, NA, "OUTSIDE", NA, NA, NA, NA, NA, NA, NA,…
## $ PRECINCT                <dbl> 40, 79, 47, 66, 46, 42, 71, 69, 75, 69, 40, 42…
## $ JURISDICTION_CODE       <dbl> 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0…
## $ LOC_CLASSFCTN_DESC      <chr> NA, NA, "STREET", NA, NA, NA, NA, NA, NA, NA, …
## $ LOCATION_DESC           <chr> NA, NA, "GROCERY/BODEGA", "PVT HOUSE", "MULTI …
## $ STATISTICAL_MURDER_FLAG <lgl> FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, F…
## $ PERP_AGE_GROUP          <chr> NA, "25-44", "(null)", "UNKNOWN", "25-44", "18…
## $ PERP_SEX                <chr> NA, "M", "(null)", "U", "M", "M", NA, NA, "M",…
## $ PERP_RACE               <chr> NA, "WHITE HISPANIC", "(null)", "UNKNOWN", "BL…
## $ VIC_AGE_GROUP           <chr> "18-24", "25-44", "25-44", "18-24", "<18", "18…
## $ VIC_SEX                 <chr> "M", "M", "M", "M", "F", "M", "M", "M", "M", "…
## $ VIC_RACE                <chr> "BLACK", "BLACK", "BLACK", "BLACK", "BLACK", "…
## $ X_COORD_CD              <dbl> 1006343.0, 1000082.9, 1020691.0, 985107.3, 100…
## $ Y_COORD_CD              <dbl> 234270.0, 189064.7, 257125.0, 173349.8, 247502…
## $ Latitude                <dbl> 40.80967, 40.68561, 40.87235, 40.64249, 40.845…
## $ Longitude               <dbl> -73.92019, -73.94291, -73.86823, -73.99691, -7…
## $ Lon_Lat                 <chr> "POINT (-73.92019278899994 40.80967347200004)"…
## $ occur_date              <date> 2021-08-09, 2018-04-07, 2022-12-02, 2006-11-1…
## $ occur_hour              <int> 1, 19, 22, 1, 1, 21, 22, 23, 15, 15, 0, 2, 22,…

Shooting Counts by Borough and Year

nypd %>%
  mutate(year = year(occur_date)) %>%
  count(BORO, year) %>%
  ggplot(aes(x = year, y = n, fill = BORO)) +
    geom_area(alpha=0.7) +
    labs(title = "Annual Shooting Incidents by Borough",
         x = "Year", y = "Number of Incidents", fill = "Borough") +
    theme_minimal()

Hourly Distribution Across Boroughs

nypd %>%
  filter(!is.na(occur_hour)) %>%
  ggplot(aes(x = occur_hour, fill = BORO)) +
    geom_histogram(binwidth = 1, position = "dodge") +
    labs(title = "Shootings by Hour and Borough",
         x = "Hour of Day", y = "Incident Count") +
    theme_minimal()

Location Types: Where Do Shootings Happen?

nypd %>%
  filter(!is.na(LOCATION_DESC)) %>%
  count(LOCATION_DESC, sort=TRUE) %>%
  top_n(10, n) %>%
  ggplot(aes(x = reorder(LOCATION_DESC, n), y = n)) +
    geom_col(fill = "firebrick") +
    coord_flip() +
    labs(title = "Top 10 Locations for Shootings",
         x = "Location", y = "Incident Count") +
    theme_minimal()

Demographics: Victims and Suspects

# Victim Age and Race
nypd %>%
  filter(!is.na(VIC_AGE_GROUP), !is.na(VIC_RACE)) %>%
  count(VIC_AGE_GROUP, VIC_RACE) %>%
  ggplot(aes(x = VIC_AGE_GROUP, y = n, fill = VIC_RACE)) +
    geom_col(position = "dodge") +
    labs(title = "Victim Age Group by Race",
         x = "Age Group", y = "Count", fill = "Race") +
    theme_minimal()

# Suspect Sex and Race
nypd %>%
  filter(!is.na(PERP_SEX), !is.na(PERP_RACE)) %>%
  count(PERP_SEX, PERP_RACE) %>%
  ggplot(aes(x = PERP_SEX, y = n, fill = PERP_RACE)) +
    geom_col(position = "dodge") +
    labs(title = "Suspect Sex by Race",
         x = "Sex", y = "Count", fill = "Race") +
    theme_minimal()

Relationship Between % Young PERP and Shooting Counts by Precinct

# Calculate total shootings and % young perp by precinct
precinct_summary <- nypd %>%
  group_by(PRECINCT) %>%
  summarize(
    total_shootings = n(),
    pct_perp_young = mean(PERP_AGE_GROUP == "18-24", na.rm = TRUE) * 100
  )
str(precinct_summary)
## tibble [77 × 3] (S3: tbl_df/tbl/data.frame)
##  $ PRECINCT       : num [1:77] 1 5 6 7 9 10 13 14 17 18 ...
##  $ total_shootings: int [1:77] 29 74 29 127 128 76 64 69 10 48 ...
##  $ pct_perp_young : num [1:77] 30.8 34.3 12 32.2 34 ...
# Model total shootings as response, pct_perp_young as predictor
reg_shoot <- lm(total_shootings ~ pct_perp_young, data = precinct_summary)
summary(reg_shoot)
## 
## Call:
## lm(formula = total_shootings ~ pct_perp_young, data = precinct_summary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -490.6 -278.6 -109.8  120.0 1285.4 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      -1.123    153.627  -0.007   0.9942  
## pct_perp_young   12.969      4.940   2.625   0.0105 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 374.9 on 75 degrees of freedom
## Multiple R-squared:  0.08416,    Adjusted R-squared:  0.07195 
## F-statistic: 6.892 on 1 and 75 DF,  p-value: 0.01049
ggplot(precinct_summary, aes(x = pct_perp_young, y = total_shootings)) +
  geom_point(color = "dodgerblue", size = 2) +
  geom_smooth(method = "lm", color = "firebrick", se = TRUE) +
  labs(
    title = "Relationship Between % Young PERP and Shooting Counts by Precinct",
    x = "Percent of Perp Age 18-24",
    y = "Total Shootings"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Spatial Hotspots

library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
# Load map of New York's boroughs and convert to spatial data frame
boundary <- read_csv("https://data.cityofnewyork.us/api/views/63ge-mke6/rows.csv?accessType=DOWNLOAD")
## Rows: 2325 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): the_geom, BoroName, CT2020, CDEligibil, NTAName, NTA2020, CDTA2020,...
## dbl (6): CTLabel, BoroCode, BoroCT2020, GEOID, Shape_Length, Shape_Area
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
boundary_sf <- st_as_sf(boundary, wkt = "the_geom", crs = 4326)

# Exclude missing coordinates
loc_data <- nypd %>% filter(!is.na(Longitude), !is.na(Latitude))

ggplot() +
  # Plot NYC boundaries first
  geom_sf(data = boundary_sf, fill = NA, color = "grey40", size = 0.5) +
  # Then add your density layer
  stat_density_2d(
    data = loc_data,
    aes(x = Longitude, y = Latitude, fill = after_stat(level)),
    geom = "polygon",
    alpha = 0.6,
    show.legend = FALSE
  ) +
  scale_fill_viridis_c() +
  theme_minimal() +
  labs(title = "Density of Shooting Incidents in NYC",
       x = "Longitude", y = "Latitude")

Change Over Time

nypd %>%
  group_by(year = year(occur_date)) %>%
  summarize(annual_incidents = n()) %>%
  ggplot(aes(x = year, y = annual_incidents)) +
    geom_line(color = "navy", linewidth=1.2) +
    geom_point(color = "navy") +
    labs(title = "Yearly Shooting Trends in NYC", x = "Year", y = "Shooting Incidents") +
    theme_minimal()

Limitations and Bias

Our data and results may be influenced by reporting delays, changes in NYPD definitions, and missing demographic/location details. It’s possible my analytical focus on quantitative summaries may overlook context (such as policy changes or events impacting violence rates). To address this, I included qualitative interpretation alongside each visualization and analyzed multiple perspectives, not just aggregate or time-based trends.

Conclusion

Spatial Hotspots: Specific boroughs and precincts consistently report higher shooting densities, especially in the Bronx and Brooklyn. Density mapping over NYC boundaries reveals clear urban clusters where shootings are concentrated.

Temporal Patterns: Shootings display distinct trends over the years, with notable changes in incident counts and variations by hour of day. Recent data may show decreases after public policy interventions or increased policing, though further investigation is warranted.

Location Context: Most shootings occur in or near public housing, apartments, or street-level settings, highlighting the role of neighborhood environments.

Demographics: Young adults (age 18–24) constitute a substantial proportion of both victims and suspects; racial disparities persist, with certain groups bearing disproportionate impact.

Regression Analysis: Precincts with a higher proportion of young suspects tend to record more shootings. The relationship is statistically significant but explains only a small fraction of the variation, suggesting other structural factors are at play.

# Session Info

``` r
# Display R session information
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggmap_4.0.2     sf_1.0-21       lubridate_1.9.4 forcats_1.0.1  
##  [5] stringr_1.5.1   dplyr_1.1.4     purrr_1.1.0     readr_2.1.5    
##  [9] tidyr_1.3.1     tibble_3.3.0    ggplot2_4.0.0   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.52          bslib_0.9.0        lattice_0.22-7    
##  [5] tzdb_0.5.0         vctrs_0.6.5        tools_4.5.1        bitops_1.0-9      
##  [9] generics_0.1.4     curl_7.0.0         parallel_4.5.1     proxy_0.4-27      
## [13] pkgconfig_2.0.3    Matrix_1.7-3       KernSmooth_2.23-26 RColorBrewer_1.1-3
## [17] S7_0.2.0           lifecycle_1.0.4    compiler_4.5.1     farver_2.1.2      
## [21] htmltools_0.5.8.1  class_7.3-23       sass_0.4.10        yaml_2.3.10       
## [25] pillar_1.11.1      crayon_1.5.3       jquerylib_0.1.4    MASS_7.3-65       
## [29] classInt_0.4-11    cachem_1.1.0       nlme_3.1-168       tidyselect_1.2.1  
## [33] digest_0.6.37      stringi_1.8.7      labeling_0.4.3     splines_4.5.1     
## [37] fastmap_1.2.0      grid_4.5.1         cli_3.6.5          magrittr_2.0.3    
## [41] e1071_1.7-16       withr_3.0.2        scales_1.4.0       bit64_4.6.0-1     
## [45] timechange_0.3.0   rmarkdown_2.29     httr_1.4.7         jpeg_0.1-11       
## [49] bit_4.6.0          png_0.1-8          hms_1.1.3          evaluate_1.0.4    
## [53] knitr_1.50         viridisLite_0.4.2  mgcv_1.9-3         rlang_1.1.6       
## [57] isoband_0.2.7      Rcpp_1.1.0         glue_1.8.0         DBI_1.2.3         
## [61] rstudioapi_0.17.1  vroom_1.6.6        jsonlite_2.0.0     plyr_1.8.9        
## [65] R6_2.6.1           units_1.0-0