We will analyzes New York City shooting incidents, to understand where, when and who.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
nypd <- read_csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD") %>%
mutate(occur_date = mdy(OCCUR_DATE),
occur_hour = hour(as.POSIXct(OCCUR_TIME, format="%H:%M")))
## Rows: 29744 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): OCCUR_DATE, BORO, LOC_OF_OCCUR_DESC, LOC_CLASSFCTN_DESC, LOCATION...
## dbl (5): INCIDENT_KEY, PRECINCT, JURISDICTION_CODE, Latitude, Longitude
## num (2): X_COORD_CD, Y_COORD_CD
## lgl (1): STATISTICAL_MURDER_FLAG
## time (1): OCCUR_TIME
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(nypd)
## Rows: 29,744
## Columns: 23
## $ INCIDENT_KEY <dbl> 231974218, 177934247, 255028563, 25384540, 726…
## $ OCCUR_DATE <chr> "08/09/2021", "04/07/2018", "12/02/2022", "11/…
## $ OCCUR_TIME <time> 01:06:00, 19:48:00, 22:57:00, 01:50:00, 01:58…
## $ BORO <chr> "BRONX", "BROOKLYN", "BRONX", "BROOKLYN", "BRO…
## $ LOC_OF_OCCUR_DESC <chr> NA, NA, "OUTSIDE", NA, NA, NA, NA, NA, NA, NA,…
## $ PRECINCT <dbl> 40, 79, 47, 66, 46, 42, 71, 69, 75, 69, 40, 42…
## $ JURISDICTION_CODE <dbl> 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0…
## $ LOC_CLASSFCTN_DESC <chr> NA, NA, "STREET", NA, NA, NA, NA, NA, NA, NA, …
## $ LOCATION_DESC <chr> NA, NA, "GROCERY/BODEGA", "PVT HOUSE", "MULTI …
## $ STATISTICAL_MURDER_FLAG <lgl> FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, F…
## $ PERP_AGE_GROUP <chr> NA, "25-44", "(null)", "UNKNOWN", "25-44", "18…
## $ PERP_SEX <chr> NA, "M", "(null)", "U", "M", "M", NA, NA, "M",…
## $ PERP_RACE <chr> NA, "WHITE HISPANIC", "(null)", "UNKNOWN", "BL…
## $ VIC_AGE_GROUP <chr> "18-24", "25-44", "25-44", "18-24", "<18", "18…
## $ VIC_SEX <chr> "M", "M", "M", "M", "F", "M", "M", "M", "M", "…
## $ VIC_RACE <chr> "BLACK", "BLACK", "BLACK", "BLACK", "BLACK", "…
## $ X_COORD_CD <dbl> 1006343.0, 1000082.9, 1020691.0, 985107.3, 100…
## $ Y_COORD_CD <dbl> 234270.0, 189064.7, 257125.0, 173349.8, 247502…
## $ Latitude <dbl> 40.80967, 40.68561, 40.87235, 40.64249, 40.845…
## $ Longitude <dbl> -73.92019, -73.94291, -73.86823, -73.99691, -7…
## $ Lon_Lat <chr> "POINT (-73.92019278899994 40.80967347200004)"…
## $ occur_date <date> 2021-08-09, 2018-04-07, 2022-12-02, 2006-11-1…
## $ occur_hour <int> 1, 19, 22, 1, 1, 21, 22, 23, 15, 15, 0, 2, 22,…
nypd %>%
mutate(year = year(occur_date)) %>%
count(BORO, year) %>%
ggplot(aes(x = year, y = n, fill = BORO)) +
geom_area(alpha=0.7) +
labs(title = "Annual Shooting Incidents by Borough",
x = "Year", y = "Number of Incidents", fill = "Borough") +
theme_minimal()
nypd %>%
filter(!is.na(occur_hour)) %>%
ggplot(aes(x = occur_hour, fill = BORO)) +
geom_histogram(binwidth = 1, position = "dodge") +
labs(title = "Shootings by Hour and Borough",
x = "Hour of Day", y = "Incident Count") +
theme_minimal()
nypd %>%
filter(!is.na(LOCATION_DESC)) %>%
count(LOCATION_DESC, sort=TRUE) %>%
top_n(10, n) %>%
ggplot(aes(x = reorder(LOCATION_DESC, n), y = n)) +
geom_col(fill = "firebrick") +
coord_flip() +
labs(title = "Top 10 Locations for Shootings",
x = "Location", y = "Incident Count") +
theme_minimal()
# Victim Age and Race
nypd %>%
filter(!is.na(VIC_AGE_GROUP), !is.na(VIC_RACE)) %>%
count(VIC_AGE_GROUP, VIC_RACE) %>%
ggplot(aes(x = VIC_AGE_GROUP, y = n, fill = VIC_RACE)) +
geom_col(position = "dodge") +
labs(title = "Victim Age Group by Race",
x = "Age Group", y = "Count", fill = "Race") +
theme_minimal()
# Suspect Sex and Race
nypd %>%
filter(!is.na(PERP_SEX), !is.na(PERP_RACE)) %>%
count(PERP_SEX, PERP_RACE) %>%
ggplot(aes(x = PERP_SEX, y = n, fill = PERP_RACE)) +
geom_col(position = "dodge") +
labs(title = "Suspect Sex by Race",
x = "Sex", y = "Count", fill = "Race") +
theme_minimal()
# Calculate total shootings and % young perp by precinct
precinct_summary <- nypd %>%
group_by(PRECINCT) %>%
summarize(
total_shootings = n(),
pct_perp_young = mean(PERP_AGE_GROUP == "18-24", na.rm = TRUE) * 100
)
str(precinct_summary)
## tibble [77 × 3] (S3: tbl_df/tbl/data.frame)
## $ PRECINCT : num [1:77] 1 5 6 7 9 10 13 14 17 18 ...
## $ total_shootings: int [1:77] 29 74 29 127 128 76 64 69 10 48 ...
## $ pct_perp_young : num [1:77] 30.8 34.3 12 32.2 34 ...
# Model total shootings as response, pct_perp_young as predictor
reg_shoot <- lm(total_shootings ~ pct_perp_young, data = precinct_summary)
summary(reg_shoot)
##
## Call:
## lm(formula = total_shootings ~ pct_perp_young, data = precinct_summary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -490.6 -278.6 -109.8 120.0 1285.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.123 153.627 -0.007 0.9942
## pct_perp_young 12.969 4.940 2.625 0.0105 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 374.9 on 75 degrees of freedom
## Multiple R-squared: 0.08416, Adjusted R-squared: 0.07195
## F-statistic: 6.892 on 1 and 75 DF, p-value: 0.01049
ggplot(precinct_summary, aes(x = pct_perp_young, y = total_shootings)) +
geom_point(color = "dodgerblue", size = 2) +
geom_smooth(method = "lm", color = "firebrick", se = TRUE) +
labs(
title = "Relationship Between % Young PERP and Shooting Counts by Precinct",
x = "Percent of Perp Age 18-24",
y = "Total Shootings"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Spatial Hotspots
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service>
## OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
# Load map of New York's boroughs and convert to spatial data frame
boundary <- read_csv("https://data.cityofnewyork.us/api/views/63ge-mke6/rows.csv?accessType=DOWNLOAD")
## Rows: 2325 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): the_geom, BoroName, CT2020, CDEligibil, NTAName, NTA2020, CDTA2020,...
## dbl (6): CTLabel, BoroCode, BoroCT2020, GEOID, Shape_Length, Shape_Area
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
boundary_sf <- st_as_sf(boundary, wkt = "the_geom", crs = 4326)
# Exclude missing coordinates
loc_data <- nypd %>% filter(!is.na(Longitude), !is.na(Latitude))
ggplot() +
# Plot NYC boundaries first
geom_sf(data = boundary_sf, fill = NA, color = "grey40", size = 0.5) +
# Then add your density layer
stat_density_2d(
data = loc_data,
aes(x = Longitude, y = Latitude, fill = after_stat(level)),
geom = "polygon",
alpha = 0.6,
show.legend = FALSE
) +
scale_fill_viridis_c() +
theme_minimal() +
labs(title = "Density of Shooting Incidents in NYC",
x = "Longitude", y = "Latitude")
nypd %>%
group_by(year = year(occur_date)) %>%
summarize(annual_incidents = n()) %>%
ggplot(aes(x = year, y = annual_incidents)) +
geom_line(color = "navy", linewidth=1.2) +
geom_point(color = "navy") +
labs(title = "Yearly Shooting Trends in NYC", x = "Year", y = "Shooting Incidents") +
theme_minimal()
Our data and results may be influenced by reporting delays, changes in NYPD definitions, and missing demographic/location details. It’s possible my analytical focus on quantitative summaries may overlook context (such as policy changes or events impacting violence rates). To address this, I included qualitative interpretation alongside each visualization and analyzed multiple perspectives, not just aggregate or time-based trends.
Spatial Hotspots: Specific boroughs and precincts consistently report higher shooting densities, especially in the Bronx and Brooklyn. Density mapping over NYC boundaries reveals clear urban clusters where shootings are concentrated.
Temporal Patterns: Shootings display distinct trends over the years, with notable changes in incident counts and variations by hour of day. Recent data may show decreases after public policy interventions or increased policing, though further investigation is warranted.
Location Context: Most shootings occur in or near public housing, apartments, or street-level settings, highlighting the role of neighborhood environments.
Demographics: Young adults (age 18–24) constitute a substantial proportion of both victims and suspects; racial disparities persist, with certain groups bearing disproportionate impact.
Regression Analysis: Precincts with a higher proportion of young suspects tend to record more shootings. The relationship is statistically significant but explains only a small fraction of the variation, suggesting other structural factors are at play.
# Session Info
``` r
# Display R session information
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Chicago
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggmap_4.0.2 sf_1.0-21 lubridate_1.9.4 forcats_1.0.1
## [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.1.0 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.52 bslib_0.9.0 lattice_0.22-7
## [5] tzdb_0.5.0 vctrs_0.6.5 tools_4.5.1 bitops_1.0-9
## [9] generics_0.1.4 curl_7.0.0 parallel_4.5.1 proxy_0.4-27
## [13] pkgconfig_2.0.3 Matrix_1.7-3 KernSmooth_2.23-26 RColorBrewer_1.1-3
## [17] S7_0.2.0 lifecycle_1.0.4 compiler_4.5.1 farver_2.1.2
## [21] htmltools_0.5.8.1 class_7.3-23 sass_0.4.10 yaml_2.3.10
## [25] pillar_1.11.1 crayon_1.5.3 jquerylib_0.1.4 MASS_7.3-65
## [29] classInt_0.4-11 cachem_1.1.0 nlme_3.1-168 tidyselect_1.2.1
## [33] digest_0.6.37 stringi_1.8.7 labeling_0.4.3 splines_4.5.1
## [37] fastmap_1.2.0 grid_4.5.1 cli_3.6.5 magrittr_2.0.3
## [41] e1071_1.7-16 withr_3.0.2 scales_1.4.0 bit64_4.6.0-1
## [45] timechange_0.3.0 rmarkdown_2.29 httr_1.4.7 jpeg_0.1-11
## [49] bit_4.6.0 png_0.1-8 hms_1.1.3 evaluate_1.0.4
## [53] knitr_1.50 viridisLite_0.4.2 mgcv_1.9-3 rlang_1.1.6
## [57] isoband_0.2.7 Rcpp_1.1.0 glue_1.8.0 DBI_1.2.3
## [61] rstudioapi_0.17.1 vroom_1.6.6 jsonlite_2.0.0 plyr_1.8.9
## [65] R6_2.6.1 units_1.0-0