# Start Session
# Remove every object that ls() reports in the current environment so the
# script starts with an empty workspace. Attached packages and options are
# not touched by rm().
rm(list = ls())
# Trigger a garbage collection; when run at top level the memory-usage table
# returned by gc() is printed.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 544134 29.1 1211438 64.7 686460 36.7
## Vcells 992501 7.6 8388608 64.0 1876787 14.4
# Load Packages
library(readxl)
library(ggplot2)
library(dplyr)
library(sf)
library(sp)
library(tidyr)
library(tidyverse)
library(descr)
library(leaflet)
library(ggthemes)
library(writexl)
library(readr)
library(haven)
library(leaflet)
library(knitr)
library(car) # For VIF
library(MASS) # For AIC and BIC
library(lmtest) # For Likelihood Ratio Test
library(clarify)
This week, I will explore the NYPD Arrests Data (Historic) dataset to identify trends, analyze key variables, and assess how it can inform potential research directions.
The NYPD Arrests Data (Historic) dataset provides a comprehensive record of arrests made by the New York City Police Department (NYPD) from 2006 to the present. It includes detailed information on each arrest, such as the date, time, location, arresting officer’s command, and demographic details of the individual arrested. The dataset also classifies offenses based on New York State Penal Law, distinguishing between felonies, misdemeanors, and violations.
This dataset is maintained by the NYPD and updated regularly to enhance transparency and support public safety research, policymaking, and data-driven decision-making. It serves as a valuable resource for understanding arrest patterns, crime trends, and law enforcement practices within New York City. However, it is important to note that the dataset does not include sealed records, juvenile arrests, or personally identifiable information to comply with legal and privacy regulations.
Because the full dataset is extremely large (over 5 million rows), I restrict the analysis to arrests made from 2012 through 2021.
# Load the exported arrests CSV into a base data.frame
# NOTE(review): the str() output reports 1,048,575 rows, which equals Excel's
# worksheet row limit minus one header row -- the CSV may have been truncated
# by a spreadsheet export; confirm against the original download.
data <- read.csv(file = "NYPD_Arrests_Data__Historic__20250302.csv")
# Inspect the column types and the leading values of every column
str(object = data)
## 'data.frame': 1048575 obs. of 19 variables:
## $ ARREST_KEY : int 205590989 202546976 203563701 205169609 204097687 205785615 206848760 203457591 203288011 203999100 ...
## $ ARREST_DATE : chr "11/24/2019" "9/17/2019" "10/10/2019" "11/14/2019" ...
## $ PD_CD : int 339 793 494 397 105 259 101 259 922 922 ...
## $ PD_DESC : chr "LARCENY,PETIT FROM OPEN AREAS," "WEAPONS POSSESSION 3" "STOLEN PROPERTY 2,1,POSSESSION" "ROBBERY,OPEN AREA UNCLASSIFIED" ...
## $ KY_CD : int 341 118 111 105 106 351 344 351 348 348 ...
## $ OFNS_DESC : chr "PETIT LARCENY" "DANGEROUS WEAPONS" "POSSESSION OF STOLEN PROPERTY" "ROBBERY" ...
## $ LAW_CODE : chr "PL 1552500" "PL 2650201" "PL 1654502" "PL 1601001" ...
## $ LAW_CAT_CD : chr "M" "F" "F" "F" ...
## $ ARREST_BORO : chr "M" "M" "K" "B" ...
## $ ARREST_PRECINCT : int 28 6 90 43 114 44 72 40 61 122 ...
## $ JURISDICTION_CODE: int 0 0 0 0 0 0 0 0 0 4 ...
## $ AGE_GROUP : chr "25-44" "45-64" "18-24" "18-24" ...
## $ PERP_SEX : chr "F" "M" "M" "M" ...
## $ PERP_RACE : chr "WHITE" "BLACK" "BLACK HISPANIC" "BLACK HISPANIC" ...
## $ X_COORD_CD : int 997571 982746 1000830 1020183 1007694 1008355 979428 1007373 995029 966300 ...
## $ Y_COORD_CD : int 234556 206647 197889 239283 219657 240738 174364 232752 157260 157946 ...
## $ Latitude : num 40.8 40.7 40.7 40.8 40.8 ...
## $ Longitude : num -74 -74 -73.9 -73.9 -73.9 ...
## $ Lon_Lat : chr "POINT (-73.95188015099995 40.81047604300005)" "POINT (-74.00542831099995 40.73388303300004)" "POINT (-73.94019763899996 40.70982902900005)" "POINT (-73.87017044999999 40.823387291000074)" ...
# Show the first six rows of the raw import
head(data, n = 6L)
## ARREST_KEY ARREST_DATE PD_CD PD_DESC KY_CD
## 1 205590989 11/24/2019 339 LARCENY,PETIT FROM OPEN AREAS, 341
## 2 202546976 9/17/2019 793 WEAPONS POSSESSION 3 118
## 3 203563701 10/10/2019 494 STOLEN PROPERTY 2,1,POSSESSION 111
## 4 205169609 11/14/2019 397 ROBBERY,OPEN AREA UNCLASSIFIED 105
## 5 204097687 10/23/2019 105 STRANGULATION 1ST 106
## 6 205785615 11/29/2019 259 CRIMINAL MISCHIEF,UNCLASSIFIED 4 351
## OFNS_DESC LAW_CODE LAW_CAT_CD ARREST_BORO
## 1 PETIT LARCENY PL 1552500 M M
## 2 DANGEROUS WEAPONS PL 2650201 F M
## 3 POSSESSION OF STOLEN PROPERTY PL 1654502 F K
## 4 ROBBERY PL 1601001 F B
## 5 FELONY ASSAULT PL 1211300 F Q
## 6 CRIMINAL MISCHIEF & RELATED OF PL 1450001 M B
## ARREST_PRECINCT JURISDICTION_CODE AGE_GROUP PERP_SEX PERP_RACE
## 1 28 0 25-44 F WHITE
## 2 6 0 45-64 M BLACK
## 3 90 0 18-24 M BLACK HISPANIC
## 4 43 0 18-24 M BLACK HISPANIC
## 5 114 0 25-44 M BLACK
## 6 44 0 45-64 M BLACK
## X_COORD_CD Y_COORD_CD Latitude Longitude
## 1 997571 234556 40.81048 -73.95188
## 2 982746 206647 40.73388 -74.00543
## 3 1000830 197889 40.70983 -73.94020
## 4 1020183 239283 40.82339 -73.87017
## 5 1007694 219657 40.76956 -73.91536
## 6 1008355 240738 40.82742 -73.91290
## Lon_Lat
## 1 POINT (-73.95188015099995 40.81047604300005)
## 2 POINT (-74.00542831099995 40.73388303300004)
## 3 POINT (-73.94019763899996 40.70982902900005)
## 4 POINT (-73.87017044999999 40.823387291000074)
## 5 POINT (-73.91536344699995 40.769561242000066)
## 6 POINT (-73.91290142199995 40.82742104700003)
# Per-column summary: ranges and NA counts for numeric columns, length and
# class for character columns -- used to spot missing or unusual values
summary(object = data)
## ARREST_KEY ARREST_DATE PD_CD PD_DESC
## Min. : 9926901 Length:1048575 Min. : 0.0 Length:1048575
## 1st Qu.:159969152 Class :character 1st Qu.:209.0 Class :character
## Median :172601981 Mode :character Median :478.0 Mode :character
## Mean :147716885 Mean :471.6
## 3rd Qu.:202720638 3rd Qu.:729.0
## Max. :238513928 Max. :997.0
## NA's :154
## KY_CD OFNS_DESC LAW_CODE LAW_CAT_CD
## Min. :101 Length:1048575 Length:1048575 Length:1048575
## 1st Qu.:117 Class :character Class :character Class :character
## Median :341 Mode :character Mode :character Mode :character
## Mean :275
## 3rd Qu.:347
## Max. :995
## NA's :3157
## ARREST_BORO ARREST_PRECINCT JURISDICTION_CODE AGE_GROUP
## Length:1048575 Min. : 1.00 Min. : 0.000 Length:1048575
## Class :character 1st Qu.: 34.00 1st Qu.: 0.000 Class :character
## Mode :character Median : 61.00 Median : 0.000 Mode :character
## Mean : 61.73 Mean : 1.808
## 3rd Qu.: 90.00 3rd Qu.: 0.000
## Max. :123.00 Max. :97.000
## NA's :6
## PERP_SEX PERP_RACE X_COORD_CD Y_COORD_CD
## Length:1048575 Length:1048575 Min. : 913512 Min. : 121131
## Class :character Class :character 1st Qu.: 991394 1st Qu.: 186190
## Mode :character Mode :character Median :1004363 Median : 207813
## Mean :1005111 Mean : 208846
## 3rd Qu.:1016751 3rd Qu.: 236103
## Max. :1067298 Max. :7250292
##
## Latitude Longitude Lon_Lat
## Min. :40.50 Min. :-74.25 Length:1048575
## 1st Qu.:40.68 1st Qu.:-73.97 Class :character
## Median :40.74 Median :-73.93 Mode :character
## Mean :40.74 Mean :-73.92
## 3rd Qu.:40.81 3rd Qu.:-73.88
## Max. :59.66 Max. :-73.68
##
# List the column names of the raw data frame
names(data)
## [1] "ARREST_KEY" "ARREST_DATE" "PD_CD"
## [4] "PD_DESC" "KY_CD" "OFNS_DESC"
## [7] "LAW_CODE" "LAW_CAT_CD" "ARREST_BORO"
## [10] "ARREST_PRECINCT" "JURISDICTION_CODE" "AGE_GROUP"
## [13] "PERP_SEX" "PERP_RACE" "X_COORD_CD"
## [16] "Y_COORD_CD" "Latitude" "Longitude"
## [19] "Lon_Lat"
# Work on a copy so the raw import stays available if a recode goes wrong
data2 <- data
# Preview the first six rows of the working copy
head(data2, n = 6L)
## ARREST_KEY ARREST_DATE PD_CD PD_DESC KY_CD
## 1 205590989 11/24/2019 339 LARCENY,PETIT FROM OPEN AREAS, 341
## 2 202546976 9/17/2019 793 WEAPONS POSSESSION 3 118
## 3 203563701 10/10/2019 494 STOLEN PROPERTY 2,1,POSSESSION 111
## 4 205169609 11/14/2019 397 ROBBERY,OPEN AREA UNCLASSIFIED 105
## 5 204097687 10/23/2019 105 STRANGULATION 1ST 106
## 6 205785615 11/29/2019 259 CRIMINAL MISCHIEF,UNCLASSIFIED 4 351
## OFNS_DESC LAW_CODE LAW_CAT_CD ARREST_BORO
## 1 PETIT LARCENY PL 1552500 M M
## 2 DANGEROUS WEAPONS PL 2650201 F M
## 3 POSSESSION OF STOLEN PROPERTY PL 1654502 F K
## 4 ROBBERY PL 1601001 F B
## 5 FELONY ASSAULT PL 1211300 F Q
## 6 CRIMINAL MISCHIEF & RELATED OF PL 1450001 M B
## ARREST_PRECINCT JURISDICTION_CODE AGE_GROUP PERP_SEX PERP_RACE
## 1 28 0 25-44 F WHITE
## 2 6 0 45-64 M BLACK
## 3 90 0 18-24 M BLACK HISPANIC
## 4 43 0 18-24 M BLACK HISPANIC
## 5 114 0 25-44 M BLACK
## 6 44 0 45-64 M BLACK
## X_COORD_CD Y_COORD_CD Latitude Longitude
## 1 997571 234556 40.81048 -73.95188
## 2 982746 206647 40.73388 -74.00543
## 3 1000830 197889 40.70983 -73.94020
## 4 1020183 239283 40.82339 -73.87017
## 5 1007694 219657 40.76956 -73.91536
## 6 1008355 240738 40.82742 -73.91290
## Lon_Lat
## 1 POINT (-73.95188015099995 40.81047604300005)
## 2 POINT (-74.00542831099995 40.73388303300004)
## 3 POINT (-73.94019763899996 40.70982902900005)
## 4 POINT (-73.87017044999999 40.823387291000074)
## 5 POINT (-73.91536344699995 40.769561242000066)
## 6 POINT (-73.91290142199995 40.82742104700003)
# "ARREST_KEY" is the key generated for each arrest, so count how many of its
# values repeat an earlier one
sum(duplicated(data2[["ARREST_KEY"]]))
## [1] 0
# Number of rows whose "ARREST_KEY" is NA
sum(is.na(data2[["ARREST_KEY"]]))
## [1] 0
# Converting the "ARREST_DATE" column to a Date format
# The text is month/day/year (e.g. "11/24/2019"); values that do not match
# the format become NA.
data2$ARREST_DATE <- as.Date(data2$ARREST_DATE, format = "%m/%d/%Y")
# Extracting the Year for filtering
data2$YEAR <- as.numeric(format(data2$ARREST_DATE, "%Y"))
# Filtering for Years 2012-2021
# An NA in a logical row index makes `[` return a row filled entirely with NA,
# so rows whose year could not be derived would survive as phantom records.
# which() keeps only the positions where the condition is TRUE.
data2 <- data2[which(data2$YEAR >= 2012 & data2$YEAR <= 2021), ]
# Verifying the filtering to see if I extracted the correct years
unique(data2$YEAR)
## [1] 2019 2017 2021 2020 2016 2018 2014 2013 2012 2015
# Expand the single-letter "LAW_CAT_CD" codes into full offense-level names.
# Values that are not in the lookup (for example blank strings) are kept
# exactly as they are.
data2$LAW_CAT_CD <- local({
  level_names <- c(
    M = "Misdemeanor",
    F = "Felony",
    V = "Violation",
    I = "Infraction"
  )
  codes <- data2$LAW_CAT_CD
  ifelse(codes %in% names(level_names), unname(level_names[codes]), codes)
})
# Verifying the changes
table(data2$LAW_CAT_CD) # View counts of each category
##
## Felony Infraction Misdemeanor Violation
## 5521 293284 2192 484584 9954
unique(data2[["LAW_CAT_CD"]]) # Check unique values to confirm replacements
## [1] "Misdemeanor" "Felony" "" "Violation" "Infraction"
# Expand the single-letter "ARREST_BORO" codes into full borough names.
# Values that are not in the lookup are kept exactly as they are.
data2$ARREST_BORO <- local({
  borough_names <- c(
    M = "Manhattan",
    B = "Bronx",
    S = "Staten Island",
    K = "Brooklyn",
    Q = "Queens"
  )
  codes <- data2$ARREST_BORO
  ifelse(codes %in% names(borough_names), unname(borough_names[codes]), codes)
})
# Verifying the changes
table(data2$ARREST_BORO)
##
## Bronx Brooklyn Manhattan Queens Staten Island
## 176634 218436 205838 161101 33526
# Distinct borough labels remaining after the recode
unique(data2[["ARREST_BORO"]])
## [1] "Manhattan" "Brooklyn" "Bronx" "Queens"
## [5] "Staten Island"
# Spell out the "PERP_SEX" codes: "M" becomes "Male" and "F" becomes "Female".
# Any other value is kept exactly as it is.
data2$PERP_SEX <- local({
  sex_names <- c(M = "Male", F = "Female")
  codes <- data2$PERP_SEX
  ifelse(codes %in% names(sex_names), unname(sex_names[codes]), codes)
})
# Verifying the changes
table(data2$PERP_SEX)
##
## Female Male
## 139674 655861
# Distinct sex labels remaining after the recode
unique(data2[["PERP_SEX"]])
## [1] "Female" "Male"
# Drop the code and coordinate columns listed below; every other column is
# kept in its original order
data2 <- data2[, is.na(match(
  names(data2),
  c("PD_CD", "KY_CD", "LAW_CODE", "X_COORD_CD", "Y_COORD_CD", "Lon_Lat")
))]
# Verifying the changes
names(data2)
## [1] "ARREST_KEY" "ARREST_DATE" "PD_DESC"
## [4] "OFNS_DESC" "LAW_CAT_CD" "ARREST_BORO"
## [7] "ARREST_PRECINCT" "JURISDICTION_CODE" "AGE_GROUP"
## [10] "PERP_SEX" "PERP_RACE" "Latitude"
## [13] "Longitude" "YEAR"
# Write the cleaned "data2" to CSV (without row names) so later analysis
# sessions in R Studio can load the smaller file faster
write.csv(
  x = data2,
  file = "NYPD_Arrests_Cleaned.csv",
  row.names = FALSE
)