Loading the dataset

The Second data set is NYPD Arrest Data in 2023 from NYC database. https://data.cityofnewyork.us/Public-Safety/NYPD-Arrest-Data-Year-to-Date-/uip8-fykc

How’s the number of arrest disturb among race, age and sex? Is there a peak month?

library(tidyverse)
library(lubridate)

ny_arrest <- read_csv("https://raw.githubusercontent.com/tonyCUNY/tonyCUNY/main/NYPD_Arrest_Data__Year_to_Date__20231014.csv")

glimpse(ny_arrest)
## Rows: 112,571
## Columns: 19
## $ ARREST_KEY                 <dbl> 261249590, 261271301, 262235280, 264404265,…
## $ ARREST_DATE                <chr> "01/02/2023", "01/03/2023", "01/20/2023", "…
## $ PD_CD                      <dbl> 339, 105, 109, 101, 808, 101, 157, NA, 157,…
## $ PD_DESC                    <chr> "LARCENY,PETIT FROM OPEN AREAS,", "STRANGUL…
## $ KY_CD                      <dbl> 341, 106, 106, 344, 125, 344, 104, NA, 104,…
## $ OFNS_DESC                  <chr> "PETIT LARCENY", "FELONY ASSAULT", "FELONY …
## $ LAW_CODE                   <chr> "PL 1552500", "PL 1211200", "PL 1200501", "…
## $ LAW_CAT_CD                 <chr> "M", "F", "F", "M", "F", "M", "F", "F", "F"…
## $ ARREST_BORO                <chr> "M", "S", "K", "S", "K", "M", "S", "S", "S"…
## $ ARREST_PRECINCT            <dbl> 6, 120, 62, 120, 83, 6, 120, 120, 120, 110,…
## $ JURISDICTION_CODE          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AGE_GROUP                  <chr> "25-44", "25-44", "45-64", "<18", "25-44", …
## $ PERP_SEX                   <chr> "M", "M", "M", "F", "F", "M", "M", "M", "M"…
## $ PERP_RACE                  <chr> "BLACK", "WHITE", "WHITE", "BLACK", "WHITE"…
## $ X_COORD_CD                 <dbl> 982745, 962808, 983289, 958743, 1006097, 98…
## $ Y_COORD_CD                 <dbl> 206647, 174275, 158770, 173302, 195371, 208…
## $ Latitude                   <dbl> 40.73388, 40.64500, 40.60246, 40.64232, 40.…
## $ Longitude                  <dbl> -74.00543, -74.07726, -74.00346, -74.09191,…
## $ `New Georeferenced Column` <chr> "POINT (-74.005428 40.733876)", "POINT (-74…

Data cleaning and transformation

# drop all rows with NA value
# select column: ARREST_DATE, PD_DESC, AGE_GROUP, PERP_SEX, PERP_RACE, 
# create a new column and transform ARREST_DATE into month

ny_arrest2 <- ny_arrest |> 
                  drop_na() |> 
                  select(c(ARREST_DATE, PD_DESC, AGE_GROUP, PERP_SEX, PERP_RACE)) |>
                  mutate(MONTH = month(as.Date(ARREST_DATE, format = "%m/%d/%Y"))) |> 
                  relocate(MONTH, .after = ARREST_DATE)

head(ny_arrest2)
## # A tibble: 6 × 6
##   ARREST_DATE MONTH PD_DESC                        AGE_GROUP PERP_SEX PERP_RACE
##   <chr>       <dbl> <chr>                          <chr>     <chr>    <chr>    
## 1 01/02/2023      1 LARCENY,PETIT FROM OPEN AREAS, 25-44     M        BLACK    
## 2 01/03/2023      1 STRANGULATION 1ST              25-44     M        WHITE    
## 3 01/20/2023      1 ASSAULT 2,1,UNCLASSIFIED       45-64     M        WHITE    
## 4 03/01/2023      3 ASSAULT 3                      <18       F        BLACK    
## 5 03/02/2023      3 TAX LAW                        25-44     F        WHITE    
## 6 03/10/2023      3 ASSAULT 3                      45-64     M        WHITE
# 

Summary for number of arrest by month, sex

arrest_count <- ny_arrest2 |> 
                    group_by(PERP_RACE, AGE_GROUP, PERP_SEX) |> 
                    summarise(COUNT = n())
arrest_count
## # A tibble: 105 × 4
## # Groups:   PERP_RACE, AGE_GROUP [35]
##    PERP_RACE                      AGE_GROUP PERP_SEX COUNT
##    <chr>                          <chr>     <chr>    <int>
##  1 AMERICAN INDIAN/ALASKAN NATIVE 18-24     F            4
##  2 AMERICAN INDIAN/ALASKAN NATIVE 18-24     M           62
##  3 AMERICAN INDIAN/ALASKAN NATIVE 18-24     U            3
##  4 AMERICAN INDIAN/ALASKAN NATIVE 25-44     F           27
##  5 AMERICAN INDIAN/ALASKAN NATIVE 25-44     M          160
##  6 AMERICAN INDIAN/ALASKAN NATIVE 25-44     U            4
##  7 AMERICAN INDIAN/ALASKAN NATIVE 45-64     F            7
##  8 AMERICAN INDIAN/ALASKAN NATIVE 45-64     M           59
##  9 AMERICAN INDIAN/ALASKAN NATIVE 45-64     U            3
## 10 AMERICAN INDIAN/ALASKAN NATIVE 65+       F            2
## # ℹ 95 more rows
month_arrest_count <- ny_arrest2 |> 
                      group_by(MONTH, PERP_RACE, AGE_GROUP, PERP_SEX) |> 
                      summarise(COUNT = n())
month_arrest_count
## # A tibble: 518 × 5
## # Groups:   MONTH, PERP_RACE, AGE_GROUP [209]
##    MONTH PERP_RACE                      AGE_GROUP PERP_SEX COUNT
##    <dbl> <chr>                          <chr>     <chr>    <int>
##  1     1 AMERICAN INDIAN/ALASKAN NATIVE 18-24     F            1
##  2     1 AMERICAN INDIAN/ALASKAN NATIVE 18-24     M            8
##  3     1 AMERICAN INDIAN/ALASKAN NATIVE 25-44     F            4
##  4     1 AMERICAN INDIAN/ALASKAN NATIVE 25-44     M           33
##  5     1 AMERICAN INDIAN/ALASKAN NATIVE 45-64     F            2
##  6     1 AMERICAN INDIAN/ALASKAN NATIVE 45-64     M           14
##  7     1 AMERICAN INDIAN/ALASKAN NATIVE 65+       M            1
##  8     1 AMERICAN INDIAN/ALASKAN NATIVE <18       F            1
##  9     1 AMERICAN INDIAN/ALASKAN NATIVE <18       M            1
## 10     1 ASIAN / PACIFIC ISLANDER       18-24     F           30
## # ℹ 508 more rows
write.csv(month_arrest_count, "C:\\Users\\tonyl\\Desktop\\CUNY\\arrest_count.csv", row.names = FALSE)

# 

Visualize the data

No of arrest over different variable: PERP_RACE, AGE_GROUP, PERP_SEX AND MONTH 25-44 Age group has the highest number of arrest. Mostly male are being arrested. Black and white Hispanic have higher number of arrest. Each month has about the same number of arrest.

# No of arrest by PERP_RACE AND MONTH
ny_arrest2 |> 
      group_by(PERP_RACE, MONTH) |> 
      summarise(COUNT = n()) |>
      ggplot(aes(x = MONTH, y = COUNT, fill = PERP_RACE)) +
      geom_bar(stat = "identity", position = "dodge") +
      labs(title = "Number of Arrests by PERP_RACE and MONTH", x = "MONTH", y = "Number of Arrests", fill = "PERP_RACE")

# No of arrest by PERP_RACE AND AGE_GROUP
ny_arrest2 |> 
      group_by(AGE_GROUP, PERP_RACE) |> 
      summarise(COUNT = n()) |>
      ggplot(aes(x = reorder(PERP_RACE, COUNT), y = COUNT, fill = AGE_GROUP)) +
      geom_bar(stat = "identity") + 
      coord_flip() +
      labs(title = "Number of Arrests by PERP_RACE", x = "PERP_RACE", y = "Number of Arrests")

# No of arrest by AGE_GROUP AND PERP_SEX
ny_arrest2 |> 
      group_by(PERP_SEX, AGE_GROUP) |> 
      summarise(COUNT = n()) |>
      ggplot(aes(x = reorder(AGE_GROUP, COUNT), y = COUNT, fill = PERP_SEX)) +
      geom_bar(stat = "identity") + 
      coord_flip() +
      labs(title = "Number of Arrests by AGE_GROUP", x = "AGE_GROUP", y = "Number of Arrests")