The Second data set is NYPD Arrest Data in 2023 from NYC database. https://data.cityofnewyork.us/Public-Safety/NYPD-Arrest-Data-Year-to-Date-/uip8-fykc
How’s the number of arrest disturb among race, age and sex? Is there a peak month?
library(tidyverse)
library(lubridate)
ny_arrest <- read_csv("https://raw.githubusercontent.com/tonyCUNY/tonyCUNY/main/NYPD_Arrest_Data__Year_to_Date__20231014.csv")
glimpse(ny_arrest)
## Rows: 112,571
## Columns: 19
## $ ARREST_KEY <dbl> 261249590, 261271301, 262235280, 264404265,…
## $ ARREST_DATE <chr> "01/02/2023", "01/03/2023", "01/20/2023", "…
## $ PD_CD <dbl> 339, 105, 109, 101, 808, 101, 157, NA, 157,…
## $ PD_DESC <chr> "LARCENY,PETIT FROM OPEN AREAS,", "STRANGUL…
## $ KY_CD <dbl> 341, 106, 106, 344, 125, 344, 104, NA, 104,…
## $ OFNS_DESC <chr> "PETIT LARCENY", "FELONY ASSAULT", "FELONY …
## $ LAW_CODE <chr> "PL 1552500", "PL 1211200", "PL 1200501", "…
## $ LAW_CAT_CD <chr> "M", "F", "F", "M", "F", "M", "F", "F", "F"…
## $ ARREST_BORO <chr> "M", "S", "K", "S", "K", "M", "S", "S", "S"…
## $ ARREST_PRECINCT <dbl> 6, 120, 62, 120, 83, 6, 120, 120, 120, 110,…
## $ JURISDICTION_CODE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AGE_GROUP <chr> "25-44", "25-44", "45-64", "<18", "25-44", …
## $ PERP_SEX <chr> "M", "M", "M", "F", "F", "M", "M", "M", "M"…
## $ PERP_RACE <chr> "BLACK", "WHITE", "WHITE", "BLACK", "WHITE"…
## $ X_COORD_CD <dbl> 982745, 962808, 983289, 958743, 1006097, 98…
## $ Y_COORD_CD <dbl> 206647, 174275, 158770, 173302, 195371, 208…
## $ Latitude <dbl> 40.73388, 40.64500, 40.60246, 40.64232, 40.…
## $ Longitude <dbl> -74.00543, -74.07726, -74.00346, -74.09191,…
## $ `New Georeferenced Column` <chr> "POINT (-74.005428 40.733876)", "POINT (-74…
# drop all rows with NA value
# select column: ARREST_DATE, PD_DESC, AGE_GROUP, PERP_SEX, PERP_RACE,
# create a new column and transform ARREST_DATE into month
ny_arrest2 <- ny_arrest |>
drop_na() |>
select(c(ARREST_DATE, PD_DESC, AGE_GROUP, PERP_SEX, PERP_RACE)) |>
mutate(MONTH = month(as.Date(ARREST_DATE, format = "%m/%d/%Y"))) |>
relocate(MONTH, .after = ARREST_DATE)
head(ny_arrest2)
## # A tibble: 6 × 6
## ARREST_DATE MONTH PD_DESC AGE_GROUP PERP_SEX PERP_RACE
## <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 01/02/2023 1 LARCENY,PETIT FROM OPEN AREAS, 25-44 M BLACK
## 2 01/03/2023 1 STRANGULATION 1ST 25-44 M WHITE
## 3 01/20/2023 1 ASSAULT 2,1,UNCLASSIFIED 45-64 M WHITE
## 4 03/01/2023 3 ASSAULT 3 <18 F BLACK
## 5 03/02/2023 3 TAX LAW 25-44 F WHITE
## 6 03/10/2023 3 ASSAULT 3 45-64 M WHITE
#
arrest_count <- ny_arrest2 |>
group_by(PERP_RACE, AGE_GROUP, PERP_SEX) |>
summarise(COUNT = n())
arrest_count
## # A tibble: 105 × 4
## # Groups: PERP_RACE, AGE_GROUP [35]
## PERP_RACE AGE_GROUP PERP_SEX COUNT
## <chr> <chr> <chr> <int>
## 1 AMERICAN INDIAN/ALASKAN NATIVE 18-24 F 4
## 2 AMERICAN INDIAN/ALASKAN NATIVE 18-24 M 62
## 3 AMERICAN INDIAN/ALASKAN NATIVE 18-24 U 3
## 4 AMERICAN INDIAN/ALASKAN NATIVE 25-44 F 27
## 5 AMERICAN INDIAN/ALASKAN NATIVE 25-44 M 160
## 6 AMERICAN INDIAN/ALASKAN NATIVE 25-44 U 4
## 7 AMERICAN INDIAN/ALASKAN NATIVE 45-64 F 7
## 8 AMERICAN INDIAN/ALASKAN NATIVE 45-64 M 59
## 9 AMERICAN INDIAN/ALASKAN NATIVE 45-64 U 3
## 10 AMERICAN INDIAN/ALASKAN NATIVE 65+ F 2
## # ℹ 95 more rows
month_arrest_count <- ny_arrest2 |>
group_by(MONTH, PERP_RACE, AGE_GROUP, PERP_SEX) |>
summarise(COUNT = n())
month_arrest_count
## # A tibble: 518 × 5
## # Groups: MONTH, PERP_RACE, AGE_GROUP [209]
## MONTH PERP_RACE AGE_GROUP PERP_SEX COUNT
## <dbl> <chr> <chr> <chr> <int>
## 1 1 AMERICAN INDIAN/ALASKAN NATIVE 18-24 F 1
## 2 1 AMERICAN INDIAN/ALASKAN NATIVE 18-24 M 8
## 3 1 AMERICAN INDIAN/ALASKAN NATIVE 25-44 F 4
## 4 1 AMERICAN INDIAN/ALASKAN NATIVE 25-44 M 33
## 5 1 AMERICAN INDIAN/ALASKAN NATIVE 45-64 F 2
## 6 1 AMERICAN INDIAN/ALASKAN NATIVE 45-64 M 14
## 7 1 AMERICAN INDIAN/ALASKAN NATIVE 65+ M 1
## 8 1 AMERICAN INDIAN/ALASKAN NATIVE <18 F 1
## 9 1 AMERICAN INDIAN/ALASKAN NATIVE <18 M 1
## 10 1 ASIAN / PACIFIC ISLANDER 18-24 F 30
## # ℹ 508 more rows
write.csv(month_arrest_count, "C:\\Users\\tonyl\\Desktop\\CUNY\\arrest_count.csv", row.names = FALSE)
#
No of arrest over different variable: PERP_RACE, AGE_GROUP, PERP_SEX AND MONTH 25-44 Age group has the highest number of arrest. Mostly male are being arrested. Black and white Hispanic have higher number of arrest. Each month has about the same number of arrest.
# No of arrest by PERP_RACE AND MONTH
ny_arrest2 |>
group_by(PERP_RACE, MONTH) |>
summarise(COUNT = n()) |>
ggplot(aes(x = MONTH, y = COUNT, fill = PERP_RACE)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Number of Arrests by PERP_RACE and MONTH", x = "MONTH", y = "Number of Arrests", fill = "PERP_RACE")
# No of arrest by PERP_RACE AND AGE_GROUP
ny_arrest2 |>
group_by(AGE_GROUP, PERP_RACE) |>
summarise(COUNT = n()) |>
ggplot(aes(x = reorder(PERP_RACE, COUNT), y = COUNT, fill = AGE_GROUP)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Number of Arrests by PERP_RACE", x = "PERP_RACE", y = "Number of Arrests")
# No of arrest by AGE_GROUP AND PERP_SEX
ny_arrest2 |>
group_by(PERP_SEX, AGE_GROUP) |>
summarise(COUNT = n()) |>
ggplot(aes(x = reorder(AGE_GROUP, COUNT), y = COUNT, fill = PERP_SEX)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Number of Arrests by AGE_GROUP", x = "AGE_GROUP", y = "Number of Arrests")