The city of New York has long been a subject of fascination and scrutiny for its high crime rates. With the rise of technology and data-driven analysis, it has become easier to understand the patterns and trends in crime complaints across the five boroughs. This study aims to explore the relationship between crime rates and the boroughs of New York City from 2017 to 2023, with a specific focus on the types of crime reported, and to identify whether there is a statistically significant difference in the number of complaints among different boroughs.
By analyzing and interpreting crime data from various sources, this research aims to shed light on the complex dynamics of crime in the city and inform policy decisions to improve public safety.
library(tidyverse)
library(lubridate)
library(knitr)
library(infer)

# Load the NYPD complaint export from the working directory and preview the
# first five records.
NYPD_Data <- read.csv("NYPD_Data.csv")
head(NYPD_Data, 5)
## CMPLNT_NUM ADDR_PCT_CD BORO_NM CMPLNT_FR_DT CMPLNT_FR_TM CMPLNT_TO_DT
## 1 246893603 44 BRONX 06/20/2022 00:00:00
## 2 247202657 49 BRONX 06/27/2022 05:15:00
## 3 247759365 44 BRONX 07/08/2022 20:07:00
## 4 248104661H1 44 BRONX 07/15/2022 23:50:00
## 5 246220488 123 STATEN ISLAND 06/06/2020 18:04:00 06/06/2022
## CMPLNT_TO_TM CRM_ATPT_CPTD_CD HADEVELOPT HOUSING_PSA JURISDICTION_CODE
## 1 (null) COMPLETED (null) NA 0
## 2 (null) COMPLETED (null) NA 0
## 3 (null) COMPLETED (null) NA 0
## 4 (null) COMPLETED (null) NA 0
## 5 18:15:00 COMPLETED (null) NA 0
## JURIS_DESC KY_CD LAW_CAT_CD LOC_OF_OCCUR_DESC
## 1 N.Y. POLICE DEPT 105 FELONY FRONT OF
## 2 N.Y. POLICE DEPT 105 FELONY FRONT OF
## 3 N.Y. POLICE DEPT 113 FELONY (null)
## 4 N.Y. POLICE DEPT 101 FELONY OUTSIDE
## 5 N.Y. POLICE DEPT 578 VIOLATION INSIDE
## OFNS_DESC PARKS_NM PATROL_BORO PD_CD
## 1 ROBBERY (null) PATROL BORO BRONX 389
## 2 ROBBERY (null) PATROL BORO BRONX 380
## 3 FORGERY (null) PATROL BORO BRONX 725
## 4 MURDER & NON-NEGL. MANSLAUGHTER (null) PATROL BORO BRONX NA
## 5 HARRASSMENT 2 (null) PATROL BORO STATEN ISLAND 637
## PD_DESC PREM_TYP_DESC RPT_DT STATION_NAME
## 1 ROBBERY,DWELLING STREET 06/20/2022 (null)
## 2 ROBBERY,CAR JACKING CHURCH 06/27/2022 (null)
## 3 FORGERY,M.V. REGISTRATION STREET 07/08/2022 (null)
## 4 (null) (null) 07/15/2022 (null)
## 5 HARASSMENT,SUBD 1,CIVILIAN RESIDENCE-HOUSE 06/06/2022 (null)
## SUSP_AGE_GROUP SUSP_RACE SUSP_SEX TRANSIT_DISTRICT VIC_AGE_GROUP VIC_RACE
## 1 UNKNOWN UNKNOWN U NA 25-44 BLACK
## 2 UNKNOWN BLACK M NA 45-64 WHITE
## 3 25-44 BLACK M NA UNKNOWN UNKNOWN
## 4 25-44 BLACK M NA 25-44 BLACK
## 5 25-44 UNKNOWN M NA 25-44 WHITE
## VIC_SEX X_COORD_CD Y_COORD_CD Latitude Longitude Lat_Lon
## 1 F 1007951 241445 40.82935 -73.91436 (40.829355, -73.914355)
## 2 M 1026285 246465 40.84307 -73.84807 (40.843067, -73.848075)
## 3 E 1008526 244666 40.83819 -73.91227 (40.83819352, -73.91226789)
## 4 M 1007600 240213 40.82597 -73.91563 (40.82597381, -73.9156299)
## 5 F 915395 125939 40.51208 -74.24762 (40.512082, -74.247623)
## New.Georeferenced.Column
## 1 POINT (-73.914355 40.829355)
## 2 POINT (-73.848075 40.843067)
## 3 POINT (-73.91226789 40.83819352)
## 4 POINT (-73.9156299 40.82597381)
## 5 POINT (-74.247623 40.512082)
# Compact overview of the data frame: one line per column with its type and
# the first few values.
glimpse(NYPD_Data)
## Rows: 531,768
## Columns: 36
## $ CMPLNT_NUM <chr> "246893603", "247202657", "247759365", "24810…
## $ ADDR_PCT_CD <int> 44, 49, 44, 44, 123, 5, 84, 43, 75, 14, 111, …
## $ BORO_NM <chr> "BRONX", "BRONX", "BRONX", "BRONX", "STATEN I…
## $ CMPLNT_FR_DT <chr> "06/20/2022", "06/27/2022", "07/08/2022", "07…
## $ CMPLNT_FR_TM <chr> "00:00:00", "05:15:00", "20:07:00", "23:50:00…
## $ CMPLNT_TO_DT <chr> "", "", "", "", "06/06/2022", "02/03/2022", "…
## $ CMPLNT_TO_TM <chr> "(null)", "(null)", "(null)", "(null)", "18:1…
## $ CRM_ATPT_CPTD_CD <chr> "COMPLETED", "COMPLETED", "COMPLETED", "COMPL…
## $ HADEVELOPT <chr> "(null)", "(null)", "(null)", "(null)", "(nul…
## $ HOUSING_PSA <int> NA, NA, NA, NA, NA, NA, 463, NA, NA, NA, NA, …
## $ JURISDICTION_CODE <int> 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ JURIS_DESC <chr> "N.Y. POLICE DEPT", "N.Y. POLICE DEPT", "N.Y.…
## $ KY_CD <int> 105, 105, 113, 101, 578, 361, 233, 233, 116, …
## $ LAW_CAT_CD <chr> "FELONY", "FELONY", "FELONY", "FELONY", "VIOL…
## $ LOC_OF_OCCUR_DESC <chr> "FRONT OF", "FRONT OF", "(null)", "OUTSIDE", …
## $ OFNS_DESC <chr> "ROBBERY", "ROBBERY", "FORGERY", "MURDER & NO…
## $ PARKS_NM <chr> "(null)", "(null)", "(null)", "(null)", "(nul…
## $ PATROL_BORO <chr> "PATROL BORO BRONX", "PATROL BORO BRONX", "PA…
## $ PD_CD <int> 389, 380, 725, NA, 637, 639, 175, 175, 177, 4…
## $ PD_DESC <chr> "ROBBERY,DWELLING", "ROBBERY,CAR JACKING", "F…
## $ PREM_TYP_DESC <chr> "STREET", "CHURCH", "STREET", "(null)", "RESI…
## $ RPT_DT <chr> "06/20/2022", "06/27/2022", "07/08/2022", "07…
## $ STATION_NAME <chr> "(null)", "(null)", "(null)", "(null)", "(nul…
## $ SUSP_AGE_GROUP <chr> "UNKNOWN", "UNKNOWN", "25-44", "25-44", "25-4…
## $ SUSP_RACE <chr> "UNKNOWN", "BLACK", "BLACK", "BLACK", "UNKNOW…
## $ SUSP_SEX <chr> "U", "M", "M", "M", "M", "M", "(null)", "M", …
## $ TRANSIT_DISTRICT <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ VIC_AGE_GROUP <chr> "25-44", "45-64", "UNKNOWN", "25-44", "25-44"…
## $ VIC_RACE <chr> "BLACK", "WHITE", "UNKNOWN", "BLACK", "WHITE"…
## $ VIC_SEX <chr> "F", "M", "E", "M", "F", "F", "F", "F", "F", …
## $ X_COORD_CD <int> 1007951, 1026285, 1008526, 1007600, 915395, 9…
## $ Y_COORD_CD <int> 241445, 246465, 244666, 240213, 125939, 20129…
## $ Latitude <dbl> 40.82935, 40.84307, 40.83819, 40.82597, 40.51…
## $ Longitude <dbl> -73.91436, -73.84807, -73.91227, -73.91563, -…
## $ Lat_Lon <chr> "(40.829355, -73.914355)", "(40.843067, -73.8…
## $ New.Georeferenced.Column <chr> "POINT (-73.914355 40.829355)", "POINT (-73.8…
# Per-column summary: quantiles and NA counts for numeric columns,
# length/class/mode for character columns.
summary(NYPD_Data)
## CMPLNT_NUM ADDR_PCT_CD BORO_NM CMPLNT_FR_DT
## Length:531768 Min. : 1.00 Length:531768 Length:531768
## Class :character 1st Qu.: 34.00 Class :character Class :character
## Mode :character Median : 63.00 Mode :character Mode :character
## Mean : 63.13
## 3rd Qu.:101.00
## Max. :123.00
## NA's :39
## CMPLNT_FR_TM CMPLNT_TO_DT CMPLNT_TO_TM CRM_ATPT_CPTD_CD
## Length:531768 Length:531768 Length:531768 Length:531768
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HADEVELOPT HOUSING_PSA JURISDICTION_CODE JURIS_DESC
## Length:531768 Min. : 218 Min. : 0.0000 Length:531768
## Class :character 1st Qu.: 477 1st Qu.: 0.0000 Class :character
## Mode :character Median : 696 Median : 0.0000 Mode :character
## Mean : 6799 Mean : 0.4613
## 3rd Qu.: 1251 3rd Qu.: 0.0000
## Max. :72405 Max. :97.0000
## NA's :497825
## KY_CD LAW_CAT_CD LOC_OF_OCCUR_DESC OFNS_DESC
## Min. :101.0 Length:531768 Length:531768 Length:531768
## 1st Qu.:113.0 Class :character Class :character Class :character
## Median :341.0 Mode :character Mode :character Mode :character
## Mean :300.9
## 3rd Qu.:351.0
## Max. :678.0
##
## PARKS_NM PATROL_BORO PD_CD PD_DESC
## Length:531768 Length:531768 Min. :100.0 Length:531768
## Class :character Class :character 1st Qu.:256.0 Class :character
## Mode :character Mode :character Median :352.0 Mode :character
## Mean :403.5
## 3rd Qu.:637.0
## Max. :922.0
## NA's :578
## PREM_TYP_DESC RPT_DT STATION_NAME SUSP_AGE_GROUP
## Length:531768 Length:531768 Length:531768 Length:531768
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## SUSP_RACE SUSP_SEX TRANSIT_DISTRICT VIC_AGE_GROUP
## Length:531768 Length:531768 Min. : 1.0 Length:531768
## Class :character Class :character 1st Qu.: 3.0 Class :character
## Mode :character Mode :character Median :11.0 Mode :character
## Mean :14.8
## 3rd Qu.:30.0
## Max. :34.0
## NA's :520479
## VIC_RACE VIC_SEX X_COORD_CD Y_COORD_CD
## Length:531768 Length:531768 Min. : 913411 Min. :121312
## Class :character Class :character 1st Qu.: 991330 1st Qu.:185702
## Mode :character Mode :character Median :1004395 Median :207065
## Mean :1005187 Mean :207765
## 3rd Qu.:1017411 3rd Qu.:234516
## Max. :1067306 Max. :271819
## NA's :9 NA's :9
## Latitude Longitude Lat_Lon New.Georeferenced.Column
## Min. :40.50 Min. :-74.25 Length:531768 Length:531768
## 1st Qu.:40.68 1st Qu.:-73.97 Class :character Class :character
## Median :40.73 Median :-73.93 Mode :character Mode :character
## Mean :40.74 Mean :-73.92
## 3rd Qu.:40.81 3rd Qu.:-73.88
## Max. :40.91 Max. :-73.70
## NA's :9 NA's :9
# Tag every record with a unit count so that complaint totals can be obtained
# with sum(); Totalcomplaint is the grand total used for percentages.
NYPD_Data$num_complaint <- 1
Totalcomplaint <- sum(NYPD_Data$num_complaint)

# Yearly complaint totals with share of all complaints and rough monthly/daily
# averages (total / 12 and total / 365).
# The year is taken from CMPLNT_FR_DT (start of the occurrence). The previous
# version used CMPLNT_TO_DT, which is empty for many records (see the glimpse()
# output) and therefore pushed thousands of complaints into an NA year.
# NOTE(review): the rendered table that follows this chunk was produced with
# CMPLNT_TO_DT and must be regenerated.
yearlycomplaintdata <- NYPD_Data %>%
  mutate(Year = lubridate::year(lubridate::mdy(CMPLNT_FR_DT))) %>%
  group_by(Year) %>%
  summarise(`No. complaint` = sum(num_complaint),
            Percentage = paste0(round((`No. complaint` / Totalcomplaint) * 100, 2), " %"),
            `Monthly Average` = round(`No. complaint` / 12, 0),
            `Daily Average` = round(`No. complaint` / 365, 0)) %>%
  arrange(desc(Year))
kable(yearlycomplaintdata)
| Year | No. complaint | Percentage | Monthly Average | Daily Average |
|---|---|---|---|---|
| 2023 | 7 | 0 % | 1 | 0 |
| 2022 | 486787 | 91.54 % | 40566 | 1334 |
| 2021 | 4513 | 0.85 % | 376 | 12 |
| 2020 | 392 | 0.07 % | 33 | 1 |
| 2019 | 182 | 0.03 % | 15 | 0 |
| 2018 | 149 | 0.03 % | 12 | 0 |
| 2017 | 106 | 0.02 % | 9 | 0 |
| 2016 | 65 | 0.01 % | 5 | 0 |
| 2015 | 60 | 0.01 % | 5 | 0 |
| 2014 | 66 | 0.01 % | 6 | 0 |
| 2013 | 35 | 0.01 % | 3 | 0 |
| 2012 | 25 | 0 % | 2 | 0 |
| 2011 | 29 | 0.01 % | 2 | 0 |
| 2010 | 20 | 0 % | 2 | 0 |
| 2009 | 11 | 0 % | 1 | 0 |
| 2008 | 14 | 0 % | 1 | 0 |
| 2007 | 14 | 0 % | 1 | 0 |
| 2006 | 13 | 0 % | 1 | 0 |
| 2005 | 6 | 0 % | 0 | 0 |
| 2004 | 7 | 0 % | 1 | 0 |
| 2003 | 5 | 0 % | 0 | 0 |
| 2002 | 7 | 0 % | 1 | 0 |
| 2001 | 4 | 0 % | 0 | 0 |
| 2000 | 4 | 0 % | 0 | 0 |
| 1999 | 5 | 0 % | 0 | 0 |
| 1997 | 6 | 0 % | 0 | 0 |
| 1996 | 2 | 0 % | 0 | 0 |
| 1995 | 2 | 0 % | 0 | 0 |
| 1994 | 2 | 0 % | 0 | 0 |
| 1993 | 3 | 0 % | 0 | 0 |
| 1992 | 3 | 0 % | 0 | 0 |
| 1990 | 3 | 0 % | 0 | 0 |
| 1989 | 3 | 0 % | 0 | 0 |
| 1988 | 4 | 0 % | 0 | 0 |
| 1987 | 4 | 0 % | 0 | 0 |
| 1986 | 1 | 0 % | 0 | 0 |
| 1985 | 1 | 0 % | 0 | 0 |
| 1984 | 1 | 0 % | 0 | 0 |
| 1979 | 1 | 0 % | 0 | 0 |
| 1977 | 1 | 0 % | 0 | 0 |
| 1972 | 1 | 0 % | 0 | 0 |
| 1969 | 1 | 0 % | 0 | 0 |
| 1961 | 1 | 0 % | 0 | 0 |
| NA | 39202 | 7.37 % | 3267 | 107 |
It appears that there were only 7 complaints in 2023, which may not be a representative sample since it is only a few months into the year. However, the data shows that there were 486,787 complaints in 2022, which accounts for the majority of the data.
# Bar chart of yearly complaint totals for 2015-2023.
# The year window is applied to the data instead of through scale limits:
# `limits = c(2015, 2023)` on a continuous scale discards the 2015 and 2023
# bars, because each bar's edges (x +/- half the bar width) fall outside the
# limits and are treated as out of bounds.
yearlycomplaintdata %>%
  filter(!is.na(Year), Year >= 2015, Year <= 2023) %>%
  ggplot(aes(x = Year, y = `No. complaint`)) +
  geom_col(fill = "dodgerblue") +
  labs(title = "Number of Complaints by Year (2015-2023)",
       x = "Year",
       y = "Number of Complaints") +
  scale_x_continuous(breaks = seq(2015, 2023, 1)) +
  theme(plot.title = element_text(hjust = 0.5))
# Convert CMPLNT_FR_DT to date format
# Parse CMPLNT_FR_DT (month/day/year text) into Date. The guard makes the
# chunk safe to re-run: parsing an already-converted Date column with mdy()
# would turn every value into NA.
if (is.character(NYPD_Data$CMPLNT_FR_DT)) {
  NYPD_Data$CMPLNT_FR_DT <- lubridate::mdy(NYPD_Data$CMPLNT_FR_DT)
}

# Complaint totals per calendar month ("YYYY-MM"), with share of all complaints
# and a rough daily average (total / 30), newest month first.
monthlycomplaintdata <- NYPD_Data %>%
  mutate(MonthYM = format(CMPLNT_FR_DT, "%Y-%m")) %>%
  group_by(MonthYM) %>%
  summarise(`No. complaint` = sum(num_complaint),
            Percentage = paste0(round((`No. complaint` / Totalcomplaint) * 100, 2), " %"),
            `Daily Average` = round(`No. complaint` / 30, 0)) %>%
  arrange(desc(MonthYM))
# Print the five most recent months
kable(head(monthlycomplaintdata, 5))
| MonthYM | No. complaint | Percentage | Daily Average |
|---|---|---|---|
| 2022-12 | 38358 | 7.21 % | 1279 |
| 2022-11 | 43150 | 8.11 % | 1438 |
| 2022-10 | 45769 | 8.61 % | 1526 |
| 2022-09 | 44856 | 8.44 % | 1495 |
| 2022-08 | 46485 | 8.74 % | 1550 |
The data frame shows the monthly complaint data, with the number of complaints, percentage of total complaints, and daily average complaints for each month; the five most recent months (August 2022 to December 2022) are displayed above. Among these, the highest number of complaints was in August 2022 with 46,485 complaints, while December 2022 had the fewest with 38,358 complaints.
library(ggplot2)
# Keep only the month buckets whose "YYYY-MM" label contains 2022
monthlycomplaintdata_2022 <- filter(monthlycomplaintdata,
                                    str_detect(MonthYM, "2022"))
# One column per month of 2022, title centred
ggplot(data = monthlycomplaintdata_2022,
       mapping = aes(x = MonthYM, y = `No. complaint`)) +
  geom_col(fill = "steelblue") +
  ggtitle("Monthly Complaint Data in 2022") +
  xlab("Month-Year") +
  ylab("Number of Complaints") +
  theme(plot.title = element_text(hjust = 0.5))
# Group the data by borough and year and calculate summary values
# Complaint totals per borough and year (year of CMPLNT_FR_DT), with share of
# all complaints and rough monthly/daily averages (total / 12, total / 365).
# The result stays grouped by BORO_NM (summarise() drops only the last grouping
# level); later chunks print and stack this grouped tibble, so it is
# deliberately not ungrouped here.
boroughyearlycomplaintdata <- NYPD_Data %>%
  mutate(Year = lubridate::year(CMPLNT_FR_DT)) %>%
  group_by(BORO_NM, Year) %>%
  summarise(
    `No. complaint` = sum(num_complaint),
    Percentage = paste0(round((`No. complaint` / Totalcomplaint) * 100, 2), "%"),
    `Monthly Average` = round(`No. complaint` / 12, 0),
    `Daily Average` = round(`No. complaint` / 365, 0)
  ) %>%
  arrange(desc(BORO_NM), desc(Year))
# Print the first five rows of the results
kable(head(boroughyearlycomplaintdata, 5))
| BORO_NM | Year | No. complaint | Percentage | Monthly Average | Daily Average |
|---|---|---|---|---|---|
| STATEN ISLAND | 2022 | 22055 | 4.15% | 1838 | 60 |
| STATEN ISLAND | 2021 | 478 | 0.09% | 40 | 1 |
| STATEN ISLAND | 2020 | 53 | 0.01% | 4 | 0 |
| STATEN ISLAND | 2019 | 22 | 0% | 2 | 0 |
| STATEN ISLAND | 2018 | 18 | 0% | 2 | 0 |
Because the large number of complaints across all the years makes it impossible to plot them all clearly, we will focus on 2015 and 2022.
library(ggplot2)
# Borough totals for 2015 only; this object is reused by later chunks.
boroughyearlycomplaintdata_2015 <- filter(boroughyearlycomplaintdata, Year == 2015)
# One bar per borough, coloured by borough
ggplot(data = boroughyearlycomplaintdata_2015,
       aes(x = BORO_NM, y = `No. complaint`, fill = BORO_NM)) +
  geom_col() +
  labs(title = "Total Number of Complaints by Borough in 2015",
       x = "Borough",
       y = "Number of Complaints")
library(ggplot2)
# Borough totals restricted to 2022
complaints_2022 <- filter(boroughyearlycomplaintdata, Year == 2022)
# One bar per borough, coloured by borough, with a centred title
ggplot(data = complaints_2022,
       mapping = aes(x = BORO_NM, y = `No. complaint`, fill = BORO_NM)) +
  geom_col() +
  labs(title = "Number of Complaints per Borough in 2022",
       x = "Borough",
       y = "Number of Complaints") +
  theme(plot.title = element_text(hjust = 0.5))
# Calculate the total number of complaints
Totalcomplaint <- sum(NYPD_Data$num_complaint)
# Complaint totals per calendar month ("YYYY-MM") and borough.
# Columns: the group total, the grand total (repeated on every row), the share
# of the grand total as text and as a number, and a rough daily average
# (total / 30). The grand total is computed once above instead of being
# re-summed over the full data set inside every group.
# The result stays grouped by MonthYM (summarise() drops only the last level).
boroughmonthlycomplaintdata <- NYPD_Data %>%
  mutate(MonthYM = format(CMPLNT_FR_DT, "%Y-%m")) %>%
  group_by(MonthYM, BORO_NM) %>%
  summarise(`No. complaint` = sum(num_complaint),
            `Total Complaints` = Totalcomplaint,
            Percentage = paste0(round((`No. complaint` / Totalcomplaint) * 100, 2), "%"),
            Percentage_int = round((`No. complaint` / Totalcomplaint) * 100, 2),
            `Daily Average` = round(`No. complaint` / 30, 0)) %>%
  arrange(desc(MonthYM))
# Print the first five rows of the results
kable(head(boroughmonthlycomplaintdata, 5))
| MonthYM | BORO_NM | No. complaint | Total Complaints | Percentage | Percentage_int | Daily Average |
|---|---|---|---|---|---|---|
| 2022-12 | (null) | 57 | 531768 | 0.01% | 0.01 | 2 |
| 2022-12 | BRONX | 8368 | 531768 | 1.57% | 1.57 | 279 |
| 2022-12 | BROOKLYN | 10518 | 531768 | 1.98% | 1.98 | 351 |
| 2022-12 | MANHATTAN | 9394 | 531768 | 1.77% | 1.77 | 313 |
| 2022-12 | QUEENS | 8260 | 531768 | 1.55% | 1.55 | 275 |
# Complaints whose occurrence started in 2020.
# Uses CMPLNT_FR_DT (converted to Date earlier in the script), consistent with
# the other monthly and borough summaries; the previous version filtered on
# CMPLNT_TO_DT, which is empty for many records, and re-parsed it on every use.
NYPD_Data_2020 <- NYPD_Data %>%
  filter(lubridate::year(CMPLNT_FR_DT) == 2020)
# Stacked monthly bar chart of the 2020 complaints, coloured by borough
ggplot(NYPD_Data_2020,
       aes(x = lubridate::month(CMPLNT_FR_DT, label = TRUE, abbr = FALSE),
           fill = BORO_NM)) +
  geom_bar() +
  labs(title = "Monthly Crime Complaints by Borough in 2020",
       x = "Month",
       y = "Number of Complaints") +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal()
# Group the data by borough and date and calculate summary values
# Complaint count per borough and day, plus each borough's average number of
# complaints per recorded day.
# After summarise() the data is still grouped by BORO_NM, so mean() is taken
# over that borough's daily counts. The previous version divided a single
# day's count by the number of distinct days, which rounds to 0 on every row.
# NOTE(review): the rendered table that follows this chunk shows the old
# all-zero column and must be regenerated.
boroughdailycomplaintdata <- NYPD_Data %>%
  group_by(BORO_NM, CMPLNT_FR_DT) %>%
  summarise(`No. complaint` = sum(num_complaint)) %>%
  mutate(`Daily Average` = round(mean(`No. complaint`), 0)) %>%
  arrange(desc(BORO_NM), desc(CMPLNT_FR_DT))
# Print the first five rows of the results
kable(head(boroughdailycomplaintdata, 5))
| BORO_NM | CMPLNT_FR_DT | No. complaint | Daily Average |
|---|---|---|---|
| STATEN ISLAND | 2022-12-31 | 32 | 0 |
| STATEN ISLAND | 2022-12-30 | 46 | 0 |
| STATEN ISLAND | 2022-12-29 | 61 | 0 |
| STATEN ISLAND | 2022-12-28 | 62 | 0 |
| STATEN ISLAND | 2022-12-27 | 55 | 0 |
# Records reported in Manhattan
manhattan_data <- filter(NYPD_Data, BORO_NM == "MANHATTAN")
# Complaint total per offence description, largest first
top_crime_complaints_manhattan <- arrange(
  summarise(group_by(manhattan_data, OFNS_DESC),
            total_complaints = sum(num_complaint)),
  desc(total_complaints)
)
# Ten most frequent offence categories in Manhattan
head(top_crime_complaints_manhattan, n = 10)
## # A tibble: 10 × 2
## OFNS_DESC total_complaints
## <chr> <dbl>
## 1 PETIT LARCENY 36786
## 2 GRAND LARCENY 18377
## 3 HARRASSMENT 2 16814
## 4 ASSAULT 3 & RELATED OFFENSES 11864
## 5 CRIMINAL MISCHIEF & RELATED OF 9964
## 6 FELONY ASSAULT 5142
## 7 BURGLARY 4699
## 8 ROBBERY 4168
## 9 OFF. AGNST PUB ORD SENSBLTY & 3798
## 10 DANGEROUS DRUGS 3223
library(ggplot2)
# Bar chart of the first ten rows (the ranking is already sorted), bars ordered
# by complaint count and x labels tilted for readability
top_crime_complaints_manhattan[1:10, ] %>%
  ggplot(aes(x = reorder(OFNS_DESC, total_complaints), y = total_complaints)) +
  geom_col(fill = "steelblue") +
  ggtitle("Top 10 Crime Complaints in Manhattan") +
  xlab("Offense Description") +
  ylab("Number of Complaints") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Filter the data to include only the Bronx
bronx_data <- NYPD_Data %>%
  filter(BORO_NM == "BRONX")
# Complaint total per offence description in the Bronx, largest first
top_crime_complaints <- arrange(
  summarise(group_by(bronx_data, OFNS_DESC),
            total_complaints = sum(num_complaint)),
  desc(total_complaints)
)
# Ten most frequent offence categories
head(top_crime_complaints, n = 10)
## # A tibble: 10 × 2
## OFNS_DESC total_complaints
## <chr> <dbl>
## 1 HARRASSMENT 2 18951
## 2 PETIT LARCENY 18536
## 3 ASSAULT 3 & RELATED OFFENSES 13907
## 4 CRIMINAL MISCHIEF & RELATED OF 8878
## 5 GRAND LARCENY 8073
## 6 FELONY ASSAULT 7512
## 7 ROBBERY 5161
## 8 OFF. AGNST PUB ORD SENSBLTY & 4391
## 9 GRAND LARCENY OF MOTOR VEHICLE 4010
## 10 VEHICLE AND TRAFFIC LAWS 3113
# Bar chart of the first ten rows of the Bronx ranking, bars ordered by
# complaint count and x labels tilted for readability
top_crime_complaints[1:10, ] %>%
  ggplot(aes(x = reorder(OFNS_DESC, total_complaints), y = total_complaints)) +
  geom_col(fill = "skyblue") +
  ggtitle("Top 10 Crime Complaints in the Bronx") +
  xlab("Offense Description") +
  ylab("Number of Complaints") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Filter the data to include only complaints from Staten Island
SI_Data <- NYPD_Data %>%
  filter(BORO_NM == "STATEN ISLAND")
# Complaint total per offence description on Staten Island, largest first
top_crime_complaints_SI <- arrange(
  summarise(group_by(SI_Data, OFNS_DESC),
            total_complaints = sum(num_complaint)),
  desc(total_complaints)
)
# Ten most frequent offence categories
head(top_crime_complaints_SI, n = 10)
## # A tibble: 10 × 2
## OFNS_DESC total_complaints
## <chr> <dbl>
## 1 HARRASSMENT 2 4704
## 2 PETIT LARCENY 4006
## 3 CRIMINAL MISCHIEF & RELATED OF 2392
## 4 ASSAULT 3 & RELATED OFFENSES 2218
## 5 GRAND LARCENY 1378
## 6 OFF. AGNST PUB ORD SENSBLTY & 1128
## 7 MISCELLANEOUS PENAL LAW 1057
## 8 FELONY ASSAULT 783
## 9 VEHICLE AND TRAFFIC LAWS 699
## 10 DANGEROUS DRUGS 514
library(ggplot2)
# Bar chart of the first ten rows of the Staten Island ranking, bars ordered by
# complaint count and x labels tilted for readability
top_crime_complaints_SI[1:10, ] %>%
  ggplot(aes(x = reorder(OFNS_DESC, total_complaints), y = total_complaints)) +
  geom_col(fill = "skyblue") +
  ggtitle("Top 10 Crime Complaints in Staten Island") +
  xlab("Offense Description") +
  ylab("Number of Complaints") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Filter the data to only include Brooklyn
brooklyn_data <- filter(NYPD_Data, BORO_NM == "BROOKLYN")
# Complaint total per offence description in Brooklyn, largest first
top_crime_complaints <- arrange(
  summarise(group_by(brooklyn_data, OFNS_DESC),
            total_complaints = sum(num_complaint)),
  desc(total_complaints)
)
# Ten most frequent offence categories
head(top_crime_complaints, n = 10)
## # A tibble: 10 × 2
## OFNS_DESC total_complaints
## <chr> <dbl>
## 1 PETIT LARCENY 29834
## 2 HARRASSMENT 2 24859
## 3 ASSAULT 3 & RELATED OFFENSES 15267
## 4 GRAND LARCENY 13062
## 5 CRIMINAL MISCHIEF & RELATED OF 12801
## 6 FELONY ASSAULT 7501
## 7 OFF. AGNST PUB ORD SENSBLTY & 5800
## 8 MISCELLANEOUS PENAL LAW 5529
## 9 BURGLARY 4514
## 10 ROBBERY 4474
# Bar chart of the first ten rows of the Brooklyn ranking, bars ordered by
# complaint count and x labels tilted for readability
top_crime_complaints[1:10, ] %>%
  ggplot(aes(x = reorder(OFNS_DESC, total_complaints), y = total_complaints)) +
  geom_col(fill = "skyblue") +
  ggtitle("Top 10 Crime Complaints in Brooklyn") +
  xlab("Offense Description") +
  ylab("Number of Complaints") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Filter the data for complaints in Queens
queens_data <- filter(NYPD_Data, BORO_NM == "QUEENS")
# Complaint total per offence description in Queens, largest first
top_crime_complaints <- arrange(
  summarise(group_by(queens_data, OFNS_DESC),
            total_complaints = sum(num_complaint)),
  desc(total_complaints)
)
# Ten most frequent offence categories
head(top_crime_complaints, n = 10)
## # A tibble: 10 × 2
## OFNS_DESC total_complaints
## <chr> <dbl>
## 1 PETIT LARCENY 25407
## 2 HARRASSMENT 2 17664
## 3 ASSAULT 3 & RELATED OFFENSES 11716
## 4 GRAND LARCENY 10677
## 5 CRIMINAL MISCHIEF & RELATED OF 10671
## 6 FELONY ASSAULT 5141
## 7 MISCELLANEOUS PENAL LAW 4123
## 8 GRAND LARCENY OF MOTOR VEHICLE 3618
## 9 OFF. AGNST PUB ORD SENSBLTY & 3446
## 10 ROBBERY 3317
# Horizontal bar chart of the ten most frequent offence categories in Queens,
# ordered by complaint count, with a centred title.
ggplot(head(top_crime_complaints, 10),
       aes(x = total_complaints, y = fct_reorder(OFNS_DESC, total_complaints))) +
  geom_col(fill = "lightblue") +
  labs(title = "Top 10 Crime Complaints in Queens",
       x = "Number of Complaints",
       y = "Offense Description") +
  theme(plot.title = element_text(hjust = 0.5))

# Mean, median and standard deviation of the per-borough complaint totals.
# Records whose borough is missing or recorded as the literal "(null)" are
# excluded; otherwise they form a sixth pseudo-borough that distorts all three
# statistics.
# NOTE(review): the rendered numbers that follow this chunk include the
# "(null)" group and must be regenerated.
complaints_by_borough_summary <- NYPD_Data %>%
  filter(!is.na(BORO_NM), BORO_NM != "(null)") %>%
  group_by(BORO_NM) %>%
  summarise(num_complaints = n()) %>%
  summarise(mean_complaints = mean(num_complaints),
            median_complaints = median(num_complaints),
            sd_complaints = sd(num_complaints))
# Print summary statistics
cat("Summary statistics of number of complaints by borough:\n")
## Summary statistics of number of complaints by borough:
print(complaints_by_borough_summary)
## # A tibble: 1 × 3
## mean_complaints median_complaints sd_complaints
## <dbl> <dbl> <dbl>
## 1 88628 113650 61245.
The summary statistics table shows the mean, median, and standard deviation of the number of complaints across all boroughs in the NYPD_Data dataset.
mean_complaints: The average number of complaints across all boroughs is 88,628. median_complaints: The median number of complaints across all boroughs is 113,650, which means that half of the boroughs have fewer complaints and half have more. sd_complaints: The standard deviation of the number of complaints across all boroughs is 61,245, which indicates that there is a wide variation in the number of complaints between the boroughs.
# Citywide complaint total per offence description, largest first
top_crime_complaints <- arrange(
  summarise(group_by(NYPD_Data, OFNS_DESC),
            total_complaints = sum(num_complaint)),
  desc(total_complaints)
)
# Ten most frequent offence categories across the whole data set
head(top_crime_complaints, n = 10)
## # A tibble: 10 × 2
## OFNS_DESC total_complaints
## <chr> <dbl>
## 1 PETIT LARCENY 115477
## 2 HARRASSMENT 2 83003
## 3 ASSAULT 3 & RELATED OFFENSES 54988
## 4 GRAND LARCENY 51575
## 5 CRIMINAL MISCHIEF & RELATED OF 44716
## 6 FELONY ASSAULT 26081
## 7 OFF. AGNST PUB ORD SENSBLTY & 18564
## 8 ROBBERY 17411
## 9 MISCELLANEOUS PENAL LAW 15851
## 10 BURGLARY 15759
library(ggplot2)
# Bar chart of the ten most frequent offence categories citywide, bars ordered
# by complaint count, bold centred title and tilted x labels.
ggplot(head(top_crime_complaints, 10),
       aes(x = reorder(OFNS_DESC, total_complaints), y = total_complaints)) +
  geom_col(fill = "darkblue") +
  xlab("Crime Complaints") +
  ylab("Number of Complaints") +
  ggtitle("Top 10 Crime Complaints in NYC") +
  theme(plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1))
Since our research question is focused on identifying if there is a statistically significant difference in the number of complaints among different boroughs, a two-sample t-test would be appropriate. We can compare the mean number of complaints per year between two boroughs and test whether the difference is statistically significant.
To perform a t-test, we will first need to extract the necessary data. We will extract the number of complaints for two boroughs (Brooklyn and Manhattan).
The null hypothesis
H0: μ1 = μ2 (The mean number of complaints in Brooklyn is equal to the mean number of complaints in Manhattan)
The alternative hypothesis
HA: μ1 ≠ μ2 (The mean number of complaints in Brooklyn is not equal to the mean number of complaints in Manhattan)
Here, μ1 represents the population mean of the number of complaints in Brooklyn, and μ2 represents the population mean of the number of complaints in Manhattan
# One subset of the month-by-borough totals per borough
BRONX_data <- boroughmonthlycomplaintdata[
  boroughmonthlycomplaintdata[["BORO_NM"]] == "BRONX", ]
QUEENS_data <- boroughmonthlycomplaintdata[
  boroughmonthlycomplaintdata[["BORO_NM"]] == "QUEENS", ]
STATEN_ISLAND_data <- boroughmonthlycomplaintdata[
  boroughmonthlycomplaintdata[["BORO_NM"]] == "STATEN ISLAND", ]
MANHATTAN_data <- boroughmonthlycomplaintdata[
  boroughmonthlycomplaintdata[["BORO_NM"]] == "MANHATTAN", ]
BROOKLYN_data <- boroughmonthlycomplaintdata[
  boroughmonthlycomplaintdata[["BORO_NM"]] == "BROOKLYN", ]
# Two-sample t-test on the monthly complaint counts of Brooklyn vs Manhattan
t.test(x = BROOKLYN_data$`No. complaint`, y = MANHATTAN_data$`No. complaint`)
##
## Welch Two Sample t-test
##
## data: BROOKLYN_data$`No. complaint` and MANHATTAN_data$`No. complaint`
## t = 0.044592, df = 406.74, p-value = 0.9645
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -514.7782 538.6747
## sample estimates:
## mean of x mean of y
## 692.3615 680.4133
The output of the Welch Two Sample t-test suggests that there is no significant difference between the mean number of complaints for Brooklyn and Manhattan.
The t-value of 0.044592 suggests that there is a small difference between the means of the two groups, but the p-value of 0.9645 indicates that this difference is not statistically significant. The degrees of freedom (df) value of 406.74 reflects the use of the Welch’s t-test, which is used when the variances of the two groups are not equal.
The 95% confidence interval of the difference in means ranges from -514.7782 to 538.6747, which includes 0. This suggests that we can be 95% confident that the true difference in means between the two boroughs falls within this range.
The sample estimates show that the mean number of complaints in Brooklyn is slightly higher than in Manhattan, with a mean of 692.3615 for Brooklyn and 680.4133 for Manhattan.
Since the p-value is greater than the chosen significance level (usually 0.05), we fail to reject the null hypothesis, which is that there is no difference in the mean number of complaints between Brooklyn and Manhattan.
Therefore, based on this analysis, we can conclude that there is no statistically significant difference in the mean number of complaints between the two boroughs.
library(ggplot2)
# Box plot of the monthly complaint counts for the two boroughs compared in the
# t-test. The data is restricted to Brooklyn and Manhattan so the figure
# matches its title; previously every borough, including the "(null)" group,
# was drawn.
boroughmonthlycomplaintdata %>%
  filter(BORO_NM %in% c("BROOKLYN", "MANHATTAN")) %>%
  ggplot(aes(x = BORO_NM, y = `No. complaint`)) +
  geom_boxplot(aes(fill = BORO_NM)) +
  xlab("Borough") +
  ylab("Number of Complaints") +
  ggtitle("Comparison of Mean Number of Complaints between Brooklyn and Manhattan")
# Create a data frame with the total number of complaints for Brooklyn and Manhattan
# Total complaints for Brooklyn and Manhattan, summed over the monthly subsets.
bm_complaints <- data.frame(
  Borough = c("Brooklyn", "Manhattan"),
  Total_Complaints = c(sum(BROOKLYN_data$`No. complaint`),
                       sum(MANHATTAN_data$`No. complaint`))
)
# Each borough's share of the two-borough total, in percent (one decimal)
bm_complaints$Percentage <- round(
  bm_complaints$Total_Complaints / sum(bm_complaints$Total_Complaints) * 100, 1
)
# Pie chart: a single stacked bar drawn in polar coordinates, with the
# percentage printed in the middle of each slice
ggplot(bm_complaints, aes(x = "", y = Percentage, fill = Borough)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  labs(title = "Percentage of Total Complaints for Brooklyn and Manhattan",
       fill = "Borough") +
  theme_void() +
  scale_fill_manual(values = c("SKYblue", "deepskyblue4")) +
  geom_text(aes(label = paste0(Percentage, "%")),
            position = position_stack(vjust = 0.5))
conduct a linear regression analysis for the years 2015 and 2022
# Show the 2015 per-borough totals.
print(boroughyearlycomplaintdata_2015)
## # A tibble: 5 × 6
## # Groups: BORO_NM [5]
## BORO_NM Year `No. complaint` Percentage `Monthly Average` Daily Avera…¹
## <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 STATEN ISLAND 2015 4 0% 0 0
## 2 QUEENS 2015 28 0.01% 2 0
## 3 MANHATTAN 2015 25 0% 2 0
## 4 BROOKLYN 2015 30 0.01% 2 0
## 5 BRONX 2015 21 0% 2 0
## # … with abbreviated variable name ¹​`Daily Average`
# 2022 per-borough totals, reduced to borough, year and count, sorted by
# borough name
boroughyearlycomplaintdata_2022 <- arrange(
  select(filter(boroughyearlycomplaintdata, Year == 2022),
         BORO_NM, Year, `No. complaint`),
  BORO_NM
)
print(boroughyearlycomplaintdata_2022)
## # A tibble: 6 × 3
## # Groups: BORO_NM [6]
## BORO_NM Year `No. complaint`
## <chr> <dbl> <dbl>
## 1 (null) 2022 949
## 2 BRONX 2022 110055
## 3 BROOKLYN 2022 144319
## 4 MANHATTAN 2022 130973
## 5 QUEENS 2022 112244
## 6 STATEN ISLAND 2022 22055
# Regress the borough complaint counts on year, using the stacked 2015 and
# 2022 borough totals as the data.
lm_model <- lm(
  formula = `No. complaint` ~ Year,
  data = rbind(boroughyearlycomplaintdata_2015, boroughyearlycomplaintdata_2022)
)
summary(object = lm_model)
##
## Call:
## lm(formula = `No. complaint` ~ Year, data = rbind(boroughyearlycomplaintdata_2015,
## boroughyearlycomplaintdata_2022))
##
## Residuals:
## Min 1Q Median 3Q Max
## -85817 -9 6 24384 57553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24969926 7812200 -3.196 0.0109 *
## Year 12392 3870 3.202 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44730 on 9 degrees of freedom
## Multiple R-squared: 0.5326, Adjusted R-squared: 0.4807
## F-statistic: 10.25 on 1 and 9 DF, p-value: 0.01079
The linear regression model that we fitted estimates that the number of complaints has increased by approximately 12,392 per year since 2015. The intercept of the model (-24,969,926) is not meaningful in this context as it represents the number of complaints in the year 0 which is an arbitrary point of reference.
The p-value for the Year variable in the model is less than the typical threshold of 0.05, which indicates that the Year variable is statistically significant in predicting the number of complaints. In other words, we can say that there is a significant linear relationship between the year and the number of complaints.
The adjusted R-squared value of the model is 0.4807. This means that the Year variable explains about 48% of the variation in the number of complaints. This suggests that there are other factors besides time (Year) that are influencing the number of complaints.
Finally, the F-statistic and its associated p-value indicate that the regression line fits the data significantly better than a horizontal line, providing evidence that there is indeed a significant linear relationship between Year and the number of complaints.
Number of Complaints per Year
library(ggplot2)
# Stack the 2015 and 2022 borough totals into one data frame
combined_data <- rbind(boroughyearlycomplaintdata_2015, boroughyearlycomplaintdata_2022)
# Scatter plot of complaint count against year with a fitted straight line
# (no confidence band)
ggplot(data = combined_data, aes(x = Year, y = `No. complaint`)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Year",
       y = "Number of Complaints",
       title = "Number of Complaints per Year") +
  theme_bw()
scatter plot for both years
# Scatter plot of the 2015 and 2022 borough totals, one colour and one fitted
# line per borough. The legend title is set once through labs(); the duplicate
# scale_color_discrete(name = "Borough") was redundant.
ggplot(data = rbind(boroughyearlycomplaintdata_2015, boroughyearlycomplaintdata_2022),
       aes(x = Year, y = `No. complaint`, color = BORO_NM)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Number of Complaints by Year and Borough",
       x = "Year",
       y = "Number of Complaints",
       color = "Borough") +
  theme_minimal()
Our analysis of the NYPD complaints dataset revealed that Brooklyn had the highest number of complaints overall, followed by Manhattan and Queens . We also observed an overall decrease in the number of complaints in recent years. Our statistical analysis using a two-sample t-test showed that there was no significant difference in the mean number of complaints per year between Brooklyn and Manhattan. Our results can inform policy decisions and community engagement strategies aimed at improving police accountability and trust in the city.
We also performed a linear regression analysis on the data for 2015 and 2022. The analysis showed that there was a significant positive correlation between the year and the number of complaints. The regression equation showed that for every one-year increase, the number of complaints increased by an average of 12,392. This finding suggests that although there has been an overall decrease in the number of complaints over recent years, this decrease is not linear and complaints may continue to increase in the future. Therefore, it is important to continue monitoring and addressing concerns related to police-community relations in New York City.