Libraries
library(data.table)
library(dplyr)
library(lubridate)
library(hms)
library(ggplot2)
library(maptools)
library(ggmap)
library(scales)
library(tseries)
library(forecast)
library(zoo)
Load data
df_full <- fread('C:/Users/antho/Documents/projects/citizen/NYPD_Complaint_Data_Historic.csv')
Sample and Format data
set.seed(123)
df <- df_full[sample(nrow(df_full), size = 500000, replace = FALSE), ]
glimpse(df)
## Observations: 500,000
## Variables: 35
## $ CMPLNT_NUM <int> 529743672, 613252886, 294985478, 728497476, ...
## $ CMPLNT_FR_DT <chr> "09/13/2014", "01/25/2012", "02/04/2010", "0...
## $ CMPLNT_FR_TM <chr> "22:30:00", "21:50:00", "22:15:00", "11:50:0...
## $ CMPLNT_TO_DT <chr> "09/14/2014", "01/25/2012", "", "03/13/2012"...
## $ CMPLNT_TO_TM <chr> "13:00:00", "22:37:00", "", "12:05:00", "12:...
## $ ADDR_PCT_CD <int> 68, 45, 46, 40, 72, 101, 41, 43, 79, 20, 83,...
## $ RPT_DT <chr> "09/14/2014", "01/25/2012", "02/04/2010", "0...
## $ KY_CD <int> 109, 105, 235, 351, 341, 361, 351, 109, 344,...
## $ OFNS_DESC <chr> "GRAND LARCENY", "ROBBERY", "DANGEROUS DRUGS...
## $ PD_CD <int> 457, 397, 567, 259, 321, 639, 259, 413, 101,...
## $ PD_DESC <chr> "LARCENY,GRAND OF VEHICULAR/MOTORCYCLE ACCES...
## $ CRM_ATPT_CPTD_CD <chr> "COMPLETED", "COMPLETED", "COMPLETED", "ATTE...
## $ LAW_CAT_CD <chr> "FELONY", "FELONY", "MISDEMEANOR", "MISDEMEA...
## $ BORO_NM <chr> "BROOKLYN", "BRONX", "BRONX", "BRONX", "BROO...
## $ LOC_OF_OCCUR_DESC <chr> "", "", "", "", "INSIDE", "INSIDE", "INSIDE"...
## $ PREM_TYP_DESC <chr> "STREET", "STREET", "STREET", "OTHER", "BEAU...
## $ JURIS_DESC <chr> "N.Y. POLICE DEPT", "N.Y. POLICE DEPT", "N.Y...
## $ JURISDICTION_CODE <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PARKS_NM <chr> "", NA, NA, NA, NA, NA, NA, NA, NA, "", "", ...
## $ HADEVELOPT <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ HOUSING_PSA <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "", NA, ...
## $ X_COORD_CD <int> 978011, 1027392, 1008817, 1008278, 982106, 1...
## $ Y_COORD_CD <int> 171336, 246172, 250725, 237309, 172074, 1567...
## $ SUSP_AGE_GROUP <chr> "", "", "", "", "", "", "45-64", "", "", "",...
## $ SUSP_RACE <chr> "", "", "", "", "WHITE HISPANIC", "BLACK", "...
## $ SUSP_SEX <chr> "", "", "", "", "M", "F", "M", "", "", "", "...
## $ TRANSIT_DISTRICT <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Latitude <dbl> 40.63696, 40.84226, 40.85483, 40.81801, 40.6...
## $ Longitude <dbl> -74.02248, -73.84408, -73.91120, -73.91319, ...
## $ Lat_Lon <chr> "(40.636960333, -74.022480845)", "(40.842263...
## $ PATROL_BORO <chr> "PATROL BORO BKLYN SOUTH", "PATROL BORO BRON...
## $ STATION_NAME <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ VIC_AGE_GROUP <chr> "45-64", "18-24", "", "", "25-44", "25-44", ...
## $ VIC_RACE <chr> "WHITE", "WHITE", "UNKNOWN", "UNKNOWN", "ASI...
## $ VIC_SEX <chr> "F", "M", "E", "E", "F", "F", "D", "M", "F",...
names(df) <- c('complaint', 'from_date', 'from_time', 'to_date', 'to_time', 'precinct', 'report_date',
'off_code', 'off_desc', 'class_code', 'class_desc', 'outcome', 'level', 'boro', 'location',
'premise', 'juris_desc', 'juris_code', 'park', 'house_dev', 'house_psa', 'x_coord', 'y_coord',
'susp_age', 'susp_race', 'sus_sex', 'transit', 'lat', 'long', 'lat_lon', 'patrol_boro', 'station',
'vic_age', 'vic_race', 'vic_sex')
# dates
df$from_date <- mdy(df$from_date)
df$to_date <- mdy(df$to_date)
df$report_date <- mdy(df$report_date)
# time
df$from_time <- as_hms(df$from_time)
## Warning: Lossy cast from <character> to <hms> at position(s) 18936, 303464,
## 350933, 359108, 398934
df$to_time <- as_hms(df$to_time)
## Warning: Lossy cast from <character> to <hms> at position(s) 3, 13, 22, 52,
## 53, ... (and 122967 more)
df$from_hour <- hour(df$from_time)
df$year <- year(df$report_date)
df$month_year <- as.yearmon(df$report_date)
Explore data
str(df)
## Classes 'data.table' and 'data.frame': 500000 obs. of 38 variables:
## $ complaint : int 529743672 613252886 294985478 728497476 913842706 107262943 394025820 503121618 472147001 621315823 ...
## $ from_date : Date, format: "2014-09-13" "2012-01-25" ...
## $ from_time : 'hms' num 22:30:00 21:50:00 22:15:00 11:50:00 ...
## ..- attr(*, "units")= chr "secs"
## $ to_date : Date, format: "2014-09-14" "2012-01-25" ...
## $ to_time : 'hms' num 13:00:00 22:37:00 NA 12:05:00 ...
## ..- attr(*, "units")= chr "secs"
## $ precinct : int 68 45 46 40 72 101 41 43 79 20 ...
## $ report_date: Date, format: "2014-09-14" "2012-01-25" ...
## $ off_code : int 109 105 235 351 341 361 351 109 344 233 ...
## $ off_desc : chr "GRAND LARCENY" "ROBBERY" "DANGEROUS DRUGS" "CRIMINAL MISCHIEF & RELATED OF" ...
## $ class_code : int 457 397 567 259 321 639 259 413 101 175 ...
## $ class_desc : chr "LARCENY,GRAND OF VEHICULAR/MOTORCYCLE ACCESSORIES" "ROBBERY,OPEN AREA UNCLASSIFIED" "MARIJUANA, POSSESSION 4 & 5" "CRIMINAL MISCHIEF,UNCLASSIFIED 4" ...
## $ outcome : chr "COMPLETED" "COMPLETED" "COMPLETED" "ATTEMPTED" ...
## $ level : chr "FELONY" "FELONY" "MISDEMEANOR" "MISDEMEANOR" ...
## $ boro : chr "BROOKLYN" "BRONX" "BRONX" "BRONX" ...
## $ location : chr "" "" "" "" ...
## $ premise : chr "STREET" "STREET" "STREET" "OTHER" ...
## $ juris_desc : chr "N.Y. POLICE DEPT" "N.Y. POLICE DEPT" "N.Y. POLICE DEPT" "N.Y. POLICE DEPT" ...
## $ juris_code : int 0 0 0 0 0 0 0 0 0 0 ...
## $ park : chr "" NA NA NA ...
## $ house_dev : chr "" "" "" "" ...
## $ house_psa : chr NA NA NA NA ...
## $ x_coord : int 978011 1027392 1008817 1008278 982106 1046025 1014179 1024197 1001420 991168 ...
## $ y_coord : int 171336 246172 250725 237309 172074 156791 239453 234000 187356 224929 ...
## $ susp_age : chr "" "" "" "" ...
## $ susp_race : chr "" "" "" "" ...
## $ sus_sex : chr "" "" "" "" ...
## $ transit : int NA NA NA NA NA NA NA NA NA NA ...
## $ lat : num 40.6 40.8 40.9 40.8 40.6 ...
## $ long : num -74 -73.8 -73.9 -73.9 -74 ...
## $ lat_lon : chr "(40.636960333, -74.022480845)" "(40.84226332, -73.84407906)" "(40.854831167, -73.911195468)" "(40.818009625, -73.913191928)" ...
## $ patrol_boro: chr "PATROL BORO BKLYN SOUTH" "PATROL BORO BRONX" "PATROL BORO BRONX" "PATROL BORO BRONX" ...
## $ station : chr "" "" "" "" ...
## $ vic_age : chr "45-64" "18-24" "" "" ...
## $ vic_race : chr "WHITE" "WHITE" "UNKNOWN" "UNKNOWN" ...
## $ vic_sex : chr "F" "M" "E" "E" ...
## $ from_hour : int 22 21 22 11 12 13 12 18 2 3 ...
## $ year : num 2014 2012 2010 2012 2013 ...
## $ month_year : 'yearmon' num Sep 2014 Jan 2012 Feb 2010 Mar 2012 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(df)
## complaint from_date from_time
## Min. :100003254 Min. :1017-03-18 Length:500000
## 1st Qu.:325043348 1st Qu.:2009-01-11 Class1:hms
## Median :550582272 Median :2012-03-25 Class2:difftime
## Mean :550171924 Mean :2012-04-10 Mode :numeric
## 3rd Qu.:775115150 3rd Qu.:2015-07-15
## Max. :999999904 Max. :2018-12-31
## NA's :52
## to_date to_time precinct
## Min. :1972-01-02 Length:500000 Min. : 1.00
## 1st Qu.:2009-06-06 Class1:hms 1st Qu.: 40.00
## Median :2012-09-23 Class2:difftime Median : 63.00
## Mean :2012-08-24 Mode :numeric Mean : 63.43
## 3rd Qu.:2015-11-24 3rd Qu.: 94.00
## Max. :2018-12-31 Max. :123.00
## NA's :123342 NA's :165
## report_date off_code off_desc class_code
## Min. :2006-01-01 Min. :101.0 Length:500000 Min. :101.0
## 1st Qu.:2009-01-23 1st Qu.:117.0 Class :character 1st Qu.:254.0
## Median :2012-04-05 Median :341.0 Mode :character Median :388.0
## Mean :2012-05-01 Mean :295.2 Mean :416.2
## 3rd Qu.:2015-07-25 3rd Qu.:351.0 3rd Qu.:637.0
## Max. :2018-12-31 Max. :881.0 Max. :969.0
## NA's :442
## class_desc outcome level
## Length:500000 Length:500000 Length:500000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## boro location premise
## Length:500000 Length:500000 Length:500000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## juris_desc juris_code park
## Length:500000 Min. : 0.0000 Length:500000
## Class :character 1st Qu.: 0.0000 Class :character
## Mode :character Median : 0.0000 Mode :character
## Mean : 0.7196
## 3rd Qu.: 0.0000
## Max. :97.0000
## NA's :442
## house_dev house_psa x_coord y_coord
## Length:500000 Length:500000 Min. : 152021 Min. : 121131
## Class :character Class :character 1st Qu.: 991686 1st Qu.: 184272
## Mode :character Mode :character Median :1004448 Median : 205840
## Mean :1004789 Mean : 206961
## 3rd Qu.:1016634 3rd Qu.: 235172
## Max. :1067249 Max. :2261851
## NA's :1362 NA's :1362
## susp_age susp_race sus_sex transit
## Length:500000 Length:500000 Length:500000 Min. : 1.0
## Class :character Class :character Class :character 1st Qu.: 3.0
## Mode :character Mode :character Mode :character Median :11.0
## Mean :13.4
## 3rd Qu.:30.0
## Max. :34.0
## NA's :488961
## lat long lat_lon patrol_boro
## Min. :40.50 Min. :-77.00 Length:500000 Length:500000
## 1st Qu.:40.67 1st Qu.:-73.97 Class :character Class :character
## Median :40.73 Median :-73.93 Mode :character Mode :character
## Mean :40.73 Mean :-73.93
## 3rd Qu.:40.81 3rd Qu.:-73.88
## Max. :46.36 Max. :-73.70
## NA's :1362 NA's :1362
## station vic_age vic_race
## Length:500000 Length:500000 Length:500000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## vic_sex from_hour year month_year
## Length:500000 Min. : 0.00 Min. :2006 Min. :2006
## Class :character 1st Qu.: 9.00 1st Qu.:2009 1st Qu.:2009
## Mode :character Median :14.00 Median :2012 Median :2012
## Mean :13.24 Mean :2012 Mean :2012
## 3rd Qu.:19.00 3rd Qu.:2015 3rd Qu.:2016
## Max. :23.00 Max. :2018 Max. :2019
## NA's :5
glimpse(df)
## Observations: 500,000
## Variables: 38
## $ complaint <int> 529743672, 613252886, 294985478, 728497476, 913842...
## $ from_date <date> 2014-09-13, 2012-01-25, 2010-02-04, 2012-03-13, 2...
## $ from_time <time> 22:30:00, 21:50:00, 22:15:00, 11:50:00, 12:00:00,...
## $ to_date <date> 2014-09-14, 2012-01-25, NA, 2012-03-13, 2013-03-0...
## $ to_time <time> 13:00:00, 22:37:00, NA, 12:05:00, 12:45:00,...
## $ precinct <int> 68, 45, 46, 40, 72, 101, 41, 43, 79, 20, 83, 43, 1...
## $ report_date <date> 2014-09-14, 2012-01-25, 2010-02-04, 2012-03-13, 2...
## $ off_code <int> 109, 105, 235, 351, 341, 361, 351, 109, 344, 233, ...
## $ off_desc <chr> "GRAND LARCENY", "ROBBERY", "DANGEROUS DRUGS", "CR...
## $ class_code <int> 457, 397, 567, 259, 321, 639, 259, 413, 101, 175, ...
## $ class_desc <chr> "LARCENY,GRAND OF VEHICULAR/MOTORCYCLE ACCESSORIES...
## $ outcome <chr> "COMPLETED", "COMPLETED", "COMPLETED", "ATTEMPTED"...
## $ level <chr> "FELONY", "FELONY", "MISDEMEANOR", "MISDEMEANOR", ...
## $ boro <chr> "BROOKLYN", "BRONX", "BRONX", "BRONX", "BROOKLYN",...
## $ location <chr> "", "", "", "", "INSIDE", "INSIDE", "INSIDE", "FRO...
## $ premise <chr> "STREET", "STREET", "STREET", "OTHER", "BEAUTY & N...
## $ juris_desc <chr> "N.Y. POLICE DEPT", "N.Y. POLICE DEPT", "N.Y. POLI...
## $ juris_code <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,...
## $ park <chr> "", NA, NA, NA, NA, NA, NA, NA, NA, "", "", NA, NA...
## $ house_dev <chr> "", "", "", "", "", "", "", "", "", "", "", "", ""...
## $ house_psa <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "", NA, NA, NA...
## $ x_coord <int> 978011, 1027392, 1008817, 1008278, 982106, 1046025...
## $ y_coord <int> 171336, 246172, 250725, 237309, 172074, 156791, 23...
## $ susp_age <chr> "", "", "", "", "", "", "45-64", "", "", "", "UNKN...
## $ susp_race <chr> "", "", "", "", "WHITE HISPANIC", "BLACK", "BLACK"...
## $ sus_sex <chr> "", "", "", "", "M", "F", "M", "", "", "", "U", "M...
## $ transit <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ lat <dbl> 40.63696, 40.84226, 40.85483, 40.81801, 40.63899, ...
## $ long <dbl> -74.02248, -73.84408, -73.91120, -73.91319, -74.00...
## $ lat_lon <chr> "(40.636960333, -74.022480845)", "(40.84226332, -7...
## $ patrol_boro <chr> "PATROL BORO BKLYN SOUTH", "PATROL BORO BRONX", "P...
## $ station <chr> "", "", "", "", "", "", "", "", "", "", "", "", ""...
## $ vic_age <chr> "45-64", "18-24", "", "", "25-44", "25-44", "", "4...
## $ vic_race <chr> "WHITE", "WHITE", "UNKNOWN", "UNKNOWN", "ASIAN / P...
## $ vic_sex <chr> "F", "M", "E", "E", "F", "F", "D", "M", "F", "M", ...
## $ from_hour <int> 22, 21, 22, 11, 12, 13, 12, 18, 2, 3, 11, 19, 17, ...
## $ year <dbl> 2014, 2012, 2010, 2012, 2013, 2007, 2011, 2006, 20...
## $ month_year <yearmon> Sep 2014, Jan 2012, Feb 2010, Mar 2012, Mar 20...
Analysis
# summary
df_date <- df %>% group_by(report_date) %>% summarise(count = n_distinct(complaint))
df_time <- df %>% group_by(from_hour) %>% summarise(count = n_distinct(complaint)) %>% mutate(perc = percent(count/sum(count)))
df_boro <- df %>% filter(boro != "") %>% group_by(boro) %>% summarise(count = n_distinct(complaint)) %>% mutate(perc = count/sum(count))
ggplot(df_date, aes(report_date, count)) + geom_line() + xlab('Year') + ylab('Complaints') + ggtitle('NYPD Complaints, 2006 to 2018')

ggplot(df_time, aes(x = from_hour, y = count)) + geom_bar(stat = 'identity') + ggtitle('NYPD Complaints by Hour, 2006 to 2018') + xlab('Hour') + ylab('Complaints')
## Warning: Removed 1 rows containing missing values (position_stack).

ggplot(df_boro, aes(x='', y=perc, fill=boro)) + geom_bar(stat='identity') + coord_polar("y", start=0) +
ggtitle('Complaints by Borough, 2006 to 2018') + theme(legend.title=element_blank()) + scale_colour_hue()

myLocation <- c(lon = -73.8993, lat = 40.7223)
myMap <-
get_map(
location = myLocation,
source = "google",
maptype = "terrain",
crop = FALSE,
zoom = 10
)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.7223,-73.8993&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx%20
ggmap(myMap, legend = "topright") +
geom_point(
aes(x = long, y = lat),
df,
alpha = .01,
color = "darkred",
size = 1
) +
labs(x = 'Longitude', y = 'Latitude') +
ggtitle('NYPD Complaints 2006 to 2018')
## Warning: Removed 1371 rows containing missing values (geom_point).

Times-series data, Trend and Seasonality
# monthly
df_month <- df %>% group_by(month_year) %>% summarise(count = n())
# time series
ts <- ts(df_month$count,
frequency = 12,
start = c(2006, 1))
autoplot(ts)

# linear model
lm <- lm(count ~ month_year, df_month)
summary(lm)
##
## Call:
## lm(formula = count ~ month_year, data = df_month)
##
## Residuals:
## Min 1Q Median 3Q Max
## -756.28 -138.32 35.17 171.37 444.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78989.695 10006.043 7.894 5.10e-13 ***
## month_year -37.658 4.972 -7.574 3.13e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 233 on 154 degrees of freedom
## Multiple R-squared: 0.2714, Adjusted R-squared: 0.2667
## F-statistic: 57.36 on 1 and 154 DF, p-value: 3.128e-12
# arima model and forecast
fit <- auto.arima(ts)
fit %>% forecast(h=12) %>% autoplot()

summary(fit)
## Series: ts
## ARIMA(0,0,1)(0,1,1)[12] with drift
##
## Coefficients:
## ma1 sma1 drift
## 0.1603 -0.8251 -3.2375
## s.e. 0.0772 0.0818 0.2865
##
## sigma^2 estimated as 14653: log likelihood=-900.28
## AIC=1808.57 AICc=1808.85 BIC=1820.44
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1.998078 115.0826 85.15202 -0.0486044 2.761761 0.6528185
## ACF1
## Training set 0.01813275
checkresiduals(fit)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,1)[12] with drift
## Q* = 38.343, df = 21, p-value = 0.01174
##
## Model df: 3. Total lags used: 24