Libraries

library(data.table)
library(dplyr)
library(lubridate)
library(hms)
library(ggplot2)
library(maptools)
library(ggmap)
library(scales)
library(tseries)
library(forecast)
library(zoo)

Load data

df_full <- fread('C:/Users/antho/Documents/projects/citizen/NYPD_Complaint_Data_Historic.csv')

Sample and Format data

set.seed(123)
df <- df_full[sample(nrow(df_full), size = 500000, replace = FALSE), ]

glimpse(df)
## Observations: 500,000
## Variables: 35
## $ CMPLNT_NUM        <int> 529743672, 613252886, 294985478, 728497476, ...
## $ CMPLNT_FR_DT      <chr> "09/13/2014", "01/25/2012", "02/04/2010", "0...
## $ CMPLNT_FR_TM      <chr> "22:30:00", "21:50:00", "22:15:00", "11:50:0...
## $ CMPLNT_TO_DT      <chr> "09/14/2014", "01/25/2012", "", "03/13/2012"...
## $ CMPLNT_TO_TM      <chr> "13:00:00", "22:37:00", "", "12:05:00", "12:...
## $ ADDR_PCT_CD       <int> 68, 45, 46, 40, 72, 101, 41, 43, 79, 20, 83,...
## $ RPT_DT            <chr> "09/14/2014", "01/25/2012", "02/04/2010", "0...
## $ KY_CD             <int> 109, 105, 235, 351, 341, 361, 351, 109, 344,...
## $ OFNS_DESC         <chr> "GRAND LARCENY", "ROBBERY", "DANGEROUS DRUGS...
## $ PD_CD             <int> 457, 397, 567, 259, 321, 639, 259, 413, 101,...
## $ PD_DESC           <chr> "LARCENY,GRAND OF VEHICULAR/MOTORCYCLE ACCES...
## $ CRM_ATPT_CPTD_CD  <chr> "COMPLETED", "COMPLETED", "COMPLETED", "ATTE...
## $ LAW_CAT_CD        <chr> "FELONY", "FELONY", "MISDEMEANOR", "MISDEMEA...
## $ BORO_NM           <chr> "BROOKLYN", "BRONX", "BRONX", "BRONX", "BROO...
## $ LOC_OF_OCCUR_DESC <chr> "", "", "", "", "INSIDE", "INSIDE", "INSIDE"...
## $ PREM_TYP_DESC     <chr> "STREET", "STREET", "STREET", "OTHER", "BEAU...
## $ JURIS_DESC        <chr> "N.Y. POLICE DEPT", "N.Y. POLICE DEPT", "N.Y...
## $ JURISDICTION_CODE <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PARKS_NM          <chr> "", NA, NA, NA, NA, NA, NA, NA, NA, "", "", ...
## $ HADEVELOPT        <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ HOUSING_PSA       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "", NA, ...
## $ X_COORD_CD        <int> 978011, 1027392, 1008817, 1008278, 982106, 1...
## $ Y_COORD_CD        <int> 171336, 246172, 250725, 237309, 172074, 1567...
## $ SUSP_AGE_GROUP    <chr> "", "", "", "", "", "", "45-64", "", "", "",...
## $ SUSP_RACE         <chr> "", "", "", "", "WHITE HISPANIC", "BLACK", "...
## $ SUSP_SEX          <chr> "", "", "", "", "M", "F", "M", "", "", "", "...
## $ TRANSIT_DISTRICT  <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Latitude          <dbl> 40.63696, 40.84226, 40.85483, 40.81801, 40.6...
## $ Longitude         <dbl> -74.02248, -73.84408, -73.91120, -73.91319, ...
## $ Lat_Lon           <chr> "(40.636960333, -74.022480845)", "(40.842263...
## $ PATROL_BORO       <chr> "PATROL BORO BKLYN SOUTH", "PATROL BORO BRON...
## $ STATION_NAME      <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ VIC_AGE_GROUP     <chr> "45-64", "18-24", "", "", "25-44", "25-44", ...
## $ VIC_RACE          <chr> "WHITE", "WHITE", "UNKNOWN", "UNKNOWN", "ASI...
## $ VIC_SEX           <chr> "F", "M", "E", "E", "F", "F", "D", "M", "F",...
names(df) <- c('complaint', 'from_date', 'from_time', 'to_date', 'to_time', 'precinct', 'report_date',
                      'off_code', 'off_desc', 'class_code', 'class_desc', 'outcome', 'level', 'boro', 'location',
                      'premise', 'juris_desc', 'juris_code', 'park', 'house_dev', 'house_psa', 'x_coord', 'y_coord',
                      'susp_age', 'susp_race', 'sus_sex', 'transit', 'lat', 'long', 'lat_lon', 'patrol_boro', 'station',
                      'vic_age', 'vic_race', 'vic_sex')

# dates
df$from_date <- mdy(df$from_date)
df$to_date <- mdy(df$to_date)
df$report_date <- mdy(df$report_date)

# time
df$from_time <- as_hms(df$from_time)
## Warning: Lossy cast from <character> to <hms> at position(s) 18936, 303464,
## 350933, 359108, 398934
df$to_time <- as_hms(df$to_time)
## Warning: Lossy cast from <character> to <hms> at position(s) 3, 13, 22, 52,
## 53, ... (and 122967 more)
df$from_hour <- hour(df$from_time)
df$year <- year(df$report_date)
df$month_year <- as.yearmon(df$report_date)

Explore data

str(df)
## Classes 'data.table' and 'data.frame':   500000 obs. of  38 variables:
##  $ complaint  : int  529743672 613252886 294985478 728497476 913842706 107262943 394025820 503121618 472147001 621315823 ...
##  $ from_date  : Date, format: "2014-09-13" "2012-01-25" ...
##  $ from_time  : 'hms' num  22:30:00 21:50:00 22:15:00 11:50:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ to_date    : Date, format: "2014-09-14" "2012-01-25" ...
##  $ to_time    : 'hms' num  13:00:00 22:37:00 NA 12:05:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ precinct   : int  68 45 46 40 72 101 41 43 79 20 ...
##  $ report_date: Date, format: "2014-09-14" "2012-01-25" ...
##  $ off_code   : int  109 105 235 351 341 361 351 109 344 233 ...
##  $ off_desc   : chr  "GRAND LARCENY" "ROBBERY" "DANGEROUS DRUGS" "CRIMINAL MISCHIEF & RELATED OF" ...
##  $ class_code : int  457 397 567 259 321 639 259 413 101 175 ...
##  $ class_desc : chr  "LARCENY,GRAND OF VEHICULAR/MOTORCYCLE ACCESSORIES" "ROBBERY,OPEN AREA UNCLASSIFIED" "MARIJUANA, POSSESSION 4 & 5" "CRIMINAL MISCHIEF,UNCLASSIFIED 4" ...
##  $ outcome    : chr  "COMPLETED" "COMPLETED" "COMPLETED" "ATTEMPTED" ...
##  $ level      : chr  "FELONY" "FELONY" "MISDEMEANOR" "MISDEMEANOR" ...
##  $ boro       : chr  "BROOKLYN" "BRONX" "BRONX" "BRONX" ...
##  $ location   : chr  "" "" "" "" ...
##  $ premise    : chr  "STREET" "STREET" "STREET" "OTHER" ...
##  $ juris_desc : chr  "N.Y. POLICE DEPT" "N.Y. POLICE DEPT" "N.Y. POLICE DEPT" "N.Y. POLICE DEPT" ...
##  $ juris_code : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ park       : chr  "" NA NA NA ...
##  $ house_dev  : chr  "" "" "" "" ...
##  $ house_psa  : chr  NA NA NA NA ...
##  $ x_coord    : int  978011 1027392 1008817 1008278 982106 1046025 1014179 1024197 1001420 991168 ...
##  $ y_coord    : int  171336 246172 250725 237309 172074 156791 239453 234000 187356 224929 ...
##  $ susp_age   : chr  "" "" "" "" ...
##  $ susp_race  : chr  "" "" "" "" ...
##  $ sus_sex    : chr  "" "" "" "" ...
##  $ transit    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ lat        : num  40.6 40.8 40.9 40.8 40.6 ...
##  $ long       : num  -74 -73.8 -73.9 -73.9 -74 ...
##  $ lat_lon    : chr  "(40.636960333, -74.022480845)" "(40.84226332, -73.84407906)" "(40.854831167, -73.911195468)" "(40.818009625, -73.913191928)" ...
##  $ patrol_boro: chr  "PATROL BORO BKLYN SOUTH" "PATROL BORO BRONX" "PATROL BORO BRONX" "PATROL BORO BRONX" ...
##  $ station    : chr  "" "" "" "" ...
##  $ vic_age    : chr  "45-64" "18-24" "" "" ...
##  $ vic_race   : chr  "WHITE" "WHITE" "UNKNOWN" "UNKNOWN" ...
##  $ vic_sex    : chr  "F" "M" "E" "E" ...
##  $ from_hour  : int  22 21 22 11 12 13 12 18 2 3 ...
##  $ year       : num  2014 2012 2010 2012 2013 ...
##  $ month_year : 'yearmon' num  Sep 2014 Jan 2012 Feb 2010 Mar 2012 ...
##  - attr(*, ".internal.selfref")=<externalptr>
summary(df)
##    complaint           from_date           from_time       
##  Min.   :100003254   Min.   :1017-03-18   Length:500000    
##  1st Qu.:325043348   1st Qu.:2009-01-11   Class1:hms       
##  Median :550582272   Median :2012-03-25   Class2:difftime  
##  Mean   :550171924   Mean   :2012-04-10   Mode  :numeric   
##  3rd Qu.:775115150   3rd Qu.:2015-07-15                    
##  Max.   :999999904   Max.   :2018-12-31                    
##                      NA's   :52                            
##     to_date             to_time            precinct     
##  Min.   :1972-01-02   Length:500000     Min.   :  1.00  
##  1st Qu.:2009-06-06   Class1:hms        1st Qu.: 40.00  
##  Median :2012-09-23   Class2:difftime   Median : 63.00  
##  Mean   :2012-08-24   Mode  :numeric    Mean   : 63.43  
##  3rd Qu.:2015-11-24                     3rd Qu.: 94.00  
##  Max.   :2018-12-31                     Max.   :123.00  
##  NA's   :123342                         NA's   :165     
##   report_date            off_code       off_desc           class_code   
##  Min.   :2006-01-01   Min.   :101.0   Length:500000      Min.   :101.0  
##  1st Qu.:2009-01-23   1st Qu.:117.0   Class :character   1st Qu.:254.0  
##  Median :2012-04-05   Median :341.0   Mode  :character   Median :388.0  
##  Mean   :2012-05-01   Mean   :295.2                      Mean   :416.2  
##  3rd Qu.:2015-07-25   3rd Qu.:351.0                      3rd Qu.:637.0  
##  Max.   :2018-12-31   Max.   :881.0                      Max.   :969.0  
##                                                          NA's   :442    
##   class_desc          outcome             level          
##  Length:500000      Length:500000      Length:500000     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      boro             location           premise         
##  Length:500000      Length:500000      Length:500000     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   juris_desc          juris_code          park          
##  Length:500000      Min.   : 0.0000   Length:500000     
##  Class :character   1st Qu.: 0.0000   Class :character  
##  Mode  :character   Median : 0.0000   Mode  :character  
##                     Mean   : 0.7196                     
##                     3rd Qu.: 0.0000                     
##                     Max.   :97.0000                     
##                     NA's   :442                         
##   house_dev          house_psa            x_coord           y_coord       
##  Length:500000      Length:500000      Min.   : 152021   Min.   : 121131  
##  Class :character   Class :character   1st Qu.: 991686   1st Qu.: 184272  
##  Mode  :character   Mode  :character   Median :1004448   Median : 205840  
##                                        Mean   :1004789   Mean   : 206961  
##                                        3rd Qu.:1016634   3rd Qu.: 235172  
##                                        Max.   :1067249   Max.   :2261851  
##                                        NA's   :1362      NA's   :1362     
##    susp_age          susp_race           sus_sex             transit      
##  Length:500000      Length:500000      Length:500000      Min.   : 1.0    
##  Class :character   Class :character   Class :character   1st Qu.: 3.0    
##  Mode  :character   Mode  :character   Mode  :character   Median :11.0    
##                                                           Mean   :13.4    
##                                                           3rd Qu.:30.0    
##                                                           Max.   :34.0    
##                                                           NA's   :488961  
##       lat             long          lat_lon          patrol_boro       
##  Min.   :40.50   Min.   :-77.00   Length:500000      Length:500000     
##  1st Qu.:40.67   1st Qu.:-73.97   Class :character   Class :character  
##  Median :40.73   Median :-73.93   Mode  :character   Mode  :character  
##  Mean   :40.73   Mean   :-73.93                                        
##  3rd Qu.:40.81   3rd Qu.:-73.88                                        
##  Max.   :46.36   Max.   :-73.70                                        
##  NA's   :1362    NA's   :1362                                          
##    station            vic_age            vic_race        
##  Length:500000      Length:500000      Length:500000     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    vic_sex            from_hour          year        month_year  
##  Length:500000      Min.   : 0.00   Min.   :2006   Min.   :2006  
##  Class :character   1st Qu.: 9.00   1st Qu.:2009   1st Qu.:2009  
##  Mode  :character   Median :14.00   Median :2012   Median :2012  
##                     Mean   :13.24   Mean   :2012   Mean   :2012  
##                     3rd Qu.:19.00   3rd Qu.:2015   3rd Qu.:2016  
##                     Max.   :23.00   Max.   :2018   Max.   :2019  
##                     NA's   :5
glimpse(df)
## Observations: 500,000
## Variables: 38
## $ complaint   <int> 529743672, 613252886, 294985478, 728497476, 913842...
## $ from_date   <date> 2014-09-13, 2012-01-25, 2010-02-04, 2012-03-13, 2...
## $ from_time   <time> 22:30:00, 21:50:00, 22:15:00, 11:50:00, 12:00:00,...
## $ to_date     <date> 2014-09-14, 2012-01-25, NA, 2012-03-13, 2013-03-0...
## $ to_time     <time> 13:00:00, 22:37:00,       NA, 12:05:00, 12:45:00,...
## $ precinct    <int> 68, 45, 46, 40, 72, 101, 41, 43, 79, 20, 83, 43, 1...
## $ report_date <date> 2014-09-14, 2012-01-25, 2010-02-04, 2012-03-13, 2...
## $ off_code    <int> 109, 105, 235, 351, 341, 361, 351, 109, 344, 233, ...
## $ off_desc    <chr> "GRAND LARCENY", "ROBBERY", "DANGEROUS DRUGS", "CR...
## $ class_code  <int> 457, 397, 567, 259, 321, 639, 259, 413, 101, 175, ...
## $ class_desc  <chr> "LARCENY,GRAND OF VEHICULAR/MOTORCYCLE ACCESSORIES...
## $ outcome     <chr> "COMPLETED", "COMPLETED", "COMPLETED", "ATTEMPTED"...
## $ level       <chr> "FELONY", "FELONY", "MISDEMEANOR", "MISDEMEANOR", ...
## $ boro        <chr> "BROOKLYN", "BRONX", "BRONX", "BRONX", "BROOKLYN",...
## $ location    <chr> "", "", "", "", "INSIDE", "INSIDE", "INSIDE", "FRO...
## $ premise     <chr> "STREET", "STREET", "STREET", "OTHER", "BEAUTY & N...
## $ juris_desc  <chr> "N.Y. POLICE DEPT", "N.Y. POLICE DEPT", "N.Y. POLI...
## $ juris_code  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,...
## $ park        <chr> "", NA, NA, NA, NA, NA, NA, NA, NA, "", "", NA, NA...
## $ house_dev   <chr> "", "", "", "", "", "", "", "", "", "", "", "", ""...
## $ house_psa   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "", NA, NA, NA...
## $ x_coord     <int> 978011, 1027392, 1008817, 1008278, 982106, 1046025...
## $ y_coord     <int> 171336, 246172, 250725, 237309, 172074, 156791, 23...
## $ susp_age    <chr> "", "", "", "", "", "", "45-64", "", "", "", "UNKN...
## $ susp_race   <chr> "", "", "", "", "WHITE HISPANIC", "BLACK", "BLACK"...
## $ sus_sex     <chr> "", "", "", "", "M", "F", "M", "", "", "", "U", "M...
## $ transit     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ lat         <dbl> 40.63696, 40.84226, 40.85483, 40.81801, 40.63899, ...
## $ long        <dbl> -74.02248, -73.84408, -73.91120, -73.91319, -74.00...
## $ lat_lon     <chr> "(40.636960333, -74.022480845)", "(40.84226332, -7...
## $ patrol_boro <chr> "PATROL BORO BKLYN SOUTH", "PATROL BORO BRONX", "P...
## $ station     <chr> "", "", "", "", "", "", "", "", "", "", "", "", ""...
## $ vic_age     <chr> "45-64", "18-24", "", "", "25-44", "25-44", "", "4...
## $ vic_race    <chr> "WHITE", "WHITE", "UNKNOWN", "UNKNOWN", "ASIAN / P...
## $ vic_sex     <chr> "F", "M", "E", "E", "F", "F", "D", "M", "F", "M", ...
## $ from_hour   <int> 22, 21, 22, 11, 12, 13, 12, 18, 2, 3, 11, 19, 17, ...
## $ year        <dbl> 2014, 2012, 2010, 2012, 2013, 2007, 2011, 2006, 20...
## $ month_year  <yearmon> Sep 2014, Jan 2012, Feb 2010, Mar 2012, Mar 20...

Analysis

# summary
df_date <- df %>% group_by(report_date) %>% summarise(count = n_distinct(complaint))
df_time <- df %>% group_by(from_hour) %>% summarise(count = n_distinct(complaint)) %>% mutate(perc = percent(count/sum(count)))
df_boro <- df %>% filter(boro != "") %>% group_by(boro) %>% summarise(count = n_distinct(complaint)) %>% mutate(perc = count/sum(count))

ggplot(df_date, aes(report_date, count)) + geom_line() + xlab('Year') + ylab('Complaints') + ggtitle('NYPD Complaints, 2006 to 2018')

ggplot(df_time, aes(x = from_hour, y = count)) + geom_bar(stat = 'identity') + ggtitle('NYPD Complaints by Hour, 2006 to 2018') + xlab('Hour') + ylab('Complaints')
## Warning: Removed 1 rows containing missing values (position_stack).

ggplot(df_boro, aes(x='', y=perc, fill=boro)) + geom_bar(stat='identity') + coord_polar("y", start=0) +
  ggtitle('Complaints by Borough, 2006 to 2018') + theme(legend.title=element_blank()) + scale_colour_hue()

myLocation <- c(lon = -73.8993, lat = 40.7223)
myMap <-
  get_map(
    location = myLocation,
    source = "google",
    maptype = "terrain",
    crop = FALSE,
    zoom = 10
  )
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.7223,-73.8993&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx%20
ggmap(myMap, legend = "topright") +
  geom_point(
    aes(x = long, y = lat),
    df,
    alpha = .01,
    color = "darkred",
    size = 1
  )  +
  labs(x = 'Longitude', y = 'Latitude') +
  ggtitle('NYPD Complaints 2006 to 2018')
## Warning: Removed 1371 rows containing missing values (geom_point).

Times-series data, Trend and Seasonality

# monthly
df_month <- df %>% group_by(month_year) %>% summarise(count = n())

# time series
ts <- ts(df_month$count,
         frequency = 12,
         start = c(2006, 1))
         
autoplot(ts)

# linear model
lm <- lm(count ~ month_year, df_month)
summary(lm)
## 
## Call:
## lm(formula = count ~ month_year, data = df_month)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -756.28 -138.32   35.17  171.37  444.57 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 78989.695  10006.043   7.894 5.10e-13 ***
## month_year    -37.658      4.972  -7.574 3.13e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 233 on 154 degrees of freedom
## Multiple R-squared:  0.2714, Adjusted R-squared:  0.2667 
## F-statistic: 57.36 on 1 and 154 DF,  p-value: 3.128e-12
# arima model and forecast
fit <- auto.arima(ts)
fit %>% forecast(h=12) %>% autoplot()

summary(fit)
## Series: ts 
## ARIMA(0,0,1)(0,1,1)[12] with drift 
## 
## Coefficients:
##          ma1     sma1    drift
##       0.1603  -0.8251  -3.2375
## s.e.  0.0772   0.0818   0.2865
## 
## sigma^2 estimated as 14653:  log likelihood=-900.28
## AIC=1808.57   AICc=1808.85   BIC=1820.44
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 1.998078 115.0826 85.15202 -0.0486044 2.761761 0.6528185
##                    ACF1
## Training set 0.01813275
checkresiduals(fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(0,1,1)[12] with drift
## Q* = 38.343, df = 21, p-value = 0.01174
## 
## Model df: 3.   Total lags used: 24