Research goal

We want to estimate the level of criminal violence in Chicago using open data that is far too large for simple manual calculations.

Data

We use dataset “Crimes - 2001 to present” from data.gov. This dataset reflects reported incidents of crime that occurred in the City of Chicago from 2001 to present. Data is extracted from the Chicago Police Department’s CLEAR (Citizen Law Enforcement Analysis and Reporting) system.

For more about the Chicago crime data, see https://catalog.data.gov/dataset/crimes-2001-to-present-398a4. For the previous parts of this research, see https://rpubs.com/alex-lev.

Preparing data

We omit reading the data file and converting character strings to numbers, since this is a time-consuming routine step; the code is kept below, commented out (the separate and filter calls require the tidyr and dplyr packages).

#chicago_crime<-readRDS(file = "chicago_crime.rds") 
#it takes about two minutes to load the compressed file into memory
#chicago_crime <- na.omit(chicago_crime)#drop rows containing NA
#dd_hms <- separate(chicago_crime,"Day_time",c("MDY","HMS"),sep = " ")
#split Day_time into date (MDY) and time (HMS) parts
#dd_hms <- separate(dd_hms,"MDY",c("Y","M","D"),convert=T)
#separate Year - Y, Month - M, Day - D as numbers
#d_hom <- filter(dd_hms,dd_hms$`Primary Type`=="HOMICIDE")
#keep only records with Primary Type == "HOMICIDE"
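For completeness, here is a minimal sketch of the omitted step. It assumes the raw CSV export from the data portal (the file name below is hypothetical) with a Date column formatted like "09/05/2015 01:30:00 PM":

#read the raw export and build the Day_time string used above (sketch)
chicago_crime <- read.csv("Crimes_-_2001_to_present.csv", stringsAsFactors = FALSE)
chicago_crime$Day_time <- format(
  as.POSIXct(chicago_crime$Date, format = "%m/%d/%Y %I:%M:%S %p"),
  "%Y-%m-%d %H:%M:%S") #"YYYY-MM-DD HH:MM:SS", ready for separate()
saveRDS(chicago_crime, file = "chicago_crime.rds") #compressed cache for later runs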

Aggregating and arranging data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load(file = "d_hom.RData")
d_hom_ym <- filter(d_hom,Y!=2017) %>% group_by(Y) %>% count(M) %>% arrange(Y,M)
#homicide counts for 2001 - 2016 by year and month
d_hom_ym
## Source: local data frame [192 x 3]
## Groups: Y [?]
## 
##        Y     M     n
##    <int> <int> <int>
## 1   2001     1    28
## 2   2001     2    20
## 3   2001     3    31
## 4   2001     4    45
## 5   2001     5    31
## 6   2001     6    56
## 7   2001     7    60
## 8   2001     8    48
## 9   2001     9    40
## 10  2001    10    38
## # ... with 182 more rows
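As a quick cross-check (a sketch using the same objects; output not shown), annual totals can be obtained by counting over years alone:

#annual homicide totals for 2001 - 2016
d_hom %>% filter(Y != 2017) %>% count(Y)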

Violence time series

We use the forecast package to display the violence rate (the number of homicides per month) and its variation by year and month as a time series.

library(forecast)
library(ggplot2)
#preparing the time series object for analysis
#note: end = 2016 means c(2016, 1), so the series runs Jan 2001 - Jan 2016 (181 points)
d_hom_ym_ts <- ts(d_hom_ym,start = 2001,end = 2016,frequency = 12)
d_hom_ym_ts[1:12,]
##          Y  M  n
##  [1,] 2001  1 28
##  [2,] 2001  2 20
##  [3,] 2001  3 31
##  [4,] 2001  4 45
##  [5,] 2001  5 31
##  [6,] 2001  6 56
##  [7,] 2001  7 60
##  [8,] 2001  8 48
##  [9,] 2001  9 40
## [10,] 2001 10 38
## [11,] 2001 11 33
## [12,] 2001 12 30
seasonplot(d_hom_ym_ts[,3],s = 12,col=rainbow(16), year.labels=TRUE,main = "Homicide rate by year and month")

ggseasonplot(d_hom_ym_ts[,3], polar=TRUE)

autoplot.ts(d_hom_ym_ts[,3])

ggAcf(d_hom_ym_ts[,3])#autocorrelation 

d_hom_ym_ts[,3] %>% decompose %>% autoplot
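A robust alternative to the classical decomposition above (a sketch, not used for the results below) is STL, which is less sensitive to outlying months:

#seasonal-trend decomposition by loess, robust to outliers
d_hom_ym_ts[,3] %>% stl(s.window = "periodic", robust = TRUE) %>% autoplot()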

Forecasting time series

As a framework, we follow “Forecasting time series using R” by Professor Rob J Hyndman, Monash University.

Mean method

Forecasts of all future values are equal to the mean of the historical data \((y_1, \dots, y_n)\):

\(\hat{y}_{n+h|n}=(y_1+\dots+y_n)/n\)

fit.m <- meanf(d_hom_ym_ts[,3],h = 24,level = 0.95,fan = T)
fit.m$model
## $mu
## [1] 39.14917
## 
## $mu.se
## [1] 0.8852688
## 
## $sd
## [1] 11.91007
## 
## $call
## meanf(y = d_hom_ym_ts[, 3], h = 24, level = 0.95, fan = T)
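As a sanity check on the formula (a minimal sketch using the objects above), every point forecast should equal the sample mean:

#the mean-method point forecasts all equal the sample mean
all.equal(as.numeric(fit.m$mean), rep(mean(d_hom_ym_ts[,3]), 24)) #expect TRUE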
checkresiduals(fit.m)

## 
##  Ljung-Box test
## 
## data:  residuals
## Q* = 427.06, df = 23, p-value < 2.2e-16
## 
## Model df: 1.   Total lags used: 24
autoplot(fit.m)

Naive method

Forecasts are equal to the last observed value: \(\hat{y}_{n+h|n}=y_n\).

fit.n <- naive(d_hom_ym_ts[,3],h = 24,level = 0.95,fan = T)
checkresiduals(fit.n)

## 
##  Ljung-Box test
## 
## data:  residuals
## Q* = 61.214, df = 24, p-value = 4.296e-05
## 
## Model df: 0.   Total lags used: 24
with(fit.n,plot(fitted,residuals))

autoplot(fit.n)

Seasonal naive method

Forecasts are equal to the last observed value from the same season: \(\hat{y}_{n+h|n}=y_{n+h-km}\), where \(m\) is the seasonal period and \(k=\lfloor(h-1)/m\rfloor+1\) is the number of complete seasonal cycles between \(n\) and \(n+h\).

fit.sn <- snaive(d_hom_ym_ts[,3],h = 24,level = 0.95,fan = T)
checkresiduals(fit.sn)

## 
##  Ljung-Box test
## 
## data:  residuals
## Q* = 135.1, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24
with(fit.sn,plot(fitted,residuals))

autoplot(fit.sn)
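A quick check of the formula (sketch): with \(h=24\) and \(m=12\), the forecasts should be the last observed year of data repeated twice:

#seasonal-naive forecasts repeat the last seasonal cycle
last_cycle <- tail(as.numeric(d_hom_ym_ts[,3]), 12)
all.equal(as.numeric(fit.sn$mean), rep(last_cycle, 2)) #expect TRUE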

Drift method

Forecasts are equal to the last value plus the average change: \(\hat{y}_{n+h|n}=y_{n}+h(y_n-y_1)/(n-1)\). This is equivalent to extrapolating the line drawn through the first and last observations.

fit.drift <- rwf(d_hom_ym_ts[,3],h = 24,drift = T,level = 0.95,fan = T)
checkresiduals(fit.drift)

## 
##  Ljung-Box test
## 
## data:  residuals
## Q* = 61.214, df = 23, p-value = 2.546e-05
## 
## Model df: 1.   Total lags used: 24
with(fit.drift,plot(fitted,residuals))

autoplot(fit.drift)
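Checking the formula directly (sketch): the forecasts should lie on the line through the first and last observations:

#drift forecasts extrapolate the first-to-last line
y <- as.numeric(d_hom_ym_ts[,3]); n <- length(y)
all.equal(as.numeric(fit.drift$mean), y[n] + (1:24)*(y[n]-y[1])/(n-1)) #expect TRUE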

Holt’s method

Holt’s linear trend method extends simple exponential smoothing with a trend component: \(\hat{y}_{n+h|n}=\ell_n+hb_n\), where \(\ell_n\) and \(b_n\) are the estimated level and trend at time \(n\).

fit.holt <- holt(d_hom_ym_ts[,3],h=24)
fit.holt$model
## Holt's method 
## 
## Call:
##  holt(y = d_hom_ym_ts[, 3], h = 24) 
## 
##   Smoothing parameters:
##     alpha = 0.6948 
##     beta  = 1e-04 
## 
##   Initial states:
##     l = 28.5919 
##     b = 0.122 
## 
##   sigma:  10.8863
## 
##      AIC     AICc      BIC 
## 1815.204 1815.547 1831.196
checkresiduals(fit.holt)

## 
##  Ljung-Box test
## 
## data:  residuals
## Q* = 133.27, df = 20, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 24
with(fit.holt,plot(fitted,residuals))

autoplot(fit.holt)

Holt-Winters method

The Holt-Winters method adds a seasonal component to Holt’s method; here we use the HoltWinters function from the base stats package.

fit.hw <- HoltWinters(d_hom_ym_ts[,3])
checkresiduals(fit.hw)

plot(fit.hw)
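The fitted Holt-Winters model can also be projected forward (a sketch; this model is not included in the accuracy comparison below):

#forecast() has a method for HoltWinters objects
fc.hw <- forecast(fit.hw, h = 24)
autoplot(fc.hw)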

Simple exponential smoothing

Forecasts are weighted averages of past observations, with the weights decaying exponentially: \(\hat{y}_{n+1|n}=\alpha y_n+\alpha(1-\alpha)y_{n-1}+\alpha(1-\alpha)^2y_{n-2}+\dots\)

fit.ses <- ses(d_hom_ym_ts[,3], h=24)
fit.ses$model
## Simple exponential smoothing 
## 
## Call:
##  ses(y = d_hom_ym_ts[, 3], h = 24) 
## 
##   Smoothing parameters:
##     alpha = 0.6931 
## 
##   Initial states:
##     l = 26.9314 
## 
##   sigma:  10.8856
## 
##      AIC     AICc      BIC 
## 1811.183 1811.318 1820.778
checkresiduals(fit.ses)

## 
##  Ljung-Box test
## 
## data:  residuals
## Q* = 133.11, df = 22, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 24
with(fit.ses,plot(fitted,residuals))

autoplot(fit.ses)

Exponential smoothing state space model (ETS)

The ets function automatically selects the best exponential smoothing state space model for the series (by the AICc criterion by default).

fit.ets <- ets(d_hom_ym_ts[,3])
plot(fit.ets)

fc <- d_hom_ym_ts[,3] %>% ets %>% forecast(h=20)  
autoplot(d_hom_ym_ts[,3], series="Data") + 
  autolayer(fc, series="Forecast") + 
  autolayer(fitted(fc), series="Fitted")
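To see which ETS(error, trend, season) variant was selected (a sketch; output not shown):

#print the selected specification, parameters and accuracy measures
summary(fit.ets)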

ARIMA

The auto.arima function selects the ARIMA orders automatically. The coefficient names below (ar1, ar2, sar1, sar2, intercept) indicate a seasonal autoregressive model with a non-zero mean.

fit.arima <- auto.arima(d_hom_ym_ts[,3])
fit.arima$coef
##        ar1        ar2       sar1       sar2  intercept 
##  0.2883811  0.2566103  0.2282973  0.3598988 39.3167887
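The exact orders behind those coefficient names can be confirmed with arimaorder (a sketch; output not shown):

#extract (p,d,q)(P,D,Q)[m] from the fitted model
arimaorder(fit.arima)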
with(fit.arima,plot(fitted,residuals))

checkresiduals(fit.arima)

## 
##  Ljung-Box test
## 
## data:  residuals
## Q* = 24.655, df = 19, p-value = 0.1722
## 
## Model df: 5.   Total lags used: 24
plot(forecast(fit.arima,h = 24))

Autoregression by OLS

The ar.ols function fits an autoregressive model of order 2, \(y_t=c+\phi_1y_{t-1}+\phi_2y_{t-2}+\varepsilon_t\), by ordinary least squares.

regr <- ar.ols(d_hom_ym_ts[,3], order=2, demean=F, intercept=TRUE)
checkresiduals(regr)

fore <- predict(regr, n.ahead=24)
ts.plot(d_hom_ym_ts[,3], fore$pred, col=1:2, xlim=c(2001,2018),
          ylab="Homicide rate")
lines(fore$pred, type="p", col=2)
lines(fore$pred+fore$se, lty="dashed", col=4)
lines(fore$pred-fore$se, lty="dashed", col=4)

Linear model

The tslm function fits a linear regression with a linear trend and monthly seasonal dummy variables.

fit.lm <- tslm(d_hom_ym_ts[,3] ~ trend + season)
fit.lm$coefficients
## (Intercept)       trend     season2     season3     season4     season5 
## 32.47400097 -0.02443957 -7.83886452  2.58557505  6.87668129 10.90112086 
##     season6     season7     season8     season9    season10    season11 
## 15.52556043 23.81666667 19.17443957 15.59887914 10.02331871  5.91442495 
##    season12 
##  4.80553119
plot(fitted(fit.lm),residuals(fit.lm))

autoplot(forecast(fit.lm,h = 24))

Cubic spline method

The splinef function returns local linear forecasts and prediction intervals using cubic smoothing splines.

fit.spl <- splinef(y = d_hom_ym_ts[,3],h = 24)
with(fit.spl,plot(fitted,residuals))

checkresiduals(fit.spl)

autoplot(fit.spl)

Comparing models by accuracy

Let \(y_t\) denote the observation at time \(t\) and \(f_t\) denote its forecast, where \(t = 1,\dots,n\). Then the following measures are useful:

\(MAE=n^{-1}\sum_{t=1}^n{|y_t-f_t|}\) - Mean Absolute Error (see https://en.wikipedia.org/wiki/Mean_absolute_error)

\(MSE=n^{-1}\sum_{t=1}^n{(y_t-f_t)^2}\) - Mean Squared Error (see https://en.wikipedia.org/wiki/Mean_squared_error)

\(RMSE=\sqrt{n^{-1}\sum_{t=1}^n{(y_t-f_t)^2}}\) - Root Mean Square Error (see https://en.wikipedia.org/wiki/Root-mean-square_deviation)

\(MAPE=100n^{-1}\sum_{t=1}^n{|y_t-f_t|/|y_t|}\) - Mean Absolute Percentage Error (see https://en.wikipedia.org/wiki/Mean_absolute_percentage_error)
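These measures can also be reproduced by hand from a model's training residuals; a minimal sketch for the mean model (output not shown):

#hand computation of MAE and RMSE for the mean method
e <- residuals(fit.m) #training residuals y_t - f_t
c(MAE = mean(abs(e)), RMSE = sqrt(mean(e^2)))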

acc.m <- accuracy(fit.m)
acc.n <- accuracy(fit.n)
acc.sn <- accuracy(fit.sn)
acc.drift <- accuracy(fit.drift)
acc.holt <- accuracy(fit.holt)
acc.ses <- accuracy(fit.ses)
acc.arima <- accuracy(fit.arima)
acc.spl <- accuracy(fit.spl)

acc.tab <- rbind.data.frame(acc.m,
                            acc.n,acc.sn,
                            acc.drift,acc.holt,
                            acc.ses,acc.arima,acc.spl)

acc.tab$method <- c("mean","naive","snaive",
                 "drift","holt","ses","arima","spline")


We visualize all seven accuracy measures at once with Chernoff faces, which map each measure to a facial feature so that similarly performing models produce similar faces.

library(aplpack)
## Loading required package: tcltk
faces(acc.tab[,1:7],scale = T, main="Model Chernoff faces",labels = acc.tab$method)

## effect of variables:
##  modified item       Var   
##  "height of face   " "ME"  
##  "width of face    " "RMSE"
##  "structure of face" "MAE" 
##  "height of mouth  " "MPE" 
##  "width of mouth   " "MAPE"
##  "smiling          " "MASE"
##  "height of eyes   " "ACF1"
##  "width of eyes    " "ME"  
##  "height of hair   " "RMSE"
##  "width of hair   "  "MAE" 
##  "style of hair   "  "MPE" 
##  "height of nose  "  "MAPE"
##  "width of nose   "  "MASE"
##  "width of ear    "  "ACF1"
##  "height of ear   "  "ME"
barplot(acc.tab$ME,names.arg = acc.tab$method,main = "ME")

barplot(acc.tab$RMSE,names.arg = acc.tab$method,main = "RMSE")

barplot(acc.tab$MAE,names.arg = acc.tab$method,main = "MAE")

barplot(acc.tab$MPE,names.arg = acc.tab$method,main = "MPE")

barplot(acc.tab$MAPE,names.arg = acc.tab$method,main = "MAPE")

barplot(acc.tab$MASE,names.arg = acc.tab$method,main = "MASE")

barplot(acc.tab$ACF1,names.arg = acc.tab$method,main = "ACF1")

acc.tab %>% group_by(method) %>% summarise(SumErr=sum(ME,RMSE,MAE,MPE,MAPE,MASE,ACF1)) %>% arrange(SumErr)
## # A tibble: 8 × 2
##   method   SumErr
##    <chr>    <dbl>
## 1  arima 31.12045
## 2   holt 38.77420
## 3    ses 39.40361
## 4  drift 40.51788
## 5  naive 41.00545
## 6   mean 41.05116
## 7 snaive 45.63184
## 8 spline 46.60634

Predicting homicide rate with ARIMA

fc.ar <- forecast(fit.arima,h = 24)
fc.ar$mean
##           Jan      Feb      Mar      Apr      May      Jun      Jul
## 2016          32.78642 38.96320 40.71821 44.22196 41.86337 43.14746
## 2017 38.83241 30.94402 36.64989 38.11472 42.86311 42.67757 45.12544
## 2018 44.13165                                                      
##           Aug      Sep      Oct      Nov      Dec
## 2016 45.41036 46.96553 36.34095 37.73075 37.23155
## 2017 44.19931 48.51113 35.28719 37.40303 35.48893
## 2018
autoplot(fc.ar$mean)
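For a fuller picture (a sketch), plotting the forecast object itself shows the prediction intervals around the point forecasts:

#history, point forecasts and prediction intervals together
autoplot(fc.ar) + ylab("Homicides per month")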

Conclusions

  1. The Chicago homicide rate is a classical time series with seasonal fluctuations.
  2. The ARIMA model fits best for forecasting future trends, with the minimum sum of error measures across the compared models.