We want to estimate the level of criminal violence in Chicago using an open dataset that is far too large for manual calculation.
We use the dataset “Crimes - 2001 to present” from data.gov. It reflects reported incidents of crime that occurred in the City of Chicago from 2001 to the present, extracted from the Chicago Police Department’s CLEAR (Citizen Law Enforcement Analysis and Reporting) system.
For more about the Chicago crime data see https://catalog.data.gov/dataset/crimes-2001-to-present-398a4. For the previous parts of this research see https://rpubs.com/alex-lev.
We skip re-running the data loading and type-conversion steps here, since they are routine and time consuming; the code is kept below as comments.
#chicago_crime <- readRDS(file = "chicago_crime.rds")
#loading the compressed file into memory takes about two minutes
#chicago_crime <- na.omit(chicago_crime) #drop rows with NA
#dd_hms <- separate(chicago_crime,"Day_time",c("MDY","HMS"),sep = " ")
#split Day_time into date and time parts (separate() is from tidyr)
#dd_hms <- separate(dd_hms,"MDY",c("Y","M","D"),convert=T)
#split the date into Year - Y, Month - M, Day - D as numbers
#d_hom <- filter(dd_hms,dd_hms$`Primary Type`=="HOMICIDE")
#keep only one crime type - HOMICIDE
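For completeness, here are the commented steps consolidated into one runnable pipeline; this is a sketch assuming chicago_crime.rds exists and has the Day_time and `Primary Type` columns used above (separate() comes from tidyr):
library(dplyr)
library(tidyr)
chicago_crime <- readRDS(file = "chicago_crime.rds") #about two minutes
d_hom <- chicago_crime %>%
  na.omit() %>% #drop rows with NA
  separate("Day_time", c("MDY", "HMS"), sep = " ") %>% #date and time parts
  separate("MDY", c("Y", "M", "D"), convert = TRUE) %>% #numeric year, month, day
  filter(`Primary Type` == "HOMICIDE") #keep only homicides
save(d_hom, file = "d_hom.RData") #cached file loaded below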
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load(file = "d_hom.RData")
d_hom_ym <- filter(d_hom,Y!=2017) %>% group_by(Y) %>% count(M) %>% arrange(Y,M)
#homicide counts for 2001 - 2016 by year and month (2017 is incomplete, so it is dropped)
d_hom_ym
## Source: local data frame [192 x 3]
## Groups: Y [?]
##
## Y M n
## <int> <int> <int>
## 1 2001 1 28
## 2 2001 2 20
## 3 2001 3 31
## 4 2001 4 45
## 5 2001 5 31
## 6 2001 6 56
## 7 2001 7 60
## 8 2001 8 48
## 9 2001 9 40
## 10 2001 10 38
## # ... with 182 more rows
We use the forecast package to examine the violence rate (number of homicides per month) as a time series, by year and month.
library(forecast)
library(ggplot2)
#preparing a time series object for analysis
d_hom_ym_ts <- ts(d_hom_ym,start = 2001,end = 2016,frequency = 12)
#note: end = 2016 cuts the series at January 2016; end = c(2016, 12) would keep all of 2016
d_hom_ym_ts[1:12,]
## Y M n
## [1,] 2001 1 28
## [2,] 2001 2 20
## [3,] 2001 3 31
## [4,] 2001 4 45
## [5,] 2001 5 31
## [6,] 2001 6 56
## [7,] 2001 7 60
## [8,] 2001 8 48
## [9,] 2001 9 40
## [10,] 2001 10 38
## [11,] 2001 11 33
## [12,] 2001 12 30
seasonplot(d_hom_ym_ts[,3],s = 12,col=rainbow(16), year.labels=TRUE,main = "Homicide rate by year and month")
ggseasonplot(d_hom_ym_ts[,3], polar=TRUE) #seasonal pattern in polar coordinates
autoplot(d_hom_ym_ts[,3]) #the raw monthly series
ggAcf(d_hom_ym_ts[,3]) #autocorrelation function
d_hom_ym_ts[,3] %>% decompose %>% autoplot #trend, seasonal and remainder components
As a framework we follow “Forecasting time series using R” by Professor Rob J Hyndman, Monash University.
Mean method: the forecast of all future values equals the mean of the historical data \((y_1, \dots, y_n)\):
\(\hat{y}_{n+h|n}=(y_1+\dots+y_n)/n\)
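As a sanity check, the mean method can be written out directly; a minimal sketch (the mean and standard deviation should match the mu and sd reported by meanf below):
y <- as.numeric(d_hom_ym_ts[, 3]) #monthly homicide counts
y_hat_mean <- rep(mean(y), 24) #the same point forecast at every horizon h = 1..24
c(mean = mean(y), sd = sd(y)) #compare with fit.m$model$mu and fit.m$model$sd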
fit.m <- meanf(d_hom_ym_ts[,3],h = 24,level = 0.95,fan = T)
fit.m$model
## $mu
## [1] 39.14917
##
## $mu.se
## [1] 0.8852688
##
## $sd
## [1] 11.91007
##
## $call
## meanf(y = d_hom_ym_ts[, 3], h = 24, level = 0.95, fan = T)
checkresiduals(fit.m)
##
## Ljung-Box test
##
## data: residuals
## Q* = 427.06, df = 23, p-value < 2.2e-16
##
## Model df: 1. Total lags used: 24
The tiny Ljung-Box p-value shows the residuals are strongly autocorrelated, so the mean model leaves most of the structure unexplained.
autoplot(fit.m)
Naive method: forecasts equal the last observed value, \(\hat{y}_{n+h|n}=y_n\).
fit.n <- naive(d_hom_ym_ts[,3],h = 24,level = 0.95,fan = T)
checkresiduals(fit.n)
##
## Ljung-Box test
##
## data: residuals
## Q* = 61.214, df = 24, p-value = 4.296e-05
##
## Model df: 0. Total lags used: 24
with(fit.n,plot(fitted,residuals))
autoplot(fit.n)
Seasonal naive method: forecasts equal the last observed value from the same season, \(\hat{y}_{n+h|n}=y_{n+h-km}\), where \(m\) is the seasonal period and \(k=\lfloor (h-1)/m \rfloor + 1\) is the number of complete seasons to step back.
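The indexing is the only subtle part; here is a minimal hand-rolled sketch, assuming the monthly seasonal period m = 12 (the result should match fit.sn$mean computed below):
y <- as.numeric(d_hom_ym_ts[, 3]); n <- length(y); m <- 12
h <- 1:24 #forecast horizons
k <- (h - 1) %/% m + 1 #number of complete seasons to step back
y_hat_snaive <- y[n + h - k * m] #last observed value from the same season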
fit.sn <- snaive(d_hom_ym_ts[,3],h = 24,level = 0.95,fan = T)
checkresiduals(fit.sn)
##
## Ljung-Box test
##
## data: residuals
## Q* = 135.1, df = 24, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 24
with(fit.sn,plot(fitted,residuals))
autoplot(fit.sn)
Drift method: forecasts equal the last value plus the average historical change, \(\hat{y}_{n+h|n}=y_{n}+h(y_n-y_1)/(n-1)\). This is equivalent to extrapolating the line drawn through the first and last observations.
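Again a minimal sketch of the formula, which should match fit.drift$mean computed below:
y <- as.numeric(d_hom_ym_ts[, 3]); n <- length(y)
h <- 1:24
y_hat_drift <- y[n] + h * (y[n] - y[1]) / (n - 1) #line through first and last observations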
fit.drift <- rwf(d_hom_ym_ts[,3],h = 24,drift = T,level = 0.95,fan = T)
checkresiduals(fit.drift)
##
## Ljung-Box test
##
## data: residuals
## Q* = 61.214, df = 23, p-value = 2.546e-05
##
## Model df: 1. Total lags used: 24
with(fit.drift,plot(fitted,residuals))
autoplot(fit.drift)
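Holt's linear trend method extends simple exponential smoothing with a trend component. In the notation of Hyndman's course notes the recursions are:
\(\ell_t = \alpha y_t + (1-\alpha)(\ell_{t-1}+b_{t-1})\) - level
\(b_t = \beta^*(\ell_t-\ell_{t-1}) + (1-\beta^*)b_{t-1}\) - trend
\(\hat{y}_{t+h|t} = \ell_t + hb_t\) - forecast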
fit.holt <- holt(d_hom_ym_ts[,3],h=24)
fit.holt$model
## Holt's method
##
## Call:
## holt(y = d_hom_ym_ts[, 3], h = 24)
##
## Smoothing parameters:
## alpha = 0.6948
## beta = 1e-04
##
## Initial states:
## l = 28.5919
## b = 0.122
##
## sigma: 10.8863
##
## AIC AICc BIC
## 1815.204 1815.547 1831.196
checkresiduals(fit.holt)
##
## Ljung-Box test
##
## data: residuals
## Q* = 133.27, df = 20, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 24
with(fit.holt,plot(fitted,residuals))
autoplot(fit.holt)
For comparison we also fit the seasonal Holt-Winters method from the stats package.
fit.hw <- HoltWinters(d_hom_ym_ts[,3])
checkresiduals(fit.hw)
plot(fit.hw)
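Simple exponential smoothing forecasts with a weighted average whose weights decay geometrically into the past:
\(\hat{y}_{t+1|t} = \alpha y_t + (1-\alpha)\hat{y}_{t|t-1}\), where \(0 \le \alpha \le 1\) is the smoothing parameter.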
fit.ses <- ses(d_hom_ym_ts[,3], h=24)
fit.ses$model
## Simple exponential smoothing
##
## Call:
## ses(y = d_hom_ym_ts[, 3], h = 24)
##
## Smoothing parameters:
## alpha = 0.6931
##
## Initial states:
## l = 26.9314
##
## sigma: 10.8856
##
## AIC AICc BIC
## 1811.183 1811.318 1820.778
checkresiduals(fit.ses)
##
## Ljung-Box test
##
## data: residuals
## Q* = 133.11, df = 22, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 24
with(fit.ses,plot(fitted,residuals))
autoplot(fit.ses)
The ets function selects an exponential smoothing state space model automatically by minimizing the AICc.
fit.ets <- ets(d_hom_ym_ts[,3])
plot(fit.ets)
fc <- d_hom_ym_ts[,3] %>% ets %>% forecast(h=20)
autoplot(d_hom_ym_ts[,3], series="Data") +
autolayer(fc, series="Forecast") +
autolayer(fitted(fc), series="Fitted")
auto.arima searches over seasonal ARIMA models; here it selects a model with two non-seasonal and two seasonal AR terms plus an intercept, as the coefficient names below show.
fit.arima <- auto.arima(d_hom_ym_ts[,3])
fit.arima$coef
## ar1 ar2 sar1 sar2 intercept
## 0.2883811 0.2566103 0.2282973 0.3598988 39.3167887
with(fit.arima,plot(fitted,residuals))
checkresiduals(fit.arima)
##
## Ljung-Box test
##
## data: residuals
## Q* = 24.655, df = 19, p-value = 0.1722
##
## Model df: 5. Total lags used: 24
Unlike the earlier models, the ARIMA residuals pass the Ljung-Box test (p = 0.17), i.e. they are consistent with white noise.
plot(forecast(fit.arima,h = 24))
As a simpler benchmark we fit an AR(2) by ordinary least squares with stats::ar.ols and predict 24 months ahead.
regr <- ar.ols(d_hom_ym_ts[,3], order=2, demean=FALSE, intercept=TRUE)
checkresiduals(regr)
fore = predict(regr, n.ahead=24)
ts.plot(d_hom_ym_ts[,3], fore$pred, col=1:2, xlim=c(2001,2018),
ylab="Homicide rate")
lines(fore$pred, type="p", col=2)
lines(fore$pred+fore$se, lty="dashed", col=4)
lines(fore$pred-fore$se, lty="dashed", col=4)
Next we fit a linear model with trend and seasonal dummy variables via tslm.
fit.lm <- tslm(d_hom_ym_ts ~ trend + season)
#note: with the full matrix on the left-hand side, tslm regresses every column on
#trend and season; only the n column (homicide counts) is of interest, so
#tslm(d_hom_ym_ts[,3] ~ trend + season) would fit just that series
fit.lm$coefficients
## Y M n
## (Intercept) 2000.91666667 1.000000e+00 32.47400097
## trend 0.08333333 1.676129e-17 -0.02443957
## season2 -0.08333333 1.000000e+00 -7.83886452
## season3 -0.16666667 2.000000e+00 2.58557505
## season4 -0.25000000 3.000000e+00 6.87668129
## season5 -0.33333333 4.000000e+00 10.90112086
## season6 -0.41666667 5.000000e+00 15.52556043
## season7 -0.50000000 6.000000e+00 23.81666667
## season8 -0.58333333 7.000000e+00 19.17443957
## season9 -0.66666667 8.000000e+00 15.59887914
## season10 -0.75000000 9.000000e+00 10.02331871
## season11 -0.83333333 1.000000e+01 5.91442495
## season12 -0.91666667 1.100000e+01 4.80553119
plot(fitted(fit.lm)[,"n"], residuals(fit.lm)[,"n"]) #residual diagnostics for the n column
autoplot(forecast(fit.lm,h = 24))
The cubic spline method (splinef) returns local linear forecasts and prediction intervals using cubic smoothing splines.
fit.spl <- splinef(y = d_hom_ym_ts[,3],h = 24)
with(fit.spl,plot(fitted,residuals))
checkresiduals(fit.spl)
autoplot(fit.spl)
Let \(y_t\) denote the observation at time \(t\) and \(f_t\) its forecast, where \(t = 1,\dots,n\). Then the following measures are useful:
\(MAE=n^{-1}\sum_{t=1}^n{|y_t-f_t|}\) - Mean Absolute Error, see https://en.wikipedia.org/wiki/Mean_absolute_error
\(MSE=n^{-1}\sum_{t=1}^n{(y_t-f_t)^2}\) - Mean Squared Error, see https://en.wikipedia.org/wiki/Mean_squared_error
\(RMSE=\sqrt{n^{-1}\sum_{t=1}^n{(y_t-f_t)^2}}\) - Root Mean Squared Error, see https://en.wikipedia.org/wiki/Root-mean-square_deviation
\(MAPE=100n^{-1}\sum_{t=1}^n{|y_t-f_t|/|y_t|}\) - Mean Absolute Percentage Error, see https://en.wikipedia.org/wiki/Mean_absolute_percentage_error
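These are exactly the quantities accuracy() reports; as a sanity check, here is a minimal sketch computing three of them from the mean model's in-sample errors (the values should match acc.m below):
e <- residuals(fit.m) #in-sample errors y_t - f_t
y <- d_hom_ym_ts[, 3]
c(MAE = mean(abs(e), na.rm = TRUE),
  RMSE = sqrt(mean(e^2, na.rm = TRUE)),
  MAPE = mean(100 * abs(e / y), na.rm = TRUE))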
acc.m <- accuracy(fit.m)
acc.n <- accuracy(fit.n)
acc.sn <- accuracy(fit.sn)
acc.drift <- accuracy(fit.drift)
acc.holt <- accuracy(fit.holt)
acc.ses <- accuracy(fit.ses)
acc.arima <- accuracy(fit.arima)
acc.spl <- accuracy(fit.spl)
acc.tab <- rbind.data.frame(acc.m,
acc.n,acc.sn,
acc.drift,acc.holt,
acc.ses,acc.arima,acc.spl)
acc.tab$method <- c("mean","naive","snaive",
"drift","holt","ses","arima","spline")
To compare the models across all seven accuracy measures at once we draw Chernoff faces with the aplpack package.
library(aplpack)
## Loading required package: tcltk
faces(acc.tab[,1:7], scale = TRUE, main = "Models Chernoff faces", labels = acc.tab$method)
## effect of variables:
## modified item Var
## "height of face " "ME"
## "width of face " "RMSE"
## "structure of face" "MAE"
## "height of mouth " "MPE"
## "width of mouth " "MAPE"
## "smiling " "MASE"
## "height of eyes " "ACF1"
## "width of eyes " "ME"
## "height of hair " "RMSE"
## "width of hair " "MAE"
## "style of hair " "MPE"
## "height of nose " "MAPE"
## "width of nose " "MASE"
## "width of ear " "ACF1"
## "height of ear " "ME"
barplot(acc.tab$ME,names.arg = acc.tab$method,main = "ME")
barplot(acc.tab$RMSE,names.arg = acc.tab$method,main = "RMSE")
barplot(acc.tab$MAE,names.arg = acc.tab$method,main = "MAE")
barplot(acc.tab$MPE,names.arg = acc.tab$method,main = "MPE")
barplot(acc.tab$MAPE,names.arg = acc.tab$method,main = "MAPE")
barplot(acc.tab$MASE,names.arg = acc.tab$method,main = "MASE")
barplot(acc.tab$ACF1,names.arg = acc.tab$method,main = "ACF1")
As a rough overall ranking we sum all seven error measures per model (note that this mixes measures on different scales, so treat the ranking as indicative only):
acc.tab %>% group_by(method) %>% summarise(SumErr=sum(ME,RMSE,MAE,MPE,MAPE,MASE,ACF1)) %>% arrange(SumErr)
## # A tibble: 8 × 2
## method SumErr
## <chr> <dbl>
## 1 arima 31.12045
## 2 holt 38.77420
## 3 ses 39.40361
## 4 drift 40.51788
## 5 naive 41.00545
## 6 mean 41.05116
## 7 snaive 45.63184
## 8 spline 46.60634
fc.ar <- forecast(fit.arima,h = 24)
fc.ar$mean
## Jan Feb Mar Apr May Jun Jul
## 2016 32.78642 38.96320 40.71821 44.22196 41.86337 43.14746
## 2017 38.83241 30.94402 36.64989 38.11472 42.86311 42.67757 45.12544
## 2018 44.13165
## Aug Sep Oct Nov Dec
## 2016 45.41036 46.96553 36.34095 37.73075 37.23155
## 2017 44.19931 48.51113 35.28719 37.40303 35.48893
## 2018
autoplot(fc.ar$mean) #point forecasts only; autoplot(fc.ar) would also show prediction intervals