Library

library(plyr)                                                 
library(readr)
library(dplyr)
library(forecast)
library(MLmetrics)
library(lubridate)
library(tidyr)
library(ggplot2)

Read

crime <- list.files(path = "D:/Algoritma Class/LBB/TS & Forecasting/data",    
                       pattern = "*.csv", full.names = TRUE) %>% 
  lapply(read_csv) %>%                                           
  bind_rows  

tail(crime)
## # A tibble: 6 x 22
##         ID `Case Number` Date             Block IUCR  `Primary Type` Description
##      <dbl> <chr>         <chr>            <chr> <chr> <chr>          <chr>      
## 1 12758573 JF314440      06/11/2019 12:0~ 014X~ 1150  DECEPTIVE PRA~ CREDIT CAR~
## 2 12766253 JF322870      06/01/2019 06:1~ 021X~ 1150  DECEPTIVE PRA~ CREDIT CAR~
## 3 12766547 JF324064      01/04/2019 11:0~ 020X~ 1751  OFFENSE INVOL~ CRIMINAL S~
## 4 12767728 JF325471      11/20/2019 02:0~ 027X~ 1153  DECEPTIVE PRA~ FINANCIAL ~
## 5 11928781 JC555497      12/01/2019 06:0~ 013X~ 1754  OFFENSE INVOL~ AGGRAVATED~
## 6 12768199 JF326126      11/01/2019 12:0~ 062X~ 1153  DECEPTIVE PRA~ FINANCIAL ~
## # ... with 15 more variables: `Location Description` <chr>, Arrest <lgl>,
## #   Domestic <lgl>, Beat <chr>, District <chr>, Ward <dbl>,
## #   `Community Area` <dbl>, `FBI Code` <chr>, `X Coordinate` <dbl>,
## #   `Y Coordinate` <dbl>, Year <dbl>, `Updated On` <chr>, Latitude <dbl>,
## #   Longitude <dbl>, Location <chr>
str(crime)
## spec_tbl_df [1,333,073 x 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID                  : num [1:1333073] 10224738 10224739 10224740 10224741 10224742 ...
##  $ Case Number         : chr [1:1333073] "HY411648" "HY411615" "HY411595" "HY411610" ...
##  $ Date                : chr [1:1333073] "09/05/2015 01:30:00 PM" "09/04/2015 11:30:00 AM" "09/05/2015 12:45:00 PM" "09/05/2015 01:00:00 PM" ...
##  $ Block               : chr [1:1333073] "043XX S WOOD ST" "008XX N CENTRAL AVE" "035XX W BARRY AVE" "0000X N LARAMIE AVE" ...
##  $ IUCR                : chr [1:1333073] "0486" "0870" "2023" "0560" ...
##  $ Primary Type        : chr [1:1333073] "BATTERY" "THEFT" "NARCOTICS" "ASSAULT" ...
##  $ Description         : chr [1:1333073] "DOMESTIC BATTERY SIMPLE" "POCKET-PICKING" "POSS: HEROIN(BRN/TAN)" "SIMPLE" ...
##  $ Location Description: chr [1:1333073] "RESIDENCE" "CTA BUS" "SIDEWALK" "APARTMENT" ...
##  $ Arrest              : logi [1:1333073] FALSE FALSE TRUE FALSE FALSE FALSE ...
##  $ Domestic            : logi [1:1333073] TRUE FALSE FALSE TRUE FALSE FALSE ...
##  $ Beat                : chr [1:1333073] "0924" "1511" "1412" "1522" ...
##  $ District            : chr [1:1333073] "009" "015" "014" "015" ...
##  $ Ward                : num [1:1333073] 12 29 35 28 21 32 25 27 13 45 ...
##  $ Community Area      : num [1:1333073] 61 25 21 25 71 24 31 27 65 11 ...
##  $ FBI Code            : chr [1:1333073] "08B" "06" "18" "08A" ...
##  $ X Coordinate        : num [1:1333073] 1165074 1138875 1152037 1141706 1168430 ...
##  $ Y Coordinate        : num [1:1333073] 1875917 1904869 1920384 1900086 1850165 ...
##  $ Year                : num [1:1333073] 2015 2015 2015 2015 2015 ...
##  $ Updated On          : chr [1:1333073] "02/10/2018 03:50:01 PM" "02/10/2018 03:50:01 PM" "02/10/2018 03:50:01 PM" "02/10/2018 03:50:01 PM" ...
##  $ Latitude            : num [1:1333073] 41.8 41.9 41.9 41.9 41.7 ...
##  $ Longitude           : num [1:1333073] -87.7 -87.8 -87.7 -87.8 -87.7 ...
##  $ Location            : chr [1:1333073] "(41.815117282, -87.669999562)" "(41.895080471, -87.765400451)" "(41.937405765, -87.716649687)" "(41.881903443, -87.755121152)" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ID = col_double(),
##   ..   `Case Number` = col_character(),
##   ..   Date = col_character(),
##   ..   Block = col_character(),
##   ..   IUCR = col_character(),
##   ..   `Primary Type` = col_character(),
##   ..   Description = col_character(),
##   ..   `Location Description` = col_character(),
##   ..   Arrest = col_logical(),
##   ..   Domestic = col_logical(),
##   ..   Beat = col_character(),
##   ..   District = col_character(),
##   ..   Ward = col_double(),
##   ..   `Community Area` = col_double(),
##   ..   `FBI Code` = col_character(),
##   ..   `X Coordinate` = col_double(),
##   ..   `Y Coordinate` = col_double(),
##   ..   Year = col_double(),
##   ..   `Updated On` = col_character(),
##   ..   Latitude = col_double(),
##   ..   Longitude = col_double(),
##   ..   Location = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
unique(crime$`Primary Type`)
##  [1] "BATTERY"                           "THEFT"                            
##  [3] "NARCOTICS"                         "ASSAULT"                          
##  [5] "BURGLARY"                          "ROBBERY"                          
##  [7] "OTHER OFFENSE"                     "CRIMINAL DAMAGE"                  
##  [9] "WEAPONS VIOLATION"                 "DECEPTIVE PRACTICE"               
## [11] "CRIMINAL TRESPASS"                 "MOTOR VEHICLE THEFT"              
## [13] "SEX OFFENSE"                       "INTERFERENCE WITH PUBLIC OFFICER" 
## [15] "OFFENSE INVOLVING CHILDREN"        "PUBLIC PEACE VIOLATION"           
## [17] "PROSTITUTION"                      "GAMBLING"                         
## [19] "CRIM SEXUAL ASSAULT"               "LIQUOR LAW VIOLATION"             
## [21] "ARSON"                             "STALKING"                         
## [23] "KIDNAPPING"                        "INTIMIDATION"                     
## [25] "CONCEALED CARRY LICENSE VIOLATION" "NON - CRIMINAL"                   
## [27] "HUMAN TRAFFICKING"                 "OBSCENITY"                        
## [29] "PUBLIC INDECENCY"                  "OTHER NARCOTIC VIOLATION"         
## [31] "NON-CRIMINAL"                      "HOMICIDE"                         
## [33] "CRIMINAL SEXUAL ASSAULT"           "NON-CRIMINAL (SUBJECT SPECIFIED)"

Melalui data tersebut, kita akan memilih untuk menggunakan kasus Narcotics pada kolom Primary Type.

# crime$Date <-  as.POSIXct(crime$Date, format = "%m/%d/%Y %H:%M:%S")
crime_narcotics <- crime %>% 
  mutate(Date = lubridate::mdy_hms(Date)) %>% 
  filter(`Primary Type`=="NARCOTICS") %>% 
  group_by(Date=date(Date)) %>% 
  summarise(Count.Narcotics=n_distinct(`Case Number`)) %>% 
  ungroup()
  

head(crime_narcotics)
## # A tibble: 6 x 2
##   Date       Count.Narcotics
##   <date>               <int>
## 1 2015-01-01              43
## 2 2015-01-02              86
## 3 2015-01-03              70
## 4 2015-01-04              66
## 5 2015-01-05              43
## 6 2015-01-06              57

Cek Missing Value

# cek NA
colSums(is.na(crime_narcotics))
##            Date Count.Narcotics 
##               0               0

Dari data di atas, tidak ditemukan adanya missing value dan data tersebut sudah diurutkan berdasarkan kolom Date, sehingga kita dapat melanjutkan proses selanjutnya.

Cross Validation

Split data: menggunakan Data train dari tahun 2015 - 2019 (4 tahun) dan Data test pada tahun 2019 (1 tahun)

narcotics_test <- tail(crime_narcotics, 12)
narcotics_train <- head(crime_narcotics, -12)

Membuat object time series

range(crime_narcotics$Date)
## [1] "2015-01-01" "2019-12-31"
narcotics_ts <- ts(data = crime_narcotics$Count.Narcotics,
                 start = c(2015,1,1), 
                 end = c(2019, 12, 31),
                 frequency = 24)
narcotics_ts %>% autoplot()

Melakukan decompose pada object time series untuk menentukan additive atau multiplicative

narcotic_decom <- decompose(narcotics_ts) 
narcotic_decom %>% autoplot()

narcotic_decom$trend %>% autoplot()

Melihat object time series decompose dengan tipe multiplicative

decompose(narcotics_ts, type = "multiplicative") %>% autoplot()

Seperti yang dapat kita lihat, pola dari trend masih menunjukkan pola seperti seasonal, hal ini menunjukkan bahwa terdapat kemungkinan pola seasonality yang tidak ditangkap di plot di atas. Untuk mengatasi hal tersebut, kita akan membuat objek Multi Seasonal Time Series.

narcotics_msts <- msts(narcotics_train$Count.Narcotics, seasonal.periods = c(24, # Daily
                                                            24*7, # Weekly
                                                            24*30)) # Monthly


# Decompose MSTS Object
narcotics_msts_dec <- mstl(narcotics_msts, lambda = "auto") #%>% autoplot()

narcotics_msts_dec %>%
  tail(365) %>%
  autoplot()

Setelah dilakukan decompose multiseasonal, pola trend mulai terlihat menjadi pola additive.

head(narcotics_msts)
## Multi-Seasonal Time Series:
## Start: 1 1
## Seasonal Periods: 24 168 720
## Data:
## [1] 43 86 70 66 43 57

Seasonality Analysis

# Create a data frame based on MSTS Object

df_narcotics_multi <- as.data.frame(narcotics_msts_dec)

1. Daily Seasonality

plot1 <- df_narcotics_multi %>%
  mutate(day = narcotics_train$Date) %>%
  group_by(day) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
  head(24*2) %>%
  ggplot(aes(x = day, y = seasonal)) +
  geom_point(col = "maroon") + geom_line(col = "red") +
  theme_minimal()

plot1

Dari visualisasi di atas, dapat terlihat bahwa kasus narkotika mengalamai peningkatan pada tanggal 15 Januari.

2. Weekly Seasonality

plot2 <- df_narcotics_multi %>%
  mutate(day = narcotics_train$Date) %>%
  group_by(day) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
  head(24*7) %>%
  ggplot(aes(x = day, y = seasonal)) +
  geom_point(col = "maroon") + geom_line(col = "red") +
  theme_minimal()

plot2

3. Monthly Seasonality

plot3 <- df_narcotics_multi %>%
  mutate(day = narcotics_train$Date, month = month(narcotics_train$Date, label = T)) %>%
  group_by(month) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
  head(24*30) %>%
  ggplot(aes(x = month, y = seasonal)) +
  geom_point() + geom_col() +
  theme_minimal()

plot3

Sedangkan berdasarkan data pola per bulan, dapat dilihat terdapat peningkatan yang tinggi pada bulan maret, kemudian terus menurun hingga bulan juni.

Modelling

Dalam hal ini, kita akan coba menggunakan metode Multi-Seasonal HoltWinter

narcotics_hw <- stlm(narcotics_msts,  s.window = 24, method = "ets")

f_hw <- forecast(narcotics_hw, h = 24)

narcotics_msts %>%
  tail(100) %>% 
  autoplot(series = "actual") +
  # autolayer(narcotics_test, series = "test data") +
  autolayer(f_hw, series = "Multi-Seasonal Holt_Winter") +
  scale_color_manual(values = c("black", "blue", "firebrick"))

Multi-Seasonal ARIMA model

narcotic_arima <- stlm(narcotics_msts, method = "arima")

f_arima <- forecast(narcotic_arima, h = 365)

narcotics_msts %>%
  tail(100) %>%
  autoplot() +
  autolayer(f_arima, series = "Multi-Seasonal ARIMA", color = "red") 

Assumption

Asumsi pada time series diujikan untuk mengukur apakah residual yang peroleh dari hasil modeling sudah cukup baik untuk menggambarkan dan menangkap informasi pada data. Mengapa menggunakan residual data? Karena dengan menggunakan residual data, kita dapat mendapatkan informasi dari data aktual maupun dari hasil prediksi menggunakan model. Metode forecasting yang baik menghasilkan nilai residual berikut ini[^11]:

  1. Residual yang tidak berkorelasi. Apabila terdapat residual yang berkorelasi, artinya masih terdapat informasi yang tertinggal yang seharusnya digunakan untuk menghitung hasil forecast.
  2. Residual memiliki rata-rata 0.

Untuk memastikan bahwa residual yang dihasilkan memiliki kriteria diatas, maka terdapat asumsi untuk mengujinya.

No-autocorrelation residual

\(H_0\): residual has no-autocorrelation \(H_1\): residual has autocorrelation

yang diinginkan p-value > 0.05 (alpha), no-autocorrelation

Untuk mengecek ada/tidaknya autokorelasi pada residual hasil forecasting time series, kita akan menggunakan metode Ljung-Box:

  • melakukan uji Ljung-box dengan menggunakan fungsi Box.test(residual model, type = "Ljung-Box)

\(H_0\): residual has no-autocorrelation \(H_1\): residual has autocorrelation

# menggunakan Ljung-Box test
Box.test(narcotic_arima$residuals, type = "Ljung-Box", lag = 2*365)
## 
##  Box-Ljung test
## 
## data:  narcotic_arima$residuals
## X-squared = 1871.2, df = 730, p-value < 2.2e-16

Berdasarkan hasil model di atas, p-value < 0.05 berarti terdapat autokorelasi dalam model residual.

Normality of residual

\(H_0\): residual menyebar normal \(H_1\): residual tidak menyebar normal

yang diinginkan p-value > 0.05 (alpha), residual menyebar normal

Untuk mengecek normality residual pada hasil forecasting time series kita bisa melakukan uji normality (shapiro test) dengan menggunakan fungsi shapiro.test(residual model)

library(nortest)
ad.test(narcotic_arima$residuals)
## 
##  Anderson-Darling normality test
## 
## data:  narcotic_arima$residuals
## A = 2.9257, p-value = 2.366e-07

Berdasarkan hasil model di atas, p-value < 0.05 maka tolak h0 atau residual tidak menyebar normal.

accuracy(f_arima, narcotics_test$Count.Narcotics)
##                      ME      RMSE       MAE        MPE     MAPE     MASE
## Training set -0.1664684  6.321473  4.830176  -3.607164 13.90407 0.478609
## Test set      3.2685846 14.341325 12.632686 -15.122809 51.50754 1.251738
##                     ACF1
## Training set 0.001085914
## Test set              NA

Conclusion

Perbedaan nilai MAE pada model di atas adalah 7.8%, yang berarti masih bisa dibilang tidak overfitting karena tidak mencapai 10%.

Dari kedua model tersebut, kita juga tidak dapat memenuhi asumsi autokorelasi dan residual normal. Karena hal tersebut, dapat dilihat atau dikatakan bahwa model yang kita gunakan dapat dilakukan tuning untuk menjadi model yang lebih optimal.