library(plyr)
library(readr)
library(dplyr)
library(forecast)
library(MLmetrics)
library(lubridate)
library(tidyr)
library(ggplot2)
crime <- list.files(path = "D:/Algoritma Class/LBB/TS & Forecasting/data",
pattern = "*.csv", full.names = TRUE) %>%
lapply(read_csv) %>%
bind_rows
tail(crime)
## # A tibble: 6 x 22
## ID `Case Number` Date Block IUCR `Primary Type` Description
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 12758573 JF314440 06/11/2019 12:0~ 014X~ 1150 DECEPTIVE PRA~ CREDIT CAR~
## 2 12766253 JF322870 06/01/2019 06:1~ 021X~ 1150 DECEPTIVE PRA~ CREDIT CAR~
## 3 12766547 JF324064 01/04/2019 11:0~ 020X~ 1751 OFFENSE INVOL~ CRIMINAL S~
## 4 12767728 JF325471 11/20/2019 02:0~ 027X~ 1153 DECEPTIVE PRA~ FINANCIAL ~
## 5 11928781 JC555497 12/01/2019 06:0~ 013X~ 1754 OFFENSE INVOL~ AGGRAVATED~
## 6 12768199 JF326126 11/01/2019 12:0~ 062X~ 1153 DECEPTIVE PRA~ FINANCIAL ~
## # ... with 15 more variables: `Location Description` <chr>, Arrest <lgl>,
## # Domestic <lgl>, Beat <chr>, District <chr>, Ward <dbl>,
## # `Community Area` <dbl>, `FBI Code` <chr>, `X Coordinate` <dbl>,
## # `Y Coordinate` <dbl>, Year <dbl>, `Updated On` <chr>, Latitude <dbl>,
## # Longitude <dbl>, Location <chr>
str(crime)
## spec_tbl_df [1,333,073 x 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : num [1:1333073] 10224738 10224739 10224740 10224741 10224742 ...
## $ Case Number : chr [1:1333073] "HY411648" "HY411615" "HY411595" "HY411610" ...
## $ Date : chr [1:1333073] "09/05/2015 01:30:00 PM" "09/04/2015 11:30:00 AM" "09/05/2015 12:45:00 PM" "09/05/2015 01:00:00 PM" ...
## $ Block : chr [1:1333073] "043XX S WOOD ST" "008XX N CENTRAL AVE" "035XX W BARRY AVE" "0000X N LARAMIE AVE" ...
## $ IUCR : chr [1:1333073] "0486" "0870" "2023" "0560" ...
## $ Primary Type : chr [1:1333073] "BATTERY" "THEFT" "NARCOTICS" "ASSAULT" ...
## $ Description : chr [1:1333073] "DOMESTIC BATTERY SIMPLE" "POCKET-PICKING" "POSS: HEROIN(BRN/TAN)" "SIMPLE" ...
## $ Location Description: chr [1:1333073] "RESIDENCE" "CTA BUS" "SIDEWALK" "APARTMENT" ...
## $ Arrest : logi [1:1333073] FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ Domestic : logi [1:1333073] TRUE FALSE FALSE TRUE FALSE FALSE ...
## $ Beat : chr [1:1333073] "0924" "1511" "1412" "1522" ...
## $ District : chr [1:1333073] "009" "015" "014" "015" ...
## $ Ward : num [1:1333073] 12 29 35 28 21 32 25 27 13 45 ...
## $ Community Area : num [1:1333073] 61 25 21 25 71 24 31 27 65 11 ...
## $ FBI Code : chr [1:1333073] "08B" "06" "18" "08A" ...
## $ X Coordinate : num [1:1333073] 1165074 1138875 1152037 1141706 1168430 ...
## $ Y Coordinate : num [1:1333073] 1875917 1904869 1920384 1900086 1850165 ...
## $ Year : num [1:1333073] 2015 2015 2015 2015 2015 ...
## $ Updated On : chr [1:1333073] "02/10/2018 03:50:01 PM" "02/10/2018 03:50:01 PM" "02/10/2018 03:50:01 PM" "02/10/2018 03:50:01 PM" ...
## $ Latitude : num [1:1333073] 41.8 41.9 41.9 41.9 41.7 ...
## $ Longitude : num [1:1333073] -87.7 -87.8 -87.7 -87.8 -87.7 ...
## $ Location : chr [1:1333073] "(41.815117282, -87.669999562)" "(41.895080471, -87.765400451)" "(41.937405765, -87.716649687)" "(41.881903443, -87.755121152)" ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_double(),
## .. `Case Number` = col_character(),
## .. Date = col_character(),
## .. Block = col_character(),
## .. IUCR = col_character(),
## .. `Primary Type` = col_character(),
## .. Description = col_character(),
## .. `Location Description` = col_character(),
## .. Arrest = col_logical(),
## .. Domestic = col_logical(),
## .. Beat = col_character(),
## .. District = col_character(),
## .. Ward = col_double(),
## .. `Community Area` = col_double(),
## .. `FBI Code` = col_character(),
## .. `X Coordinate` = col_double(),
## .. `Y Coordinate` = col_double(),
## .. Year = col_double(),
## .. `Updated On` = col_character(),
## .. Latitude = col_double(),
## .. Longitude = col_double(),
## .. Location = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
unique(crime$`Primary Type`)
## [1] "BATTERY" "THEFT"
## [3] "NARCOTICS" "ASSAULT"
## [5] "BURGLARY" "ROBBERY"
## [7] "OTHER OFFENSE" "CRIMINAL DAMAGE"
## [9] "WEAPONS VIOLATION" "DECEPTIVE PRACTICE"
## [11] "CRIMINAL TRESPASS" "MOTOR VEHICLE THEFT"
## [13] "SEX OFFENSE" "INTERFERENCE WITH PUBLIC OFFICER"
## [15] "OFFENSE INVOLVING CHILDREN" "PUBLIC PEACE VIOLATION"
## [17] "PROSTITUTION" "GAMBLING"
## [19] "CRIM SEXUAL ASSAULT" "LIQUOR LAW VIOLATION"
## [21] "ARSON" "STALKING"
## [23] "KIDNAPPING" "INTIMIDATION"
## [25] "CONCEALED CARRY LICENSE VIOLATION" "NON - CRIMINAL"
## [27] "HUMAN TRAFFICKING" "OBSCENITY"
## [29] "PUBLIC INDECENCY" "OTHER NARCOTIC VIOLATION"
## [31] "NON-CRIMINAL" "HOMICIDE"
## [33] "CRIMINAL SEXUAL ASSAULT" "NON-CRIMINAL (SUBJECT SPECIFIED)"
Melalui data tersebut, kita akan memilih untuk menggunakan kasus
Narcotics pada kolom Primary Type.
# crime$Date <- as.POSIXct(crime$Date, format = "%m/%d/%Y %H:%M:%S")
crime_narcotics <- crime %>%
mutate(Date = lubridate::mdy_hms(Date)) %>%
filter(`Primary Type`=="NARCOTICS") %>%
group_by(Date=date(Date)) %>%
summarise(Count.Narcotics=n_distinct(`Case Number`)) %>%
ungroup()
head(crime_narcotics)
## # A tibble: 6 x 2
## Date Count.Narcotics
## <date> <int>
## 1 2015-01-01 43
## 2 2015-01-02 86
## 3 2015-01-03 70
## 4 2015-01-04 66
## 5 2015-01-05 43
## 6 2015-01-06 57
Cek Missing Value
# cek NA
colSums(is.na(crime_narcotics))
## Date Count.Narcotics
## 0 0
Dari data di atas, tidak ditemukan adanya missing value dan data
tersebut sudah diurutkan berdasarkan kolom Date, sehingga
kita dapat melanjutkan proses selanjutnya.
Split data: menggunakan Data train dari tahun 2015 - 2019 (4 tahun) dan Data test pada tahun 2019 (1 tahun)
narcotics_test <- tail(crime_narcotics, 12)
narcotics_train <- head(crime_narcotics, -12)
Membuat object time series
range(crime_narcotics$Date)
## [1] "2015-01-01" "2019-12-31"
narcotics_ts <- ts(data = crime_narcotics$Count.Narcotics,
start = c(2015,1,1),
end = c(2019, 12, 31),
frequency = 24)
narcotics_ts %>% autoplot()
Melakukan decompose pada object time series untuk menentukan additive atau multiplicative
narcotic_decom <- decompose(narcotics_ts)
narcotic_decom %>% autoplot()
narcotic_decom$trend %>% autoplot()
Melihat object time series decompose dengan tipe
multiplicative
decompose(narcotics_ts, type = "multiplicative") %>% autoplot()
Seperti yang dapat kita lihat, pola dari trend masih menunjukkan pola seperti seasonal, hal ini menunjukkan bahwa terdapat kemungkinan pola seasonality yang tidak ditangkap di plot di atas. Untuk mengatasi hal tersebut, kita akan membuat objek Multi Seasonal Time Series.
narcotics_msts <- msts(narcotics_train$Count.Narcotics, seasonal.periods = c(24, # Daily
24*7, # Weekly
24*30)) # Monthly
# Decompose MSTS Object
narcotics_msts_dec <- mstl(narcotics_msts, lambda = "auto") #%>% autoplot()
narcotics_msts_dec %>%
tail(365) %>%
autoplot()
Setelah dilakukan decompose multiseasonal, pola trend mulai terlihat menjadi pola additive.
head(narcotics_msts)
## Multi-Seasonal Time Series:
## Start: 1 1
## Seasonal Periods: 24 168 720
## Data:
## [1] 43 86 70 66 43 57
# Create a data frame based on MSTS Object
df_narcotics_multi <- as.data.frame(narcotics_msts_dec)
1. Daily Seasonality
plot1 <- df_narcotics_multi %>%
mutate(day = narcotics_train$Date) %>%
group_by(day) %>%
summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
head(24*2) %>%
ggplot(aes(x = day, y = seasonal)) +
geom_point(col = "maroon") + geom_line(col = "red") +
theme_minimal()
plot1
Dari visualisasi di atas, dapat terlihat bahwa kasus narkotika mengalamai peningkatan pada tanggal 15 Januari.
2. Weekly Seasonality
plot2 <- df_narcotics_multi %>%
mutate(day = narcotics_train$Date) %>%
group_by(day) %>%
summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
head(24*7) %>%
ggplot(aes(x = day, y = seasonal)) +
geom_point(col = "maroon") + geom_line(col = "red") +
theme_minimal()
plot2
3. Monthly Seasonality
plot3 <- df_narcotics_multi %>%
mutate(day = narcotics_train$Date, month = month(narcotics_train$Date, label = T)) %>%
group_by(month) %>%
summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
head(24*30) %>%
ggplot(aes(x = month, y = seasonal)) +
geom_point() + geom_col() +
theme_minimal()
plot3
Sedangkan berdasarkan data pola per bulan, dapat dilihat terdapat peningkatan yang tinggi pada bulan maret, kemudian terus menurun hingga bulan juni.
Dalam hal ini, kita akan coba menggunakan metode Multi-Seasonal HoltWinter
narcotics_hw <- stlm(narcotics_msts, s.window = 24, method = "ets")
f_hw <- forecast(narcotics_hw, h = 24)
narcotics_msts %>%
tail(100) %>%
autoplot(series = "actual") +
# autolayer(narcotics_test, series = "test data") +
autolayer(f_hw, series = "Multi-Seasonal Holt_Winter") +
scale_color_manual(values = c("black", "blue", "firebrick"))
Multi-Seasonal ARIMA model
narcotic_arima <- stlm(narcotics_msts, method = "arima")
f_arima <- forecast(narcotic_arima, h = 365)
narcotics_msts %>%
tail(100) %>%
autoplot() +
autolayer(f_arima, series = "Multi-Seasonal ARIMA", color = "red")
Asumsi pada time series diujikan untuk mengukur apakah residual yang peroleh dari hasil modeling sudah cukup baik untuk menggambarkan dan menangkap informasi pada data. Mengapa menggunakan residual data? Karena dengan menggunakan residual data, kita dapat mendapatkan informasi dari data aktual maupun dari hasil prediksi menggunakan model. Metode forecasting yang baik menghasilkan nilai residual berikut ini[^11]:
Untuk memastikan bahwa residual yang dihasilkan memiliki kriteria diatas, maka terdapat asumsi untuk mengujinya.
\(H_0\): residual has no-autocorrelation \(H_1\): residual has autocorrelation
yang diinginkan p-value > 0.05 (alpha), no-autocorrelation
Untuk mengecek ada/tidaknya autokorelasi pada residual hasil forecasting time series, kita akan menggunakan metode Ljung-Box:
Box.test(residual model, type = "Ljung-Box)\(H_0\): residual has no-autocorrelation \(H_1\): residual has autocorrelation
# menggunakan Ljung-Box test
Box.test(narcotic_arima$residuals, type = "Ljung-Box", lag = 2*365)
##
## Box-Ljung test
##
## data: narcotic_arima$residuals
## X-squared = 1871.2, df = 730, p-value < 2.2e-16
Berdasarkan hasil model di atas, p-value < 0.05
berarti terdapat autokorelasi dalam model residual.
\(H_0\): residual menyebar normal \(H_1\): residual tidak menyebar normal
yang diinginkan p-value > 0.05 (alpha), residual menyebar normal
Untuk mengecek normality residual pada hasil forecasting time series
kita bisa melakukan uji normality (shapiro test) dengan menggunakan
fungsi shapiro.test(residual model)
library(nortest)
ad.test(narcotic_arima$residuals)
##
## Anderson-Darling normality test
##
## data: narcotic_arima$residuals
## A = 2.9257, p-value = 2.366e-07
Berdasarkan hasil model di atas, p-value < 0.05 maka
tolak h0 atau residual tidak menyebar normal.
accuracy(f_arima, narcotics_test$Count.Narcotics)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1664684 6.321473 4.830176 -3.607164 13.90407 0.478609
## Test set 3.2685846 14.341325 12.632686 -15.122809 51.50754 1.251738
## ACF1
## Training set 0.001085914
## Test set NA
Perbedaan nilai MAE pada model di atas adalah 7.8%, yang berarti masih bisa dibilang tidak overfitting karena tidak mencapai 10%.
Dari kedua model tersebut, kita juga tidak dapat memenuhi asumsi autokorelasi dan residual normal. Karena hal tersebut, dapat dilihat atau dikatakan bahwa model yang kita gunakan dapat dilakukan tuning untuk menjadi model yang lebih optimal.