Melakukan prediksi jumlah order perusahaan scotty berdasarkan satuan jam untuk seminggu kedepan.
library(tidyverse) #data manipulation
library(lubridate) # date manipulation
library(forecast) # time series library
library(tidymodels)
library(TTR) # for Simple moving average function
library(MLmetrics) # calculate error
library(tseries) # adf.test
library(zoo)
library(tidyquant)
library(magrittr)
library(padr)
library(TSstudio)
library(ggplot2)
library(plotly)
Tahapan yang pertama adalah load data set yang akan digunakan. Pada tahapan ini kita aka menggunakan 2 data yaitu data train dan test. Untuk data train akan kita bagi lagi menjadi data validation (check model yang sudah dibuat meenggnakan 7 hari data dari data train) dan sisanya adalah data train_clean yang akan kita buat mode hingga memilih model berdasarkan nilai error dari setiap model.
raw_test <- read.csv("data/data-test.csv")
raw_train <- read.csv("data/data-train.csv")
#Check data type diseluruh data.
glimpse(raw_train)
## Rows: 90,113
## Columns: 16
## $ id <chr> "59d005e1ffcfa261708ce9cd", "59d0066affcfa261708ceb…
## $ trip_id <chr> "59d005e9cb564761a8fe5d3e", NA, "59d006c131e39c6189…
## $ driver_id <chr> "59a892c5568be44b2734f276", NA, "599dc0dfa5b4fd5471…
## $ rider_id <chr> "59ad2d6efba75a581666b506", "59cd704bcf482f6ce2fadf…
## $ start_time <chr> "2017-10-01T00:00:17Z", "2017-10-01T00:02:34Z", "20…
## $ src_lat <dbl> 41.07047, 41.07487, 41.04995, 41.05287, 41.06760, 4…
## $ src_lon <dbl> 29.01945, 28.99528, 29.03107, 28.99522, 28.98827, 2…
## $ src_area <chr> "sxk9", "sxk9", "sxk9", "sxk9", "sxk9", "sxk9", "sx…
## $ src_sub_area <chr> "sxk9s", "sxk9e", "sxk9s", "sxk9e", "sxk9e", "sxk9s…
## $ dest_lat <dbl> 41.11716, 41.08351, 41.04495, 41.08140, 41.02125, 4…
## $ dest_lon <dbl> 29.03650, 29.00228, 28.98192, 28.98197, 29.11316, 2…
## $ dest_area <chr> "sxk9", "sxk9", "sxk9", "sxk9", "sxk9", "sxk9", "sx…
## $ dest_sub_area <chr> "sxk9u", "sxk9e", "sxk9e", "sxk9e", "sxk9q", "sxk9e…
## $ distance <dbl> 5.379250, 1.126098, 4.169492, 3.358296, 11.693573, …
## $ status <chr> "confirmed", "nodrivers", "confirmed", "confirmed",…
## $ confirmed_time_sec <int> 8, 0, 32, 65, 0, 27, 32, 30, 0, 24, 9, 0, 118, 106,…
#Check apakah ada data yang missing
colSums(is.na(raw_train))
## id trip_id driver_id rider_id
## 0 4676 4676 0
## start_time src_lat src_lon src_area
## 0 0 0 0
## src_sub_area dest_lat dest_lon dest_area
## 0 0 0 0
## dest_sub_area distance status confirmed_time_sec
## 0 0 0 0
Terdapat data missing pada variable trip_id & driver id, namun untuk menjawab business question kita missing data tersebut tidak mempengaruhi hasil.
#Pilih variable yang sesuai untuk menjawab business question
train_clean <- raw_train %>%
select(start_time, src_sub_area)
#Adjust data type untuk variable start_time
train_clean <- train_clean %>%
mutate(start_time = as_datetime(start_time),
start_time = floor_date(start_time, unit = "hour"))
#Melakukan summary jumlah order disetiap jam pada data train
train_clean <- train_clean %>%
select(src_sub_area,start_time) %>%
group_by(src_sub_area,start_time) %>%
summarise(demand = n())
## `summarise()` has grouped output by 'src_sub_area'. You can override using the `.groups` argument.
train_clean
Untuk melakukan forecasting dengan bentuk time series salah satu syaratnya adalah data waktu tersebut harus lengkap sesuai dengan range yang digunakan pada model. Sehingga kita harus melengkapi data tersebut menggunakan fungsi pad().
#Check date range start & end
range(train_clean$start_time)
## [1] "2017-10-01 00:00:00 UTC" "2017-12-02 23:00:00 UTC"
#Complete data dan mengisi nilai NA dengan 0
train_clean <- train_clean %>%
pad() %>%
mutate(demand = na.fill(demand, fill = 0))
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## pad applied on the interval: hour
#Group by using start time only
scotty_all<- train_clean %>%
group_by(start_time) %>%
summarise(demand = sum(demand))
head(scotty_all)
#Create TS Model
scotty_ts <- zoo(
x = scotty_all$demand,
order.by = scotty_all$start_time,
frequency = 24)
#Plotting time series data
scotty_ts %>%
ts_plot(slider = T)
#Decompose the data
scotty_dc <- scotty_all$demand %>%
ts(freq = 24) %>%
decompose() %>%
autoplot()
scotty_dc
Hasil decompose diatas data kita memiliki tren dan seasonal.
Pada tahapan ini akan dilakukan cross validation dengan metode nested untuk mempermudah proses forecastiong dengan metode purr berdasarkan refrensi https://algotech.netlify.app/blog/purrr-operly-fitting-multiple-time-series-model/
Split data train dan validation dalam 4 minggu dalam 2 bulan per hari per jam, maka secara logikanya adalah 24 jam * 7 hari * 4 minggu * 2 bulan. Data validation adalah forecast seminggu terakhir dan data train menggunakan sisanya.
# Split the data
validation_size <- 24 * 7 #1 week
train_size <- 24 * 7 * 4 * 2 - validation_size
validation_end <- max(train_clean$start_time)
validation_start <- validation_end - hours(validation_size) + hours(1)
train_end <- validation_start - hours(1)
train_start <- train_end - hours(train_size) + hours(1)
intrain <- interval(train_start,train_end)
invalidation <- interval(validation_start, validation_end)
train_clean %>%
mutate(sample = case_when(
start_time %within% intrain ~ "train",
start_time %within% invalidation ~ "validation"
)) %>%
drop_na() %>%
mutate(sample = factor(sample, levels = c("train", "validation"))) %>%
ggplot(aes(x = start_time, y = demand, colour = sample)) +
geom_line() +
labs(x = NULL, y = NULL, colour = NULL) +
facet_wrap(~ src_sub_area, scale = "free", ncol = 1) +
tidyquant::theme_tq() +
tidyquant::scale_colour_tq()
Melakukan data processing dengan membagi data menjadi train & validation dan melakukan nesting berdasarkan fungsinya.
spread_train <- train_clean %>%
spread(src_sub_area, demand)
head(spread_train)
Selanjutnya processing data menggunakan fungsi recipe() & bake() &
rec <- recipe(~ ., filter(spread_train, start_time %within% intrain)) %>%
step_sqrt(all_numeric()) %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
prep()
jika menggunakan recipe() data harus dikembalikan kebentuk awal sebagai data frame.
spread_train <- bake(rec, spread_train)
rec_revert <- function(vector, rec, varname) {
# store recipe values
rec_center <- rec$steps[[2]]$means[varname]
rec_scale <- rec$steps[[3]]$sds[varname]
# convert back based on the recipe
results <- (vector * rec_scale + rec_center) ^ 2
# add additional adjustment if necessary
results <- round(results)
# return the results
results
}
spread_train %<>%
gather(src_sub_area, demand, -start_time)
Melakukan nested berdasarkan sample & src_sub_area. Proses ini dilakukan untuk mempermudah tahapan modeling dengan berbagai metode secara sekaligus.
# adjust by sample
spread_train %<>%
mutate(sample = case_when(
start_time %within% intrain ~ "train",
start_time %within% invalidation ~ "validation"
)) %>%
drop_na()
spread_train
#Nesting the data
spread_train %<>%
group_by(src_sub_area, sample) %>%
nest(.key = "data") %>%
spread(sample, data)
## Warning: `.key` is deprecated
spread_train
Membuat nest function untuk metode model yang akan digunakan. Pada kasus ini kita akan menggunakan daily seasonality (TS) dan weekly seasonality(MSTS).
data_funs <- list(
ts = function(x) ts(x$demand, frequency = 24),
msts = function(x) msts(x$demand, seasonal.periods = c(24, 24 * 7))
)
data_funs
## $ts
## function(x) ts(x$demand, frequency = 24)
##
## $msts
## function(x) msts(x$demand, seasonal.periods = c(24, 24 * 7))
data_funs %<>%
rep(length(unique(spread_train$src_sub_area))) %>%
enframe("data_fun_name", "data_fun") %>%
mutate(src_sub_area =
sort(rep(unique(spread_train$src_sub_area), length(unique(.$data_fun_name))))
)
data_funs
spread_train %<>%
left_join(data_funs)
## Joining, by = "src_sub_area"
spread_train
Membuat model list. Fungsi yang digunakan adalah auto.arima(), ets(), stlm(), dan tbats().
#Model list
models <- list(
auto.arima = function(x) auto.arima(x),
ets = function(x) ets(x),
stlm = function(x) stlm(x),
tbats = function(x) tbats(x, use.box.cox = FALSE)
)
models
## $auto.arima
## function(x) auto.arima(x)
##
## $ets
## function(x) ets(x)
##
## $stlm
## function(x) stlm(x)
##
## $tbats
## function(x) tbats(x, use.box.cox = FALSE)
models %<>%
rep(length(unique(spread_train$src_sub_area))) %>%
enframe("model_name", "model") %>%
mutate(src_sub_area =
sort(rep(unique(spread_train$src_sub_area), length(unique(.$model_name))))
)
models
spread_train %<>%
left_join(models) %>%
filter(
!(model_name == "ets" & data_fun_name == "msts"),
!(model_name == "auto.arima" & data_fun_name == "msts")
)
## Joining, by = "src_sub_area"
#Processing the dataset with all models to find the best model
spread_train %<>%
mutate(
params = map(train, ~ list(x = .x)),
data = invoke_map(data_fun, params),
params = map(data, ~ list(x = .x)),
fitted = invoke_map(model, params)
) %>%
select(-data, -params)
spread_train
#Compare output data
train_model <- spread_train %>%
mutate(rmse =
map(fitted, ~ forecast(.x, h = 24 * 7)) %>%
map2_dbl(validation, ~ rmse_vec(truth = .y$demand, estimate = .x$mean)),
mae =
map(fitted, ~ forecast(.x, h = 24 * 7)) %>%
map2_dbl(validation, ~ mae_vec(truth = .y$demand, estimate = .x$mean)),
mape =
map(fitted, ~ forecast(.x, h = 24 * 7)) %>%
map2_dbl(validation, ~ mape_vec(truth = .y$demand, estimate = .x$mean)),
) %>%
arrange(src_sub_area, rmse, mae, mape)
train_model %>%
select(src_sub_area, ends_with("_name"), rmse, mae, mape)
Choose the model
#Create data input
data_input <- c("sxk97","sxk9s","sxk9e")
data_input2 <- c("sxk97","sxk9s")
data_input3 <- c("sxk97","sxk9e")
#Model 1 -> use mae minumum error
train_model1 <- train_model %>%
select(-fitted) %>%
group_by(src_sub_area) %>%
filter(src_sub_area %in% data_input & mae == min(mae)) %>%
ungroup()
train_model1
#Combine data train & validation
full_train_model1 <- train_model1 %>%
mutate(full_data = map2(.x = train, .y = validation, ~bind_rows(.x,.y))) %>%
select(src_sub_area, full_data, everything(),-train,-validation)
#Processing the model
full_train_model1 %<>%
mutate(
params = map(full_data, ~ list(x = .x)),
data = invoke_map(data_fun, params),
params = map(data, ~ list(x = .x)),
fitted = invoke_map(model, params)
) %>%
select(-data, -params)
full_train_model1
#Forecasting the data from model 1
full_train_model1 %<>%
mutate(forecast =
map(fitted, ~ forecast(.x, h = 24 * 7)) %>%
map2(full_data, ~ tibble(
start_time = timetk::tk_make_future_timeseries(.y$start_time, 24 * 7),
demand = as.vector(.x$mean)
))
)
full_train_model1
#Unest the data and save to CSV
model1 <- full_train_model1 %>%
select(src_sub_area, actual = full_data, forecast) %>%
gather(key, value, -src_sub_area) %>%
unnest(value) %>%
mutate(demand = rec_revert(demand, rec, src_sub_area))
model1 <- model1 %>%
filter(key == "forecast") %>%
select(-key)
cols <- c("src_sub_area","datetime","demand")
colnames(model1) <- cols
submit <- model1 %>%
mutate(src_sub_area = as.factor(src_sub_area))
write_csv(submit,"model_1.csv")
Setelah melakukan pengecekan mae melalui shinyapps algoritma, ternyata dari 3 area hanya satu yang lolos MAE < 11 yaitu area sxk97 dengan MAE = 9.32.
#Model 2 -> use mape minumum error
train_model2 <- train_model %>%
select(-fitted) %>%
group_by(src_sub_area) %>%
filter(src_sub_area %in% data_input & mape == min(mape)) %>%
ungroup()
train_model2
#Combine data train & validation
full_train_model2 <- train_model2 %>%
mutate(full_data = map2(.x = train, .y = validation, ~bind_rows(.x,.y))) %>%
select(src_sub_area, full_data, everything(),-train,-validation)
#Processing the model
full_train_model2 %<>%
mutate(
params = map(full_data, ~ list(x = .x)),
data = invoke_map(data_fun, params),
params = map(data, ~ list(x = .x)),
fitted = invoke_map(model, params)
) %>%
select(-data, -params)
full_train_model2
#Forecasting the data from model 1
full_train_model2 %<>%
mutate(forecast =
map(fitted, ~ forecast(.x, h = 24 * 7)) %>%
map2(full_data, ~ tibble(
start_time = timetk::tk_make_future_timeseries(.y$start_time, 24 * 7),
demand = as.vector(.x$mean)
))
)
full_train_model2
#Unest the data and save to CSV
model2 <- full_train_model2 %>%
select(src_sub_area, actual = full_data, forecast) %>%
gather(key, value, -src_sub_area) %>%
unnest(value) %>%
mutate(demand = rec_revert(demand, rec, src_sub_area))
model2 <- model2 %>%
filter(key == "forecast") %>%
select(-key)
cols <- c("src_sub_area","datetime","demand")
colnames(model2) <- cols
submit <- model2 %>%
mutate(src_sub_area = as.factor(src_sub_area))
write_csv(submit,"model_2.csv")
Pada percobaan model kedua ini masih diteumakan satu area sxk9e yang nilainya masih diatas 11 walaupun secara mean all area sudah dibawah 11.
#Model 3 -> use RMSE minumum error
train_model3 <- train_model %>%
select(-fitted) %>%
group_by(src_sub_area) %>%
filter(src_sub_area %in% data_input & rmse == min(rmse)) %>%
ungroup()
train_model3
#Combine data train & validation
full_train_model3 <- train_model3 %>%
mutate(full_data = map2(.x = train, .y = validation, ~bind_rows(.x,.y))) %>%
select(src_sub_area, full_data, everything(),-train,-validation)
#Processing the model
full_train_model3 %<>%
mutate(
params = map(full_data, ~ list(x = .x)),
data = invoke_map(data_fun, params),
params = map(data, ~ list(x = .x)),
fitted = invoke_map(model, params)
) %>%
select(-data, -params)
full_train_model3
#Forecasting the data from model 1
full_train_model3 %<>%
mutate(forecast =
map(fitted, ~ forecast(.x, h = 24 * 7)) %>%
map2(full_data, ~ tibble(
start_time = timetk::tk_make_future_timeseries(.y$start_time, 24 * 7),
demand = as.vector(.x$mean)
))
)
full_train_model3
#Unest the data and save to CSV
model3 <- full_train_model3 %>%
select(src_sub_area, actual = full_data, forecast) %>%
gather(key, value, -src_sub_area) %>%
unnest(value) %>%
mutate(demand = rec_revert(demand, rec, src_sub_area))
model3_viz <- model3
model3 <- model3 %>%
filter(key == "forecast") %>%
select(-key)
cols <- c("src_sub_area","datetime","demand")
colnames(model3) <- cols
submit <- model3 %>%
mutate(src_sub_area = as.factor(src_sub_area))
write_csv(submit,"model_3.csv")
Hasil dari model 3 dengan melihat paramater RMSE cukup memuaskan dengan nilai MAE, 1. sxk97 = 9.61 2. sxk9s = 8.26 3. sxk9e = 9.99 4. All Area = 9.29
Dari tahapan pembuatan model disini dipilih model 3 dikarekan secara nilai MAE data test melalui shinyapps algoritma paling baik dengan score total 9.29.
model_eval <- model3_viz %>%
ggplot(aes(x = start_time, y = demand, colour = key)) +
geom_line() +
labs(x = NULL, y = NULL, colour = NULL) +
facet_wrap(~ src_sub_area, scale = "free", ncol = 1) +
labs(x = "", y = "Demand",title = "Final Model Prediction Comparison")+
tidyquant::theme_tq() +
tidyquant::scale_colour_tq()
ggplotly(model_eval)
model3_viz %>%
filter(key == "forecast") %>%
select(-key) %>%
select(src_sub_area, datetime = start_time, demand) %>%
ggplot(mapping = aes(x = datetime, y= demand)) +
geom_line()+
labs(x = NULL, y = NULL, colour = NULL) +
facet_wrap(~ src_sub_area, scale = "free", ncol = 1) +
labs(x = "", y = "Demand",title = "Demand Forecast for December 3rd - December 9th 2017")+
tidyquant::theme_tq() +
tidyquant::scale_colour_tq()
Asumsi checking disini ingin mengetahui apakah ada korelasi antar data yang digunakan dan perlu digaris bawahi adalah model forecasting time-series yang baik adalah pada saat tidak ada korelasi antar data.
Metode yang akan digunakan adalah Ljung-Box.
Testing hipotesis auto-corelattion : H0: no-autocorrelation <- p-value > 0.05 H1: autocorrelation <- p-value < 0.05
Area sx97
Box.test(residuals(full_train_model3$fitted[[1]]), type = "Ljung-Box")
##
## Box-Ljung test
##
## data: residuals(full_train_model3$fitted[[1]])
## X-squared = 0.0063355, df = 1, p-value = 0.9366
p-value = 0.936, terima H0 tidak ada korelasi
Area sxk9e
Box.test(residuals(full_train_model3$fitted[[2]]), type = "Ljung-Box")
##
## Box-Ljung test
##
## data: residuals(full_train_model3$fitted[[2]])
## X-squared = 0.015738, df = 1, p-value = 0.9002
p-value = 0.900, terima H0 tidak ada korelasi
Area sxk9s
Box.test(residuals(full_train_model3$fitted[[3]]), type = "Ljung-Box")
##
## Box-Ljung test
##
## data: residuals(full_train_model3$fitted[[3]])
## X-squared = 1.8519e-05, df = 1, p-value = 0.9966
p-value = 0.704, terima H0 tidak ada korelasi
Area sxk97
ggplotly(gghistogram(residuals(full_train_model3$fitted[[1]]))+
labs(title="Check Normality Residual for sxk97",x="Residuals")+
theme_minimal()+
labs(y = NULL))
Untuk residul area sxk97 terdistribusi normal dilihat dari data dominan di 0
Area sxk9e
ggplotly(gghistogram(residuals(full_train_model3$fitted[[2]]))+
labs(title="Check Normality Residual for sxk9e",x="Residuals")+
theme_minimal()+
labs(y = NULL))
Untuk residul area sxk9e terdistribusi normal dilihat dari data dominan di 0
Area sxk9s
ggplotly(gghistogram(residuals(full_train_model3$fitted[[3]]))+
labs(title="Check Normality Residual for sxk9s",x="Residuals")+
theme_minimal()+
labs(y = NULL))
Untuk residul area sxk9s terdistribusi normal dilihat dari data dominan di 0
Untuk menjawab busines question terkait forecasting di minggu selanjutnya, model yang terbaik yang dapat memberikan prediksi adalah Model 4 dimana merupakan kombinasi dari model time series. Pada model 4, area sxk97 menggunakan function msts dengan model tbats, area sxk9e menggunakan function msts dengan model tbats, dan area sxk9s menggunakan function ts dengan model tbats. Hasil pengecekan asumsi pun seleuruh area lulus uji asumsi tidak ada korelasi dan residual data terdistribusi normal.