Code requires these packages:
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.10 ✔ recipes 1.3.1
## ✔ dials 1.4.2 ✔ rsample 1.3.1
## ✔ dplyr 1.1.4 ✔ tailor 0.1.0
## ✔ ggplot2 4.0.0 ✔ tidyr 1.3.1
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ purrr 1.1.0 ✔ yardstick 1.3.2
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tailor' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:rsample':
##
## calibration
## The following object is masked from 'package:purrr':
##
## lift
theme_set(theme_bw())
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/chicago.RData")
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/stations.RData")
holidays <-
c("USNewYearsDay", "Jan02_Mon_Fri", "USMLKingsBirthday",
"USPresidentsDay", "USMemorialDay", "USIndependenceDay",
"Jul03_Mon_Fri", "Jul05_Mon_Fri", "USLaborDay", "USThanksgivingDay",
"Day_after_Thx", "ChristmasEve", "USChristmasDay", "Dec26_wkday",
"Dec31_Mon_Fri")
common_holiday <-
apply(training %>% dplyr::select(one_of(holidays)), 1,
function(x) ifelse(any(x == 1), 1, 0))
training <-
training %>%
mutate(
holiday = common_holiday,
weekday = ifelse(dow %in% c("Sat", "Sun"), 0, 1)
)
get_resid <- function(terms, next_term, return_mod = FALSE) {
ctrl$verboseIter <- FALSE
ctrl$predictionBounds <- c(0, NA)
set.seed(4194)
mod <- train(s_40380 ~ .,
data = training[, c("s_40380", terms)],
method = "lm",
metric = "RMSE",
maximize = FALSE,
trControl = ctrl)
x_mod <- train(as.formula(paste(next_term,"~ .")),
data = training[, c(terms, next_term)],
method = "lm",
metric = "RMSE",
maximize = FALSE,
trControl = ctrl)
if(!return_mod) {
out <- mod$pred
out$Resample <- ymd(out$Resample)
out$Date <- train_days[out$rowIndex]
out$Month <- training$month[out$rowIndex]
out$holiday <- training$holiday[out$rowIndex]
out$weekday <- training$weekday[out$rowIndex]
out$Residual <- out$obs - out$pred
out$xResidual <- x_mod$pred$obs - x_mod$pred$pred
} else out <- mod
out
}
# There will be a warning about the "outcome only has two possible values".
# This can be ignored.
base_resid <-
get_resid(terms = c("year", "month", "week"), next_term = "weekday") %>%
mutate(Model = "Base Model")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
pow_resid <-
get_resid(terms = c("year", "month", "week", "weekday"), next_term = "l14_40380") %>%
mutate(Model = "Base + Part of Week")
l14_resid <-
get_resid(
terms = c("year", "month", "week", "weekday", "l14_40380"),
next_term = "holiday"
) %>%
mutate(Model = "Base + Part of Week + 14-Day Lag")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
hol_resid <-
get_resid(terms = c("year", "month", "week", "weekday", "l14_40380", "holiday"),
next_term = "l14_40370") %>%
mutate(Model = "Base + Part of Week + 14-Day Lag + Holiday")
mod_lev <- c("Base Model", "Base + Part of Week",
"Base + Part of Week + 14-Day Lag",
"Base + Part of Week + 14-Day Lag + Holiday")
model_resid <-
bind_rows(base_resid, pow_resid, l14_resid, hol_resid) %>%
mutate(
Model = factor(Model, levels = mod_lev),
holiday = ifelse(holiday == 1, "yes", "no"),
weekday = ifelse(weekday == 0, "Weekend", "Weekday")
)
resid_hists <-
ggplot(model_resid, aes(x = Residual)) +
geom_vline(xintercept = 0, lty = 2, col = "darkgreen") +
geom_histogram(binwidth = 0.5, col = rgb(1, 1, 1, 0), fill = "blue", alpha = .5) +
facet_wrap(~Model, ncol = 1) +
xlab("Resampling Residual") +
ylab("Count") +
ggtitle("(a)")
day_resid <-
base_resid %>%
mutate(weekday = ifelse(weekday == 0, "Weekend", "Weekday")) %>%
ggplot(aes(x = xResidual, y = Residual)) +
geom_smooth(se = FALSE, method = lm, col = "black") +
geom_point(aes(col = weekday, shape = weekday), alpha = .5) +
xlab("POW Model Residuals") +
ylab("Base Model \n Residuals \n") +
theme(
legend.position = c(.2, .8),
legend.background = element_blank(),
legend.title = element_blank()
) +
ggtitle("(b)")
l14_PR_resid <-
ggplot(pow_resid, aes(x = xResidual, y = Residual)) +
geom_point(alpha = .5) +
xlab("14-day Lag Model Residuals") +
ylab("Base + POW Model \n Residuals \n") +
ggtitle("(c)")
hol_PR_resid <-
l14_resid %>%
mutate(holiday = ifelse(holiday == 1, "yes", "no")) %>%
ggplot(aes(x = xResidual, y = Residual)) +
geom_smooth(se = FALSE, method = lm, col = "black") +
geom_point(aes(col = holiday, shape = holiday), alpha = .5) +
xlab("Holiday Model Residuals") +
ylab("Base + POW + \n 14-day Lag Model \n Residuals") +
theme(
legend.position = c(.2, .25),
legend.background = element_blank(),
legend.title = element_blank()
) +
ggtitle("(d)")
# https://stackoverflow.com/questions/8112208/how-can-i-obtain-an-unbalanced-grid-of-ggplots
# grid.arrange(resid_hists, day_resid, l14_PR_resid, hol_PR_resid,
# layout_matrix = cbind(c(1, 1, 1), c(2, 3, 4)))
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_Indonesia.utf8 LC_CTYPE=English_Indonesia.utf8
## [3] LC_MONETARY=English_Indonesia.utf8 LC_NUMERIC=C
## [5] LC_TIME=English_Indonesia.utf8
##
## time zone: Asia/Jakarta
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] caret_7.0-1 lattice_0.22-6 gridExtra_2.3 lubridate_1.9.4
## [5] yardstick_1.3.2 workflowsets_1.1.1 workflows_1.3.0 tune_2.0.0
## [9] tidyr_1.3.1 tailor_0.1.0 rsample_1.3.1 recipes_1.3.1
## [13] purrr_1.1.0 parsnip_1.3.3 modeldata_1.5.1 infer_1.0.9
## [17] ggplot2_4.0.0 dplyr_1.1.4 dials_1.4.2 scales_1.4.0
## [21] broom_1.0.10 tidymodels_1.4.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4041.110 farver_2.1.2
## [4] S7_0.2.0 fastmap_1.2.0 pROC_1.19.0.1
## [7] digest_0.6.37 rpart_4.1.23 timechange_0.3.0
## [10] lifecycle_1.0.4 survival_3.6-4 magrittr_2.0.3
## [13] compiler_4.4.1 rlang_1.1.6 sass_0.4.9
## [16] tools_4.4.1 yaml_2.3.10 data.table_1.16.2
## [19] knitr_1.49 plyr_1.8.9 DiceDesign_1.10
## [22] RColorBrewer_1.1-3 withr_3.0.2 stats4_4.4.1
## [25] nnet_7.3-19 grid_4.4.1 future_1.67.0
## [28] iterators_1.0.14 globals_0.18.0 MASS_7.3-60.2
## [31] cli_3.6.5 rmarkdown_2.29 generics_0.1.3
## [34] rstudioapi_0.17.1 future.apply_1.20.0 reshape2_1.4.4
## [37] cachem_1.1.0 stringr_1.5.1 splines_4.4.1
## [40] parallel_4.4.1 vctrs_0.6.5 hardhat_1.4.2
## [43] Matrix_1.7-0 jsonlite_1.8.9 listenv_0.9.1
## [46] foreach_1.5.2 gower_1.0.2 jquerylib_0.1.4
## [49] glue_1.8.0 parallelly_1.45.1 codetools_0.2-20
## [52] stringi_1.8.4 gtable_0.3.6 GPfit_1.0-9
## [55] tibble_3.3.0 pillar_1.11.0 furrr_0.3.1
## [58] htmltools_0.5.8.1 ipred_0.9-15 lava_1.8.1
## [61] R6_2.5.1 lhs_1.2.0 evaluate_1.0.1
## [64] backports_1.5.0 bslib_0.8.0 class_7.3-22
## [67] Rcpp_1.0.13-1 nlme_3.1-164 prodlim_2025.04.28
## [70] xfun_0.49 ModelMetrics_1.2.2.2 pkgconfig_2.0.3