Chapter 12 Code
# -------- Code Chank 1 --------
library(TSstudio)
## Warning: package 'TSstudio' was built under R version 4.2.3
data(USVSales)
# -------- Code Chank 2 --------
ts_info(USVSales)
## The USVSales series is a ts object with 1 variable and 528 observations
## Frequency: 12
## Start time: 1976 1
## End time: 2019 12
# -------- Code Chank 3 --------
ts_plot(USVSales,
title = "US Total Monthly Vehicle Sales",
Ytitle = "Thousands of Units",
Xtitle = "Year")
# -------- Code Chank 4 --------
ts_decompose(USVSales)
# -------- Code Chank 5 --------
USVSales_detrend <- USVSales - decompose(USVSales)$trend
ts_seasonal(USVSales_detrend, type = "box")
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
# -------- Code Chank 6 --------
acf(USVSales)

# -------- Code Chank 7 --------
ts_lags(USVSales, lags = c(12, 24, 36))
# -------- Code Chank 8 --------
df <- ts_to_prophet(window(USVSales, start = c(2010,1)))
names(df) <- c("date", "y")
head(df)
## date y
## 1 2010-01-01 712.469
## 2 2010-02-01 793.362
## 3 2010-03-01 1083.953
## 4 2010-04-01 997.334
## 5 2010-05-01 1117.570
## 6 2010-06-01 1000.455
# -------- Code Chank 9 --------
ts_plot(df,
title = "US Total Monthly Vehicle Sales (Subset)",
Ytitle = "Thousands of Units",
Xtitle = "Year")
# -------- Code Chank 10 --------
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
df <- df %>% mutate(month = factor(lubridate::month(date, label = TRUE), ordered = FALSE),
lag12 = lag(y, n = 12)) %>%
filter(!is.na(lag12))
# -------- Code Chank 11 --------
df$trend <- 1:nrow(df)
df$trend_sqr <- df$trend ^ 2
# -------- Code Chank 12 --------
str(df)
## 'data.frame': 108 obs. of 6 variables:
## $ date : Date, format: "2011-01-01" "2011-02-01" ...
## $ y : num 836 1007 1277 1174 1081 ...
## $ month : Factor w/ 12 levels "ene","feb","mar",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ lag12 : num 712 793 1084 997 1118 ...
## $ trend : int 1 2 3 4 5 6 7 8 9 10 ...
## $ trend_sqr: num 1 4 9 16 25 36 49 64 81 100 ...
# -------- Code Chank 13 --------
h <- 12
train_df <- df[1:(nrow(df) - h), ]
test_df <- df[(nrow(df) - h + 1):nrow(df), ]
# -------- Code Chank 14 --------
forecast_df <- data.frame(date = seq.Date(from = max(df$date) + lubridate::month(1),
length.out = h, by = "month"),
trend = seq(from = max(df$trend) + 1, length.out = h, by = 1))
forecast_df$trend_sqr <- forecast_df$trend ^ 2
# to avoid conflict with the h2o `month` function use the "lubridate::month" to explicly call the month from the lubridate function
forecast_df$month <- factor(lubridate::month(forecast_df$date, label = TRUE), ordered= FALSE)
forecast_df$lag12 <- tail(df$y, 12)
# -------- Code Chank 15 --------
lr <- lm(y ~ month + lag12 + trend + trend_sqr, data = train_df)
# -------- Code Chank 16 --------
summary(lr)
##
## Call:
## lm(formula = y ~ month + lag12 + trend + trend_sqr, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -146.625 -38.997 0.111 39.196 112.577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 542.93505 72.59490 7.479 7.91e-11 ***
## monthfeb 112.73160 34.16141 3.300 0.001439 **
## monthmar 299.20932 54.24042 5.516 4.03e-07 ***
## monthabr 182.52406 42.53129 4.292 4.88e-05 ***
## monthmay 268.75603 51.28464 5.240 1.24e-06 ***
## monthjun 224.66897 44.26374 5.076 2.41e-06 ***
## monthjul 177.88564 42.21898 4.213 6.49e-05 ***
## monthago 241.63260 47.00693 5.140 1.86e-06 ***
## monthsept 152.99058 37.04199 4.130 8.76e-05 ***
## monthoct 125.16484 35.04896 3.571 0.000601 ***
## monthnov 127.97288 34.18772 3.743 0.000338 ***
## monthdic 278.67994 51.09552 5.454 5.21e-07 ***
## lag12 0.33906 0.10738 3.158 0.002236 **
## trend 7.73667 1.72415 4.487 2.36e-05 ***
## trend_sqr -0.05587 0.01221 -4.576 1.69e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.6 on 81 degrees of freedom
## Multiple R-squared: 0.9198, Adjusted R-squared: 0.9059
## F-statistic: 66.36 on 14 and 81 DF, p-value: < 2.2e-16
# -------- Code Chank 17 --------
test_df$yhat <- predict(lr, newdata = test_df)
mape_lr <- mean(abs(test_df$y - test_df$yhat) / test_df$y)
mape_lr
## [1] 0.03594578
# -------- Code Chank 18 --------
#install.packages("h2o")
library(h2o)
## Warning: package 'h2o' was built under R version 4.2.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init(max_mem_size = "16G")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 45 minutes
## H2O cluster timezone: America/Bogota
## H2O data parsing timezone: UTC
## H2O cluster version: 3.40.0.1
## H2O cluster version age: 2 months and 24 days
## H2O cluster name: H2O_started_from_R_pc_eaf525
## H2O cluster total nodes: 1
## H2O cluster total memory: 9.93 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.2 (2022-10-31 ucrt)
# -------- Code Chank 19--------
train_h <- as.h2o(train_df)
##
|
| | 0%
|
|======================================================================| 100%
test_h <- as.h2o(test_df)
##
|
| | 0%
|
|======================================================================| 100%
forecast_h <- as.h2o(forecast_df)
##
|
| | 0%
|
|======================================================================| 100%
# -------- Code Chank 20 --------
x <- c("month", "lag12", "trend", "trend_sqr")
y <- "y"
# -------- Code Chank 21 --------
rf_md <- h2o.randomForest(training_frame = train_h,
nfolds = 5,
x = x,
y = y,
ntrees = 500,
stopping_rounds = 10,
stopping_metric = "RMSE",
score_each_iteration = TRUE,
stopping_tolerance = 0.0001,
seed = 1234)
##
|
| | 0%
|
|============================================================ | 85%
|
|======================================================================| 100%
# -------- Code Chank 22 --------
h2o.varimp_plot(rf_md)

# -------- Code Chank 23 --------
rf_md@model$model_summary
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 41 41 31588 8
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 12 10.04878 45 66 56.70732
# -------- Code Chank 24 --------
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
tree_score <- rf_md@model$scoring_history$training_rmse
plot_ly(x = seq_along(tree_score), y = tree_score,
type = "scatter", mode = "line") %>%
layout(title = "Random Forest Model - Trained Score History",
yaxis = list(title = "RMSE"),
xaxis = list(title = "Num. of Trees"))
## Warning: Ignoring 1 observations
# -------- Code Chank 25 --------
test_h$pred_rf <- h2o.predict(rf_md, test_h)
##
|
| | 0%
|
|======================================================================| 100%
# -------- Code Chank 26--------
test_1 <- as.data.frame(test_h)
# -------- Code Chank 27 --------
mape_rf <- mean(abs(test_1$y - test_1$pred_rf) / test_1$y)
mape_rf
## [1] 0.04647164
# -------- Code Chank 28 --------
search_criteria_rf <- list(
strategy = "RandomDiscrete",
stopping_metric = "rmse",
stopping_tolerance = 0.0001,
stopping_rounds = 10,
max_runtime_secs = 60 * 20
)
hyper_params_rf <- list(mtries = c(2, 3, 4),
sample_rate = c(0.632, 0.8, 0.95),
col_sample_rate_per_tree = c(0.5, 0.9, 1.0),
max_depth = c(seq(1, 30, 3)),
min_rows = c(1, 2, 5, 10))
search_criteria_rf <- list(strategy = "RandomDiscrete",
stopping_metric = "rmse",
stopping_tolerance = 0.0001,
stopping_rounds = 10,
max_runtime_secs = 60 * 20)
#rf2 <- h2o.grid(algorithm = "randomForest",
# search_criteria = search_criteria_rf,
# hyper_params = hyper_params_rf,
# x = x,
# y = y,
# training_frame = train_h,
# ntrees = 5000,
# nfolds = 5,
# grid_id = "rf_grid",
# seed = 1234)
# -------- Code Chank 29 --------
rf2_grid_search <- h2o.getGrid(grid_id = "rf_grid",
sort_by = "rmse",
decreasing = FALSE)
rf_grid_model <- h2o.getModel(rf2_grid_search@model_ids[[1]])
test_h$rf_grid <- h2o.predict(rf_grid_model, test_h)
##
|
| | 0%
|
|======================================================================| 100%
mape_rf2 <- mean(abs(test_1$y - test_1$rf_grid) / test_1$y)
mape_rf2
## [1] NaN
# -------- Code Chank 30 --------
plot_ly(data = test_1) %>%
add_lines(x = ~ date, y = ~y, name = "Actual") %>%
add_lines(x = ~ date, y = ~ yhat, name = "Linear Regression", line = list(dash = "dot")) %>%
add_lines(x = ~ date, y = ~ pred_rf, name = "Random Forest", line = list(dash = "dash")) %>%
layout(title = "Total Vehicle Sales - Actual vs. Prediction (Random Forest)",
yaxis = list(title = "Thousands of Units"),
xaxis = list(title = "Month"))
# -------- Code Chank 31 --------
gbm_md <- h2o.gbm(
training_frame = train_h,
nfolds = 5,
x = x,
y = y,
max_depth = 20,
distribution = "gaussian",
ntrees = 500,
learn_rate = 0.1,
score_each_iteration = TRUE
)
##
|
| | 0%
|
|============ | 17%
|
|=================== | 27%
|
|========================= | 35%
|
|=========================== | 38%
|
|============================= | 42%
|
|================================ | 45%
|
|================================== | 49%
|
|========================================== | 61%
|
|======================================================= | 79%
|
|================================================================ | 91%
|
|===================================================================== | 99%
|
|======================================================================| 100%
# -------- Code Chank 32 --------
h2o.varimp_plot(gbm_md)

test_h$pred_gbm <- h2o.predict(gbm_md, test_h)
##
|
| | 0%
|
|======================================================================| 100%
test_1 <- as.data.frame(test_h)
mape_gbm <- mean(abs(test_1$y - test_1$pred_gbm) / test_1$y)
mape_gbm
## [1] 0.03857898
# -------- Code Chank 33 --------
plot_ly(data = test_1) %>%
add_lines(x = ~ date, y = ~y, name = "Actual") %>%
add_lines(x = ~ date, y = ~ yhat, name = "Linear Regression", line = list(dash = "dot")) %>%
add_lines(x = ~ date, y = ~ pred_gbm, name = "Gradient Boosting Machine", line = list(dash = "dash")) %>%
layout(title = "Total Vehicle Sales - Actual vs. Prediction (Gradient Boosting Machine)",
yaxis = list(title = "Thousands of Units"),
xaxis = list(title = "Month"))
# -------- Code Chank 34 --------
autoML1 <- h2o.automl(training_frame = train_h,
x = x,
y = y,
nfolds = 5,
max_runtime_secs = 60*20,
seed = 1234)
##
|
| | 0%
|
|====== | 8%
## 23:14:41.215: AutoML: XGBoost is not available; skipping it.
## 23:14:41.730: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 96.0.
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================ | 64%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|======================================================================| 100%
# -------- Code Chank 35 --------
autoML1@leaderboard
## model_id rmse mse
## 1 StackedEnsemble_BestOfFamily_5_AutoML_3_20230502_231441 58.64951 3439.764
## 2 StackedEnsemble_BestOfFamily_3_AutoML_3_20230502_231441 60.10811 3612.985
## 3 DeepLearning_grid_1_AutoML_3_20230502_231441_model_21 60.77014 3693.010
## 4 DeepLearning_grid_1_AutoML_3_20230502_231441_model_115 60.89124 3707.743
## 5 DeepLearning_grid_1_AutoML_3_20230502_231441_model_17 61.63763 3799.197
## 6 DeepLearning_grid_1_AutoML_3_20230502_231441_model_37 61.93047 3835.384
## mae rmsle mean_residual_deviance
## 1 47.46628 0.04413580 3439.764
## 2 47.89576 0.04518533 3612.985
## 3 47.64741 0.04596416 3693.010
## 4 49.35682 0.04624308 3707.743
## 5 48.98060 0.04614691 3799.197
## 6 48.19092 0.04702943 3835.384
##
## [247 rows x 6 columns]
test_h$pred_autoML <- h2o.predict(autoML1@leader, test_h)
##
|
| | 0%
|
|======================================================================| 100%
test_1 <- as.data.frame(test_h)
mape_autoML <- mean(abs(test_1$y - test_1$pred_autoML) / test_1$y)
mape_autoML
## [1] 0.03865248
# -------- Code Chank 36 --------
plot_ly(data = test_1) %>%
add_lines(x = ~ date, y = ~y, name = "Actual") %>%
add_lines(x = ~ date, y = ~ yhat, name = "Linear Regression", line = list(dash = "dot")) %>%
add_lines(x = ~ date, y = ~ pred_autoML, name = "autoML", line = list(dash = "dash")) %>%
layout(title = "Total Vehicle Sales - Actual vs. Prediction (Auto ML Model)",
yaxis = list(title = "Thousands of Units"),
xaxis = list(title = "Month"))
# -------- Code Chank 37 --------
forecast_h$pred_gbm <- h2o.predict(gbm_md, forecast_h)
##
|
| | 0%
|
|======================================================================| 100%
forecast_h$pred_rf <- h2o.predict(rf_grid_model, forecast_h)
##
|
| | 0%
|
|======================================================================| 100%
forecast_h$pred_automl <- h2o.predict(autoML1@leader, forecast_h)
##
|
| | 0%
|
|======================================================================| 100%
final_forecast <- as.data.frame(forecast_h)
plot_ly(x = df$date, y = df$y,
type = "scatter",
mode = "line",
name = "Actual") %>%
add_lines(x = final_forecast$date, y = final_forecast$pred_rf, name = "Random Forest") %>%
add_lines(x = final_forecast$date, y = final_forecast$pred_gbm, name = "GBM") %>%
add_lines(x = final_forecast$date, y = final_forecast$pred_automl, name = "Auto ML") %>%
layout(title = "Total Vehicle Sales - Final Forecast",
yaxis = list(title = "Thousands of Units", range = c(1100, 1750)),
xaxis = list(title = "Month", range = c(as.Date("2016-01-01"), as.Date("2020-01-01"))))