Seccion 12

Geronimo Rojas

2023-04-30

Capitulo 12

Chapter 12 Code

# -------- Code Chank 1 --------
library(TSstudio)
## Warning: package 'TSstudio' was built under R version 4.2.3
data(USVSales)
# -------- Code Chank 2 --------
ts_info(USVSales)
##  The USVSales series is a ts object with 1 variable and 528 observations
##  Frequency: 12 
##  Start time: 1976 1 
##  End time: 2019 12
# -------- Code Chank 3 --------
ts_plot(USVSales,
        title = "US Total Monthly Vehicle Sales",
        Ytitle = "Thousands of Units",
        Xtitle = "Year")
# -------- Code Chank 4 --------
ts_decompose(USVSales)
# -------- Code Chank 5 --------
USVSales_detrend <- USVSales - decompose(USVSales)$trend

ts_seasonal(USVSales_detrend, type = "box")
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations
# -------- Code Chank 6 --------
acf(USVSales)

# -------- Code Chank 7 --------
ts_lags(USVSales, lags = c(12, 24, 36))
# -------- Code Chank 8 --------
df <- ts_to_prophet(window(USVSales, start = c(2010,1)))

names(df) <- c("date", "y")

head(df)
##         date        y
## 1 2010-01-01  712.469
## 2 2010-02-01  793.362
## 3 2010-03-01 1083.953
## 4 2010-04-01  997.334
## 5 2010-05-01 1117.570
## 6 2010-06-01 1000.455
# -------- Code Chank 9 --------
ts_plot(df,
        title = "US Total Monthly Vehicle Sales (Subset)",
        Ytitle = "Thousands of Units",
        Xtitle = "Year")
# -------- Code Chank 10 --------
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
df <- df %>% mutate(month = factor(lubridate::month(date, label = TRUE), ordered = FALSE),
                    lag12 = lag(y, n = 12)) %>%
  filter(!is.na(lag12))
# -------- Code Chank 11 --------
df$trend <- 1:nrow(df)
df$trend_sqr <- df$trend ^ 2
# -------- Code Chank 12 --------
str(df)
## 'data.frame':    108 obs. of  6 variables:
##  $ date     : Date, format: "2011-01-01" "2011-02-01" ...
##  $ y        : num  836 1007 1277 1174 1081 ...
##  $ month    : Factor w/ 12 levels "ene","feb","mar",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ lag12    : num  712 793 1084 997 1118 ...
##  $ trend    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ trend_sqr: num  1 4 9 16 25 36 49 64 81 100 ...
# -------- Code Chank 13 --------
h <- 12
train_df <- df[1:(nrow(df) - h), ]
test_df <- df[(nrow(df) - h + 1):nrow(df), ]
# -------- Code Chank 14 --------
forecast_df <- data.frame(date = seq.Date(from = max(df$date) + lubridate::month(1),
                                          length.out = h, by = "month"),
                          trend = seq(from = max(df$trend) + 1, length.out = h, by = 1))
forecast_df$trend_sqr <- forecast_df$trend ^ 2

# to avoid conflict with the h2o `month` function use the "lubridate::month" to explicly call the month from the lubridate function 
forecast_df$month <- factor(lubridate::month(forecast_df$date, label = TRUE), ordered= FALSE) 
forecast_df$lag12 <- tail(df$y, 12)

# -------- Code Chank 15 --------
lr <- lm(y ~ month + lag12 + trend + trend_sqr, data = train_df)
# -------- Code Chank 16 --------
summary(lr)
## 
## Call:
## lm(formula = y ~ month + lag12 + trend + trend_sqr, data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -146.625  -38.997    0.111   39.196  112.577 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 542.93505   72.59490   7.479 7.91e-11 ***
## monthfeb    112.73160   34.16141   3.300 0.001439 ** 
## monthmar    299.20932   54.24042   5.516 4.03e-07 ***
## monthabr    182.52406   42.53129   4.292 4.88e-05 ***
## monthmay    268.75603   51.28464   5.240 1.24e-06 ***
## monthjun    224.66897   44.26374   5.076 2.41e-06 ***
## monthjul    177.88564   42.21898   4.213 6.49e-05 ***
## monthago    241.63260   47.00693   5.140 1.86e-06 ***
## monthsept   152.99058   37.04199   4.130 8.76e-05 ***
## monthoct    125.16484   35.04896   3.571 0.000601 ***
## monthnov    127.97288   34.18772   3.743 0.000338 ***
## monthdic    278.67994   51.09552   5.454 5.21e-07 ***
## lag12         0.33906    0.10738   3.158 0.002236 ** 
## trend         7.73667    1.72415   4.487 2.36e-05 ***
## trend_sqr    -0.05587    0.01221  -4.576 1.69e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.6 on 81 degrees of freedom
## Multiple R-squared:  0.9198, Adjusted R-squared:  0.9059 
## F-statistic: 66.36 on 14 and 81 DF,  p-value: < 2.2e-16
# -------- Code Chank 17 --------
test_df$yhat <- predict(lr, newdata = test_df)

mape_lr <- mean(abs(test_df$y - test_df$yhat) / test_df$y)
mape_lr
## [1] 0.03594578
# -------- Code Chank 18 --------
#install.packages("h2o")
library(h2o)
## Warning: package 'h2o' was built under R version 4.2.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
h2o.init(max_mem_size = "16G")
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 45 minutes 
##     H2O cluster timezone:       America/Bogota 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.40.0.1 
##     H2O cluster version age:    2 months and 24 days 
##     H2O cluster name:           H2O_started_from_R_pc_eaf525 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   9.93 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.2 (2022-10-31 ucrt)
# -------- Code Chank 19--------
train_h <- as.h2o(train_df)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_h <- as.h2o(test_df)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
forecast_h <- as.h2o(forecast_df)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# -------- Code Chank 20 --------
x <- c("month", "lag12", "trend", "trend_sqr")
y <- "y"
# -------- Code Chank 21 --------
rf_md <- h2o.randomForest(training_frame = train_h,
                          nfolds = 5,
                          x = x,
                          y = y,
                          ntrees = 500,
                          stopping_rounds = 10,
                          stopping_metric = "RMSE",
                          score_each_iteration = TRUE,
                          stopping_tolerance = 0.0001,
                          seed = 1234)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |======================================================================| 100%
# -------- Code Chank 22 --------
h2o.varimp_plot(rf_md)

# -------- Code Chank 23 --------
rf_md@model$model_summary
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              41                       41               31588         8
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1        12   10.04878         45         66    56.70732
# -------- Code Chank 24 --------
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
tree_score <- rf_md@model$scoring_history$training_rmse
plot_ly(x = seq_along(tree_score), y = tree_score,
        type = "scatter", mode = "line") %>%
  layout(title = "Random Forest Model - Trained Score History",
         yaxis = list(title = "RMSE"),
         xaxis = list(title = "Num. of Trees"))
## Warning: Ignoring 1 observations
# -------- Code Chank 25 --------
test_h$pred_rf <- h2o.predict(rf_md, test_h)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# -------- Code Chank 26--------
test_1 <- as.data.frame(test_h)
# -------- Code Chank 27 --------
mape_rf <- mean(abs(test_1$y - test_1$pred_rf) / test_1$y)
mape_rf
## [1] 0.04647164
# -------- Code Chank 28 --------
search_criteria_rf <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "rmse",
  stopping_tolerance = 0.0001,
  stopping_rounds = 10,
  max_runtime_secs = 60 * 20
)

hyper_params_rf <- list(mtries = c(2, 3, 4),
                        sample_rate = c(0.632, 0.8, 0.95),
                        col_sample_rate_per_tree = c(0.5, 0.9, 1.0),
                        max_depth = c(seq(1, 30, 3)),
                        min_rows = c(1, 2, 5, 10))

search_criteria_rf <- list(strategy = "RandomDiscrete",
                           stopping_metric = "rmse",
                           stopping_tolerance = 0.0001,
                           stopping_rounds = 10,
                           max_runtime_secs = 60 * 20)

#rf2 <- h2o.grid(algorithm = "randomForest",
 #               search_criteria = search_criteria_rf,
  #              hyper_params = hyper_params_rf,
   #             x = x,
    #            y = y,
     #          training_frame = train_h,
      #          ntrees = 5000,
       #        nfolds = 5,
        #        grid_id = "rf_grid",
         #       seed = 1234)

# -------- Code Chank 29 --------
rf2_grid_search <- h2o.getGrid(grid_id = "rf_grid",
                               sort_by = "rmse",
                               decreasing = FALSE)

rf_grid_model <- h2o.getModel(rf2_grid_search@model_ids[[1]])


test_h$rf_grid  <- h2o.predict(rf_grid_model, test_h)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
mape_rf2 <- mean(abs(test_1$y - test_1$rf_grid) / test_1$y)
mape_rf2
## [1] NaN
# -------- Code Chank 30 --------
plot_ly(data = test_1) %>%
  add_lines(x = ~ date, y = ~y, name = "Actual") %>%
  add_lines(x = ~ date, y = ~ yhat, name = "Linear Regression", line = list(dash = "dot")) %>%
  add_lines(x = ~ date, y = ~ pred_rf, name = "Random Forest", line = list(dash = "dash")) %>%
  
  layout(title = "Total Vehicle Sales - Actual vs. Prediction (Random Forest)",
         yaxis = list(title = "Thousands of Units"),
         xaxis = list(title = "Month"))
# -------- Code Chank 31 --------
gbm_md <- h2o.gbm(
  training_frame = train_h,
  nfolds = 5,
  x = x,
  y = y,
  max_depth = 20,
  distribution = "gaussian",
  ntrees = 500,
  learn_rate = 0.1,
  score_each_iteration = TRUE
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
# -------- Code Chank 32 --------

h2o.varimp_plot(gbm_md)

test_h$pred_gbm  <- h2o.predict(gbm_md, test_h)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_1 <- as.data.frame(test_h)

mape_gbm <- mean(abs(test_1$y - test_1$pred_gbm) / test_1$y)
mape_gbm
## [1] 0.03857898
# -------- Code Chank 33 --------
plot_ly(data = test_1) %>%
  add_lines(x = ~ date, y = ~y, name = "Actual") %>%
  add_lines(x = ~ date, y = ~ yhat, name = "Linear Regression", line = list(dash = "dot")) %>%
  add_lines(x = ~ date, y = ~ pred_gbm, name = "Gradient Boosting Machine", line = list(dash = "dash")) %>%
  layout(title = "Total Vehicle Sales - Actual vs. Prediction (Gradient Boosting Machine)",
         yaxis = list(title = "Thousands of Units"),
         xaxis = list(title = "Month"))
# -------- Code Chank 34 --------
autoML1 <- h2o.automl(training_frame = train_h,
                      x = x,
                      y = y,
                      nfolds = 5,
                      max_runtime_secs = 60*20,
                      seed = 1234)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======                                                                |   8%
## 23:14:41.215: AutoML: XGBoost is not available; skipping it.
## 23:14:41.730: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 96.0.
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |===================================                                   |  51%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=====================================                                 |  54%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |============================================                          |  64%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |=================================================                     |  71%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |=============================================================         |  88%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |====================================================================  |  98%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================| 100%
# -------- Code Chank 35 --------
autoML1@leaderboard
##                                                  model_id     rmse      mse
## 1 StackedEnsemble_BestOfFamily_5_AutoML_3_20230502_231441 58.64951 3439.764
## 2 StackedEnsemble_BestOfFamily_3_AutoML_3_20230502_231441 60.10811 3612.985
## 3   DeepLearning_grid_1_AutoML_3_20230502_231441_model_21 60.77014 3693.010
## 4  DeepLearning_grid_1_AutoML_3_20230502_231441_model_115 60.89124 3707.743
## 5   DeepLearning_grid_1_AutoML_3_20230502_231441_model_17 61.63763 3799.197
## 6   DeepLearning_grid_1_AutoML_3_20230502_231441_model_37 61.93047 3835.384
##        mae      rmsle mean_residual_deviance
## 1 47.46628 0.04413580               3439.764
## 2 47.89576 0.04518533               3612.985
## 3 47.64741 0.04596416               3693.010
## 4 49.35682 0.04624308               3707.743
## 5 48.98060 0.04614691               3799.197
## 6 48.19092 0.04702943               3835.384
## 
## [247 rows x 6 columns]
test_h$pred_autoML  <- h2o.predict(autoML1@leader, test_h)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
test_1 <- as.data.frame(test_h)

mape_autoML <- mean(abs(test_1$y - test_1$pred_autoML) / test_1$y)
mape_autoML
## [1] 0.03865248
# -------- Code Chank 36 --------
plot_ly(data = test_1) %>%
  add_lines(x = ~ date, y = ~y, name = "Actual") %>%
  add_lines(x = ~ date, y = ~ yhat, name = "Linear Regression", line = list(dash = "dot")) %>%
  add_lines(x = ~ date, y = ~ pred_autoML, name = "autoML", line = list(dash = "dash")) %>%
  layout(title = "Total Vehicle Sales - Actual vs. Prediction (Auto ML Model)",
         yaxis = list(title = "Thousands of Units"),
         xaxis = list(title = "Month"))
# -------- Code Chank 37 --------
forecast_h$pred_gbm  <- h2o.predict(gbm_md, forecast_h)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
forecast_h$pred_rf  <- h2o.predict(rf_grid_model, forecast_h)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
forecast_h$pred_automl  <- h2o.predict(autoML1@leader, forecast_h)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
final_forecast <- as.data.frame(forecast_h)

plot_ly(x = df$date, y = df$y,
        type = "scatter",
        mode = "line", 
        name = "Actual") %>% 
  add_lines(x = final_forecast$date, y = final_forecast$pred_rf, name = "Random Forest") %>%
  add_lines(x = final_forecast$date, y = final_forecast$pred_gbm, name = "GBM") %>%
  add_lines(x = final_forecast$date, y = final_forecast$pred_automl, name = "Auto ML") %>%
  layout(title = "Total Vehicle Sales - Final Forecast",
         yaxis = list(title = "Thousands of Units", range = c(1100, 1750)),
         xaxis = list(title = "Month", range = c(as.Date("2016-01-01"), as.Date("2020-01-01"))))