12

library(TSstudio)
data(USVSales)

ts_info(USVSales)

##  The USVSales series is a ts object with 1 variable and 528 observations
##  Frequency: 12 
##  Start time: 1976 1 
##  End time: 2019 12

ts_plot(USVSales,
        title = "US Total Monthly Vehicles Saves",
        Ytitle = "Thousands of units",
        Xtitle = "Year")

ts_decompose(USVSales)

USVSales_detrend<-USVSales - decompose(USVSales)$trend

ts_seasonal(USVSales_detrend, type = "box")

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations

ts_cor(USVSales)

ts_lags(USVSales, lags = c(12, 24, 36))

df <- ts_to_prophet(window(USVSales, start = c(2010,1)))
names(df) <- c("date", "y")
head(df)

##         date        y
## 1 2010-01-01  712.469
## 2 2010-02-01  793.362
## 3 2010-03-01 1083.953
## 4 2010-04-01  997.334
## 5 2010-05-01 1117.570
## 6 2010-06-01 1000.455

ts_plot(df,
        title = "US Total Monthly Vehicle Sales (Subset)",
        Ytitle = "Thousands os units",
        Xtitle = "Year")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

df <- df %>% mutate(month = factor(lubridate::month(date, label = TRUE), ordered = FALSE),
                    lag12 = lag(y, n = 12)) %>%
  filter(!is.na(lag12))

df$trend <- 1:nrow(df)

df$trend_sqr <- df$trend ^ 2

str(df)

## 'data.frame':    108 obs. of  6 variables:
##  $ date     : Date, format: "2011-01-01" "2011-02-01" ...
##  $ y        : num  836 1007 1277 1174 1081 ...
##  $ month    : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ lag12    : num  712 793 1084 997 1118 ...
##  $ trend    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ trend_sqr: num  1 4 9 16 25 36 49 64 81 100 ...

h <- 12

train_df <- df[1:(nrow(df) - h), ]

test_df <- df[(nrow(df) - h + 1):nrow(df), ]

forecast_df <- data.frame(date = seq.Date(from = max(df$date) + lubridate::month(1), length.out = h, by = "month"),
                          trend = seq(from = max(df$trend) + 1, length.out = h, by = 1))

forecast_df$trend_sqr <- forecast_df$trend ^ 2

forecast_df$month <- factor(lubridate::month(forecast_df$date, label = TRUE), ordered = FALSE)

forecast_df$lag12 <- tail(df$y, 12)

lr <- lm(y ~ month + lag12 + trend + trend_sqr, data = train_df)

summary(lr)

## 
## Call:
## lm(formula = y ~ month + lag12 + trend + trend_sqr, data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -146.625  -38.997    0.111   39.196  112.577 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 542.93505   72.59490   7.479 7.91e-11 ***
## monthFeb    112.73160   34.16141   3.300 0.001439 ** 
## monthMar    299.20932   54.24042   5.516 4.03e-07 ***
## monthApr    182.52406   42.53129   4.292 4.88e-05 ***
## monthMay    268.75603   51.28464   5.240 1.24e-06 ***
## monthJun    224.66897   44.26374   5.076 2.41e-06 ***
## monthJul    177.88564   42.21898   4.213 6.49e-05 ***
## monthAug    241.63260   47.00693   5.140 1.86e-06 ***
## monthSep    152.99058   37.04199   4.130 8.76e-05 ***
## monthOct    125.16484   35.04896   3.571 0.000601 ***
## monthNov    127.97288   34.18772   3.743 0.000338 ***
## monthDec    278.67994   51.09552   5.454 5.21e-07 ***
## lag12         0.33906    0.10738   3.158 0.002236 ** 
## trend         7.73667    1.72415   4.487 2.36e-05 ***
## trend_sqr    -0.05587    0.01221  -4.576 1.69e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.6 on 81 degrees of freedom
## Multiple R-squared:  0.9198, Adjusted R-squared:  0.9059 
## F-statistic: 66.36 on 14 and 81 DF,  p-value: < 2.2e-16

test_df$yhat <- predict(lr, newdata = test_df)

mape_lr <- mean(abs(test_df$y - test_df$yhat) / test_df$y)

mape_lr

## [1] 0.03594578

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

h2o.init(ip = "127.0.0.1")

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         50 seconds 475 milliseconds 
##     H2O cluster timezone:       UTC 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.40.0.1 
##     H2O cluster version age:    2 months and 23 days 
##     H2O cluster name:           H2O_started_from_R_r2020846_ghd421 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.18 GB 
##     H2O cluster total cores:    1 
##     H2O cluster allowed cores:  1 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          127.0.0.1 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.0 (2023-04-21)

h2o.init(max_mem_size = "16G")

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         50 seconds 513 milliseconds 
##     H2O cluster timezone:       UTC 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.40.0.1 
##     H2O cluster version age:    2 months and 23 days 
##     H2O cluster name:           H2O_started_from_R_r2020846_ghd421 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.18 GB 
##     H2O cluster total cores:    1 
##     H2O cluster allowed cores:  1 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.0 (2023-04-21)

train_h <- as.h2o(train_df)

## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher.  Please upgrade to take advangage of
## data.table speedups.

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

test_h <- as.h2o(test_df)

## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher.  Please upgrade to take advangage of
## data.table speedups.

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

forecast_h <- as.h2o(forecast_df)

## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher.  Please upgrade to take advangage of
## data.table speedups.

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

x <- c("month", "lag12", "trend", "trend_sqr")

y <- "y"

rf_md <- h2o.randomForest(training_frame = train_h,
                          nfolds = 5,
                          x = x,
                          y = y,
                          ntrees = 500,
                          stopping_rounds = 10,
                          stopping_metric = "RMSE",
                          score_each_iteration = TRUE,
                          stopping_tolerance = 0.0001,
                          seed = 1234)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |======================================================================| 100%

h2o.varimp_plot (rf_md)

rf_md@model$model_summary

## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              41                       41               31588         8
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1        12   10.04878         45         66    56.70732

library(plotly)

## Loading required package: ggplot2

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

tree_score <- rf_md@model$scoring_history$training_rmse
plot_ly(x = seq_along(tree_score), y = tree_score,
        type = "scatter", mode = "line") %>%
  layout(title = "Random Forest Model - Trained Score History",
         yaxis = list(title = "RMSE"),
         xaxis = list(title = "Num. of Trees"))

## Warning: Ignoring 1 observations

12

Nicolás Díaz

2023-04-29