library(TSstudio)
data(USVSales)
ts_info(USVSales)
## The USVSales series is a ts object with 1 variable and 528 observations
## Frequency: 12
## Start time: 1976 1
## End time: 2019 12
ts_plot(USVSales,
title = "US Total Monthly Vehicles Saves",
Ytitle = "Thousands of units",
Xtitle = "Year")
ts_decompose(USVSales)
USVSales_detrend<-USVSales - decompose(USVSales)$trend
ts_seasonal(USVSales_detrend, type = "box")
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
ts_cor(USVSales)
ts_lags(USVSales, lags = c(12, 24, 36))
df <- ts_to_prophet(window(USVSales, start = c(2010,1)))
names(df) <- c("date", "y")
head(df)
## date y
## 1 2010-01-01 712.469
## 2 2010-02-01 793.362
## 3 2010-03-01 1083.953
## 4 2010-04-01 997.334
## 5 2010-05-01 1117.570
## 6 2010-06-01 1000.455
ts_plot(df,
title = "US Total Monthly Vehicle Sales (Subset)",
Ytitle = "Thousands os units",
Xtitle = "Year")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
df <- df %>% mutate(month = factor(lubridate::month(date, label = TRUE), ordered = FALSE),
lag12 = lag(y, n = 12)) %>%
filter(!is.na(lag12))
df$trend <- 1:nrow(df)
df$trend_sqr <- df$trend ^ 2
str(df)
## 'data.frame': 108 obs. of 6 variables:
## $ date : Date, format: "2011-01-01" "2011-02-01" ...
## $ y : num 836 1007 1277 1174 1081 ...
## $ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ lag12 : num 712 793 1084 997 1118 ...
## $ trend : int 1 2 3 4 5 6 7 8 9 10 ...
## $ trend_sqr: num 1 4 9 16 25 36 49 64 81 100 ...
h <- 12
train_df <- df[1:(nrow(df) - h), ]
test_df <- df[(nrow(df) - h + 1):nrow(df), ]
forecast_df <- data.frame(date = seq.Date(from = max(df$date) + lubridate::month(1), length.out = h, by = "month"),
trend = seq(from = max(df$trend) + 1, length.out = h, by = 1))
forecast_df$trend_sqr <- forecast_df$trend ^ 2
forecast_df$month <- factor(lubridate::month(forecast_df$date, label = TRUE), ordered = FALSE)
forecast_df$lag12 <- tail(df$y, 12)
lr <- lm(y ~ month + lag12 + trend + trend_sqr, data = train_df)
summary(lr)
##
## Call:
## lm(formula = y ~ month + lag12 + trend + trend_sqr, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -146.625 -38.997 0.111 39.196 112.577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 542.93505 72.59490 7.479 7.91e-11 ***
## monthFeb 112.73160 34.16141 3.300 0.001439 **
## monthMar 299.20932 54.24042 5.516 4.03e-07 ***
## monthApr 182.52406 42.53129 4.292 4.88e-05 ***
## monthMay 268.75603 51.28464 5.240 1.24e-06 ***
## monthJun 224.66897 44.26374 5.076 2.41e-06 ***
## monthJul 177.88564 42.21898 4.213 6.49e-05 ***
## monthAug 241.63260 47.00693 5.140 1.86e-06 ***
## monthSep 152.99058 37.04199 4.130 8.76e-05 ***
## monthOct 125.16484 35.04896 3.571 0.000601 ***
## monthNov 127.97288 34.18772 3.743 0.000338 ***
## monthDec 278.67994 51.09552 5.454 5.21e-07 ***
## lag12 0.33906 0.10738 3.158 0.002236 **
## trend 7.73667 1.72415 4.487 2.36e-05 ***
## trend_sqr -0.05587 0.01221 -4.576 1.69e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.6 on 81 degrees of freedom
## Multiple R-squared: 0.9198, Adjusted R-squared: 0.9059
## F-statistic: 66.36 on 14 and 81 DF, p-value: < 2.2e-16
test_df$yhat <- predict(lr, newdata = test_df)
mape_lr <- mean(abs(test_df$y - test_df$yhat) / test_df$y)
mape_lr
## [1] 0.03594578
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init(ip = "127.0.0.1")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 50 seconds 475 milliseconds
## H2O cluster timezone: UTC
## H2O data parsing timezone: UTC
## H2O cluster version: 3.40.0.1
## H2O cluster version age: 2 months and 23 days
## H2O cluster name: H2O_started_from_R_r2020846_ghd421
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.18 GB
## H2O cluster total cores: 1
## H2O cluster allowed cores: 1
## H2O cluster healthy: TRUE
## H2O Connection ip: 127.0.0.1
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.0 (2023-04-21)
h2o.init(max_mem_size = "16G")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 50 seconds 513 milliseconds
## H2O cluster timezone: UTC
## H2O data parsing timezone: UTC
## H2O cluster version: 3.40.0.1
## H2O cluster version age: 2 months and 23 days
## H2O cluster name: H2O_started_from_R_r2020846_ghd421
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.18 GB
## H2O cluster total cores: 1
## H2O cluster allowed cores: 1
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.0 (2023-04-21)
train_h <- as.h2o(train_df)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
test_h <- as.h2o(test_df)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
forecast_h <- as.h2o(forecast_df)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
x <- c("month", "lag12", "trend", "trend_sqr")
y <- "y"
rf_md <- h2o.randomForest(training_frame = train_h,
nfolds = 5,
x = x,
y = y,
ntrees = 500,
stopping_rounds = 10,
stopping_metric = "RMSE",
score_each_iteration = TRUE,
stopping_tolerance = 0.0001,
seed = 1234)
##
|
| | 0%
|
|========================= | 35%
|
|======================================================================| 100%
h2o.varimp_plot (rf_md)

rf_md@model$model_summary
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 41 41 31588 8
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 12 10.04878 45 66 56.70732
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
tree_score <- rf_md@model$scoring_history$training_rmse
plot_ly(x = seq_along(tree_score), y = tree_score,
type = "scatter", mode = "line") %>%
layout(title = "Random Forest Model - Trained Score History",
yaxis = list(title = "RMSE"),
xaxis = list(title = "Num. of Trees"))
## Warning: Ignoring 1 observations