# for Core packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
##
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# for times series
library(timetk)
Goal: Apply Matt Dancho’s time series tutorial to weekly initial unemployment claims for the six New England states.
The following replicates Matt Dancho’s tutorial on this page.
# Earliest observation date to request.
start_date <- "1989-01-01"

# Initial-claims series IDs, one per New England state.
symbols_txt <- c(
  "CTICLAIMS", # Connecticut
  "MEICLAIMS", # Maine
  "MAICLAIMS", # Massachusetts
  "NHICLAIMS", # New Hampshire
  "RIICLAIMS", # Rhode Island
  "VTICLAIMS"  # Vermont
)

# Download every series as economic data, give the value column a descriptive
# name, and relabel the series IDs with the state they belong to.
claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
  rename(claims = price) %>%
  mutate(
    symbol = fct_recode(
      symbol,
      "Connecticut"   = "CTICLAIMS",
      "Maine"         = "MEICLAIMS",
      "Massachusetts" = "MAICLAIMS",
      "New Hampshire" = "NHICLAIMS",
      "Rhode Island"  = "RIICLAIMS",
      "Vermont"       = "VTICLAIMS"
    )
  )

# Print the long-format table (symbol, date, claims).
claims_tbl
## # A tibble: 11,238 × 3
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,228 more rows
# claims_tbl stacks the six state series on top of each other, so plotting it
# as a single ungrouped line joins the end of one state to the start of the
# next. Colour the line by state to keep the series apart.
# `.smooth = FALSE` avoids fitting one trend line to the pooled data
# (NOTE(review): confirm how the smoother interacts with `.color_var`).
claims_tbl %>%
  plot_time_series(
    .date_var  = date,
    .value     = claims,
    .color_var = symbol,
    .smooth    = FALSE
  )

# Number of observations available for each state.
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
## symbol n
## <fct> <int>
## 1 Connecticut 1873
## 2 Massachusetts 1873
## 3 Maine 1873
## 4 New Hampshire 1873
## 5 Rhode Island 1873
## 6 Vermont 1873
# One static panel per state, two panels per row, x-axis free per panel.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, claims,
    .facet_scales = "free_x",
    .facet_ncol   = 2,
    .interactive  = FALSE
  )
# Yearly box plots of weekly claims, one panel per state.
# NOTE(review): filter_by_time() is called without start/end bounds, so it
# presumably keeps the full date range - confirm it is still wanted.
claims_tbl %>%
  filter_by_time(.date_var = date) %>%
  group_by(symbol) %>%
  plot_time_series_boxplot(
    .date_var   = date,
    .value      = claims,
    .period     = "1 year",
    .facet_ncol = 2
  )
# Per-state regression of log claims on a linear time trend plus
# month-of-year effects; the model summary printout is suppressed.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series_regression(
    .date_var     = date,
    .formula      = log(claims) ~ as.numeric(date) + month(date, label = TRUE),
    .show_summary = FALSE,
    .facet_ncol   = 2
  )
# ACF/PACF diagnostics per state over a 7-day lag window.
# NOTE(review): the series is weekly, so "7 days" spans roughly one lag -
# confirm this window is intended.
claims_tbl %>%
  group_by(symbol) %>%
  plot_acf_diagnostics(
    .date_var = date,
    .value    = claims,
    .lags     = "7 days"
  )
# ACF/PACF diagnostics per state at the default lag range.
# The date column is `date`; `Date` is not a column of claims_tbl.
# claims_tbl only holds symbol, date and claims, so there is no separate
# predictor to cross-correlate: the former `.ccf_vars = c(date, claims)`
# paired the series with its own index and with itself, and is dropped.
claims_tbl %>%
  group_by(symbol) %>%
  plot_acf_diagnostics(
    .date_var = date,
    .value    = claims
  )
# Seasonal diagnostics for each state.
claims_tbl %>%
  group_by(symbol) %>%
  plot_seasonal_diagnostics(.date_var = date, .value = claims)

# STL decomposition per state, showing only the observed, trend and
# remainder components.
claims_tbl %>%
  group_by(symbol) %>%
  plot_stl_diagnostics(
    .date_var    = date,
    .value       = claims,
    .feature_set = c("observed", "trend", "remainder")
  )
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
# Static baseline plot of the raw weekly series, one panel per state.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    .date_var    = date,
    .value       = claims,
    .facet_ncol  = 2,
    .interactive = FALSE
  )
Summarize by quarter
# Total claims per quarter for each state.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(
    .date_var    = date,
    .by          = "quarter",
    total_claims = sum(claims)
  ) %>%
  plot_time_series(
    .date_var    = date,
    .value       = total_claims,
    .facet_ncol  = 2,
    .interactive = FALSE
  )

# Average weekly claims per month for each state.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(
    .date_var   = date,
    .by         = "month",
    mean_claims = mean(claims)
  ) %>%
  plot_time_series(
    .date_var    = date,
    .value       = mean_claims,
    .facet_ncol  = 2,
    .interactive = FALSE
  )
# Restrict each state's series to the 2013-2015 window and plot it.
claims_tbl %>%
  group_by(symbol) %>%
  filter_by_time(date, .start_date = "2013", .end_date = "2015") %>%
  plot_time_series(.date_var = date, .value = claims, .facet_ncol = 2)

# Fill any gaps in each state's series, using the automatically detected
# interval and 0 as the value for padded rows.
claims_tbl %>%
  group_by(symbol) %>%
  pad_by_time(.date_var = date, .by = "auto", .pad_value = 0)
## pad applied on the interval: week
## # A tibble: 11,238 × 3
## # Groups: symbol [6]
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,228 more rows
# Two-observation rolling mean on the first ten rows; the window is
# left-aligned and partial windows are allowed.
claims_tbl %>%
  slice_head(n = 10) %>%
  mutate(
    rolling_avg_2 = slidify_vec(
      .x       = claims,
      .f       = mean,
      .period  = 2,
      .align   = "left",
      .partial = TRUE
    )
  )
## # A tibble: 10 × 4
## symbol date claims rolling_avg_2
## <fct> <date> <int> <dbl>
## 1 Connecticut 1989-01-07 8345 7424
## 2 Connecticut 1989-01-14 6503 5162
## 3 Connecticut 1989-01-21 3821 4242
## 4 Connecticut 1989-01-28 4663 4412.
## 5 Connecticut 1989-02-04 4162 4250.
## 6 Connecticut 1989-02-11 4337 4208
## 7 Connecticut 1989-02-18 4079 3818.
## 8 Connecticut 1989-02-25 3556 3691
## 9 Connecticut 1989-03-04 3826 3670.
## 10 Connecticut 1989-03-11 3515 3515
# Build a reusable centred rolling mean over 30 observations; partial
# windows are allowed.
roll_avg_30 <- slidify(
  .f       = mean,
  .period  = 30,
  .align   = "center",
  .partial = TRUE
)

# Add the rolling mean per state, stack raw and smoothed values into long
# format, and draw both lines in each state's panel.
claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  mutate(rolling_avg_30 = roll_avg_30(claims)) %>%
  tidyr::pivot_longer(cols = c(claims, rolling_avg_30)) %>%
  plot_time_series(
    .date_var   = date,
    .value      = value,
    .color_var  = name,
    .smooth     = FALSE,
    .facet_ncol = 2
  )
Import the cleaned data from Module 7.
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
# h2o requires all variables to be either numeric or factors, so turn any
# character column into a factor.
# NOTE(review): the result is only printed, not assigned back to claims_tbl.
mutate(claims_tbl, across(where(is.character), factor))
## # A tibble: 11,238 × 3
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,228 more rows
# Reproducible train/test split, stratified on the claims column.
# NOTE(review): strata and the recipe outcome are `claims`, while the AutoML
# run further down uses `symbol` as the response - confirm this is intended.
set.seed(1234)
data_split <- initial_split(claims_tbl, strata = "claims")
train_tbl <- data_split %>% training()
test_tbl <- data_split %>% testing()

# Recipe on the training data that only removes zero variance predictors.
recipe_obj <- step_zv(
  recipe(claims ~ ., data = train_tbl),
  all_predictors()
)
# Initialize h2o: connect this R session to the H2O cluster used by every
# h2o.* call below.
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 37 minutes
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 11 months and 12 days
## H2O cluster name: H2O_started_from_R_johnnymckinnon_nqx583
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.76 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (11 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the training table to H2O and split it into two frames: roughly 85%
# of the rows go to the first frame, the remainder to the second. The seed
# makes the split reproducible.
split.h20 <- h2o.splitFrame(
  data   = as.h2o(train_tbl),
  ratios = 0.85,
  seed   = 5639
)
##
|
| | 0%
|
|======================================================================| 100%
# The first piece of the split is the training frame, the second is the
# validation frame.
train_h2o <- split.h20[[1]]
valid_h2o <- split.h20[[2]]
# Upload the held-out test table to H2O as well.
test_h2o <- as.h2o(test_tbl)
##
|
| | 0%
|
|======================================================================| 100%
# Response is the state label; every other column of the training table is
# used as a predictor.
y <- "symbol"
x <- names(train_tbl) %>% setdiff(y)

# AutoML run: at most 10 models, deep learning excluded, 5-fold
# cross-validation. The test frame ranks the leaderboard.
models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame    = train_h2o,
  validation_frame  = valid_h2o,
  leaderboard_frame = test_h2o,
  nfolds        = 5,
  max_models    = 10,
  # max_runtime_secs = 30,
  exclude_algos = c("DeepLearning"),
  seed          = 3456
)
##
|
| | 0%
## 16:23:21.577: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|===== | 7%
|
|===== | 8%
|
|======= | 10%
|
|========= | 13%
|
|=========== | 15%
|
|============ | 17%
|
|============= | 18%
|
|============== | 21%
|
|==================== | 28%
|
|====================== | 31%
|
|======================= | 33%
|
|======================================================================| 100%
Examine the output of h2o.automl
# The AutoML result is an S4 object; list its slots before reading them.
typeof(models_h2o)
## [1] "S4"
slotNames(models_h2o)
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
# Ranked table of every model trained in the run.
models_h2o@leaderboard
## model_id mean_per_class_error
## 1 StackedEnsemble_AllModels_1_AutoML_9_20241203_162321 0.3685360
## 2 StackedEnsemble_BestOfFamily_1_AutoML_9_20241203_162321 0.4152764
## 3 XRT_1_AutoML_9_20241203_162321 0.5167572
## 4 GBM_1_AutoML_9_20241203_162321 0.5177677
## 5 DRF_1_AutoML_9_20241203_162321 0.5292755
## 6 GLM_1_AutoML_9_20241203_162321 0.5388845
## logloss rmse mse
## 1 0.852542 0.5438285 0.2957494
## 2 0.911274 0.5664689 0.3208871
## 3 1.319180 0.7219818 0.5212578
## 4 1.068785 0.6276309 0.3939205
## 5 1.340430 0.7281312 0.5301751
## 6 1.244583 0.6699670 0.4488558
##
## [12 rows x 5 columns]
# Print the leading model of the AutoML run (model details and metrics).
models_h2o@leader
## Model Details:
## ==============
##
## H2OMultinomialModel: stackedensemble
## Model ID: StackedEnsemble_AllModels_1_AutoML_9_20241203_162321
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 10/10
## 3 # GBM base models (used / total) 4/4
## 4 # XGBoost base models (used / total) 3/3
## 5 # DRF base models (used / total) 2/2
## 6 # GLM base models (used / total) 1/1
## 7 Metalearner algorithm GLM
## 8 Metalearner fold assignment scheme Random
## 9 Metalearner nfolds 5
## 10 Metalearner fold_column NA
## 11 Custom metalearner hyperparameters None
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## Training Set Metrics:
## =====================
##
## Extract training frame with `h2o.getFrame("AutoML_9_20241203_162321_training_RTMP_sid_ae6d_5")`
## MSE: (Extract with `h2o.mse`) 0.3098416
## RMSE: (Extract with `h2o.rmse`) 0.5566342
## Logloss: (Extract with `h2o.logloss`) 0.8600982
## Mean Per-Class Error: 0.4415219
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 25755.11
## Residual Deviance: (Extract with `h2o.residual_deviance`) 12364.77
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 1019 4 179 1 27
## Maine 19 332 1 246 592
## Massachusetts 85 0 1067 0 7
## New Hampshire 2 157 1 325 67
## Rhode Island 104 535 6 64 455
## Vermont 3 35 0 291 53
## Totals 1232 1063 1254 927 1201
## Vermont Error Rate
## Connecticut 0 0.1715 = 211 / 1,230
## Maine 40 0.7301 = 898 / 1,230
## Massachusetts 0 0.0794 = 92 / 1,159
## New Hampshire 645 0.7285 = 872 / 1,197
## Rhode Island 24 0.6170 = 733 / 1,188
## Vermont 802 0.3226 = 382 / 1,184
## Totals 1511 0.4435 = 3,188 / 7,188
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.556483
## 2 2 0.893433
## 3 3 0.966889
## 4 4 0.998052
## 5 5 0.999583
## 6 6 1.000000
##
##
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## Validation Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.305676
## RMSE: (Extract with `h2o.rmse`) 0.5528798
## Logloss: (Extract with `h2o.logloss`) 0.8784158
## Mean Per-Class Error: 0.3921174
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 4438.605
## Residual Deviance: (Extract with `h2o.residual_deviance`) 2174.957
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 152 0 34 0 4
## Maine 2 77 1 39 73
## Massachusetts 32 0 193 1 3
## New Hampshire 3 26 0 97 22
## Rhode Island 17 51 2 20 102
## Vermont 1 10 0 40 11
## Totals 207 164 230 197 215
## Vermont Error Rate
## Connecticut 0 0.2000 = 38 / 190
## Maine 18 0.6333 = 133 / 210
## Massachusetts 0 0.1572 = 36 / 229
## New Hampshire 75 0.5650 = 126 / 223
## Rhode Island 2 0.4742 = 92 / 194
## Vermont 130 0.3229 = 62 / 192
## Totals 225 0.3934 = 487 / 1,238
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.606624
## 2 2 0.882068
## 3 3 0.970113
## 4 4 0.997577
## 5 5 0.999192
## 6 6 1.000000
##
##
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## Cross-Validation Set Metrics:
## =====================
##
## Extract cross-validation frame with `h2o.getFrame("levelone_training_StackedEnsemble_AllModels_1_AutoML_9_20241203_162321")`
## MSE: (Extract with `h2o.mse`) 0.2925926
## RMSE: (Extract with `h2o.rmse`) 0.5409183
## Logloss: (Extract with `h2o.logloss`) 0.8423963
## Mean Per-Class Error: 0.3708954
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 25767.24
## Residual Deviance: (Extract with `h2o.residual_deviance`) 12110.29
## AIC: (Extract with `h2o.aic`) NaN
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.628130
## 2 2 0.891903
## 3 3 0.965220
## 4 4 0.995409
## 5 5 0.998748
## 6 6 1.000000
##
##
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid
## accuracy 0.627391 0.013936 0.605498 0.624741
## auc NA 0.000000 NA NA
## err 0.372609 0.013936 0.394502 0.375259
## err_count 536.000000 36.090164 574.000000 543.000000
## logloss 0.841912 0.013921 0.859396 0.831485
## max_per_class_error 0.567701 0.039323 0.571970 0.604255
## mean_per_class_accuracy 0.628757 0.011565 0.614289 0.623152
## mean_per_class_error 0.371243 0.011565 0.385711 0.376848
## mse 0.292296 0.005311 0.301223 0.290064
## null_deviance 5153.448700 214.011960 5216.419400 5188.217000
## pr_auc NA 0.000000 NA NA
## r2 0.900432 0.002286 0.899640 0.898534
## residual_deviance 2419.913000 85.605640 2500.841800 2406.318000
## rmse 0.540626 0.004891 0.548838 0.538576
## cv_3_valid cv_4_valid cv_5_valid
## accuracy 0.632997 0.630539 0.643178
## auc NA NA NA
## err 0.367003 0.369461 0.356822
## err_count 545.000000 542.000000 476.000000
## logloss 0.832230 0.831742 0.854704
## max_per_class_error 0.590909 0.569170 0.502203
## mean_per_class_accuracy 0.625481 0.638095 0.642768
## mean_per_class_error 0.374519 0.361905 0.357232
## mse 0.287202 0.292172 0.290817
## null_deviance 5323.343000 5257.700700 4781.564500
## pr_auc NA NA NA
## r2 0.903365 0.898304 0.902316
## residual_deviance 2471.723400 2440.331800 2280.350300
## rmse 0.535912 0.540529 0.539275
# Help pages for retrieving, saving and loading H2O models.
?h2o.getModel
?h2o.saveModel
?h2o.loadModel

# Save the leader of the AutoML run above and reload it from the path that
# h2o.saveModel() returns. Model IDs carry the AutoML run number and a
# timestamp, so a hard-coded ID from an earlier run either scores a stale
# model or fails when that file is not on disk.
model_path <- h2o.saveModel(
  models_h2o@leader,
  path  = "h2o_models/",
  force = TRUE
)
best_model <- h2o.loadModel(model_path)

# Predicted state and class probabilities for every row of the test frame.
predictions <- h2o.predict(best_model, newdata = test_h2o)
##
|
| | 0%
|
|======================================================================| 100%
# Bring the H2O prediction frame into R as a tibble. as_tibble() replaces
# as.tibble(), which has been deprecated since tibble 2.0.0.
predictions_tbl <- predictions %>%
  as_tibble()
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Put the predictions side by side with the test rows they were made for.
bind_cols(predictions_tbl, test_tbl)
## # A tibble: 2,812 × 10
## predict Connecticut Maine Massachusetts New.Hampshire Rhode.Island Vermont
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Massach… 0.474 8.37e-4 0.520 0.000200 0.00452 8.53e-5
## 2 Connect… 0.866 1.09e-2 0.0682 0.0000946 0.0547 1.47e-4
## 3 Connect… 0.851 1.65e-2 0.00961 0.00201 0.119 1.70e-3
## 4 Connect… 0.600 5.46e-2 0.00121 0.0159 0.323 5.34e-3
## 5 Connect… 0.448 1.66e-1 0.000147 0.0206 0.358 6.93e-3
## 6 Connect… 0.908 1.33e-3 0.0814 0.0000165 0.00884 7.40e-5
## 7 Connect… 0.675 6.43e-2 0.000387 0.00815 0.249 3.37e-3
## 8 Rhode I… 0.408 1.40e-1 0.000327 0.0136 0.433 4.15e-3
## 9 Connect… 0.422 1.63e-1 0.000226 0.0213 0.384 9.56e-3
## 10 Connect… 0.590 1.10e-1 0.00101 0.0166 0.276 6.50e-3
## # ℹ 2,802 more rows
## # ℹ 3 more variables: symbol <fct>, date <date>, claims <int>
# Score the loaded model on the held-out test frame.
performance_h2o <- h2o.performance(model = best_model, newdata = test_h2o)
# The metrics object is S4; list its slots, then print the metrics themselves.
performance_h2o %>% typeof()
## [1] "S4"
performance_h2o %>% slotNames()
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "StackedEnsemble_AllModels_1_AutoML_1_20241203_144731"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/StackedEnsemble_AllModels_1_AutoML_1_20241203_144731"
##
##
## $model_checksum
## [1] "-4233719964055368032"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_ae6d_3"
##
##
## $frame_checksum
## [1] "-741132111419434106"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.733261e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.2957494
##
## $RMSE
## [1] 0.5438285
##
## $nobs
## [1] 2812
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.8990639
##
## $hit_ratio_table
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.636558
## 2 2 0.883001
## 3 3 0.964794
## 4 4 0.997511
## 5 5 0.999289
## 6 6 1.000000
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 361 6 68 1 17
## Maine 9 172 1 81 132
## Massachusetts 49 0 429 1 6
## New Hampshire 0 66 2 191 42
## Rhode Island 29 131 5 40 276
## Vermont 1 21 0 92 22
## Totals 449 396 505 406 495
## Vermont Error Rate
## Connecticut 0 0.2031 = 92 / 453
## Maine 38 0.6028 = 261 / 433
## Massachusetts 0 0.1155 = 56 / 485
## New Hampshire 152 0.5784 = 262 / 453
## Rhode Island 10 0.4379 = 215 / 491
## Vermont 361 0.2736 = 136 / 497
## Totals 561 0.3634 = 1,022 / 2,812
##
##
## $logloss
## [1] 0.852542
##
## $mean_per_class_error
## [1] 0.368536
##
## $AUC
## [1] "NaN"
##
## $pr_auc
## [1] "NaN"
##
## $multinomial_auc_table
## NULL
##
## $multinomial_aucpr_table
## NULL
##
## $residual_deviance
## [1] 4794.696
##
## $null_deviance
## [1] 10082.92
##
## $AIC
## [1] "NaN"
##
## $loglikelihood
## [1] 0
##
## $null_degrees_of_freedom
## [1] 2811
##
## $residual_degrees_of_freedom
## [1] 2557
# Read AUC from the test-set performance object, the same object the
# confusion matrix below uses, rather than from the model itself.
# The metrics printed above list AUC as "NaN" for this multinomial model.
h2o.auc(performance_h2o)
## [1] "NaN"
# Confusion matrix of actual vs. predicted state on the test set.
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 361 6 68 1 17
## Maine 9 172 1 81 132
## Massachusetts 49 0 429 1 6
## New Hampshire 0 66 2 191 42
## Rhode Island 29 131 5 40 276
## Vermont 1 21 0 92 22
## Totals 449 396 505 406 495
## Vermont Error Rate
## Connecticut 0 0.2031 = 92 / 453
## Maine 38 0.6028 = 261 / 433
## Massachusetts 0 0.1155 = 56 / 485
## New Hampshire 152 0.5784 = 262 / 453
## Rhode Island 10 0.4379 = 215 / 491
## Vermont 361 0.2736 = 136 / 497
## Totals 561 0.3634 = 1,022 / 2,812