ML h2o

1.- BIBLIOTECAS

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(DALEX)

## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## 
## 
## Attaching package: 'DALEX'
## 
## The following object is masked from 'package:dplyr':
## 
##     explain

library(DALEXtra)
library(iml)
library(skimr)
library(DataExplorer)
library(DataExplorer)
library(ggpubr)
library(univariateML)
library(recipes)

## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step

2.- DATOS

l set de datos rent, disponible en el paquete gamlss.data, contiene información sobre el precio del alquiler de 1969 viviendas situadas en Munich en el año 1993. Además del precio, incluye 9 variables adicionales:

R: precio del alquiler.

Fl: metros cuadrados de la vivienda.

A: año de construcción.

B: si tiene cuarto de baño (1) o no (0).

H: si tiene calefacción central (1) o no (0).

L: si la cocina está equipada (1) o no (0).

Sp: si la calidad del barrio donde está situada la vivienda es superior la media (1) o no (0).

Sm: si la calidad del barrio donde está situada la vivienda es inferior la media (1) o no (0).

loc: combinación de Sp y Sm indicando si la calidad del barrio donde está situada la vivienda es inferior (1), igual (2) o superior (3) a la media.

data("rent", package = "gamlss.data")
datos <- rent

str(datos)

## 'data.frame':    1969 obs. of  9 variables:
##  $ R  : num  693 422 737 732 1295 ...
##  $ Fl : num  50 54 70 50 55 59 46 94 93 65 ...
##  $ A  : num  1972 1972 1972 1972 1893 ...
##  $ Sp : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sm : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ B  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ H  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
##  $ L  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ loc: Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...

# Se descartan las variables SP y SM, ya que su información está recogida en la variable loc.
datos <- datos[, !(names(datos) %in% c("Sp", "Sm"))]

str(datos)

## 'data.frame':    1969 obs. of  7 variables:
##  $ R  : num  693 422 737 732 1295 ...
##  $ Fl : num  50 54 70 50 55 59 46 94 93 65 ...
##  $ A  : num  1972 1972 1972 1972 1893 ...
##  $ B  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ H  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
##  $ L  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ loc: Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...

# Se renombran las columnas para que sean más descriptivas.
colnames(datos) <- c("precio", "metros", "anyo", "banyo", "calefaccion", "cocina", "situacion")

str(datos)

## 'data.frame':    1969 obs. of  7 variables:
##  $ precio     : num  693 422 737 732 1295 ...
##  $ metros     : num  50 54 70 50 55 59 46 94 93 65 ...
##  $ anyo       : num  1972 1972 1972 1972 1893 ...
##  $ banyo      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ calefaccion: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
##  $ cocina     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ situacion  : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...

3.- EDA

skim(datos)

Data summary
Name	datos
Number of rows	1969
Number of columns	7
_______________________
Column type frequency:
factor	4
numeric	3
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
banyo	1	FALSE	2	0: 1925, 1: 44
calefaccion	1	FALSE	2	0: 1580, 1: 389
cocina	1	FALSE	2	0: 1808, 1: 161
situacion	1	FALSE	3	2: 1247, 3: 550, 1: 172

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
precio	1	811.88	379.00	101.7	544.2	737.8	1022	3000	▇▇▂▁▁
metros	1	67.73	20.86	30.0	52.0	67.0	82	120	▆▇▇▅▂
anyo	1	1948.48	29.02	1890.0	1934.0	1957.0	1972	1988	▃▁▃▇▇

head(datos)

##   precio metros anyo banyo calefaccion cocina situacion
## 1  693.3     50 1972     0           0      0         2
## 2  422.0     54 1972     0           0      0         2
## 3  736.6     70 1972     0           0      0         2
## 4  732.2     50 1972     0           0      0         2
## 5 1295.1     55 1893     0           0      0         2
## 6 1195.9     59 1893     0           0      0         2

4.- Datos ausentes

# Número de datos ausentes por variable
datos %>% map_dbl(.f = function(x){sum(is.na(x))})

##      precio      metros        anyo       banyo calefaccion      cocina 
##           0           0           0           0           0           0 
##   situacion 
##           0

plot_missing(
  data = datos,
  title = "Porcentaje de valores ausentes",
  ggtheme = theme_bw(),
  theme_config = list(legend.position = "bottom")
)

5.- Variable respuesta

p1 <- ggplot(data = datos, aes(x = precio)) +
      geom_density(fill = "steelblue", alpha = 0.8) +
      geom_rug(alpha = 0.1) +
      scale_x_continuous(labels = scales::comma) +
      labs(title = "Distribución original", x = "gastos_totales") +
      theme_bw()

p2 <- ggplot(data = datos, aes(x = sqrt(precio))) +
      geom_density(fill = "steelblue", alpha = 0.8) +
      geom_rug(alpha = 0.1) +
      scale_x_continuous(labels = scales::comma) +
      labs(title = "Transformación raíz cuadrada", x = "gastos_totales") +
      theme_bw()

p3 <- ggplot(data = datos, aes(x = log10(precio))) +
      geom_density(fill = "steelblue", alpha = 0.8) +
      geom_rug(alpha = 0.1) +
      scale_x_continuous(labels = scales::comma) +
      labs(title = "Transformación logarítmica") +
      theme_bw()

p4 <- ggplot(data = datos, aes(x = log(precio))) +
      geom_density(fill = "steelblue", alpha = 0.8) +
      geom_rug(alpha = 0.1) +
      scale_x_continuous(labels = scales::comma) +
      labs(title = "Transformación logarítmica") +
      theme_bw()

ggarrange(p1, p2, p3, p4, ncol = 1, align = "v")

# Tabla de estadísticos principales
summary(datos$precio)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   101.7   544.2   737.8   811.9  1022.0  3000.0

Algunos modelos de machine learning y aprendizaje estadístico requieren que la variable respuesta se distribuya de una forma determinada. Por ejemplo: para los modelos de regresión lineal (LM), la distribución tiene que ser de tipo normal. Para los modelos lineales generalizados (GLM), la distribución tiene que ser de la familia exponencial. Existen varios paquetes en R que permiten identificar a qué distribución se ajustan mejor los datos, uno de ellos es univariateML. Para conocer más sobre cómo identificar distribuciones consultar Ajuste de distribuciones con R.

comparacion_aic <- AIC(
                    mlbetapr(datos$precio),
                    mlexp(datos$precio),
                    mlinvgamma(datos$precio),
                    mlgamma(datos$precio),
                    mllnorm(datos$precio),
                    mlrayleigh(datos$precio),
                    mlinvgauss(datos$precio),
                    mlweibull(datos$precio),
                    mlinvweibull(datos$precio),
                    mllgamma(datos$precio)
                   )
comparacion_aic %>% rownames_to_column(var = "distribucion") %>% arrange(AIC)

##                  distribucion df      AIC
## 1       mlgamma(datos$precio)  2 28615.58
## 2       mllnorm(datos$precio)  2 28657.23
## 3    mlinvgauss(datos$precio)  2 28690.03
## 4     mlweibull(datos$precio)  2 28746.25
## 5    mlrayleigh(datos$precio)  1 28797.71
## 6      mlbetapr(datos$precio)  2 28867.94
## 7    mlinvgamma(datos$precio)  2 28869.14
## 8  mlinvweibull(datos$precio)  2 29245.12
## 9         mlexp(datos$precio)  1 30322.05
## 10     mllgamma(datos$precio)  2      Inf

6.- Variables continuas

plot_density(
  data = datos %>% select(-precio),
  ncol= 3,
  title = "Distribución variables continuas",
  ggtheme = theme_bw(),
  theme_config = list(
                  plot.title = element_text(size = 16, face = "bold"),
                  strip.text = element_text(colour = "black", size = 12, face = 2)
  )
)

custom_corr_plot <- function(variable1, variable2, df, alpha=0.3){
  p <- df %>%
       mutate(
         # Truco para que se ponga el título estilo facet
        title = paste(toupper(variable2), "vs", toupper(variable1))
       ) %>%
       ggplot(aes(x = !!sym(variable1), y = !!sym(variable2))) + 
       geom_point(alpha = alpha) +
       # Tendencia no lineal
       geom_smooth(se = FALSE, method = "gam", formula =  y ~ splines::bs(x, 3)) +
       # Tendencia lineal
       geom_smooth(se = FALSE, method = "lm", color = "firebrick") +
       facet_grid(. ~ title) +
       theme_bw() +
       theme(strip.text = element_text(colour = "black", size = 10, face = 2),
             axis.title = element_blank())
  return(p)
}

variables_continuas <- c("anyo", "metros")

plots <- map(
            .x = variables_continuas,
            .f = custom_corr_plot,
            variable2 = "precio",
            df = datos
         )

ggarrange(plotlist = plots, ncol = 2, nrow = 1) %>%
  annotate_figure(
    top = text_grob("Correlación con el precio de alquiler", face = "bold", size = 16,
                    x = 0.4)
  )

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

plot_correlation(
  data     = datos,
  cor_args = list(use = "pairwise.complete.obs"),
  type     = "continuous",
  title    = "Matriz de correlación variables continuas",
  theme_config = list(legend.position = "none",
                      plot.title = element_text(size = 16, face = "bold"),
                      axis.title = element_blank(),
                      axis.text.x = element_text(angle = -45, hjust = +0.1)
                     )
)

GGally::ggscatmat(
  data = datos %>% select_if(is.numeric),
  alpha = 0.1) +
theme_bw() +
labs(title = "Correlación por pares") +
theme(
  plot.title = element_text(size = 16, face = "bold"),
  #axis.text = element_blank(),
  strip.text = element_text(colour = "black", size = 10, face = 2)
)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

plot_bar( 
    datos, 
    ncol = 2, 
    title = "Número de observaciones por grupo", 
    ggtheme = theme_bw(), 
    theme_config = list( 
                      plot.title = element_text(size = 16, face = "bold"), 
                      strip.text = element_text(colour = "black", size = 12, face = 2),
                      legend.position = "none" 
                      ) 
)

custom_box_plot <- function(variable1, variable2, df, alpha=0.3){
  p <- df %>%
       mutate(
         # Truco para que se ponga el título estilo facet
        title = paste(toupper(variable2), "vs", toupper(variable1))
       ) %>%
       ggplot(aes(x = !!sym(variable1), y = !!sym(variable2))) + 
       geom_violin(alpha = alpha) +
       geom_boxplot(width = 0.1, outlier.shape = NA) +
       facet_grid(. ~ title) +
       theme_bw() +
       theme(strip.text = element_text(colour = "black", size = 10, face = 2),
             axis.title = element_blank())
  return(p)
}

variables_cualitativas <- c("banyo", "calefaccion", "cocina", "situacion")

plots <- map(
            .x = variables_cualitativas,
            .f = custom_box_plot,
            variable2 = "precio",
            df = datos
         )

ggarrange(plotlist = plots, ncol = 2, nrow = 2) %>%
  annotate_figure(
    top = text_grob("Correlación con precio", face = "bold", size = 16,
                    x = 0.28)
  )

Si alguno de los niveles de una variable cualitativa tiene muy pocas observaciones en comparación a los otros niveles, puede ocurrir que, durante la validación cruzada o bootstrapping, algunas particiones no contengan ninguna observación de dicha clase (varianza cero), lo que puede dar lugar a errores. En este caso hay que tener precaución con la variable banyo.

table(datos$banyo)

## 
##    0    1 
## 1925   44

7.- Modelos

# Creación de un cluster local con todos los cores disponibles.
h2o.init(ip = "localhost",
         # -1 indica que se empleen todos los cores disponibles.
         nthreads = -1,
         # Máxima memoria disponible para el cluster.
         max_mem_size = "6g")

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         27 minutes 37 seconds 
##     H2O cluster timezone:       Europe/Paris 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.42.0.2 
##     H2O cluster version age:    4 months and 27 days 
##     H2O cluster name:           H2O_started_from_R_Usuario_hwc583 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.81 GB 
##     H2O cluster total cores:    20 
##     H2O cluster allowed cores:  20 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (4 months and 27 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Se eliminan los datos del cluster por si ya había sido iniciado.
h2o.removeAll()
h2o.no_progress()

datos_h2o <- as.h2o(x = datos, destination_frame = "datos_h2o")

7.1.- TRAIN Y TEST SPLIT

set.seed(123)
particiones <- h2o.splitFrame(data=datos_h2o, ratios = c(0.8), seed = 123)
datos_train_h2o <- h2o.assign(data = particiones[[1]], key = "datos_train_h2o")
datos_test_h2o <- h2o.assign(data = particiones[[2]], key = "datos_test_h2o")

datos_train <- as.data.frame(datos_train_h2o)
datos_test <- as.data.frame(datos_test_h2o)

summary(datos_train$precio)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   101.7   544.7   741.6   813.9  1026.5  3000.0

summary(datos_test$precio)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   127.1   537.8   718.6   804.0  1000.0  2764.5

summary(datos_train$banyo)/nrow(datos_train)*100

##         0         1 
## 97.963081  2.036919

summary(datos_test$banyo)/nrow(datos_test)*100

##         0         1 
## 96.984925  3.015075

7.2.- MODELO GLM

Entrenamiento

# Se define la variable respuesta y los predictores.
var_respuesta <- "precio"
# Para este modelo se emplean todos los predictores disponibles.
predictores <- setdiff(h2o.colnames(datos_train_h2o), var_respuesta)

Optimización de hiperparámetros

# Valores de alpha que se van a comparar.
hiperparametros <- list(alpha = seq(0,1,0.1))

grid_glm <- h2o.grid(
    # Algoritmo y parámetros
    algorithm      = "glm",
    family         = "gamma",
    # Variable respuesta y predictores
    y              = var_respuesta,
    x              = predictores, 
    # Datos de entrenamiento
    training_frame = datos_train_h2o,
    # Preprocesado
    standardize    = TRUE,
    missing_values_handling = "Skip",
    ignore_const_cols = TRUE,
    # Hiperparámetros
    hyper_params    = hiperparametros,
    # Tipo de búsqueda
    search_criteria = list(strategy = "Cartesian"),
    lambda_search   = TRUE,
    # Selección automática del solver adecuado
    solver          = "AUTO",
    # Estrategia de validación para seleccionar el mejor modelo
    seed            = 123,
    nfolds          = 3,
    keep_cross_validation_predictions = TRUE,
    grid_id         = "grid_glm"
)

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 12!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 71!

## Warning in h2o.getGrid(grid_id = grid_id): Adding alpha array to hyperparameter
## runs slower with gridsearch. This is due to the fact that the algo has to run
## initialization for every alpha value. Setting the alpha array as a model
## parameter will skip the initialization and run faster overall.

# Se muestran los modelos ordenados de mayor a menor rmse
resultados_grid_glm <- h2o.getGrid(
                         grid_id = "grid_glm",
                         sort_by = "rmse",
                         decreasing = FALSE
                       )

## Warning in h2o.getGrid(grid_id = "grid_glm", sort_by = "rmse", decreasing =
## FALSE): Adding alpha array to hyperparameter runs slower with gridsearch. This
## is due to the fact that the algo has to run initialization for every alpha
## value. Setting the alpha array as a model parameter will skip the
## initialization and run faster overall.

as.data.frame(resultados_grid_glm@summary_table)

##                  alpha         model_ids     rmse
## 1                  0.1  grid_glm_model_2 310.2497
## 2                  0.2  grid_glm_model_3 310.2498
## 3  0.30000000000000004  grid_glm_model_4 310.2498
## 4                  0.4  grid_glm_model_5 310.2499
## 5                  0.5  grid_glm_model_6 310.2499
## 6   0.6000000000000001  grid_glm_model_7 310.2499
## 7   0.7000000000000001  grid_glm_model_8 310.2499
## 8                  0.8  grid_glm_model_9 310.2499
## 9                  0.9 grid_glm_model_10 310.2499
## 10                 1.0 grid_glm_model_11 310.2499
## 11                 0.0  grid_glm_model_1 310.3075

# Se reentrena el modelo con los mejores hiperparámetros
mejores_hiperparam <- h2o.getModel(resultados_grid_glm@model_ids[[1]])@parameters

modelo_glm <- h2o.glm(
    # Variable respuesta y predictores
    y              = var_respuesta,
    x              = predictores,
    # Datos de entrenamiento
    training_frame = datos_train_h2o,
    # Preprocesado
    standardize    = TRUE,
    missing_values_handling = "Skip",
    ignore_const_cols = TRUE,
    # Hiperparámetros
    alpha = mejores_hiperparam$alpha,
    lambda_search   = TRUE,
    # Selección automática del solver adecuado
    solver          = "AUTO",
    # Estrategia de validación (para comparar con otros modelos)
    seed            = 123,
    nfolds          = 10,
    keep_cross_validation_predictions = TRUE,
    model_id        = "modelo_glm"
)

## Warning in doTryCatch(return(expr), name, parentenv, handler): Reached maximum
## number of iterations 3!

Influencia de predictores

h2o.varimp(modelo_glm)

## Variable Importances: 
##         variable relative_importance scaled_importance percentage
## 1         metros            0.010826          1.000000   1.000000
## 2    situacion.1            0.000000          0.000000   0.000000
## 3    situacion.2            0.000000          0.000000   0.000000
## 4    situacion.3            0.000000          0.000000   0.000000
## 5  calefaccion.0            0.000000          0.000000   0.000000
## 6  calefaccion.1            0.000000          0.000000   0.000000
## 7       cocina.0            0.000000          0.000000   0.000000
## 8       cocina.1            0.000000          0.000000   0.000000
## 9        banyo.0            0.000000          0.000000   0.000000
## 10       banyo.1            0.000000          0.000000   0.000000
## 11          anyo            0.000000          0.000000   0.000000

h2o.varimp_plot(modelo_glm)

Diagnosis de los residuos

explainer_glm <- DALEXtra::explain_h2o(
                  model = modelo_glm,
                  data  = datos_train[, predictores],
                  y     = datos_train[[var_respuesta]],
                  label = "modelo_glm"
                )

## Preparation of a new explainer is initiated
##   -> model label       :  modelo_glm 
##   -> data              :  1571  rows  6  cols 
##   -> target variable   :  1571  values 
##   -> predict function  :  yhat.H2ORegressionModel  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package h2o , ver. 3.42.0.2 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  813.8608 , mean =  813.8803 , max =  813.9076  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -712.1723 , mean =  2.96012e-13 , max =  2186.092  
##   A new explainer has been created!

plot(model_performance(explainer_glm))

predicciones_train <- h2o.predict(
                        modelo_glm,
                        newdata = datos_train_h2o
                      )
predicciones_train <- h2o.cbind(
                       datos_train_h2o["precio"],
                       predicciones_train
                      )
predicciones_train <- as.data.frame(predicciones_train)
predicciones_train <- predicciones_train %>%
                      mutate(
                        residuo = predict - precio
                      )

p1 <- ggplot(predicciones_train, aes(x = precio, y  = predict)) +
      geom_point(alpha = 0.1) +
      geom_smooth(method = "gam", color = "red", size = 1, se = TRUE) +
      labs(title = "Predicciones vs valor real") +
      theme_bw()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

p2 <- ggplot(predicciones_train, aes(1:nrow(predicciones_train), y  = residuo)) +
      geom_point(alpha = 0.1) +
      geom_hline(yintercept = 0, color = "red", size = 1) +
      labs(title = "Residuos del modelo") +
      theme_bw()
p3 <- ggplot(predicciones_train, aes(x  = residuo)) +
      geom_density() +
      geom_rug() +
      labs(title = "Residuos del modelo") +
      theme_bw()

p4 <- ggplot(predicciones_train, aes(sample  = predict)) +
      stat_qq() +
      stat_qq_line(color = "red", size = 1) +
      labs(title = "QQ-plot residuos del modelo") +
      theme_bw()

ggpubr::ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2) %>%
ggpubr::annotate_figure(
  top = ggpubr::text_grob("Diagnóstico residuos entrenamiento",
                          color = "black", face = "bold", size = 14)
)

## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'

predicciones_test <- h2o.predict(
                        object  = modelo_glm,
                        newdata = datos_test_h2o
                     )
predicciones_test     <- as.vector(predicciones_test)
datos_test$prediccion <- predicciones_test

Error de test

rmse_test_glm <- MLmetrics::RMSE(
                    y_pred = datos_test$precio,
                    y_true = datos_test$prediccion
                 )
paste("Error de test (rmse) del modelo GLM:", rmse_test_glm)

## [1] "Error de test (rmse) del modelo GLM: 378.70675549663"

Escritura del modelo

# Se guarda el modelo en el directorio actual en la carpeta modelos
h2o.saveModel(object = modelo_glm, path = "./modelos", force = TRUE)

## [1] "C:\\Users\\Usuario\\OneDrive\\Documentos\\modelos\\modelo_glm"

file.rename(file.path("./modelos", modelo_glm@model_id), "./modelos/modelo_glm.h2o")

## Warning in file.rename(file.path("./modelos", modelo_glm@model_id),
## "./modelos/modelo_glm.h2o"): cannot rename file './modelos/modelo_glm' to
## './modelos/modelo_glm.h2o', reason 'El sistema no puede encontrar la ruta
## especificada'

## [1] FALSE

ML h2o

2023-12-22