Libraries to use

library(tidymodels) # for machine learning
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1 ──
## ✓ broom     0.7.2      ✓ recipes   0.1.15
## ✓ dials     0.0.9      ✓ rsample   0.0.8 
## ✓ dplyr     1.0.2      ✓ tibble    3.0.4 
## ✓ ggplot2   3.3.2      ✓ tidyr     1.1.2 
## ✓ infer     0.5.3      ✓ tune      0.1.1 
## ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
## ✓ parsnip   0.1.4      ✓ yardstick 0.0.7 
## ✓ purrr     0.3.4
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
library(keras) # for neural networks
## 
## Attaching package: 'keras'
## The following object is masked from 'package:yardstick':
## 
##     get_weights
library(ranger) # for random forest
library(themis) # SMOTE, ROSE, etc.
## Registered S3 methods overwritten by 'themis':
##   method               from   
##   bake.step_downsample recipes
##   bake.step_upsample   recipes
##   prep.step_downsample recipes
##   prep.step_upsample   recipes
##   tidy.step_downsample recipes
##   tidy.step_upsample   recipes
## 
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample
library(DataExplorer) # for quick exploratory plots

Load raw dataset

library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:yardstick':
## 
##     spec
## The following object is masked from 'package:scales':
## 
##     col_factor
dacc_daily_tmin <- read_csv("data/dacc-daily-tmin.csv", 
    col_types = cols(X1 = col_skip(), date = col_date(format = "%Y-%m-%d")))
## Warning: Missing column names filled in: 'X1' [1]
#View(dacc_daily_tmin)

Create dataset for one station

I will take only the Junín station.

junin <- dacc_daily_tmin %>% 
  select(starts_with("junin")) %>% 
  rename_with( ~ tolower(gsub("junin.", "", .x, fixed = TRUE))) # strip the "junin." prefix from the column names

I want to predict the minimum temperature: whether or not it is a frost.

tmin <- junin %>% 
  select(temp_min) 

colnames(tmin) <- "tmin"
tmin_helada <- tmin

tmin_helada <- tmin_helada %>% mutate(tmin = case_when(
  tmin <= 0 ~ "helada",  # frost event
  TRUE ~ "no-helada"     # no frost
))

Create the dataset shifted by one day, so each day's observations predict the next day's label.

data_tmin <- cbind(junin[1:(nrow(junin)-1),],tmin[2:nrow(tmin),])
data_clasificacion <- cbind(junin[1:(nrow(junin)-1),],tmin_helada[2:nrow(tmin_helada),])
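The same one-day shift can also be written with dplyr::lead, which pairs each row's predictors with the next row's label (data_clasificacion_alt and tmin_next are hypothetical names for this sketch):

# a sketch of the same shift using dplyr::lead; the last row has no
# next-day label, so it is dropped
data_clasificacion_alt <- junin %>%
  mutate(tmin_next = dplyr::lead(tmin_helada$tmin)) %>%
  filter(!is.na(tmin_next))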

Convert the tmin column to a factor

data_clasificacion$tmin <- as.factor(data_clasificacion$tmin)

Explore the dataset

DataExplorer::plot_boxplot(data_clasificacion, by="tmin")

We can see that we are working with an imbalanced dataset:

table(data_clasificacion$tmin)
## 
##    helada no-helada 
##       612      4905
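Frost days are roughly 11% of the observations. A quick way to see the proportions (a sketch):

data_clasificacion %>% count(tmin) %>% mutate(prop = n / sum(n))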

Training setup

set.seed(4848) # set the seed
base_split <- data_clasificacion %>%
          initial_split(prop=0.75) # 75/25 train/test split

# to extract the training and testing datasets 
base_train <- training(base_split)
base_testing <- testing(base_split)

base_split
## <Analysis/Assess/Total>
## <4138/1379/5517>
# stratified cross-validation folds
base_folds <- vfold_cv(base_train, strata = tmin) 

We will train on 4138 samples and test on 1379, out of 5517 samples in total.
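Because the folds are stratified on tmin, each assessment fold should keep roughly the same helada proportion as the full training set; a quick check (a sketch):

# a sketch: inspect the class balance in one fold's assessment set
base_folds$splits[[1]] %>% assessment() %>% count(tmin)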

Now I specify the recipe for training.

# recipe
set.seed(4848) # set the seed

base_recipe <- training(base_split) %>%
  recipe(tmin ~.) %>%
  step_corr(all_numeric()) %>%
  step_normalize(all_numeric(), -all_outcomes())  %>%
  step_smote(tmin) 

base_recipe
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          7
## 
## Operations:
## 
## Correlation filter on all_numeric()
## Centering and scaling for all_numeric(), -all_outcomes()
## SMOTE based on tmin
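To verify what the recipe will actually feed the model, it can be prepped and juiced; after step_smote the two classes should be roughly balanced (a sketch, output not shown):

# a sketch: materialize the preprocessed training data and check the balance
base_recipe %>% prep() %>% juice() %>% count(tmin)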

Random forest model for Junín alone, predicting helada vs. no-helada

# model
base_rf_spec <- rand_forest() %>% 
  set_engine("ranger") %>% 
  set_mode("classification")
# random forest fit
set.seed(4848)

base_rf_model <- base_rf_spec %>% 
  fit_resamples(tmin ~ ., 
                resamples = base_folds)
base_rf_model %>% collect_metrics() 
## # A tibble: 2 x 5
##   .metric  .estimator  mean     n std_err
##   <chr>    <chr>      <dbl> <int>   <dbl>
## 1 accuracy binary     0.937    10 0.00346
## 2 roc_auc  binary     0.969    10 0.00265
show_best(base_rf_model, metric = "accuracy")
## # A tibble: 1 x 5
##   .metric  .estimator  mean     n std_err
##   <chr>    <chr>      <dbl> <int>   <dbl>
## 1 accuracy binary     0.937    10 0.00346
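Note that this fit_resamples() call uses the bare formula, so the resampled models see the raw predictors, without the recipe's correlation filter, normalization, or SMOTE. To resample with the preprocessing applied inside each fold, a workflow can be passed instead (wf_cv is a hypothetical name; a sketch); the next section builds exactly such a workflow for the final fit.

# a sketch: cross-validate with the recipe applied within each fold
wf_cv <- workflow() %>%
  add_recipe(base_recipe) %>%
  add_model(base_rf_spec)
fit_resamples(wf_cv, resamples = base_folds) %>% collect_metrics()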

Prediction on the test set

# build the workflow
wf <- workflow() %>%
  add_recipe(base_recipe) %>% # add the recipe
  add_model(base_rf_spec) # add the model
wf 
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## ● step_corr()
## ● step_normalize()
## ● step_smote()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Computational engine: ranger
# use the last_fit function with the workflow and the data split
final_fit <- last_fit(wf,split = base_split)


final_fit %>%
  collect_metrics()
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.905
## 2 roc_auc  binary         0.947
final_fit$.predictions[[1]] %>% recall(truth= tmin, .pred_class) 
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  binary         0.802
final_fit$.predictions[[1]] %>% precision(truth= tmin, .pred_class) 
## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.568
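yardstick takes the first factor level, helada, as the event of interest by default, so recall is the fraction of actual frosts that were caught and precision is the fraction of predicted frosts that were real. The confusion matrix behind these numbers can be printed directly (a sketch):

final_fit$.predictions[[1]] %>% conf_mat(truth = tmin, .pred_class)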

PR curve

final_fit$splits[[1]]
## <Analysis/Assess/Total>
## <4138/1379/5517>
final_fit$.predictions[[1]] %>% nrow()
## [1] 1379
final_fit$.predictions[[1]]
## # A tibble: 1,379 x 5
##    .pred_helada `.pred_no-helada`  .row .pred_class tmin     
##           <dbl>             <dbl> <int> <fct>       <fct>    
##  1      0                   1         2 no-helada   no-helada
##  2      0.00025             1.00      4 no-helada   no-helada
##  3      0                   1         6 no-helada   no-helada
##  4      0                   1        10 no-helada   no-helada
##  5      0                   1        12 no-helada   no-helada
##  6      0.0045              0.996    19 no-helada   no-helada
##  7      0.00533             0.995    21 no-helada   no-helada
##  8      0.00557             0.994    23 no-helada   no-helada
##  9      0                   1        32 no-helada   no-helada
## 10      0                   1        33 no-helada   no-helada
## # … with 1,369 more rows
final_fit$.predictions[[1]] %>% pr_curve(truth=tmin,.pred_helada) %>% autoplot()

ROC curve

final_fit$.predictions[[1]] %>% roc_curve(truth=tmin,.pred_helada) %>% autoplot()

With ggplot

library(ggplot2)
library(dplyr)
final_fit$.predictions[[1]] %>% 
  roc_curve(truth=tmin,.pred_helada) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  geom_abline(lty = 3) +
  coord_equal() +
  theme_bw()
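The area under this curve is the roc_auc already reported by collect_metrics(); it can also be computed directly from the predictions (a sketch):

final_fit$.predictions[[1]] %>% roc_auc(truth = tmin, .pred_helada)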

Manual fit and additional metrics

model_rf <- wf %>% fit(data=base_train)


#model_rf <- wf %>% fit_resamples(resamples = base_folds)
model_rf %>% pull_workflow_fit()
## parsnip model object
## 
## Fit time:  2.9s 
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      7376 
## Number of independent variables:  5 
## Mtry:                             2 
## Target node size:                 10 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.04072297
model_rf %>% 
  predict( new_data = base_testing) %>% 
  bind_cols(base_testing["tmin"]) %>% 
  precision(truth= as.factor(tmin), .pred_class) 
## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.558
model_rf %>% 
  predict( new_data = base_testing) %>% 
  bind_cols(base_testing["tmin"]) %>% 
  bal_accuracy(truth= as.factor(tmin), .pred_class) 
## # A tibble: 1 x 3
##   .metric      .estimator .estimate
##   <chr>        <chr>          <dbl>
## 1 bal_accuracy binary         0.856
model_rf %>% 
  predict( new_data = base_testing) %>% 
  bind_cols(base_testing["tmin"]) %>% 
  recall(truth= as.factor(tmin), .pred_class) 
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  binary         0.796
model_rf %>% 
  predict( new_data = base_testing) %>% 
  bind_cols(base_testing["tmin"]) %>% 
  kap(truth= as.factor(tmin), .pred_class) 
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 kap     binary         0.601
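The four pipelines above repeat the same predict-and-bind steps and differ only in the metric; a metric_set collapses them into a single call (multi_metrics is a hypothetical name; a sketch):

# a sketch: compute several classification metrics in one pass
multi_metrics <- metric_set(precision, recall, bal_accuracy, kap)
model_rf %>%
  predict(new_data = base_testing) %>%
  bind_cols(base_testing["tmin"]) %>%
  multi_metrics(truth = tmin, estimate = .pred_class)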

Random forest model for Junín with the other stations, predicting frost

Dataset

I will take only the Junín and Tunuyán stations.

junin_tunuyan <- dacc_daily_tmin %>% 
  select(starts_with("junin") | starts_with("tunuyan") ) #%>% 
  #rename_with( ~ tolower(gsub("junin.", "", .x, fixed = TRUE))) # strip the "junin." prefix from the column names

I want to predict the minimum temperature: whether or not it is a frost.

tmin <- junin_tunuyan %>% 
  select(junin.temp_min) 

colnames(tmin) <- "tmin"
tmin_helada <- tmin

tmin_helada <- tmin_helada %>% mutate(tmin = case_when(
  tmin <= 0 ~ "helada",  # frost event
  TRUE ~ "no-helada"     # no frost
))

Create the dataset shifted by one day to predict the next day's label

data_tmin_junin_tunu <- cbind(junin_tunuyan[1:(nrow(junin_tunuyan)-1),],tmin[2:nrow(tmin),])
data_clasificacion_junin_tunu <- cbind(junin_tunuyan[1:(nrow(junin_tunuyan)-1),],tmin_helada[2:nrow(tmin_helada),])

Convert the tmin column to a factor

data_clasificacion_junin_tunu$tmin <- as.factor(data_clasificacion_junin_tunu$tmin)

Training setup

set.seed(4848) # set the seed
base_split_jt <- data_clasificacion_junin_tunu %>%
          initial_split(prop=0.75) # 75/25 train/test split

# to extract the training and testing datasets 
base_train_jt <- training(base_split_jt)
base_testing_jt <- testing(base_split_jt)

base_split_jt
## <Analysis/Assess/Total>
## <4138/1379/5517>
# stratified cross-validation folds
base_folds_jt <- vfold_cv(base_train_jt, strata = tmin) 

We will use the same recipe and model as when training on Junín alone.

# recipe
set.seed(4848) # set the seed

base_recipe_jt <- training(base_split_jt) %>%
  recipe(tmin ~.) %>%
  step_corr(all_numeric()) %>%
  step_normalize(all_numeric(), -all_outcomes())  %>%
  step_smote(tmin) 
base_recipe_jt

Cross-validation fit

# random forest fit
set.seed(123)

base_rf_model_jt <- base_rf_spec %>% 
  fit_resamples(tmin ~ ., 
                resamples = base_folds_jt)
base_rf_model_jt %>% collect_metrics() 
## # A tibble: 2 x 5
##   .metric  .estimator  mean     n std_err
##   <chr>    <chr>      <dbl> <int>   <dbl>
## 1 accuracy binary     0.939    10 0.00392
## 2 roc_auc  binary     0.971    10 0.00237
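To put the cross-validation results of the two feature sets side by side (a sketch):

# a sketch: compare CV metrics for Junín alone vs. Junín + Tunuyán
bind_rows(
  collect_metrics(base_rf_model) %>% add_column(model = "Junin"),
  collect_metrics(base_rf_model_jt) %>% add_column(model = "Junin+Tunuyan")
)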

Prediction on the test set

# build the workflow
wf_jt <- workflow() %>%
  add_recipe(base_recipe_jt) %>% # add the recipe
  add_model(base_rf_spec) # add the model
wf_jt 
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## ● step_corr()
## ● step_normalize()
## ● step_smote()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Computational engine: ranger
# use the last_fit function with the workflow and the data split
final_fit_jt <- last_fit(wf_jt,split = base_split_jt)


final_fit_jt %>%
  collect_metrics()
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.920
## 2 roc_auc  binary         0.961
# confusion table, Junín-only model
final_fit$.predictions[[1]] %>% select(.pred_class, tmin) %>% table()
##            tmin
## .pred_class helada no-helada
##   helada       130        99
##   no-helada     32      1118
# confusion table, Junín + Tunuyán model
final_fit_jt$.predictions[[1]] %>% select(.pred_class, tmin) %>% table()
##            tmin
## .pred_class helada no-helada
##   helada       134        83
##   no-helada     28      1134
final_fit_jt$.predictions[[1]] %>% recall(truth= tmin, .pred_class) 
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  binary         0.827
final_fit_jt$.predictions[[1]] %>% precision(truth= tmin, .pred_class) 
## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.618
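The two models can also be compared on F1, which folds the recall/precision trade-off into a single number (a sketch):

final_fit$.predictions[[1]] %>% f_meas(truth = tmin, .pred_class)    # Junín only
final_fit_jt$.predictions[[1]] %>% f_meas(truth = tmin, .pred_class) # Junín + Tunuyán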

PR curve

final_fit_jt$.predictions[[1]] %>% pr_curve(truth=tmin,.pred_helada) %>% autoplot()

ROC curve

final_fit_jt$.predictions[[1]] %>% roc_curve(truth=tmin,.pred_helada) %>% autoplot()

Combining two ROC or two PR curves

PR curve

tablita <- final_fit_jt$.predictions[[1]] %>% pr_curve(truth=tmin,.pred_helada) %>% add_column(model="Junin+Tunuyan")
final_fit$.predictions[[1]] %>% 
  pr_curve(truth=tmin,.pred_helada) %>% 
  add_column(model="Junin") %>% 
  bind_rows(tablita) %>% 
  ggplot(aes(x=recall,y=precision,color=model)) +
  geom_line()

ROC curve

final_fit$.predictions[[1]] %>% roc_curve(truth=tmin,.pred_helada)
## # A tibble: 880 x 3
##     .threshold specificity sensitivity
##          <dbl>       <dbl>       <dbl>
##  1 -Inf              0               1
##  2    0              0               1
##  3    0.0002         0.316           1
##  4    0.000222       0.318           1
##  5    0.00025        0.322           1
##  6    0.000286       0.324           1
##  7    0.000333       0.326           1
##  8    0.0004         0.330           1
##  9    0.000444       0.334           1
## 10    0.0005         0.337           1
## # … with 870 more rows
tt <- final_fit$.predictions[[1]] %>% roc_curve(truth=tmin,.pred_helada) %>% add_column(model = "Junin")
final_fit_jt$.predictions[[1]] %>% 
  roc_curve(truth=tmin,.pred_helada)  %>% 
  add_column(model="Junin+Tunuyan") %>% 
  bind_rows(tt) %>% 
  ggplot(aes(x=(1-specificity),y=sensitivity,color=model)) +
  geom_line()
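As a numeric counterpart to the overlaid curves, the AUCs can be computed per model and bound into one table (a sketch):

# a sketch: one AUC per model
bind_rows(
  final_fit$.predictions[[1]] %>% roc_auc(truth = tmin, .pred_helada) %>% add_column(model = "Junin"),
  final_fit_jt$.predictions[[1]] %>% roc_auc(truth = tmin, .pred_helada) %>% add_column(model = "Junin+Tunuyan")
)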