library(tidymodels) # for machine learning
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1 ──
## ✓ broom 0.7.2 ✓ recipes 0.1.15
## ✓ dials 0.0.9 ✓ rsample 0.0.8
## ✓ dplyr 1.0.2 ✓ tibble 3.0.4
## ✓ ggplot2 3.3.2 ✓ tidyr 1.1.2
## ✓ infer 0.5.3 ✓ tune 0.1.1
## ✓ modeldata 0.1.0 ✓ workflows 0.2.1
## ✓ parsnip 0.1.4 ✓ yardstick 0.0.7
## ✓ purrr 0.3.4
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x recipes::step() masks stats::step()
library(keras) # for neural networks
##
## Attaching package: 'keras'
## The following object is masked from 'package:yardstick':
##
## get_weights
library(ranger) # for random forest
library(themis) # SMOTE, ROSE, etc.
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
library(DataExplorer)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:yardstick':
##
## spec
## The following object is masked from 'package:scales':
##
## col_factor
dacc_daily_tmin <- read_csv("data/dacc-daily-tmin.csv",
col_types = cols(X1 = col_skip(), date = col_date(format = "%Y-%m-%d")))
## Warning: Missing column names filled in: 'X1' [1]
#View(dacc_daily_tmin)
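Before subsetting, a quick structural check can be useful; a minimal sketch (nothing below depends on it):
dplyr::glimpse(dacc_daily_tmin) # column names and types, one block of columns per station
DataExplorer::plot_missing(dacc_daily_tmin) # fraction of missing values per column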
I will only use the Junín station.
junin <- dacc_daily_tmin %>%
  select(starts_with("junin")) %>%
  rename_with(~ tolower(gsub("junin.", "", .x, fixed = TRUE))) # strip the "junin." prefix from the column names
I want to predict the minimum temperature: whether or not it is a frost (helada).
tmin <- junin %>%
select(temp_min)
colnames(tmin) <- "tmin"
tmin_helada <- tmin %>% mutate(tmin = case_when(
  tmin <= 0 ~ "helada", # frost event
  TRUE ~ "no-helada" # no frost
))
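Since the label is a simple binary cut, the same result can also be written with if_else; purely a readability note, both forms are equivalent:
tmin_helada <- tmin %>%
  mutate(tmin = if_else(tmin <= 0, "helada", "no-helada")) # same labels as the case_when above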
I create the dataset shifted one day ahead: today's observations are paired with tomorrow's label.
data_tmin <- cbind(junin[1:(nrow(junin) - 1), ], tmin[2:nrow(tmin), ])
data_clasificacion <- cbind(junin[1:(nrow(junin) - 1), ], tmin_helada[2:nrow(tmin_helada), ])
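The cbind indexing does the one-day shift by hand. An equivalent construction with dplyr::lead, shown only as an alternative sketch:
data_clasificacion <- junin %>%
  mutate(tmin = dplyr::lead(tmin_helada$tmin)) %>% # tomorrow's label next to today's observations
  slice(1:(n() - 1)) # the last day has no tomorrow, so drop it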
I convert the tmin column to a factor.
data_clasificacion$tmin <- as.factor(data_clasificacion$tmin)
DataExplorer::plot_boxplot(data_clasificacion, by = "tmin")
Notice that we are working with an imbalanced dataset:
table(data_clasificacion$tmin)
##
## helada no-helada
## 612 4905
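Frost events are roughly 11% of the observations (612 / 5517), which is what motivates the SMOTE step later on:
data_clasificacion %>%
  count(tmin) %>%
  mutate(prop = n / sum(n)) # helada ≈ 0.11, no-helada ≈ 0.89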
set.seed(4848) # set the seed
base_split <- data_clasificacion %>%
  initial_split(prop = 0.75) # 75% for training
# to extract the training and testing datasets
base_train <- training(base_split)
base_testing <- testing(base_split)
base_split
## <Analysis/Assess/Total>
## <4138/1379/5517>
# stratified cross-validation folds
base_folds <- vfold_cv(base_train, strata = tmin)
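The folds are stratified by tmin, but the initial train/test split above was not, so the test-set frost rate can drift from 11% by chance. If that matters, initial_split() also accepts a strata argument; a sketch of the stratified variant:
base_split_strat <- data_clasificacion %>%
  initial_split(prop = 0.75, strata = tmin) # preserves the class proportions in both partitions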
We will train on 4138 samples and test on 1379, out of 5517 samples in total.
Now I specify the preprocessing recipe for training.
# recipe
set.seed(4848) # set the seed
base_recipe <- training(base_split) %>%
  recipe(tmin ~ .) %>%
  step_corr(all_numeric()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>%
  step_smote(tmin)
base_recipe
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 7
##
## Operations:
##
## Correlation filter on all_numeric()
## Centering and scaling for all_numeric(), -all_outcomes()
## SMOTE based on tmin
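The recipe is only a specification at this point; to confirm that SMOTE actually balances the training classes, one can prep it and juice out the processed training set:
base_recipe %>%
  prep() %>%
  juice() %>%
  count(tmin) # after SMOTE both classes should have the same number of rows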
# model
base_rf_spec <- rand_forest() %>%
set_engine("ranger") %>%
set_mode("classification")
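This spec relies on ranger's defaults (500 trees, mtry near the square root of the number of predictors, minimum node size 10 for probability trees). If one wanted to tune instead, the hyperparameters can be marked with tune(); a sketch only, not run here:
base_rf_tune_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("classification")
# a workflow built on this spec could then be searched with tune_grid() over base_folds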
# random forest fit
set.seed(4848)
base_rf_model <- base_rf_spec %>%
fit_resamples(tmin ~ .,
resamples = base_folds)
base_rf_model %>% collect_metrics()
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.937 10 0.00346
## 2 roc_auc binary 0.969 10 0.00265
show_best(base_rf_model, metric = "accuracy")
## # A tibble: 1 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.937 10 0.00346
# assemble the workflow
wf <- workflow() %>%
  add_recipe(base_recipe) %>% # add the recipe
  add_model(base_rf_spec) # add the model
wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## ● step_corr()
## ● step_normalize()
## ● step_smote()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Computational engine: ranger
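Note that the fit_resamples() call earlier was given the bare formula tmin ~ ., so those cross-validated metrics did not include the correlation filter, normalization, or SMOTE. To resample the full preprocessing pipeline, the workflow itself can be passed (the commented-out line further down does exactly this); a sketch:
set.seed(4848)
wf %>%
  fit_resamples(resamples = base_folds) %>% # recipe steps are re-applied within each fold
  collect_metrics()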
# use last_fit() with the workflow and the data split
final_fit <- last_fit(wf, split = base_split)
final_fit %>%
collect_metrics()
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.905
## 2 roc_auc binary 0.947
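Below, the test-set predictions are pulled out of the list column by hand; collect_predictions() is the equivalent tidy accessor and returns essentially the same tibble (plus a resample id column):
test_preds <- collect_predictions(final_fit) # same rows as final_fit$.predictions[[1]]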
final_fit$.predictions[[1]] %>% recall(truth = tmin, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.802
final_fit$.predictions[[1]] %>% precision(truth = tmin, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.568
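Recall and precision can be summarized into a single F1 score, which yardstick exposes as f_meas (the harmonic mean; about 0.665 given the two values above):
final_fit$.predictions[[1]] %>%
  f_meas(truth = tmin, .pred_class) # 2 * precision * recall / (precision + recall)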
final_fit$splits[[1]]
## <Analysis/Assess/Total>
## <4138/1379/5517>
final_fit$.predictions[[1]] %>% nrow()
## [1] 1379
final_fit$.predictions[[1]]
## # A tibble: 1,379 x 5
## .pred_helada `.pred_no-helada` .row .pred_class tmin
## <dbl> <dbl> <int> <fct> <fct>
## 1 0 1 2 no-helada no-helada
## 2 0.00025 1.00 4 no-helada no-helada
## 3 0 1 6 no-helada no-helada
## 4 0 1 10 no-helada no-helada
## 5 0 1 12 no-helada no-helada
## 6 0.0045 0.996 19 no-helada no-helada
## 7 0.00533 0.995 21 no-helada no-helada
## 8 0.00557 0.994 23 no-helada no-helada
## 9 0 1 32 no-helada no-helada
## 10 0 1 33 no-helada no-helada
## # … with 1,369 more rows
final_fit$.predictions[[1]] %>% pr_curve(truth = tmin, .pred_helada) %>% autoplot()
final_fit$.predictions[[1]] %>% roc_curve(truth = tmin, .pred_helada) %>% autoplot()
With ggplot:
library(ggplot2)
library(dplyr)
final_fit$.predictions[[1]] %>%
  roc_curve(truth = tmin, .pred_helada) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_path() +
geom_abline(lty = 3) +
coord_equal() +
theme_bw()
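yardstick can also render the test-set confusion matrix directly; a sketch using conf_mat() and its autoplot method:
final_fit$.predictions[[1]] %>%
  conf_mat(truth = tmin, .pred_class) %>%
  autoplot(type = "heatmap") # counts per truth/prediction cell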
model_rf <- wf %>% fit(data = base_train)
#model_rf <- wf %>% fit_resamples(resamples = base_folds)
model_rf %>% pull_workflow_fit()
## parsnip model object
##
## Fit time: 2.9s
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 500
## Sample size: 7376
## Number of independent variables: 5
## Mtry: 2
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.04072297
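The fit above reports Variable importance mode: none. A sketch of how one could refit asking ranger for impurity importance and read it off the underlying ranger object (rf_imp_spec and fit_imp are names introduced here for illustration):
rf_imp_spec <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>% # ask ranger to track importance
  set_mode("classification")
fit_imp <- wf %>%
  update_model(rf_imp_spec) %>%
  fit(data = base_train)
ranger::importance(pull_workflow_fit(fit_imp)$fit) # named vector, one value per predictor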
model_rf %>%
  predict(new_data = base_testing) %>%
  bind_cols(base_testing["tmin"]) %>%
  precision(truth = as.factor(tmin), .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.558
model_rf %>%
  predict(new_data = base_testing) %>%
  bind_cols(base_testing["tmin"]) %>%
  bal_accuracy(truth = as.factor(tmin), .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 bal_accuracy binary 0.856
model_rf %>%
  predict(new_data = base_testing) %>%
  bind_cols(base_testing["tmin"]) %>%
  recall(truth = as.factor(tmin), .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.796
model_rf %>%
  predict(new_data = base_testing) %>%
  bind_cols(base_testing["tmin"]) %>%
  kap(truth = as.factor(tmin), .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 kap binary 0.601
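The four pipelines above repeat the same predict/bind_cols boilerplate; yardstick's metric_set() computes them all in a single pass. A compact equivalent:
class_metrics <- metric_set(precision, bal_accuracy, recall, kap)
model_rf %>%
  predict(new_data = base_testing) %>%
  bind_cols(base_testing["tmin"]) %>%
  class_metrics(truth = as.factor(tmin), estimate = .pred_class)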
Now I will use both the Junín and Tunuyán stations.
junin_tunuyan <- dacc_daily_tmin %>%
  select(starts_with("junin") | starts_with("tunuyan")) #%>%
  #rename_with(~ tolower(gsub("junin.", "", .x, fixed = TRUE))) # strip the "junin." prefix (disabled here, so the station prefixes are kept)
Again, I want to predict whether or not tomorrow's minimum temperature is a frost.
tmin <- junin_tunuyan %>%
select(junin.temp_min)
colnames(tmin) <- "tmin"
tmin_helada <- tmin %>% mutate(tmin = case_when(
  tmin <= 0 ~ "helada", # frost event
  TRUE ~ "no-helada" # no frost
))
I create the one-day-shifted dataset to predict, as before.
data_tmin_junin_tunu <- cbind(junin_tunuyan[1:(nrow(junin_tunuyan) - 1), ], tmin[2:nrow(tmin), ])
data_clasificacion_junin_tunu <- cbind(junin_tunuyan[1:(nrow(junin_tunuyan) - 1), ], tmin_helada[2:nrow(tmin_helada), ])
I convert the tmin column to a factor.
data_clasificacion_junin_tunu$tmin <- as.factor(data_clasificacion_junin_tunu$tmin)
set.seed(4848) # set the seed
base_split_jt <- data_clasificacion_junin_tunu %>%
  initial_split(prop = 0.75) # 75% for training
# to extract the training and testing datasets
base_train_jt <- training(base_split_jt)
base_testing_jt <- testing(base_split_jt)
base_split_jt
## <Analysis/Assess/Total>
## <4138/1379/5517>
# stratified cross-validation folds
base_folds_jt <- vfold_cv(base_train_jt, strata = tmin)
We will use the same recipe and model specification as for the Junín-only training.
# recipe
set.seed(4848) # set the seed
base_recipe_jt <- training(base_split_jt) %>%
  recipe(tmin ~ .) %>%
  step_corr(all_numeric()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>%
  step_smote(tmin)
base_recipe_jt
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 14
##
## Operations:
##
## Correlation filter on all_numeric()
## Centering and scaling for all_numeric(), -all_outcomes()
## SMOTE based on tmin
Cross-validation fit:
# random forest fit
set.seed(123)
base_rf_model_jt <- base_rf_spec %>%
fit_resamples(tmin ~ .,
resamples = base_folds_jt)
base_rf_model_jt %>% collect_metrics()
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.939 10 0.00392
## 2 roc_auc binary 0.971 10 0.00237
# assemble the workflow
wf_jt <- workflow() %>%
  add_recipe(base_recipe_jt) %>% # add the recipe
  add_model(base_rf_spec) # add the model
wf_jt
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## ● step_corr()
## ● step_normalize()
## ● step_smote()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Computational engine: ranger
# use last_fit() with the workflow and the data split
final_fit_jt <- last_fit(wf_jt, split = base_split_jt)
final_fit_jt %>%
collect_metrics()
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.920
## 2 roc_auc binary 0.961
Compare the test-set confusion matrix of the Junín-only model with that of the Junín+Tunuyán model:
final_fit$.predictions[[1]] %>% select(.pred_class, tmin) %>% table()
## tmin
## .pred_class helada no-helada
## helada 130 99
## no-helada 32 1118
final_fit_jt$.predictions[[1]] %>% select(.pred_class, tmin) %>% table()
## tmin
## .pred_class helada no-helada
## helada 134 83
## no-helada 28 1134
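Reading the second table: the combined model's frost recall is 134 / (134 + 28) ≈ 0.827 and its precision is 134 / (134 + 83) ≈ 0.618, matching the yardstick output below; both improve on the Junín-only model (0.802 and 0.568).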
final_fit_jt$.predictions[[1]] %>% recall(truth = tmin, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.827
final_fit_jt$.predictions[[1]] %>% precision(truth = tmin, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.618
final_fit_jt$.predictions[[1]] %>% pr_curve(truth = tmin, .pred_helada) %>% autoplot()
final_fit_jt$.predictions[[1]] %>% roc_curve(truth = tmin, .pred_helada) %>% autoplot()
tablita <- final_fit_jt$.predictions[[1]] %>%
  pr_curve(truth = tmin, .pred_helada) %>%
  add_column(model = "Junin+Tunuyan")
final_fit$.predictions[[1]] %>%
  pr_curve(truth = tmin, .pred_helada) %>%
  add_column(model = "Junin") %>%
  bind_rows(tablita) %>%
  ggplot(aes(x = recall, y = precision, color = model)) +
  geom_line()
final_fit$.predictions[[1]] %>% roc_curve(truth = tmin, .pred_helada)
## # A tibble: 880 x 3
## .threshold specificity sensitivity
## <dbl> <dbl> <dbl>
## 1 -Inf 0 1
## 2 0 0 1
## 3 0.0002 0.316 1
## 4 0.000222 0.318 1
## 5 0.00025 0.322 1
## 6 0.000286 0.324 1
## 7 0.000333 0.326 1
## 8 0.0004 0.330 1
## 9 0.000444 0.334 1
## 10 0.0005 0.337 1
## # … with 870 more rows
tt <- final_fit$.predictions[[1]] %>%
  roc_curve(truth = tmin, .pred_helada) %>%
  add_column(model = "Junin")
final_fit_jt$.predictions[[1]] %>%
  roc_curve(truth = tmin, .pred_helada) %>%
  add_column(model = "Junin+Tunuyan") %>%
  bind_rows(tt) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
  geom_line()
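To summarize the two PR curves in a single number, one could also compare the areas under them; a sketch using yardstick's pr_auc():
bind_rows(
  final_fit$.predictions[[1]] %>%
    pr_auc(truth = tmin, .pred_helada) %>%
    add_column(model = "Junin"),
  final_fit_jt$.predictions[[1]] %>%
    pr_auc(truth = tmin, .pred_helada) %>%
    add_column(model = "Junin+Tunuyan")
) # a larger area means a better overall precision/recall tradeoff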