ML-Mejorando Modelos Supervisados

##1. Carga, limpieza de datos y librerías necesarias

library(textrecipes)

## Warning: package 'textrecipes' was built under R version 4.0.4

## Loading required package: recipes

## Warning: package 'recipes' was built under R version 4.0.4

## Loading required package: dplyr

## Warning: package 'dplyr' was built under R version 4.0.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## 
## Attaching package: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(stopwords)

## Warning: package 'stopwords' was built under R version 4.0.4

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0

## Warning: package 'ggplot2' was built under R version 4.0.3

## Warning: package 'tibble' was built under R version 4.0.4

## Warning: package 'tidyr' was built under R version 4.0.4

## Warning: package 'readr' was built under R version 4.0.3

## Warning: package 'purrr' was built under R version 4.0.3

## Warning: package 'stringr' was built under R version 4.0.3

## Warning: package 'forcats' was built under R version 4.0.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag()     masks stats::lag()

library(themis)

## Warning: package 'themis' was built under R version 4.0.4

## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes

## 
## Attaching package: 'themis'

## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.0.4

## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --

## v broom     0.7.5     v rsample   0.0.9
## v dials     0.0.9     v tune      0.1.3
## v infer     0.5.4     v workflows 0.2.2
## v modeldata 0.1.0     v yardstick 0.0.7
## v parsnip   0.1.5

## Warning: package 'dials' was built under R version 4.0.4

## Warning: package 'scales' was built under R version 4.0.4

## Warning: package 'infer' was built under R version 4.0.4

## Warning: package 'modeldata' was built under R version 4.0.4

## Warning: package 'parsnip' was built under R version 4.0.4

## Warning: package 'rsample' was built under R version 4.0.4

## Warning: package 'tune' was built under R version 4.0.4

## Warning: package 'workflows' was built under R version 4.0.4

## Warning: package 'yardstick' was built under R version 4.0.4

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard()         masks purrr::discard()
## x dplyr::filter()           masks stats::filter()
## x stringr::fixed()          masks recipes::fixed()
## x dplyr::lag()              masks stats::lag()
## x yardstick::spec()         masks readr::spec()
## x recipes::step()           masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample()   masks recipes::step_upsample()

library(glmnet)

## Warning: package 'glmnet' was built under R version 4.0.4

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-1

library(readr)

# Switch the session to the author's project folder and load the labelled
# messages from it (the relative file name is resolved against that folder).
# NOTE(review): this absolute path is specific to the original author's
# machine; adjust it before running elsewhere.
setwd(dir = "C:/Users/carlos mario/Dropbox/Data Analysis/R Directory/Carlos Mario")
spamData <- readr::read_csv(file = "spamData.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   class = col_character(),
##   message = col_character()
## )

#Proceso de limpieza

#' Normalise raw message text before tokenisation.
#'
#' Every run of characters that are not ASCII letters (digits, punctuation,
#' whitespace, control and non-ASCII characters) is replaced by ONE space and
#' the result is lower-cased. Note that digits are removed, not kept.
#' Leading/trailing runs also become a single space; they are not trimmed.
#'
#' @param string Character vector of messages. `NA` elements stay `NA`.
#' @return Character vector of the same length containing only `a-z` and
#'   single spaces.
Clean_String <- function(string) {
  # Re-encode the input; with sub = "byte" iconv writes non-convertible bytes
  # as "<xx>" escapes, which the substitution below then reduces to letters
  # and spaces like any other text.
  utf8_text <- iconv(enc2utf8(string), sub = "byte")

  # One pass does the work of the former three (non-graph -> space,
  # non-letter -> space, collapse whitespace): anything outside a-z/A-Z,
  # taken as a maximal run, becomes one space. perl = TRUE keeps the a-z
  # ranges code-point based instead of locale dependent.
  letters_only <- gsub("[^a-zA-Z]+", " ", utf8_text, perl = TRUE)

  tolower(letters_only)
}

# Overwrite the raw message column with its cleaned version.
spamData[["message"]] <- Clean_String(spamData[["message"]])

##2.Muestra de entrenamiento y prueba

# Fix the RNG seed so the random partition below is reproducible.
set.seed(1234)

# Randomly assign 70% of the rows to training; the rest is held out for
# testing.
emails_split <- rsample::initial_split(data = spamData, prop = 0.7)

emails_train <- rsample::training(emails_split)
emails_test <- rsample::testing(emails_split)

#2.1 Verificación del tamaño de las clases

##3.Creación de receta de preprocesamiento de datos

# Base recipe: `class` is the outcome, `message` the only predictor, with the
# training partition as the data template.
emails_recipe <- recipes::recipe(class ~ message, data = emails_train)

# Text preprocessing steps, in the order they are applied.
emails_recipeProcessed <- emails_recipe %>%
  # Split each message into tokens (default tokenizer).
  textrecipes::step_tokenize(message) %>%
  # Drop stop words (keep = FALSE removes them rather than keeping only them).
  textrecipes::step_stopwords(message, keep = FALSE) %>%
  # Join the surviving tokens back into one string so it can be re-tokenized.
  textrecipes::step_untokenize(message) %>%
  # Re-tokenize into n-grams of length 1 to 4.
  textrecipes::step_tokenize(
    message,
    token = "ngrams",
    options = list(n = 4, n_min = 1)
  ) %>%
  # Cap the vocabulary at 100 tokens.
  textrecipes::step_tokenfilter(message, max_tokens = 100) %>%
  # Turn the retained tokens into tf-idf features.
  textrecipes::step_tfidf(message)

##4.Ajuste del modelo inicial

# Workflow that carries only the preprocessing recipe.
emails_wf <- add_recipe(workflow(), emails_recipeProcessed)

# Model specification: logistic regression fitted with glmnet.
# mixture = 1 selects a pure LASSO (L1) penalty; the penalty strength is left
# as a tuning placeholder to be chosen by cross-validation.
rlasso_spec <- set_engine(
  logistic_reg(penalty = tune(), mixture = 1),
  engine = "glmnet"
)

# Print the specification.
rlasso_spec

## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

# Full modelling workflow: the recipe-only workflow built above plus the
# LASSO model specification.
lasso_wf <- add_model(emails_wf, rlasso_spec)

# Print the workflow (preprocessor steps and model).
lasso_wf

## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 6 Recipe Steps
## 
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

##5. Medición de métricas mediante validación cruzada

# Draw 25 random candidate values for the penalty hyper-parameter.
lambda_grid <- dials::grid_random(dials::penalty(), size = 25)

lambda_grid

## # A tibble: 25 x 1
##        penalty
##          <dbl>
##  1 0.00000577 
##  2 0.0000778  
##  3 0.000000124
##  4 0.00000475 
##  5 0.000308   
##  6 0.0222     
##  7 0.000387   
##  8 0.275      
##  9 0.00451    
## 10 0.00000313 
## # ... with 15 more rows

# Seed the RNG, then split the training data into 7 cross-validation folds.
set.seed(123)
emails_folds <- rsample::vfold_cv(data = emails_train, v = 7)
emails_folds

## #  7-fold cross-validation 
## # A tibble: 7 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3343/558]> Fold1
## 2 <split [3343/558]> Fold2
## 3 <split [3344/557]> Fold3
## 4 <split [3344/557]> Fold4
## 5 <split [3344/557]> Fold5
## 6 <split [3344/557]> Fold6
## 7 <split [3344/557]> Fold7

##6. Creación del modelo final y validación de métricas

# Fit the workflow on every fold for each candidate penalty in `lambda_grid`
# and score it with F-measure, recall and precision. Out-of-fold predictions
# are kept (save_pred = TRUE) so they can be inspected later.
# control_grid() is the control constructor that belongs to tune_grid();
# control_resamples() is the one for fit_resamples().
# NOTE(review): `class` is read as character with values ham/spam, so the
# metrics presumably treat "ham" (first factor level) as the event; confirm
# this is intended, or set the event level / factor order explicitly.

set.seed(2020)
lasso_grid <- tune_grid(
  lasso_wf,
  resamples = emails_folds,
  grid = lambda_grid,
  control = control_grid(save_pred = TRUE),
  metrics = metric_set(f_meas, recall, precision)
)

## Warning: package 'rlang' was built under R version 4.0.3

## Warning: package 'vctrs' was built under R version 4.0.3

lasso_grid

## # Tuning results
## # 7-fold cross-validation 
## # A tibble: 7 x 5
##   splits             id    .metrics         .notes          .predictions        
##   <list>             <chr> <list>           <list>          <list>              
## 1 <split [3343/558]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,950 x 5~
## 2 <split [3343/558]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,950 x 5~
## 3 <split [3344/557]> Fold3 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
## 4 <split [3344/557]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
## 5 <split [3344/557]> Fold5 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
## 6 <split [3344/557]> Fold6 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
## 7 <split [3344/557]> Fold7 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~

# Show the cross-validated metrics, averaged over folds, per candidate penalty.
collect_metrics(lasso_grid)

## # A tibble: 75 x 7
##          penalty .metric   .estimator  mean     n std_err .config              
##            <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 0.00000000344 f_meas    binary     0.972     7 0.00230 Preprocessor1_Model01
##  2 0.00000000344 precision binary     0.965     7 0.00336 Preprocessor1_Model01
##  3 0.00000000344 recall    binary     0.980     7 0.00202 Preprocessor1_Model01
##  4 0.00000000633 f_meas    binary     0.972     7 0.00230 Preprocessor1_Model02
##  5 0.00000000633 precision binary     0.965     7 0.00336 Preprocessor1_Model02
##  6 0.00000000633 recall    binary     0.980     7 0.00202 Preprocessor1_Model02
##  7 0.0000000394  f_meas    binary     0.972     7 0.00230 Preprocessor1_Model03
##  8 0.0000000394  precision binary     0.965     7 0.00336 Preprocessor1_Model03
##  9 0.0000000394  recall    binary     0.980     7 0.00202 Preprocessor1_Model03
## 10 0.0000000528  f_meas    binary     0.972     7 0.00230 Preprocessor1_Model04
## # ... with 65 more rows

# Plot how each metric's cross-validated mean changes with the penalty
# (log-scaled x axis, one panel per metric).
ggplot(
  data = collect_metrics(lasso_grid),
  mapping = aes(x = penalty, y = mean, color = .metric)
) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()

##7. Se selecciona el mejor modelo y se entrena el modelo final con los valores del mejor modelo de entrenamiento

# Pick the penalty with the best cross-validated F-measure.
# The metric is passed by name: it is unambiguous and does not depend on the
# position of `metric` in select_best()'s signature.

best_f <- select_best(lasso_grid, metric = "f_meas")

best_f

## # A tibble: 1 x 2
##    penalty .config              
##      <dbl> <chr>                
## 1 0.000387 Preprocessor1_Model17

# Plug the selected penalty into the workflow and fit it on the full
# training partition.
final_lasso <- fit(
  finalize_workflow(lasso_wf, best_f),
  data = emails_train
)

final_lasso

## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 6 Recipe Steps
## 
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##      Df  %Dev   Lambda
## 1     0  0.00 0.119400
## 2     3  4.03 0.108800
## 3     3  8.09 0.099120
## 4     6 12.71 0.090320
## 5     8 18.01 0.082290
## 6     8 22.32 0.074980
## 7     8 25.90 0.068320
## 8    10 29.77 0.062250
## 9    11 33.49 0.056720
## 10   12 36.68 0.051680
## 11   12 39.49 0.047090
## 12   13 42.03 0.042910
## 13   13 44.27 0.039100
## 14   14 46.31 0.035620
## 15   15 48.13 0.032460
## 16   16 49.77 0.029580
## 17   16 51.34 0.026950
## 18   16 52.70 0.024550
## 19   17 53.99 0.022370
## 20   19 55.27 0.020380
## 21   20 56.48 0.018570
## 22   20 57.57 0.016920
## 23   20 58.51 0.015420
## 24   23 59.46 0.014050
## 25   24 60.39 0.012800
## 26   24 61.19 0.011670
## 27   24 61.88 0.010630
## 28   24 62.48 0.009684
## 29   25 63.02 0.008824
## 30   26 63.56 0.008040
## 31   27 64.02 0.007326
## 32   28 64.45 0.006675
## 33   29 64.83 0.006082
## 34   30 65.18 0.005542
## 35   35 65.56 0.005050
## 36   39 65.96 0.004601
## 37   41 66.35 0.004192
## 38   45 66.72 0.003820
## 39   46 67.09 0.003480
## 40   52 67.44 0.003171
## 41   55 67.78 0.002890
## 42   57 68.11 0.002633
## 43   60 68.42 0.002399
## 44   62 68.71 0.002186
## 45   66 68.99 0.001992
## 46   69 69.25 0.001815
## 
## ...
## and 54 more lines.

##8. Evaluación del modelo con datos de prueba y visualización de predicciones

# Evaluate the finalized workflow on the held-out test partition of the
# original split, using the same three metrics as in tuning.
emails_final <- last_fit(
  final_lasso,
  split = emails_split,
  metrics = metric_set(f_meas, recall, precision)
)

# Show the test-set metrics.
collect_metrics(emails_final)

## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.977 Preprocessor1_Model1
## 2 recall    binary         0.986 Preprocessor1_Model1
## 3 precision binary         0.967 Preprocessor1_Model1

# Preview the first test-set predictions next to the true class.

head(collect_predictions(emails_final))

## # A tibble: 6 x 5
##   id               .pred_class  .row class .config             
##   <chr>            <fct>       <int> <fct> <chr>               
## 1 train/test split ham             7 ham   Preprocessor1_Model1
## 2 train/test split ham            11 ham   Preprocessor1_Model1
## 3 train/test split spam           13 spam  Preprocessor1_Model1
## 4 train/test split ham            15 ham   Preprocessor1_Model1
## 5 train/test split spam           16 spam  Preprocessor1_Model1
## 6 train/test split ham            17 ham   Preprocessor1_Model1

##9. Prueba de nuevos datos (Sugerencia: observar ejemplos del dataset e incluir textos similares).

# Build a one-row data frame holding a new, unseen message and its length.
comment <- "dont wait to get your prize"
len <- stringr::str_length(comment)

new_comment <- tibble(message = comment, len = len)
new_comment

## # A tibble: 1 x 2
##   message                       len
##   <chr>                       <int>
## 1 dont wait to get your prize    27

# Classify the new message with the fitted workflow.
prediction <- predict(final_lasso, new_data = new_comment)

# Report the predicted class. A space is inserted before "es:" so the closing
# quote of the message no longer runs into the next word.
paste0("el resultado para el comentario '", new_comment$message, "' es: ",
       prediction$.pred_class)

## [1] "el resultado para el comentario 'dont wait to get your prize' es: ham"

ML-Mejorando Modelos Supervisados

Carlos Salguero

7/4/2021