Supervised models - Karol Esquivel

#install.packages("textrecipes")
#("stopwords")
#install.packages("tidyverse")
#install.packages("themis")
#install.packages("tidymodels")
#install.packages("glmnet")
#install.packages("rlang")

library(textrecipes)
library(stopwords)
library(tidyverse)
library(themis)
library(tidymodels)
library(glmnet)
library(rlang)

Data loading and cleaning

# Load the dataset
spam <- read.csv("https://raw.githubusercontent.com/KarolEsq/DA/main/spamData.csv")

Clean_String <- function(string){
  # lowercase everything
  temp <- tolower(string)
  # remove everything that is not a letter or whitespace (digits are dropped too)
  temp <- stringr::str_replace_all(temp, "[^a-zA-Z\\s]", " ")
  # collapse runs of whitespace into a single space
  temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
  return(temp)
}

spam$message <- Clean_String(spam$message)
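
As a quick sanity check, the cleaner can be applied to a made-up string (the input below is hypothetical, not from the dataset):

# Illustration: punctuation and digits are replaced, then whitespace is collapsed
Clean_String("Win a FREE prize!!! Call 555-0100 now...")
# expected: "win a free prize call now "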

Initializing the recipe and creating an additional predictor: len

set.seed(1234) # Fix the random seed for reproducibility

# spam has already been loaded and cleaned above

messagesClass <- spam %>%
  mutate(len = str_length(message)) # message length as an additional predictor
head(messagesClass)
##   class
## 1   ham
## 2   ham
## 3  spam
## 4   ham
## 5   ham
## 6  spam
##                                                                                                                                   message
## 1                                 go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat 
## 2                                                                                                                ok lar joking wif u oni 
## 3          free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s
## 4                                                                                            u dun say so early hor u c already then say 
## 5                                                                            nah i don t think he goes to usf he lives around here though
## 6 freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv
##   len
## 1 103
## 2  24
## 3 126
## 4  44
## 5  60
## 6 135
# Split the data into training and test sets

messages_split <- initial_split(messagesClass, prop = .7)

messages_train <- training(messages_split)
messages_test <- testing(messages_split)
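
The classes are imbalanced (hence the upsampling step later), so stratifying the split by class is a common refinement that keeps the spam/ham ratio similar in both partitions; a sketch of the alternative call, not run here:

# Sketch (not run): stratified 70/30 split
# messages_split <- initial_split(messagesClass, prop = .7, strata = class)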

# Create the initial recipe

messages_recipe <- recipe(class ~ message + len,
                          data = messages_train)


# Apply the preprocessing steps

messages_recipeProcessed <- messages_recipe %>%
  step_text_normalization(message) %>% # Unicode-normalize stray characters
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams",
                options = list(n = 4, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 200) %>%
  step_tfidf(message) %>%
  step_upsample(class) # balance the classes; an alternative to step_smote
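
To verify what the recipe actually produces, it can be estimated and applied to the training data; a minimal sketch, not run in the original:

# Sketch (not run): prep() estimates the steps and bake(new_data = NULL)
# returns the processed (tf-idf + upsampled) training set
messages_recipeProcessed %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()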

Creating the initial workflow

# Create a workflow and attach the preprocessing recipe; the model spec is added below
messages_wf <- workflow() %>%
  add_recipe(messages_recipeProcessed)

# Model specification

rlasso_spec <-  logistic_reg(penalty = tune(), mixture = 1) %>% # mixture = 1 means pure LASSO (L1) regularization
  set_engine("glmnet")

# Inspect the model specification
rlasso_spec
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet
# Combine the recipe and the model specification into a workflow

lasso_wf <- workflow() %>%
  add_recipe(messages_recipeProcessed) %>%
  add_model(rlasso_spec)

# Inspect the workflow
lasso_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

Optimization

# Create a random grid of candidate values for the penalty hyperparameter
lambda_grid <- grid_random(penalty(), size = 25)

lambda_grid
## # A tibble: 25 x 1
##          penalty
##            <dbl>
##  1 0.00000285   
##  2 0.000000934  
##  3 0.265        
##  4 0.0000176    
##  5 0.000000140  
##  6 0.00173      
##  7 0.000000406  
##  8 0.391        
##  9 0.00196      
## 10 0.00000000344
## # ... with 15 more rows
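
A random grid is one option; an evenly log-spaced grid over the same default range is another. A sketch of the alternative, not run here so lambda_grid is left untouched:

# Sketch (not run): a regular grid spaces candidates evenly on the log scale
# lambda_grid <- grid_regular(penalty(), levels = 25)
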
# Set the random seed and create the cross-validation folds

set.seed(123)
messages_folds <- vfold_cv(messages_train, v = 5)
messages_folds
## #  5-fold cross-validation 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
# Fit the model across the grid of penalty values so tuning can pick the best ones

set.seed(2020)
lasso_grid <- tune_grid(lasso_wf,
                        resamples = messages_folds,
                        grid = lambda_grid,
                        control = control_resamples(save_pred = TRUE),
                        metrics = metric_set(f_meas, recall, precision)
                        )
## Warning: package 'vctrs' was built under R version 4.0.4
## Warning: package 'stringi' was built under R version 4.0.3
## ! Fold2: preprocessor 1/1, model 1/1: from glmnet Fortran code (error code -73); ...
## ! Fold3: internal: While computing binary `precision()`, no predicted events were...
## ! Fold4: preprocessor 1/1, model 1/1: from glmnet Fortran code (error code -73); ...
## ! Fold5: internal: While computing binary `precision()`, no predicted events were...
lasso_grid
## Warning: This tuning result has notes. Example notes on model fitting include:
## preprocessor 1/1, model 1/1: from glmnet Fortran code (error code -73); Convergence for 73th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 694 true event(s) actually occured for the problematic event level, 'ham'.
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 678 true event(s) actually occured for the problematic event level, 'ham'.
## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 x 5
##   splits             id    .metrics         .notes          .predictions        
##   <list>             <chr> <list>           <list>          <list>              
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
# Collect the resampling metrics for every tuned candidate
lasso_grid %>%
  collect_metrics()
## # A tibble: 75 x 7
##     penalty .metric   .estimator  mean     n std_err .config              
##       <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 3.61e-10 f_meas    binary     0.976     5 0.00222 Preprocessor1_Model01
##  2 3.61e-10 precision binary     0.987     5 0.00271 Preprocessor1_Model01
##  3 3.61e-10 recall    binary     0.965     5 0.00256 Preprocessor1_Model01
##  4 3.44e- 9 f_meas    binary     0.976     5 0.00222 Preprocessor1_Model02
##  5 3.44e- 9 precision binary     0.987     5 0.00271 Preprocessor1_Model02
##  6 3.44e- 9 recall    binary     0.965     5 0.00256 Preprocessor1_Model02
##  7 7.52e- 9 f_meas    binary     0.976     5 0.00222 Preprocessor1_Model03
##  8 7.52e- 9 precision binary     0.987     5 0.00271 Preprocessor1_Model03
##  9 7.52e- 9 recall    binary     0.965     5 0.00256 Preprocessor1_Model03
## 10 3.94e- 8 f_meas    binary     0.976     5 0.00222 Preprocessor1_Model04
## # ... with 65 more rows
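
To rank the leading candidates for a single metric directly, tune also provides show_best(); a minimal sketch, not run in the original:

# Sketch (not run): top candidates ordered by F1 score
show_best(lasso_grid, metric = "f_meas")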

Selecting the best model

# Plot how each metric changes as a function of the penalty
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()

# Select the best model according to the F1 score

best_f <- lasso_grid %>%
  select_best("f_meas")

best_f
## # A tibble: 1 x 2
##    penalty .config              
##      <dbl> <chr>                
## 1 0.000433 Preprocessor1_Model16
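
select_best() picks the numerically best F1. A common alternative is the one-standard-error rule, which prefers the most regularized (and thus sparsest) model whose F1 is within one standard error of the best; a sketch, not run here:

# Sketch (not run): simplest model within one std. error of the best F1
best_f_1se <- lasso_grid %>%
  select_by_one_std_err(desc(penalty), metric = "f_meas")
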
# Finalize the workflow with the best penalty and refit it on the full training set
final_lasso <- finalize_workflow(lasso_wf, best_f) %>%
  fit(messages_train)

final_lasso
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##     Df  %Dev   Lambda
## 1    0  0.00 0.234500
## 2    1  2.73 0.213600
## 3    1  5.11 0.194700
## 4    1  7.21 0.177400
## 5    2  9.36 0.161600
## 6    2 11.95 0.147300
## 7    3 14.17 0.134200
## 8    4 17.09 0.122300
## 9    5 20.11 0.111400
## 10   5 22.93 0.101500
## 11   6 25.69 0.092480
## 12   6 28.26 0.084270
## 13   9 30.86 0.076780
## 14  13 33.95 0.069960
## 15  15 37.17 0.063740
## 16  15 40.06 0.058080
## 17  19 42.92 0.052920
## 18  23 45.75 0.048220
## 19  26 48.60 0.043940
## 20  30 51.31 0.040030
## 21  31 53.88 0.036480
## 22  33 56.28 0.033240
## 23  35 58.47 0.030280
## 24  36 60.47 0.027590
## 25  37 62.29 0.025140
## 26  39 63.93 0.022910
## 27  40 65.43 0.020870
## 28  42 66.80 0.019020
## 29  46 68.12 0.017330
## 30  47 69.37 0.015790
## 31  53 70.52 0.014390
## 32  60 71.68 0.013110
## 33  66 72.78 0.011940
## 34  68 73.79 0.010880
## 35  73 74.78 0.009916
## 36  76 75.73 0.009035
## 37  80 76.61 0.008233
## 38  85 77.42 0.007501
## 39  89 78.18 0.006835
## 40  96 78.91 0.006228
## 41  99 79.60 0.005675
## 42 101 80.24 0.005170
## 43 107 80.84 0.004711
## 44 115 81.42 0.004293
## 45 118 81.96 0.003911
## 46 123 82.48 0.003564
## 
## ...
## and 28 more lines.
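
It can also be informative to see which tokens carry the most weight at the selected penalty. A sketch, not run in the original, assuming parsnip's tidy() method for glmnet fits (which reports coefficients at the fitted penalty):

# Sketch (not run): nonzero LASSO coefficients, largest magnitude first
final_lasso %>%
  pull_workflow_fit() %>%
  tidy() %>%
  filter(estimate != 0) %>%
  arrange(desc(abs(estimate)))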

Model evaluation

# Evaluate the finalized model on the test set
messages_final <- last_fit(final_lasso,
                         split = messages_split,
                         metrics = metric_set(f_meas, recall, precision)
                         )
## ! train/test split: preprocessor 1/1, model 1/1: from glmnet Fortran code (error code -75); ...
# Look at the metrics of the model evaluated on the test data
messages_final %>%
  collect_metrics()
## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.974 Preprocessor1_Model1
## 2 recall    binary         0.958 Preprocessor1_Model1
## 3 precision binary         0.991 Preprocessor1_Model1
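
A confusion matrix complements these aggregate metrics by showing where the errors fall; a minimal sketch, not run in the original:

# Sketch (not run): confusion matrix of the test-set predictions
messages_final %>%
  collect_predictions() %>%
  conf_mat(truth = class, estimate = .pred_class)
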
# Inspect the predictions on the test set

messages_final %>%
  collect_predictions() %>%
  head()
## # A tibble: 6 x 5
##   id               .pred_class  .row class .config             
##   <chr>            <fct>       <int> <fct> <chr>               
## 1 train/test split ham             7 ham   Preprocessor1_Model1
## 2 train/test split ham            11 ham   Preprocessor1_Model1
## 3 train/test split spam           13 spam  Preprocessor1_Model1
## 4 train/test split ham            15 ham   Preprocessor1_Model1
## 5 train/test split spam           16 spam  Preprocessor1_Model1
## 6 train/test split ham            17 ham   Preprocessor1_Model1

Predicting on new data

message<- "congratulations, you have been hired"
len<- str_length(message)

new_message <- tribble(~message,~len,message,len)
new_message
## # A tibble: 1 x 2
##   message                                len
##   <chr>                                <int>
## 1 congratulations, you have been hired    36
prediction <- predict(final_lasso, new_data = new_message)


paste0("The prediction for the message '", new_message$message, "' is: ",
       prediction$.pred_class)
## [1] "The prediction for the message 'congratulations, you have been hired' is: ham"