# Laboratorio 6

library(readr)
library(stringr)
# Load the raw messages from the CSV file in the working directory
spamData2 <- read_csv(file = "spamData.csv")
# Normalise raw text before modelling.
#
# Args:
#   string: character vector of messages.
#
# Returns:
#   A character vector of the same length, lower-cased, in which every
#   character other than an ASCII letter or whitespace has been replaced by a
#   space and runs of whitespace have been collapsed into a single space.
#   Note that digits are removed as well, and leading/trailing spaces are not
#   trimmed.
Clean_String <- function(string) {
  # Re-encode the input; bytes that cannot be converted are substituted by
  # their hex code (sub = "byte") instead of producing NA
  utf8_text <- iconv(enc2utf8(string), sub = "byte")

  # Anything that is not a visible (graph) character becomes a space
  printable <- stringr::str_replace_all(utf8_text, "[^[:graph:]]", " ")

  # Keep only ASCII letters and whitespace; digits and punctuation become
  # spaces
  letters_only <- stringr::str_replace_all(printable, "[^a-zA-Z\\s]", " ")

  # Collapse consecutive whitespace into one space
  squeezed <- stringr::str_replace_all(letters_only, "[\\s]+", " ")

  # Lower-case the result
  tolower(squeezed)
}

# Replace the raw messages with their cleaned version
spamData2[["message"]] <- Clean_String(spamData2[["message"]])
library(tidymodels)

# Fix the RNG seed so the train/test partition is reproducible
set.seed(1234)

# Partition the data: 70% for training, the remainder for testing
spamData2_split <- spamData2 %>%
  initial_split(prop = 0.7)

spamData2_train <- spamData2_split %>% training()
spamData2_test <- spamData2_split %>% testing()

# Report the number of rows and columns of each partition
dim(spamData2_train)
dim(spamData2_test)

## [1] 3901    2

## [1] 1671    2

library(textrecipes)
library(stopwords)

# Declare the base recipe: `class` is the outcome and `message` the only
# predictor, using the training partition as the template data
spamData2_recipe <- recipe(class ~ message, data = spamData2_train)

#Aplicar los pasos de procesamiento de datos
library(wordcloud)
library(textrecipes)
library(stopwords)
library(readr)
library(dplyr)
library(recipes)
library(tidyverse)
library(GGally)
library(mlbench)
library(themis)
# install.packages("tidymodels")
library(tidymodels)

# Text-processing pipeline applied to `message`:
#   1. Unicode normalisation of the text
#   2. word tokenisation
#   3. stop-word removal (keep = FALSE drops the stop words)
#   4. re-join the remaining tokens into a single string
#   5. re-tokenise into unigrams and bigrams (n_min = 1, n = 2)
#   6. filter the vocabulary with max_tokens = 200
#   7. tf-idf weighting of the retained tokens
#   8. up-sample on `class` (used here as an alternative to step_smote)
spamData2_recipeProcessed <- spamData2_recipe %>%
  step_text_normalization(message) %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(
    message,
    token = "ngrams",
    options = list(n = 2, n_min = 1)
  ) %>%
  step_tokenfilter(message, max_tokens = 200) %>%
  step_tfidf(message) %>%
  step_upsample(class)

# Estimate the recipe on its template (training) data
spamData2_recipeProcessedF <- spamData2_recipeProcessed %>% prep()

spamData2_recipeProcessedF

## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 3901 data points and no missing data.
## 
## Operations:
## 
## Text Normalization for message [trained]
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## Up-sampling based on class [trained]

#Setear el workflow para trabajar el modelo de Machine Learning
library(glmnet)
# Start a workflow that carries only the (unprepped) preprocessing recipe
spamData2_wf <- add_recipe(workflow(), spamData2_recipeProcessed)

# Model specification: penalised logistic regression fitted with glmnet.
# mixture = 1 selects the pure LASSO penalty; the penalty strength is left
# as a tuning parameter.
spamData2_spec <- set_engine(
  logistic_reg(penalty = tune(), mixture = 1),
  "glmnet"
)
spamData2_spec

## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

# Rebuild the workflow, this time bundling the recipe with the model
# specification
spamData2_wf <- add_model(
  add_recipe(workflow(), spamData2_recipeProcessed),
  spamData2_spec
)
spamData2_wf

## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

# Draw 25 random candidate values for the penalty hyper-parameter
spamData2_grid <- penalty() %>%
  grid_random(size = 25)
spamData2_grid

## # A tibble: 25 x 1
##          penalty
##            <dbl>
##  1 0.00000285   
##  2 0.000000934  
##  3 0.265        
##  4 0.0000176    
##  5 0.000000140  
##  6 0.00173      
##  7 0.000000406  
##  8 0.391        
##  9 0.00196      
## 10 0.00000000344
## # ... with 15 more rows

# Fix the seed and build the 5-fold cross-validation resamples from the
# training partition
set.seed(123)
messages_folds <- spamData2_train %>%
  vfold_cv(v = 5)
messages_folds

## #  5-fold cross-validation 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5

# Fit the workflow on every fold for each candidate penalty so the best value
# can be chosen afterwards.
# NOTE: `spamData2_grid` holds the candidate penalties on entry and is
# overwritten with the tuning results by this assignment.
set.seed(2020)
spamData2_grid <- tune_grid(
  spamData2_wf,
  resamples = messages_folds,
  grid = spamData2_grid,
  # control_grid() is the control constructor that belongs to tune_grid();
  # save_pred = TRUE keeps the out-of-fold predictions
  control = control_grid(save_pred = TRUE),
  metrics = metric_set(f_meas, recall, precision)
)
spamData2_grid

## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 x 5
##   splits             id    .metrics         .notes          .predictions        
##   <list>             <chr> <list>           <list>          <list>              
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~

# Summarise the resampled metrics for every candidate penalty
collect_metrics(spamData2_grid)

## # A tibble: 75 x 7
##     penalty .metric   .estimator  mean     n std_err .config              
##       <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 3.61e-10 f_meas    binary     0.970     5 0.00232 Preprocessor1_Model01
##  2 3.61e-10 precision binary     0.987     5 0.00196 Preprocessor1_Model01
##  3 3.61e-10 recall    binary     0.955     5 0.00501 Preprocessor1_Model01
##  4 3.44e- 9 f_meas    binary     0.970     5 0.00232 Preprocessor1_Model02
##  5 3.44e- 9 precision binary     0.987     5 0.00196 Preprocessor1_Model02
##  6 3.44e- 9 recall    binary     0.955     5 0.00501 Preprocessor1_Model02
##  7 7.52e- 9 f_meas    binary     0.970     5 0.00232 Preprocessor1_Model03
##  8 7.52e- 9 precision binary     0.987     5 0.00196 Preprocessor1_Model03
##  9 7.52e- 9 recall    binary     0.955     5 0.00501 Preprocessor1_Model03
## 10 3.94e- 8 f_meas    binary     0.970     5 0.00232 Preprocessor1_Model04
## # ... with 65 more rows

#Visualizamos los cambios de las métricas en función de los valores de penalidad
library(ggplot2)
# One panel per metric, showing the mean resampled value against the penalty
# on a log10 x-axis.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept here so the call behaves the same on the version in use.
ggplot(
  data = collect_metrics(spamData2_grid),
  mapping = aes(x = penalty, y = mean, color = .metric)
) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()

# Pick the penalty with the best mean F1 score across the folds.
# The metric is passed by name: recent tune releases no longer accept it
# positionally, while the named form works on old and new versions alike.
best_f <- spamData2_grid %>%
  select_best(metric = "f_meas")

best_f

## # A tibble: 1 x 2
##   penalty .config              
##     <dbl> <chr>                
## 1 0.00337 Preprocessor1_Model20

# Plug the selected penalty into the workflow and fit it on the full
# training partition
final_spamData2 <- spamData2_wf %>%
  finalize_workflow(best_f) %>%
  fit(data = spamData2_train)
final_spamData2

## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##      Df  %Dev   Lambda
## 1     0  0.00 0.173900
## 2     1  1.49 0.158500
## 3     2  3.73 0.144400
## 4     4  6.41 0.131600
## 5     4  9.70 0.119900
## 6     5 13.17 0.109200
## 7     5 16.36 0.099520
## 8     7 19.44 0.090680
## 9    13 23.21 0.082620
## 10   14 27.23 0.075280
## 11   15 30.92 0.068590
## 12   19 34.42 0.062500
## 13   20 37.77 0.056950
## 14   21 40.87 0.051890
## 15   25 43.91 0.047280
## 16   29 46.85 0.043080
## 17   30 49.56 0.039250
## 18   32 52.09 0.035760
## 19   36 54.42 0.032590
## 20   37 56.63 0.029690
## 21   38 58.59 0.027050
## 22   38 60.35 0.024650
## 23   41 61.96 0.022460
## 24   43 63.44 0.020470
## 25   47 64.89 0.018650
## 26   51 66.22 0.016990
## 27   55 67.47 0.015480
## 28   58 68.69 0.014110
## 29   59 69.80 0.012850
## 30   63 70.81 0.011710
## 31   69 71.79 0.010670
## 32   75 72.77 0.009723
## 33   75 73.67 0.008859
## 34   77 74.49 0.008072
## 35   82 75.24 0.007355
## 36   88 75.94 0.006702
## 37   92 76.60 0.006106
## 38  101 77.24 0.005564
## 39  110 77.84 0.005070
## 40  117 78.42 0.004619
## 41  121 78.96 0.004209
## 42  123 79.46 0.003835
## 43  125 79.92 0.003494
## 44  130 80.34 0.003184
## 45  137 80.74 0.002901
## 46  140 81.11 0.002643
## 
## ...
## and 54 more lines.

# Evaluate the finalised workflow on the held-out test partition of the
# original split
review_final <- last_fit(
  final_spamData2,
  split = spamData2_split,
  metrics = metric_set(f_meas, recall, precision)
)

# Test-set metrics of the evaluated model
collect_metrics(review_final)

## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.969 Preprocessor1_Model1
## 2 recall    binary         0.949 Preprocessor1_Model1
## 3 precision binary         0.990 Preprocessor1_Model1

# Show the first test-set predictions
head(collect_predictions(review_final))

## # A tibble: 6 x 5
##   id               .pred_class  .row class .config             
##   <chr>            <fct>       <int> <fct> <chr>               
## 1 train/test split ham             7 ham   Preprocessor1_Model1
## 2 train/test split ham            11 ham   Preprocessor1_Model1
## 3 train/test split spam           13 spam  Preprocessor1_Model1
## 4 train/test split ham            15 ham   Preprocessor1_Model1
## 5 train/test split spam           16 spam  Preprocessor1_Model1
## 6 train/test split ham            17 ham   Preprocessor1_Model1

# Score a new, unseen message with the fitted workflow
comment <- "the book was good."
len <- str_length(comment)

# The training messages were passed through Clean_String() before the model
# was fitted, so the new message is cleaned the same way; otherwise digits and
# punctuation could produce n-grams the model never saw during training.
# `len` is carried along in the tibble as in the original data frame layout.
new_comment <- tribble(~message, ~len, Clean_String(comment), len)
comment

## [1] "the book was good."

prediction <- predict(final_spamData2, new_data = new_comment)


# Report the result using the original (uncleaned) text of the message
paste0("el resultado para el comentario ","'",comment,"'","es: ",
       prediction$.pred_class)

## [1] "el resultado para el comentario 'the book was good.'es: ham"

# Print the tibble holding the predicted class for the new message
prediction

## # A tibble: 1 x 1
##   .pred_class
##   <fct>      
## 1 ham

# Print the prediction tibble again (repeats the previous output)
prediction

## # A tibble: 1 x 1
##   .pred_class
##   <fct>      
## 1 ham

# Laboratorio 6
#
# Gustavo Trejos
#
# 8/4/2021