Limpieza de datos

library(readr)
library('dplyr');
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('textrecipes');
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library('stopwords');
library('tidyverse');
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag()     masks stats::lag()
library('themis');
## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes
## 
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample
library('tidymodels');
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom     0.7.5     v rsample   0.0.9
## v dials     0.0.9     v tune      0.1.3
## v infer     0.5.4     v workflows 0.2.2
## v modeldata 0.1.0     v yardstick 0.0.7
## v parsnip   0.1.5
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard()         masks purrr::discard()
## x dplyr::filter()           masks stats::filter()
## x stringr::fixed()          masks recipes::fixed()
## x dplyr::lag()              masks stats::lag()
## x yardstick::spec()         masks readr::spec()
## x recipes::step()           masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample()   masks recipes::step_upsample()
library('themis');
#library('latexpdf');
library('glmnet');
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-1
df <- read_csv("C:/Users/Viviana Moya C/OneDrive/Escritorio/spamData.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   class = col_character(),
##   message = col_character()
## )
spamData <- df

Clean_String <- function(string){

  # Remover caracteres no UTF-8
  temp<- iconv(enc2utf8(string),sub="byte")
  temp<- str_replace_all(temp,"[^[:graph:]]", " ") 
  # Remover todo lo que no sea número o letra 
  temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
  # remover espacios extra
  temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
  # minúscula
  temp <- tolower(temp)
  
  return(temp)
  
}

# Aplicar la función a los comentarios
spamData$message <- Clean_String(spamData$message)

Muestra de entrenamiento y prueba

set.seed(1234) # Asegurar siempre una misma semilla aleatoria.

reviewClass <- spamData %>% 
  mutate(len= str_length(message))

#Realizar la partición de las muestras

reviews_split <- initial_split(reviewClass,prop=.7)

reviews_train <- training(reviews_split)
reviews_test <- testing(reviews_split)

#Creamos la receta inicial

reviews_recipe <- recipe(class ~ message+len, 
                         data = reviews_train)


#Aplicar los pasos de procesamiento de datos

reviews_recipeProcessed <- reviews_recipe %>%
  step_text_normalization(message) %>% # elimina caracteres extraños
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams", 
                options = list(n = 4, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 500) %>%
  step_tfidf(message)  %>%
  step_upsample(class) #alternativa a step_smote

Creamos el workflow y la receta para el nuevo modelo

library(glmnet)

# Creamos el workflow y la receta para el nuevo modelo
reviews_wf <- workflow() %>%
  add_recipe(reviews_recipeProcessed)

# Especificación del modelo

rlasso_spec <-  logistic_reg(penalty = tune(), mixture = 1) %>% # Mixture=1 se requiere para indicar que es LASSO
  set_engine("glmnet")

# Observamos el modelo especificado
rlasso_spec
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet
#Ajustar el modelo con los datos

lasso_wf <- workflow() %>%
  add_recipe(reviews_recipeProcessed) %>%
  add_model(rlasso_spec)

# Observamos el modelo
lasso_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

Creación del grid para entrenar los parametros adicionales del modelo

# Creamos un grid para entrenar los parametros adicionales del modelo
lambda_grid <- grid_random(penalty(), size = 25)

lambda_grid
## # A tibble: 25 x 1
##          penalty
##            <dbl>
##  1 0.00000285   
##  2 0.000000934  
##  3 0.265        
##  4 0.0000176    
##  5 0.000000140  
##  6 0.00173      
##  7 0.000000406  
##  8 0.391        
##  9 0.00196      
## 10 0.00000000344
## # ... with 15 more rows
#Seteamos semilla aleatoria y creamos los subsets de la validación cruzada

set.seed(123)
reviews_folds <- vfold_cv(reviews_train,v=5)
reviews_folds
## #  5-fold cross-validation 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5

Modelo con los diferentes valores del grid y Visualizamos las métricas

# Entrenamos el modelo con los diferentes valores del grid para que escoja los mejores valores de los parametros

set.seed(2020)
lasso_grid <- tune_grid(lasso_wf,
                        resamples = reviews_folds,
                        grid = lambda_grid,
                        control = control_resamples(save_pred = TRUE),
                        metrics = metric_set(f_meas, recall, precision)
                        )
## ! Fold3: internal: While computing binary `precision()`, no predicted events were...
## ! Fold5: internal: While computing binary `precision()`, no predicted events were...
lasso_grid
## Warning: This tuning result has notes. Example notes on model fitting include:
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 694 true event(s) actually occured for the problematic event level, 'ham'.
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 678 true event(s) actually occured for the problematic event level, 'ham'.
## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 x 5
##   splits             id    .metrics         .notes          .predictions        
##   <list>             <chr> <list>           <list>          <list>              
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
#Visualizamos las métricas del modelo resultante
lasso_grid %>%
  collect_metrics()
## # A tibble: 75 x 7
##     penalty .metric   .estimator  mean     n std_err .config              
##       <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 3.61e-10 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model01
##  2 3.61e-10 precision binary     0.984     5 0.00111 Preprocessor1_Model01
##  3 3.61e-10 recall    binary     0.965     5 0.00352 Preprocessor1_Model01
##  4 3.44e- 9 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model02
##  5 3.44e- 9 precision binary     0.984     5 0.00111 Preprocessor1_Model02
##  6 3.44e- 9 recall    binary     0.965     5 0.00352 Preprocessor1_Model02
##  7 7.52e- 9 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model03
##  8 7.52e- 9 precision binary     0.984     5 0.00111 Preprocessor1_Model03
##  9 7.52e- 9 recall    binary     0.965     5 0.00352 Preprocessor1_Model03
## 10 3.94e- 8 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model04
## # ... with 65 more rows
#seleccionamos el mejor modelo según métrica F1 Score


best_f <- lasso_grid %>%
  select_best("f_meas")

best_f
## # A tibble: 1 x 2
##   penalty .config              
##     <dbl> <chr>                
## 1 0.00173 Preprocessor1_Model18
#Entrenamos el modelo final con los valores del mejor modelo de entrenamiento
final_lasso <- finalize_workflow(lasso_wf, best_f) %>%
  fit(reviews_train)

final_lasso
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##      Df  %Dev   Lambda
## 1     0  0.00 0.234400
## 2     1  2.72 0.213600
## 3     1  5.11 0.194600
## 4     1  7.20 0.177300
## 5     1  9.04 0.161600
## 6     2 11.29 0.147200
## 7     3 13.63 0.134200
## 8     3 16.27 0.122200
## 9     5 18.81 0.111400
## 10    5 21.65 0.101500
## 11    6 24.31 0.092470
## 12    8 26.95 0.084250
## 13   10 30.02 0.076770
## 14   11 33.05 0.069950
## 15   14 35.95 0.063730
## 16   18 38.79 0.058070
## 17   19 41.65 0.052910
## 18   23 44.33 0.048210
## 19   28 47.03 0.043930
## 20   34 49.74 0.040030
## 21   37 52.38 0.036470
## 22   43 55.05 0.033230
## 23   49 57.71 0.030280
## 24   54 60.18 0.027590
## 25   56 62.43 0.025140
## 26   60 64.50 0.022900
## 27   65 66.50 0.020870
## 28   69 68.35 0.019020
## 29   74 70.15 0.017330
## 30   75 71.78 0.015790
## 31   80 73.25 0.014380
## 32   82 74.61 0.013110
## 33   90 75.89 0.011940
## 34  103 77.14 0.010880
## 35  112 78.32 0.009915
## 36  115 79.43 0.009034
## 37  120 80.47 0.008231
## 38  130 81.48 0.007500
## 39  139 82.46 0.006834
## 40  146 83.37 0.006227
## 41  156 84.21 0.005674
## 42  166 85.02 0.005170
## 43  172 85.76 0.004710
## 44  184 86.46 0.004292
## 45  195 87.13 0.003911
## 46  200 87.75 0.003563
## 
## ...
## and 54 more lines.

Entrenamos el modelo con los diferentes valores del grid

# Entrenamos el modelo con los diferentes valores del grid para que escoja los mejores valores de los parametros

set.seed(2020)
lasso_grid <- tune_grid(lasso_wf,
                        resamples = reviews_folds,
                        grid = lambda_grid,
                        control = control_resamples(save_pred = TRUE),
                        metrics = metric_set(f_meas, recall, precision)
                        )
## ! Fold3: internal: While computing binary `precision()`, no predicted events were...
## ! Fold5: internal: While computing binary `precision()`, no predicted events were...
lasso_grid
## Warning: This tuning result has notes. Example notes on model fitting include:
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 694 true event(s) actually occured for the problematic event level, 'ham'.
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 678 true event(s) actually occured for the problematic event level, 'ham'.
## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 x 5
##   splits             id    .metrics         .notes          .predictions        
##   <list>             <chr> <list>           <list>          <list>              
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
library(rlang)

#Visualizamos las métricas del modelo resultante

lasso_grid %>%
  collect_metrics()
## # A tibble: 75 x 7
##     penalty .metric   .estimator  mean     n std_err .config              
##       <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 3.61e-10 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model01
##  2 3.61e-10 precision binary     0.984     5 0.00111 Preprocessor1_Model01
##  3 3.61e-10 recall    binary     0.965     5 0.00352 Preprocessor1_Model01
##  4 3.44e- 9 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model02
##  5 3.44e- 9 precision binary     0.984     5 0.00111 Preprocessor1_Model02
##  6 3.44e- 9 recall    binary     0.965     5 0.00352 Preprocessor1_Model02
##  7 7.52e- 9 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model03
##  8 7.52e- 9 precision binary     0.984     5 0.00111 Preprocessor1_Model03
##  9 7.52e- 9 recall    binary     0.965     5 0.00352 Preprocessor1_Model03
## 10 3.94e- 8 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model04
## # ... with 65 more rows

Selección del mejor modelo

#Visualizamos los cambios de las métricas en función de los valores de penalidad
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()

Seleccionamos el mejor modelo según métrica F1 Score

#seleccionamos el mejor modelo según métrica F1 Score

best_f <- lasso_grid %>%
  select_best("f_meas")

best_f
## # A tibble: 1 x 2
##   penalty .config              
##     <dbl> <chr>                
## 1 0.00173 Preprocessor1_Model18
#Entrenamos el modelo final con los valores del mejor modelo de entrenamiento
final_lasso <- finalize_workflow(lasso_wf, best_f) %>%
  fit(reviews_train)

final_lasso
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##      Df  %Dev   Lambda
## 1     0  0.00 0.234400
## 2     1  2.72 0.213600
## 3     1  5.11 0.194600
## 4     1  7.20 0.177300
## 5     1  9.04 0.161600
## 6     2 11.29 0.147200
## 7     3 13.63 0.134200
## 8     3 16.27 0.122200
## 9     5 18.81 0.111400
## 10    5 21.65 0.101500
## 11    6 24.31 0.092470
## 12    8 26.95 0.084250
## 13   10 30.02 0.076770
## 14   11 33.05 0.069950
## 15   14 35.95 0.063730
## 16   18 38.79 0.058070
## 17   19 41.65 0.052910
## 18   23 44.33 0.048210
## 19   28 47.03 0.043930
## 20   34 49.74 0.040030
## 21   37 52.38 0.036470
## 22   43 55.05 0.033230
## 23   49 57.71 0.030280
## 24   54 60.18 0.027590
## 25   56 62.43 0.025140
## 26   60 64.50 0.022900
## 27   65 66.50 0.020870
## 28   69 68.35 0.019020
## 29   74 70.15 0.017330
## 30   75 71.78 0.015790
## 31   80 73.25 0.014380
## 32   82 74.61 0.013110
## 33   90 75.89 0.011940
## 34  103 77.14 0.010880
## 35  112 78.32 0.009915
## 36  115 79.43 0.009034
## 37  120 80.47 0.008231
## 38  130 81.48 0.007500
## 39  139 82.46 0.006834
## 40  146 83.37 0.006227
## 41  156 84.21 0.005674
## 42  166 85.02 0.005170
## 43  172 85.76 0.004710
## 44  184 86.46 0.004292
## 45  195 87.13 0.003911
## 46  200 87.75 0.003563
## 
## ...
## and 54 more lines.

Evaluación de resultados en datos de prueba

#Evaluamos el modelo con los datos de prueba
review_final <- last_fit(final_lasso, 
                         split=reviews_split,
                         metrics = metric_set(f_meas, recall, precision)
                         )

# Observamos métricas del modelo evaluado en datos de prueba
review_final %>%
  collect_metrics()
## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.976 Preprocessor1_Model1
## 2 recall    binary         0.962 Preprocessor1_Model1
## 3 precision binary         0.990 Preprocessor1_Model1
# Visualizar las predicciones del dataframe de prueba

review_final %>%
  collect_predictions %>%
  head()
## # A tibble: 6 x 5
##   id               .pred_class  .row class .config             
##   <chr>            <fct>       <int> <fct> <chr>               
## 1 train/test split ham             7 ham   Preprocessor1_Model1
## 2 train/test split ham            11 ham   Preprocessor1_Model1
## 3 train/test split spam           13 spam  Preprocessor1_Model1
## 4 train/test split ham            15 ham   Preprocessor1_Model1
## 5 train/test split spam           16 spam  Preprocessor1_Model1
## 6 train/test split ham            17 ham   Preprocessor1_Model1

Predecir nuevos datos

comment<- "congratulations ur."
len<- str_length(comment)

new_comment <- tribble(~message,~len,comment,len)
new_comment
## # A tibble: 1 x 2
##   message               len
##   <chr>               <int>
## 1 congratulations ur.    19
prediction<-predict(final_lasso, new_data = new_comment)


paste0("el resultado para el comentario ","'",new_comment$message,"'","es: ",
       prediction$.pred_class)
## [1] "el resultado para el comentario 'congratulations ur.'es: ham"