library(textrecipes)
## Loading required package: recipes
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(stopwords)

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ stringr 1.4.0
## ✓ tidyr   1.1.3     ✓ forcats 0.5.1
## ✓ readr   1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()  masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag()     masks stats::lag()
library(themis)
## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes
## 
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample
library(knitr)

library(ggplot2)

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(forcats)

library(stringr)

library(dplyr)

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-1
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✓ broom     0.7.5     ✓ rsample   0.0.9
## ✓ dials     0.0.9     ✓ tune      0.1.3
## ✓ infer     0.5.4     ✓ workflows 0.2.2
## ✓ modeldata 0.1.0     ✓ yardstick 0.0.8
## ✓ parsnip   0.1.5
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard()         masks purrr::discard()
## x Matrix::expand()          masks tidyr::expand()
## x dplyr::filter()           masks stats::filter()
## x stringr::fixed()          masks recipes::fixed()
## x dplyr::lag()              masks stats::lag()
## x Matrix::pack()            masks tidyr::pack()
## x yardstick::spec()         masks readr::spec()
## x recipes::step()           masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample()   masks recipes::step_upsample()
## x Matrix::unpack()          masks tidyr::unpack()
## x Matrix::update()          masks recipes::update(), stats::update()
spamData <- read.csv("spamData.csv", sep = ",")

1. Data cleaning

Clean_String <- function(string){
  
  # Remove non-UTF-8 characters
  temp <- iconv(enc2utf8(string), sub = "byte")
  temp <- str_replace_all(temp, "[^[:graph:]]", " ")
  
  # Remove everything that is not a letter (digits and punctuation become spaces)
  temp <- stringr::str_replace_all(temp, "[^a-zA-Z\\s]", " ")
  
  # Collapse extra whitespace
  temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
  
  # Lowercase
  temp <- tolower(temp)
  
  return(temp)
}

spamData$message <- Clean_String(spamData$message)
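As a quick sanity check, the cleaner can be applied to a made-up message (the string below is hypothetical, not taken from the dataset):

# Hypothetical example: digits, punctuation and the £ symbol are dropped,
# whitespace is collapsed, and the text is lowercased
Clean_String("WINNER!! Claim your £1,000 prize NOW: call 0905")
# roughly: "winner claim your prize now call "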

2. Training and test samples

2.1 Checking the class sizes

set.seed(124)  

# Split the data into training and test samples

spam_split <- initial_split(spamData, prop = 0.7)

spam_train <- training(spam_split)
spam_test <- testing(spam_split)


dim(spam_train); dim(spam_test)
## [1] 3901    2
## [1] 1671    2
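Since the classes turn out to be imbalanced (see section 4), a stratified split would keep the ham/spam proportions similar in both samples; a minimal sketch of that alternative:

# Alternative (not used below): stratify the split on the outcome
spam_split_strat <- initial_split(spamData, prop = 0.7, strata = class)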

3. Creating the data-preprocessing recipe

# Set up the recipe for the model

spam_recipe <- recipe(class ~ message, 
                      data = spam_train)

# Apply the data-processing steps

spam_recipeProcessed <- spam_recipe %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams", 
                options = list(n = 2, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 100) %>%
  step_tfidf(message)

# Prep (train) the recipe from the previous step
spam_recipeProcessedF <- prep(spam_recipeProcessed)

spam_recipeProcessedF
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 3901 data points and no missing data.
## 
## Operations:
## 
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
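To see what the preprocessor actually produces, the trained recipe can be applied to the training data; a minimal sketch using juice() (output not shown):

# Inspect the first few columns of the processed training data:
# the outcome plus tf-idf features named tfidf_message_<token>
juice(spam_recipeProcessedF) %>%
  select(1:6) %>%
  glimpse()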
# Set up the workflow for the machine-learning model

spam_wf <- workflow() %>%
  add_recipe(spam_recipeProcessed)

4. Fitting the initial model

# Check the class frequencies in our data frame

spam_train %>%
  group_by(class) %>%
  summarise(n=n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
##   class     n  freq
##   <chr> <int> <dbl>
## 1 ham    3373 0.865
## 2 spam    528 0.135
# If required, step_smote can be applied on top of the recipe that was already created;
# here the classes are imbalanced (about 86.5% ham vs 13.5% spam), so it is

spam_recipeProcessed2 <- spam_recipeProcessed %>%
  step_smote(class)

# Prep the recipe from the previous step
spam_recipeProcessedF2 <- prep(spam_recipeProcessed2)

spam_recipeProcessedF2
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 3901 data points and no missing data.
## 
## Operations:
## 
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
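step_smote() synthesizes new minority-class rows until both classes have the same size, which can be checked by juicing the trained recipe; a quick sketch (output not shown):

# Verify that SMOTE balanced the classes in the processed training data
juice(spam_recipeProcessedF2) %>%
  count(class)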
# Set up the workflow for the machine-learning model

spam_wf2 <- workflow() %>%
  add_recipe(spam_recipeProcessed2)


# Model specification

rl_spec <- logistic_reg() %>% 
  set_engine("glm")

rl_spec
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm
# Fit the model to the training data

rl_fit <- spam_wf2 %>%
  add_model(rl_spec) %>%
  fit(data = spam_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
rl_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
## 
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
## ● step_smote()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##            (Intercept)   tfidf_message_already       tfidf_message_amp  
##             -1.007e+00              -4.147e+01              -6.034e+01  
##        tfidf_message_b      tfidf_message_babe      tfidf_message_back  
##             -1.632e-01               3.457e-01              -2.947e-02  
##        tfidf_message_c      tfidf_message_call       tfidf_message_can  
##              7.797e-01               3.468e+00              -2.140e+00  
##     tfidf_message_cash        tfidf_message_cc     tfidf_message_claim  
##              1.453e+00              -2.602e+00               7.064e+02  
##     tfidf_message_come         tfidf_message_d        tfidf_message_da  
##             -1.171e+00              -2.652e+00              -4.247e+01  
##      tfidf_message_day      tfidf_message_dear       tfidf_message_don  
##             -3.146e-01              -1.586e+00              -6.873e+01  
##  `tfidf_message_don t`      tfidf_message_dont         tfidf_message_e  
##              6.464e+01              -5.975e-01               3.005e+00  
##     tfidf_message_free       tfidf_message_get      tfidf_message_give  
##              3.433e+00              -1.496e-01               5.976e-02  
##       tfidf_message_go     tfidf_message_going      tfidf_message_good  
##             -1.635e-01              -1.327e+00              -4.495e+00  
##      tfidf_message_got     tfidf_message_great        tfidf_message_gt  
##             -2.716e+00              -2.910e+00              -2.704e+01  
##    tfidf_message_happy       tfidf_message_hey        tfidf_message_hi  
##             -4.276e+01              -1.919e+00              -6.506e-01  
##     tfidf_message_home      tfidf_message_hope        tfidf_message_im  
##             -3.594e+00              -2.187e+00              -1.960e-01  
##     tfidf_message_just         tfidf_message_k      tfidf_message_know  
##             -1.362e+00              -8.341e-01              -8.885e-01  
##    tfidf_message_later      tfidf_message_life      tfidf_message_like  
##             -3.513e+01              -7.942e-01              -6.417e-01  
##       tfidf_message_ll       tfidf_message_lor      tfidf_message_love  
##             -8.393e-01              -5.643e+01              -1.275e+00  
##       tfidf_message_lt   `tfidf_message_lt gt`         tfidf_message_m  
##             -2.808e+01              -1.595e+03              -7.549e-01  
##     tfidf_message_make   tfidf_message_message    tfidf_message_mobile  
##              2.817e-01               4.505e-01               6.625e+00  
##      tfidf_message_msg      tfidf_message_much         tfidf_message_n  
##              4.082e-01              -1.110e+00              -1.073e+00  
##     tfidf_message_need       tfidf_message_new     tfidf_message_night  
##             -1.193e+00               5.205e-01              -2.914e-02  
##      tfidf_message_now    tfidf_message_number        tfidf_message_oh  
##              4.691e-01              -1.686e+00              -5.485e+01  
##       tfidf_message_ok       tfidf_message_one         tfidf_message_p  
##             -3.672e+00              -4.568e-01               8.477e+00  
##    tfidf_message_phone    tfidf_message_please       tfidf_message_pls  
##              2.684e-01               2.179e-01              -1.853e+00  
##    tfidf_message_prize         tfidf_message_r        tfidf_message_re  
##              4.183e+02               5.120e-01              -1.414e-01  
## 
## ...
## and 26 more lines.
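The warning about fitted probabilities of 0 or 1, together with the very large coefficients (e.g. tfidf_message_claim at about 7e+02), suggests quasi-separation in the unpenalized glm. For a more readable view, the parsnip fit can be extracted and tidied; a sketch using pull_workflow_fit(), which this version of workflows provides:

# Extract the underlying glm fit and sort coefficients by magnitude
rl_fit %>%
  pull_workflow_fit() %>%
  tidy() %>%
  arrange(desc(abs(estimate))) %>%
  head(10)

A penalized (lasso) logistic regression is specified next, so the penalty can shrink these unstable coefficients.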
spam_spec <-  logistic_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")

# Inspect the specified model
spam_spec
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet
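With mixture = 1 this is a pure lasso: glmnet minimizes the negative binomial log-likelihood plus a penalty of λ Σⱼ |βⱼ|, and penalty = tune() leaves λ to be chosen by cross-validation below.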
# Build the workflow with the tunable model

spam_wf <- workflow() %>%
  add_recipe(spam_recipeProcessed) %>%
  add_model(spam_spec)

# Inspect the workflow
spam_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
## 
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet
# Create a grid to tune the model's additional parameters
spam_grid <- grid_random(penalty(), size = 25)

spam_grid
## # A tibble: 25 x 1
##          penalty
##            <dbl>
##  1 0.00174      
##  2 0.00000000110
##  3 0.00000814   
##  4 0.0901       
##  5 0.485        
##  6 0.177        
##  7 0.0854       
##  8 0.466        
##  9 0.000000498  
## 10 0.0000000139 
## # … with 15 more rows
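grid_random() samples penalty values at random on a log scale, so the candidates depend on the seed; a deterministic alternative, assuming 25 candidates are still wanted:

# Alternative: an evenly spaced grid on the log scale of the penalty
spam_grid_reg <- grid_regular(penalty(), levels = 25)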
# Set the random seed and create the cross-validation folds

set.seed(123)
spam_folds <- vfold_cv(spam_train, v = 3)
spam_folds
## #  3-fold cross-validation 
## # A tibble: 3 x 2
##   splits              id   
##   <list>              <chr>
## 1 <split [2600/1301]> Fold1
## 2 <split [2601/1300]> Fold2
## 3 <split [2601/1300]> Fold3

5. Measuring metrics via cross-validation

set.seed(234)

# Generate the cross-validation resamples

spam_folds <- vfold_cv(spam_train, v = 5)

spam_folds
## #  5-fold cross-validation 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
# Fit the model on each fold

rl_rs <- fit_resamples(rl_fit,
                       spam_folds,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas, recall, precision)
                       )
## ! Fold1: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold2: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold3: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold4: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold5: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
# Compute the resampled confusion matrix

rl_rs %>% conf_mat_resampled(tidy = FALSE)
##        ham spam
## ham  646.2 28.4
## spam  12.2 93.4
rl_rs %>% collect_metrics()
## # A tibble: 3 x 6
##   .metric   .estimator  mean     n std_err .config             
##   <chr>     <chr>      <dbl> <int>   <dbl> <chr>               
## 1 f_meas    binary     0.969     5 0.00345 Preprocessor1_Model1
## 2 precision binary     0.981     5 0.00268 Preprocessor1_Model1
## 3 recall    binary     0.958     5 0.00507 Preprocessor1_Model1
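These values are consistent with the resampled confusion matrix above, taking ham as the event class: recall = 646.2 / (646.2 + 28.4) ≈ 0.958, precision = 646.2 / (646.2 + 12.2) ≈ 0.981, and f_meas is their harmonic mean, 2 · (0.981 · 0.958) / (0.981 + 0.958) ≈ 0.969.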
# Collect the out-of-fold predictions

rl_rs %>% collect_predictions() %>% head(20)
## # A tibble: 20 x 5
##    id    .pred_class  .row class .config             
##    <chr> <fct>       <int> <fct> <chr>               
##  1 Fold1 spam            8 spam  Preprocessor1_Model1
##  2 Fold1 ham            25 ham   Preprocessor1_Model1
##  3 Fold1 ham            32 ham   Preprocessor1_Model1
##  4 Fold1 ham            36 ham   Preprocessor1_Model1
##  5 Fold1 ham            39 ham   Preprocessor1_Model1
##  6 Fold1 spam           40 spam  Preprocessor1_Model1
##  7 Fold1 ham            42 spam  Preprocessor1_Model1
##  8 Fold1 ham            43 ham   Preprocessor1_Model1
##  9 Fold1 ham            46 ham   Preprocessor1_Model1
## 10 Fold1 ham            48 ham   Preprocessor1_Model1
## 11 Fold1 ham            49 ham   Preprocessor1_Model1
## 12 Fold1 ham            55 ham   Preprocessor1_Model1
## 13 Fold1 spam           56 ham   Preprocessor1_Model1
## 14 Fold1 ham            61 ham   Preprocessor1_Model1
## 15 Fold1 ham            62 ham   Preprocessor1_Model1
## 16 Fold1 ham            65 ham   Preprocessor1_Model1
## 17 Fold1 ham            71 ham   Preprocessor1_Model1
## 18 Fold1 spam           77 spam  Preprocessor1_Model1
## 19 Fold1 spam           79 ham   Preprocessor1_Model1
## 20 Fold1 ham            83 ham   Preprocessor1_Model1
# Tune the model over the grid values so that the best parameter values are selected

set.seed(2020)
spam_grid <- tune_grid(spam_wf,
                        resamples = spam_folds,
                        grid = spam_grid,
                        control = control_resamples(save_pred = TRUE),
                        metrics = metric_set(f_meas, recall, precision)
                        )

spam_grid
## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 x 5
##   splits          id    .metrics           .notes          .predictions         
##   <list>          <chr> <list>             <list>          <list>               
## 1 <split [3120/7… Fold1 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,525 …
## 2 <split [3121/7… Fold2 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 3 <split [3121/7… Fold3 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 4 <split [3121/7… Fold4 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 5 <split [3121/7… Fold5 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
# Inspect the metrics of the resulting models
spam_grid %>%
  collect_metrics()
## # A tibble: 75 x 7
##     penalty .metric   .estimator  mean     n std_err .config              
##       <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 2.24e-10 f_meas    binary     0.973     5 0.00211 Preprocessor1_Model01
##  2 2.24e-10 precision binary     0.966     5 0.00338 Preprocessor1_Model01
##  3 2.24e-10 recall    binary     0.981     5 0.00279 Preprocessor1_Model01
##  4 1.10e- 9 f_meas    binary     0.973     5 0.00211 Preprocessor1_Model02
##  5 1.10e- 9 precision binary     0.966     5 0.00338 Preprocessor1_Model02
##  6 1.10e- 9 recall    binary     0.981     5 0.00279 Preprocessor1_Model02
##  7 1.39e- 8 f_meas    binary     0.973     5 0.00211 Preprocessor1_Model03
##  8 1.39e- 8 precision binary     0.966     5 0.00338 Preprocessor1_Model03
##  9 1.39e- 8 recall    binary     0.981     5 0.00279 Preprocessor1_Model03
## 10 1.90e- 8 f_meas    binary     0.973     5 0.00211 Preprocessor1_Model04
## # … with 65 more rows
# Plot how the metrics change as a function of the penalty value
spam_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()

6. Creating the final model and validating metrics

best_f <- spam_grid %>%
  select_best("f_meas")

best_f
## # A tibble: 1 x 2
##     penalty .config              
##       <dbl> <chr>                
## 1 0.0000979 Preprocessor1_Model14
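Many small penalties tie on f_meas here, so another reasonable choice is the most regularized model whose performance is within one standard error of the best; a sketch using tune's helper (output not shown):

# Alternative: largest penalty within one standard error of the best f_meas
best_f_1se <- spam_grid %>%
  select_by_one_std_err(desc(penalty), metric = "f_meas")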
# Finalize the workflow with the best tuning values and refit it on the full training set
final_spam <- finalize_workflow(spam_wf, best_f) %>%
  fit(spam_train)

final_spam
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
## 
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##      Df  %Dev   Lambda
## 1     0  0.00 0.123200
## 2     3  3.16 0.112300
## 3     5  9.96 0.102300
## 4     6 14.99 0.093190
## 5     8 19.48 0.084910
## 6     9 24.11 0.077370
## 7     9 27.78 0.070500
## 8    10 31.18 0.064230
## 9    10 34.15 0.058530
## 10   11 36.82 0.053330
## 11   13 39.63 0.048590
## 12   14 42.27 0.044270
## 13   14 44.64 0.040340
## 14   14 46.68 0.036760
## 15   14 48.46 0.033490
## 16   14 50.03 0.030520
## 17   14 51.41 0.027810
## 18   15 52.73 0.025340
## 19   16 54.02 0.023080
## 20   16 55.16 0.021030
## 21   17 56.16 0.019170
## 22   18 57.22 0.017460
## 23   18 58.13 0.015910
## 24   19 58.92 0.014500
## 25   20 59.70 0.013210
## 26   22 60.45 0.012040
## 27   22 61.12 0.010970
## 28   23 61.71 0.009993
## 29   24 62.25 0.009105
## 30   25 62.73 0.008296
## 31   26 63.19 0.007559
## 32   27 63.62 0.006888
## 33   28 64.00 0.006276
## 34   30 64.36 0.005718
## 35   34 64.75 0.005210
## 36   36 65.14 0.004747
## 37   40 65.52 0.004326
## 38   46 65.89 0.003941
## 39   49 66.28 0.003591
## 40   56 66.66 0.003272
## 41   59 67.05 0.002982
## 42   59 67.40 0.002717
## 43   60 67.72 0.002475
## 44   60 68.02 0.002255
## 45   67 68.30 0.002055
## 46   68 68.56 0.001872
## 
## ...
## and 54 more lines.
spam_final <- last_fit(final_spam, 
                         split = spam_split,
                         metrics = metric_set(f_meas, recall, precision)
                         )

# Inspect the metrics of the model evaluated on the test data
spam_final %>%
  collect_metrics()
## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.974 Preprocessor1_Model1
## 2 recall    binary         0.979 Preprocessor1_Model1
## 3 precision binary         0.969 Preprocessor1_Model1
# View the predictions for the test data frame

spam_final %>%
  collect_predictions() %>%
  head()
## # A tibble: 6 x 5
##   id               .pred_class  .row class .config             
##   <chr>            <fct>       <int> <fct> <chr>               
## 1 train/test split ham             4 ham   Preprocessor1_Model1
## 2 train/test split ham             8 ham   Preprocessor1_Model1
## 3 train/test split spam           16 spam  Preprocessor1_Model1
## 4 train/test split ham            17 ham   Preprocessor1_Model1
## 5 train/test split ham            18 ham   Preprocessor1_Model1
## 6 train/test split ham            19 ham   Preprocessor1_Model1
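Finally, the test-set predictions can be summarized in a confusion matrix; a minimal sketch (output not shown):

# Confusion matrix on the held-out test data
spam_final %>%
  collect_predictions() %>%
  conf_mat(truth = class, estimate = .pred_class)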