library(textrecipes)
## Loading required package: recipes
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(stopwords)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ stringr 1.4.0
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ readr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
library(themis)
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
library(knitr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(forcats)
library(stringr)
library(dplyr)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✓ broom 0.7.5 ✓ rsample 0.0.9
## ✓ dials 0.0.9 ✓ tune 0.1.3
## ✓ infer 0.5.4 ✓ workflows 0.2.2
## ✓ modeldata 0.1.0 ✓ yardstick 0.0.8
## ✓ parsnip 0.1.5
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x Matrix::expand() masks tidyr::expand()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x Matrix::pack() masks tidyr::pack()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample() masks recipes::step_upsample()
## x Matrix::unpack() masks tidyr::unpack()
## x Matrix::update() masks recipes::update(), stats::update()
spamData <- read.csv("spamData.csv", sep = ",")
1. Data cleaning
Clean_String <- function(string){
  # Remove non-UTF-8 characters
  temp <- iconv(enc2utf8(string), sub = "byte")
  temp <- str_replace_all(temp, "[^[:graph:]]", " ")
  # Remove everything that is not a letter (the pattern also drops digits)
  temp <- stringr::str_replace_all(temp, "[^a-zA-Z\\s]", " ")
  # Collapse extra whitespace
  temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
  # Lowercase
  temp <- tolower(temp)
  return(temp)
}
spamData$message <- Clean_String(spamData$message)
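A quick sanity check on the cleaner (a minimal sketch; the sample message is invented for illustration and the output is shown approximately):
# Hypothetical example message
Clean_String("WINNER!! You've won a £1,000 prize. Call 0906170 now!")
## approximately: "winner you ve won a prize call now "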
2. Training and test samples
2.1 Checking the class sizes
set.seed(124)
# Split the data into training (70%) and test (30%) samples
spam_split <- initial_split(spamData, prop = .7)
spam_train <- training(spam_split)
spam_test <- testing(spam_split)
dim(spam_train); dim(spam_test)
## [1] 3901 2
## [1] 1671 2
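Because spam is the minority class, the split could also be stratified so both samples keep the same class proportions; a minimal sketch (not used in what follows):
# Stratified alternative to the split above
spam_split_strat <- initial_split(spamData, prop = .7, strata = class)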
3. Creating the preprocessing recipe
# Set up the recipe for the model
spam_recipe <- recipe(class ~ message,
                      data = spam_train)
# Apply the preprocessing steps
spam_recipeProcessed <- spam_recipe %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams",
                options = list(n = 2, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 100) %>%
  step_tfidf(message)
# Train (prep) the recipe defined above
spam_recipeProcessedF <- prep(spam_recipeProcessed)
spam_recipeProcessedF
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
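The processed training data can be inspected by baking the trained recipe; a minimal sketch (bake(new_data = NULL) returns the training set as processed; on older recipes versions juice() serves the same purpose):
# Inspect the tf-idf features produced by the recipe
spam_baked <- bake(spam_recipeProcessedF, new_data = NULL)
dim(spam_baked) # expected: 3901 rows and 101 columns (100 tf-idf features plus class)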
# Set up the workflow for the machine-learning model
spam_wf <- workflow() %>%
  add_recipe(spam_recipeProcessed)
4. Fitting the initial model
# Check the class frequencies in the training data
spam_train %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
## class n freq
## <chr> <int> <dbl>
## 1 ham 3373 0.865
## 2 spam 528 0.135
# Since the classes are imbalanced, step_smote() is applied on top of the recipe created earlier
spam_recipeProcessed2 <- spam_recipeProcessed %>%
  step_smote(class)
# Train (prep) the recipe defined above
spam_recipeProcessedF2 <- prep(spam_recipeProcessed2)
spam_recipeProcessedF2
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
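To confirm that SMOTE balanced the classes, the baked training data can be counted by class; a minimal sketch (with the default over_ratio = 1 the minority class is oversampled up to the majority size, so roughly 3373 rows per class are expected):
# Check the class balance after SMOTE
bake(spam_recipeProcessedF2, new_data = NULL) %>%
  count(class)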
# Set up the workflow for the machine-learning model
spam_wf2 <- workflow() %>%
  add_recipe(spam_recipeProcessed2)
# Model specification: an unpenalized logistic regression via glm
rl_spec <- logistic_reg() %>%
  set_engine("glm")
rl_spec
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
# Fit the model to the training data
rl_fit <- spam_wf2 %>%
  add_model(rl_spec) %>%
  fit(data = spam_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
rl_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
##
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
## ● step_smote()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) tfidf_message_already tfidf_message_amp
## -1.007e+00 -4.147e+01 -6.034e+01
## tfidf_message_b tfidf_message_babe tfidf_message_back
## -1.632e-01 3.457e-01 -2.947e-02
## tfidf_message_c tfidf_message_call tfidf_message_can
## 7.797e-01 3.468e+00 -2.140e+00
## tfidf_message_cash tfidf_message_cc tfidf_message_claim
## 1.453e+00 -2.602e+00 7.064e+02
## tfidf_message_come tfidf_message_d tfidf_message_da
## -1.171e+00 -2.652e+00 -4.247e+01
## tfidf_message_day tfidf_message_dear tfidf_message_don
## -3.146e-01 -1.586e+00 -6.873e+01
## `tfidf_message_don t` tfidf_message_dont tfidf_message_e
## 6.464e+01 -5.975e-01 3.005e+00
## tfidf_message_free tfidf_message_get tfidf_message_give
## 3.433e+00 -1.496e-01 5.976e-02
## tfidf_message_go tfidf_message_going tfidf_message_good
## -1.635e-01 -1.327e+00 -4.495e+00
## tfidf_message_got tfidf_message_great tfidf_message_gt
## -2.716e+00 -2.910e+00 -2.704e+01
## tfidf_message_happy tfidf_message_hey tfidf_message_hi
## -4.276e+01 -1.919e+00 -6.506e-01
## tfidf_message_home tfidf_message_hope tfidf_message_im
## -3.594e+00 -2.187e+00 -1.960e-01
## tfidf_message_just tfidf_message_k tfidf_message_know
## -1.362e+00 -8.341e-01 -8.885e-01
## tfidf_message_later tfidf_message_life tfidf_message_like
## -3.513e+01 -7.942e-01 -6.417e-01
## tfidf_message_ll tfidf_message_lor tfidf_message_love
## -8.393e-01 -5.643e+01 -1.275e+00
## tfidf_message_lt `tfidf_message_lt gt` tfidf_message_m
## -2.808e+01 -1.595e+03 -7.549e-01
## tfidf_message_make tfidf_message_message tfidf_message_mobile
## 2.817e-01 4.505e-01 6.625e+00
## tfidf_message_msg tfidf_message_much tfidf_message_n
## 4.082e-01 -1.110e+00 -1.073e+00
## tfidf_message_need tfidf_message_new tfidf_message_night
## -1.193e+00 5.205e-01 -2.914e-02
## tfidf_message_now tfidf_message_number tfidf_message_oh
## 4.691e-01 -1.686e+00 -5.485e+01
## tfidf_message_ok tfidf_message_one tfidf_message_p
## -3.672e+00 -4.568e-01 8.477e+00
## tfidf_message_phone tfidf_message_please tfidf_message_pls
## 2.684e-01 2.179e-01 -1.853e+00
## tfidf_message_prize tfidf_message_r tfidf_message_re
## 4.183e+02 5.120e-01 -1.414e-01
##
## ...
## and 26 more lines.
The glm.fit warning above signals quasi-complete separation: a few tf-idf features predict the class almost perfectly, which is why some coefficients are extreme (e.g., tfidf_message_claim). Penalization is the standard remedy, so a lasso logistic regression is specified next with glmnet.
spam_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")
# Inspect the specification: mixture = 1 gives a pure lasso penalty, and penalty is left to be tuned
spam_spec
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
# Bundle the recipe and the model into a workflow
spam_wf <- workflow() %>%
  add_recipe(spam_recipeProcessed) %>%
  add_model(spam_spec)
# Inspect the workflow
spam_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
# Create a random grid of candidate penalty values to tune
spam_grid <- grid_random(penalty(), size = 25)
spam_grid
## # A tibble: 25 x 1
## penalty
## <dbl>
## 1 0.00174
## 2 0.00000000110
## 3 0.00000814
## 4 0.0901
## 5 0.485
## 6 0.177
## 7 0.0854
## 8 0.466
## 9 0.000000498
## 10 0.0000000139
## # … with 15 more rows
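With a single tuning parameter, a regular grid is a common alternative to a random one; a minimal sketch:
# Regular alternative to the random grid above
spam_grid_reg <- grid_regular(penalty(), levels = 25)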
# Set the random seed and create the cross-validation folds
# (these 3 folds are replaced by 5-fold resamples in the next section)
set.seed(123)
spam_folds <- vfold_cv(spam_train, v = 3)
spam_folds
## # 3-fold cross-validation
## # A tibble: 3 x 2
## splits id
## <list> <chr>
## 1 <split [2600/1301]> Fold1
## 2 <split [2601/1300]> Fold2
## 3 <split [2601/1300]> Fold3
5. Measuring metrics with cross-validation
set.seed(234)
# Generate the cross-validation resamples
spam_folds <- vfold_cv(spam_train, v = 5)
spam_folds
## # 5-fold cross-validation
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
# Fit the model on each fold
rl_rs <- fit_resamples(rl_fit,
                       spam_folds,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas, recall, precision)
)
## ! Fold1: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold2: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold3: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold4: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold5: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
# Compute the confusion matrix, averaged over the resamples
rl_rs %>% conf_mat_resampled(tidy = FALSE)
## ham spam
## ham 646.2 28.4
## spam 12.2 93.4
rl_rs %>% collect_metrics()
## # A tibble: 3 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 f_meas binary 0.969 5 0.00345 Preprocessor1_Model1
## 2 precision binary 0.981 5 0.00268 Preprocessor1_Model1
## 3 recall binary 0.958 5 0.00507 Preprocessor1_Model1
# Collect the cross-validated predictions
rl_rs %>% collect_predictions() %>% head(20)
## # A tibble: 20 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 Fold1 spam 8 spam Preprocessor1_Model1
## 2 Fold1 ham 25 ham Preprocessor1_Model1
## 3 Fold1 ham 32 ham Preprocessor1_Model1
## 4 Fold1 ham 36 ham Preprocessor1_Model1
## 5 Fold1 ham 39 ham Preprocessor1_Model1
## 6 Fold1 spam 40 spam Preprocessor1_Model1
## 7 Fold1 ham 42 spam Preprocessor1_Model1
## 8 Fold1 ham 43 ham Preprocessor1_Model1
## 9 Fold1 ham 46 ham Preprocessor1_Model1
## 10 Fold1 ham 48 ham Preprocessor1_Model1
## 11 Fold1 ham 49 ham Preprocessor1_Model1
## 12 Fold1 ham 55 ham Preprocessor1_Model1
## 13 Fold1 spam 56 ham Preprocessor1_Model1
## 14 Fold1 ham 61 ham Preprocessor1_Model1
## 15 Fold1 ham 62 ham Preprocessor1_Model1
## 16 Fold1 ham 65 ham Preprocessor1_Model1
## 17 Fold1 ham 71 ham Preprocessor1_Model1
## 18 Fold1 spam 77 spam Preprocessor1_Model1
## 19 Fold1 spam 79 ham Preprocessor1_Model1
## 20 Fold1 ham 83 ham Preprocessor1_Model1
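The resampled predictions also make it easy to pull out the misclassified messages for inspection; a minimal sketch:
# Inspect the cross-validated misclassifications
rl_rs %>%
  collect_predictions() %>%
  filter(.pred_class != class)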
# Tune the model over the grid so the best penalty value can be chosen
# (note that spam_grid is reassigned from the candidate grid to the tuning results)
set.seed(2020)
spam_grid <- tune_grid(spam_wf,
                       resamples = spam_folds,
                       grid = spam_grid,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas, recall, precision)
)
spam_grid
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [3120/7… Fold1 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,525 …
## 2 <split [3121/7… Fold2 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 3 <split [3121/7… Fold3 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 4 <split [3121/7… Fold4 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 5 <split [3121/7… Fold5 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
# View the metrics of the tuned models
spam_grid %>%
  collect_metrics()
## # A tibble: 75 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2.24e-10 f_meas binary 0.973 5 0.00211 Preprocessor1_Model01
## 2 2.24e-10 precision binary 0.966 5 0.00338 Preprocessor1_Model01
## 3 2.24e-10 recall binary 0.981 5 0.00279 Preprocessor1_Model01
## 4 1.10e- 9 f_meas binary 0.973 5 0.00211 Preprocessor1_Model02
## 5 1.10e- 9 precision binary 0.966 5 0.00338 Preprocessor1_Model02
## 6 1.10e- 9 recall binary 0.981 5 0.00279 Preprocessor1_Model02
## 7 1.39e- 8 f_meas binary 0.973 5 0.00211 Preprocessor1_Model03
## 8 1.39e- 8 precision binary 0.966 5 0.00338 Preprocessor1_Model03
## 9 1.39e- 8 recall binary 0.981 5 0.00279 Preprocessor1_Model03
## 10 1.90e- 8 f_meas binary 0.973 5 0.00211 Preprocessor1_Model04
## # … with 65 more rows
# Plot how the metrics change as a function of the penalty
spam_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()

(Figure: f_meas, precision, and recall versus penalty, one panel per metric, with the penalty on a log scale.)
6. Building the final model and validating its metrics
best_f <- spam_grid %>%
  select_best("f_meas")
best_f
## # A tibble: 1 x 2
## penalty .config
## <dbl> <chr>
## 1 0.0000979 Preprocessor1_Model14
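An alternative to select_best() is to prefer the simplest model within one standard error of the best, which here means the largest acceptable penalty; a minimal sketch using tune::select_by_one_std_err():
# Pick the most regularized model within one SE of the best f_meas
best_f_1se <- spam_grid %>%
  select_by_one_std_err(desc(penalty), metric = "f_meas")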
# Finalize the workflow with the best penalty and refit it on the full training set
final_spam <- finalize_workflow(spam_wf, best_f) %>%
  fit(spam_train)
final_spam
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
##
## Df %Dev Lambda
## 1 0 0.00 0.123200
## 2 3 3.16 0.112300
## 3 5 9.96 0.102300
## 4 6 14.99 0.093190
## 5 8 19.48 0.084910
## 6 9 24.11 0.077370
## 7 9 27.78 0.070500
## 8 10 31.18 0.064230
## 9 10 34.15 0.058530
## 10 11 36.82 0.053330
## 11 13 39.63 0.048590
## 12 14 42.27 0.044270
## 13 14 44.64 0.040340
## 14 14 46.68 0.036760
## 15 14 48.46 0.033490
## 16 14 50.03 0.030520
## 17 14 51.41 0.027810
## 18 15 52.73 0.025340
## 19 16 54.02 0.023080
## 20 16 55.16 0.021030
## 21 17 56.16 0.019170
## 22 18 57.22 0.017460
## 23 18 58.13 0.015910
## 24 19 58.92 0.014500
## 25 20 59.70 0.013210
## 26 22 60.45 0.012040
## 27 22 61.12 0.010970
## 28 23 61.71 0.009993
## 29 24 62.25 0.009105
## 30 25 62.73 0.008296
## 31 26 63.19 0.007559
## 32 27 63.62 0.006888
## 33 28 64.00 0.006276
## 34 30 64.36 0.005718
## 35 34 64.75 0.005210
## 36 36 65.14 0.004747
## 37 40 65.52 0.004326
## 38 46 65.89 0.003941
## 39 49 66.28 0.003591
## 40 56 66.66 0.003272
## 41 59 67.05 0.002982
## 42 59 67.40 0.002717
## 43 60 67.72 0.002475
## 44 60 68.02 0.002255
## 45 67 68.30 0.002055
## 46 68 68.56 0.001872
##
## ...
## and 54 more lines.
spam_final <- last_fit(final_spam,
                       split = spam_split,
                       metrics = metric_set(f_meas, recall, precision)
)
# Evaluate the model's metrics on the test data
spam_final %>%
  collect_metrics()
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas binary 0.974 Preprocessor1_Model1
## 2 recall binary 0.979 Preprocessor1_Model1
## 3 precision binary 0.969 Preprocessor1_Model1
# View the predictions on the test set
spam_final %>%
  collect_predictions() %>%
  head()
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 train/test split ham 4 ham Preprocessor1_Model1
## 2 train/test split ham 8 ham Preprocessor1_Model1
## 3 train/test split spam 16 spam Preprocessor1_Model1
## 4 train/test split ham 17 ham Preprocessor1_Model1
## 5 train/test split ham 18 ham Preprocessor1_Model1
## 6 train/test split ham 19 ham Preprocessor1_Model1
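A confusion matrix on the test predictions complements the metrics above; a minimal sketch using yardstick::conf_mat():
# Confusion matrix on the held-out test set
spam_final %>%
  collect_predictions() %>%
  conf_mat(truth = class, estimate = .pred_class)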