1.Limpieza de los datos

library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.0.4
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.0.4
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.0.4
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.1.0     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v readr   1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag()     masks stats::lag()
library(themis)
## Warning: package 'themis' was built under R version 4.0.4
## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes
## 
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.0.4
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom     0.7.2     v rsample   0.0.9
## v dials     0.0.9     v tune      0.1.3
## v infer     0.5.4     v workflows 0.2.2
## v modeldata 0.1.0     v yardstick 0.0.7
## v parsnip   0.1.5
## Warning: package 'broom' was built under R version 4.0.3
## Warning: package 'dials' was built under R version 4.0.4
## Warning: package 'scales' was built under R version 4.0.3
## Warning: package 'infer' was built under R version 4.0.4
## Warning: package 'modeldata' was built under R version 4.0.4
## Warning: package 'parsnip' was built under R version 4.0.4
## Warning: package 'rsample' was built under R version 4.0.4
## Warning: package 'tune' was built under R version 4.0.4
## Warning: package 'workflows' was built under R version 4.0.4
## Warning: package 'yardstick' was built under R version 4.0.4
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard()         masks purrr::discard()
## x dplyr::filter()           masks stats::filter()
## x stringr::fixed()          masks recipes::fixed()
## x dplyr::lag()              masks stats::lag()
## x yardstick::spec()         masks readr::spec()
## x recipes::step()           masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample()   masks recipes::step_upsample()
library(readr)

spamData <- read_csv("spamData.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   class = col_character(),
##   message = col_character()
## )
Clean_String <- function(string){

  # Remover caracteres no UTF-8
  temp<- iconv(enc2utf8(string),sub="byte")
  temp<- str_replace_all(temp,"[^[:graph:]]", " ") 
  # Remover todo lo que no sea número o letra 
  temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
  # remover espacios extra
  temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
  # minúscula
  temp <- tolower(temp)
  
  return(temp)
  
}

# Aplicar la función a los comentarios
spamData$message <- Clean_String(spamData$message)

2. Muestra de entrenamiento y prueba

2.1 Verificación del tamaño de las clases

library(tidymodels)

set.seed(1234) # Asegurar siempre una misma semilla aleatoria.

#Realizar la partición de las muestras

reviews_split <- initial_split(spamData,prop=.7)

reviews_train <- training(reviews_split)
reviews_test <- testing(reviews_split)

dim(reviews_train);dim(reviews_test)
## [1] 3901    2
## [1] 1671    2

Creación de receta de preprocesamiento de datos

library(textrecipes)
library(stopwords)

# Setear la receta del modelo a utilizar

reviews_recipe <- recipe(class ~ message, 
                         data = reviews_train)

#Aplicar los pasos de procesamiento de datos

reviews_recipeProcessed <- reviews_recipe %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams", 
                options = list(n = 2, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 500) %>%
  step_tfidf(message)

#Ejecutar la receta del paso anterior
reviews_recipeProcessedF <- prep(reviews_recipeProcessed)

reviews_recipeProcessedF
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 3901 data points and no missing data.
## 
## Operations:
## 
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
#Setear el workflow para trabajar el modelo de Machine Learning

reviews_wf <- workflow() %>%
  add_recipe(reviews_recipeProcessed)

Ajuste del modelo inicial

library(themis)

#Verificamos las frecuencias de nuestro dataframe

reviews_train %>%
  group_by(class) %>%
  summarise(n=n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
##   class     n  freq
##   <chr> <int> <dbl>
## 1 ham    3387 0.868
## 2 spam    514 0.132
# Si es requerido se utiliza la función step_smote sobre la receta que ya se había creado


reviews_recipeProcessed2 <- reviews_recipeProcessed %>%
  step_smote(class)

#Ejecutar la receta del paso anterior
reviews_recipeProcessedF2 <- prep(reviews_recipeProcessed2)

reviews_recipeProcessedF2
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 3901 data points and no missing data.
## 
## Operations:
## 
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
#Setear el workflow para trabajar el modelo de Machine Learning

reviews_wf2 <- workflow() %>%
  add_recipe(reviews_recipeProcessed2)

Medición de métricas mediante validación cruzada

#Setear el workflow para trabajar el modelo de Machine Learning

reviews_wf2 <- workflow() %>%
  add_recipe(reviews_recipeProcessed2)

# Especificación del modelo

rl_spec <- logistic_reg() %>% 
  set_engine("glm")

rl_spec
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm
#Ajustar el modelo con los datos

rl_fit <- reviews_wf2 %>%
  add_model(rl_spec) %>%
  fit(data = reviews_train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
rl_fit
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 7 Recipe Steps
## 
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_smote()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##                  (Intercept)            tfidf_message_able  
##                   -2.072e+01                    -6.036e+14  
##            tfidf_message_abt         tfidf_message_account  
##                   -1.509e+15                    -4.608e+13  
##       tfidf_message_actually         tfidf_message_address  
##                   -9.276e+14                    -4.050e+13  
##            tfidf_message_aft       tfidf_message_afternoon  
##                   -2.291e+13                    -3.107e+14  
##            tfidf_message_age              tfidf_message_ah  
##                    1.333e+14                     3.346e+13  
##          tfidf_message_aight         tfidf_message_already  
##                   -5.766e+14                    -5.256e+14  
##        tfidf_message_alright            tfidf_message_also  
##                   -4.376e+14                    -5.844e+14  
##         tfidf_message_always             tfidf_message_amp  
##                   -7.958e+14                    -1.575e+15  
##        tfidf_message_another          tfidf_message_answer  
##                   -6.964e+14                    -1.698e+14  
##       tfidf_message_anything          tfidf_message_anyway  
##                   -1.130e+15                    -1.113e+14  
##          tfidf_message_apply             tfidf_message_ard  
##                    8.366e+14                    -1.218e+15  
##         tfidf_message_around             tfidf_message_ask  
##                    1.378e+15                     9.041e+13  
##          tfidf_message_asked         tfidf_message_auction  
##                   -8.761e+14                     1.439e+15  
##        tfidf_message_awarded            tfidf_message_away  
##                    2.959e+15                    -7.127e+14  
##              tfidf_message_b            tfidf_message_babe  
##                    1.145e+15                    -4.921e+14  
##           tfidf_message_baby            tfidf_message_back  
##                   -1.105e+15                     2.432e+14  
##            tfidf_message_bad             tfidf_message_bed  
##                   -3.366e+14                    -6.131e+14  
##           tfidf_message_best          tfidf_message_better  
##                   -1.315e+14                    -2.062e+15  
##            tfidf_message_big        tfidf_message_birthday  
##                   -6.043e+14                     6.991e+14  
##            tfidf_message_bit            tfidf_message_book  
##                   -9.977e+13                    -6.323e+14  
##          tfidf_message_bored             tfidf_message_box  
##                   -1.358e+15                    -3.108e+14  
##            tfidf_message_boy           tfidf_message_bring  
##                   -8.905e+14                    -2.729e+14  
##             tfidf_message_bt             tfidf_message_bus  
##                   -3.742e+15                    -1.284e+15  
## 
## ...
## and 460 more lines.

Creación del modelo final y validación de métricas

#Se setea una semilla aleatoria para evitar diferentes resultados cada corrida
set.seed(234)

# Se genera las submuestras de validación cruzada

reviews_folds <- vfold_cv(reviews_train,v=5)

reviews_folds
## #  5-fold cross-validation 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
# Se ajusta el modelo para cada fold

rl_rs <- fit_resamples(rl_fit,
                       reviews_folds,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas,recall,precision)
                       )
## Warning: package 'rlang' was built under R version 4.0.4
## Warning: package 'vctrs' was built under R version 4.0.4
## ! Fold1: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Fold1: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold2: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold2: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold3: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold3: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold4: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold4: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold5: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold5: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
# Se calcula la matriz de confusión

rl_rs %>% conf_mat_resampled(tidy = F)
##        ham spam
## ham  640.8 36.6
## spam  14.4 88.4
#Cálculo de Predicciones
rl_rs %>% collect_predictions() %>% head()
## # A tibble: 6 x 5
##   id    .pred_class  .row class .config             
##   <chr> <fct>       <int> <fct> <chr>               
## 1 Fold1 spam            8 spam  Preprocessor1_Model1
## 2 Fold1 ham            25 ham   Preprocessor1_Model1
## 3 Fold1 ham            32 ham   Preprocessor1_Model1
## 4 Fold1 ham            36 ham   Preprocessor1_Model1
## 5 Fold1 ham            39 ham   Preprocessor1_Model1
## 6 Fold1 ham            40 ham   Preprocessor1_Model1

Prueba de nuevos datos(Sugerencia:Observar ejemplos del dataset e incluir textos similares)

rl_fitFinal <- reviews_wf2 %>%
  add_model(rl_spec) %>%
  # Ajusta en el dataset de entrenamiento y evalúa en el dataset de prueba
  last_fit(reviews_split,
           metrics = metric_set(f_meas,recall,precision)
           )
## ! train/test split: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
# recolectamos las métricas
rl_fitFinal %>% collect_metrics()
## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.969 Preprocessor1_Model1
## 2 recall    binary         0.953 Preprocessor1_Model1
## 3 precision binary         0.986 Preprocessor1_Model1
tokensImp<-pull_workflow_fit(rl_fit)$fit 

tokensImp<- tokensImp$coefficients

tokensImpDF <- data.frame(token=names(tokensImp),values=tokensImp) %>%
  mutate(token=str_remove_all(token,"tfidf_comments_"))
  
tokensImpDF %>%
  top_n(15, abs(tokensImpDF$values)) %>%
  ungroup() %>%
  ggplot(aes(fct_reorder(token, values), values, fill = values > 0)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  labs(
    x = NULL,
    title = "Coefficients that increase/decrease probability the most")+
  theme_minimal()

#Predecir nuevos datos

new_comment <- tribble(~message,"price")
new_comment
## # A tibble: 1 x 1
##   message
##   <chr>  
## 1 price
prediction<-predict(rl_fit, new_data = new_comment)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
paste0("el resultado para el comentario ","'",new_comment,"'","es: ",prediction$.pred_class)
## [1] "el resultado para el comentario 'price'es: spam"