Replique todos los pasos requeridos y ejemplificados en este tutorial para generar un modelo de regresionn logistica que clasifique entre ham y spam.

library(textrecipes)
library(stopwords)
library(tidyverse)
library(themis)
library(tidymodels)
library(stringr)
library(rsample)
library(tidytext)

##Limpieza de los datos

Clean_String <- function(string){
  
  # Remover caracteres no UTF-8
  temp<- iconv(enc2utf8(string),sub="byte")
  temp<- str_replace_all(temp,"[^[:graph:]]", " ") 
  # Remover todo lo que no sea número o letra 
  temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
  # remover espacios extra
  temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
  # minúscula
  temp <- tolower(temp)
  
  return(temp)
  
}

library(readr)

wd <- "C:/Users/Aleba/Documents/spamData.csv"
spamData <- read.csv(wd)


spamData$message <- Clean_String(spamData$message)

str(spamData)
## 'data.frame':    5572 obs. of  2 variables:
##  $ class  : chr  "ham" "ham" "spam" "ham" ...
##  $ message: chr  "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat " "ok lar joking wif u oni " "free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s" "u dun say so early hor u c already then say " ...

Muestra de entrenamiento y prueba

set.seed(1234)



spamdividido <- initial_split(spamData,prop=.7)

spam_training <- training(spamdividido)
spam_testing <- testing(spamdividido)

dim(spam_training);dim(spam_testing)
## [1] 3901    2
## [1] 1671    2
 receta_1<- recipe(class ~ message, 
                         data = spamData)



recetaprocesada1 <- receta_1 %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams", 
                options = list(n = 2, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 500) %>%
  step_tfidf(message)


recetaejecutada <- prep(recetaprocesada1)
    

recetaejecutada
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 5572 data points and no missing data.
## 
## Operations:
## 
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
recetaworkflow_1 <- workflow() %>%
  add_recipe(recetaprocesada1)

Verificación del tamaño de las clases

spam_training %>%
  group_by(class) %>%
  summarise(n=n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
##   class     n  freq
##   <chr> <int> <dbl>
## 1 ham    3387 0.868
## 2 spam    514 0.132
recetasobremuestreada <- recetaprocesada1 %>%
  step_smote(class)

#Creación de receta de preprocesamiento de datos

recetaejecutada_2 <- prep(recetasobremuestreada)

recetaejecutada_2
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 5572 data points and no missing data.
## 
## Operations:
## 
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
recetaworkflow_2 <- workflow() %>%
add_recipe(recetasobremuestreada)

Ajuste del modelo inicial

regresion <- logistic_reg() %>% 
  set_engine("glm")

regresion
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm
modeloajustado <- recetaworkflow_2 %>%
  add_model(regresion) %>%
  fit(data = spam_training)

modeloajustado
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 7 Recipe Steps
## 
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_smote()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##                  (Intercept)            tfidf_message_able  
##                   -2.072e+01                    -6.036e+14  
##            tfidf_message_abt         tfidf_message_account  
##                   -1.509e+15                    -4.608e+13  
##       tfidf_message_actually         tfidf_message_address  
##                   -9.276e+14                    -4.050e+13  
##            tfidf_message_aft       tfidf_message_afternoon  
##                   -2.291e+13                    -3.107e+14  
##            tfidf_message_age              tfidf_message_ah  
##                    1.333e+14                     3.346e+13  
##          tfidf_message_aight         tfidf_message_already  
##                   -5.766e+14                    -5.256e+14  
##        tfidf_message_alright            tfidf_message_also  
##                   -4.376e+14                    -5.844e+14  
##         tfidf_message_always             tfidf_message_amp  
##                   -7.958e+14                    -1.575e+15  
##        tfidf_message_another          tfidf_message_answer  
##                   -6.964e+14                    -1.698e+14  
##       tfidf_message_anything          tfidf_message_anyway  
##                   -1.130e+15                    -1.113e+14  
##          tfidf_message_apply             tfidf_message_ard  
##                    8.366e+14                    -1.218e+15  
##         tfidf_message_around             tfidf_message_ask  
##                    1.378e+15                     9.041e+13  
##          tfidf_message_asked         tfidf_message_auction  
##                   -8.761e+14                     1.439e+15  
##        tfidf_message_awarded            tfidf_message_away  
##                    2.959e+15                    -7.127e+14  
##              tfidf_message_b            tfidf_message_babe  
##                    1.145e+15                    -4.921e+14  
##           tfidf_message_baby            tfidf_message_back  
##                   -1.105e+15                     2.432e+14  
##            tfidf_message_bad             tfidf_message_bed  
##                   -3.366e+14                    -6.131e+14  
##           tfidf_message_best          tfidf_message_better  
##                   -1.315e+14                    -2.062e+15  
##            tfidf_message_big        tfidf_message_birthday  
##                   -6.043e+14                     6.991e+14  
##            tfidf_message_bit            tfidf_message_book  
##                   -9.977e+13                    -6.323e+14  
##          tfidf_message_bored             tfidf_message_box  
##                   -1.358e+15                    -3.108e+14  
##            tfidf_message_boy           tfidf_message_bring  
##                   -8.905e+14                    -2.729e+14  
##             tfidf_message_bt             tfidf_message_bus  
##                   -3.742e+15                    -1.284e+15  
## 
## ...
## and 460 more lines.

Medición de métricas mediante validación cruzada

set.seed(234)



matriz_folds <- vfold_cv(spam_training, v=5)

matriz_folds
## #  5-fold cross-validation 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5

Creación del modelo final y validación de métricas

medidas <- fit_resamples(modeloajustado,
                       matriz_folds,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas,recall,precision))



medidas %>% conf_mat_resampled(tidy = F)
##        ham spam
## ham  640.8 36.6
## spam  14.4 88.4
medidas %>% collect_metrics()
## # A tibble: 3 x 6
##   .metric   .estimator  mean     n std_err .config             
##   <chr>     <chr>      <dbl> <int>   <dbl> <chr>               
## 1 f_meas    binary     0.962     5 0.00631 Preprocessor1_Model1
## 2 precision binary     0.978     5 0.00434 Preprocessor1_Model1
## 3 recall    binary     0.946     5 0.0107  Preprocessor1_Model1
medidas %>% collect_predictions() %>% head()
## # A tibble: 6 x 5
##   id    .pred_class  .row class .config             
##   <chr> <fct>       <int> <fct> <chr>               
## 1 Fold1 spam            8 spam  Preprocessor1_Model1
## 2 Fold1 ham            25 ham   Preprocessor1_Model1
## 3 Fold1 ham            32 ham   Preprocessor1_Model1
## 4 Fold1 ham            36 ham   Preprocessor1_Model1
## 5 Fold1 ham            39 ham   Preprocessor1_Model1
## 6 Fold1 ham            40 ham   Preprocessor1_Model1

Prueba de nuevos datos(Sugerencia:Observar ejemplos del dataset e incluir textos similares).

new_comment <- tribble(~message,"ok lar joking wif u oni ")
new_comment
## # A tibble: 1 x 1
##   message                   
##   <chr>                     
## 1 "ok lar joking wif u oni "
prediction<-predict(modeloajustado, new_data = new_comment)


paste0("el resultado para el comentario ","'",new_comment,"'","es: ",prediction$.pred_class)
## [1] "el resultado para el comentario 'ok lar joking wif u oni 'es: ham"