1. Limpieza de los datos

library(stringr)
library(textrecipes)
library(stopwords)
library(tidyverse)
library(themis)
library(tidymodels)
library(stringr)
library(rsample)
library(tidytext)


# Normalise raw message text before modelling.
#
# Args:
#   string: character vector of messages.
# Returns:
#   A lower-cased character vector in which every character that is not an
#   ASCII letter or whitespace has been replaced by a space, and every run of
#   whitespace has been collapsed to a single space. Digits and punctuation
#   are therefore removed; a single leading or trailing space is not trimmed.
Clean_String <- function(string) {
  # Re-encode the input; bytes that cannot be converted are substituted by
  # their "<xx>" hex representation instead of producing NA.
  cleaned <- iconv(enc2utf8(string), sub = "byte")

  # Applied in order, each pattern's matches are replaced by one space:
  #   1. any non-graphical character (control characters, whitespace)
  #   2. anything that is not an ASCII letter or whitespace
  #   3. runs of whitespace
  patterns <- c("[^[:graph:]]", "[^a-zA-Z\\s]", "[\\s]+")
  for (pattern in patterns) {
    cleaned <- stringr::str_replace_all(cleaned, pattern, " ")
  }

  tolower(cleaned)
}

library(readr)
# Path of the CSV holding the labelled messages.
# NOTE(review): absolute, machine-specific path; adjust before running
# on another machine.
wd <- "C:/Users/Aleba/Documents/spamData.csv"
spamData <- read.csv(file = wd)

# Normalise the message text once, before any splitting or modelling.
spamData[["message"]] <- Clean_String(spamData[["message"]])

# Show the structure (column names, types, first values) of the cleaned data.
str(spamData)

## 'data.frame':    5572 obs. of  2 variables:
##  $ class  : chr  "ham" "ham" "spam" "ham" ...
##  $ message: chr  "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat " "ok lar joking wif u oni " "free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s" "u dun say so early hor u c already then say " ...

2. Muestra de entrenamiento y prueba

# Fix the RNG seed so the random partition below is reproducible.
set.seed(1234)

# Random split: 70% of the rows go to training, the rest to testing.
spamdividido <- spamData %>%
  initial_split(prop = 0.7)

spam_training <- spamdividido %>% training()
spam_testing <- spamdividido %>% testing()

# Rows and columns of each partition.
dim(spam_training)
dim(spam_testing)

## [1] 3901    2

## [1] 1671    2

# Base recipe: predict `class` from the `message` text of the training data.
receta_1 <- recipe(class ~ message, data = spam_training)

# Text-feature pipeline. The steps run in the order listed, and each one
# consumes the output of the previous one.
recetaprocesada1 <- receta_1 %>%
  # Unicode-normalise the text.
  step_text_normalization(message) %>%
  # Split into tokens (default tokenizer) so stop words can be removed.
  step_tokenize(message) %>%
  # keep = FALSE removes the stop words rather than keeping only them.
  step_stopwords(message, keep = FALSE) %>%
  # Paste the remaining tokens back into a single string ...
  step_untokenize(message) %>%
  # ... and re-tokenise it into n-grams of length 1 to 4.
  step_tokenize(
    message,
    token = "ngrams",
    options = list(n = 4, n_min = 1)
  ) %>%
  # Cap the vocabulary at 200 tokens.
  step_tokenfilter(message, max_tokens = 200) %>%
  # Replace the token list with one tf-idf column per retained token.
  step_tfidf(message) %>%
  # Up-sample on the outcome to counter the ham/spam imbalance.
  step_upsample(class)

2.1 Verificación del tamaño de las clases

# Class balance of the training partition: row count and share per class.
spam_training %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

## # A tibble: 2 x 3
##   class     n  freq
##   <chr> <int> <dbl>
## 1 ham    3387 0.868
## 2 spam    514 0.132

# Variant of the recipe with a SMOTE step appended on the outcome.
# NOTE(review): `recetaprocesada1` already ends with step_upsample(class), so
# this variant over-samples twice; it is not referenced again in the visible
# code - confirm whether it is still needed.
recetasobremuestreada <- step_smote(recetaprocesada1, class)

3. Creación de receta de preprocesamiento de datos

# Estimate the recipe on its own training data (vocabulary, tf-idf weights).
recetaejecutada <- recetaprocesada1 %>% prep()

# Workflow that carries only the (unprepped) recipe, no model yet.
recetaworkflow_1 <- add_recipe(workflow(), recetaprocesada1)

# First two components of the prepped recipe object.
head(recetaejecutada, 2)

## $var_info
## # A tibble: 2 x 4
##   variable type    role      source  
##   <chr>    <chr>   <chr>     <chr>   
## 1 message  nominal predictor original
## 2 class    nominal outcome   original
## 
## $term_info
## # A tibble: 201 x 4
##    variable               type    role      source  
##    <chr>                  <chr>   <chr>     <chr>   
##  1 class                  nominal outcome   original
##  2 tfidf_message_already  numeric predictor derived 
##  3 tfidf_message_also     numeric predictor derived 
##  4 tfidf_message_always   numeric predictor derived 
##  5 tfidf_message_amp      numeric predictor derived 
##  6 tfidf_message_anything numeric predictor derived 
##  7 tfidf_message_around   numeric predictor derived 
##  8 tfidf_message_ask      numeric predictor derived 
##  9 tfidf_message_b        numeric predictor derived 
## 10 tfidf_message_babe     numeric predictor derived 
## # ... with 191 more rows

4. Ajuste del modelo inicial

# Lasso logistic regression (mixture = 1) fitted with glmnet; the penalty is
# left as a tuning placeholder to be chosen by cross-validation.
rlasso_spec <- set_engine(
  logistic_reg(penalty = tune(), mixture = 1),
  "glmnet"
)

# Bundle the preprocessing recipe and the model specification.
lasso_wf <- add_model(
  add_recipe(workflow(), recetaprocesada1),
  rlasso_spec
)

lasso_wf

## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

5. Medición de métricas mediante validación cruzada

# 25 randomly drawn candidate values for the penalty parameter.
lambda_grid <- penalty() %>%
  grid_random(size = 25)

lambda_grid

## # A tibble: 25 x 1
##          penalty
##            <dbl>
##  1 0.391        
##  2 0.00196      
##  3 0.00000000344
##  4 0.0000000528 
##  5 0.0000000394 
##  6 0.0521       
##  7 0.000000113  
##  8 0.000433     
##  9 0.00665      
## 10 0.0000227    
## # ... with 15 more rows

# Seed the RNG so the fold assignment is reproducible.
set.seed(123)

# 5-fold cross-validation resamples of the training partition.
reviews_folds <- spam_training %>%
  vfold_cv(v = 5)

reviews_folds

## #  5-fold cross-validation 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5

6. Creación del modelo final y validación de métricas

# Seed the RNG before tuning so the run is reproducible.
set.seed(2020)

# Evaluate every candidate penalty in `lambda_grid` on each resample of
# `reviews_folds`, keeping the out-of-fold predictions.
# tune_grid() expects its control object from control_grid();
# control_resamples() is the constructor intended for fit_resamples().
# NOTE(review): yardstick class metrics treat the first factor level as the
# event by default; if the levels of `class` are alphabetical that is "ham",
# not "spam" - confirm this is the intended positive class.
lasso_grid <- tune_grid(
  lasso_wf,
  resamples = reviews_folds,
  grid = lambda_grid,
  control = control_grid(save_pred = TRUE),
  metrics = metric_set(f_meas, recall, precision)
)

lasso_grid

## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 x 5
##   splits             id    .metrics         .notes          .predictions        
##   <list>             <chr> <list>           <list>          <list>              
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~

# Resampled metric estimates (mean and standard error across the folds) for
# each candidate penalty.
collect_metrics(lasso_grid)

## # A tibble: 75 x 7
##     penalty .metric   .estimator  mean     n std_err .config              
##       <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 3.61e-10 f_meas    binary     0.970     5 0.00219 Preprocessor1_Model01
##  2 3.61e-10 precision binary     0.987     5 0.00206 Preprocessor1_Model01
##  3 3.61e-10 recall    binary     0.954     5 0.00482 Preprocessor1_Model01
##  4 3.62e-10 f_meas    binary     0.970     5 0.00219 Preprocessor1_Model02
##  5 3.62e-10 precision binary     0.987     5 0.00206 Preprocessor1_Model02
##  6 3.62e-10 recall    binary     0.954     5 0.00482 Preprocessor1_Model02
##  7 3.44e- 9 f_meas    binary     0.970     5 0.00219 Preprocessor1_Model03
##  8 3.44e- 9 precision binary     0.987     5 0.00206 Preprocessor1_Model03
##  9 3.44e- 9 recall    binary     0.954     5 0.00482 Preprocessor1_Model03
## 10 5.96e- 9 f_meas    binary     0.970     5 0.00219 Preprocessor1_Model04
## # ... with 65 more rows

# Pick the penalty with the best cross-validated F-measure.
# `metric` is passed by name: newer tune releases place it after `...`, so a
# positional string is no longer matched to it. Naming it works on both old
# and new versions.
best_f <- lasso_grid %>%
  select_best(metric = "f_meas")

best_f

## # A tibble: 1 x 2
##   penalty .config              
##     <dbl> <chr>                
## 1 0.00337 Preprocessor1_Model19

# Plug the selected penalty into the workflow and fit it on the full
# training partition.
final_lasso <- lasso_wf %>%
  finalize_workflow(best_f) %>%
  fit(data = spam_training)

final_lasso

## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##      Df  %Dev   Lambda
## 1     0  0.00 0.173900
## 2     1  1.49 0.158500
## 3     2  3.73 0.144400
## 4     4  6.41 0.131600
## 5     4  9.70 0.119900
## 6     5 13.17 0.109200
## 7     5 16.36 0.099520
## 8     7 19.44 0.090680
## 9    13 23.21 0.082620
## 10   14 27.23 0.075280
## 11   15 30.92 0.068590
## 12   19 34.42 0.062500
## 13   20 37.77 0.056950
## 14   21 40.87 0.051890
## 15   25 43.91 0.047280
## 16   29 46.85 0.043080
## 17   30 49.56 0.039250
## 18   32 52.09 0.035760
## 19   36 54.42 0.032590
## 20   37 56.63 0.029690
## 21   38 58.59 0.027050
## 22   38 60.35 0.024650
## 23   41 61.96 0.022460
## 24   43 63.44 0.020470
## 25   47 64.89 0.018650
## 26   51 66.22 0.016990
## 27   55 67.47 0.015480
## 28   58 68.69 0.014110
## 29   59 69.80 0.012850
## 30   63 70.81 0.011710
## 31   69 71.79 0.010670
## 32   75 72.77 0.009723
## 33   75 73.67 0.008859
## 34   77 74.49 0.008072
## 35   82 75.24 0.007355
## 36   88 75.94 0.006702
## 37   92 76.60 0.006106
## 38  101 77.24 0.005564
## 39  110 77.84 0.005070
## 40  117 78.42 0.004619
## 41  121 78.96 0.004209
## 42  123 79.46 0.003835
## 43  125 79.92 0.003494
## 44  130 80.34 0.003184
## 45  137 80.74 0.002901
## 46  140 81.11 0.002643
## 
## ...
## and 54 more lines.

# Fit on the training portion of the original split and score the held-out
# test portion with the same three metrics used during tuning.
review_final <- final_lasso %>%
  last_fit(
    split = spamdividido,
    metrics = metric_set(f_meas, recall, precision)
  )

# Test-set metric estimates.
collect_metrics(review_final)

## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.969 Preprocessor1_Model1
## 2 recall    binary         0.949 Preprocessor1_Model1
## 3 precision binary         0.990 Preprocessor1_Model1

# First rows of the test-set predictions next to the true class.
head(collect_predictions(review_final))

## # A tibble: 6 x 5
##   id               .pred_class  .row class .config             
##   <chr>            <fct>       <int> <fct> <chr>               
## 1 train/test split ham             7 ham   Preprocessor1_Model1
## 2 train/test split ham            11 ham   Preprocessor1_Model1
## 3 train/test split spam           13 spam  Preprocessor1_Model1
## 4 train/test split ham            15 ham   Preprocessor1_Model1
## 5 train/test split spam           16 spam  Preprocessor1_Model1
## 6 train/test split ham            17 ham   Preprocessor1_Model1

7. Prueba de nuevos datos (Sugerencia: observar ejemplos del dataset e incluir textos similares).

# New message to classify. The training text was passed through
# Clean_String() before modelling, so new text gets the same normalisation;
# otherwise digits and other non-letter characters would reach the model
# even though it never saw them during training.
new_comment <- tribble(
  ~message,
  Clean_String("ok lar joking wif u oni ")
)
new_comment

## # A tibble: 1 x 1
##   message                   
##   <chr>                     
## 1 "ok lar joking wif u oni "

# Predicted class for each new message.
prediction <- predict(final_lasso, new_data = new_comment)

# Build one result sentence per message. The `message` column is used rather
# than the whole tibble: pasting the tibble coerces each column as a single
# element, which yields deparsed `c(...)` text once there is more than one row.
paste0(
  "el resultado para el comentario ", "'", new_comment$message, "'",
  "es: ", prediction$.pred_class
)

## [1] "el resultado para el comentario 'ok lar joking wif u oni 'es: ham"

Tarea #7

Alejandro Roa Badilla

4/8/2021

1. Limpieza de los datos

2. Muestra de entrenamiento y prueba

2.1 Verificación del tamaño de las clases

3. Creación de receta de preprocesamiento de datos

4. Ajuste del modelo inicial

5. Medición de métricas mediante validación cruzada

6. Creación del modelo final y validación de métricas

7. Prueba de nuevos datos (Sugerencia: observar ejemplos del dataset e incluir textos similares).