Limpieza de datos

library(readr)
library('dplyr');

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library('textrecipes');

## Loading required package: recipes

## 
## Attaching package: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library('stopwords');
library('tidyverse');

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag()     masks stats::lag()

library('themis');

## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes

## 
## Attaching package: 'themis'

## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample

library('tidymodels');

## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --

## v broom     0.7.5     v rsample   0.0.9
## v dials     0.0.9     v tune      0.1.3
## v infer     0.5.4     v workflows 0.2.2
## v modeldata 0.1.0     v yardstick 0.0.7
## v parsnip   0.1.5

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard()         masks purrr::discard()
## x dplyr::filter()           masks stats::filter()
## x stringr::fixed()          masks recipes::fixed()
## x dplyr::lag()              masks stats::lag()
## x yardstick::spec()         masks readr::spec()
## x recipes::step()           masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample()   masks recipes::step_upsample()

library('themis');
#library('latexpdf');
library('glmnet');

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-1

df <- read_csv("C:/Users/Viviana Moya C/OneDrive/Escritorio/spamData.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   class = col_character(),
##   message = col_character()
## )

spamData <- df

Clean_String <- function(string){

  # Remover caracteres no UTF-8
  temp<- iconv(enc2utf8(string),sub="byte")
  temp<- str_replace_all(temp,"[^[:graph:]]", " ") 
  # Remover todo lo que no sea número o letra 
  temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
  # remover espacios extra
  temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
  # minúscula
  temp <- tolower(temp)
  
  return(temp)
  
}

# Aplicar la función a los comentarios
spamData$message <- Clean_String(spamData$message)

Muestra de entrenamiento y prueba

set.seed(1234) # Asegurar siempre una misma semilla aleatoria.

reviewClass <- spamData %>% 
  mutate(len= str_length(message))

#Realizar la partición de las muestras

reviews_split <- initial_split(reviewClass,prop=.7)

reviews_train <- training(reviews_split)
reviews_test <- testing(reviews_split)

#Creamos la receta inicial

reviews_recipe <- recipe(class ~ message+len, 
                         data = reviews_train)


#Aplicar los pasos de procesamiento de datos

reviews_recipeProcessed <- reviews_recipe %>%
  step_text_normalization(message) %>% # elimina caracteres extraños
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams", 
                options = list(n = 4, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 500) %>%
  step_tfidf(message)  %>%
  step_upsample(class) #alternativa a step_smote

Creamos el workflow y la receta para el nuevo modelo

library(glmnet)

# Creamos el workflow y la receta para el nuevo modelo
reviews_wf <- workflow() %>%
  add_recipe(reviews_recipeProcessed)

# Especificación del modelo

rlasso_spec <-  logistic_reg(penalty = tune(), mixture = 1) %>% # Mixture=1 se requiere para indicar que es LASSO
  set_engine("glmnet")

# Observamos el modelo especificado
rlasso_spec

## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

#Ajustar el modelo con los datos

lasso_wf <- workflow() %>%
  add_recipe(reviews_recipeProcessed) %>%
  add_model(rlasso_spec)

# Observamos el modelo
lasso_wf

## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet

Creación del grid para entrenar los parametros adicionales del modelo

# Creamos un grid para entrenar los parametros adicionales del modelo
lambda_grid <- grid_random(penalty(), size = 25)

lambda_grid

## # A tibble: 25 x 1
##          penalty
##            <dbl>
##  1 0.00000285   
##  2 0.000000934  
##  3 0.265        
##  4 0.0000176    
##  5 0.000000140  
##  6 0.00173      
##  7 0.000000406  
##  8 0.391        
##  9 0.00196      
## 10 0.00000000344
## # ... with 15 more rows

#Seteamos semilla aleatoria y creamos los subsets de la validación cruzada

set.seed(123)
reviews_folds <- vfold_cv(reviews_train,v=5)
reviews_folds

## #  5-fold cross-validation 
## # A tibble: 5 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5

Modelo con los diferentes valores del grid y Visualizamos las métricas

# Entrenamos el modelo con los diferentes valores del grid para que escoja los mejores valores de los parametros

set.seed(2020)
lasso_grid <- tune_grid(lasso_wf,
                        resamples = reviews_folds,
                        grid = lambda_grid,
                        control = control_resamples(save_pred = TRUE),
                        metrics = metric_set(f_meas, recall, precision)
                        )

## ! Fold3: internal: While computing binary `precision()`, no predicted events were...

## ! Fold5: internal: While computing binary `precision()`, no predicted events were...

lasso_grid

## Warning: This tuning result has notes. Example notes on model fitting include:
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 694 true event(s) actually occured for the problematic event level, 'ham'.
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 678 true event(s) actually occured for the problematic event level, 'ham'.

## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 x 5
##   splits             id    .metrics         .notes          .predictions        
##   <list>             <chr> <list>           <list>          <list>              
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~

#Visualizamos las métricas del modelo resultante
lasso_grid %>%
  collect_metrics()

## # A tibble: 75 x 7
##     penalty .metric   .estimator  mean     n std_err .config              
##       <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 3.61e-10 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model01
##  2 3.61e-10 precision binary     0.984     5 0.00111 Preprocessor1_Model01
##  3 3.61e-10 recall    binary     0.965     5 0.00352 Preprocessor1_Model01
##  4 3.44e- 9 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model02
##  5 3.44e- 9 precision binary     0.984     5 0.00111 Preprocessor1_Model02
##  6 3.44e- 9 recall    binary     0.965     5 0.00352 Preprocessor1_Model02
##  7 7.52e- 9 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model03
##  8 7.52e- 9 precision binary     0.984     5 0.00111 Preprocessor1_Model03
##  9 7.52e- 9 recall    binary     0.965     5 0.00352 Preprocessor1_Model03
## 10 3.94e- 8 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model04
## # ... with 65 more rows

#seleccionamos el mejor modelo según métrica F1 Score


best_f <- lasso_grid %>%
  select_best("f_meas")

best_f

## # A tibble: 1 x 2
##   penalty .config              
##     <dbl> <chr>                
## 1 0.00173 Preprocessor1_Model18

#Entrenamos el modelo final con los valores del mejor modelo de entrenamiento
final_lasso <- finalize_workflow(lasso_wf, best_f) %>%
  fit(reviews_train)

final_lasso

## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##      Df  %Dev   Lambda
## 1     0  0.00 0.234400
## 2     1  2.72 0.213600
## 3     1  5.11 0.194600
## 4     1  7.20 0.177300
## 5     1  9.04 0.161600
## 6     2 11.29 0.147200
## 7     3 13.63 0.134200
## 8     3 16.27 0.122200
## 9     5 18.81 0.111400
## 10    5 21.65 0.101500
## 11    6 24.31 0.092470
## 12    8 26.95 0.084250
## 13   10 30.02 0.076770
## 14   11 33.05 0.069950
## 15   14 35.95 0.063730
## 16   18 38.79 0.058070
## 17   19 41.65 0.052910
## 18   23 44.33 0.048210
## 19   28 47.03 0.043930
## 20   34 49.74 0.040030
## 21   37 52.38 0.036470
## 22   43 55.05 0.033230
## 23   49 57.71 0.030280
## 24   54 60.18 0.027590
## 25   56 62.43 0.025140
## 26   60 64.50 0.022900
## 27   65 66.50 0.020870
## 28   69 68.35 0.019020
## 29   74 70.15 0.017330
## 30   75 71.78 0.015790
## 31   80 73.25 0.014380
## 32   82 74.61 0.013110
## 33   90 75.89 0.011940
## 34  103 77.14 0.010880
## 35  112 78.32 0.009915
## 36  115 79.43 0.009034
## 37  120 80.47 0.008231
## 38  130 81.48 0.007500
## 39  139 82.46 0.006834
## 40  146 83.37 0.006227
## 41  156 84.21 0.005674
## 42  166 85.02 0.005170
## 43  172 85.76 0.004710
## 44  184 86.46 0.004292
## 45  195 87.13 0.003911
## 46  200 87.75 0.003563
## 
## ...
## and 54 more lines.

Entrenamos el modelo con los diferentes valores del grid

# Entrenamos el modelo con los diferentes valores del grid para que escoja los mejores valores de los parametros

set.seed(2020)
lasso_grid <- tune_grid(lasso_wf,
                        resamples = reviews_folds,
                        grid = lambda_grid,
                        control = control_resamples(save_pred = TRUE),
                        metrics = metric_set(f_meas, recall, precision)
                        )

## ! Fold3: internal: While computing binary `precision()`, no predicted events were...

## ! Fold5: internal: While computing binary `precision()`, no predicted events were...

lasso_grid

## Warning: This tuning result has notes. Example notes on model fitting include:
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 694 true event(s) actually occured for the problematic event level, 'ham'.
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`). 
## Precision is undefined in this case, and `NA` will be returned.
## Note that 678 true event(s) actually occured for the problematic event level, 'ham'.

## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 x 5
##   splits             id    .metrics         .notes          .predictions        
##   <list>             <chr> <list>           <list>          <list>              
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~

library(rlang)

#Visualizamos las métricas del modelo resultante

lasso_grid %>%
  collect_metrics()

## # A tibble: 75 x 7
##     penalty .metric   .estimator  mean     n std_err .config              
##       <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>                
##  1 3.61e-10 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model01
##  2 3.61e-10 precision binary     0.984     5 0.00111 Preprocessor1_Model01
##  3 3.61e-10 recall    binary     0.965     5 0.00352 Preprocessor1_Model01
##  4 3.44e- 9 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model02
##  5 3.44e- 9 precision binary     0.984     5 0.00111 Preprocessor1_Model02
##  6 3.44e- 9 recall    binary     0.965     5 0.00352 Preprocessor1_Model02
##  7 7.52e- 9 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model03
##  8 7.52e- 9 precision binary     0.984     5 0.00111 Preprocessor1_Model03
##  9 7.52e- 9 recall    binary     0.965     5 0.00352 Preprocessor1_Model03
## 10 3.94e- 8 f_meas    binary     0.975     5 0.00180 Preprocessor1_Model04
## # ... with 65 more rows

Selección del mejor modelo

#Visualizamos los cambios de las métricas en función de los valores de penalidad
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()

Seleccionamos el mejor modelo según métrica F1 Score

#seleccionamos el mejor modelo según métrica F1 Score

best_f <- lasso_grid %>%
  select_best("f_meas")

best_f

## # A tibble: 1 x 2
##   penalty .config              
##     <dbl> <chr>                
## 1 0.00173 Preprocessor1_Model18

#Entrenamos el modelo final con los valores del mejor modelo de entrenamiento
final_lasso <- finalize_workflow(lasso_wf, best_f) %>%
  fit(reviews_train)

final_lasso

## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
## 
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 
## 
##      Df  %Dev   Lambda
## 1     0  0.00 0.234400
## 2     1  2.72 0.213600
## 3     1  5.11 0.194600
## 4     1  7.20 0.177300
## 5     1  9.04 0.161600
## 6     2 11.29 0.147200
## 7     3 13.63 0.134200
## 8     3 16.27 0.122200
## 9     5 18.81 0.111400
## 10    5 21.65 0.101500
## 11    6 24.31 0.092470
## 12    8 26.95 0.084250
## 13   10 30.02 0.076770
## 14   11 33.05 0.069950
## 15   14 35.95 0.063730
## 16   18 38.79 0.058070
## 17   19 41.65 0.052910
## 18   23 44.33 0.048210
## 19   28 47.03 0.043930
## 20   34 49.74 0.040030
## 21   37 52.38 0.036470
## 22   43 55.05 0.033230
## 23   49 57.71 0.030280
## 24   54 60.18 0.027590
## 25   56 62.43 0.025140
## 26   60 64.50 0.022900
## 27   65 66.50 0.020870
## 28   69 68.35 0.019020
## 29   74 70.15 0.017330
## 30   75 71.78 0.015790
## 31   80 73.25 0.014380
## 32   82 74.61 0.013110
## 33   90 75.89 0.011940
## 34  103 77.14 0.010880
## 35  112 78.32 0.009915
## 36  115 79.43 0.009034
## 37  120 80.47 0.008231
## 38  130 81.48 0.007500
## 39  139 82.46 0.006834
## 40  146 83.37 0.006227
## 41  156 84.21 0.005674
## 42  166 85.02 0.005170
## 43  172 85.76 0.004710
## 44  184 86.46 0.004292
## 45  195 87.13 0.003911
## 46  200 87.75 0.003563
## 
## ...
## and 54 more lines.

Evaluación de resultados en datos de prueba

#Evaluamos el modelo con los datos de prueba
review_final <- last_fit(final_lasso, 
                         split=reviews_split,
                         metrics = metric_set(f_meas, recall, precision)
                         )

# Observamos métricas del modelo evaluado en datos de prueba
review_final %>%
  collect_metrics()

## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.976 Preprocessor1_Model1
## 2 recall    binary         0.962 Preprocessor1_Model1
## 3 precision binary         0.990 Preprocessor1_Model1

# Visualizar las predicciones del dataframe de prueba

review_final %>%
  collect_predictions %>%
  head()

## # A tibble: 6 x 5
##   id               .pred_class  .row class .config             
##   <chr>            <fct>       <int> <fct> <chr>               
## 1 train/test split ham             7 ham   Preprocessor1_Model1
## 2 train/test split ham            11 ham   Preprocessor1_Model1
## 3 train/test split spam           13 spam  Preprocessor1_Model1
## 4 train/test split ham            15 ham   Preprocessor1_Model1
## 5 train/test split spam           16 spam  Preprocessor1_Model1
## 6 train/test split ham            17 ham   Preprocessor1_Model1

Predecir nuevos datos

comment<- "congratulations ur."
len<- str_length(comment)

new_comment <- tribble(~message,~len,comment,len)
new_comment

## # A tibble: 1 x 2
##   message               len
##   <chr>               <int>
## 1 congratulations ur.    19

prediction<-predict(final_lasso, new_data = new_comment)


paste0("el resultado para el comentario ","'",new_comment$message,"'","es: ",
       prediction$.pred_class)

## [1] "el resultado para el comentario 'congratulations ur.'es: ham"

Tarea7

Asincronica7

4/4/2021

Limpieza de datos

Muestra de entrenamiento y prueba

Creamos el workflow y la receta para el nuevo modelo

Creación del grid para entrenar los parametros adicionales del modelo

Modelo con los diferentes valores del grid y Visualizamos las métricas

Entrenamos el modelo con los diferentes valores del grid

Selección del mejor modelo

Seleccionamos el mejor modelo según métrica F1 Score

Evaluación de resultados en datos de prueba

Predecir nuevos datos