## Libraries, dataset, and data cleaning
#install.packages("glmnet")
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(textrecipes)
## Loading required package: recipes
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:Matrix':
##
## update
## The following object is masked from 'package:stats':
##
## step
library(stopwords)
library(themis)
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.5 v rsample 0.0.9
## v dials 0.0.9 v tibble 3.1.0
## v ggplot2 3.3.3 v tidyr 1.1.3
## v infer 0.5.4 v tune 0.1.3
## v modeldata 0.1.0 v workflows 0.2.2
## v parsnip 0.1.5 v yardstick 0.0.7
## v purrr 0.3.4
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x recipes::step() masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample() masks recipes::step_upsample()
## x tidyr::unpack() masks Matrix::unpack()
## x recipes::update() masks Matrix::update(), stats::update()
library(dplyr)
library(tidyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v readr 1.4.0 v forcats 0.5.1
## v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard() masks scales::discard()
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x readr::spec() masks yardstick::spec()
## x tidyr::unpack() masks Matrix::unpack()
SpamData <- read.csv("https://raw.githubusercontent.com/kervinalfaro/Data-Mining/main/spamData.csv")
Clean_String <- function(string){
  # Remove non-UTF-8 characters
  temp <- iconv(enc2utf8(string), sub = "byte")
  temp <- str_replace_all(temp, "[^[:graph:]]", " ")
  # Remove everything that is not a letter (digits and punctuation included)
  temp <- stringr::str_replace_all(temp, "[^a-zA-Z\\s]", " ")
  # Collapse repeated whitespace
  temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
  # Lowercase
  temp <- tolower(temp)
  return(temp)
}
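A quick sanity check of the cleaning function on a made-up message (the text below is invented for illustration):

# The cleaner should strip punctuation, digits and case
Clean_String("WINNER!! You've won a $1,000 prize, call 0800-123-456 NOW!!")
# roughly: "winner you ve won a prize call now "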
# Apply the cleaning function to the messages
SpamData$message <- Clean_String(SpamData$message)
reviewClass <- SpamData %>% # SpamData was already cleaned above
  mutate(class = factor(if_else(class == "spam", "spam", "ham")),
         len = str_length(message))
# Split the data into training (70%) and test (30%) sets
reviews_split <- initial_split(reviewClass, prop = 0.7)
reviews_train <- training(reviews_split)
reviews_test <- testing(reviews_split)
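Because spam is the minority class, a stratified split keeps the spam/ham ratio similar in both partitions. The sketch below is an optional alternative (using rsample's strata argument) and is not used in the rest of this run:

# Alternative: stratified sampling preserves the class proportions in both sets
reviews_split_strat <- initial_split(reviewClass, prop = 0.7, strata = class)
reviews_train_strat <- training(reviews_split_strat)
reviews_test_strat <- testing(reviews_split_strat)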
# Create the initial recipe
reviews_recipe <- recipe(class ~ message + len,
                         data = reviews_train)
# Add the text preprocessing steps
reviews_recipeProcessed <- reviews_recipe %>%
  step_text_normalization(message) %>% # Unicode normalization of the text
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams",
                options = list(n = 4, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 200) %>%
  step_tfidf(message) %>%
  step_upsample(class) # alternative to step_smote
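To verify what the recipe actually produces (one tf-idf column per retained n-gram, plus the upsampled class), it can be prepped and baked on the training data. This is only an inspection sketch and is not required for the workflow below; depending on the recipes version, juice() may be needed instead of bake(new_data = NULL):

# Inspect the preprocessed training data
reviews_recipeProcessed %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()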
## Model fitting
# Create a workflow that bundles the recipe
reviews_wf <- workflow() %>%
  add_recipe(reviews_recipeProcessed)
# Model specification
rlasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>% # mixture = 1 specifies a pure LASSO (L1) penalty
  set_engine("glmnet")
# Inspect the model specification
rlasso_spec
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
# Combine the recipe and the model specification into a single workflow
lasso_wf <- workflow() %>%
  add_recipe(reviews_recipeProcessed) %>%
  add_model(rlasso_spec)
# Inspect the workflow
lasso_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
##
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
##
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
# Create a random grid of candidate penalty values to tune
lambda_grid <- grid_random(penalty(), size = 25)
lambda_grid
## # A tibble: 25 x 1
## penalty
## <dbl>
## 1 5.60e- 9
## 2 1.02e-10
## 3 1.91e- 4
## 4 3.33e-10
## 5 4.75e- 2
## 6 2.71e-10
## 7 1.75e- 5
## 8 4.53e- 3
## 9 6.25e- 2
## 10 4.57e- 7
## # ... with 15 more rows
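grid_random() samples 25 penalty values at random; an evenly spaced grid on the log scale is a common alternative and tends to make the metric-vs-penalty plot further below smoother. Sketch only, not used in this run:

# Alternative: 25 penalty values evenly spaced on the log10 scale
lambda_grid_regular <- grid_regular(penalty(), levels = 25)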
# Set a random seed and create the cross-validation folds
set.seed(123)
reviews_folds <- vfold_cv(reviews_train, v = 5)
reviews_folds
## # 5-fold cross-validation
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
# Metric evaluation
# Fit the model over the grid of penalty values so the best one can be chosen
set.seed(2020)
lasso_grid <- tune_grid(lasso_wf,
                        resamples = reviews_folds,
                        grid = lambda_grid,
                        control = control_resamples(save_pred = TRUE),
                        metrics = metric_set(f_meas, recall, precision))
## ! Fold2: internal: While computing binary `precision()`, no predicted events were...
## ! Fold3: preprocessor 1/1, model 1/1: from glmnet Fortran code (error code -75); ...
## ! Fold3: internal: While computing binary `precision()`, no predicted events were...
## ! Fold4: preprocessor 1/1, model 1/1: from glmnet Fortran code (error code -69); ...
## ! Fold5: internal: While computing binary `precision()`, no predicted events were...
lasso_grid
## Warning: This tuning result has notes. Example notes on model fitting include:
## internal: While computing binary `precision()`, no predicted events were detected (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and `NA` will be returned.
## Note that 692 true event(s) actually occured for the problematic event level, 'ham'.
## preprocessor 1/1, model 1/1: from glmnet Fortran code (error code -75); Convergence for 75th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned
## preprocessor 1/1, model 1/1: from glmnet Fortran code (error code -69); Convergence for 69th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [2 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
# Collect the resampled metrics
lasso_grid %>%
  collect_metrics()
## # A tibble: 75 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1.02e-10 f_meas binary 0.973 5 0.00125 Preprocessor1_Model01
## 2 1.02e-10 precision binary 0.985 5 0.00314 Preprocessor1_Model01
## 3 1.02e-10 recall binary 0.962 5 0.00413 Preprocessor1_Model01
## 4 2.71e-10 f_meas binary 0.973 5 0.00125 Preprocessor1_Model02
## 5 2.71e-10 precision binary 0.985 5 0.00314 Preprocessor1_Model02
## 6 2.71e-10 recall binary 0.962 5 0.00413 Preprocessor1_Model02
## 7 3.33e-10 f_meas binary 0.973 5 0.00125 Preprocessor1_Model03
## 8 3.33e-10 precision binary 0.985 5 0.00314 Preprocessor1_Model03
## 9 3.33e-10 recall binary 0.962 5 0.00413 Preprocessor1_Model03
## 10 5.60e- 9 f_meas binary 0.973 5 0.00125 Preprocessor1_Model04
## # ... with 65 more rows
# Plot how each metric changes as a function of the penalty value
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()
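show_best() gives a quick tabular complement to the plot above, listing the top penalty values for a chosen metric; a minimal sketch:

# Top 5 penalty values ranked by F1 score
show_best(lasso_grid, metric = "f_meas", n = 5)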
# Final model
# Select the best penalty according to the F1 score
best_f <- lasso_grid %>%
  select_best("f_meas")
best_f
## # A tibble: 1 x 2
## penalty .config
## <dbl> <chr>
## 1 1.02e-10 Preprocessor1_Model01
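select_best() picks the penalty with the highest mean F1. A common alternative for LASSO is the one-standard-error rule, which prefers the sparsest model whose F1 is within one standard error of the best; a sketch, not used for the final fit here:

# One-standard-error rule: largest penalty within one SE of the best F1
lasso_grid %>%
  select_by_one_std_err(desc(penalty), metric = "f_meas")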
# Finalize the workflow with the best penalty and fit it on the full training set
final_lasso <- finalize_workflow(lasso_wf, best_f) %>%
  fit(reviews_train)
final_lasso
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
##
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
##
## -- Model -----------------------------------------------------------------------
##
## Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
##
## Df %Dev Lambda
## 1 0 0.00 0.246500
## 2 1 3.02 0.224600
## 3 1 5.66 0.204700
## 4 1 8.00 0.186500
## 5 1 10.05 0.169900
## 6 1 11.83 0.154800
## 7 2 14.17 0.141100
## 8 3 16.29 0.128500
## 9 4 19.13 0.117100
## 10 4 21.77 0.106700
## 11 5 24.19 0.097230
## 12 6 26.87 0.088590
## 13 9 29.47 0.080720
## 14 12 32.70 0.073550
## 15 15 36.08 0.067020
## 16 16 39.25 0.061060
## 17 19 42.14 0.055640
## 18 23 45.08 0.050700
## 19 25 47.95 0.046190
## 20 28 50.65 0.042090
## 21 32 53.19 0.038350
## 22 33 55.59 0.034940
## 23 35 57.74 0.031840
## 24 38 59.75 0.029010
## 25 39 61.62 0.026430
## 26 43 63.33 0.024080
## 27 45 65.02 0.021940
## 28 45 66.54 0.020000
## 29 48 67.91 0.018220
## 30 53 69.21 0.016600
## 31 55 70.42 0.015130
## 32 56 71.51 0.013780
## 33 58 72.54 0.012560
## 34 59 73.47 0.011440
## 35 68 74.36 0.010430
## 36 70 75.20 0.009499
## 37 76 76.00 0.008656
## 38 77 76.74 0.007887
## 39 83 77.43 0.007186
## 40 89 78.09 0.006548
## 41 98 78.75 0.005966
## 42 103 79.38 0.005436
## 43 107 79.97 0.004953
## 44 113 80.53 0.004513
## 45 124 81.07 0.004112
## 46 130 81.61 0.003747
##
## ...
## and 54 more lines.
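To see which n-grams push a message toward spam, the coefficients of the fitted glmnet model can be pulled out of the workflow. Sketch assuming the workflows version loaded above still exports pull_workflow_fit() (newer releases rename it extract_fit_parsnip()); depending on the parsnip version, tidy() returns coefficients at the tuned penalty or along the whole glmnet path:

# Largest non-zero coefficients of the final LASSO model
final_lasso %>%
  pull_workflow_fit() %>%
  tidy() %>%
  filter(estimate != 0) %>%
  arrange(desc(abs(estimate))) %>%
  head(10)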
# Model evaluation
# Evaluate the finalized model on the held-out test set
review_final <- last_fit(final_lasso,
                         split = reviews_split,
                         metrics = metric_set(f_meas, recall, precision))
# Inspect the test-set metrics
review_final %>%
  collect_metrics()
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas binary 0.975 Preprocessor1_Model1
## 2 recall binary 0.967 Preprocessor1_Model1
## 3 precision binary 0.982 Preprocessor1_Model1
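A confusion matrix over the same test predictions shows where the remaining errors are (spam labelled as ham and vice versa); a short sketch using yardstick's conf_mat():

# Confusion matrix on the held-out test set
review_final %>%
  collect_predictions() %>%
  conf_mat(truth = class, estimate = .pred_class)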
# Inspect the first few test-set predictions
review_final %>%
  collect_predictions() %>%
  head()
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 train/test split ham 1 ham Preprocessor1_Model1
## 2 train/test split ham 2 ham Preprocessor1_Model1
## 3 train/test split ham 5 ham Preprocessor1_Model1
## 4 train/test split ham 6 spam Preprocessor1_Model1
## 5 train/test split ham 8 ham Preprocessor1_Model1
## 6 train/test split spam 10 spam Preprocessor1_Model1
# Score a new, unseen message
comment <- "had your mobile months or more u r entitled to update to the latest colour mobiles with camera for free call the mobile update co free on "
len <- str_length(comment)
new_comment <- tribble(~message, ~len, comment, len)
new_comment
## # A tibble: 1 x 2
## message len
## <chr> <int>
## 1 "had your mobile months or more u r entitled to update to the latest co~ 138
prediction <- predict(final_lasso, new_data = new_comment)
paste0("the message is: ", prediction$.pred_class)
## [1] "the message is: spam"