##1.Carga, limpieza de datos y librerias necesarias
library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.0.4
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.0.4
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.0.4
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'purrr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
library(themis)
## Warning: package 'themis' was built under R version 4.0.4
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.0.4
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.5 v rsample 0.0.9
## v dials 0.0.9 v tune 0.1.3
## v infer 0.5.4 v workflows 0.2.2
## v modeldata 0.1.0 v yardstick 0.0.7
## v parsnip 0.1.5
## Warning: package 'dials' was built under R version 4.0.4
## Warning: package 'scales' was built under R version 4.0.4
## Warning: package 'infer' was built under R version 4.0.4
## Warning: package 'modeldata' was built under R version 4.0.4
## Warning: package 'parsnip' was built under R version 4.0.4
## Warning: package 'rsample' was built under R version 4.0.4
## Warning: package 'tune' was built under R version 4.0.4
## Warning: package 'workflows' was built under R version 4.0.4
## Warning: package 'yardstick' was built under R version 4.0.4
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample() masks recipes::step_upsample()
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.4
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
library(readr)
setwd("C:/Users/carlos mario/Dropbox/Data Analysis/R Directory/Carlos Mario")
spamData <- read_csv("spamData.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## class = col_character(),
## message = col_character()
## )
#Proceso de limpieza
Clean_String <- function(string){
# Remover caracteres no UTF-8
temp<- iconv(enc2utf8(string),sub="byte")
temp<- str_replace_all(temp,"[^[:graph:]]", " ")
# Remover todo lo que no sea número o letra
temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
# remover espacios extra
temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
# minúscula
temp <- tolower(temp)
return(temp)
}
# Aplicar la función a los comentarios
spamData$message <- Clean_String(spamData$message)
##2.Muestra de entrenamiento y prueba
set.seed(1234) #Semilla Aleatoria no puede faltar
#Realizar la partición de las muestras
emails_split <- initial_split(spamData,prop=.7)
emails_train <- training(emails_split)
emails_test <- testing(emails_split)
#2.1 Verificación del tamaño de las clases
##3.Creación de receta de preprocesamiento de datos
# Setear la receta del modelo a utilizar
emails_recipe <- recipe(class ~ message,
data = emails_train)
#Aplicar los pasos de procesamiento de datos
emails_recipeProcessed <- emails_recipe %>%
step_tokenize(message) %>%
step_stopwords(message, keep = FALSE) %>%
step_untokenize(message) %>%
step_tokenize(message, token = "ngrams",
options = list(n = 4, n_min = 1)) %>%
step_tokenfilter(message, max_tokens = 100) %>%
step_tfidf(message)
##4.Ajuste del modelo inicial
# Creamos el workflow y la receta para el nuevo modelo
emails_wf <- workflow() %>%
add_recipe(emails_recipeProcessed)
# Especificación del modelo
rlasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>% # Mixture=1 se requiere para indicar que es LASSO
set_engine("glmnet")
# Observamos el modelo especificado
rlasso_spec
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
#Ajustar el modelo con los datos
lasso_wf <- workflow() %>%
add_recipe(emails_recipeProcessed) %>%
add_model(rlasso_spec)
# Observamos el modelo
lasso_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 6 Recipe Steps
##
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
##
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
##5. Medición de métricas mediante validación cruzada
# Creamos un grid para entrenar los parametros adicionales del modelo
lambda_grid <- grid_random(penalty(), size = 25)
lambda_grid
## # A tibble: 25 x 1
## penalty
## <dbl>
## 1 0.00000577
## 2 0.0000778
## 3 0.000000124
## 4 0.00000475
## 5 0.000308
## 6 0.0222
## 7 0.000387
## 8 0.275
## 9 0.00451
## 10 0.00000313
## # ... with 15 more rows
#Seteamos semilla aleatoria y creamos los subsets de la validación cruzada
set.seed(123)
emails_folds <- vfold_cv(emails_train,v=7)
emails_folds
## # 7-fold cross-validation
## # A tibble: 7 x 2
## splits id
## <list> <chr>
## 1 <split [3343/558]> Fold1
## 2 <split [3343/558]> Fold2
## 3 <split [3344/557]> Fold3
## 4 <split [3344/557]> Fold4
## 5 <split [3344/557]> Fold5
## 6 <split [3344/557]> Fold6
## 7 <split [3344/557]> Fold7
##6. Creación del modelo final y validación de métricas
# Entrenamos el modelo con los diferentes valores del grid para que escoja los mejores valores de los parametros
set.seed(2020)
lasso_grid <- tune_grid(lasso_wf,
resamples = emails_folds,
grid = lambda_grid,
control = control_resamples(save_pred = TRUE),
metrics = metric_set(f_meas, recall, precision)
)
## Warning: package 'rlang' was built under R version 4.0.3
## Warning: package 'vctrs' was built under R version 4.0.3
lasso_grid
## # Tuning results
## # 7-fold cross-validation
## # A tibble: 7 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [3343/558]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,950 x 5~
## 2 <split [3343/558]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,950 x 5~
## 3 <split [3344/557]> Fold3 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
## 4 <split [3344/557]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
## 5 <split [3344/557]> Fold5 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
## 6 <split [3344/557]> Fold6 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
## 7 <split [3344/557]> Fold7 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [13,925 x 5~
#Visualizamos las métricas del modelo resultante
lasso_grid %>%
collect_metrics()
## # A tibble: 75 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00000000344 f_meas binary 0.972 7 0.00230 Preprocessor1_Model01
## 2 0.00000000344 precision binary 0.965 7 0.00336 Preprocessor1_Model01
## 3 0.00000000344 recall binary 0.980 7 0.00202 Preprocessor1_Model01
## 4 0.00000000633 f_meas binary 0.972 7 0.00230 Preprocessor1_Model02
## 5 0.00000000633 precision binary 0.965 7 0.00336 Preprocessor1_Model02
## 6 0.00000000633 recall binary 0.980 7 0.00202 Preprocessor1_Model02
## 7 0.0000000394 f_meas binary 0.972 7 0.00230 Preprocessor1_Model03
## 8 0.0000000394 precision binary 0.965 7 0.00336 Preprocessor1_Model03
## 9 0.0000000394 recall binary 0.980 7 0.00202 Preprocessor1_Model03
## 10 0.0000000528 f_meas binary 0.972 7 0.00230 Preprocessor1_Model04
## # ... with 65 more rows
#Visualizamos los cambios de las métricas en función de los valores de penalidad
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(penalty, mean, color = .metric)) +
geom_line(size = 1.5, show.legend = FALSE) +
facet_wrap(~.metric) +
scale_x_log10() +
theme_minimal()
##7. Se selecciona el mejor modelo y se entrena el modelo final con lo valores del mejor modelo de entrenamiento
#seleccionamos el mejor modelo según métrica F1 Score
best_f <- lasso_grid %>%
select_best("f_meas")
best_f
## # A tibble: 1 x 2
## penalty .config
## <dbl> <chr>
## 1 0.000387 Preprocessor1_Model17
final_lasso <- finalize_workflow(lasso_wf, best_f) %>%
fit(emails_train)
final_lasso
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 6 Recipe Steps
##
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
##
## -- Model -----------------------------------------------------------------------
##
## Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
##
## Df %Dev Lambda
## 1 0 0.00 0.119400
## 2 3 4.03 0.108800
## 3 3 8.09 0.099120
## 4 6 12.71 0.090320
## 5 8 18.01 0.082290
## 6 8 22.32 0.074980
## 7 8 25.90 0.068320
## 8 10 29.77 0.062250
## 9 11 33.49 0.056720
## 10 12 36.68 0.051680
## 11 12 39.49 0.047090
## 12 13 42.03 0.042910
## 13 13 44.27 0.039100
## 14 14 46.31 0.035620
## 15 15 48.13 0.032460
## 16 16 49.77 0.029580
## 17 16 51.34 0.026950
## 18 16 52.70 0.024550
## 19 17 53.99 0.022370
## 20 19 55.27 0.020380
## 21 20 56.48 0.018570
## 22 20 57.57 0.016920
## 23 20 58.51 0.015420
## 24 23 59.46 0.014050
## 25 24 60.39 0.012800
## 26 24 61.19 0.011670
## 27 24 61.88 0.010630
## 28 24 62.48 0.009684
## 29 25 63.02 0.008824
## 30 26 63.56 0.008040
## 31 27 64.02 0.007326
## 32 28 64.45 0.006675
## 33 29 64.83 0.006082
## 34 30 65.18 0.005542
## 35 35 65.56 0.005050
## 36 39 65.96 0.004601
## 37 41 66.35 0.004192
## 38 45 66.72 0.003820
## 39 46 67.09 0.003480
## 40 52 67.44 0.003171
## 41 55 67.78 0.002890
## 42 57 68.11 0.002633
## 43 60 68.42 0.002399
## 44 62 68.71 0.002186
## 45 66 68.99 0.001992
## 46 69 69.25 0.001815
##
## ...
## and 54 more lines.
##8. Evaluacion del Modelo con datos de prueba y visualizacion de predicciones
#Evaluamos el modelo con los datos de prueba
emails_final <- last_fit(final_lasso,
split=emails_split,
metrics = metric_set(f_meas, recall, precision)
)
# Observamos métricas del modelo evaluado en datos de prueba
emails_final %>%
collect_metrics()
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas binary 0.977 Preprocessor1_Model1
## 2 recall binary 0.986 Preprocessor1_Model1
## 3 precision binary 0.967 Preprocessor1_Model1
# Visualizar las predicciones del dataframe de prueba
emails_final %>%
collect_predictions %>%
head()
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 train/test split ham 7 ham Preprocessor1_Model1
## 2 train/test split ham 11 ham Preprocessor1_Model1
## 3 train/test split spam 13 spam Preprocessor1_Model1
## 4 train/test split ham 15 ham Preprocessor1_Model1
## 5 train/test split spam 16 spam Preprocessor1_Model1
## 6 train/test split ham 17 ham Preprocessor1_Model1
##9.Prueba de nuevos datos(Sugerencia:Observar ejemplos del dataset e incluir textos similares).
comment<- "dont wait to get your prize"
len<- str_length(comment)
new_comment <- tribble(~message,~len,comment,len)
new_comment
## # A tibble: 1 x 2
## message len
## <chr> <int>
## 1 dont wait to get your prize 27
prediction<-predict(final_lasso, new_data = new_comment)
paste0("el resultado para el comentario ","'",new_comment$message,"'","es: ",
prediction$.pred_class)
## [1] "el resultado para el comentario 'dont wait to get your prize'es: ham"