library(textrecipes)
## Loading required package: recipes
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(stopwords)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ stringr 1.4.0
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ readr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
library(themis)
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
library(knitr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(forcats)
library(stringr)
library(dplyr)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✓ broom 0.7.5 ✓ rsample 0.0.9
## ✓ dials 0.0.9 ✓ tune 0.1.3
## ✓ infer 0.5.4 ✓ workflows 0.2.2
## ✓ modeldata 0.1.0 ✓ yardstick 0.0.8
## ✓ parsnip 0.1.5
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x Matrix::expand() masks tidyr::expand()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x Matrix::pack() masks tidyr::pack()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample() masks recipes::step_upsample()
## x Matrix::unpack() masks tidyr::unpack()
## x Matrix::update() masks recipes::update(), stats::update()
spamData <- read.csv("spamData.csv", sep = ",")
1. Data cleaning
Clean_String <- function(string){
  # Remove non-UTF-8 characters
  temp <- iconv(enc2utf8(string), sub = "byte")
  temp <- str_replace_all(temp, "[^[:graph:]]", " ")
  # Remove everything that is not a letter (the pattern also drops digits)
  temp <- stringr::str_replace_all(temp, "[^a-zA-Z\\s]", " ")
  # Collapse extra whitespace
  temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
  # Lowercase
  temp <- tolower(temp)
  return(temp)
}
spamData$message <- Clean_String(spamData$message)
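A quick sanity check on the cleaner (a minimal sketch; the sample message is invented for illustration and the output is shown approximately):
# Hypothetical example message
Clean_String("WINNER!! You've won a £1,000 prize. Call 0906170 now!")
## approximately: "winner you ve won a prize call now "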
2. Training and test samples
2.1 Checking the class sizes
set.seed(124)
# Split the data into training (70%) and test (30%) samples
spam_split <- initial_split(spamData, prop = .7)
spam_train <- training(spam_split)
spam_test <- testing(spam_split)
dim(spam_train); dim(spam_test)
## [1] 3901 2
## [1] 1671 2
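Because spam is the minority class, the split could also be stratified so both samples keep the same class proportions; a minimal sketch (not used in what follows):
# Stratified alternative to the split above
spam_split_strat <- initial_split(spamData, prop = .7, strata = class)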
3. Creating the preprocessing recipe
# Set up the recipe for the model
spam_recipe <- recipe(class ~ message,
                      data = spam_train)
# Apply the preprocessing steps
spam_recipeProcessed <- spam_recipe %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams",
                options = list(n = 2, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 100) %>%
  step_tfidf(message)
# Train (prep) the recipe defined above
spam_recipeProcessedF <- prep(spam_recipeProcessed)
spam_recipeProcessedF
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
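The processed training data can be inspected by baking the trained recipe; a minimal sketch (bake(new_data = NULL) returns the training set as processed; on older recipes versions juice() serves the same purpose):
# Inspect the tf-idf features produced by the recipe
spam_baked <- bake(spam_recipeProcessedF, new_data = NULL)
dim(spam_baked) # expected: 3901 rows and 101 columns (100 tf-idf features plus class)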
# Set up the workflow for the machine-learning model
spam_wf <- workflow() %>%
  add_recipe(spam_recipeProcessed)
4. Fitting the initial model
# Check the class frequencies in the training data
spam_train %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
## class n freq
## <chr> <int> <dbl>
## 1 ham 3373 0.865
## 2 spam 528 0.135
# Since the classes are imbalanced, step_smote() is applied on top of the recipe created earlier
spam_recipeProcessed2 <- spam_recipeProcessed %>%
  step_smote(class)
# Train (prep) the recipe defined above
spam_recipeProcessedF2 <- prep(spam_recipeProcessed2)
spam_recipeProcessedF2
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
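To confirm that SMOTE balanced the classes, the baked training data can be counted by class; a minimal sketch (with the default over_ratio = 1 the minority class is oversampled up to the majority size, so roughly 3373 rows per class are expected):
# Check the class balance after SMOTE
bake(spam_recipeProcessedF2, new_data = NULL) %>%
  count(class)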
# Set up the workflow for the machine-learning model
spam_wf2 <- workflow() %>%
  add_recipe(spam_recipeProcessed2)
# Model specification: an unpenalized logistic regression via glm
rl_spec <- logistic_reg() %>%
  set_engine("glm")
rl_spec
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
# Fit the model to the training data
rl_fit <- spam_wf2 %>%
  add_model(rl_spec) %>%
  fit(data = spam_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
rl_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
##
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
## ● step_smote()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) tfidf_message_already tfidf_message_amp
## -1.007e+00 -4.147e+01 -6.034e+01
## tfidf_message_b tfidf_message_babe tfidf_message_back
## -1.632e-01 3.457e-01 -2.947e-02
## tfidf_message_c tfidf_message_call tfidf_message_can
## 7.797e-01 3.468e+00 -2.140e+00
## tfidf_message_cash tfidf_message_cc tfidf_message_claim
## 1.453e+00 -2.602e+00 7.064e+02
## tfidf_message_come tfidf_message_d tfidf_message_da
## -1.171e+00 -2.652e+00 -4.247e+01
## tfidf_message_day tfidf_message_dear tfidf_message_don
## -3.146e-01 -1.586e+00 -6.873e+01
## `tfidf_message_don t` tfidf_message_dont tfidf_message_e
## 6.464e+01 -5.975e-01 3.005e+00
## tfidf_message_free tfidf_message_get tfidf_message_give
## 3.433e+00 -1.496e-01 5.976e-02
## tfidf_message_go tfidf_message_going tfidf_message_good
## -1.635e-01 -1.327e+00 -4.495e+00
## tfidf_message_got tfidf_message_great tfidf_message_gt
## -2.716e+00 -2.910e+00 -2.704e+01
## tfidf_message_happy tfidf_message_hey tfidf_message_hi
## -4.276e+01 -1.919e+00 -6.506e-01
## tfidf_message_home tfidf_message_hope tfidf_message_im
## -3.594e+00 -2.187e+00 -1.960e-01
## tfidf_message_just tfidf_message_k tfidf_message_know
## -1.362e+00 -8.341e-01 -8.885e-01
## tfidf_message_later tfidf_message_life tfidf_message_like
## -3.513e+01 -7.942e-01 -6.417e-01
## tfidf_message_ll tfidf_message_lor tfidf_message_love
## -8.393e-01 -5.643e+01 -1.275e+00
## tfidf_message_lt `tfidf_message_lt gt` tfidf_message_m
## -2.808e+01 -1.595e+03 -7.549e-01
## tfidf_message_make tfidf_message_message tfidf_message_mobile
## 2.817e-01 4.505e-01 6.625e+00
## tfidf_message_msg tfidf_message_much tfidf_message_n
## 4.082e-01 -1.110e+00 -1.073e+00
## tfidf_message_need tfidf_message_new tfidf_message_night
## -1.193e+00 5.205e-01 -2.914e-02
## tfidf_message_now tfidf_message_number tfidf_message_oh
## 4.691e-01 -1.686e+00 -5.485e+01
## tfidf_message_ok tfidf_message_one tfidf_message_p
## -3.672e+00 -4.568e-01 8.477e+00
## tfidf_message_phone tfidf_message_please tfidf_message_pls
## 2.684e-01 2.179e-01 -1.853e+00
## tfidf_message_prize tfidf_message_r tfidf_message_re
## 4.183e+02 5.120e-01 -1.414e-01
##
## ...
## and 26 more lines.
The glm.fit warning above signals quasi-complete separation: a few tf-idf features predict the class almost perfectly, which is why some coefficients are extreme (e.g., tfidf_message_claim). Penalization is the standard remedy, so a lasso logistic regression is specified next with glmnet.
spam_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")
# Inspect the specification: mixture = 1 gives a pure lasso penalty, and penalty is left to be tuned
spam_spec
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
# Bundle the recipe and the model into a workflow
spam_wf <- workflow() %>%
  add_recipe(spam_recipeProcessed) %>%
  add_model(spam_spec)
# Inspect the workflow
spam_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
# Create a random grid of candidate penalty values to tune
spam_grid <- grid_random(penalty(), size = 25)
spam_grid
## # A tibble: 25 x 1
## penalty
## <dbl>
## 1 0.00174
## 2 0.00000000110
## 3 0.00000814
## 4 0.0901
## 5 0.485
## 6 0.177
## 7 0.0854
## 8 0.466
## 9 0.000000498
## 10 0.0000000139
## # … with 15 more rows
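With a single tuning parameter, a regular grid is a common alternative to a random one; a minimal sketch:
# Regular alternative to the random grid above
spam_grid_reg <- grid_regular(penalty(), levels = 25)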
# Set the random seed and create the cross-validation folds
# (these 3 folds are replaced by 5-fold resamples in the next section)
set.seed(123)
spam_folds <- vfold_cv(spam_train, v = 3)
spam_folds
## # 3-fold cross-validation
## # A tibble: 3 x 2
## splits id
## <list> <chr>
## 1 <split [2600/1301]> Fold1
## 2 <split [2601/1300]> Fold2
## 3 <split [2601/1300]> Fold3
5. Measuring metrics with cross-validation
set.seed(234)
# Generate the cross-validation resamples
spam_folds <- vfold_cv(spam_train, v = 5)
spam_folds
## # 5-fold cross-validation
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
# Fit the model on each fold
rl_rs <- fit_resamples(rl_fit,
                       spam_folds,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas, recall, precision)
)
## ! Fold1: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold2: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold3: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold4: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold5: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
# Compute the confusion matrix, averaged over the resamples
rl_rs %>% conf_mat_resampled(tidy = FALSE)
## ham spam
## ham 646.2 28.4
## spam 12.2 93.4
rl_rs %>% collect_metrics()
## # A tibble: 3 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 f_meas binary 0.969 5 0.00345 Preprocessor1_Model1
## 2 precision binary 0.981 5 0.00268 Preprocessor1_Model1
## 3 recall binary 0.958 5 0.00507 Preprocessor1_Model1
# Collect the cross-validated predictions
rl_rs %>% collect_predictions() %>% head(20)
## # A tibble: 20 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 Fold1 spam 8 spam Preprocessor1_Model1
## 2 Fold1 ham 25 ham Preprocessor1_Model1
## 3 Fold1 ham 32 ham Preprocessor1_Model1
## 4 Fold1 ham 36 ham Preprocessor1_Model1
## 5 Fold1 ham 39 ham Preprocessor1_Model1
## 6 Fold1 spam 40 spam Preprocessor1_Model1
## 7 Fold1 ham 42 spam Preprocessor1_Model1
## 8 Fold1 ham 43 ham Preprocessor1_Model1
## 9 Fold1 ham 46 ham Preprocessor1_Model1
## 10 Fold1 ham 48 ham Preprocessor1_Model1
## 11 Fold1 ham 49 ham Preprocessor1_Model1
## 12 Fold1 ham 55 ham Preprocessor1_Model1
## 13 Fold1 spam 56 ham Preprocessor1_Model1
## 14 Fold1 ham 61 ham Preprocessor1_Model1
## 15 Fold1 ham 62 ham Preprocessor1_Model1
## 16 Fold1 ham 65 ham Preprocessor1_Model1
## 17 Fold1 ham 71 ham Preprocessor1_Model1
## 18 Fold1 spam 77 spam Preprocessor1_Model1
## 19 Fold1 spam 79 ham Preprocessor1_Model1
## 20 Fold1 ham 83 ham Preprocessor1_Model1
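The resampled predictions also make it easy to pull out the misclassified messages for inspection; a minimal sketch:
# Inspect the cross-validated misclassifications
rl_rs %>%
  collect_predictions() %>%
  filter(.pred_class != class)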
# Tune the model over the grid so the best penalty value can be chosen
# (note that spam_grid is reassigned from the candidate grid to the tuning results)
set.seed(2020)
spam_grid <- tune_grid(spam_wf,
                       resamples = spam_folds,
                       grid = spam_grid,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas, recall, precision)
)
spam_grid
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [3120/7… Fold1 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,525 …
## 2 <split [3121/7… Fold2 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 3 <split [3121/7… Fold3 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 4 <split [3121/7… Fold4 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
## 5 <split [3121/7… Fold5 <tibble[,5] [75 ×… <tibble[,1] [0… <tibble[,5] [19,500 …
# View the metrics of the tuned models
spam_grid %>%
  collect_metrics()
## # A tibble: 75 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2.24e-10 f_meas binary 0.973 5 0.00211 Preprocessor1_Model01
## 2 2.24e-10 precision binary 0.966 5 0.00338 Preprocessor1_Model01
## 3 2.24e-10 recall binary 0.981 5 0.00279 Preprocessor1_Model01
## 4 1.10e- 9 f_meas binary 0.973 5 0.00211 Preprocessor1_Model02
## 5 1.10e- 9 precision binary 0.966 5 0.00338 Preprocessor1_Model02
## 6 1.10e- 9 recall binary 0.981 5 0.00279 Preprocessor1_Model02
## 7 1.39e- 8 f_meas binary 0.973 5 0.00211 Preprocessor1_Model03
## 8 1.39e- 8 precision binary 0.966 5 0.00338 Preprocessor1_Model03
## 9 1.39e- 8 recall binary 0.981 5 0.00279 Preprocessor1_Model03
## 10 1.90e- 8 f_meas binary 0.973 5 0.00211 Preprocessor1_Model04
## # … with 65 more rows
# Plot how the metrics change as a function of the penalty
spam_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10() +
  theme_minimal()

(Figure: f_meas, precision, and recall versus penalty, one panel per metric, with the penalty on a log scale.)
6. Building the final model and validating its metrics
best_f <- spam_grid %>%
  select_best("f_meas")
best_f
## # A tibble: 1 x 2
## penalty .config
## <dbl> <chr>
## 1 0.0000979 Preprocessor1_Model14
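An alternative to select_best() is to prefer the simplest model within one standard error of the best, which here means the largest acceptable penalty; a minimal sketch using tune::select_by_one_std_err():
# Pick the most regularized model within one SE of the best f_meas
best_f_1se <- spam_grid %>%
  select_by_one_std_err(desc(penalty), metric = "f_meas")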
# Finalize the workflow with the best penalty and refit it on the full training set
final_spam <- finalize_workflow(spam_wf, best_f) %>%
  fit(spam_train)
final_spam
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## ● step_tokenize()
## ● step_stopwords()
## ● step_untokenize()
## ● step_tokenize()
## ● step_tokenfilter()
## ● step_tfidf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
##
## Df %Dev Lambda
## 1 0 0.00 0.123200
## 2 3 3.16 0.112300
## 3 5 9.96 0.102300
## 4 6 14.99 0.093190
## 5 8 19.48 0.084910
## 6 9 24.11 0.077370
## 7 9 27.78 0.070500
## 8 10 31.18 0.064230
## 9 10 34.15 0.058530
## 10 11 36.82 0.053330
## 11 13 39.63 0.048590
## 12 14 42.27 0.044270
## 13 14 44.64 0.040340
## 14 14 46.68 0.036760
## 15 14 48.46 0.033490
## 16 14 50.03 0.030520
## 17 14 51.41 0.027810
## 18 15 52.73 0.025340
## 19 16 54.02 0.023080
## 20 16 55.16 0.021030
## 21 17 56.16 0.019170
## 22 18 57.22 0.017460
## 23 18 58.13 0.015910
## 24 19 58.92 0.014500
## 25 20 59.70 0.013210
## 26 22 60.45 0.012040
## 27 22 61.12 0.010970
## 28 23 61.71 0.009993
## 29 24 62.25 0.009105
## 30 25 62.73 0.008296
## 31 26 63.19 0.007559
## 32 27 63.62 0.006888
## 33 28 64.00 0.006276
## 34 30 64.36 0.005718
## 35 34 64.75 0.005210
## 36 36 65.14 0.004747
## 37 40 65.52 0.004326
## 38 46 65.89 0.003941
## 39 49 66.28 0.003591
## 40 56 66.66 0.003272
## 41 59 67.05 0.002982
## 42 59 67.40 0.002717
## 43 60 67.72 0.002475
## 44 60 68.02 0.002255
## 45 67 68.30 0.002055
## 46 68 68.56 0.001872
##
## ...
## and 54 more lines.
spam_final <- last_fit(final_spam,
                       split = spam_split,
                       metrics = metric_set(f_meas, recall, precision)
)
# Evaluate the model's metrics on the test data
spam_final %>%
  collect_metrics()
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas binary 0.974 Preprocessor1_Model1
## 2 recall binary 0.979 Preprocessor1_Model1
## 3 precision binary 0.969 Preprocessor1_Model1
# View the predictions on the test set
spam_final %>%
  collect_predictions() %>%
  head()
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 train/test split ham 4 ham Preprocessor1_Model1
## 2 train/test split ham 8 ham Preprocessor1_Model1
## 3 train/test split spam 16 spam Preprocessor1_Model1
## 4 train/test split ham 17 ham Preprocessor1_Model1
## 5 train/test split ham 18 ham Preprocessor1_Model1
## 6 train/test split ham 19 ham Preprocessor1_Model1
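A confusion matrix on the test predictions complements the metrics above; a minimal sketch using yardstick::conf_mat():
# Confusion matrix on the held-out test set
spam_final %>%
  collect_predictions() %>%
  conf_mat(truth = class, estimate = .pred_class)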