library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.0.4
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.0.4
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.0.4
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'purrr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
library(themis)
## Warning: package 'themis' was built under R version 4.0.4
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.0.4
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.5 v rsample 0.0.9
## v dials 0.0.9 v tune 0.1.3
## v infer 0.5.4 v workflows 0.2.2
## v modeldata 0.1.0 v yardstick 0.0.7
## v parsnip 0.1.5
## Warning: package 'dials' was built under R version 4.0.4
## Warning: package 'scales' was built under R version 4.0.4
## Warning: package 'infer' was built under R version 4.0.4
## Warning: package 'modeldata' was built under R version 4.0.4
## Warning: package 'parsnip' was built under R version 4.0.4
## Warning: package 'rsample' was built under R version 4.0.4
## Warning: package 'tune' was built under R version 4.0.4
## Warning: package 'workflows' was built under R version 4.0.4
## Warning: package 'yardstick' was built under R version 4.0.4
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample() masks recipes::step_upsample()
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.0.3
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(readr)
setwd("C:/Users/carlos mario/Dropbox/Data Analysis/R Directory/Carlos Mario")
spamData <- read_csv("spamData.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## class = col_character(),
## message = col_character()
## )
Clean_String <- function(string){
# Remover caracteres no UTF-8
temp<- iconv(enc2utf8(string),sub="byte")
temp<- str_replace_all(temp,"[^[:graph:]]", " ")
# Remover todo lo que no sea número o letra
temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
# remover espacios extra
temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
# minúscula
temp <- tolower(temp)
return(temp)
}
# Aplicar la función a los comentarios
spamData$message <- Clean_String(spamData$message)
library(tidymodels)
set.seed(1234) #Semilla Aleatoria no puede faltar
#Realizar la partición de las muestras
emails_split <- initial_split(spamData,prop=.7)
emails_train <- training(emails_split)
emails_test <- testing(emails_split)
dim(emails_train);dim(emails_test)
## [1] 3901 2
## [1] 1671 2
library(textrecipes)
library(stopwords)
# Setear la receta del modelo a utilizar
emails_recipe <- recipe(class ~ message,
data = emails_train)
#Aplicar los pasos de procesamiento de datos
emails_recipeProcessed <- emails_recipe %>%
step_tokenize(message) %>%
step_stopwords(message, keep = FALSE) %>%
step_untokenize(message) %>%
step_tokenize(message, token = "ngrams",
options = list(n = 2, n_min = 1)) %>%
step_tokenfilter(message, max_tokens = 100) %>%
step_tfidf(message)
#Ejecutar la receta del paso anterior
emails_recipeProcessedF <- prep(emails_recipeProcessed)
emails_recipeProcessedF
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
#Setear el workflow para trabajar el modelo de Machine Learning
emails_wf <- workflow() %>%
add_recipe(emails_recipeProcessed)
library(themis)
#Verificamos las frecuencias de nuestro dataframe
emails_train %>%
group_by(class) %>%
summarise(n=n()) %>%
mutate(freq = prop.table(n))
## # A tibble: 2 x 3
## class n freq
## * <chr> <int> <dbl>
## 1 ham 3387 0.868
## 2 spam 514 0.132
# Si es requerido se utiliza la función step_smote sobre la receta que ya se había creado
emails_recipeProcessed2 <- emails_recipeProcessed %>%
step_smote(class)
#Ejecutar la receta del paso anterior
emails_recipeProcessedF2 <- prep(emails_recipeProcessed2)
emails_recipeProcessedF2
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
#Setear el workflow para trabajar el modelo de Machine Learning
emails_wf2 <- workflow() %>%
add_recipe(emails_recipeProcessed2)
# Especificación del modelo
rl_spec <- logistic_reg() %>%
set_engine("glm")
rl_spec
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
#Ajustar el modelo con los datos
rl_fit <- emails_wf2 %>%
add_model(rl_spec) %>%
fit(data = emails_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
rl_fit
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 7 Recipe Steps
##
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_smote()
##
## -- Model -----------------------------------------------------------------------
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) tfidf_message_already tfidf_message_ask
## -1.327e+00 2.808e-01 -4.252e+01
## tfidf_message_b tfidf_message_babe tfidf_message_back
## 2.219e-01 1.704e-01 1.402e-01
## tfidf_message_c tfidf_message_call tfidf_message_can
## 9.184e-01 3.493e+00 -1.216e+00
## tfidf_message_cash tfidf_message_claim tfidf_message_com
## 2.615e+00 8.792e+02 1.059e+00
## tfidf_message_come tfidf_message_d tfidf_message_da
## -5.330e+01 -2.362e+00 -2.835e+00
## tfidf_message_day tfidf_message_dear tfidf_message_don
## -1.410e-01 -1.342e+00 -2.362e+01
## `tfidf_message_don t` tfidf_message_dont tfidf_message_e
## 2.183e+01 -3.640e-01 -1.151e+00
## tfidf_message_free tfidf_message_get tfidf_message_give
## 2.698e+00 -3.768e-01 -1.761e+00
## tfidf_message_go tfidf_message_going tfidf_message_good
## -3.469e-01 -1.005e+01 -4.301e+00
## tfidf_message_got tfidf_message_great tfidf_message_gt
## -3.903e+00 -3.536e-01 -4.480e+01
## tfidf_message_happy tfidf_message_hey tfidf_message_hi
## -4.674e+01 -3.496e+00 -2.789e-01
## tfidf_message_home tfidf_message_hope tfidf_message_im
## -4.221e+00 -9.387e-01 -7.116e-01
## tfidf_message_just tfidf_message_k tfidf_message_know
## -1.189e+00 -1.353e+00 -6.839e-01
## tfidf_message_later tfidf_message_let tfidf_message_like
## -6.989e+01 -1.369e+00 -4.893e-01
## tfidf_message_ll tfidf_message_lor tfidf_message_love
## -9.973e-01 -5.491e+01 -1.885e+00
## tfidf_message_lt `tfidf_message_lt gt` tfidf_message_m
## -2.878e+01 -2.101e+03 -2.308e-01
## tfidf_message_make tfidf_message_meet tfidf_message_message
## 2.676e-01 -8.221e-01 6.134e-01
## tfidf_message_mobile tfidf_message_msg tfidf_message_much
## 5.052e+00 6.985e-01 -1.408e+00
## tfidf_message_n tfidf_message_need tfidf_message_new
## -9.983e-01 -5.059e-01 4.342e-01
## tfidf_message_night tfidf_message_now tfidf_message_number
## -4.609e-01 6.923e-01 -3.813e-01
## tfidf_message_oh tfidf_message_ok tfidf_message_one
## -9.485e-01 -3.524e+00 -3.073e-01
## tfidf_message_p tfidf_message_phone tfidf_message_please
## 2.056e+01 6.583e-03 6.667e-01
## tfidf_message_pls tfidf_message_prize tfidf_message_r
## -2.553e+00 1.666e+03 1.402e+00
##
## ...
## and 26 more lines.
#Se setea una semilla aleatoria para evitar diferentes resultados cada corrida
set.seed(234)
# Se genera las submuestras de validación cruzada
emails_folds <- vfold_cv(emails_train,v=7)
emails_folds
## # 7-fold cross-validation
## # A tibble: 7 x 2
## splits id
## <list> <chr>
## 1 <split [3343/558]> Fold1
## 2 <split [3343/558]> Fold2
## 3 <split [3344/557]> Fold3
## 4 <split [3344/557]> Fold4
## 5 <split [3344/557]> Fold5
## 6 <split [3344/557]> Fold6
## 7 <split [3344/557]> Fold7
# Se ajusta el modelo para cada fold
rl_rs <- fit_resamples(rl_fit,
emails_folds,
control = control_resamples(save_pred = TRUE),
metrics = metric_set(f_meas,recall,precision)
)
## Warning: package 'rlang' was built under R version 4.0.3
## Warning: package 'vctrs' was built under R version 4.0.3
## ! Fold1: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold2: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold3: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold4: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Fold5: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold6: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold7: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
# Se calcula la matriz de confusión
rl_rs %>% conf_mat_resampled(tidy = F)
## ham spam
## ham 462.000000 21.85714
## spam 8.285714 65.14286
Resumen de las metricas
rl_rs %>% collect_metrics()
## # A tibble: 3 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 f_meas binary 0.968 7 0.00223 Preprocessor1_Model1
## 2 precision binary 0.982 7 0.00299 Preprocessor1_Model1
## 3 recall binary 0.955 7 0.00334 Preprocessor1_Model1
Calculo de predicciones
rl_rs %>% collect_predictions() %>% head()
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 Fold1 ham 1 ham Preprocessor1_Model1
## 2 Fold1 ham 17 ham Preprocessor1_Model1
## 3 Fold1 ham 29 ham Preprocessor1_Model1
## 4 Fold1 ham 31 ham Preprocessor1_Model1
## 5 Fold1 ham 44 spam Preprocessor1_Model1
## 6 Fold1 ham 57 ham Preprocessor1_Model1
Evaluar resultados en datos de prueba
rl_fitFinal <- emails_wf2 %>%
add_model(rl_spec) %>%
# Ajusta en el dataset de entrenamiento y evalúa en el dataset de prueba
last_fit(emails_split,
metrics = metric_set(f_meas,recall,precision)
)
## ! train/test split: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
# recolectamos las métricas
rl_fitFinal %>% collect_metrics()
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas binary 0.969 Preprocessor1_Model1
## 2 recall binary 0.958 Preprocessor1_Model1
## 3 precision binary 0.980 Preprocessor1_Model1
Importancia de las variables predictoras
tokensImp<-pull_workflow_fit(rl_fit)$fit
tokensImp<- tokensImp$coefficients
tokensImpDF <- data.frame(token=names(tokensImp),values=tokensImp) %>%
mutate(token=str_remove_all(token,"tfidf_comments_"))
tokensImpDF %>%
top_n(15, abs(tokensImpDF$values)) %>%
ungroup() %>%
ggplot(aes(fct_reorder(token, values), values, fill = values > 0)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
labs(
x = NULL,
title = "Coefficients that increase/decrease probability the most")+
theme_minimal()
new_comment <- tribble(~message,"access this link")
new_comment
## # A tibble: 1 x 1
## message
## <chr>
## 1 access this link
prediction<-predict(rl_fit, new_data = new_comment)
paste0("el resultado para el comentario ","'",new_comment,"'","es: ",prediction$.pred_class)
## [1] "el resultado para el comentario 'access this link'es: ham"