library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.0.4
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.0.4
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.0.4
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.1.0 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.0
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
library(themis)
## Warning: package 'themis' was built under R version 4.0.4
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.0.4
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.2 v rsample 0.0.9
## v dials 0.0.9 v tune 0.1.3
## v infer 0.5.4 v workflows 0.2.2
## v modeldata 0.1.0 v yardstick 0.0.7
## v parsnip 0.1.5
## Warning: package 'broom' was built under R version 4.0.3
## Warning: package 'dials' was built under R version 4.0.4
## Warning: package 'scales' was built under R version 4.0.3
## Warning: package 'infer' was built under R version 4.0.4
## Warning: package 'modeldata' was built under R version 4.0.4
## Warning: package 'parsnip' was built under R version 4.0.4
## Warning: package 'rsample' was built under R version 4.0.4
## Warning: package 'tune' was built under R version 4.0.4
## Warning: package 'workflows' was built under R version 4.0.4
## Warning: package 'yardstick' was built under R version 4.0.4
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample() masks recipes::step_upsample()
library(readr)
# Load the raw dataset from the CSV file (path is relative to the working directory).
spamData <- read_csv(file = "spamData.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## class = col_character(),
## message = col_character()
## )
#' Normalise free text to lower-case ASCII letters separated by single spaces
#'
#' Every maximal run of characters that are not ASCII letters (`a-z`, `A-Z`)
#' is replaced by ONE space and the result is lower-cased. That covers
#' whitespace, control characters, punctuation, non-ASCII characters and
#' digits: numbers are removed, not kept. The result is not trimmed, so a
#' leading or trailing run of non-letters leaves a single space at that end.
#'
#' @param string A character vector. `NA` elements stay `NA`.
#' @return A character vector with the same length as `string`.
Clean_String <- function(string) {
  # Re-encode first; bytes that cannot be converted are written as "<xx>"
  # so the regex below never sees an invalid multibyte sequence.
  # NOTE(review): letters inside such "<xx>" escapes (a-f) survive the
  # cleaning as stray tokens -- confirm this is acceptable.
  temp <- iconv(enc2utf8(string), sub = "byte")
  # A single pass is enough: blanking non-printable characters, blanking
  # non-letters and then collapsing repeated whitespace always ends with each
  # run of non-letters reduced to one space. perl = TRUE makes the a-z / A-Z
  # ranges code-point based instead of locale dependent.
  temp <- gsub("[^a-zA-Z]+", " ", temp, perl = TRUE)
  tolower(temp)
}
# Clean the message column in place with Clean_String().
spamData[["message"]] <- Clean_String(spamData[["message"]])
# 2.1 Verification of class sizes ----
library(tidymodels)
# Fix the RNG seed so the train/test partition is the same on every run.
set.seed(1234)
# Partition the data: 70% of the rows go to training, the rest to testing.
reviews_split <- spamData %>% initial_split(prop = 0.7)
reviews_train <- reviews_split %>% training()
reviews_test <- reviews_split %>% testing()
# Report the dimensions (rows, columns) of each partition.
dim(reviews_train)
dim(reviews_test)
## [1] 3901 2
## [1] 1671 2
library(textrecipes)
library(stopwords)
# Declare the recipe: `class` is the outcome and `message` the only
# predictor, both taken from the training partition.
reviews_recipe <- recipe(class ~ message, data = reviews_train)
# Append the text-processing steps:
#   1. tokenise each message and remove stop words,
#   2. join the remaining tokens back into text and re-tokenise it into
#      n-grams (n_min = 1 up to n = 2), so the n-grams are built from text
#      that no longer contains stop words,
#   3. filter the tokens (max_tokens = 500) and compute tf-idf features.
reviews_recipeProcessed <- reviews_recipe %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(
    message,
    token = "ngrams",
    options = list(n = 2, n_min = 1)
  ) %>%
  step_tokenfilter(message, max_tokens = 500) %>%
  step_tfidf(message)
# Train (prep) the recipe defined above and display the result.
reviews_recipeProcessedF <- reviews_recipeProcessed %>% prep()
print(reviews_recipeProcessedF)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
# Create a workflow for the machine-learning model and attach the
# (untrained) preprocessing recipe to it.
reviews_wf <- add_recipe(workflow(), reviews_recipeProcessed)
library(themis)
# Check the class balance of the training partition: number of rows per
# class and the share of the total each class represents.
reviews_train %>%
  count(class) %>%
  mutate(freq = n / sum(n))
## # A tibble: 2 x 3
## class n freq
## <chr> <int> <dbl>
## 1 ham 3387 0.868
## 2 spam 514 0.132
# When the classes are imbalanced, extend the recipe created earlier with
# step_smote() applied to the outcome `class`.
reviews_recipeProcessed2 <- step_smote(reviews_recipeProcessed, class)
# Train (prep) the extended recipe and display the result.
reviews_recipeProcessedF2 <- reviews_recipeProcessed2 %>% prep()
print(reviews_recipeProcessedF2)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
# Create the workflow used for modelling and attach the recipe that
# includes the SMOTE step. (Defined once; an identical repeated assignment
# added nothing.)
reviews_wf2 <- workflow() %>%
  add_recipe(reviews_recipeProcessed2)
# Model specification: logistic regression using the "glm" engine.
rl_spec <- set_engine(logistic_reg(), "glm")
print(rl_spec)
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
# Add the model to the SMOTE workflow and fit it on the training partition.
# NOTE(review): the warnings recorded below (no convergence, fitted
# probabilities of 0 or 1) suggest the unpenalised glm is separating the
# classes perfectly on the tf-idf features -- the coefficients are probably
# not reliable; consider a regularised model.
rl_fit <- fit(add_model(reviews_wf2, rl_spec), data = reviews_train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(rl_fit)
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 7 Recipe Steps
##
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_smote()
##
## -- Model -----------------------------------------------------------------------
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) tfidf_message_able
## -2.072e+01 -6.036e+14
## tfidf_message_abt tfidf_message_account
## -1.509e+15 -4.608e+13
## tfidf_message_actually tfidf_message_address
## -9.276e+14 -4.050e+13
## tfidf_message_aft tfidf_message_afternoon
## -2.291e+13 -3.107e+14
## tfidf_message_age tfidf_message_ah
## 1.333e+14 3.346e+13
## tfidf_message_aight tfidf_message_already
## -5.766e+14 -5.256e+14
## tfidf_message_alright tfidf_message_also
## -4.376e+14 -5.844e+14
## tfidf_message_always tfidf_message_amp
## -7.958e+14 -1.575e+15
## tfidf_message_another tfidf_message_answer
## -6.964e+14 -1.698e+14
## tfidf_message_anything tfidf_message_anyway
## -1.130e+15 -1.113e+14
## tfidf_message_apply tfidf_message_ard
## 8.366e+14 -1.218e+15
## tfidf_message_around tfidf_message_ask
## 1.378e+15 9.041e+13
## tfidf_message_asked tfidf_message_auction
## -8.761e+14 1.439e+15
## tfidf_message_awarded tfidf_message_away
## 2.959e+15 -7.127e+14
## tfidf_message_b tfidf_message_babe
## 1.145e+15 -4.921e+14
## tfidf_message_baby tfidf_message_back
## -1.105e+15 2.432e+14
## tfidf_message_bad tfidf_message_bed
## -3.366e+14 -6.131e+14
## tfidf_message_best tfidf_message_better
## -1.315e+14 -2.062e+15
## tfidf_message_big tfidf_message_birthday
## -6.043e+14 6.991e+14
## tfidf_message_bit tfidf_message_book
## -9.977e+13 -6.323e+14
## tfidf_message_bored tfidf_message_box
## -1.358e+15 -3.108e+14
## tfidf_message_boy tfidf_message_bring
## -8.905e+14 -2.729e+14
## tfidf_message_bt tfidf_message_bus
## -3.742e+15 -1.284e+15
##
## ...
## and 460 more lines.
# Fix the RNG seed so the fold assignment does not change between runs.
set.seed(234)
# Build the 5-fold cross-validation resamples from the training partition.
reviews_folds <- reviews_train %>% vfold_cv(v = 5)
print(reviews_folds)
## # 5-fold cross-validation
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
# Fit the workflow on every fold, saving the per-fold predictions and
# computing F-measure, recall and precision.
# NOTE(review): these metrics presumably treat the first factor level
# ("ham") as the event of interest -- confirm that is intended.
rl_rs <- rl_fit %>%
  fit_resamples(
    resamples = reviews_folds,
    metrics = metric_set(f_meas, recall, precision),
    control = control_resamples(save_pred = TRUE)
  )
## Warning: package 'rlang' was built under R version 4.0.4
## Warning: package 'vctrs' was built under R version 4.0.4
## ! Fold1: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Fold1: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold2: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold2: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold3: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold3: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold4: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold4: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold5: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold5: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
# Compute the confusion matrix from the resampling results. `FALSE` is
# spelled out because `F` is an ordinary variable that can be reassigned.
rl_rs %>% conf_mat_resampled(tidy = FALSE)
## ham spam
## ham 640.8 36.6
## spam 14.4 88.4
# Show the first rows of the predictions saved during resampling.
head(collect_predictions(rl_rs))
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 Fold1 spam 8 spam Preprocessor1_Model1
## 2 Fold1 ham 25 ham Preprocessor1_Model1
## 3 Fold1 ham 32 ham Preprocessor1_Model1
## 4 Fold1 ham 36 ham Preprocessor1_Model1
## 5 Fold1 ham 39 ham Preprocessor1_Model1
## 6 Fold1 ham 40 ham Preprocessor1_Model1
# Final evaluation: last_fit() fits the workflow on the training set of
# `reviews_split` and evaluates it on the corresponding test set, reporting
# F-measure, recall and precision.
rl_fitFinal <- last_fit(
  add_model(reviews_wf2, rl_spec),
  split = reviews_split,
  metrics = metric_set(f_meas, recall, precision)
)
## ! train/test split: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
# Collect the metrics computed by last_fit().
collect_metrics(rl_fitFinal)
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas binary 0.969 Preprocessor1_Model1
## 2 recall binary 0.953 Preprocessor1_Model1
## 3 precision binary 0.986 Preprocessor1_Model1
# Coefficients of the glm stored inside the trained workflow.
# NOTE(review): pull_workflow_fit() is deprecated in newer workflows
# releases (extract_fit_parsnip() replaces it); kept because the attached
# workflows version may predate the replacement -- confirm before switching.
tokensImp <- pull_workflow_fit(rl_fit)$fit$coefficients
# One row per coefficient. The tf-idf columns are named
# "tfidf_message_<token>" (the predictor is `message`), so that is the
# prefix to strip to obtain the bare token.
tokensImpDF <- data.frame(token = names(tokensImp), values = tokensImp) %>%
  mutate(token = str_remove(token, "^tfidf_message_"))
# Plot the 15 token coefficients with the largest absolute value. The
# intercept is not a token and NA coefficients cannot be ranked, so both are
# left out of the ranking.
tokensImpDF %>%
  filter(token != "(Intercept)", !is.na(values)) %>%
  slice_max(abs(values), n = 15) %>%
  ggplot(aes(fct_reorder(token, values), values, fill = values > 0)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  labs(
    x = NULL,
    title = "Coefficients that increase/decrease probability the most"
  ) +
  theme_minimal()
# Predict the class of new data.
# The new text is passed through Clean_String(), exactly as the training
# messages were, so the recipe receives text in the same normalised form.
new_comment <- tribble(~message, "price") %>%
  mutate(message = Clean_String(message))
new_comment
## # A tibble: 1 x 1
## message
## <chr>
## 1 price
prediction <- predict(rl_fit, new_data = new_comment)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Paste the `message` column rather than the tibble itself: pasting a tibble
# deparses the whole column once there is more than one row. A space is added
# before "es:" so the sentence reads correctly.
paste0(
  "el resultado para el comentario ", "'", new_comment$message, "'",
  " es: ", prediction$.pred_class
)
## [1] "el resultado para el comentario 'price' es: spam"