Loading libraries
#install.packages("textrecipes")
#("stopwords")
#install.packages("tidyverse")
#install.packages("themis")
#install.packages("tidymodels")
library(textrecipes)
library(stopwords)
library(tidyverse)
library(themis)
library(tidymodels)
Loading and cleaning the data
# Load the dataset
spam <- read.csv('spamData.csv')
View(spam)
Clean_String <- function(string){
  # Lowercase
  temp <- tolower(string)
  # Remove everything that is not a letter or whitespace (note: digits are removed too)
  temp <- stringr::str_replace_all(temp, "[^a-zA-Z\\s]", " ")
  # Collapse repeated whitespace
  temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
  return(temp)
}
spam$message <- Clean_String(spam$message)
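As a quick sanity check, the cleaner can be tried on a made-up string (this example string is ours, not part of the dataset):
# Hypothetical input: punctuation and digits should be replaced by spaces
Clean_String("Win a FREE pass!!! Call 0800-123-456")
# should return something like "win a free pass call "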
Training and test samples; checking class sizes
set.seed(1234) # Always use the same random seed, for reproducibility.
# Split the data into training and test samples
messages_split <- initial_split(spam, prop = .7)
messages_train <- training(messages_split)
messages_test <- testing(messages_split)
dim(messages_train);dim(messages_test)
## [1] 3901 2
## [1] 1671 2
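With classes this imbalanced, a stratified split keeps the ham/spam ratio stable across both samples. A minimal variant (not what was run above, so the dimensions reported here would differ slightly; the messages_split_strat name is ours):
# Hedged alternative: stratify the split on the outcome variable
messages_split_strat <- initial_split(spam, prop = .7, strata = class)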
Creating the data-preprocessing recipe
# Model recipe
messages_recipe <- recipe(class ~ message,
                          data = messages_train)
# Add the preprocessing steps
messages_recipeProcessed <- messages_recipe %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams",
                options = list(n = 2, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 500) %>%
  step_tfidf(message)
# Prep (train) the recipe from the previous step
messages_recipeProcessedF <- prep(messages_recipeProcessed)
messages_recipeProcessedF
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
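To inspect the feature matrix the prepped recipe actually produces, the processed training data can be extracted with bake() (a quick check, not part of the original pipeline):
# Returns the processed training set: class plus up to 500 tfidf_message_* columns
baked <- bake(messages_recipeProcessedF, new_data = NULL)
dim(baked)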
# Set up the workflow for the ML model
message_wf <- workflow() %>%
  add_recipe(messages_recipeProcessed)
Class balance
# Check the class frequencies in the data frame
messages_train %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
## class n freq
## <chr> <int> <dbl>
## 1 ham 3387 0.868
## 2 spam 514 0.132
# If required, the step_smote function is applied on top of the recipe created earlier
messages_recipeProcessed2 <- messages_recipeProcessed %>%
  step_smote(class)
# Prep the recipe from the previous step
messages_recipeProcessedF2 <- prep(messages_recipeProcessed2)
messages_recipeProcessedF2
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 3901 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
# Set up the workflow for the ML model
message_wf2 <- workflow() %>%
  add_recipe(messages_recipeProcessed2)
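To confirm that SMOTE actually balanced the classes, the prepped recipe can be baked and the class counts compared (a verification sketch; the exact counts depend on the seed):
# After SMOTE both classes should contain (roughly) the same number of rows
bake(messages_recipeProcessedF2, new_data = NULL) %>%
  count(class)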
Specifying and fitting the initial model
# Model specification
rl_spec <- logistic_reg() %>%
  set_engine("glm")
rl_spec
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
# Fit the model to the training data
rl_fit <- message_wf2 %>%
  add_model(rl_spec) %>%
  fit(data = messages_train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
rl_fit
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 7 Recipe Steps
##
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_smote()
##
## -- Model -----------------------------------------------------------------------
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) tfidf_message_able
## -2.072e+01 -6.036e+14
## tfidf_message_abt tfidf_message_account
## -1.509e+15 -4.608e+13
## tfidf_message_actually tfidf_message_address
## -9.276e+14 -4.050e+13
## tfidf_message_aft tfidf_message_afternoon
## -2.291e+13 -3.107e+14
## tfidf_message_age tfidf_message_ah
## 1.333e+14 3.346e+13
## tfidf_message_aight tfidf_message_already
## -5.766e+14 -5.256e+14
## tfidf_message_alright tfidf_message_also
## -4.376e+14 -5.844e+14
## tfidf_message_always tfidf_message_amp
## -7.958e+14 -1.575e+15
## tfidf_message_another tfidf_message_answer
## -6.964e+14 -1.698e+14
## tfidf_message_anything tfidf_message_anyway
## -1.130e+15 -1.113e+14
## tfidf_message_apply tfidf_message_ard
## 8.366e+14 -1.218e+15
## tfidf_message_around tfidf_message_ask
## 1.378e+15 9.041e+13
## tfidf_message_asked tfidf_message_auction
## -8.761e+14 1.439e+15
## tfidf_message_awarded tfidf_message_away
## 2.959e+15 -7.127e+14
## tfidf_message_b tfidf_message_babe
## 1.145e+15 -4.921e+14
## tfidf_message_baby tfidf_message_back
## -1.105e+15 2.432e+14
## tfidf_message_bad tfidf_message_bed
## -3.366e+14 -6.131e+14
## tfidf_message_best tfidf_message_better
## -1.315e+14 -2.062e+15
## tfidf_message_big tfidf_message_birthday
## -6.043e+14 6.991e+14
## tfidf_message_bit tfidf_message_book
## -9.977e+13 -6.323e+14
## tfidf_message_bored tfidf_message_box
## -1.358e+15 -3.108e+14
## tfidf_message_boy tfidf_message_bring
## -8.905e+14 -2.729e+14
## tfidf_message_bt tfidf_message_bus
## -3.742e+15 -1.284e+15
##
## ...
## and 460 more lines.
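The glm.fit warnings above ("algorithm did not converge", "fitted probabilities numerically 0 or 1") signal quasi-complete separation: with around 500 tf-idf predictors, an unpenalized logistic regression can separate the training classes perfectly, which is why the coefficients blow up to the order of 1e+15. A common remedy is a regularized engine; a minimal sketch, assuming the glmnet package is installed (the penalty value is arbitrary here and would normally be tuned; the rl_spec_reg and rl_fit_reg names are ours):
# Hedged alternative: L1-regularized logistic regression via glmnet
rl_spec_reg <- logistic_reg(penalty = 0.01, mixture = 1) %>%
  set_engine("glmnet")
rl_fit_reg <- message_wf2 %>%
  add_model(rl_spec_reg) %>%
  fit(data = messages_train)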
Evaluation via cross-validation
# Set a random seed so the results do not change between runs
set.seed(234)
# Generate the cross-validation folds
messages_folds <- vfold_cv(messages_train, v = 5)
messages_folds
## # 5-fold cross-validation
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
# Fit the model on each fold
rl_rs <- fit_resamples(rl_fit,
                       messages_folds,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas, recall, precision)
)
## ! Fold1: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Fold1: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold2: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold2: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold3: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold3: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold4: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold4: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Fold5: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold5: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
# Compute the resampled confusion matrix
rl_rs %>% conf_mat_resampled(tidy = FALSE)
## ham spam
## ham 640.8 36.6
## spam 14.4 88.4
# Metric values
rl_rs %>% collect_metrics()
## # A tibble: 3 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 f_meas binary 0.962 5 0.00631 Preprocessor1_Model1
## 2 precision binary 0.978 5 0.00434 Preprocessor1_Model1
## 3 recall binary 0.946 5 0.0107 Preprocessor1_Model1
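Note that yardstick treats the first factor level (here "ham") as the event of interest by default, so these scores describe how well ham is detected. To score spam as the positive class instead, the metric can be recomputed from the saved predictions (a sketch):
# Score "spam" (the second factor level) as the positive class
rl_rs %>%
  collect_predictions() %>%
  f_meas(truth = class, estimate = .pred_class, event_level = "second")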
# Collect the predictions
rl_rs %>% collect_predictions() %>% head()
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 Fold1 spam 8 spam Preprocessor1_Model1
## 2 Fold1 ham 25 ham Preprocessor1_Model1
## 3 Fold1 ham 32 ham Preprocessor1_Model1
## 4 Fold1 ham 36 ham Preprocessor1_Model1
## 5 Fold1 ham 39 ham Preprocessor1_Model1
## 6 Fold1 ham 40 ham Preprocessor1_Model1
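The collected predictions also make it easy to pull out the misclassified messages for error analysis (a small follow-up sketch, not in the original code):
# Rows where the resampled model disagreed with the true class
rl_rs %>%
  collect_predictions() %>%
  filter(.pred_class != class)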
Evaluating results on the test data
rl_fitFinal <- message_wf2 %>%
  add_model(rl_spec) %>%
  # Fit on the training set and evaluate on the test set
  last_fit(messages_split,
           metrics = metric_set(f_meas, recall, precision)
  )
## ! train/test split: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
# Collect the metrics
rl_fitFinal %>% collect_metrics()
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas binary 0.969 Preprocessor1_Model1
## 2 recall binary 0.953 Preprocessor1_Model1
## 3 precision binary 0.986 Preprocessor1_Model1
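last_fit() also stores the test-set predictions, so a test confusion matrix is one line away (a sketch using yardstick's conf_mat):
# Test-set confusion matrix from the last_fit predictions
rl_fitFinal %>%
  collect_predictions() %>%
  conf_mat(truth = class, estimate = .pred_class)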
Which features contribute the most to the model?
tokensImp <- pull_workflow_fit(rl_fit)$fit
tokensImp <- tokensImp$coefficients
tokensImpDF <- data.frame(token = names(tokensImp), values = tokensImp) %>%
  mutate(token = str_remove_all(token, "tfidf_message_"))
tokensImpDF %>%
  top_n(15, abs(values)) %>%
  ggplot(aes(fct_reorder(token, values), values, fill = values > 0)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  labs(
    x = NULL,
    title = "Coefficients that increase/decrease probability the most") +
  theme_minimal()
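Keep in mind that, because the glm fit did not converge, the absolute magnitudes of these coefficients are not trustworthy; the plot is best read for sign and ranking only. The same information can be viewed as a table (a trivial variant):
# Top 15 tokens by absolute coefficient size
tokensImpDF %>%
  arrange(desc(abs(values))) %>%
  head(15)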

Predicting on new data
new_message <- tribble(~message, "rate your experience and win a free passs")
new_message
## # A tibble: 1 x 1
## message
## <chr>
## 1 rate your experience and win a free passs
prediction <- predict(rl_fit, new_data = new_message)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
paste0("el resultado para el comentario ","'",new_message,"'","es: ",prediction$.pred_class)
## [1] "el resultado para el comentario 'rate your experience and win a free passs'es: spam"