library(textrecipes)
library(stopwords)
library(tidyverse)
library(themis)
library(tidymodels)
library(stringr)
library(rsample)
library(tidytext)
##Limpieza de los datos
Clean_String <- function(string){
# Remover caracteres no UTF-8
temp<- iconv(enc2utf8(string),sub="byte")
temp<- str_replace_all(temp,"[^[:graph:]]", " ")
# Remover todo lo que no sea número o letra
temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
# remover espacios extra
temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
# minúscula
temp <- tolower(temp)
return(temp)
}
library(readr)
wd <- "C:/Users/Aleba/Documents/spamData.csv"
spamData <- read.csv(wd)
spamData$message <- Clean_String(spamData$message)
str(spamData)
## 'data.frame': 5572 obs. of 2 variables:
## $ class : chr "ham" "ham" "spam" "ham" ...
## $ message: chr "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat " "ok lar joking wif u oni " "free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s" "u dun say so early hor u c already then say " ...
set.seed(1234)
spamdividido <- initial_split(spamData,prop=.7)
spam_training <- training(spamdividido)
spam_testing <- testing(spamdividido)
dim(spam_training);dim(spam_testing)
## [1] 3901 2
## [1] 1671 2
receta_1<- recipe(class ~ message,
data = spamData)
recetaprocesada1 <- receta_1 %>%
step_tokenize(message) %>%
step_stopwords(message, keep = FALSE) %>%
step_untokenize(message) %>%
step_tokenize(message, token = "ngrams",
options = list(n = 2, n_min = 1)) %>%
step_tokenfilter(message, max_tokens = 500) %>%
step_tfidf(message)
recetaejecutada <- prep(recetaprocesada1)
recetaejecutada
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 5572 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
recetaworkflow_1 <- workflow() %>%
add_recipe(recetaprocesada1)
spam_training %>%
group_by(class) %>%
summarise(n=n()) %>%
mutate(freq = prop.table(n))
## # A tibble: 2 x 3
## class n freq
## <chr> <int> <dbl>
## 1 ham 3387 0.868
## 2 spam 514 0.132
recetasobremuestreada <- recetaprocesada1 %>%
step_smote(class)
#Creación de receta de preprocesamiento de datos
recetaejecutada_2 <- prep(recetasobremuestreada)
recetaejecutada_2
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 5572 data points and no missing data.
##
## Operations:
##
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]
recetaworkflow_2 <- workflow() %>%
add_recipe(recetasobremuestreada)
regresion <- logistic_reg() %>%
set_engine("glm")
regresion
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
modeloajustado <- recetaworkflow_2 %>%
add_model(regresion) %>%
fit(data = spam_training)
modeloajustado
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 7 Recipe Steps
##
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_smote()
##
## -- Model -----------------------------------------------------------------------
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) tfidf_message_able
## -2.072e+01 -6.036e+14
## tfidf_message_abt tfidf_message_account
## -1.509e+15 -4.608e+13
## tfidf_message_actually tfidf_message_address
## -9.276e+14 -4.050e+13
## tfidf_message_aft tfidf_message_afternoon
## -2.291e+13 -3.107e+14
## tfidf_message_age tfidf_message_ah
## 1.333e+14 3.346e+13
## tfidf_message_aight tfidf_message_already
## -5.766e+14 -5.256e+14
## tfidf_message_alright tfidf_message_also
## -4.376e+14 -5.844e+14
## tfidf_message_always tfidf_message_amp
## -7.958e+14 -1.575e+15
## tfidf_message_another tfidf_message_answer
## -6.964e+14 -1.698e+14
## tfidf_message_anything tfidf_message_anyway
## -1.130e+15 -1.113e+14
## tfidf_message_apply tfidf_message_ard
## 8.366e+14 -1.218e+15
## tfidf_message_around tfidf_message_ask
## 1.378e+15 9.041e+13
## tfidf_message_asked tfidf_message_auction
## -8.761e+14 1.439e+15
## tfidf_message_awarded tfidf_message_away
## 2.959e+15 -7.127e+14
## tfidf_message_b tfidf_message_babe
## 1.145e+15 -4.921e+14
## tfidf_message_baby tfidf_message_back
## -1.105e+15 2.432e+14
## tfidf_message_bad tfidf_message_bed
## -3.366e+14 -6.131e+14
## tfidf_message_best tfidf_message_better
## -1.315e+14 -2.062e+15
## tfidf_message_big tfidf_message_birthday
## -6.043e+14 6.991e+14
## tfidf_message_bit tfidf_message_book
## -9.977e+13 -6.323e+14
## tfidf_message_bored tfidf_message_box
## -1.358e+15 -3.108e+14
## tfidf_message_boy tfidf_message_bring
## -8.905e+14 -2.729e+14
## tfidf_message_bt tfidf_message_bus
## -3.742e+15 -1.284e+15
##
## ...
## and 460 more lines.
set.seed(234)
matriz_folds <- vfold_cv(spam_training, v=5)
matriz_folds
## # 5-fold cross-validation
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [3120/781]> Fold1
## 2 <split [3121/780]> Fold2
## 3 <split [3121/780]> Fold3
## 4 <split [3121/780]> Fold4
## 5 <split [3121/780]> Fold5
medidas <- fit_resamples(modeloajustado,
matriz_folds,
control = control_resamples(save_pred = TRUE),
metrics = metric_set(f_meas,recall,precision))
medidas %>% conf_mat_resampled(tidy = F)
## ham spam
## ham 640.8 36.6
## spam 14.4 88.4
medidas %>% collect_metrics()
## # A tibble: 3 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 f_meas binary 0.962 5 0.00631 Preprocessor1_Model1
## 2 precision binary 0.978 5 0.00434 Preprocessor1_Model1
## 3 recall binary 0.946 5 0.0107 Preprocessor1_Model1
medidas %>% collect_predictions() %>% head()
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 Fold1 spam 8 spam Preprocessor1_Model1
## 2 Fold1 ham 25 ham Preprocessor1_Model1
## 3 Fold1 ham 32 ham Preprocessor1_Model1
## 4 Fold1 ham 36 ham Preprocessor1_Model1
## 5 Fold1 ham 39 ham Preprocessor1_Model1
## 6 Fold1 ham 40 ham Preprocessor1_Model1
new_comment <- tribble(~message,"ok lar joking wif u oni ")
new_comment
## # A tibble: 1 x 1
## message
## <chr>
## 1 "ok lar joking wif u oni "
prediction<-predict(modeloajustado, new_data = new_comment)
paste0("el resultado para el comentario ","'",new_comment,"'","es: ",prediction$.pred_class)
## [1] "el resultado para el comentario 'ok lar joking wif u oni 'es: ham"