1. Data cleaning
library(stringr)
library(textrecipes)
library(stopwords)
library(tidyverse)
library(themis)
library(tidymodels)
library(rsample)
library(tidytext)
Clean_String <- function(string){
  # Replace characters that are not valid UTF-8
  temp <- iconv(enc2utf8(string), sub = "byte")
  temp <- str_replace_all(temp, "[^[:graph:]]", " ")
  # Keep only letters and whitespace (digits and punctuation are dropped)
  temp <- stringr::str_replace_all(temp, "[^a-zA-Z\\s]", " ")
  # Collapse repeated whitespace
  temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
  # Lowercase everything
  temp <- tolower(temp)
  return(temp)
}
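A quick sanity check of the cleaner on a made-up message (a hypothetical input, not taken from the dataset):
Clean_String("WIN $1,000 NOW!!! Call 555-0199")
# expected result: "win now call "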
# Path to the raw data file (adjust to your machine)
wd <- "C:/Users/Aleba/Documents/spamData.csv"
spamData <- read.csv(wd)
spamData$message <- Clean_String(spamData$message)
str(spamData)
## 'data.frame': 5572 obs. of 2 variables:
## $ class : chr "ham" "ham" "spam" "ham" ...
## $ message: chr "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat " "ok lar joking wif u oni " "free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s" "u dun say so early hor u c already then say " ...
2. Training and test samples
set.seed(1234)
spamdividido <- initial_split(spamData, prop = 0.7)
spam_training <- training(spamdividido)
spam_testing <- testing(spamdividido)
dim(spam_training); dim(spam_testing)
## [1] 3901 2
## [1] 1671 2
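Given the class imbalance documented in section 2.1 below, a stratified split would keep the ham/spam ratio identical across both partitions. This variant is a sketch only; it was not used here, so the outputs in this document remain those of the unstratified split:
# Alternative (not run): stratify the split on the outcome
# spamdividido <- initial_split(spamData, prop = 0.7, strata = class)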
receta_1 <- recipe(class ~ message,
                   data = spam_training)
recetaprocesada1 <- receta_1 %>%
  step_text_normalization(message) %>%       # Unicode normalization of the text
  step_tokenize(message) %>%                 # split into single words
  step_stopwords(message, keep = FALSE) %>%  # drop English stopwords
  step_untokenize(message) %>%               # glue the remaining words back together
  step_tokenize(message, token = "ngrams",   # re-tokenize into 1- to 4-grams
                options = list(n = 4, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 200) %>%  # keep the 200 most frequent tokens
  step_tfidf(message) %>%                    # convert tokens to tf-idf features
  step_upsample(class)                       # oversample the minority class (spam)
2.1 Checking the class sizes
spam_training %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
## class n freq
## <chr> <int> <dbl>
## 1 ham 3387 0.868
## 2 spam 514 0.132
As an alternative to random upsampling, SMOTE generates synthetic minority-class examples. Note that recetaprocesada1 already ends in step_upsample(class), so this recipe stacks both balancing steps; it is defined for reference but not used downstream:
recetasobremuestreada <- recetaprocesada1 %>%
  step_smote(class)
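To confirm that the balancing step works, prep the recipe and count the classes in the processed training data; bake(new_data = NULL) returns the data exactly as it looked after prep, including skipped steps such as the upsampling (a minimal sketch):
prep(recetaprocesada1) %>%
  bake(new_data = NULL) %>%
  count(class)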
3. Preparing the preprocessing recipe
recetaejecutada <- prep(recetaprocesada1)
recetaworkflow_1 <- workflow() %>%  # recipe-only workflow; the model spec is added in section 4
  add_recipe(recetaprocesada1)
head(recetaejecutada, 2)
## $var_info
## # A tibble: 2 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 message nominal predictor original
## 2 class nominal outcome original
##
## $term_info
## # A tibble: 201 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 class nominal outcome original
## 2 tfidf_message_already numeric predictor derived
## 3 tfidf_message_also numeric predictor derived
## 4 tfidf_message_always numeric predictor derived
## 5 tfidf_message_amp numeric predictor derived
## 6 tfidf_message_anything numeric predictor derived
## 7 tfidf_message_around numeric predictor derived
## 8 tfidf_message_ask numeric predictor derived
## 9 tfidf_message_b numeric predictor derived
## 10 tfidf_message_babe numeric predictor derived
## # ... with 191 more rows
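Baking the prepped recipe exposes the actual tf-idf feature matrix the model will see (a sketch; every derived column follows the tfidf_message_* pattern listed above):
bake(recetaejecutada, new_data = NULL) %>%
  select(class, starts_with("tfidf_message_")) %>%
  head()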
4. Fitting the initial model
rlasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%  # mixture = 1 is a pure lasso
  set_engine("glmnet")
lasso_wf <- workflow() %>%
add_recipe(recetaprocesada1) %>%
add_model(rlasso_spec)
lasso_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
##
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
##
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
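5. Cross-validation folds and tuning grid
The tuning call in the next section uses reviews_folds and lambda_grid, which this document never defines. Judging from the printed results (5-fold cross-validation; 75 metric rows = 25 penalty values × 3 metrics), they were created roughly as follows. This is a reconstruction under those assumptions, not the original code:
# Assumed: 5-fold cross-validation on the training set
reviews_folds <- vfold_cv(spam_training, v = 5)
# Assumed: 25 candidate penalty values (the exact grid is not recoverable)
lambda_grid <- grid_regular(penalty(), levels = 25)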
6. Creating the final model and validating metrics
set.seed(2020)
lasso_grid <- tune_grid(lasso_wf,
resamples = reviews_folds,
grid = lambda_grid,
control = control_resamples(save_pred = TRUE),
metrics = metric_set(f_meas, recall, precision)
)
lasso_grid
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [3120/781]> Fold1 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,525 x 5~
## 2 <split [3121/780]> Fold2 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 3 <split [3121/780]> Fold3 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
## 4 <split [3121/780]> Fold4 <tibble [75 x 5~ <tibble [0 x 1~ <tibble [19,500 x 5~
## 5 <split [3121/780]> Fold5 <tibble [75 x 5~ <tibble [1 x 1~ <tibble [19,500 x 5~
lasso_grid %>%
collect_metrics()
## # A tibble: 75 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 3.61e-10 f_meas binary 0.970 5 0.00219 Preprocessor1_Model01
## 2 3.61e-10 precision binary 0.987 5 0.00206 Preprocessor1_Model01
## 3 3.61e-10 recall binary 0.954 5 0.00482 Preprocessor1_Model01
## 4 3.62e-10 f_meas binary 0.970 5 0.00219 Preprocessor1_Model02
## 5 3.62e-10 precision binary 0.987 5 0.00206 Preprocessor1_Model02
## 6 3.62e-10 recall binary 0.954 5 0.00482 Preprocessor1_Model02
## 7 3.44e- 9 f_meas binary 0.970 5 0.00219 Preprocessor1_Model03
## 8 3.44e- 9 precision binary 0.987 5 0.00206 Preprocessor1_Model03
## 9 3.44e- 9 recall binary 0.954 5 0.00482 Preprocessor1_Model03
## 10 5.96e- 9 f_meas binary 0.970 5 0.00219 Preprocessor1_Model04
## # ... with 65 more rows
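The metric profile across penalty values is easier to read as a plot; tune's built-in autoplot method handles this (a quick sketch):
autoplot(lasso_grid)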
best_f <- lasso_grid %>%
  select_best(metric = "f_meas")
best_f
## # A tibble: 1 x 2
## penalty .config
## <dbl> <chr>
## 1 0.00337 Preprocessor1_Model19
final_lasso <- finalize_workflow(lasso_wf, best_f) %>%
fit(spam_training)
final_lasso
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 8 Recipe Steps
##
## * step_text_normalization()
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_upsample()
##
## -- Model -----------------------------------------------------------------------
##
## Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
##
## Df %Dev Lambda
## 1 0 0.00 0.173900
## 2 1 1.49 0.158500
## 3 2 3.73 0.144400
## 4 4 6.41 0.131600
## 5 4 9.70 0.119900
## 6 5 13.17 0.109200
## 7 5 16.36 0.099520
## 8 7 19.44 0.090680
## 9 13 23.21 0.082620
## 10 14 27.23 0.075280
## 11 15 30.92 0.068590
## 12 19 34.42 0.062500
## 13 20 37.77 0.056950
## 14 21 40.87 0.051890
## 15 25 43.91 0.047280
## 16 29 46.85 0.043080
## 17 30 49.56 0.039250
## 18 32 52.09 0.035760
## 19 36 54.42 0.032590
## 20 37 56.63 0.029690
## 21 38 58.59 0.027050
## 22 38 60.35 0.024650
## 23 41 61.96 0.022460
## 24 43 63.44 0.020470
## 25 47 64.89 0.018650
## 26 51 66.22 0.016990
## 27 55 67.47 0.015480
## 28 58 68.69 0.014110
## 29 59 69.80 0.012850
## 30 63 70.81 0.011710
## 31 69 71.79 0.010670
## 32 75 72.77 0.009723
## 33 75 73.67 0.008859
## 34 77 74.49 0.008072
## 35 82 75.24 0.007355
## 36 88 75.94 0.006702
## 37 92 76.60 0.006106
## 38 101 77.24 0.005564
## 39 110 77.84 0.005070
## 40 117 78.42 0.004619
## 41 121 78.96 0.004209
## 42 123 79.46 0.003835
## 43 125 79.92 0.003494
## 44 130 80.34 0.003184
## 45 137 80.74 0.002901
## 46 140 81.11 0.002643
##
## ...
## and 54 more lines.
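The tokens driving the classification can be read off the non-zero lasso coefficients. A sketch, assuming parsnip's tidy() method for glmnet fits, which reports coefficients at the finalized penalty:
final_lasso %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(estimate != 0) %>%
  arrange(desc(abs(estimate)))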
review_final <- last_fit(final_lasso,
split=spamdividido,
metrics = metric_set(f_meas, recall, precision))
review_final %>%
collect_metrics()
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas binary 0.969 Preprocessor1_Model1
## 2 recall binary 0.949 Preprocessor1_Model1
## 3 precision binary 0.990 Preprocessor1_Model1
review_final %>%
  collect_predictions() %>%
  head()
## # A tibble: 6 x 5
## id .pred_class .row class .config
## <chr> <fct> <int> <fct> <chr>
## 1 train/test split ham 7 ham Preprocessor1_Model1
## 2 train/test split ham 11 ham Preprocessor1_Model1
## 3 train/test split spam 13 spam Preprocessor1_Model1
## 4 train/test split ham 15 ham Preprocessor1_Model1
## 5 train/test split spam 16 spam Preprocessor1_Model1
## 6 train/test split ham 17 ham Preprocessor1_Model1
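A confusion matrix over the test-set predictions summarizes the errors behind these metrics (a sketch using yardstick's conf_mat):
review_final %>%
  collect_predictions() %>%
  conf_mat(truth = class, estimate = .pred_class)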
7. Testing new data (suggestion: look at examples from the dataset and try similar texts).
new_comment <- tribble(~message,"ok lar joking wif u oni ")
new_comment
## # A tibble: 1 x 1
## message
## <chr>
## 1 "ok lar joking wif u oni "
prediction <- predict(final_lasso, new_data = new_comment)
paste0("The prediction for the message '", new_comment$message, "' is: ", prediction$.pred_class)
## [1] "The prediction for the message 'ok lar joking wif u oni ' is: ham"