Introducción a Machine Learning

library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.0.4
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.0.4
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.0.4
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'purrr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag()     masks stats::lag()
library(themis)
## Warning: package 'themis' was built under R version 4.0.4
## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes
## 
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.0.4
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom     0.7.5     v rsample   0.0.9
## v dials     0.0.9     v tune      0.1.3
## v infer     0.5.4     v workflows 0.2.2
## v modeldata 0.1.0     v yardstick 0.0.7
## v parsnip   0.1.5
## Warning: package 'dials' was built under R version 4.0.4
## Warning: package 'scales' was built under R version 4.0.4
## Warning: package 'infer' was built under R version 4.0.4
## Warning: package 'modeldata' was built under R version 4.0.4
## Warning: package 'parsnip' was built under R version 4.0.4
## Warning: package 'rsample' was built under R version 4.0.4
## Warning: package 'tune' was built under R version 4.0.4
## Warning: package 'workflows' was built under R version 4.0.4
## Warning: package 'yardstick' was built under R version 4.0.4
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard()         masks purrr::discard()
## x dplyr::filter()           masks stats::filter()
## x stringr::fixed()          masks recipes::fixed()
## x dplyr::lag()              masks stats::lag()
## x yardstick::spec()         masks readr::spec()
## x recipes::step()           masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample()   masks recipes::step_upsample()
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.0.3
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(readr)

setwd("C:/Users/carlos mario/Dropbox/Data Analysis/R Directory/Carlos Mario")
spamData <- read_csv("spamData.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   class = col_character(),
##   message = col_character()
## )

1.Limpieza de Datos

Clean_String <- function(string){

  # Remover caracteres no UTF-8
  temp<- iconv(enc2utf8(string),sub="byte")
  temp<- str_replace_all(temp,"[^[:graph:]]", " ") 
  # Remover todo lo que no sea número o letra 
  temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
  # remover espacios extra
  temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
  # minúscula
  temp <- tolower(temp)
  
  return(temp)
  
}

# Aplicar la función a los comentarios
spamData$message <- Clean_String(spamData$message)

2.Muestra de entrenamiento y prueba 2.1 Verificación del tamaño de las clases

library(tidymodels)

set.seed(1234) #Semilla Aleatoria no puede faltar

#Realizar la partición de las muestras

emails_split <- initial_split(spamData,prop=.7)

emails_train <- training(emails_split)
emails_test <- testing(emails_split)

dim(emails_train);dim(emails_test)
## [1] 3901    2
## [1] 1671    2

3.Creación de receta de preprocesamiento de datos

library(textrecipes)
library(stopwords)

# Setear la receta del modelo a utilizar

emails_recipe <- recipe(class ~ message, 
                         data = emails_train)

#Aplicar los pasos de procesamiento de datos

emails_recipeProcessed <- emails_recipe %>%
  step_tokenize(message) %>%
  step_stopwords(message, keep = FALSE) %>%
  step_untokenize(message) %>%
  step_tokenize(message, token = "ngrams", 
                options = list(n = 2, n_min = 1)) %>%
  step_tokenfilter(message, max_tokens = 100) %>%
  step_tfidf(message)

#Ejecutar la receta del paso anterior
emails_recipeProcessedF <- prep(emails_recipeProcessed)

emails_recipeProcessedF
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 3901 data points and no missing data.
## 
## Operations:
## 
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
#Setear el workflow para trabajar el modelo de Machine Learning

emails_wf <- workflow() %>%
  add_recipe(emails_recipeProcessed)

4.Ajuste del modelo inicial

library(themis)

#Verificamos las frecuencias de nuestro dataframe

emails_train %>%
  group_by(class) %>%
  summarise(n=n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
##   class     n  freq
## * <chr> <int> <dbl>
## 1 ham    3387 0.868
## 2 spam    514 0.132
# Si es requerido se utiliza la función step_smote sobre la receta que ya se había creado


emails_recipeProcessed2 <- emails_recipeProcessed %>%
  step_smote(class)

#Ejecutar la receta del paso anterior
emails_recipeProcessedF2 <- prep(emails_recipeProcessed2)

emails_recipeProcessedF2
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Training data contained 3901 data points and no missing data.
## 
## Operations:
## 
## Tokenization for message [trained]
## Stop word removal for message [trained]
## Untokenization for message [trained]
## Tokenization for message [trained]
## Text filtering for message [trained]
## Term frequency-inverse document frequency with message [trained]
## SMOTE based on class [trained]

5.Medición de métricas mediante validación cruzada

#Setear el workflow para trabajar el modelo de Machine Learning

emails_wf2 <- workflow() %>%
  add_recipe(emails_recipeProcessed2)

# Especificación del modelo

rl_spec <- logistic_reg() %>% 
  set_engine("glm")

rl_spec
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm
#Ajustar el modelo con los datos

rl_fit <- emails_wf2 %>%
  add_model(rl_spec) %>%
  fit(data = emails_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
rl_fit
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 7 Recipe Steps
## 
## * step_tokenize()
## * step_stopwords()
## * step_untokenize()
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_smote()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##            (Intercept)   tfidf_message_already       tfidf_message_ask  
##             -1.327e+00               2.808e-01              -4.252e+01  
##        tfidf_message_b      tfidf_message_babe      tfidf_message_back  
##              2.219e-01               1.704e-01               1.402e-01  
##        tfidf_message_c      tfidf_message_call       tfidf_message_can  
##              9.184e-01               3.493e+00              -1.216e+00  
##     tfidf_message_cash     tfidf_message_claim       tfidf_message_com  
##              2.615e+00               8.792e+02               1.059e+00  
##     tfidf_message_come         tfidf_message_d        tfidf_message_da  
##             -5.330e+01              -2.362e+00              -2.835e+00  
##      tfidf_message_day      tfidf_message_dear       tfidf_message_don  
##             -1.410e-01              -1.342e+00              -2.362e+01  
##  `tfidf_message_don t`      tfidf_message_dont         tfidf_message_e  
##              2.183e+01              -3.640e-01              -1.151e+00  
##     tfidf_message_free       tfidf_message_get      tfidf_message_give  
##              2.698e+00              -3.768e-01              -1.761e+00  
##       tfidf_message_go     tfidf_message_going      tfidf_message_good  
##             -3.469e-01              -1.005e+01              -4.301e+00  
##      tfidf_message_got     tfidf_message_great        tfidf_message_gt  
##             -3.903e+00              -3.536e-01              -4.480e+01  
##    tfidf_message_happy       tfidf_message_hey        tfidf_message_hi  
##             -4.674e+01              -3.496e+00              -2.789e-01  
##     tfidf_message_home      tfidf_message_hope        tfidf_message_im  
##             -4.221e+00              -9.387e-01              -7.116e-01  
##     tfidf_message_just         tfidf_message_k      tfidf_message_know  
##             -1.189e+00              -1.353e+00              -6.839e-01  
##    tfidf_message_later       tfidf_message_let      tfidf_message_like  
##             -6.989e+01              -1.369e+00              -4.893e-01  
##       tfidf_message_ll       tfidf_message_lor      tfidf_message_love  
##             -9.973e-01              -5.491e+01              -1.885e+00  
##       tfidf_message_lt   `tfidf_message_lt gt`         tfidf_message_m  
##             -2.878e+01              -2.101e+03              -2.308e-01  
##     tfidf_message_make      tfidf_message_meet   tfidf_message_message  
##              2.676e-01              -8.221e-01               6.134e-01  
##   tfidf_message_mobile       tfidf_message_msg      tfidf_message_much  
##              5.052e+00               6.985e-01              -1.408e+00  
##        tfidf_message_n      tfidf_message_need       tfidf_message_new  
##             -9.983e-01              -5.059e-01               4.342e-01  
##    tfidf_message_night       tfidf_message_now    tfidf_message_number  
##             -4.609e-01               6.923e-01              -3.813e-01  
##       tfidf_message_oh        tfidf_message_ok       tfidf_message_one  
##             -9.485e-01              -3.524e+00              -3.073e-01  
##        tfidf_message_p     tfidf_message_phone    tfidf_message_please  
##              2.056e+01               6.583e-03               6.667e-01  
##      tfidf_message_pls     tfidf_message_prize         tfidf_message_r  
##             -2.553e+00               1.666e+03               1.402e+00  
## 
## ...
## and 26 more lines.

6.Creación del modelo final y validación de métricas

#Se setea una semilla aleatoria para evitar diferentes resultados cada corrida
set.seed(234)

# Se genera las submuestras de validación cruzada

emails_folds <- vfold_cv(emails_train,v=7)

emails_folds
## #  7-fold cross-validation 
## # A tibble: 7 x 2
##   splits             id   
##   <list>             <chr>
## 1 <split [3343/558]> Fold1
## 2 <split [3343/558]> Fold2
## 3 <split [3344/557]> Fold3
## 4 <split [3344/557]> Fold4
## 5 <split [3344/557]> Fold5
## 6 <split [3344/557]> Fold6
## 7 <split [3344/557]> Fold7
# Se ajusta el modelo para cada fold

rl_rs <- fit_resamples(rl_fit,
                       emails_folds,
                       control = control_resamples(save_pred = TRUE),
                       metrics = metric_set(f_meas,recall,precision)
                       )
## Warning: package 'rlang' was built under R version 4.0.3
## Warning: package 'vctrs' was built under R version 4.0.3
## ! Fold1: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold2: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold3: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold4: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Fold5: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold6: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
## ! Fold7: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
# Se calcula la matriz de confusión

rl_rs %>% conf_mat_resampled(tidy = F)
##             ham     spam
## ham  462.000000 21.85714
## spam   8.285714 65.14286

Resumen de las metricas

rl_rs %>% collect_metrics()
## # A tibble: 3 x 6
##   .metric   .estimator  mean     n std_err .config             
##   <chr>     <chr>      <dbl> <int>   <dbl> <chr>               
## 1 f_meas    binary     0.968     7 0.00223 Preprocessor1_Model1
## 2 precision binary     0.982     7 0.00299 Preprocessor1_Model1
## 3 recall    binary     0.955     7 0.00334 Preprocessor1_Model1

Calculo de predicciones

rl_rs %>% collect_predictions() %>% head()
## # A tibble: 6 x 5
##   id    .pred_class  .row class .config             
##   <chr> <fct>       <int> <fct> <chr>               
## 1 Fold1 ham             1 ham   Preprocessor1_Model1
## 2 Fold1 ham            17 ham   Preprocessor1_Model1
## 3 Fold1 ham            29 ham   Preprocessor1_Model1
## 4 Fold1 ham            31 ham   Preprocessor1_Model1
## 5 Fold1 ham            44 spam  Preprocessor1_Model1
## 6 Fold1 ham            57 ham   Preprocessor1_Model1

7.Prueba de nuevos datos.

Evaluar resultados en datos de prueba

rl_fitFinal <- emails_wf2 %>%
  add_model(rl_spec) %>%
  # Ajusta en el dataset de entrenamiento y evalúa en el dataset de prueba
  last_fit(emails_split,
           metrics = metric_set(f_meas,recall,precision)
           )
## ! train/test split: preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fi...
# recolectamos las métricas
rl_fitFinal %>% collect_metrics()
## # A tibble: 3 x 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 f_meas    binary         0.969 Preprocessor1_Model1
## 2 recall    binary         0.958 Preprocessor1_Model1
## 3 precision binary         0.980 Preprocessor1_Model1

Importancia de las variables predictoras

tokensImp<-pull_workflow_fit(rl_fit)$fit 

tokensImp<- tokensImp$coefficients

tokensImpDF <- data.frame(token=names(tokensImp),values=tokensImp) %>%
  mutate(token=str_remove_all(token,"tfidf_comments_"))
  
tokensImpDF %>%
  top_n(15, abs(tokensImpDF$values)) %>%
  ungroup() %>%
  ggplot(aes(fct_reorder(token, values), values, fill = values > 0)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  labs(
    x = NULL,
    title = "Coefficients that increase/decrease probability the most")+
  theme_minimal()

new_comment <- tribble(~message,"access this link")
new_comment
## # A tibble: 1 x 1
##   message         
##   <chr>           
## 1 access this link
prediction<-predict(rl_fit, new_data = new_comment)


paste0("el resultado para el comentario ","'",new_comment,"'","es: ",prediction$.pred_class)
## [1] "el resultado para el comentario 'access this link'es: ham"