Initial data

load("datos_spam.Rdata")

Train and test split

# Creating a 70/30 train/test partition
library(caret)
datos$V58 <- factor(datos$V58)
ind <- createDataPartition(y = datos$V58, times = 1, p = 0.7, list = FALSE)
df_train <- datos[ind, ]
df_test <- datos[-ind, ]
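
createDataPartition() samples at random, so without a seed the split is not reproducible. A sketch of a reproducible version (the seed value is an arbitrary choice, not the one used for the results shown here):

# Reproducible version of the split above
set.seed(123)
ind <- createDataPartition(y = datos$V58, times = 1, p = 0.7, list = FALSE)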

Response variable frequencies

# Class proportions in training
prop.table(table(df_train$V58))
## 
##         0         1 
## 0.6058349 0.3941651

# Class proportions in testing
prop.table(table(df_test$V58))
## 
##         0         1 
## 0.6062364 0.3937636

Models

Logistic regression

mod_reglogi <- glm(V58 ~ ., data = df_train, family = "binomial")
summary(mod_reglogi)
## 
## Call:
## glm(formula = V58 ~ ., family = "binomial", data = df_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2063  -0.1388   0.0000   0.0889   3.0124  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.660e+00  1.799e-01  -9.225  < 2e-16 ***
## V1          -4.900e-01  3.056e-01  -1.603 0.108897    
## V2          -1.598e-01  8.040e-02  -1.988 0.046786 *  
## V3           7.200e-02  1.408e-01   0.512 0.608972    
## V4           1.405e+00  1.735e+00   0.810 0.417872    
## V5           4.888e-01  1.222e-01   3.999 6.35e-05 ***
## V6           1.181e+00  3.312e-01   3.566 0.000363 ***
## V7           1.811e+00  3.496e-01   5.179 2.23e-07 ***
## V8           3.304e-01  1.446e-01   2.286 0.022275 *  
## V9           1.471e+00  4.403e-01   3.341 0.000835 ***
## V10          4.715e-02  8.470e-02   0.557 0.577770    
## V11          7.457e-01  3.901e-01   1.911 0.055943 .  
## V12         -1.578e-01  9.441e-02  -1.672 0.094571 .  
## V13          1.482e-02  3.022e-01   0.049 0.960892    
## V14          3.097e-01  2.220e-01   1.395 0.163099    
## V15          7.208e-01  7.613e-01   0.947 0.343722    
## V16          1.284e+00  2.079e-01   6.177 6.55e-10 ***
## V17          2.155e+00  3.879e-01   5.554 2.80e-08 ***
## V18          4.683e-02  1.468e-01   0.319 0.749753    
## V19          6.273e-02  4.595e-02   1.365 0.172217    
## V20          1.153e+00  7.601e-01   1.517 0.129351    
## V21          1.932e-01  6.462e-02   2.990 0.002788 ** 
## V22          3.188e-02  1.982e-01   0.161 0.872243    
## V23          3.309e+00  8.082e-01   4.094 4.25e-05 ***
## V24          5.383e-01  2.424e-01   2.220 0.026391 *  
## V25         -3.515e+00  5.741e-01  -6.122 9.22e-10 ***
## V26         -6.339e-01  4.959e-01  -1.278 0.201131    
## V27         -2.502e+01  3.966e+00  -6.308 2.83e-10 ***
## V28          5.643e-01  2.957e-01   1.908 0.056357 .  
## V29         -2.547e+00  1.600e+00  -1.592 0.111465    
## V30         -7.361e-01  4.980e-01  -1.478 0.139433    
## V31          1.229e+00  8.049e-01   1.528 0.126629    
## V32          2.322e+00  3.666e+00   0.633 0.526501    
## V33         -1.511e+00  6.228e-01  -2.427 0.015231 *  
## V34          8.300e-01  1.764e+00   0.470 0.638045    
## V35         -1.620e+00  1.223e+00  -1.325 0.185085    
## V36          1.273e+00  4.145e-01   3.070 0.002139 ** 
## V37         -8.950e-03  1.968e-01  -0.045 0.963721    
## V38         -8.526e-01  6.348e-01  -1.343 0.179273    
## V39         -6.879e-01  4.993e-01  -1.378 0.168286    
## V40         -4.981e-01  4.239e-01  -1.175 0.239977    
## V41         -4.577e+01  3.919e+01  -1.168 0.242868    
## V42         -2.061e+00  8.106e-01  -2.543 0.011005 *  
## V43         -7.745e-01  7.166e-01  -1.081 0.279835    
## V44         -1.859e+00  6.265e-01  -2.968 0.002999 ** 
## V45         -6.585e-01  1.763e-01  -3.734 0.000188 ***
## V46         -1.623e+00  3.215e-01  -5.048 4.48e-07 ***
## V47         -1.984e+00  1.840e+00  -1.078 0.281077    
## V48         -5.678e+00  2.365e+00  -2.401 0.016338 *  
## V49         -1.063e+00  5.178e-01  -2.053 0.040078 *  
## V50         -5.447e-01  4.132e-01  -1.318 0.187363    
## V51         -6.604e-01  1.012e+00  -0.653 0.513972    
## V52          4.308e-01  1.184e-01   3.640 0.000272 ***
## V53          4.481e+00  7.101e-01   6.310 2.79e-10 ***
## V54          2.251e+00  1.092e+00   2.061 0.039332 *  
## V55          1.317e-01  3.276e-02   4.019 5.86e-05 ***
## V56          5.305e-04  3.371e-03   0.157 0.874971    
## V57          1.614e-03  3.667e-04   4.403 1.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4321.2  on 3221  degrees of freedom
## Residual deviance: 1149.7  on 3164  degrees of freedom
## AIC: 1265.7
## 
## Number of Fisher Scoring iterations: 14
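The coefficients are on the log-odds scale; exponentiating turns them into odds ratios, which read more naturally. (The 14 Fisher scoring iterations and the huge standard error on V41 also hint at quasi-separation for some predictors.) A minimal sketch:

# Odds ratios for a few strong predictors from the summary above
exp(coef(mod_reglogi)[c("V7", "V17", "V53")])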

Predicted values on the training set

library(dplyr)
# Predictions on the training set
predichos_train <- mod_reglogi$fitted.values
predichos_train2 <- if_else(predichos_train <= 0.5, true = "0", false = "1") 

# Confusion matrix
table(df_train$V58, predichos_train2, dnn = c("Real", "Predicho"))
##     Predicho
## Real    0    1
##    0 1867   85
##    1  132 1138

# Model accuracy on training
mean(df_train$V58 == predichos_train2)
## [1] 0.9326505
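
Beyond raw accuracy, caret's confusionMatrix() reports sensitivity, specificity, and kappa in one call; a sketch on the same training predictions:

# Richer metrics than accuracy alone
confusionMatrix(data = factor(predichos_train2, levels = c("0", "1")),
                reference = df_train$V58,
                positive = "1")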

Predicted values on the test set

# Predicted values on the test set
predichos_test <- predict(mod_reglogi, newdata = df_test[, -58], type = "response")
predichos_test2 <- if_else(predichos_test <= 0.5, true = "0", false = "1") 

# Confusion matrix
table(df_test$V58, predichos_test2, dnn = c("Real", "Predicho"))
##     Predicho
## Real   0   1
##    0 776  60
##    1  46 497

# Accuracy
mean(df_test$V58 == predichos_test2)
## [1] 0.9231327
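
Since the logistic model outputs probabilities, a threshold-free metric such as AUC complements the 0.5-cutoff accuracy; a sketch assuming the pROC package is installed:

# ROC curve and AUC on the test set
library(pROC)
roc_test <- roc(response = df_test$V58, predictor = predichos_test)
auc(roc_test)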

Classification tree

library(rpart)
mod_arbol <- rpart(V58 ~ ., data = df_train, method = "class")
summary(mod_arbol)
## Call:
## rpart(formula = V58 ~ ., data = df_train, method = "class")
##   n= 3222 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.48346457      0 1.0000000 1.0000000 0.02184114
## 2 0.14645669      1 0.5165354 0.5299213 0.01816879
## 3 0.05275591      2 0.3700787 0.3803150 0.01595523
## 4 0.03700787      3 0.3173228 0.3472441 0.01536220
## 5 0.02362205      4 0.2803150 0.2921260 0.01426655
## 6 0.01000000      5 0.2566929 0.2582677 0.01351511
## 
## Variable importance
## V53  V7 V23 V24 V52 V57  V9 V20 V25 V56 V26 V55 V31 V30 V27 V28 
##  28  13  11  10   7   6   6   5   4   2   2   1   1   1   1   1 
## 
## Node number 1: 3222 observations,    complexity param=0.4834646
##   predicted class=0  expected loss=0.3941651  P(node) =1
##     class counts:  1952  1270
##    probabilities: 0.606 0.394 
##   left son=2 (2402 obs) right son=3 (820 obs)
##   Primary splits:
##       V53 < 0.0445 to the left,  improve=507.3249, (0 missing)
##       V52 < 0.0795 to the left,  improve=484.0923, (0 missing)
##       V16 < 0.095  to the left,  improve=410.5672, (0 missing)
##       V7  < 0.01   to the left,  improve=404.4405, (0 missing)
##       V21 < 0.635  to the left,  improve=377.2643, (0 missing)
##   Surrogate splits:
##       V23 < 0.045  to the left,  agree=0.838, adj=0.365, (0 split)
##       V24 < 0.045  to the left,  agree=0.836, adj=0.356, (0 split)
##       V9  < 0.095  to the left,  agree=0.797, adj=0.204, (0 split)
##       V20 < 0.02   to the left,  agree=0.791, adj=0.178, (0 split)
##       V57 < 616.5  to the left,  agree=0.789, adj=0.171, (0 split)
## 
## Node number 2: 2402 observations,    complexity param=0.1464567
##   predicted class=0  expected loss=0.2302248  P(node) =0.7454997
##     class counts:  1849   553
##    probabilities: 0.770 0.230 
##   left son=4 (2178 obs) right son=5 (224 obs)
##   Primary splits:
##       V7  < 0.06   to the left,  improve=231.8012, (0 missing)
##       V52 < 0.1885 to the left,  improve=190.5761, (0 missing)
##       V16 < 0.195  to the left,  improve=187.8261, (0 missing)
##       V55 < 3.6835 to the left,  improve=113.7468, (0 missing)
##       V21 < 0.615  to the left,  improve=104.7919, (0 missing)
##   Surrogate splits:
##       V56 < 131.5  to the left,  agree=0.912, adj=0.054, (0 split)
##       V11 < 0.37   to the left,  agree=0.909, adj=0.027, (0 split)
##       V54 < 0.8325 to the left,  agree=0.908, adj=0.018, (0 split)
##       V4  < 8.115  to the left,  agree=0.908, adj=0.009, (0 split)
##       V17 < 3.98   to the left,  agree=0.908, adj=0.009, (0 split)
## 
## Node number 3: 820 observations,    complexity param=0.03700787
##   predicted class=1  expected loss=0.1256098  P(node) =0.2545003
##     class counts:   103   717
##    probabilities: 0.126 0.874 
##   left son=6 (57 obs) right son=7 (763 obs)
##   Primary splits:
##       V25 < 0.385  to the right, improve=75.81941, (0 missing)
##       V26 < 0.12   to the right, improve=40.62536, (0 missing)
##       V52 < 0.051  to the left,  improve=35.51776, (0 missing)
##       V27 < 0.21   to the right, improve=29.46368, (0 missing)
##       V37 < 0.025  to the right, improve=27.93081, (0 missing)
##   Surrogate splits:
##       V26 < 0.305  to the right, agree=0.959, adj=0.404, (0 split)
##       V31 < 0.045  to the right, agree=0.946, adj=0.228, (0 split)
##       V30 < 0.05   to the right, agree=0.943, adj=0.175, (0 split)
##       V27 < 0.225  to the right, agree=0.941, adj=0.158, (0 split)
##       V28 < 0.025  to the right, agree=0.941, adj=0.158, (0 split)
## 
## Node number 4: 2178 observations,    complexity param=0.05275591
##   predicted class=0  expected loss=0.1597796  P(node) =0.6759777
##     class counts:  1830   348
##    probabilities: 0.840 0.160 
##   left son=8 (1963 obs) right son=9 (215 obs)
##   Primary splits:
##       V52 < 0.5085 to the left,  improve=117.38960, (0 missing)
##       V16 < 0.215  to the left,  improve=105.71120, (0 missing)
##       V55 < 3.6395 to the left,  improve= 56.21815, (0 missing)
##       V25 < 0.025  to the right, improve= 44.66833, (0 missing)
##       V21 < 0.615  to the left,  improve= 40.66541, (0 missing)
##   Surrogate splits:
##       V16 < 2.375  to the left,  agree=0.904, adj=0.028, (0 split)
##       V23 < 0.62   to the left,  agree=0.904, adj=0.023, (0 split)
##       V4  < 0.56   to the left,  agree=0.902, adj=0.009, (0 split)
##       V12 < 5.19   to the left,  agree=0.902, adj=0.009, (0 split)
##       V20 < 5.26   to the left,  agree=0.902, adj=0.009, (0 split)
## 
## Node number 5: 224 observations
##   predicted class=1  expected loss=0.08482143  P(node) =0.06952204
##     class counts:    19   205
##    probabilities: 0.085 0.915 
## 
## Node number 6: 57 observations
##   predicted class=0  expected loss=0.0877193  P(node) =0.01769088
##     class counts:    52     5
##    probabilities: 0.912 0.088 
## 
## Node number 7: 763 observations
##   predicted class=1  expected loss=0.06684142  P(node) =0.2368094
##     class counts:    51   712
##    probabilities: 0.067 0.933 
## 
## Node number 8: 1963 observations
##   predicted class=0  expected loss=0.1054508  P(node) =0.6092489
##     class counts:  1756   207
##    probabilities: 0.895 0.105 
## 
## Node number 9: 215 observations,    complexity param=0.02362205
##   predicted class=1  expected loss=0.344186  P(node) =0.06672874
##     class counts:    74   141
##    probabilities: 0.344 0.656 
##   left son=18 (84 obs) right son=19 (131 obs)
##   Primary splits:
##       V56 < 10.5   to the left,  improve=30.82982, (0 missing)
##       V57 < 73.5   to the left,  improve=29.47258, (0 missing)
##       V16 < 0.115  to the left,  improve=25.08449, (0 missing)
##       V55 < 2.581  to the left,  improve=24.82215, (0 missing)
##       V45 < 0.585  to the right, improve=13.39567, (0 missing)
##   Surrogate splits:
##       V57 < 24.5   to the left,  agree=0.888, adj=0.714, (0 split)
##       V55 < 1.969  to the left,  agree=0.865, adj=0.655, (0 split)
##       V16 < 0.115  to the left,  agree=0.670, adj=0.155, (0 split)
##       V19 < 0.4    to the left,  agree=0.670, adj=0.155, (0 split)
##       V12 < 2.52   to the right, agree=0.660, adj=0.131, (0 split)
## 
## Node number 18: 84 observations
##   predicted class=0  expected loss=0.3214286  P(node) =0.02607076
##     class counts:    57    27
##    probabilities: 0.679 0.321 
## 
## Node number 19: 131 observations
##   predicted class=1  expected loss=0.129771  P(node) =0.04065798
##     class counts:    17   114
##    probabilities: 0.130 0.870
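
The summary() printout above is dense; plotting the tree makes the five splits much easier to read. A sketch assuming rpart.plot is installed:

# Visualize the fitted tree (class, class probabilities, node percentage)
library(rpart.plot)
rpart.plot(mod_arbol, type = 2, extra = 104)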

Predicted values on the training set

# Predictions on the training set
predichos_train_arbol <- predict(mod_arbol, df_train[, -58], type = "prob")
predichos_train2_arbol <- if_else(predichos_train_arbol[, 1] <= 0.5, true = "1", false = "0") 

# Confusion matrix
table(df_train$V58, predichos_train2_arbol, dnn = c("Real", "Predicho"))
##     Predicho
## Real    0    1
##    0 1865   87
##    1  239 1031

# Model accuracy on training
mean(df_train$V58 == predichos_train2_arbol)
## [1] 0.8988206

Predicted values on the test set

# Predictions on the test set
predichos_test_arbol <- predict(mod_arbol, df_test[, -58], type = "prob")
predichos_test2_arbol <- if_else(predichos_test_arbol[, 1] <= 0.5, true = "1", false = "0") 

# Confusion matrix
table(df_test$V58, predichos_test2_arbol, dnn = c("Real", "Predicho"))
##     Predicho
## Real   0   1
##    0 795  41
##    1 101 442

# Model accuracy on testing
mean(df_test$V58 == predichos_test2_arbol)
## [1] 0.8970268
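
The CP table in the summary shows where the cross-validated error stops improving; pruning at the CP with the lowest xerror is the usual next step (with the table above, the minimum sits at the last row, so the pruned tree would keep all five splits). A sketch:

# Prune at the complexity parameter minimizing cross-validated error
cp_best <- mod_arbol$cptable[which.min(mod_arbol$cptable[, "xerror"]), "CP"]
mod_arbol_pruned <- prune(mod_arbol, cp = cp_best)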

LDA

library(doMC)
registerDoMC(cores = 4)
set.seed(1000)
mod_lda <- train(V58 ~ .,
                data = df_train,
                method = "lda",
                trControl = trainControl(method = "cv", number = 5,
                                         allowParallel = TRUE))
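
train() also stores the 5-fold cross-validated performance, which is a less optimistic estimate than the resubstitution accuracy computed below; a quick check:

# Cross-validated accuracy and kappa for the LDA fit
mod_lda$results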

Predicted values on the training set

# Predictions on the training set
predichos_train_lda <- predict(mod_lda, df_train[, -58], type = "prob")
predichos_train2_lda <- if_else(predichos_train_lda[, 1] <= 0.5, true = "1", false = "0") 

# Confusion matrix
table(df_train$V58, predichos_train2_lda, dnn = c("Real", "Predicho"))
##     Predicho
## Real    0    1
##    0 1875   77
##    1  262 1008

# Model accuracy on training
mean(df_train$V58 == predichos_train2_lda)
## [1] 0.8947858

Predicted values on the test set

# Predictions on the test set
predichos_test_lda <- predict(mod_lda, df_test[, -58], type = "prob")
predichos_test2_lda <- if_else(predichos_test_lda[, 1] <= 0.5, true = "1", false = "0") 

# Confusion matrix
table(df_test$V58, predichos_test2_lda, dnn = c("Real", "Predicho"))
##     Predicho
## Real   0   1
##    0 788  48
##    1 103 440

# Model accuracy on testing
mean(df_test$V58 == predichos_test2_lda)
## [1] 0.8905004

SVM

library(doMC)
registerDoMC(cores = 4)
set.seed(1000)
mod_svm <- train(V58 ~ .,
                data = df_train,
                method = "svmRadial",
                trControl = trainControl(method = "cv", number = 5,
                                         allowParallel = TRUE))
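
For svmRadial, train() tunes the cost parameter C over a small default grid (sigma is estimated once via kernlab's sigest); the candidates and the chosen values can be inspected:

# Tuning results and selected hyperparameters
mod_svm$results
mod_svm$bestTune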

Predicted values on the training set

# Predictions on the training set
predichos_train_svm <- predict(mod_svm, df_train[, -58], type = "raw")

# Confusion matrix
table(df_train$V58, predichos_train_svm, dnn = c("Real", "Predicho"))
##     Predicho
## Real    0    1
##    0 1904   48
##    1   99 1171

# Model accuracy on training
mean(df_train$V58 == predichos_train_svm)
## [1] 0.9543762

Predicted values on the test set

# Predictions on the test set
predichos_test_svm <- predict(mod_svm, df_test[, -58], type = "raw")

# Confusion matrix
table(df_test$V58, predichos_test_svm, dnn = c("Real", "Predicho"))
##     Predicho
## Real   0   1
##    0 803  33
##    1  67 476

# Model accuracy on testing
mean(df_test$V58 == predichos_test_svm)
## [1] 0.9274837

Comparing models

# Accuracy on training
accur_reglogi_train <- mean(df_train$V58 == predichos_train2)
accur_arbolde_train <- mean(df_train$V58 == predichos_train2_arbol)
accur_lda_train     <- mean(df_train$V58 == predichos_train2_lda)
accur_svm_train     <- mean(df_train$V58 == predichos_train_svm)

# Accuracy on testing
accur_reglogi_testi <- mean(df_test$V58 == predichos_test2)
accur_arbolde_testi <- mean(df_test$V58 == predichos_test2_arbol)
accur_lda_testi     <- mean(df_test$V58 == predichos_test2_lda)
accur_svm_testi     <- mean(df_test$V58 == predichos_test_svm)

# Model comparison
accuracy_df <- data.frame(
  modelo = c("Logistic Reg.", "Tree", "LDA", "SVM",
             "Logistic Reg.", "Tree", "LDA", "SVM"),
  data   = c(rep("Training", 4), rep("Testing", 4)),
  accuracy = c(accur_reglogi_train, accur_arbolde_train, accur_lda_train,
               accur_svm_train, accur_reglogi_testi, accur_arbolde_testi,
               accur_lda_testi, accur_svm_testi)
)

# Plot
library(ggplot2)
library(plotly)
ggplotly(accuracy_df %>% 
  ggplot(data = ., aes(x = modelo, y = accuracy, color = data)) +
  geom_point(size = 2) +
  geom_line(aes(group = data)) +
  scale_color_brewer(palette = "Set1") +
  labs(x = "Model", y = "Accuracy",
       title = "Comparison of Machine Learning models\nSupervised Learning - Spam Detection",
       color = "") +
  theme_light() +
  theme(legend.position = "bottom",
        title = element_text(size = 8.5)))
[Interactive figure: accuracy of each model (Tree, LDA, Logistic Reg., SVM) on the training and test sets; the accuracy axis spans roughly 0.90-0.94.]
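
For the two caret-fitted models, performance can also be compared on the cross-validation resamples themselves, avoiding reliance on a single train/test split; a sketch (the glm and rpart fits above were not run through train(), so they cannot join this comparison):

# Compare CV resamples of the caret-fitted models
comp_cv <- resamples(list(LDA = mod_lda, SVM = mod_svm))
summary(comp_cv)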