Initial data

load("datos_spam.Rdata")

Train and test split

# Creating a 70/30 train/test partition
library(caret)
datos$V58 <- factor(datos$V58)
ind <- createDataPartition(y = datos$V58, times = 1, p = 0.7, list = FALSE)
df_train <- datos[ind, ]
df_test <- datos[-ind, ]
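
createDataPartition() samples at random, so without a seed the split is not reproducible. A sketch of a reproducible version (the seed value is an arbitrary choice, not the one used for the results shown here):

# Reproducible version of the split above
set.seed(123)
ind <- createDataPartition(y = datos$V58, times = 1, p = 0.7, list = FALSE)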

Response variable frequencies

# Class proportions in training
prop.table(table(df_train$V58))
## 
##         0         1 
## 0.6058349 0.3941651

# Class proportions in testing
prop.table(table(df_test$V58))
## 
##         0         1 
## 0.6062364 0.3937636

Models

Logistic regression

mod_reglogi <- glm(V58 ~ ., data = df_train, family = "binomial")
summary(mod_reglogi)
## 
## Call:
## glm(formula = V58 ~ ., family = "binomial", data = df_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2063  -0.1388   0.0000   0.0889   3.0124  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.660e+00  1.799e-01  -9.225  < 2e-16 ***
## V1          -4.900e-01  3.056e-01  -1.603 0.108897    
## V2          -1.598e-01  8.040e-02  -1.988 0.046786 *  
## V3           7.200e-02  1.408e-01   0.512 0.608972    
## V4           1.405e+00  1.735e+00   0.810 0.417872    
## V5           4.888e-01  1.222e-01   3.999 6.35e-05 ***
## V6           1.181e+00  3.312e-01   3.566 0.000363 ***
## V7           1.811e+00  3.496e-01   5.179 2.23e-07 ***
## V8           3.304e-01  1.446e-01   2.286 0.022275 *  
## V9           1.471e+00  4.403e-01   3.341 0.000835 ***
## V10          4.715e-02  8.470e-02   0.557 0.577770    
## V11          7.457e-01  3.901e-01   1.911 0.055943 .  
## V12         -1.578e-01  9.441e-02  -1.672 0.094571 .  
## V13          1.482e-02  3.022e-01   0.049 0.960892    
## V14          3.097e-01  2.220e-01   1.395 0.163099    
## V15          7.208e-01  7.613e-01   0.947 0.343722    
## V16          1.284e+00  2.079e-01   6.177 6.55e-10 ***
## V17          2.155e+00  3.879e-01   5.554 2.80e-08 ***
## V18          4.683e-02  1.468e-01   0.319 0.749753    
## V19          6.273e-02  4.595e-02   1.365 0.172217    
## V20          1.153e+00  7.601e-01   1.517 0.129351    
## V21          1.932e-01  6.462e-02   2.990 0.002788 ** 
## V22          3.188e-02  1.982e-01   0.161 0.872243    
## V23          3.309e+00  8.082e-01   4.094 4.25e-05 ***
## V24          5.383e-01  2.424e-01   2.220 0.026391 *  
## V25         -3.515e+00  5.741e-01  -6.122 9.22e-10 ***
## V26         -6.339e-01  4.959e-01  -1.278 0.201131    
## V27         -2.502e+01  3.966e+00  -6.308 2.83e-10 ***
## V28          5.643e-01  2.957e-01   1.908 0.056357 .  
## V29         -2.547e+00  1.600e+00  -1.592 0.111465    
## V30         -7.361e-01  4.980e-01  -1.478 0.139433    
## V31          1.229e+00  8.049e-01   1.528 0.126629    
## V32          2.322e+00  3.666e+00   0.633 0.526501    
## V33         -1.511e+00  6.228e-01  -2.427 0.015231 *  
## V34          8.300e-01  1.764e+00   0.470 0.638045    
## V35         -1.620e+00  1.223e+00  -1.325 0.185085    
## V36          1.273e+00  4.145e-01   3.070 0.002139 ** 
## V37         -8.950e-03  1.968e-01  -0.045 0.963721    
## V38         -8.526e-01  6.348e-01  -1.343 0.179273    
## V39         -6.879e-01  4.993e-01  -1.378 0.168286    
## V40         -4.981e-01  4.239e-01  -1.175 0.239977    
## V41         -4.577e+01  3.919e+01  -1.168 0.242868    
## V42         -2.061e+00  8.106e-01  -2.543 0.011005 *  
## V43         -7.745e-01  7.166e-01  -1.081 0.279835    
## V44         -1.859e+00  6.265e-01  -2.968 0.002999 ** 
## V45         -6.585e-01  1.763e-01  -3.734 0.000188 ***
## V46         -1.623e+00  3.215e-01  -5.048 4.48e-07 ***
## V47         -1.984e+00  1.840e+00  -1.078 0.281077    
## V48         -5.678e+00  2.365e+00  -2.401 0.016338 *  
## V49         -1.063e+00  5.178e-01  -2.053 0.040078 *  
## V50         -5.447e-01  4.132e-01  -1.318 0.187363    
## V51         -6.604e-01  1.012e+00  -0.653 0.513972    
## V52          4.308e-01  1.184e-01   3.640 0.000272 ***
## V53          4.481e+00  7.101e-01   6.310 2.79e-10 ***
## V54          2.251e+00  1.092e+00   2.061 0.039332 *  
## V55          1.317e-01  3.276e-02   4.019 5.86e-05 ***
## V56          5.305e-04  3.371e-03   0.157 0.874971    
## V57          1.614e-03  3.667e-04   4.403 1.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4321.2  on 3221  degrees of freedom
## Residual deviance: 1149.7  on 3164  degrees of freedom
## AIC: 1265.7
## 
## Number of Fisher Scoring iterations: 14
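The coefficients are on the log-odds scale; exponentiating turns them into odds ratios, which read more naturally. (The 14 Fisher scoring iterations and the huge standard error on V41 also hint at quasi-separation for some predictors.) A minimal sketch:

# Odds ratios for a few strong predictors from the summary above
exp(coef(mod_reglogi)[c("V7", "V17", "V53")])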

Predicted values on the training set

library(dplyr)
# Predictions on the training set
predichos_train <- mod_reglogi$fitted.values
predichos_train2 <- if_else(predichos_train <= 0.5, true = "0", false = "1") 

# Confusion matrix
table(df_train$V58, predichos_train2, dnn = c("Real", "Predicho"))
##     Predicho
## Real    0    1
##    0 1867   85
##    1  132 1138

# Model accuracy on training
mean(df_train$V58 == predichos_train2)
## [1] 0.9326505
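
Beyond raw accuracy, caret's confusionMatrix() reports sensitivity, specificity, and kappa in one call; a sketch on the same training predictions:

# Richer metrics than accuracy alone
confusionMatrix(data = factor(predichos_train2, levels = c("0", "1")),
                reference = df_train$V58,
                positive = "1")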

Predicted values on the test set

# Predicted values on the test set
predichos_test <- predict(mod_reglogi, newdata = df_test[, -58], type = "response")
predichos_test2 <- if_else(predichos_test <= 0.5, true = "0", false = "1") 

# Confusion matrix
table(df_test$V58, predichos_test2, dnn = c("Real", "Predicho"))
##     Predicho
## Real   0   1
##    0 776  60
##    1  46 497

# Accuracy
mean(df_test$V58 == predichos_test2)
## [1] 0.9231327
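
Since the logistic model outputs probabilities, a threshold-free metric such as AUC complements the 0.5-cutoff accuracy; a sketch assuming the pROC package is installed:

# ROC curve and AUC on the test set
library(pROC)
roc_test <- roc(response = df_test$V58, predictor = predichos_test)
auc(roc_test)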

Classification tree

library(rpart)
mod_arbol <- rpart(V58 ~ ., data = df_train, method = "class")
summary(mod_arbol)
## Call:
## rpart(formula = V58 ~ ., data = df_train, method = "class")
##   n= 3222 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.48346457      0 1.0000000 1.0000000 0.02184114
## 2 0.14645669      1 0.5165354 0.5299213 0.01816879
## 3 0.05275591      2 0.3700787 0.3803150 0.01595523
## 4 0.03700787      3 0.3173228 0.3472441 0.01536220
## 5 0.02362205      4 0.2803150 0.2921260 0.01426655
## 6 0.01000000      5 0.2566929 0.2582677 0.01351511
## 
## Variable importance
## V53  V7 V23 V24 V52 V57  V9 V20 V25 V56 V26 V55 V31 V30 V27 V28 
##  28  13  11  10   7   6   6   5   4   2   2   1   1   1   1   1 
## 
## Node number 1: 3222 observations,    complexity param=0.4834646
##   predicted class=0  expected loss=0.3941651  P(node) =1
##     class counts:  1952  1270
##    probabilities: 0.606 0.394 
##   left son=2 (2402 obs) right son=3 (820 obs)
##   Primary splits:
##       V53 < 0.0445 to the left,  improve=507.3249, (0 missing)
##       V52 < 0.0795 to the left,  improve=484.0923, (0 missing)
##       V16 < 0.095  to the left,  improve=410.5672, (0 missing)
##       V7  < 0.01   to the left,  improve=404.4405, (0 missing)
##       V21 < 0.635  to the left,  improve=377.2643, (0 missing)
##   Surrogate splits:
##       V23 < 0.045  to the left,  agree=0.838, adj=0.365, (0 split)
##       V24 < 0.045  to the left,  agree=0.836, adj=0.356, (0 split)
##       V9  < 0.095  to the left,  agree=0.797, adj=0.204, (0 split)
##       V20 < 0.02   to the left,  agree=0.791, adj=0.178, (0 split)
##       V57 < 616.5  to the left,  agree=0.789, adj=0.171, (0 split)
## 
## Node number 2: 2402 observations,    complexity param=0.1464567
##   predicted class=0  expected loss=0.2302248  P(node) =0.7454997
##     class counts:  1849   553
##    probabilities: 0.770 0.230 
##   left son=4 (2178 obs) right son=5 (224 obs)
##   Primary splits:
##       V7  < 0.06   to the left,  improve=231.8012, (0 missing)
##       V52 < 0.1885 to the left,  improve=190.5761, (0 missing)
##       V16 < 0.195  to the left,  improve=187.8261, (0 missing)
##       V55 < 3.6835 to the left,  improve=113.7468, (0 missing)
##       V21 < 0.615  to the left,  improve=104.7919, (0 missing)
##   Surrogate splits:
##       V56 < 131.5  to the left,  agree=0.912, adj=0.054, (0 split)
##       V11 < 0.37   to the left,  agree=0.909, adj=0.027, (0 split)
##       V54 < 0.8325 to the left,  agree=0.908, adj=0.018, (0 split)
##       V4  < 8.115  to the left,  agree=0.908, adj=0.009, (0 split)
##       V17 < 3.98   to the left,  agree=0.908, adj=0.009, (0 split)
## 
## Node number 3: 820 observations,    complexity param=0.03700787
##   predicted class=1  expected loss=0.1256098  P(node) =0.2545003
##     class counts:   103   717
##    probabilities: 0.126 0.874 
##   left son=6 (57 obs) right son=7 (763 obs)
##   Primary splits:
##       V25 < 0.385  to the right, improve=75.81941, (0 missing)
##       V26 < 0.12   to the right, improve=40.62536, (0 missing)
##       V52 < 0.051  to the left,  improve=35.51776, (0 missing)
##       V27 < 0.21   to the right, improve=29.46368, (0 missing)
##       V37 < 0.025  to the right, improve=27.93081, (0 missing)
##   Surrogate splits:
##       V26 < 0.305  to the right, agree=0.959, adj=0.404, (0 split)
##       V31 < 0.045  to the right, agree=0.946, adj=0.228, (0 split)
##       V30 < 0.05   to the right, agree=0.943, adj=0.175, (0 split)
##       V27 < 0.225  to the right, agree=0.941, adj=0.158, (0 split)
##       V28 < 0.025  to the right, agree=0.941, adj=0.158, (0 split)
## 
## Node number 4: 2178 observations,    complexity param=0.05275591
##   predicted class=0  expected loss=0.1597796  P(node) =0.6759777
##     class counts:  1830   348
##    probabilities: 0.840 0.160 
##   left son=8 (1963 obs) right son=9 (215 obs)
##   Primary splits:
##       V52 < 0.5085 to the left,  improve=117.38960, (0 missing)
##       V16 < 0.215  to the left,  improve=105.71120, (0 missing)
##       V55 < 3.6395 to the left,  improve= 56.21815, (0 missing)
##       V25 < 0.025  to the right, improve= 44.66833, (0 missing)
##       V21 < 0.615  to the left,  improve= 40.66541, (0 missing)
##   Surrogate splits:
##       V16 < 2.375  to the left,  agree=0.904, adj=0.028, (0 split)
##       V23 < 0.62   to the left,  agree=0.904, adj=0.023, (0 split)
##       V4  < 0.56   to the left,  agree=0.902, adj=0.009, (0 split)
##       V12 < 5.19   to the left,  agree=0.902, adj=0.009, (0 split)
##       V20 < 5.26   to the left,  agree=0.902, adj=0.009, (0 split)
## 
## Node number 5: 224 observations
##   predicted class=1  expected loss=0.08482143  P(node) =0.06952204
##     class counts:    19   205
##    probabilities: 0.085 0.915 
## 
## Node number 6: 57 observations
##   predicted class=0  expected loss=0.0877193  P(node) =0.01769088
##     class counts:    52     5
##    probabilities: 0.912 0.088 
## 
## Node number 7: 763 observations
##   predicted class=1  expected loss=0.06684142  P(node) =0.2368094
##     class counts:    51   712
##    probabilities: 0.067 0.933 
## 
## Node number 8: 1963 observations
##   predicted class=0  expected loss=0.1054508  P(node) =0.6092489
##     class counts:  1756   207
##    probabilities: 0.895 0.105 
## 
## Node number 9: 215 observations,    complexity param=0.02362205
##   predicted class=1  expected loss=0.344186  P(node) =0.06672874
##     class counts:    74   141
##    probabilities: 0.344 0.656 
##   left son=18 (84 obs) right son=19 (131 obs)
##   Primary splits:
##       V56 < 10.5   to the left,  improve=30.82982, (0 missing)
##       V57 < 73.5   to the left,  improve=29.47258, (0 missing)
##       V16 < 0.115  to the left,  improve=25.08449, (0 missing)
##       V55 < 2.581  to the left,  improve=24.82215, (0 missing)
##       V45 < 0.585  to the right, improve=13.39567, (0 missing)
##   Surrogate splits:
##       V57 < 24.5   to the left,  agree=0.888, adj=0.714, (0 split)
##       V55 < 1.969  to the left,  agree=0.865, adj=0.655, (0 split)
##       V16 < 0.115  to the left,  agree=0.670, adj=0.155, (0 split)
##       V19 < 0.4    to the left,  agree=0.670, adj=0.155, (0 split)
##       V12 < 2.52   to the right, agree=0.660, adj=0.131, (0 split)
## 
## Node number 18: 84 observations
##   predicted class=0  expected loss=0.3214286  P(node) =0.02607076
##     class counts:    57    27
##    probabilities: 0.679 0.321 
## 
## Node number 19: 131 observations
##   predicted class=1  expected loss=0.129771  P(node) =0.04065798
##     class counts:    17   114
##    probabilities: 0.130 0.870
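
The summary() printout above is dense; plotting the tree makes the five splits much easier to read. A sketch assuming rpart.plot is installed:

# Visualize the fitted tree (class, class probabilities, node percentage)
library(rpart.plot)
rpart.plot(mod_arbol, type = 2, extra = 104)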

Predicted values on the training set

# Predictions on the training set
predichos_train_arbol <- predict(mod_arbol, df_train[, -58], type = "prob")
predichos_train2_arbol <- if_else(predichos_train_arbol[, 1] <= 0.5, true = "1", false = "0") 

# Confusion matrix
table(df_train$V58, predichos_train2_arbol, dnn = c("Real", "Predicho"))
##     Predicho
## Real    0    1
##    0 1865   87
##    1  239 1031

# Model accuracy on training
mean(df_train$V58 == predichos_train2_arbol)
## [1] 0.8988206

Predicted values on the test set

# Predictions on the test set
predichos_test_arbol <- predict(mod_arbol, df_test[, -58], type = "prob")
predichos_test2_arbol <- if_else(predichos_test_arbol[, 1] <= 0.5, true = "1", false = "0") 

# Confusion matrix
table(df_test$V58, predichos_test2_arbol, dnn = c("Real", "Predicho"))
##     Predicho
## Real   0   1
##    0 795  41
##    1 101 442

# Model accuracy on testing
mean(df_test$V58 == predichos_test2_arbol)
## [1] 0.8970268
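
The CP table in the summary shows where the cross-validated error stops improving; pruning at the CP with the lowest xerror is the usual next step (with the table above, the minimum sits at the last row, so the pruned tree would keep all five splits). A sketch:

# Prune at the complexity parameter minimizing cross-validated error
cp_best <- mod_arbol$cptable[which.min(mod_arbol$cptable[, "xerror"]), "CP"]
mod_arbol_pruned <- prune(mod_arbol, cp = cp_best)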

LDA

library(doMC)
registerDoMC(cores = 4)
set.seed(1000)
mod_lda <- train(V58 ~ .,
                data = df_train,
                method = "lda",
                trControl = trainControl(method = "cv", number = 5,
                                         allowParallel = TRUE))
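
train() also stores the 5-fold cross-validated performance, which is a less optimistic estimate than the resubstitution accuracy computed below; a quick check:

# Cross-validated accuracy and kappa for the LDA fit
mod_lda$results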

Predicted values on the training set

# Predictions on the training set
predichos_train_lda <- predict(mod_lda, df_train[, -58], type = "prob")
predichos_train2_lda <- if_else(predichos_train_lda[, 1] <= 0.5, true = "1", false = "0") 

# Confusion matrix
table(df_train$V58, predichos_train2_lda, dnn = c("Real", "Predicho"))
##     Predicho
## Real    0    1
##    0 1875   77
##    1  262 1008

# Model accuracy on training
mean(df_train$V58 == predichos_train2_lda)
## [1] 0.8947858

Predicted values on the test set

# Predictions on the test set
predichos_test_lda <- predict(mod_lda, df_test[, -58], type = "prob")
predichos_test2_lda <- if_else(predichos_test_lda[, 1] <= 0.5, true = "1", false = "0") 

# Confusion matrix
table(df_test$V58, predichos_test2_lda, dnn = c("Real", "Predicho"))
##     Predicho
## Real   0   1
##    0 788  48
##    1 103 440

# Model accuracy on testing
mean(df_test$V58 == predichos_test2_lda)
## [1] 0.8905004

SVM

library(doMC)
registerDoMC(cores = 4)
set.seed(1000)
mod_svm <- train(V58 ~ .,
                data = df_train,
                method = "svmRadial",
                trControl = trainControl(method = "cv", number = 5,
                                         allowParallel = TRUE))
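
For svmRadial, train() tunes the cost parameter C over a small default grid (sigma is estimated once via kernlab's sigest); the candidates and the chosen values can be inspected:

# Tuning results and selected hyperparameters
mod_svm$results
mod_svm$bestTune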

Predicted values on the training set

# Predictions on the training set
predichos_train_svm <- predict(mod_svm, df_train[, -58], type = "raw")

# Confusion matrix
table(df_train$V58, predichos_train_svm, dnn = c("Real", "Predicho"))
##     Predicho
## Real    0    1
##    0 1904   48
##    1   99 1171

# Model accuracy on training
mean(df_train$V58 == predichos_train_svm)
## [1] 0.9543762

Predicted values on the test set

# Predictions on the test set
predichos_test_svm <- predict(mod_svm, df_test[, -58], type = "raw")

# Confusion matrix
table(df_test$V58, predichos_test_svm, dnn = c("Real", "Predicho"))
##     Predicho
## Real   0   1
##    0 803  33
##    1  67 476

# Model accuracy on testing
mean(df_test$V58 == predichos_test_svm)
## [1] 0.9274837

Comparing models

# Accuracy on training
accur_reglogi_train <- mean(df_train$V58 == predichos_train2)
accur_arbolde_train <- mean(df_train$V58 == predichos_train2_arbol)
accur_lda_train     <- mean(df_train$V58 == predichos_train2_lda)
accur_svm_train     <- mean(df_train$V58 == predichos_train_svm)

# Accuracy on testing
accur_reglogi_testi <- mean(df_test$V58 == predichos_test2)
accur_arbolde_testi <- mean(df_test$V58 == predichos_test2_arbol)
accur_lda_testi     <- mean(df_test$V58 == predichos_test2_lda)
accur_svm_testi     <- mean(df_test$V58 == predichos_test_svm)

# Model comparison
accuracy_df <- data.frame(
  modelo = c("Logistic Reg.", "Tree", "LDA", "SVM",
             "Logistic Reg.", "Tree", "LDA", "SVM"),
  data   = c(rep("Training", 4), rep("Testing", 4)),
  accuracy = c(accur_reglogi_train, accur_arbolde_train, accur_lda_train,
               accur_svm_train, accur_reglogi_testi, accur_arbolde_testi,
               accur_lda_testi, accur_svm_testi)
)

# Plot
library(ggplot2)
library(plotly)
ggplotly(accuracy_df %>% 
  ggplot(data = ., aes(x = modelo, y = accuracy, color = data)) +
  geom_point(size = 2) +
  geom_line(aes(group = data)) +
  scale_color_brewer(palette = "Set1") +
  labs(x = "Model", y = "Accuracy",
       title = "Comparison of Machine Learning models\nSupervised Learning - Spam Detection",
       color = "") +
  theme_light() +
  theme(legend.position = "bottom",
        title = element_text(size = 8.5)))
[Interactive figure: accuracy of each model (Tree, LDA, Logistic Reg., SVM) on the training and test sets; the accuracy axis spans roughly 0.90-0.94.]
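
For the two caret-fitted models, performance can also be compared on the cross-validation resamples themselves, avoiding reliance on a single train/test split; a sketch (the glm and rpart fits above were not run through train(), so they cannot join this comparison):

# Compare CV resamples of the caret-fitted models
comp_cv <- resamples(list(LDA = mod_lda, SVM = mod_svm))
summary(comp_cv)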