# Stratified 70% / 30% train/test partition on the response V58
library(caret)
# Fix the RNG state so the random split below can be reproduced between runs
# (same seed value the caret model fits in this script use).
set.seed(1000)
# The response must be a factor for classification
datos$V58 <- factor(datos$V58)
# Row indices of the training portion (returned as a one-column matrix)
ind <- createDataPartition(y = datos$V58, times = 1, p = 0.7, list = FALSE)
df_train <- datos[ind, ]
# Every row not selected for training goes to the test set
df_test <- datos[-ind, ]
# Relative frequency of each V58 class in the training set
prop.table(table(df_train[["V58"]]))
##
## 0 1
## 0.6058349 0.3941651
# Relative frequency of each V58 class in the test set
prop.table(table(df_test[["V58"]]))
##
## 0 1
## 0.6062364 0.3937636
# Logistic regression of V58 on every other column of the training set
mod_reglogi <- glm(
  formula = V58 ~ .,
  family = "binomial",
  data = df_train
)
# Print the coefficient table and fit statistics
summary(mod_reglogi)
##
## Call:
## glm(formula = V58 ~ ., family = "binomial", data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2063 -0.1388 0.0000 0.0889 3.0124
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.660e+00 1.799e-01 -9.225 < 2e-16 ***
## V1 -4.900e-01 3.056e-01 -1.603 0.108897
## V2 -1.598e-01 8.040e-02 -1.988 0.046786 *
## V3 7.200e-02 1.408e-01 0.512 0.608972
## V4 1.405e+00 1.735e+00 0.810 0.417872
## V5 4.888e-01 1.222e-01 3.999 6.35e-05 ***
## V6 1.181e+00 3.312e-01 3.566 0.000363 ***
## V7 1.811e+00 3.496e-01 5.179 2.23e-07 ***
## V8 3.304e-01 1.446e-01 2.286 0.022275 *
## V9 1.471e+00 4.403e-01 3.341 0.000835 ***
## V10 4.715e-02 8.470e-02 0.557 0.577770
## V11 7.457e-01 3.901e-01 1.911 0.055943 .
## V12 -1.578e-01 9.441e-02 -1.672 0.094571 .
## V13 1.482e-02 3.022e-01 0.049 0.960892
## V14 3.097e-01 2.220e-01 1.395 0.163099
## V15 7.208e-01 7.613e-01 0.947 0.343722
## V16 1.284e+00 2.079e-01 6.177 6.55e-10 ***
## V17 2.155e+00 3.879e-01 5.554 2.80e-08 ***
## V18 4.683e-02 1.468e-01 0.319 0.749753
## V19 6.273e-02 4.595e-02 1.365 0.172217
## V20 1.153e+00 7.601e-01 1.517 0.129351
## V21 1.932e-01 6.462e-02 2.990 0.002788 **
## V22 3.188e-02 1.982e-01 0.161 0.872243
## V23 3.309e+00 8.082e-01 4.094 4.25e-05 ***
## V24 5.383e-01 2.424e-01 2.220 0.026391 *
## V25 -3.515e+00 5.741e-01 -6.122 9.22e-10 ***
## V26 -6.339e-01 4.959e-01 -1.278 0.201131
## V27 -2.502e+01 3.966e+00 -6.308 2.83e-10 ***
## V28 5.643e-01 2.957e-01 1.908 0.056357 .
## V29 -2.547e+00 1.600e+00 -1.592 0.111465
## V30 -7.361e-01 4.980e-01 -1.478 0.139433
## V31 1.229e+00 8.049e-01 1.528 0.126629
## V32 2.322e+00 3.666e+00 0.633 0.526501
## V33 -1.511e+00 6.228e-01 -2.427 0.015231 *
## V34 8.300e-01 1.764e+00 0.470 0.638045
## V35 -1.620e+00 1.223e+00 -1.325 0.185085
## V36 1.273e+00 4.145e-01 3.070 0.002139 **
## V37 -8.950e-03 1.968e-01 -0.045 0.963721
## V38 -8.526e-01 6.348e-01 -1.343 0.179273
## V39 -6.879e-01 4.993e-01 -1.378 0.168286
## V40 -4.981e-01 4.239e-01 -1.175 0.239977
## V41 -4.577e+01 3.919e+01 -1.168 0.242868
## V42 -2.061e+00 8.106e-01 -2.543 0.011005 *
## V43 -7.745e-01 7.166e-01 -1.081 0.279835
## V44 -1.859e+00 6.265e-01 -2.968 0.002999 **
## V45 -6.585e-01 1.763e-01 -3.734 0.000188 ***
## V46 -1.623e+00 3.215e-01 -5.048 4.48e-07 ***
## V47 -1.984e+00 1.840e+00 -1.078 0.281077
## V48 -5.678e+00 2.365e+00 -2.401 0.016338 *
## V49 -1.063e+00 5.178e-01 -2.053 0.040078 *
## V50 -5.447e-01 4.132e-01 -1.318 0.187363
## V51 -6.604e-01 1.012e+00 -0.653 0.513972
## V52 4.308e-01 1.184e-01 3.640 0.000272 ***
## V53 4.481e+00 7.101e-01 6.310 2.79e-10 ***
## V54 2.251e+00 1.092e+00 2.061 0.039332 *
## V55 1.317e-01 3.276e-02 4.019 5.86e-05 ***
## V56 5.305e-04 3.371e-03 0.157 0.874971
## V57 1.614e-03 3.667e-04 4.403 1.07e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4321.2 on 3221 degrees of freedom
## Residual deviance: 1149.7 on 3164 degrees of freedom
## AIC: 1265.7
##
## Number of Fisher Scoring iterations: 14
library(dplyr)
# Fitted probabilities on the training set, taken straight from the model object
predichos_train <- mod_reglogi[["fitted.values"]]
# Probabilities above 0.5 are labelled "1", everything else "0"
predichos_train2 <- if_else(predichos_train > 0.5, true = "1", false = "0")
# Confusion matrix: observed classes in rows, predicted classes in columns
table(Real = df_train$V58, Predicho = predichos_train2)
## Predicho
## Real 0 1
## 0 1867 85
## 1 132 1138
# Training accuracy: share of rows whose predicted label equals the observed one
mean(predichos_train2 == df_train$V58)
## [1] 0.9326505
# Predicted probabilities on the test set; column 58 is dropped from newdata
# (presumably the response V58 -- confirm against the layout of `datos`)
predichos_test <- predict(mod_reglogi, type = "response", newdata = df_test[, -58])
# Probabilities above 0.5 are labelled "1", everything else "0"
predichos_test2 <- if_else(predichos_test > 0.5, true = "1", false = "0")
# Confusion matrix: observed classes in rows, predicted classes in columns
table(Real = df_test$V58, Predicho = predichos_test2)
## Predicho
## Real 0 1
## 0 776 60
## 1 46 497
# Test accuracy
mean(predichos_test2 == df_test$V58)
## [1] 0.9231327
library(rpart)
# Classification tree of V58 on every other column of the training set
mod_arbol <- rpart(
  formula = V58 ~ .,
  data = df_train,
  method = "class"
)
# Print the CP table, variable importance and per-node split details
summary(mod_arbol)
## Call:
## rpart(formula = V58 ~ ., data = df_train, method = "class")
## n= 3222
##
## CP nsplit rel error xerror xstd
## 1 0.48346457 0 1.0000000 1.0000000 0.02184114
## 2 0.14645669 1 0.5165354 0.5299213 0.01816879
## 3 0.05275591 2 0.3700787 0.3803150 0.01595523
## 4 0.03700787 3 0.3173228 0.3472441 0.01536220
## 5 0.02362205 4 0.2803150 0.2921260 0.01426655
## 6 0.01000000 5 0.2566929 0.2582677 0.01351511
##
## Variable importance
## V53 V7 V23 V24 V52 V57 V9 V20 V25 V56 V26 V55 V31 V30 V27 V28
## 28 13 11 10 7 6 6 5 4 2 2 1 1 1 1 1
##
## Node number 1: 3222 observations, complexity param=0.4834646
## predicted class=0 expected loss=0.3941651 P(node) =1
## class counts: 1952 1270
## probabilities: 0.606 0.394
## left son=2 (2402 obs) right son=3 (820 obs)
## Primary splits:
## V53 < 0.0445 to the left, improve=507.3249, (0 missing)
## V52 < 0.0795 to the left, improve=484.0923, (0 missing)
## V16 < 0.095 to the left, improve=410.5672, (0 missing)
## V7 < 0.01 to the left, improve=404.4405, (0 missing)
## V21 < 0.635 to the left, improve=377.2643, (0 missing)
## Surrogate splits:
## V23 < 0.045 to the left, agree=0.838, adj=0.365, (0 split)
## V24 < 0.045 to the left, agree=0.836, adj=0.356, (0 split)
## V9 < 0.095 to the left, agree=0.797, adj=0.204, (0 split)
## V20 < 0.02 to the left, agree=0.791, adj=0.178, (0 split)
## V57 < 616.5 to the left, agree=0.789, adj=0.171, (0 split)
##
## Node number 2: 2402 observations, complexity param=0.1464567
## predicted class=0 expected loss=0.2302248 P(node) =0.7454997
## class counts: 1849 553
## probabilities: 0.770 0.230
## left son=4 (2178 obs) right son=5 (224 obs)
## Primary splits:
## V7 < 0.06 to the left, improve=231.8012, (0 missing)
## V52 < 0.1885 to the left, improve=190.5761, (0 missing)
## V16 < 0.195 to the left, improve=187.8261, (0 missing)
## V55 < 3.6835 to the left, improve=113.7468, (0 missing)
## V21 < 0.615 to the left, improve=104.7919, (0 missing)
## Surrogate splits:
## V56 < 131.5 to the left, agree=0.912, adj=0.054, (0 split)
## V11 < 0.37 to the left, agree=0.909, adj=0.027, (0 split)
## V54 < 0.8325 to the left, agree=0.908, adj=0.018, (0 split)
## V4 < 8.115 to the left, agree=0.908, adj=0.009, (0 split)
## V17 < 3.98 to the left, agree=0.908, adj=0.009, (0 split)
##
## Node number 3: 820 observations, complexity param=0.03700787
## predicted class=1 expected loss=0.1256098 P(node) =0.2545003
## class counts: 103 717
## probabilities: 0.126 0.874
## left son=6 (57 obs) right son=7 (763 obs)
## Primary splits:
## V25 < 0.385 to the right, improve=75.81941, (0 missing)
## V26 < 0.12 to the right, improve=40.62536, (0 missing)
## V52 < 0.051 to the left, improve=35.51776, (0 missing)
## V27 < 0.21 to the right, improve=29.46368, (0 missing)
## V37 < 0.025 to the right, improve=27.93081, (0 missing)
## Surrogate splits:
## V26 < 0.305 to the right, agree=0.959, adj=0.404, (0 split)
## V31 < 0.045 to the right, agree=0.946, adj=0.228, (0 split)
## V30 < 0.05 to the right, agree=0.943, adj=0.175, (0 split)
## V27 < 0.225 to the right, agree=0.941, adj=0.158, (0 split)
## V28 < 0.025 to the right, agree=0.941, adj=0.158, (0 split)
##
## Node number 4: 2178 observations, complexity param=0.05275591
## predicted class=0 expected loss=0.1597796 P(node) =0.6759777
## class counts: 1830 348
## probabilities: 0.840 0.160
## left son=8 (1963 obs) right son=9 (215 obs)
## Primary splits:
## V52 < 0.5085 to the left, improve=117.38960, (0 missing)
## V16 < 0.215 to the left, improve=105.71120, (0 missing)
## V55 < 3.6395 to the left, improve= 56.21815, (0 missing)
## V25 < 0.025 to the right, improve= 44.66833, (0 missing)
## V21 < 0.615 to the left, improve= 40.66541, (0 missing)
## Surrogate splits:
## V16 < 2.375 to the left, agree=0.904, adj=0.028, (0 split)
## V23 < 0.62 to the left, agree=0.904, adj=0.023, (0 split)
## V4 < 0.56 to the left, agree=0.902, adj=0.009, (0 split)
## V12 < 5.19 to the left, agree=0.902, adj=0.009, (0 split)
## V20 < 5.26 to the left, agree=0.902, adj=0.009, (0 split)
##
## Node number 5: 224 observations
## predicted class=1 expected loss=0.08482143 P(node) =0.06952204
## class counts: 19 205
## probabilities: 0.085 0.915
##
## Node number 6: 57 observations
## predicted class=0 expected loss=0.0877193 P(node) =0.01769088
## class counts: 52 5
## probabilities: 0.912 0.088
##
## Node number 7: 763 observations
## predicted class=1 expected loss=0.06684142 P(node) =0.2368094
## class counts: 51 712
## probabilities: 0.067 0.933
##
## Node number 8: 1963 observations
## predicted class=0 expected loss=0.1054508 P(node) =0.6092489
## class counts: 1756 207
## probabilities: 0.895 0.105
##
## Node number 9: 215 observations, complexity param=0.02362205
## predicted class=1 expected loss=0.344186 P(node) =0.06672874
## class counts: 74 141
## probabilities: 0.344 0.656
## left son=18 (84 obs) right son=19 (131 obs)
## Primary splits:
## V56 < 10.5 to the left, improve=30.82982, (0 missing)
## V57 < 73.5 to the left, improve=29.47258, (0 missing)
## V16 < 0.115 to the left, improve=25.08449, (0 missing)
## V55 < 2.581 to the left, improve=24.82215, (0 missing)
## V45 < 0.585 to the right, improve=13.39567, (0 missing)
## Surrogate splits:
## V57 < 24.5 to the left, agree=0.888, adj=0.714, (0 split)
## V55 < 1.969 to the left, agree=0.865, adj=0.655, (0 split)
## V16 < 0.115 to the left, agree=0.670, adj=0.155, (0 split)
## V19 < 0.4 to the left, agree=0.670, adj=0.155, (0 split)
## V12 < 2.52 to the right, agree=0.660, adj=0.131, (0 split)
##
## Node number 18: 84 observations
## predicted class=0 expected loss=0.3214286 P(node) =0.02607076
## class counts: 57 27
## probabilities: 0.679 0.321
##
## Node number 19: 131 observations
## predicted class=1 expected loss=0.129771 P(node) =0.04065798
## class counts: 17 114
## probabilities: 0.130 0.870
# Per-class probabilities from the tree on the training set
predichos_train_arbol <- predict(mod_arbol, newdata = df_train[, -58], type = "prob")
# First probability column above 0.5 -> label "0", otherwise label "1"
predichos_train2_arbol <- if_else(predichos_train_arbol[, 1] > 0.5, true = "0", false = "1")
# Confusion matrix: observed classes in rows, predicted classes in columns
table(Real = df_train$V58, Predicho = predichos_train2_arbol)
## Predicho
## Real 0 1
## 0 1865 87
## 1 239 1031
# Training accuracy of the tree
mean(predichos_train2_arbol == df_train$V58)
## [1] 0.8988206
# Per-class probabilities from the tree on the test set
predichos_test_arbol <- predict(mod_arbol, newdata = df_test[, -58], type = "prob")
# First probability column above 0.5 -> label "0", otherwise label "1"
predichos_test2_arbol <- if_else(predichos_test_arbol[, 1] > 0.5, true = "0", false = "1")
# Confusion matrix: observed classes in rows, predicted classes in columns
table(Real = df_test$V58, Predicho = predichos_test2_arbol)
## Predicho
## Real 0 1
## 0 795 41
## 1 101 442
# Test accuracy of the tree
mean(predichos_test2_arbol == df_test$V58)
## [1] 0.8970268
# Register a 4-core multicore backend for caret's resampling
library(doMC)
registerDoMC(cores = 4)
# Seed the RNG so the cross-validation folds are reproducible
set.seed(1000)
# Linear discriminant analysis evaluated with 5-fold cross-validation.
# allowParallel is an option of trainControl(); given directly to train() it
# is only forwarded through `...` to the underlying model-fitting function.
mod_lda <- train(V58 ~ .,
                 data = df_train,
                 method = "lda",
                 trControl = trainControl(method = "cv", number = 5,
                                          allowParallel = TRUE))
# Per-class probabilities from the LDA model on the training set
predichos_train_lda <- predict(mod_lda, newdata = df_train[, -58], type = "prob")
# First probability column above 0.5 -> label "0", otherwise label "1"
predichos_train2_lda <- if_else(predichos_train_lda[, 1] > 0.5, true = "0", false = "1")
# Confusion matrix: observed classes in rows, predicted classes in columns
table(Real = df_train$V58, Predicho = predichos_train2_lda)
## Predicho
## Real 0 1
## 0 1875 77
## 1 262 1008
# Training accuracy of the LDA model
mean(predichos_train2_lda == df_train$V58)
## [1] 0.8947858
# Per-class probabilities from the LDA model on the test set
predichos_test_lda <- predict(mod_lda, newdata = df_test[, -58], type = "prob")
# First probability column above 0.5 -> label "0", otherwise label "1"
predichos_test2_lda <- if_else(predichos_test_lda[, 1] > 0.5, true = "0", false = "1")
# Confusion matrix: observed classes in rows, predicted classes in columns
table(Real = df_test$V58, Predicho = predichos_test2_lda)
## Predicho
## Real 0 1
## 0 788 48
## 1 103 440
# Test accuracy of the LDA model
mean(predichos_test2_lda == df_test$V58)
## [1] 0.8905004
# Register a 4-core multicore backend for caret's resampling
library(doMC)
registerDoMC(cores = 4)
# Seed the RNG so the cross-validation folds are reproducible
set.seed(1000)
# Radial-kernel SVM evaluated with 5-fold cross-validation.
# allowParallel is an option of trainControl(); given directly to train() it
# is only forwarded through `...` to the underlying model-fitting function.
mod_svm <- train(V58 ~ .,
                 data = df_train,
                 method = "svmRadial",
                 trControl = trainControl(method = "cv", number = 5,
                                          allowParallel = TRUE))
# Class predictions from the SVM on the training set
predichos_train_svm <- predict(mod_svm, newdata = df_train[, -58], type = "raw")
# Confusion matrix: observed classes in rows, predicted classes in columns
table(Real = df_train$V58, Predicho = predichos_train_svm)
## Predicho
## Real 0 1
## 0 1904 48
## 1 99 1171
# Training accuracy of the SVM
mean(predichos_train_svm == df_train$V58)
## [1] 0.9543762
# Class predictions from the SVM on the test set
predichos_test_svm <- predict(mod_svm, newdata = df_test[, -58], type = "raw")
# Confusion matrix: observed classes in rows, predicted classes in columns
table(Real = df_test$V58, Predicho = predichos_test_svm)
## Predicho
## Real 0 1
## 0 803 33
## 1 67 476
# Test accuracy of the SVM
mean(predichos_test_svm == df_test$V58)
## [1] 0.9274837
# Accuracy of each model on the training set
accur_reglogi_train <- mean(predichos_train2 == df_train$V58)
accur_arbolde_train <- mean(predichos_train2_arbol == df_train$V58)
accur_lda_train <- mean(predichos_train2_lda == df_train$V58)
accur_svm_train <- mean(predichos_train_svm == df_train$V58)
# Accuracy of each model on the test set
accur_reglogi_testi <- mean(predichos_test2 == df_test$V58)
accur_arbolde_testi <- mean(predichos_test2_arbol == df_test$V58)
accur_lda_testi <- mean(predichos_test2_lda == df_test$V58)
accur_svm_testi <- mean(predichos_test_svm == df_test$V58)
# Long-format comparison table: one row per (model, data set) pair,
# the four training rows first, then the four test rows
accuracy_df <- data.frame(
  modelo = rep(c("R. Logìstica", "Árbol", "LDA", "SVM"), times = 2),
  data = rep(c("Training", "Testing"), each = 4),
  accuracy = c(
    accur_reglogi_train, accur_arbolde_train, accur_lda_train, accur_svm_train,
    accur_reglogi_testi, accur_arbolde_testi, accur_lda_testi, accur_svm_testi
  )
)
# Interactive chart of accuracy per model, one coloured series per data set
library(ggplot2)
library(plotly)
ggplotly(
  ggplot(data = accuracy_df, mapping = aes(x = modelo, y = accuracy, color = data)) +
    geom_point(size = 2) +
    # group by data set so each series is joined across the discrete model axis
    geom_line(aes(group = data)) +
    scale_color_brewer(palette = "Set1") +
    labs(
      title = "Comparación de modelos de Machine Learning\nAprendizaje Supervisado - Detección de Spam",
      x = "Modelo",
      y = "Accuracy",
      color = ""
    ) +
    theme_light() +
    theme(
      title = element_text(size = 8.5),
      legend.position = "bottom"
    )
)