# Print a compact, column-by-column preview of the data set
# (column types plus the first few values of each column).
Student_mat %>%
  glimpse()
## Rows: 395
## Columns: 33
## $ school <chr> "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP",…
## $ sex <chr> "F", "F", "F", "F", "F", "M", "M", "F", "M", "M", "F", "F",…
## $ age <fct> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15, 15, 15,…
## $ address <chr> "U", "U", "U", "U", "U", "U", "U", "U", "U", "U", "U", "U",…
## $ famsize <chr> "GT3", "GT3", "LE3", "GT3", "GT3", "LE3", "LE3", "GT3", "LE…
## $ Pstatus <chr> "A", "T", "T", "T", "T", "T", "T", "A", "A", "T", "T", "T",…
## $ Medu <fct> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, 3, 3, 4,…
## $ Fedu <fct> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, 3, 2, 3,…
## $ Mjob <chr> "at_home", "at_home", "at_home", "health", "other", "servic…
## $ Fjob <chr> "teacher", "other", "other", "services", "other", "other", …
## $ reason <chr> "course", "course", "other", "home", "home", "reputation", …
## $ guardian <chr> "mother", "father", "mother", "mother", "father", "mother",…
## $ traveltime <fct> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 1, 1,…
## $ studytime <fct> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2, 1, 1,…
## $ failures <fct> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,…
## $ schoolsup <chr> "yes", "no", "yes", "no", "no", "no", "no", "yes", "no", "n…
## $ famsup <chr> "no", "yes", "no", "yes", "yes", "yes", "no", "yes", "yes",…
## $ paid <chr> "no", "no", "yes", "yes", "yes", "yes", "no", "no", "yes", …
## $ activities <chr> "no", "no", "no", "yes", "no", "yes", "no", "no", "no", "ye…
## $ nursery <chr> "yes", "no", "yes", "yes", "yes", "yes", "yes", "yes", "yes…
## $ higher <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "ye…
## $ internet <chr> "no", "yes", "yes", "yes", "no", "yes", "yes", "no", "yes",…
## $ romantic <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no, no, ye…
## $ famrel <fct> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, 5, 5, 3,…
## $ freetime <fct> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, 3, 5, 1,…
## $ goout <fct> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 5, 3,…
## $ Dalc <fct> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ Walc <fct> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, 1, 4, 3,…
## $ health <fct> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, 4, 5, 5,…
## $ absences <fct> 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0, 4, 6, 4, 16,…
## $ G1 <fct> 5, 5, 7, 15, 6, 15, 12, 6, 16, 14, 10, 10, 14, 10, 14, 14, …
## $ G2 <fct> 6, 5, 8, 14, 10, 15, 12, 5, 18, 15, 8, 12, 14, 10, 16, 14, …
## $ G3 <fct> 6, 6, 10, 15, 10, 15, 11, 6, 19, 15, 9, 12, 14, 11, 16, 14,…
# Merge two G3 levels into their neighbours: "4" is folded into "5" and
# "20" into "19". G3 is turned into character for the comparison and
# converted back to a factor at the end.
# Note: as.factor() on character values sorts the levels as strings, which
# is why later tables list "10" before "5".
Student_mat <- Student_mat %>%
  mutate(
    G3 = as.character(G3),
    G3 = replace(G3, G3 == "4", "5"),
    G3 = replace(G3, G3 == "20", "19"),
    G3 = as.factor(G3)
  )
# Show the number of rows and columns after the G3 recode.
Student_mat %>%
  dim()
## [1] 395 33
# Frequency of each G3 level together with its share of all rows,
# expressed in percent and rounded to two decimals.
Student_mat %>%
  count(G3) %>%
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
## # A tibble: 16 × 3
## G3 n percentage
## <fct> <int> <dbl>
## 1 0 38 9.62
## 2 10 56 14.2
## 3 11 47 11.9
## 4 12 31 7.85
## 5 13 31 7.85
## 6 14 27 6.84
## 7 15 33 8.35
## 8 16 16 4.05
## 9 17 6 1.52
## 10 18 12 3.04
## 11 19 6 1.52
## 12 5 8 2.03
## 13 6 15 3.8
## 14 7 9 2.28
## 15 8 32 8.1
## 16 9 28 7.09
# Split the rows into three folds on G3 (seeded for reproducibility).
# The third fold becomes the `prueba` subset; every other row goes to
# `entrenamiento`.
set.seed(2010)
folds <- createFolds(Student_mat[["G3"]], k = 3)
prueba <- Student_mat[folds[[3]], ]
entrenamiento <- Student_mat[-folds[[3]], ]
# Fit a naive Bayes classifier for G3 on `entrenamiento`, tuned with
# 5-fold cross-validation (seeded for reproducibility).
#
# savePredictions = "final" keeps only the hold-out predictions of the
# selected tuning candidate. With savePredictions = TRUE the predictions of
# every candidate are stored in `$pred`, so the confusion matrix below would
# count each training row once per candidate: the matrix printed with the old
# setting holds 526 predictions (class 0 appears 50 times at a prevalence of
# 0.09506) for roughly 263 training rows.
# NOTE: `train_control` is reused by the later train() call on `prueba`.
# Results printed in this report with the old setting must be regenerated.
set.seed(2010)
train_control <- trainControl(
  method = "cv",
  number = 5,
  savePredictions = "final"
)
CB_training <- train(
  G3 ~ .,
  data = entrenamiento,
  method = "naive_bayes",
  trControl = train_control
)

# Cross-validated confusion matrix: hold-out predictions vs. observed G3.
confusionMatrix(CB_training$pred$pred, CB_training$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 10 11 12 13 14 15 16 17 18 19 5 6 7 8 9
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10 8 11 10 2 6 5 5 2 0 1 2 1 2 3 8 6
## 11 17 26 21 19 15 13 17 8 4 7 2 4 8 3 13 13
## 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
## 17 8 16 13 9 9 8 11 5 3 3 1 2 4 1 11 7
## 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 19 16 21 17 11 11 10 10 5 1 5 3 3 4 5 10 12
## 5 1 0 1 1 0 0 0 0 0 0 0 0 2 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 7 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.0722
## 95% CI : (0.0516, 0.0978)
## No Information Rate : 0.1407
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0012
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 10 Class: 11 Class: 12 Class: 13 Class: 14
## Sensitivity 0.00000 0.14865 0.32812 0.00000 0.00000 0.00000
## Specificity 1.00000 0.86504 0.63420 1.00000 1.00000 1.00000
## Pos Pred Value NaN 0.15278 0.11053 NaN NaN NaN
## Neg Pred Value 0.90494 0.86123 0.87202 0.92015 0.92015 0.93156
## Prevalence 0.09506 0.14068 0.12167 0.07985 0.07985 0.06844
## Detection Rate 0.00000 0.02091 0.03992 0.00000 0.00000 0.00000
## Detection Prevalence 0.00000 0.13688 0.36122 0.00000 0.00000 0.00000
## Balanced Accuracy 0.50000 0.50685 0.48116 0.50000 0.50000 0.50000
## Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 5
## Sensitivity 0.00000 0.000000 0.375000 0.00000 0.375000 0.000000
## Specificity 1.00000 0.996047 0.791506 1.00000 0.727799 0.990310
## Pos Pred Value NaN 0.000000 0.027027 NaN 0.020833 0.000000
## Neg Pred Value 0.91635 0.961832 0.987952 0.96958 0.986911 0.980806
## Prevalence 0.08365 0.038023 0.015209 0.03042 0.015209 0.019011
## Detection Rate 0.00000 0.000000 0.005703 0.00000 0.005703 0.000000
## Detection Prevalence 0.00000 0.003802 0.211027 0.00000 0.273764 0.009506
## Balanced Accuracy 0.50000 0.498024 0.583253 0.50000 0.551400 0.495155
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.00000 0.000000 0.000000 0.00000
## Specificity 1.00000 0.998054 0.997934 1.00000
## Pos Pred Value NaN 0.000000 0.000000 NaN
## Neg Pred Value 0.96198 0.977143 0.920000 0.92776
## Prevalence 0.03802 0.022814 0.079848 0.07224
## Detection Rate 0.00000 0.000000 0.000000 0.00000
## Detection Prevalence 0.00000 0.001901 0.001901 0.00000
## Balanced Accuracy 0.50000 0.499027 0.498967 0.50000
# Second naive Bayes fit: the same model specification and resampling
# settings (`train_control`), but fitted on the `prueba` rows.
# NOTE(review): this trains a new model on `prueba`; it does not score
# CB_training on those rows. If a held-out evaluation is what is wanted,
# comparing predict(CB_training, newdata = prueba) with prueba$G3 would
# provide it. Confirm the intent before relying on these numbers.
NBC_cv <- train(
  G3 ~ .,
  data = prueba,
  method = "naive_bayes",
  trControl = train_control
)

# Confusion matrix over the saved resampling predictions of this fit.
confusionMatrix(data = NBC_cv$pred$pred, reference = NBC_cv$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 10 11 12 13 14 15 16 17 18 19 5 6 7 8 9
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10 5 7 6 4 4 3 4 3 0 2 0 1 2 1 5 4
## 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 17 1 4 3 0 2 1 0 3 0 1 0 0 0 0 1 3
## 18 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 19 1 1 1 1 0 1 3 0 0 0 0 0 0 1 0 1
## 5 1 0 0 0 0 0 0 0 0 0 0 1 1 0 4 0
## 6 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 0
## 7 2 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.0882
## 95% CI : (0.0411, 0.1609)
## No Information Rate : 0.1373
## P-Value [Acc > NIR] : 0.9505
##
## Kappa : 0.0141
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 10 Class: 11 Class: 12 Class: 13 Class: 14
## Sensitivity 0.00000 0.50000 0.0000 0.00000 0.00000 0.00000
## Specificity 1.00000 0.50000 1.0000 1.00000 1.00000 1.00000
## Pos Pred Value NaN 0.13725 NaN NaN NaN NaN
## Neg Pred Value 0.90196 0.86275 0.8824 0.92157 0.92157 0.94118
## Prevalence 0.09804 0.13725 0.1176 0.07843 0.07843 0.05882
## Detection Rate 0.00000 0.06863 0.0000 0.00000 0.00000 0.00000
## Detection Prevalence 0.00000 0.50000 0.0000 0.00000 0.00000 0.00000
## Balanced Accuracy 0.50000 0.50000 0.5000 0.50000 0.50000 0.50000
## Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 5
## Sensitivity 0.00000 0.00000 NA 0.000000 NA 0.500000
## Specificity 1.00000 0.97917 0.8137 0.989796 0.90196 0.940000
## Pos Pred Value NaN 0.00000 NA 0.000000 NA 0.142857
## Neg Pred Value 0.92157 0.94000 NA 0.960396 NA 0.989474
## Prevalence 0.07843 0.05882 0.0000 0.039216 0.00000 0.019608
## Detection Rate 0.00000 0.00000 0.0000 0.000000 0.00000 0.009804
## Detection Prevalence 0.00000 0.01961 0.1863 0.009804 0.09804 0.068627
## Balanced Accuracy 0.50000 0.48958 NA 0.494898 NA 0.720000
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.250000 0.00000 0.00000 0.00000
## Specificity 0.938776 0.95000 1.00000 1.00000
## Pos Pred Value 0.142857 0.00000 NaN NaN
## Neg Pred Value 0.968421 0.97938 0.90196 0.92157
## Prevalence 0.039216 0.01961 0.09804 0.07843
## Detection Rate 0.009804 0.00000 0.00000 0.00000
## Detection Prevalence 0.068627 0.04902 0.00000 0.00000
## Balanced Accuracy 0.594388 0.47500 0.50000 0.50000
Al realizar una validación cruzada con 5 particiones, el modelo fue entrenado con diferentes subconjuntos del 67% de los datos. Las clases 4 y 20 fueron agrupadas dentro de las clases 5 y 19, respectivamente, ya que cada una contaba con una sola observación (apenas el 0.25% de los datos).
Modelo de Entrenamiento
La tasa de acierto del modelo entrenado con el 67% de los datos fue de 7.22%, lo cual refleja una capacidad predictiva muy débil, inferior a la tasa de no información de 14.07%, confirmado por un P-value = 1. Esto indica que el modelo tiene peor desempeño que simplemente predecir siempre la clase más frecuente.
El Kappa de 0.0012 confirma una concordancia prácticamente nula entre las predicciones y los valores reales. La matriz de confusión indica que el modelo solo logró predecir parcialmente las clases 10, 11, 17 y 19, todas con una sensibilidad menor al 40%.
Modelo de Prueba
El modelo ajustado mediante validación cruzada sobre el 33% de los datos (conjunto de prueba) obtuvo una tasa de acierto de 8.82%, igualmente inferior a su tasa de no información (13.73%), con un P-value de 0.9505 que indica que el modelo no supera de forma significativa a la estrategia de predecir siempre la clase más frecuente.
Sin embargo, su Kappa fue ligeramente mayor (0.0141 vs 0.0012). La matriz de confusión muestra predicciones parciales únicamente para las clases 10, 5 y 6.
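El modelo ajustado con los datos de prueba (33%) obtuvo una exactitud y un Kappa ligeramente mayores que el modelo ajustado con los datos de entrenamiento (67%), aunque en ambos casos la exactitud queda por debajo de la tasa de no información. En general, el modelo Naive Bayes demuestra una capacidad predictiva insuficiente para este conjunto de datos con 16 clases.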