Matriz de confusión para estudiantes aprobados LOW, MEDIUM, HIGH

Las librerías

library(readr)
library(dplyr)

Los datos

# Print a short label followed by the URL associated with the dataset.
cat(
  "Los datos ...",
  "https://app.schoology.com/course/2376784342/materials/gp/2390959605",
  sep = " "
)
## Los datos ... https://app.schoology.com/course/2376784342/materials/gp/2390959605
# Load the college performance dataset into a base data.frame.
# Text columns are kept as character (not converted to factors on read).
datos <- read.csv(
  "datos/college-perf.csv",
  header = TRUE,
  # Spelled out: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE,
  # `na.strings` expects a character vector of NA markers. The previous
  # logical `TRUE` was not a valid marker, so use the standard "NA".
  na.strings = "NA"
)

# Preview the first 10 rows.
head(datos, n = 10)
##     SAT  GPA Projects Community Income   Perf   Pred
## 1  1380 2.53        1         0  41800    Low    Low
## 2  1100 3.18        1         5  37600    Low    Low
## 3  1110 2.73        2        10  34800 Medium Medium
## 4  1180 2.49        3         0  24100    Low   High
## 5  1240 2.89        3         5  56000 Medium Medium
## 6  1140 2.85        2         0  50800    Low    Low
## 7   970 2.37        1         0  47000 Medium Medium
## 8  1100 2.67        2         0  50900 Medium Medium
## 9  1230 3.01        2         0  37500 Medium Medium
## 10 1280 3.08        0         5  60200    Low    Low
# Preview the last 10 rows of the data frame.
tail(datos, n = 10)
##       SAT  GPA Projects Community Income   Perf   Pred
## 3991 1200 3.12        0         0  27300 Medium Medium
## 3992 1210 2.97        4         0  49000   High Medium
## 3993 1040 2.86        1        10  49600    Low    Low
## 3994 1120 2.41        0         5  39000    Low   High
## 3995 1190 3.16        3         5  36800   High   High
## 3996 1210 2.81        3         0  39300 Medium Medium
## 3997 1120 3.12        1         0  45900 Medium    Low
## 3998 1240 2.66        2         0  49300 Medium Medium
## 3999 1230 2.15        1        10  46400    Low   High
## 4000 1200 2.39        2         0  56300 Medium Medium

Explorando los datos

# Compact overview of the data frame: size, column types and first values.
utils::str(datos)
## 'data.frame':    4000 obs. of  7 variables:
##  $ SAT      : int  1380 1100 1110 1180 1240 1140 970 1100 1230 1280 ...
##  $ GPA      : num  2.53 3.18 2.73 2.49 2.89 2.85 2.37 2.67 3.01 3.08 ...
##  $ Projects : int  1 1 2 3 3 2 1 2 2 0 ...
##  $ Community: int  0 5 10 0 5 0 0 0 0 5 ...
##  $ Income   : int  41800 37600 34800 24100 56000 50800 47000 50900 37500 60200 ...
##  $ Perf     : chr  "Low" "Low" "Medium" "Low" ...
##  $ Pred     : chr  "Low" "Low" "Medium" "High" ...
# Per-column summary; Perf and Pred are still plain character columns here.
summary(object = datos)
##       SAT            GPA           Projects       Community     
##  Min.   : 870   Min.   :1.750   Min.   :0.000   Min.   : 0.000  
##  1st Qu.:1130   1st Qu.:2.550   1st Qu.:0.000   1st Qu.: 0.000  
##  Median :1190   Median :2.750   Median :1.000   Median : 5.000  
##  Mean   :1193   Mean   :2.753   Mean   :1.025   Mean   : 5.104  
##  3rd Qu.:1260   3rd Qu.:2.950   3rd Qu.:2.000   3rd Qu.:10.000  
##  Max.   :1580   Max.   :3.760   Max.   :4.000   Max.   :20.000  
##      Income          Perf               Pred          
##  Min.   :12200   Length:4000        Length:4000       
##  1st Qu.:40100   Class :character   Class :character  
##  Median :46500   Mode  :character   Mode  :character  
##  Mean   :46510                                        
##  3rd Qu.:53300                                        
##  Max.   :82600
# List the column names of the data frame.
colnames(datos)
## [1] "SAT"       "GPA"       "Projects"  "Community" "Income"    "Perf"     
## [7] "Pred"

Convertir las variables de interés (Perf y Pred) en factores ordenados

# Convert the actual (Perf) and predicted (Pred) labels to ordered factors
# with the level order Low < Medium < High.
datos[c("Perf", "Pred")] <- lapply(
  datos[c("Perf", "Pred")],
  factor,
  levels = c("Low", "Medium", "High"),
  ordered = TRUE
)

# Re-run the summary now that Perf and Pred are factors.
summary(datos)
##       SAT            GPA           Projects       Community     
##  Min.   : 870   Min.   :1.750   Min.   :0.000   Min.   : 0.000  
##  1st Qu.:1130   1st Qu.:2.550   1st Qu.:0.000   1st Qu.: 0.000  
##  Median :1190   Median :2.750   Median :1.000   Median : 5.000  
##  Mean   :1193   Mean   :2.753   Mean   :1.025   Mean   : 5.104  
##  3rd Qu.:1260   3rd Qu.:2.950   3rd Qu.:2.000   3rd Qu.:10.000  
##  Max.   :1580   Max.   :3.760   Max.   :4.000   Max.   :20.000  
##      Income          Perf          Pred     
##  Min.   :12200   Low   :1332   Low   :1351  
##  1st Qu.:40100   Medium:2137   Medium:1923  
##  Median :46500   High  : 531   High  : 726  
##  Mean   :46510                              
##  3rd Qu.:53300                              
##  Max.   :82600

Generando frecuencias con la función table()

# Cross-tabulate actual performance (rows) against predicted performance
# (columns); the argument names become the dimension labels.
tabla <- table(Actual = datos$Perf, Predecido = datos$Pred)
print(tabla)
##         Predecido
## Actual    Low Medium High
##   Low    1150     84   98
##   Medium  166   1801  170
##   High     35     38  458

Generando frecuencias relativas (proporciones y porcentajes) con la función prop.table()

# Cell proportions over the whole table: each count divided by the grand total.
tabla / sum(tabla)
##         Predecido
## Actual       Low  Medium    High
##   Low    0.28750 0.02100 0.02450
##   Medium 0.04150 0.45025 0.04250
##   High   0.00875 0.00950 0.11450
# Row percentages (margin 1): each cell as a share of its row total,
# rounded to 2 decimals.
round(100 * prop.table(tabla, margin = 1), digits = 2)
##         Predecido
## Actual     Low Medium  High
##   Low    86.34   6.31  7.36
##   Medium  7.77  84.28  7.96
##   High    6.59   7.16 86.25
# Column percentages (margin 2): each cell as a share of its column total,
# rounded to 2 decimals.
round(100 * prop.table(tabla, margin = 2), digits = 2)
##         Predecido
## Actual     Low Medium  High
##   Low    85.12   4.37 13.50
##   Medium 12.29  93.66 23.42
##   High    2.59   1.98 63.09

Haciendo diagrama de la matriz de confusión

# Use a single plotting panel, then draw the confusion matrix as a bar chart
# built from `tabla`, with a legend.
par(mfrow = c(1, 1))
barplot(
  tabla,
  # Full argument name; `legend` only worked through partial matching.
  legend.text = TRUE,
  xlab = "Nota predecida por el modelo"
)

Gráfica de mosaico de la matriz de confusión

# Simple mosaic plot of the confusion matrix.
mosaicplot(x = tabla, main = "Eficiencia del modelo")