Using Machine Learning to predict factors affecting academic performance: An inclusive education approach at the Universidad de Santander.

Exploring Data

A continuación, se presenta cómo se distribuyen los datos.

Basic Summaries

Se carga la base de datos

library (tidyverse)
library(readxl)
library(moments)
library(magrittr)
library(Hmisc)
library(ggplot2)
library (tidyverse)
library(dplyr)
library(plotly)
library(gridExtra)
library(rpart)
library(rpart.plot)
library(randomForest)
library(neuralnet)
library(ggplot2)
library(caret)  # Para la partición
library(pacman)

# Location of the Excel workbook with the student records. The path can be
# overridden with the UDES_DATASET_PATH environment variable so the report can
# be knitted on another machine; the original location remains the default.
ruta_datos <- Sys.getenv(
  "UDES_DATASET_PATH",
  unset = "C:/Users/coordinador.analitic/OneDrive - Universidad de Santander/ArtículoImportEnseñanzaDrPatricio/Base de datos/CuantilUDES3final.xlsx"
)
if (!file.exists(ruta_datos)) {
  stop("Dataset file not found: ", ruta_datos, call. = FALSE)
}
dataset <- read_excel(ruta_datos)
#View(dataset)

# NOTE(review): attach() puts copies of the columns on the search path. It is
# kept because later chunks may refer to bare column names (Camp, Gen, ...),
# but those copies do not follow later changes to `dataset`; prefer
# `dataset$col` in new code.
attach(dataset)
names(dataset)

##  [1] "Camp"          "Gen"           "Estr"          "Plant"        
##  [5] "Facul"         "RPLC"          "RPMAT"         "RPSC"         
##  [9] "RPCN"          "RPING"         "Edad"          "Nlibro"       
## [13] "Depar"         "Estra"         "Depar_A"       "Gustolect"    
## [17] "Apoyemoc"      "Apofin"        "Pago_Semestre" "Prom2"        
## [21] "Prom"

str(dataset)

## tibble [4,182 × 21] (S3: tbl_df/tbl/data.frame)
##  $ Camp         : chr [1:4182] "Cuc" "Cuc" "Cuc" "Cuc" ...
##  $ Gen          : chr [1:4182] "Masc" "Fem" "Fem" "Fem" ...
##  $ Estr         : chr [1:4182] "Bajo" "Medio" "Medio" "Bajo" ...
##  $ Plant        : chr [1:4182] "Pub" "Priv" "Pub" "Pub" ...
##  $ Facul        : chr [1:4182] "Ing" "Econ" "Soc" "Econ" ...
##  $ RPLC         : chr [1:4182] "RA" "RM" "RM" "RB" ...
##  $ RPMAT        : chr [1:4182] "RM" "RB" "RM" "RB" ...
##  $ RPSC         : chr [1:4182] "RM" "RB" "RM" "RB" ...
##  $ RPCN         : chr [1:4182] "RM" "RB" "RM" "RB" ...
##  $ RPING        : chr [1:4182] "RM" "RM" "RM" "RM" ...
##  $ Edad         : chr [1:4182] "<20" "<20" "<20" "<20" ...
##  $ Nlibro       : chr [1:4182] "E0-10" "E11-25" "E11-25" "E11-25" ...
##  $ Depar        : chr [1:4182] "Nort" "Nort" "Nort" "Nort" ...
##  $ Estra        : chr [1:4182] "Est1" "Est3" "Est3" "Est1" ...
##  $ Depar_A      : chr [1:4182] "Nort" "Nort" "Nort" "Nort" ...
##  $ Gustolect    : chr [1:4182] "Si" "Si" "Si" "Si" ...
##  $ Apoyemoc     : chr [1:4182] "Si" "Si" "Si" "Si" ...
##  $ Apofin       : chr [1:4182] "Si" "Si" "Si" "Si" ...
##  $ Pago_Semestre: chr [1:4182] "Efect" "Efect" "Efect" "Efect" ...
##  $ Prom2        : chr [1:4182] ">3,8" ">3,8" ">3,8" ">3,8" ...
##  $ Prom         : num [1:4182] 5 4.89 4.85 4.83 4.81 4.8 4.8 4.79 4.79 4.79 ...

describe(dataset$Camp)

## dataset$Camp 
##        n  missing distinct 
##     4182        0        3 
##                             
## Value        Buc   Cuc   Val
## Frequency   1878  1151  1153
## Proportion 0.449 0.275 0.276

describe(dataset$Gen)

## dataset$Gen 
##        n  missing distinct 
##     4182        0        2 
##                       
## Value        Fem  Masc
## Frequency   2628  1554
## Proportion 0.628 0.372

describe(dataset$Estr)

## dataset$Estr 
##        n  missing distinct 
##     4182        0        3 
##                             
## Value       Alto  Bajo Medio
## Frequency    165  2395  1622
## Proportion 0.039 0.573 0.388

describe(dataset$Plant)

## dataset$Plant 
##        n  missing distinct 
##     4182        0        2 
##                       
## Value       Priv   Pub
## Frequency   1870  2312
## Proportion 0.447 0.553

describe(dataset$Facul)

## dataset$Facul 
##        n  missing distinct 
##     4182        0        6 
##                                               
## Value       Econ  Exac   Ing   Sal   Soc   Tec
## Frequency    800   326   526  1740   656   134
## Proportion 0.191 0.078 0.126 0.416 0.157 0.032

describe(dataset$RPLC)

## dataset$RPLC 
##        n  missing distinct 
##     4182        0        3 
##                             
## Value         RA    RB    RM
## Frequency    722   644  2816
## Proportion 0.173 0.154 0.673

describe(dataset$RPMAT)

## dataset$RPMAT 
##        n  missing distinct 
##     4182        0        3 
##                             
## Value         RA    RB    RM
## Frequency    653   640  2889
## Proportion 0.156 0.153 0.691

describe(dataset$RPSC)

## dataset$RPSC 
##        n  missing distinct 
##     4182        0        3 
##                             
## Value         RA    RB    RM
## Frequency    749   663  2770
## Proportion 0.179 0.159 0.662

describe(dataset$RPCN)

## dataset$RPCN 
##        n  missing distinct 
##     4182        0        3 
##                             
## Value         RA    RB    RM
## Frequency    733   645  2804
## Proportion 0.175 0.154 0.670

describe(dataset$RPING)

## dataset$RPING 
##        n  missing distinct 
##     4182        0        3 
##                             
## Value         RA    RB    RM
## Frequency    759   704  2719
## Proportion 0.181 0.168 0.650

describe(dataset$Edad)

## dataset$Edad 
##        n  missing distinct 
##     4182        0        3 
##                                
## Value         <20    >26 E21-25
## Frequency    3499    120    563
## Proportion  0.837  0.029  0.135

describe(dataset$Nlibro)

## dataset$Nlibro 
##        n  missing distinct 
##     4182        0        4 
##                                           
## Value         >100   E0-10  E11-25 E26-100
## Frequency      254    1938    1196     794
## Proportion   0.061   0.463   0.286   0.190

describe(dataset$Depar)

## dataset$Depar 
##        n  missing distinct 
##     4182        0        4 
##                                   
## Value      Cesar  Nort  Otro  Sant
## Frequency    894  1051  1056  1181
## Proportion 0.214 0.251 0.253 0.282

describe(dataset$Estra)

## dataset$Estra 
##        n  missing distinct 
##     4182        0        6 
##                                               
## Value       Est1  Est2  Est3  Est4  Est5  Est6
## Frequency   1009  1386  1087   535   134    31
## Proportion 0.241 0.331 0.260 0.128 0.032 0.007

# `Zona` is not among the columns reported by names(dataset), so
# `dataset$Zona` is NULL and describe() has nothing to summarise (the rendered
# output for this call is just NULL). Only describe the column if it exists.
if ("Zona" %in% names(dataset)) {
  describe(dataset$Zona)
}

##  
## NULL

describe(dataset$Depar_A)

## dataset$Depar_A 
##        n  missing distinct 
##     4182        0        4 
##                                   
## Value      Cesar  Nort  Otro  Sant
## Frequency    894  1051  1056  1181
## Proportion 0.214 0.251 0.253 0.282

describe(dataset$Gustolect)

## dataset$Gustolect 
##        n  missing distinct 
##     4182        0        2 
##                       
## Value         No    Si
## Frequency   1284  2898
## Proportion 0.307 0.693

describe(dataset$Apoyemoc)

## dataset$Apoyemoc 
##        n  missing distinct 
##     4182        0        2 
##                       
## Value         No    Si
## Frequency    482  3700
## Proportion 0.115 0.885

describe(dataset$Apofin)

## dataset$Apofin 
##        n  missing distinct 
##     4182        0        2 
##                       
## Value         No    Si
## Frequency   1775  2407
## Proportion 0.424 0.576

describe(dataset$Pago_Semestre)

## dataset$Pago_Semestre 
##        n  missing distinct 
##     4182        0        6 
##                                                     
## Value       Autof  Becas   Cred  Efect ICETEX   Otro
## Frequency     137    126   1035   1964    621    299
## Proportion  0.033  0.030  0.247  0.470  0.148  0.071

# `ZProm` is not among the columns reported by names(dataset), so
# `dataset$ZProm` is NULL and describe() has nothing to summarise (the rendered
# output for this call is just NULL). Only describe the column if it exists.
if ("ZProm" %in% names(dataset)) {
  describe(dataset$ZProm)
}

##  
## NULL

describe(dataset$Prom)

## dataset$Prom 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     4182        0      296        1    3.788   0.6022     2.96     3.26 
##      .25      .50      .75      .90      .95 
##     3.59     3.87     4.16     4.39     4.51 
## 
## lowest : 0    0.03 0.07 0.11 0.12, highest: 4.81 4.83 4.85 4.89 5

describe(dataset$Prom2)

## dataset$Prom2 
##        n  missing distinct 
##     4182        0        2 
##                       
## Value       <3,8  >3,8
## Frequency   1786  2396
## Proportion 0.427 0.573

# `Prom3` is not among the columns reported by names(dataset), so
# `dataset$Prom3` is NULL and describe() has nothing to summarise (the rendered
# output for this call is just NULL). Only describe the column if it exists.
if ("Prom3" %in% names(dataset)) {
  describe(dataset$Prom3)
}

##  
## NULL

# Revisar cuántos valores no-NA tiene cada columna
colSums(!is.na(dataset))

##          Camp           Gen          Estr         Plant         Facul 
##          4182          4182          4182          4182          4182 
##          RPLC         RPMAT          RPSC          RPCN         RPING 
##          4182          4182          4182          4182          4182 
##          Edad        Nlibro         Depar         Estra       Depar_A 
##          4182          4182          4182          4182          4182 
##     Gustolect      Apoyemoc        Apofin Pago_Semestre         Prom2 
##          4182          4182          4182          4182          4182 
##          Prom 
##          4182

Converting character type variables to factors

# Convert the character columns to factors.
# Each factor is stored as a standalone workspace vector with a lower-case
# name; `dataset` itself is not modified. The columns are read explicitly from
# `dataset` instead of through the attach()ed names, so the result does not
# depend on what else is on the search path.
camp <- factor(dataset$Camp)
gen <- factor(dataset$Gen)
estr <- factor(dataset$Estr)
plant <- factor(dataset$Plant)
facul <- factor(dataset$Facul)
rplc <- factor(dataset$RPLC)
rpmat <- factor(dataset$RPMAT)
rpsc <- factor(dataset$RPSC)
rpcn <- factor(dataset$RPCN)
rping <- factor(dataset$RPING)
edad <- factor(dataset$Edad)
nlibro <- factor(dataset$Nlibro)
depar <- factor(dataset$Depar)
estra <- factor(dataset$Estra)
depar_A <- factor(dataset$Depar_A)
gustolect <- factor(dataset$Gustolect)
apoyemoc <- factor(dataset$Apoyemoc)
apofin <- factor(dataset$Apofin)
pag_semes <- factor(dataset$Pago_Semestre)
prom2 <- factor(dataset$Prom2)

Visualising Distributions

# Use ggplot2 to generate histogram plot for Prom

# Distributions

# Density of Prom by gender (Gen): a dotted curve for the whole sample plus
# one filled, semi-transparent curve per gender.

p01 <- dataset %>%
  dplyr::select(Prom, Gen) %>%
  dplyr::mutate(Gen = as.factor(Gen)) %>%
  ggplot2::ggplot(mapping = ggplot2::aes(x = Prom)) +
  ggplot2::geom_density(linetype = 3) +
  ggplot2::geom_density(
    mapping = ggplot2::aes(fill = Gen, colour = Gen),
    alpha = 0.55
  ) +
  ggplot2::labs(
    title = "Distribution of Prom by Gen",
    x = "Prom",
    y = "Density",
    fill = "Gen"
  )

# Render the figure.

gridExtra::grid.arrange(p01)

# Density of Prom by faculty (Facul): a dotted curve for the whole sample plus
# one filled, semi-transparent curve per faculty.

p02 <- dataset %>%
  # Convert the grouping column itself; the previous code assigned the result
  # to `Gen`, leaving `Facul` as character and overwriting the gender column.
  dplyr::mutate(Facul = as.factor(Facul)) %>%
  dplyr::select(Prom, Facul) %>%
  ggplot2::ggplot(ggplot2::aes(x = Prom)) +
  ggplot2::geom_density(lty = 3) +
  ggplot2::geom_density(ggplot2::aes(fill = Facul, colour = Facul), alpha = 0.55) +
  ggplot2::xlab("Prom") +
  ggplot2::ggtitle("Distribution of Prom by Facul") +
  ggplot2::labs(fill = "Facul", y = "Density")

# Display the plot.

gridExtra::grid.arrange(p02)

# Density of Prom by campus (Camp): a dotted curve for the whole sample plus
# one filled, semi-transparent curve per campus.

p03 <- dataset %>%
  # Convert the grouping column itself; the previous code assigned the result
  # to `Gen`, leaving `Camp` as character and overwriting the gender column.
  dplyr::mutate(Camp = as.factor(Camp)) %>%
  dplyr::select(Prom, Camp) %>%
  ggplot2::ggplot(ggplot2::aes(x = Prom)) +
  ggplot2::geom_density(lty = 3) +
  ggplot2::geom_density(ggplot2::aes(fill = Camp, colour = Camp), alpha = 0.55) +
  # The x axis shows Prom (Camp is only the fill/colour grouping), so label it
  # accordingly.
  ggplot2::xlab("Prom") +
  ggplot2::ggtitle("Distribution of Prom by Camp") +
  ggplot2::labs(fill = "Camp", y = "Density")

# Display the plot.

gridExtra::grid.arrange(p03)

Bar plot

# Dodged bar chart of the number of students per campus (Camp), split by one
# categorical column, with the count printed above each bar.
#
# data         Data frame containing `Camp` and the column named in `fill_col`.
# fill_col     Name (string) of the column used for the fill grouping.
# legend_title Title shown on the fill legend.
#
# Returns a ggplot object; calling the function at top level prints the plot.
# The fill is mapped to a column of `data` (converted to a factor) rather than
# to a separate workspace vector, so the plot cannot silently go out of sync
# with the data frame.
plot_count_by_campus <- function(data, fill_col, legend_title) {
  ggplot(data = data, aes(x = Camp, fill = factor(.data[[fill_col]]))) +
    geom_bar(position = "dodge") +
    theme_minimal() +
    # after_stat(count) replaces the deprecated `..count..` notation.
    geom_text(
      stat = "count",
      aes(label = after_stat(count)),
      position = position_dodge(width = 0.9),
      vjust = -0.5
    ) +
    labs(fill = legend_title)
}

plot_count_by_campus(dataset, "Gen", "Gender")

plot_count_by_campus(dataset, "Estr", "Level")

plot_count_by_campus(dataset, "Facul", "Facultad")

plot_count_by_campus(dataset, "RPLC", "RPLC")

plot_count_by_campus(dataset, "RPMAT", "RPMAT")

plot_count_by_campus(dataset, "RPSC", "RPSC")

plot_count_by_campus(dataset, "RPCN", "RPCN")

plot_count_by_campus(dataset, "RPING", "RPING")

plot_count_by_campus(dataset, "Edad", "Edad")

plot_count_by_campus(dataset, "Nlibro", "Nlibro")

plot_count_by_campus(dataset, "Depar", "Depar")

plot_count_by_campus(dataset, "Gustolect", "Gustolect")

# This plot previously lacked the count labels that every other bar chart in
# this section has; it now gets them through the shared helper.
plot_count_by_campus(dataset, "Apoyemoc", "Apoyemoc")

plot_count_by_campus(dataset, "Apofin", "Apofin")

plot_count_by_campus(dataset, "Pago_Semestre", "Pag_semes")

Interactive Graphics

# Interactive box plots of Prom, grouped by campus and then by faculty.
library(plotly)
plot_ly(dataset, x = ~Camp, y = ~Prom, color = ~Camp, type = "box")

plot_ly(dataset, x = ~Facul, y = ~Prom, color = ~Facul, type = "box")

Building Model

Training dataset

In building our models we use a random 70% subset of the available data, which we call the training dataset. The stated design splits the remainder equally into a validation dataset (15%) and a testing dataset (15%): nobs=4182, train=2927, validate=627, test=628. Note, however, that the code in the following sections only draws the 70% training sample and keeps the remaining rows as a single held-out set; no separate validation set is created.

Decision Tree

# https://rpubs.com/jboscomendoza/arboles_decision_clasificacion


# Cargar las librerías necesarias


library(dplyr)
library(rpart)
library(rpart.plot)
library(caret)  # Necesaria para confusionMatrix

# Fix the seed for reproducibility
set.seed(2927)

# Modelling table: the target and the predictors used in the formula below,
# converted to factors and stored under the lower-case names the formula uses.
# The formula variables must be columns of the `data` argument; otherwise
# rpart() resolves them to the full-length workspace vectors and the tree is
# fitted on every row instead of the training sample (the output previously
# rendered for this chunk reports n= 4182 and must be regenerated).
columnas_modelo <- c(
  prom2 = "Prom2", camp = "Camp", gen = "Gen", estr = "Estr",
  plant = "Plant", facul = "Facul", rplc = "RPLC", rpmat = "RPMAT",
  rpsc = "RPSC", rpcn = "RPCN", rping = "RPING", edad = "Edad",
  nlibro = "Nlibro", depar = "Depar", estra = "Estra",
  depar_A = "Depar_A", gustolect = "Gustolect", apoyemoc = "Apoyemoc",
  apofin = "Apofin", pag_semes = "Pago_Semestre"
)
datos_modelo <- as.data.frame(lapply(dataset[unname(columnas_modelo)], factor))
names(datos_modelo) <- names(columnas_modelo)

# Split into training (70%) and test (30%) sets by row index.
# setdiff() on data frames treats rows as a set and drops duplicates, so it
# would remove from the test set every row whose values also occur in the
# training set; indexing keeps exactly the rows that were not sampled.
n_total <- nrow(datos_modelo)
filas_entrenamiento <- sample.int(n_total, size = round(0.7 * n_total))
inclusion_entrenamiento <- datos_modelo[filas_entrenamiento, ]
inclusion_prueba <- datos_modelo[-filas_entrenamiento, ]

# Train the classification tree on the training rows only
arbol_1 <- rpart(formula = prom2 ~ camp + gen + estr + plant + facul
                 + rplc + rpmat + rpsc + rpcn + rping
                 + edad + nlibro + depar + estra + depar_A
                 + gustolect + apoyemoc + apofin
                 + pag_semes, data = inclusion_entrenamiento)
summary(arbol_1)

## Call:
## rpart(formula = prom2 ~ camp + gen + estr + plant + facul + rplc + 
##     rpmat + rpsc + rpcn + rping + edad + nlibro + depar + estra + 
##     depar_A + gustolect + apoyemoc + apofin + pag_semes, data = inclusion_entrenamiento)
##   n= 4182 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.12765957      0 1.0000000 1.0000000 0.01791062
## 2 0.01213139      2 0.7446809 0.7446809 0.01686270
## 3 0.01000000      6 0.6892497 0.7066069 0.01662064
## 
## Variable importance
##  rpcn facul rpmat  rpsc rping  rplc   gen 
##    36    34    11     6     5     5     3 
## 
## Node number 1: 4182 observations,    complexity param=0.1276596
##   predicted class=>3,8  expected loss=0.4270684  P(node) =1
##     class counts:  1786  2396
##    probabilities: 0.427 0.573 
##   left son=2 (2592 obs) right son=3 (1590 obs)
##   Primary splits:
##       facul splits as  RLLLRR, improve=167.21030, (0 missing)
##       rpsc  splits as  LRR,    improve=102.05210, (0 missing)
##       rpcn  splits as  LRR,    improve= 98.96977, (0 missing)
##       rplc  splits as  LRR,    improve= 98.65563, (0 missing)
##       rping splits as  LRR,    improve= 88.55702, (0 missing)
##   Surrogate splits:
##       estra splits as  LLLLLR, agree=0.62, adj=0.001, (0 split)
## 
## Node number 2: 2592 observations,    complexity param=0.1276596
##   predicted class=<3,8  expected loss=0.4621914  P(node) =0.6197991
##     class counts:  1394  1198
##    probabilities: 0.538 0.462 
##   left son=4 (2152 obs) right son=5 (440 obs)
##   Primary splits:
##       rpcn  splits as  LRL, improve=117.72000, (0 missing)
##       rpsc  splits as  LRL, improve= 88.49848, (0 missing)
##       rpmat splits as  LRL, improve= 80.62461, (0 missing)
##       rplc  splits as  LRL, improve= 76.37716, (0 missing)
##       rping splits as  LRR, improve= 69.21944, (0 missing)
##   Surrogate splits:
##       rpmat splits as  LRL, agree=0.876, adj=0.270, (0 split)
##       rpsc  splits as  LRL, agree=0.864, adj=0.198, (0 split)
##       rplc  splits as  LRL, agree=0.856, adj=0.150, (0 split)
##       rping splits as  LRL, agree=0.842, adj=0.070, (0 split)
## 
## Node number 3: 1590 observations
##   predicted class=>3,8  expected loss=0.2465409  P(node) =0.3802009
##     class counts:   392  1198
##    probabilities: 0.247 0.753 
## 
## Node number 4: 2152 observations,    complexity param=0.01213139
##   predicted class=<3,8  expected loss=0.394052  P(node) =0.5145863
##     class counts:  1304   848
##    probabilities: 0.606 0.394 
##   left son=8 (414 obs) right son=9 (1738 obs)
##   Primary splits:
##       rpcn  splits as  L-R, improve=58.78933, (0 missing)
##       rpsc  splits as  LRR, improve=47.56854, (0 missing)
##       rplc  splits as  LRR, improve=42.20266, (0 missing)
##       rpmat splits as  LRR, improve=41.08405, (0 missing)
##       rping splits as  LRR, improve=37.11269, (0 missing)
##   Surrogate splits:
##       rpmat splits as  LRR, agree=0.844, adj=0.188, (0 split)
##       rplc  splits as  LRR, agree=0.827, adj=0.099, (0 split)
##       rpsc  splits as  LRR, agree=0.825, adj=0.089, (0 split)
## 
## Node number 5: 440 observations
##   predicted class=>3,8  expected loss=0.2045455  P(node) =0.1052128
##     class counts:    90   350
##    probabilities: 0.205 0.795 
## 
## Node number 8: 414 observations
##   predicted class=<3,8  expected loss=0.1545894  P(node) =0.0989957
##     class counts:   350    64
##    probabilities: 0.845 0.155 
## 
## Node number 9: 1738 observations,    complexity param=0.01213139
##   predicted class=<3,8  expected loss=0.4510932  P(node) =0.4155906
##     class counts:   954   784
##    probabilities: 0.549 0.451 
##   left son=18 (246 obs) right son=19 (1492 obs)
##   Primary splits:
##       rping splits as  LRR,    improve=17.48578, (0 missing)
##       gen   splits as  RL,     improve=17.11323, (0 missing)
##       rplc  splits as  LRR,    improve=15.38099, (0 missing)
##       rpsc  splits as  LRR,    improve=15.01757, (0 missing)
##       facul splits as  -LRR--, improve=13.24850, (0 missing)
## 
## Node number 18: 246 observations
##   predicted class=<3,8  expected loss=0.2764228  P(node) =0.05882353
##     class counts:   178    68
##    probabilities: 0.724 0.276 
## 
## Node number 19: 1492 observations,    complexity param=0.01213139
##   predicted class=<3,8  expected loss=0.4798928  P(node) =0.3567671
##     class counts:   776   716
##    probabilities: 0.520 0.480 
##   left son=38 (491 obs) right son=39 (1001 obs)
##   Primary splits:
##       gen   splits as  RL,     improve=16.815360, (0 missing)
##       rpsc  splits as  LRR,    improve=12.429930, (0 missing)
##       facul splits as  -LRR--, improve= 9.674061, (0 missing)
##       rpmat splits as  LRR,    improve= 8.931678, (0 missing)
##       rplc  splits as  LRR,    improve= 8.482659, (0 missing)
##   Surrogate splits:
##       facul splits as  -RLR--, agree=0.731, adj=0.181, (0 split)
##       estra splits as  RRRRRL, agree=0.672, adj=0.004, (0 split)
## 
## Node number 38: 491 observations
##   predicted class=<3,8  expected loss=0.3727088  P(node) =0.1174079
##     class counts:   308   183
##    probabilities: 0.627 0.373 
## 
## Node number 39: 1001 observations,    complexity param=0.01213139
##   predicted class=>3,8  expected loss=0.4675325  P(node) =0.2393592
##     class counts:   468   533
##    probabilities: 0.468 0.532 
##   left son=78 (70 obs) right son=79 (931 obs)
##   Primary splits:
##       rpmat splits as  LRR,    improve=11.410450, (0 missing)
##       camp  splits as  LRL,    improve= 8.735124, (0 missing)
##       rpsc  splits as  LRR,    improve= 7.600517, (0 missing)
##       facul splits as  -LRR--, improve= 6.084253, (0 missing)
##       rplc  splits as  LRL,    improve= 5.050962, (0 missing)
## 
## Node number 78: 70 observations
##   predicted class=<3,8  expected loss=0.2571429  P(node) =0.0167384
##     class counts:    52    18
##    probabilities: 0.743 0.257 
## 
## Node number 79: 931 observations
##   predicted class=>3,8  expected loss=0.4468314  P(node) =0.2226208
##     class counts:   416   515
##    probabilities: 0.447 0.553

# Evaluando el modelo
print(arbol_1)

## n= 4182 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 4182 1786 >3,8 (0.4270684 0.5729316)  
##    2) facul=Exac,Ing,Sal 2592 1198 <3,8 (0.5378086 0.4621914)  
##      4) rpcn=RA,RM 2152  848 <3,8 (0.6059480 0.3940520)  
##        8) rpcn=RA 414   64 <3,8 (0.8454106 0.1545894) *
##        9) rpcn=RM 1738  784 <3,8 (0.5489068 0.4510932)  
##         18) rping=RA 246   68 <3,8 (0.7235772 0.2764228) *
##         19) rping=RB,RM 1492  716 <3,8 (0.5201072 0.4798928)  
##           38) gen=Masc 491  183 <3,8 (0.6272912 0.3727088) *
##           39) gen=Fem 1001  468 >3,8 (0.4675325 0.5324675)  
##             78) rpmat=RA 70   18 <3,8 (0.7428571 0.2571429) *
##             79) rpmat=RB,RM 931  416 >3,8 (0.4468314 0.5531686) *
##      5) rpcn=RB 440   90 >3,8 (0.2045455 0.7954545) *
##    3) facul=Econ,Soc,Tec 1590  392 >3,8 (0.2465409 0.7534591) *

# Graficar el árbol de decisión
rpart.plot(arbol_1)

Random Forests

library(randomForest)
library(datasets)
library(caret)

# Fix the seed for reproducibility
set.seed(2927)

# Modelling table: the target and the predictors used in the formula below,
# converted to factors and stored under the lower-case names the formula uses.
columnas_modelo <- c(
  prom2 = "Prom2", camp = "Camp", gen = "Gen", estr = "Estr",
  plant = "Plant", facul = "Facul", rplc = "RPLC", rpmat = "RPMAT",
  rpsc = "RPSC", rpcn = "RPCN", rping = "RPING", edad = "Edad",
  nlibro = "Nlibro", depar = "Depar", estra = "Estra",
  depar_A = "Depar_A", gustolect = "Gustolect", apoyemoc = "Apoyemoc",
  apofin = "Apofin", pag_semes = "Pago_Semestre"
)
datos_modelo <- as.data.frame(lapply(dataset[unname(columnas_modelo)], factor))
names(datos_modelo) <- names(columnas_modelo)

# Split into training (70%) and test (30%) sets by row index.
# setdiff() on data frames treats rows as a set and drops duplicates, so it
# would remove from the test set every row whose values also occur in the
# training set; indexing keeps exactly the rows that were not sampled.
n_total <- nrow(datos_modelo)
filas_entrenamiento <- sample.int(n_total, size = round(0.7 * n_total))
inclusion_entrenamiento <- datos_modelo[filas_entrenamiento, ]
inclusion_prueba <- datos_modelo[-filas_entrenamiento, ]

# Train the random forest (stage 1: training).
# The training set must be passed as `data`; the previous `dataset =` argument
# is not a randomForest() parameter and was ignored, so the formula variables
# were taken from the full-length workspace vectors and the forest was fitted
# on every row (the output previously rendered for this chunk shows 4182
# predictions and must be regenerated).
modelo_rf <- randomForest(
  formula = prom2 ~ camp + gen + estr + plant + facul + rplc + rpmat + rpsc + rpcn + rping + edad + nlibro + depar + estra + depar_A + gustolect + apoyemoc + apofin + pag_semes,
  data = inclusion_entrenamiento,
  ntree = 500,  # Number of trees
  mtry = 4,     # Number of predictors tried at each split
  importance = TRUE
)
summary(modelo_rf)

##                 Length Class  Mode     
## call               6   -none- call     
## type               1   -none- character
## predicted       4182   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           8364   matrix numeric  
## oob.times       4182   -none- numeric  
## classes            2   -none- character
## importance        76   -none- numeric  
## importanceSD      57   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y               4182   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

# Evaluando el modelo (Etapa 2: Evaluación del modelo)
print(modelo_rf)

## 
## Call:
##  randomForest(formula = prom2 ~ camp + gen + estr + plant + facul +      rplc + rpmat + rpsc + rpcn + rping + edad + nlibro + depar +      estra + depar_A + gustolect + apoyemoc + apofin + pag_semes,      dataset = inclusion_entrenamiento, ntree = 500, mtry = 4,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 29.2%
## Confusion matrix:
##      <3,8 >3,8 class.error
## <3,8 1100  686   0.3840985
## >3,8  535 1861   0.2232888

Neural networks

# Cargar las librerías necesarias
library(nnet)  # Para la red neuronal
library(caret)  # Para la matriz de confusión
library(dplyr)  # Para manipulación de datos

# Fijar la semilla para la reproducibilidad
set.seed(2927)

# Verificar si la columna 'prom2' está en el dataset
print("Columnas en el dataset:")

## [1] "Columnas en el dataset:"

print(colnames(dataset))

##  [1] "Camp"          "Gen"           "Estr"          "Plant"        
##  [5] "Facul"         "RPLC"          "RPMAT"         "RPSC"         
##  [9] "RPCN"          "RPING"         "Edad"          "Nlibro"       
## [13] "Depar"         "Estra"         "Depar_A"       "Gustolect"    
## [17] "Apoyemoc"      "Apofin"        "Pago_Semestre" "Prom2"        
## [21] "Prom"

# Verificar si la columna 'prom2' tiene valores
print("Revisar los primeros registros de 'prom2':")

## [1] "Revisar los primeros registros de 'prom2':"

# The column is named `Prom2` (capital P) in `dataset`; `dataset$prom2` does
# not exist, which is why this check previously printed NULL.
print(head(dataset$Prom2))

## NULL

# Modelling table: the target and the predictors used in the formula below,
# converted to factors and stored under the lower-case names the formula uses.
# The formula variables must be columns of the `data` argument; otherwise
# nnet() resolves them to the full-length workspace vectors and the network is
# fitted on every row instead of the training sample.
columnas_modelo <- c(
  prom2 = "Prom2", camp = "Camp", gen = "Gen", estr = "Estr",
  plant = "Plant", facul = "Facul", rplc = "RPLC", rpmat = "RPMAT",
  rpsc = "RPSC", rpcn = "RPCN", rping = "RPING", edad = "Edad",
  nlibro = "Nlibro", depar = "Depar", estra = "Estra",
  depar_A = "Depar_A", gustolect = "Gustolect", apoyemoc = "Apoyemoc",
  apofin = "Apofin", pag_semes = "Pago_Semestre"
)
datos_modelo <- as.data.frame(lapply(dataset[unname(columnas_modelo)], factor))
names(datos_modelo) <- names(columnas_modelo)

# Split into training (70%) and test (30%) sets by row index.
# setdiff() on data frames treats rows as a set and drops duplicates, so it
# would remove from the test set every row whose values also occur in the
# training set; indexing keeps exactly the rows that were not sampled.
n_total <- nrow(datos_modelo)
filas_entrenamiento <- sample.int(n_total, size = round(0.7 * n_total))
inclusion_entrenamiento <- datos_modelo[filas_entrenamiento, ]
inclusion_prueba <- datos_modelo[-filas_entrenamiento, ]

# Train the neural network with nnet()
# Network with 5 hidden nodes
modelo_nn <- nnet(
  formula = prom2 ~ camp + gen + estr + plant + facul
            + rplc + rpmat + rpsc + rpcn + rping
            + edad + nlibro + depar + estra + depar_A
            + gustolect + apoyemoc + apofin + pag_semes, 
  data = inclusion_entrenamiento, 
  size = 5,   # Number of neurons in the hidden layer
  maxit = 500, # Maximum number of iterations
  decay = 0.1, # Regularisation (weight decay) parameter
  trace = FALSE # Do not print the optimiser output at each iteration
)

summary(modelo_nn)

## a 45-5-1 network with 236 weights
## options were - entropy fitting  decay=0.1
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1  i9->h1 
##   -3.36    2.97    2.55   -0.47   -1.84   -0.59   -0.19    1.21    3.37   -1.94 
## i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 i18->h1 i19->h1 
##   -1.41    3.35   -1.47   -1.27    3.97    3.57    0.78    0.74   -0.16   -0.59 
## i20->h1 i21->h1 i22->h1 i23->h1 i24->h1 i25->h1 i26->h1 i27->h1 i28->h1 i29->h1 
##   -0.25   -0.24    2.16   -0.29    0.19    0.93   -0.34   -0.39   -0.13   -0.46 
## i30->h1 i31->h1 i32->h1 i33->h1 i34->h1 i35->h1 i36->h1 i37->h1 i38->h1 i39->h1 
##    0.40   -0.60    0.01   -2.29    1.36   -0.39   -0.13   -0.46    0.59   -1.18 
## i40->h1 i41->h1 i42->h1 i43->h1 i44->h1 i45->h1 
##    0.57    0.34    0.18   -0.01   -0.78   -0.26 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2  i9->h2 
##    1.69   -1.03   -2.14    2.69    1.26   -0.32    0.48   -0.27    0.02    0.46 
## i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 i18->h2 i19->h2 
##    1.36    1.85   -0.44   -1.43   -1.21   -0.53   -2.43   -0.42    0.21    2.11 
## i20->h2 i21->h2 i22->h2 i23->h2 i24->h2 i25->h2 i26->h2 i27->h2 i28->h2 i29->h2 
##    0.29    0.77    0.55    2.01    0.93   -0.19    0.58    0.00   -0.51   -0.97 
## i30->h2 i31->h2 i32->h2 i33->h2 i34->h2 i35->h2 i36->h2 i37->h2 i38->h2 i39->h2 
##   -0.74    0.09   -0.41    1.45   -0.70    0.00   -0.51   -0.97    0.92    0.66 
## i40->h2 i41->h2 i42->h2 i43->h2 i44->h2 i45->h2 
##   -0.07   -1.82   -0.90   -0.48    2.38   -0.95 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3  i5->h3  i6->h3  i7->h3  i8->h3  i9->h3 
##   -1.17    0.91    0.18   -0.72    0.82    0.05    0.49   -1.26    0.32   -2.83 
## i10->h3 i11->h3 i12->h3 i13->h3 i14->h3 i15->h3 i16->h3 i17->h3 i18->h3 i19->h3 
##    0.29    1.09    1.47    1.19    1.78    1.50    0.58   -0.31    0.30   -0.57 
## i20->h3 i21->h3 i22->h3 i23->h3 i24->h3 i25->h3 i26->h3 i27->h3 i28->h3 i29->h3 
##    1.76    1.43    0.98    0.06   -1.22   -1.06   -1.15    0.33    0.10    0.60 
## i30->h3 i31->h3 i32->h3 i33->h3 i34->h3 i35->h3 i36->h3 i37->h3 i38->h3 i39->h3 
##   -0.45    0.28   -0.23   -0.18   -1.86    0.33    0.10    0.60   -0.15    0.75 
## i40->h3 i41->h3 i42->h3 i43->h3 i44->h3 i45->h3 
##    0.47   -0.71   -1.37   -0.68   -1.31   -0.11 
##   b->h4  i1->h4  i2->h4  i3->h4  i4->h4  i5->h4  i6->h4  i7->h4  i8->h4  i9->h4 
##    2.85    0.48   -2.41    0.84    1.49   -0.35   -0.39   -4.12   -0.85   -1.53 
## i10->h4 i11->h4 i12->h4 i13->h4 i14->h4 i15->h4 i16->h4 i17->h4 i18->h4 i19->h4 
##    0.81    2.47   -0.34   -1.12    0.32    0.65   -2.32   -0.24    2.07    2.81 
## i20->h4 i21->h4 i22->h4 i23->h4 i24->h4 i25->h4 i26->h4 i27->h4 i28->h4 i29->h4 
##    2.00    0.94    2.96    1.46    2.04    1.01    1.48   -0.74   -1.02   -1.38 
## i30->h4 i31->h4 i32->h4 i33->h4 i34->h4 i35->h4 i36->h4 i37->h4 i38->h4 i39->h4 
##   -0.99    0.09   -0.45    1.59    0.13   -0.74   -1.02   -1.38    0.91   -0.67 
## i40->h4 i41->h4 i42->h4 i43->h4 i44->h4 i45->h4 
##   -0.54   -2.75   -0.73   -0.83    0.89   -2.09 
##   b->h5  i1->h5  i2->h5  i3->h5  i4->h5  i5->h5  i6->h5  i7->h5  i8->h5  i9->h5 
##   -0.80   -0.62   -0.56   -0.10    1.06   -0.02   -0.71   -0.38    0.25    0.30 
## i10->h5 i11->h5 i12->h5 i13->h5 i14->h5 i15->h5 i16->h5 i17->h5 i18->h5 i19->h5 
##    0.31   -1.46    0.95    1.52    0.28    0.14   -1.52   -2.01   -1.29   -1.18 
## i20->h5 i21->h5 i22->h5 i23->h5 i24->h5 i25->h5 i26->h5 i27->h5 i28->h5 i29->h5 
##    2.29    1.64    0.51    0.46   -0.87   -1.25   -0.74    0.77   -0.61    0.94 
## i30->h5 i31->h5 i32->h5 i33->h5 i34->h5 i35->h5 i36->h5 i37->h5 i38->h5 i39->h5 
##   -0.98    0.28   -0.30    0.12   -1.97    0.77   -0.61    0.94   -0.59   -0.41 
## i40->h5 i41->h5 i42->h5 i43->h5 i44->h5 i45->h5 
##   -0.20   -1.53   -1.34   -0.75   -1.75    0.00 
##  b->o h1->o h2->o h3->o h4->o h5->o 
## -0.04 -3.16 -5.89  3.69  5.25 -4.49

print(modelo_nn)

## a 45-5-1 network with 236 weights
## inputs: campCuc campVal genMasc estrBajo estrMedio plantPub faculExac faculIng faculSal faculSoc faculTec rplcRB rplcRM rpmatRB rpmatRM rpscRB rpscRM rpcnRB rpcnRM rpingRB rpingRM edad>26 edadE21-25 nlibroE0-10 nlibroE11-25 nlibroE26-100 deparNort deparOtro deparSant estraEst2 estraEst3 estraEst4 estraEst5 estraEst6 depar_ANort depar_AOtro depar_ASant gustolectSi apoyemocSi apofinSi pag_semesBecas pag_semesCred pag_semesEfect pag_semesICETEX pag_semesOtro 
## output(s): prom2 
## options were - entropy fitting  decay=0.1

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.