A continuación, se presenta cómo se distribuyen los datos.
Se carga la base de datos.
library (tidyverse)
library(readxl)
library(moments)
library(magrittr)
library(Hmisc)
library(ggplot2)
library (tidyverse)
library(dplyr)
library(plotly)
library(gridExtra)
library(rpart)
library(rpart.plot)
library(randomForest)
library(neuralnet)
library(ggplot2)
library(caret) # Para la partición
library(pacman)

# Read the survey workbook into `dataset`.
# NOTE(review): the path is an absolute, machine-specific Windows path; the
# script only runs where this exact file exists.
dataset <- read_excel("C:/Users/coordinador.analitic/OneDrive - Universidad de Santander/ArtículoImportEnseñanzaDrPatricio/Base de datos/CuantilUDES3final.xlsx")
#View(dataset)

# NOTE(review): attach() is discouraged (it puts a copy of the columns on the
# search path). It is kept here only because later code refers to bare column
# names such as `Camp` and `Gen`.
attach(dataset)

# List the column names of the loaded data.
names(dataset)
## [1] "Camp" "Gen" "Estr" "Plant"
## [5] "Facul" "RPLC" "RPMAT" "RPSC"
## [9] "RPCN" "RPING" "Edad" "Nlibro"
## [13] "Depar" "Estra" "Depar_A" "Gustolect"
## [17] "Apoyemoc" "Apofin" "Pago_Semestre" "Prom2"
## [21] "Prom"
## tibble [4,182 × 21] (S3: tbl_df/tbl/data.frame)
## $ Camp : chr [1:4182] "Cuc" "Cuc" "Cuc" "Cuc" ...
## $ Gen : chr [1:4182] "Masc" "Fem" "Fem" "Fem" ...
## $ Estr : chr [1:4182] "Bajo" "Medio" "Medio" "Bajo" ...
## $ Plant : chr [1:4182] "Pub" "Priv" "Pub" "Pub" ...
## $ Facul : chr [1:4182] "Ing" "Econ" "Soc" "Econ" ...
## $ RPLC : chr [1:4182] "RA" "RM" "RM" "RB" ...
## $ RPMAT : chr [1:4182] "RM" "RB" "RM" "RB" ...
## $ RPSC : chr [1:4182] "RM" "RB" "RM" "RB" ...
## $ RPCN : chr [1:4182] "RM" "RB" "RM" "RB" ...
## $ RPING : chr [1:4182] "RM" "RM" "RM" "RM" ...
## $ Edad : chr [1:4182] "<20" "<20" "<20" "<20" ...
## $ Nlibro : chr [1:4182] "E0-10" "E11-25" "E11-25" "E11-25" ...
## $ Depar : chr [1:4182] "Nort" "Nort" "Nort" "Nort" ...
## $ Estra : chr [1:4182] "Est1" "Est3" "Est3" "Est1" ...
## $ Depar_A : chr [1:4182] "Nort" "Nort" "Nort" "Nort" ...
## $ Gustolect : chr [1:4182] "Si" "Si" "Si" "Si" ...
## $ Apoyemoc : chr [1:4182] "Si" "Si" "Si" "Si" ...
## $ Apofin : chr [1:4182] "Si" "Si" "Si" "Si" ...
## $ Pago_Semestre: chr [1:4182] "Efect" "Efect" "Efect" "Efect" ...
## $ Prom2 : chr [1:4182] ">3,8" ">3,8" ">3,8" ">3,8" ...
## $ Prom : num [1:4182] 5 4.89 4.85 4.83 4.81 4.8 4.8 4.79 4.79 4.79 ...
## dataset$Camp
## n missing distinct
## 4182 0 3
##
## Value Buc Cuc Val
## Frequency 1878 1151 1153
## Proportion 0.449 0.275 0.276
## dataset$Gen
## n missing distinct
## 4182 0 2
##
## Value Fem Masc
## Frequency 2628 1554
## Proportion 0.628 0.372
## dataset$Estr
## n missing distinct
## 4182 0 3
##
## Value Alto Bajo Medio
## Frequency 165 2395 1622
## Proportion 0.039 0.573 0.388
## dataset$Plant
## n missing distinct
## 4182 0 2
##
## Value Priv Pub
## Frequency 1870 2312
## Proportion 0.447 0.553
## dataset$Facul
## n missing distinct
## 4182 0 6
##
## Value Econ Exac Ing Sal Soc Tec
## Frequency 800 326 526 1740 656 134
## Proportion 0.191 0.078 0.126 0.416 0.157 0.032
## dataset$RPLC
## n missing distinct
## 4182 0 3
##
## Value RA RB RM
## Frequency 722 644 2816
## Proportion 0.173 0.154 0.673
## dataset$RPMAT
## n missing distinct
## 4182 0 3
##
## Value RA RB RM
## Frequency 653 640 2889
## Proportion 0.156 0.153 0.691
## dataset$RPSC
## n missing distinct
## 4182 0 3
##
## Value RA RB RM
## Frequency 749 663 2770
## Proportion 0.179 0.159 0.662
## dataset$RPCN
## n missing distinct
## 4182 0 3
##
## Value RA RB RM
## Frequency 733 645 2804
## Proportion 0.175 0.154 0.670
## dataset$RPING
## n missing distinct
## 4182 0 3
##
## Value RA RB RM
## Frequency 759 704 2719
## Proportion 0.181 0.168 0.650
## dataset$Edad
## n missing distinct
## 4182 0 3
##
## Value <20 >26 E21-25
## Frequency 3499 120 563
## Proportion 0.837 0.029 0.135
## dataset$Nlibro
## n missing distinct
## 4182 0 4
##
## Value >100 E0-10 E11-25 E26-100
## Frequency 254 1938 1196 794
## Proportion 0.061 0.463 0.286 0.190
## dataset$Depar
## n missing distinct
## 4182 0 4
##
## Value Cesar Nort Otro Sant
## Frequency 894 1051 1056 1181
## Proportion 0.214 0.251 0.253 0.282
## dataset$Estra
## n missing distinct
## 4182 0 6
##
## Value Est1 Est2 Est3 Est4 Est5 Est6
## Frequency 1009 1386 1087 535 134 31
## Proportion 0.241 0.331 0.260 0.128 0.032 0.007
##
## NULL
## dataset$Depar_A
## n missing distinct
## 4182 0 4
##
## Value Cesar Nort Otro Sant
## Frequency 894 1051 1056 1181
## Proportion 0.214 0.251 0.253 0.282
## dataset$Gustolect
## n missing distinct
## 4182 0 2
##
## Value No Si
## Frequency 1284 2898
## Proportion 0.307 0.693
## dataset$Apoyemoc
## n missing distinct
## 4182 0 2
##
## Value No Si
## Frequency 482 3700
## Proportion 0.115 0.885
## dataset$Apofin
## n missing distinct
## 4182 0 2
##
## Value No Si
## Frequency 1775 2407
## Proportion 0.424 0.576
## dataset$Pago_Semestre
## n missing distinct
## 4182 0 6
##
## Value Autof Becas Cred Efect ICETEX Otro
## Frequency 137 126 1035 1964 621 299
## Proportion 0.033 0.030 0.247 0.470 0.148 0.071
##
## NULL
## dataset$Prom
## n missing distinct Info Mean Gmd .05 .10
## 4182 0 296 1 3.788 0.6022 2.96 3.26
## .25 .50 .75 .90 .95
## 3.59 3.87 4.16 4.39 4.51
##
## lowest : 0 0.03 0.07 0.11 0.12, highest: 4.81 4.83 4.85 4.89 5
## dataset$Prom2
## n missing distinct
## 4182 0 2
##
## Value <3,8 >3,8
## Frequency 1786 2396
## Proportion 0.427 0.573
##
## NULL
## Camp Gen Estr Plant Facul
## 4182 4182 4182 4182 4182
## RPLC RPMAT RPSC RPCN RPING
## 4182 4182 4182 4182 4182
## Edad Nlibro Depar Estra Depar_A
## 4182 4182 4182 4182 4182
## Gustolect Apoyemoc Apofin Pago_Semestre Prom2
## 4182 4182 4182 4182 4182
## Prom
## 4182
# Convert every categorical column into a stand-alone factor vector.
# Each vector has one entry per row of `dataset`, in the same row order.

# Campus, gender, socioeconomic level, school type and faculty.
camp  <- factor(dataset[["Camp"]])
gen   <- factor(dataset[["Gen"]])
estr  <- factor(dataset[["Estr"]])
plant <- factor(dataset[["Plant"]])
facul <- factor(dataset[["Facul"]])

# The five RP* result columns.
rplc  <- factor(dataset[["RPLC"]])
rpmat <- factor(dataset[["RPMAT"]])
rpsc  <- factor(dataset[["RPSC"]])
rpcn  <- factor(dataset[["RPCN"]])
rping <- factor(dataset[["RPING"]])

# Age band, number of books, department and stratum columns.
edad    <- factor(dataset[["Edad"]])
nlibro  <- factor(dataset[["Nlibro"]])
depar   <- factor(dataset[["Depar"]])
estra   <- factor(dataset[["Estra"]])
depar_A <- factor(dataset[["Depar_A"]])

# Yes/no answers and the semester payment method.
gustolect <- factor(dataset[["Gustolect"]])
apoyemoc  <- factor(dataset[["Apoyemoc"]])
apofin    <- factor(dataset[["Apofin"]])
pag_semes <- factor(dataset[["Pago_Semestre"]])

# Two-level grade-average class.
prom2 <- factor(dataset[["Prom2"]])
# Use ggplot2 to generate distribution plots for Prom
# Distributions
# Density of Prom: the dotted curve is the overall density, the filled curves
# are the densities for each level of Gen.
p01 <- ggplot2::ggplot(
  data = dplyr::transmute(dataset, Prom, Gen = as.factor(Gen)),
  mapping = ggplot2::aes(x = Prom)
) +
  ggplot2::geom_density(lty = 3) +
  ggplot2::geom_density(
    mapping = ggplot2::aes(fill = Gen, colour = Gen),
    alpha = 0.55
  ) +
  ggplot2::labs(
    title = "Distribution of Prom by Gen",
    x = "Prom",
    y = "Density",
    fill = "Gen"
  )
# Display the plot.
gridExtra::grid.arrange(p01)

# Density of Prom: the dotted curve is the overall density, the filled curves
# are the densities for each level of Facul.
# Fix: the original converted Facul to a factor but stored it in `Gen`
# (copy-paste slip), leaving Facul itself unconverted.
p02 <- dataset %>%
  dplyr::mutate(Facul = as.factor(Facul)) %>%
  dplyr::select(Prom, Facul) %>%
  ggplot2::ggplot(ggplot2::aes(x = Prom)) +
  ggplot2::geom_density(lty = 3) +
  ggplot2::geom_density(
    ggplot2::aes(fill = Facul, colour = Facul),
    alpha = 0.55
  ) +
  ggplot2::xlab("Prom") +
  ggplot2::ggtitle("Distribution of Prom by Facul") +
  ggplot2::labs(fill = "Facul", y = "Density")
# Display the plot.
gridExtra::grid.arrange(p02)

# Density of Prom: the dotted curve is the overall density, the filled curves
# are the densities for each level of Camp.
# Fixes: (1) the original converted Camp to a factor but stored it in `Gen`;
# (2) the x-axis was labelled "Camp" although the x aesthetic is Prom.
p03 <- dataset %>%
  dplyr::mutate(Camp = as.factor(Camp)) %>%
  dplyr::select(Prom, Camp) %>%
  ggplot2::ggplot(ggplot2::aes(x = Prom)) +
  ggplot2::geom_density(lty = 3) +
  ggplot2::geom_density(
    ggplot2::aes(fill = Camp, colour = Camp),
    alpha = 0.55
  ) +
  ggplot2::xlab("Prom") +
  ggplot2::ggtitle("Distribution of Prom by Camp") +
  ggplot2::labs(fill = "Camp", y = "Density")
# Display the plot.
gridExtra::grid.arrange(p03)

# Build a dodged bar chart of row counts per campus (Camp), with one bar per
# level of `relleno`.
#
#   relleno    Factor with one value per row of `dataset`; mapped to `fill`.
#   etiqueta   Legend title for the fill scale.
#   con_conteo If TRUE (default), print each bar's count just above it.
#
# Returns the ggplot object; at top level it is auto-printed.
grafico_barras_campus <- function(relleno, etiqueta, con_conteo = TRUE) {
  # Evaluate the argument now so the plot does not depend on lazy evaluation.
  force(relleno)

  grafico <- ggplot(data = dataset, aes(x = Camp, fill = relleno)) +
    geom_bar(position = "dodge") +
    theme_minimal()

  if (con_conteo) {
    # after_stat(count) replaces the deprecated `..count..` notation.
    grafico <- grafico +
      geom_text(
        stat = "count",
        aes(label = after_stat(count)),
        position = position_dodge(width = 0.9),
        vjust = -0.5
      )
  }

  grafico + labs(fill = etiqueta)
}

# One chart per categorical variable, in the original order. Each call is a
# separate top-level statement (the original statements were run together).
grafico_barras_campus(gen, "Gender")
grafico_barras_campus(estr, "Level")
grafico_barras_campus(facul, "Facultad")
grafico_barras_campus(rplc, "RPLC")
grafico_barras_campus(rpmat, "RPMAT")
grafico_barras_campus(rpsc, "RPSC")
grafico_barras_campus(rpcn, "RPCN")
grafico_barras_campus(rping, "RPING")
grafico_barras_campus(edad, "Edad")
grafico_barras_campus(nlibro, "Nlibro")
grafico_barras_campus(depar, "Depar")
grafico_barras_campus(gustolect, "Gustolect")
# NOTE(review): the original Apoyemoc chart is the only one without count
# labels. That is preserved here; confirm whether the omission was intended.
grafico_barras_campus(apoyemoc, "Apoyemoc", con_conteo = FALSE)
grafico_barras_campus(apofin, "Apofin")
grafico_barras_campus(pag_semes, "Pag_semes")

In building our model we used a 70% subset of all of the available data. We call the 70% sample the training dataset. The remainder is split equally into a validation dataset (15%) and a testing dataset (15%). Action the user selections from the Data tab. Build the train/validate/test datasets. nobs=4182 train=2927 validate=627 test=628
# https://rpubs.com/jboscomendoza/arboles_decision_clasificacion
# Cargar las librerías necesarias
library(dplyr)
library(rpart)
library(rpart.plot)
library(caret) # Necesaria para confusionMatrix
# Fix the seed so the train/test partition is reproducible.
set.seed(2927)

# Add the lowercase factor versions of the predictors and of the response as
# columns of the modelling frame. The original formula referred to lowercase
# names that are not columns of the training frame, so rpart picked up the
# full-length global vectors and fitted the tree on every row (the summary
# printed below reports n= 4182) instead of on the 70% sample.
# Factors are created before splitting so both partitions share all levels.
datos_modelo <- dplyr::mutate(
  dataset,
  prom2 = factor(Prom2),
  camp = factor(Camp),
  gen = factor(Gen),
  estr = factor(Estr),
  plant = factor(Plant),
  facul = factor(Facul),
  rplc = factor(RPLC),
  rpmat = factor(RPMAT),
  rpsc = factor(RPSC),
  rpcn = factor(RPCN),
  rping = factor(RPING),
  edad = factor(Edad),
  nlibro = factor(Nlibro),
  depar = factor(Depar),
  estra = factor(Estra),
  depar_A = factor(Depar_A),
  gustolect = factor(Gustolect),
  apoyemoc = factor(Apoyemoc),
  apofin = factor(Apofin),
  pag_semes = factor(Pago_Semestre)
)

# Split by row index (70% training, 30% test). setdiff() on data frames also
# removes duplicated rows, so it does not return the exact complement.
n_obs <- nrow(datos_modelo)
filas_entrenamiento <- sample.int(n_obs, size = round(0.7 * n_obs))
inclusion_entrenamiento <- datos_modelo[filas_entrenamiento, ]
inclusion_prueba <- datos_modelo[-filas_entrenamiento, ]

# Fit the classification tree on the training rows only.
arbol_1 <- rpart(
  formula = prom2 ~ camp + gen + estr + plant + facul
    + rplc + rpmat + rpsc + rpcn + rping
    + edad + nlibro + depar + estra + depar_A
    + gustolect + apoyemoc + apofin
    + pag_semes,
  data = inclusion_entrenamiento,
  method = "class"
)

# NOTE(review): the printed output that follows was produced before this fix
# (it reports n= 4182); re-run the chunk to refresh it.
summary(arbol_1)
## Call:
## rpart(formula = prom2 ~ camp + gen + estr + plant + facul + rplc +
## rpmat + rpsc + rpcn + rping + edad + nlibro + depar + estra +
## depar_A + gustolect + apoyemoc + apofin + pag_semes, data = inclusion_entrenamiento)
## n= 4182
##
## CP nsplit rel error xerror xstd
## 1 0.12765957 0 1.0000000 1.0000000 0.01791062
## 2 0.01213139 2 0.7446809 0.7446809 0.01686270
## 3 0.01000000 6 0.6892497 0.7066069 0.01662064
##
## Variable importance
## rpcn facul rpmat rpsc rping rplc gen
## 36 34 11 6 5 5 3
##
## Node number 1: 4182 observations, complexity param=0.1276596
## predicted class=>3,8 expected loss=0.4270684 P(node) =1
## class counts: 1786 2396
## probabilities: 0.427 0.573
## left son=2 (2592 obs) right son=3 (1590 obs)
## Primary splits:
## facul splits as RLLLRR, improve=167.21030, (0 missing)
## rpsc splits as LRR, improve=102.05210, (0 missing)
## rpcn splits as LRR, improve= 98.96977, (0 missing)
## rplc splits as LRR, improve= 98.65563, (0 missing)
## rping splits as LRR, improve= 88.55702, (0 missing)
## Surrogate splits:
## estra splits as LLLLLR, agree=0.62, adj=0.001, (0 split)
##
## Node number 2: 2592 observations, complexity param=0.1276596
## predicted class=<3,8 expected loss=0.4621914 P(node) =0.6197991
## class counts: 1394 1198
## probabilities: 0.538 0.462
## left son=4 (2152 obs) right son=5 (440 obs)
## Primary splits:
## rpcn splits as LRL, improve=117.72000, (0 missing)
## rpsc splits as LRL, improve= 88.49848, (0 missing)
## rpmat splits as LRL, improve= 80.62461, (0 missing)
## rplc splits as LRL, improve= 76.37716, (0 missing)
## rping splits as LRR, improve= 69.21944, (0 missing)
## Surrogate splits:
## rpmat splits as LRL, agree=0.876, adj=0.270, (0 split)
## rpsc splits as LRL, agree=0.864, adj=0.198, (0 split)
## rplc splits as LRL, agree=0.856, adj=0.150, (0 split)
## rping splits as LRL, agree=0.842, adj=0.070, (0 split)
##
## Node number 3: 1590 observations
## predicted class=>3,8 expected loss=0.2465409 P(node) =0.3802009
## class counts: 392 1198
## probabilities: 0.247 0.753
##
## Node number 4: 2152 observations, complexity param=0.01213139
## predicted class=<3,8 expected loss=0.394052 P(node) =0.5145863
## class counts: 1304 848
## probabilities: 0.606 0.394
## left son=8 (414 obs) right son=9 (1738 obs)
## Primary splits:
## rpcn splits as L-R, improve=58.78933, (0 missing)
## rpsc splits as LRR, improve=47.56854, (0 missing)
## rplc splits as LRR, improve=42.20266, (0 missing)
## rpmat splits as LRR, improve=41.08405, (0 missing)
## rping splits as LRR, improve=37.11269, (0 missing)
## Surrogate splits:
## rpmat splits as LRR, agree=0.844, adj=0.188, (0 split)
## rplc splits as LRR, agree=0.827, adj=0.099, (0 split)
## rpsc splits as LRR, agree=0.825, adj=0.089, (0 split)
##
## Node number 5: 440 observations
## predicted class=>3,8 expected loss=0.2045455 P(node) =0.1052128
## class counts: 90 350
## probabilities: 0.205 0.795
##
## Node number 8: 414 observations
## predicted class=<3,8 expected loss=0.1545894 P(node) =0.0989957
## class counts: 350 64
## probabilities: 0.845 0.155
##
## Node number 9: 1738 observations, complexity param=0.01213139
## predicted class=<3,8 expected loss=0.4510932 P(node) =0.4155906
## class counts: 954 784
## probabilities: 0.549 0.451
## left son=18 (246 obs) right son=19 (1492 obs)
## Primary splits:
## rping splits as LRR, improve=17.48578, (0 missing)
## gen splits as RL, improve=17.11323, (0 missing)
## rplc splits as LRR, improve=15.38099, (0 missing)
## rpsc splits as LRR, improve=15.01757, (0 missing)
## facul splits as -LRR--, improve=13.24850, (0 missing)
##
## Node number 18: 246 observations
## predicted class=<3,8 expected loss=0.2764228 P(node) =0.05882353
## class counts: 178 68
## probabilities: 0.724 0.276
##
## Node number 19: 1492 observations, complexity param=0.01213139
## predicted class=<3,8 expected loss=0.4798928 P(node) =0.3567671
## class counts: 776 716
## probabilities: 0.520 0.480
## left son=38 (491 obs) right son=39 (1001 obs)
## Primary splits:
## gen splits as RL, improve=16.815360, (0 missing)
## rpsc splits as LRR, improve=12.429930, (0 missing)
## facul splits as -LRR--, improve= 9.674061, (0 missing)
## rpmat splits as LRR, improve= 8.931678, (0 missing)
## rplc splits as LRR, improve= 8.482659, (0 missing)
## Surrogate splits:
## facul splits as -RLR--, agree=0.731, adj=0.181, (0 split)
## estra splits as RRRRRL, agree=0.672, adj=0.004, (0 split)
##
## Node number 38: 491 observations
## predicted class=<3,8 expected loss=0.3727088 P(node) =0.1174079
## class counts: 308 183
## probabilities: 0.627 0.373
##
## Node number 39: 1001 observations, complexity param=0.01213139
## predicted class=>3,8 expected loss=0.4675325 P(node) =0.2393592
## class counts: 468 533
## probabilities: 0.468 0.532
## left son=78 (70 obs) right son=79 (931 obs)
## Primary splits:
## rpmat splits as LRR, improve=11.410450, (0 missing)
## camp splits as LRL, improve= 8.735124, (0 missing)
## rpsc splits as LRR, improve= 7.600517, (0 missing)
## facul splits as -LRR--, improve= 6.084253, (0 missing)
## rplc splits as LRL, improve= 5.050962, (0 missing)
##
## Node number 78: 70 observations
## predicted class=<3,8 expected loss=0.2571429 P(node) =0.0167384
## class counts: 52 18
## probabilities: 0.743 0.257
##
## Node number 79: 931 observations
## predicted class=>3,8 expected loss=0.4468314 P(node) =0.2226208
## class counts: 416 515
## probabilities: 0.447 0.553
## n= 4182
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4182 1786 >3,8 (0.4270684 0.5729316)
## 2) facul=Exac,Ing,Sal 2592 1198 <3,8 (0.5378086 0.4621914)
## 4) rpcn=RA,RM 2152 848 <3,8 (0.6059480 0.3940520)
## 8) rpcn=RA 414 64 <3,8 (0.8454106 0.1545894) *
## 9) rpcn=RM 1738 784 <3,8 (0.5489068 0.4510932)
## 18) rping=RA 246 68 <3,8 (0.7235772 0.2764228) *
## 19) rping=RB,RM 1492 716 <3,8 (0.5201072 0.4798928)
## 38) gen=Masc 491 183 <3,8 (0.6272912 0.3727088) *
## 39) gen=Fem 1001 468 >3,8 (0.4675325 0.5324675)
## 78) rpmat=RA 70 18 <3,8 (0.7428571 0.2571429) *
## 79) rpmat=RB,RM 931 416 >3,8 (0.4468314 0.5531686) *
## 5) rpcn=RB 440 90 >3,8 (0.2045455 0.7954545) *
## 3) facul=Econ,Soc,Tec 1590 392 >3,8 (0.2465409 0.7534591) *
library(randomForest)
library(datasets)
library(caret)
# Fix the seed so the train/test partition and the forest are reproducible.
set.seed(2927)

# Add the lowercase factor versions of the predictors and of the response as
# columns of the modelling frame. The original call passed the training frame
# as `dataset =`, which is not a randomForest() argument, and its formula used
# lowercase names that are not columns of that frame. The forest was therefore
# fitted on the full-length global vectors: the confusion matrix printed below
# adds up to all 4182 rows.
# Factors are created before splitting so both partitions share all levels.
datos_modelo <- dplyr::mutate(
  dataset,
  prom2 = factor(Prom2),
  camp = factor(Camp),
  gen = factor(Gen),
  estr = factor(Estr),
  plant = factor(Plant),
  facul = factor(Facul),
  rplc = factor(RPLC),
  rpmat = factor(RPMAT),
  rpsc = factor(RPSC),
  rpcn = factor(RPCN),
  rping = factor(RPING),
  edad = factor(Edad),
  nlibro = factor(Nlibro),
  depar = factor(Depar),
  estra = factor(Estra),
  depar_A = factor(Depar_A),
  gustolect = factor(Gustolect),
  apoyemoc = factor(Apoyemoc),
  apofin = factor(Apofin),
  pag_semes = factor(Pago_Semestre)
)

# Split by row index (70% training, 30% test). setdiff() on data frames also
# removes duplicated rows, so it does not return the exact complement.
n_obs <- nrow(datos_modelo)
filas_entrenamiento <- sample.int(n_obs, size = round(0.7 * n_obs))
inclusion_entrenamiento <- datos_modelo[filas_entrenamiento, ]
inclusion_prueba <- datos_modelo[-filas_entrenamiento, ]

# Train the random forest on the training rows only.
modelo_rf <- randomForest(
  formula = prom2 ~ camp + gen + estr + plant + facul
    + rplc + rpmat + rpsc + rpcn + rping
    + edad + nlibro + depar + estra + depar_A
    + gustolect + apoyemoc + apofin + pag_semes,
  data = inclusion_entrenamiento,
  ntree = 500, # number of trees
  mtry = 4, # number of predictors tried at each split
  importance = TRUE
)

# NOTE(review): the printed output that follows was produced before this fix
# (it covers 4182 rows); re-run the chunk to refresh it.
summary(modelo_rf)
## Length Class Mode
## call 6 -none- call
## type 1 -none- character
## predicted 4182 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 8364 matrix numeric
## oob.times 4182 -none- numeric
## classes 2 -none- character
## importance 76 -none- numeric
## importanceSD 57 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 4182 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
##
## Call:
## randomForest(formula = prom2 ~ camp + gen + estr + plant + facul + rplc + rpmat + rpsc + rpcn + rping + edad + nlibro + depar + estra + depar_A + gustolect + apoyemoc + apofin + pag_semes, dataset = inclusion_entrenamiento, ntree = 500, mtry = 4, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 29.2%
## Confusion matrix:
## <3,8 >3,8 class.error
## <3,8 1100 686 0.3840985
## >3,8 535 1861 0.2232888
# Cargar las librerías necesarias
library(nnet) # Para la red neuronal
library(caret) # Para la matriz de confusión
library(dplyr) # Para manipulación de datos
# Fix the seed so the train/test partition and the network are reproducible.
set.seed(2927)
# Check whether the 'prom2' column is in the dataset.
# NOTE(review): the names printed below list `Prom2`, not `prom2`, and the
# check prints NULL; a lowercase `prom2` column does not appear to exist.
print("Columnas en el dataset:")
## [1] "Columnas en el dataset:"
## [1] "Camp" "Gen" "Estr" "Plant"
## [5] "Facul" "RPLC" "RPMAT" "RPSC"
## [9] "RPCN" "RPING" "Edad" "Nlibro"
## [13] "Depar" "Estra" "Depar_A" "Gustolect"
## [17] "Apoyemoc" "Apofin" "Pago_Semestre" "Prom2"
## [21] "Prom"
## [1] "Revisar los primeros registros de 'prom2':"
## NULL

# Add the lowercase factor versions of the predictors and of the response as
# columns of the modelling frame. The original formula used lowercase names
# that are not columns of the training frame, so they were looked up outside
# it (the full-length global vectors) and the 70% sample was not used.
# Factors are created before splitting so both partitions share all levels.
datos_modelo <- dplyr::mutate(
  dataset,
  prom2 = factor(Prom2),
  camp = factor(Camp),
  gen = factor(Gen),
  estr = factor(Estr),
  plant = factor(Plant),
  facul = factor(Facul),
  rplc = factor(RPLC),
  rpmat = factor(RPMAT),
  rpsc = factor(RPSC),
  rpcn = factor(RPCN),
  rping = factor(RPING),
  edad = factor(Edad),
  nlibro = factor(Nlibro),
  depar = factor(Depar),
  estra = factor(Estra),
  depar_A = factor(Depar_A),
  gustolect = factor(Gustolect),
  apoyemoc = factor(Apoyemoc),
  apofin = factor(Apofin),
  pag_semes = factor(Pago_Semestre)
)

# Split by row index (70% training, 30% test). setdiff() on data frames also
# removes duplicated rows, so it does not return the exact complement.
n_obs <- nrow(datos_modelo)
filas_entrenamiento <- sample.int(n_obs, size = round(0.7 * n_obs))
inclusion_entrenamiento <- datos_modelo[filas_entrenamiento, ]
inclusion_prueba <- datos_modelo[-filas_entrenamiento, ]

# Train a single-hidden-layer network with nnet() on the training rows only.
modelo_nn <- nnet(
  formula = prom2 ~ camp + gen + estr + plant + facul
    + rplc + rpmat + rpsc + rpcn + rping
    + edad + nlibro + depar + estra + depar_A
    + gustolect + apoyemoc + apofin + pag_semes,
  data = inclusion_entrenamiento,
  size = 5, # number of units in the hidden layer
  maxit = 500, # maximum number of iterations
  decay = 0.1, # weight-decay (regularisation) parameter
  trace = FALSE # do not print progress at each iteration
)

# NOTE(review): the printed output that follows was produced before this fix;
# re-run the chunk to refresh it.
summary(modelo_nn)
## a 45-5-1 network with 236 weights
## options were - entropy fitting decay=0.1
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## -3.36 2.97 2.55 -0.47 -1.84 -0.59 -0.19 1.21 3.37 -1.94
## i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 i18->h1 i19->h1
## -1.41 3.35 -1.47 -1.27 3.97 3.57 0.78 0.74 -0.16 -0.59
## i20->h1 i21->h1 i22->h1 i23->h1 i24->h1 i25->h1 i26->h1 i27->h1 i28->h1 i29->h1
## -0.25 -0.24 2.16 -0.29 0.19 0.93 -0.34 -0.39 -0.13 -0.46
## i30->h1 i31->h1 i32->h1 i33->h1 i34->h1 i35->h1 i36->h1 i37->h1 i38->h1 i39->h1
## 0.40 -0.60 0.01 -2.29 1.36 -0.39 -0.13 -0.46 0.59 -1.18
## i40->h1 i41->h1 i42->h1 i43->h1 i44->h1 i45->h1
## 0.57 0.34 0.18 -0.01 -0.78 -0.26
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## 1.69 -1.03 -2.14 2.69 1.26 -0.32 0.48 -0.27 0.02 0.46
## i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 i18->h2 i19->h2
## 1.36 1.85 -0.44 -1.43 -1.21 -0.53 -2.43 -0.42 0.21 2.11
## i20->h2 i21->h2 i22->h2 i23->h2 i24->h2 i25->h2 i26->h2 i27->h2 i28->h2 i29->h2
## 0.29 0.77 0.55 2.01 0.93 -0.19 0.58 0.00 -0.51 -0.97
## i30->h2 i31->h2 i32->h2 i33->h2 i34->h2 i35->h2 i36->h2 i37->h2 i38->h2 i39->h2
## -0.74 0.09 -0.41 1.45 -0.70 0.00 -0.51 -0.97 0.92 0.66
## i40->h2 i41->h2 i42->h2 i43->h2 i44->h2 i45->h2
## -0.07 -1.82 -0.90 -0.48 2.38 -0.95
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## -1.17 0.91 0.18 -0.72 0.82 0.05 0.49 -1.26 0.32 -2.83
## i10->h3 i11->h3 i12->h3 i13->h3 i14->h3 i15->h3 i16->h3 i17->h3 i18->h3 i19->h3
## 0.29 1.09 1.47 1.19 1.78 1.50 0.58 -0.31 0.30 -0.57
## i20->h3 i21->h3 i22->h3 i23->h3 i24->h3 i25->h3 i26->h3 i27->h3 i28->h3 i29->h3
## 1.76 1.43 0.98 0.06 -1.22 -1.06 -1.15 0.33 0.10 0.60
## i30->h3 i31->h3 i32->h3 i33->h3 i34->h3 i35->h3 i36->h3 i37->h3 i38->h3 i39->h3
## -0.45 0.28 -0.23 -0.18 -1.86 0.33 0.10 0.60 -0.15 0.75
## i40->h3 i41->h3 i42->h3 i43->h3 i44->h3 i45->h3
## 0.47 -0.71 -1.37 -0.68 -1.31 -0.11
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h4
## 2.85 0.48 -2.41 0.84 1.49 -0.35 -0.39 -4.12 -0.85 -1.53
## i10->h4 i11->h4 i12->h4 i13->h4 i14->h4 i15->h4 i16->h4 i17->h4 i18->h4 i19->h4
## 0.81 2.47 -0.34 -1.12 0.32 0.65 -2.32 -0.24 2.07 2.81
## i20->h4 i21->h4 i22->h4 i23->h4 i24->h4 i25->h4 i26->h4 i27->h4 i28->h4 i29->h4
## 2.00 0.94 2.96 1.46 2.04 1.01 1.48 -0.74 -1.02 -1.38
## i30->h4 i31->h4 i32->h4 i33->h4 i34->h4 i35->h4 i36->h4 i37->h4 i38->h4 i39->h4
## -0.99 0.09 -0.45 1.59 0.13 -0.74 -1.02 -1.38 0.91 -0.67
## i40->h4 i41->h4 i42->h4 i43->h4 i44->h4 i45->h4
## -0.54 -2.75 -0.73 -0.83 0.89 -2.09
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 i9->h5
## -0.80 -0.62 -0.56 -0.10 1.06 -0.02 -0.71 -0.38 0.25 0.30
## i10->h5 i11->h5 i12->h5 i13->h5 i14->h5 i15->h5 i16->h5 i17->h5 i18->h5 i19->h5
## 0.31 -1.46 0.95 1.52 0.28 0.14 -1.52 -2.01 -1.29 -1.18
## i20->h5 i21->h5 i22->h5 i23->h5 i24->h5 i25->h5 i26->h5 i27->h5 i28->h5 i29->h5
## 2.29 1.64 0.51 0.46 -0.87 -1.25 -0.74 0.77 -0.61 0.94
## i30->h5 i31->h5 i32->h5 i33->h5 i34->h5 i35->h5 i36->h5 i37->h5 i38->h5 i39->h5
## -0.98 0.28 -0.30 0.12 -1.97 0.77 -0.61 0.94 -0.59 -0.41
## i40->h5 i41->h5 i42->h5 i43->h5 i44->h5 i45->h5
## -0.20 -1.53 -1.34 -0.75 -1.75 0.00
## b->o h1->o h2->o h3->o h4->o h5->o
## -0.04 -3.16 -5.89 3.69 5.25 -4.49
## a 45-5-1 network with 236 weights
## inputs: campCuc campVal genMasc estrBajo estrMedio plantPub faculExac faculIng faculSal faculSoc faculTec rplcRB rplcRM rpmatRB rpmatRM rpscRB rpscRM rpcnRB rpcnRM rpingRB rpingRM edad>26 edadE21-25 nlibroE0-10 nlibroE11-25 nlibroE26-100 deparNort deparOtro deparSant estraEst2 estraEst3 estraEst4 estraEst5 estraEst6 depar_ANort depar_AOtro depar_ASant gustolectSi apoyemocSi apofinSi pag_semesBecas pag_semesCred pag_semesEfect pag_semesICETEX pag_semesOtro
## output(s): prom2
## options were - entropy fitting decay=0.1
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.