Este es un documento R Markdown. Contiene información de la base de datos relacionada con aspectos de inclusión y grupos de especial protección constitucional en la Universidad de Santander. Para la minería de datos se emplearon el paquete Rattle y R.
Se carga la base de datos
# Read the semicolon-delimited CSV export into a data frame.
dataset <- utils::read.csv(
  file = "C:/Users/coordinador.analitic/OneDrive - Universidad de Santander/Alertas tempranas UDES/Publicaciones/Capitulo de libro ELSERVIER 2024/Markdown/base_datos_inclusion.csv",
  sep = ";"
)
# Put the columns of `dataset` on the search path so they can be referenced
# by bare name.
# NOTE(review): attached columns are a snapshot; later changes to `dataset`
# are not reflected in them.
base::attach(dataset)
# Print the type and the first values of every column.
utils::str(dataset)
## 'data.frame': 1539 obs. of 32 variables:
## $ Tipo_Est : chr "Nuevo" "Nuevo" "Nuevo" "Nuevo" ...
## $ Prog : chr "ANTROPOLOGÍA" "ANTROPOLOGÍA" "INSTRUMENTACIÓN QUIRÚRGICA" "INSTRUMENTACIÓN QUIRÚRGICA" ...
## $ Prom : num 4.04 3.58 4.04 3.9 3.96 4.12 4.06 4 3.95 4.08 ...
## $ Edad : int 24 23 28 21 21 20 20 21 21 20 ...
## $ Edad_categ: chr "21-25años" "21-25años" ">26años" "21-25años" ...
## $ Gen : chr "F" "F" "F" "F" ...
## $ Es_civil : chr "Soltero" "Soltero" "Soltero" "Soltero" ...
## $ Origen : chr "COLOMBIA" "COLOMBIA" "COLOMBIA" "COLOMBIA" ...
## $ Origen_Col: chr "Colombia" "Colombia" "Colombia" "Colombia" ...
## $ Dto_origen: chr "NORTE DE SANTANDER" "GUAJIRA" "PUTUMAYO" "CESAR" ...
## $ Zona : chr "Urbana" "Urbana" "Urbana" "Urbana" ...
## $ Estrato : chr "Bajo" "Bajo" "Bajo" "Bajo" ...
## $ Trab : chr "No" "No" "Si" "No" ...
## $ coh : chr "20-ene" "20-feb" "20-ene" "20-ene" ...
## $ Campus : chr "Bucaramanga" "Bucaramanga" "Bucaramanga" "Bucaramanga" ...
## $ Prom_cat : chr "Superior" "Inferior" "Superior" "Superior" ...
## $ Area : chr "Sociales" "Sociales" "Salud" "Salud" ...
## $ NING : chr "A-" "A-" "A-" "A-" ...
## $ NLC : chr "N3" "N3" "N2" "N3" ...
## $ NMAT : chr "N2" "N2" "N2" "N3" ...
## $ NPSC : chr "N1" "N2" "N2" "N2" ...
## $ NCN : chr "N2" "N1" "N2" "N2" ...
## $ NING2 : chr "A-A2" "A-A2" "A-A2" "A-A2" ...
## $ NLC2 : chr "N3N4" "N3N4" "N1N2" "N3N4" ...
## $ NMAT2 : chr "N1N2" "N1N2" "N1N2" "N3N4" ...
## $ NPSC2 : chr "N1N2" "N1N2" "N1N2" "N1N2" ...
## $ NCN2 : chr "N1N2" "N1N2" "N1N2" "N1N2" ...
## $ PLC : int 51 51 47 61 52 51 62 67 61 47 ...
## $ PMA : int 41 46 38 53 50 65 58 67 61 45 ...
## $ PSC : int 40 48 50 49 30 39 53 63 69 47 ...
## $ PCN : int 49 38 41 53 42 56 52 68 69 41 ...
## $ PIN : int 38 40 33 46 39 48 51 69 55 52 ...
## [1] "Tipo_Est" "Prog" "Prom" "Edad" "Edad_categ"
## [6] "Gen" "Es_civil" "Origen" "Origen_Col" "Dto_origen"
## [11] "Zona" "Estrato" "Trab" "coh" "Campus"
## [16] "Prom_cat" "Area" "NING" "NLC" "NMAT"
## [21] "NPSC" "NCN" "NING2" "NLC2" "NMAT2"
## [26] "NPSC2" "NCN2" "PLC" "PMA" "PSC"
## [31] "PCN" "PIN"
library(moments)
# Note: because of the earlier attach() call, columns no longer need to be
# accessed with `$`.
summary(dataset)
## Tipo_Est Prog Prom Edad
## Length:1539 Length:1539 Min. :1.040 Min. :16.00
## Class :character Class :character 1st Qu.:3.640 1st Qu.:19.00
## Mode :character Mode :character Median :3.880 Median :20.00
## Mean :3.835 Mean :20.86
## 3rd Qu.:4.090 3rd Qu.:22.00
## Max. :5.000 Max. :37.00
## Edad_categ Gen Es_civil Origen
## Length:1539 Length:1539 Length:1539 Length:1539
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Origen_Col Dto_origen Zona Estrato
## Length:1539 Length:1539 Length:1539 Length:1539
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Trab coh Campus Prom_cat
## Length:1539 Length:1539 Length:1539 Length:1539
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Area NING NLC NMAT
## Length:1539 Length:1539 Length:1539 Length:1539
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## NPSC NCN NING2 NLC2
## Length:1539 Length:1539 Length:1539 Length:1539
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## NMAT2 NPSC2 NCN2 PLC
## Length:1539 Length:1539 Length:1539 Min. :28.00
## Class :character Class :character Class :character 1st Qu.:48.00
## Mode :character Mode :character Mode :character Median :54.00
## Mean :54.25
## 3rd Qu.:61.00
## Max. :81.00
## PMA PSC PCN PIN
## Min. : 21.00 Min. :22.0 Min. :27.00 Min. : 0.00
## 1st Qu.: 45.00 1st Qu.:42.0 1st Qu.:43.50 1st Qu.: 42.00
## Median : 52.00 Median :49.0 Median :50.00 Median : 49.00
## Mean : 51.73 Mean :49.3 Mean :50.17 Mean : 49.95
## 3rd Qu.: 59.00 3rd Qu.:57.0 3rd Qu.:57.00 3rd Qu.: 57.00
## Max. :100.00 Max. :82.0 Max. :75.00 Max. :100.00
# Load the 'Hmisc' package, which provides the 'describe' function called
# below.
library(Hmisc, quietly=TRUE)
# Print a descriptive summary (n, missing, distinct, mean, quantiles, extreme
# values) of the continuous variable Prom.
describe(dataset$Prom)
## dataset$Prom
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 206 1 3.835 0.4452 3.160 3.368
## .25 .50 .75 .90 .95
## 3.640 3.880 4.090 4.300 4.430
##
## lowest : 1.04 1.08 1.1 1.28 1.33, highest: 4.69 4.72 4.75 4.76 5
## dataset$Prom
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 206 1 3.835 0.4452 3.160 3.368
## .25 .50 .75 .90 .95
## 3.640 3.880 4.090 4.300 4.430
##
## lowest : 1.04 1.08 1.1 1.28 1.33, highest: 4.69 4.72 4.75 4.76 5
## dataset$Edad
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 21 0.975 20.86 2.657 18 18
## .25 .50 .75 .90 .95
## 19 20 22 24 26
##
## lowest : 16 17 18 19 20, highest: 32 33 34 35 37
## dataset$PLC
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 50 0.999 54.25 9.981 40 43
## .25 .50 .75 .90 .95
## 48 54 61 65 69
##
## lowest : 28 30 31 32 33, highest: 74 75 76 77 81
## dataset$PMA
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 62 0.999 51.73 11.42 35 39
## .25 .50 .75 .90 .95
## 45 52 59 65 68
##
## lowest : 21 22 24 25 26, highest: 79 80 83 87 100
## dataset$PSC
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 57 0.999 49.3 11.76 32.0 36.0
## .25 .50 .75 .90 .95
## 42.0 49.0 57.0 62.0 65.1
##
## lowest : 22 24 25 26 27, highest: 75 76 79 80 82
## dataset$PCN
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 49 0.999 50.17 10.31 36.0 38.0
## .25 .50 .75 .90 .95
## 43.5 50.0 57.0 62.0 65.0
##
## lowest : 27 28 29 30 31, highest: 71 72 73 74 75
## dataset$PIN
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 57 0.999 49.95 12.02 33 37
## .25 .50 .75 .90 .95
## 42 49 57 64 69
##
## lowest : 0 22 23 28 29, highest: 78 79 81 93 100
## dataset$Gen
## n missing distinct
## 1539 0 2
##
## Value F M
## Frequency 980 559
## Proportion 0.637 0.363
## dataset$Edad_categ
## n missing distinct
## 1539 0 3
##
## Value <20años >26años 21-25años
## Frequency 807 97 635
## Proportion 0.524 0.063 0.413
## dataset$Es_civil
## n missing distinct
## 1539 0 4
##
## Value Casado Religioso Soltero Unión libre
## Frequency 8 1 1509 21
## Proportion 0.005 0.001 0.981 0.014
## dataset$Origen_Col
## n missing distinct
## 1539 0 2
##
## Value Colombia FueraColom
## Frequency 1532 7
## Proportion 0.995 0.005
## dataset$Origen
## n missing distinct
## 1539 0 4
##
## Value AUSTRIA COLOMBIA ECUADOR VENEZUELA
## Frequency 1 1532 1 5
## Proportion 0.001 0.995 0.001 0.003
## dataset$Dto_origen
## n missing distinct
## 1539 0 30
##
## lowest : - ANTIOQUIA ARAUCA ATLANTICO BOGOTA D.C
## highest: SAN ANDRES Y PROVIDENCIA SANTANDER SUCRE TOLIMA VALLE DEL CAUCA
## dataset$Zona
## n missing distinct
## 1539 0 2
##
## Value Rural Urbana
## Frequency 204 1335
## Proportion 0.133 0.867
## dataset$Estrato
## n missing distinct
## 1539 0 3
##
## Value Alto Bajo Medio
## Frequency 14 1296 229
## Proportion 0.009 0.842 0.149
## dataset$Trab
## n missing distinct
## 1539 0 2
##
## Value No Si
## Frequency 1456 83
## Proportion 0.946 0.054
## dataset$coh
## n missing distinct
## 1539 0 8
##
## Value 20-ene 20-feb 21-ene 21-feb 22-ene 22-feb 23-ene 23-feb
## Frequency 417 131 213 138 214 200 222 4
## Proportion 0.271 0.085 0.138 0.090 0.139 0.130 0.144 0.003
## dataset$Campus
## n missing distinct
## 1539 0 3
##
## Value Bucaramanga Cúcuta Valledupar
## Frequency 281 72 1186
## Proportion 0.183 0.047 0.771
## dataset$Prom_cat
## n missing distinct
## 1539 0 2
##
## Value Inferior Superior
## Frequency 704 835
## Proportion 0.457 0.543
## dataset$Area
## n missing distinct
## 1539 0 6
##
## Value Economicas Exactas Ingenieria Salud Sociales Tecnologia
## Frequency 160 97 149 581 517 35
## Proportion 0.104 0.063 0.097 0.378 0.336 0.023
## dataset$PLC
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 50 0.999 54.25 9.981 40 43
## .25 .50 .75 .90 .95
## 48 54 61 65 69
##
## lowest : 28 30 31 32 33, highest: 74 75 76 77 81
## dataset$PMA
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 62 0.999 51.73 11.42 35 39
## .25 .50 .75 .90 .95
## 45 52 59 65 68
##
## lowest : 21 22 24 25 26, highest: 79 80 83 87 100
## dataset$PSC
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 57 0.999 49.3 11.76 32.0 36.0
## .25 .50 .75 .90 .95
## 42.0 49.0 57.0 62.0 65.1
##
## lowest : 22 24 25 26 27, highest: 75 76 79 80 82
## dataset$PCN
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 49 0.999 50.17 10.31 36.0 38.0
## .25 .50 .75 .90 .95
## 43.5 50.0 57.0 62.0 65.0
##
## lowest : 27 28 29 30 31, highest: 71 72 73 74 75
## dataset$PIN
## n missing distinct Info Mean Gmd .05 .10
## 1539 0 57 0.999 49.95 12.02 33 37
## .25 .50 .75 .90 .95
## 42 49 57 64 69
##
## lowest : 0 22 23 28 29, highest: 78 79 81 93 100
###Histogramas
# Density plots for the selected variables.
# NOTE: these are kernel density estimates (geom_density), not histograms.

# Build a density plot of one numeric column, split by a grouping column.
#
# Arguments:
#   data  - data frame containing both columns.
#   var   - name (single string) of the column drawn on the x axis.
#   group - name (single string) of the grouping column; it is converted to a
#           factor before plotting.
#
# Returns a ggplot object with a dotted overall density curve plus one
# semi-transparent density curve per level of `group`.
#
# The plot is built without a pipe so that this block does not depend on a
# package exporting `%>%` being attached, and without the former
# `with(dataset[,])` step, which simply returned the data unchanged.
plot_density_by <- function(data, var, group) {
  stopifnot(
    is.data.frame(data),
    is.character(var), length(var) == 1L,
    is.character(group), length(group) == 1L,
    all(c(var, group) %in% names(data))
  )

  plot_data <- data[c(var, group)]
  plot_data[[group]] <- as.factor(plot_data[[group]])

  ggplot2::ggplot(plot_data, ggplot2::aes(x = .data[[var]])) +
    # Dotted line: density of the whole sample.
    ggplot2::geom_density(lty = 3) +
    # Filled curves: one density per group level.
    ggplot2::geom_density(
      ggplot2::aes(fill = .data[[group]], colour = .data[[group]]),
      alpha = 0.55
    ) +
    ggplot2::xlab(var) +
    ggplot2::ggtitle(paste("Distribution of", var, "by", group)) +
    # Same title for fill and colour so both share a single legend.
    ggplot2::labs(fill = group, colour = group, y = "Density")
}

# Density of Edad by Gen.
p01 <- plot_density_by(dataset, "Edad", "Gen")
# Display the plot.
gridExtra::grid.arrange(p01)

# Densities of PLC, PMA, PSC, PCN and PIN by Campus.
p01 <- plot_density_by(dataset, "PLC", "Campus")
p02 <- plot_density_by(dataset, "PMA", "Campus")
p03 <- plot_density_by(dataset, "PSC", "Campus")
p04 <- plot_density_by(dataset, "PCN", "Campus")
p05 <- plot_density_by(dataset, "PIN", "Campus")
# Display the plots.
gridExtra::grid.arrange(p01, p02, p03, p04, p05)

# Densities of PLC, PMA, PSC, PCN and PIN by Gen.
p01 <- plot_density_by(dataset, "PLC", "Gen")
p02 <- plot_density_by(dataset, "PMA", "Gen")
p03 <- plot_density_by(dataset, "PSC", "Gen")
p04 <- plot_density_by(dataset, "PCN", "Gen")
p05 <- plot_density_by(dataset, "PIN", "Gen")
# Display the plots.
gridExtra::grid.arrange(p01, p02, p03, p04, p05)
# Use ggplot2 to generate box plot for Edad
# Generate a box plot.
# Notched box plots of Edad: one grey box for the whole sample (x = "All")
# plus one box per Campus. The shape-8 point on each box marks its mean.
# `fun = mean` replaces the `fun.y` argument, which ggplot2 deprecated in
# favour of `fun`. The data are passed directly instead of through a pipe and
# the former `with(dataset[,])` step, which returned the data unchanged.
p01 <- ggplot2::ggplot(
  dplyr::mutate(dataset, Campus = as.factor(Campus)),
  ggplot2::aes(y = Edad)
) +
  ggplot2::geom_boxplot(ggplot2::aes(x = "All"), notch = TRUE, fill = "grey") +
  ggplot2::stat_summary(
    ggplot2::aes(x = "All"),
    fun = mean, geom = "point", shape = 8
  ) +
  ggplot2::geom_boxplot(ggplot2::aes(x = Campus, fill = Campus), notch = TRUE) +
  ggplot2::stat_summary(
    ggplot2::aes(x = Campus),
    fun = mean, geom = "point", shape = 8
  ) +
  ggplot2::xlab("Campus\n\nRattle 2024-jul.-16 12:51:58 coordinador.analitic") +
  ggplot2::ggtitle("Distribution of Edad by Campus") +
  ggplot2::theme(legend.position = "none")
# Display the plots.
gridExtra::grid.arrange(p01)

# The 'corrplot' package provides the 'corrplot' function.
# (Placed on its own line: fused after grid.arrange() it did not parse.)
library(corrplot, quietly = TRUE)
library(dplyr)
library(PerformanceAnalytics)
# Numeric variables PLC, PMA, PSC, PCN and PIN. They are selected by name
# instead of by position (28:32) so the subset stays correct if columns are
# added, removed or reordered in the CSV.
num1 <- dataset[, c("PLC", "PMA", "PSC", "PCN", "PIN")]
summary(num1)
## PLC PMA PSC PCN
## Min. :28.00 Min. : 21.00 Min. :22.0 Min. :27.00
## 1st Qu.:48.00 1st Qu.: 45.00 1st Qu.:42.0 1st Qu.:43.50
## Median :54.00 Median : 52.00 Median :49.0 Median :50.00
## Mean :54.25 Mean : 51.73 Mean :49.3 Mean :50.17
## 3rd Qu.:61.00 3rd Qu.: 59.00 3rd Qu.:57.0 3rd Qu.:57.00
## Max. :81.00 Max. :100.00 Max. :82.0 Max. :75.00
## PIN
## Min. : 0.00
## 1st Qu.: 42.00
## Median : 49.00
## Mean : 49.95
## 3rd Qu.: 57.00
## Max. :100.00