¿EXISTE PROBABILIDAD DE ÉXITO ENTRE PROMEDIO DE NOTAS Y SEXO DE LOS ALUMNOS?
NO, SEGÚN GRÁFICA DE CLASIFICACIÓN SVM NO SE OBSERVA RELACIÓN
#CONCLUSIÓN, SEGÚN LO OBSERVADO POR EL MÉTODO DE CLASIFICACIÓN PARA LA VARIABLE DEPENDIENTE SEXO (M,F), Y LAS INDEPENDIENTES P.GEN Y YY, SE CONCLUYE QUE LA VARIABLE DEPENDIENTE NO GUARDA RELACIÓN CON EL COMPORTAMIENTO DE LAS VARIABLES INDEPENDIENTES. PERO SEGÚN MATRIZ DE CONFUSIÓN EL MODELO ARROJA DEMASIADOS FALSOS POSITIVOS.
# Regresión Logística
# Load the dataset.
# read.csv2() is used, i.e. the ';'-separated / ','-decimal CSV reader.
dataset <- read.csv2("../SVM MAQUINA SOPORTE VECTORIAL/SEXO_PGEN_YY.csv")
# Keep only columns 3 to 5 of the file.
dataset <- dataset[, 3:5]
# Coerce SEXO, P.GEN and YY to numeric. Going through as.character() first
# means a factor column yields its labels rather than its internal codes.
# Conversions for P.HIS and P.LEN were left disabled in the original script:
#dataset$P.HIS <- as.numeric(as.character(dataset$P.HIS))
#dataset$P.LEN <- as.numeric(as.character(dataset$P.LEN))
dataset[c("SEXO", "P.GEN", "YY")] <- lapply(
  dataset[c("SEXO", "P.GEN", "YY")],
  function(column) as.numeric(as.character(column))
)
# Echo the cleaned data frame.
dataset
## P.GEN SEXO YY
## 1 4.5 1 0
## 2 4.9 1 0
## 3 4.5 1 0
## 4 4.4 1 0
## 5 6.0 0 0
## 6 6.4 0 1
## 7 5.8 0 1
## 8 5.3 1 0
## 9 5.7 1 0
## 10 4.6 0 0
## 11 5.7 1 1
## 12 4.8 0 0
## 13 5.1 1 1
## 14 5.2 1 0
## 15 4.9 0 0
## 16 4.7 1 0
## 17 4.8 0 0
## 18 5.8 0 1
## 19 6.6 0 1
## 20 6.4 0 1
## 21 5.4 0 0
## 22 4.9 1 0
## 23 6.0 0 1
## 24 5.0 0 1
## 25 5.0 1 0
## 26 4.4 0 0
## 27 6.3 1 1
## 28 6.0 1 1
## 29 4.7 1 0
## 30 4.5 1 0
## 31 5.1 0 0
## 32 5.9 0 1
## 33 4.8 1 0
## 34 4.3 1 0
## 35 5.1 1 1
## 36 5.0 1 0
## 37 5.0 1 0
## 38 4.1 0 0
## 39 4.7 1 0
## 40 5.7 0 0
## 41 5.3 0 1
## 42 5.0 0 1
## 43 5.1 0 1
## 44 5.0 0 0
## 45 6.4 0 1
## 46 5.1 0 1
## 47 6.7 0 1
## 48 5.4 0 1
## 49 4.9 0 0
## 50 5.0 1 0
## 51 6.4 0 1
## 52 4.8 0 0
## 53 5.3 0 0
## 54 5.7 1 1
## 55 5.2 1 1
## 56 5.1 0 1
## 57 5.2 0 1
## 58 5.7 0 1
## 59 5.5 0 0
## 60 5.0 0 1
## 61 5.1 1 0
## 62 4.3 1 0
## 63 5.5 1 0
## 64 5.2 1 1
## 65 5.3 1 0
## 66 5.6 1 1
## 67 5.0 1 0
## 68 6.7 0 1
## 69 5.4 1 0
## 70 6.6 1 1
## 71 6.1 1 1
## 72 6.5 0 1
## 73 6.6 1 1
## 74 6.7 1 1
## 75 6.3 0 1
## 76 6.2 0 1
## 77 5.7 0 1
## 78 5.0 0 0
## 79 5.0 1 1
## 80 5.0 1 0
## 81 6.6 0 1
## 82 6.2 0 1
## 83 5.1 0 1
## 84 5.1 0 1
## 85 4.8 0 0
## 86 4.9 0 1
## 87 5.1 0 0
## 88 6.3 0 1
## 89 5.1 0 0
## 90 6.4 0 1
## 91 5.0 0 1
## 92 5.2 0 1
## 93 5.2 0 0
## 94 4.4 0 0
## 95 4.5 0 1
## 96 5.0 0 0
## 97 5.5 0 1
## 98 6.5 0 1
## 99 4.9 0 0
## 100 5.4 0 0
## 101 4.3 0 0
## 102 4.7 0 1
## 103 4.9 0 0
## 104 5.2 1 0
## 105 6.3 0 1
## 106 4.8 1 0
## 107 4.6 1 0
## 108 5.9 0 1
## 109 5.1 0 0
## 110 6.1 1 0
## 111 6.2 1 1
## 112 4.8 1 1
## 113 4.8 0 1
## 114 6.3 1 1
## 115 5.2 1 1
## 116 5.7 0 1
## 117 5.0 0 0
## 118 5.1 1 1
## 119 4.4 0 0
## 120 4.9 0 0
## 121 6.2 1 1
## 122 4.2 0 0
## 123 6.1 0 1
## 124 4.7 0 0
## 125 5.9 0 1
## 126 4.7 0 1
## 127 4.8 0 0
## 128 4.4 0 0
## 129 5.2 1 1
## 130 5.3 0 1
## 131 6.4 0 1
## 132 6.1 0 1
## 133 4.7 1 0
## 134 4.1 1 0
## 135 4.0 1 0
## 136 4.6 0 0
## 137 4.8 1 0
## 138 6.0 0 1
## 139 6.1 0 1
## 140 4.7 1 0
## 141 4.6 1 0
## 142 4.8 0 0
## 143 5.6 1 0
## 144 5.6 0 0
## 145 6.4 1 1
## 146 4.5 0 1
## 147 5.9 1 1
## 148 5.0 0 1
## 149 4.3 1 0
## 150 5.2 0 1
## 151 6.1 0 1
## 152 4.6 0 0
## 153 5.4 0 1
## 154 5.4 0 1
## 155 6.2 1 1
## 156 6.5 1 1
## 157 6.7 1 1
## 158 5.4 0 1
## 159 6.7 1 1
## 160 6.3 1 1
## 161 5.0 0 1
## 162 4.4 0 0
## 163 5.0 1 1
## 164 5.0 0 0
## 165 4.3 0 1
## 166 6.0 1 1
## 167 5.1 0 1
## 168 4.8 0 0
## 169 6.4 0 1
## 170 4.9 0 0
## 171 4.5 1 0
## 172 4.3 0 1
## 173 5.2 0 1
## 174 4.3 0 1
## 175 5.4 0 0
## 176 6.2 0 1
## 177 4.1 0 0
## 178 5.1 0 0
## 179 4.3 0 1
## 180 5.3 1 1
## 181 6.4 1 1
## 182 5.4 0 1
## 183 5.5 0 0
## 184 5.3 0 0
## 185 5.6 1 1
# Dividir los datos en conjunto de entrenamiento y conjunto de test
# install.packages("caTools")
library(caTools)
# Split the data into a training set and a testing set (ratio 0.75).
# The seed is fixed so the split is reproducible.
set.seed(1)
# NOTE(review): the split is driven by SEXO, while the model fitted later
# uses YY as its response -- confirm this is the intended split variable.
split <- sample.split(dataset$SEXO, SplitRatio = 0.75)
# Rows flagged TRUE go to training, the remaining rows to testing.
training_set <- dataset[split, ]
testing_set <- dataset[!split, ]
# Feature scaling (columns 1:2 are the predictors; column 3 is YY).
# The training set is standardised with its own column means and standard
# deviations, exactly as before.
train_scaled <- scale(training_set[, 1:2])
train_center <- attr(train_scaled, "scaled:center")
train_scale <- attr(train_scaled, "scaled:scale")
training_set[, 1:2] <- train_scaled
# The testing set must be transformed with the TRAINING parameters. Scaling
# it with its own mean/sd (as the script did previously) puts it on a
# different scale from the data the model is fitted on.
# Test-set outputs pasted further down in this file were produced with the
# old per-set scaling and may differ slightly from a fresh run.
testing_set[, 1:2] <- scale(testing_set[, 1:2],
                            center = train_center,
                            scale = train_scale)
# Echo the scaled training set.
training_set
## P.GEN SEXO YY
## 1 -1.1617507 1.2692391 0
## 2 -0.5647697 1.2692391 0
## 3 -1.1617507 1.2692391 0
## 6 1.6739088 -0.7822055 1
## 7 0.7784374 -0.7822055 1
## 8 0.0322112 1.2692391 0
## 10 -1.0125055 -0.7822055 0
## 13 -0.2662793 1.2692391 1
## 14 -0.1170340 1.2692391 0
## 16 -0.8632602 1.2692391 0
## 17 -0.7140150 -0.7822055 0
## 20 1.6739088 -0.7822055 1
## 21 0.1814564 -0.7822055 0
## 22 -0.5647697 1.2692391 0
## 23 1.0769279 -0.7822055 1
## 24 -0.4155245 -0.7822055 1
## 25 -0.4155245 1.2692391 0
## 27 1.5246636 1.2692391 1
## 28 1.0769279 1.2692391 1
## 30 -1.1617507 1.2692391 0
## 31 -0.2662793 -0.7822055 0
## 32 0.9276826 -0.7822055 1
## 33 -0.7140150 1.2692391 0
## 35 -0.2662793 1.2692391 1
## 38 -1.7587316 -0.7822055 0
## 39 -0.8632602 1.2692391 0
## 40 0.6291922 -0.7822055 0
## 41 0.0322112 -0.7822055 1
## 42 -0.4155245 -0.7822055 1
## 43 -0.2662793 -0.7822055 1
## 44 -0.4155245 -0.7822055 0
## 48 0.1814564 -0.7822055 1
## 49 -0.5647697 -0.7822055 0
## 50 -0.4155245 1.2692391 0
## 52 -0.7140150 -0.7822055 0
## 53 0.0322112 -0.7822055 0
## 54 0.6291922 1.2692391 1
## 55 -0.1170340 1.2692391 1
## 56 -0.2662793 -0.7822055 1
## 57 -0.1170340 -0.7822055 1
## 59 0.3307017 -0.7822055 0
## 60 -0.4155245 -0.7822055 1
## 61 -0.2662793 1.2692391 0
## 62 -1.4602412 1.2692391 0
## 63 0.3307017 1.2692391 0
## 65 0.0322112 1.2692391 0
## 66 0.4799469 1.2692391 1
## 67 -0.4155245 1.2692391 0
## 68 2.1216445 -0.7822055 1
## 69 0.1814564 1.2692391 0
## 70 1.9723993 1.2692391 1
## 72 1.8231540 -0.7822055 1
## 73 1.9723993 1.2692391 1
## 76 1.3754183 -0.7822055 1
## 78 -0.4155245 -0.7822055 0
## 79 -0.4155245 1.2692391 1
## 80 -0.4155245 1.2692391 0
## 81 1.9723993 -0.7822055 1
## 82 1.3754183 -0.7822055 1
## 83 -0.2662793 -0.7822055 1
## 84 -0.2662793 -0.7822055 1
## 85 -0.7140150 -0.7822055 0
## 86 -0.5647697 -0.7822055 1
## 87 -0.2662793 -0.7822055 0
## 88 1.5246636 -0.7822055 1
## 90 1.6739088 -0.7822055 1
## 91 -0.4155245 -0.7822055 1
## 92 -0.1170340 -0.7822055 1
## 94 -1.3109959 -0.7822055 0
## 95 -1.1617507 -0.7822055 1
## 96 -0.4155245 -0.7822055 0
## 97 0.3307017 -0.7822055 1
## 98 1.8231540 -0.7822055 1
## 99 -0.5647697 -0.7822055 0
## 100 0.1814564 -0.7822055 0
## 101 -1.4602412 -0.7822055 0
## 102 -0.8632602 -0.7822055 1
## 104 -0.1170340 1.2692391 0
## 105 1.5246636 -0.7822055 1
## 107 -1.0125055 1.2692391 0
## 108 0.9276826 -0.7822055 1
## 109 -0.2662793 -0.7822055 0
## 111 1.3754183 1.2692391 1
## 112 -0.7140150 1.2692391 1
## 115 -0.1170340 1.2692391 1
## 116 0.6291922 -0.7822055 1
## 117 -0.4155245 -0.7822055 0
## 118 -0.2662793 1.2692391 1
## 119 -1.3109959 -0.7822055 0
## 120 -0.5647697 -0.7822055 0
## 122 -1.6094864 -0.7822055 0
## 123 1.2261731 -0.7822055 1
## 124 -0.8632602 -0.7822055 0
## 125 0.9276826 -0.7822055 1
## 127 -0.7140150 -0.7822055 0
## 129 -0.1170340 1.2692391 1
## 130 0.0322112 -0.7822055 1
## 131 1.6739088 -0.7822055 1
## 132 1.2261731 -0.7822055 1
## 133 -0.8632602 1.2692391 0
## 135 -1.9079769 1.2692391 0
## 136 -1.0125055 -0.7822055 0
## 137 -0.7140150 1.2692391 0
## 138 1.0769279 -0.7822055 1
## 139 1.2261731 -0.7822055 1
## 140 -0.8632602 1.2692391 0
## 141 -1.0125055 1.2692391 0
## 142 -0.7140150 -0.7822055 0
## 143 0.4799469 1.2692391 0
## 144 0.4799469 -0.7822055 0
## 145 1.6739088 1.2692391 1
## 146 -1.1617507 -0.7822055 1
## 147 0.9276826 1.2692391 1
## 148 -0.4155245 -0.7822055 1
## 149 -1.4602412 1.2692391 0
## 150 -0.1170340 -0.7822055 1
## 152 -1.0125055 -0.7822055 0
## 156 1.8231540 1.2692391 1
## 157 2.1216445 1.2692391 1
## 158 0.1814564 -0.7822055 1
## 159 2.1216445 1.2692391 1
## 160 1.5246636 1.2692391 1
## 161 -0.4155245 -0.7822055 1
## 162 -1.3109959 -0.7822055 0
## 163 -0.4155245 1.2692391 1
## 164 -0.4155245 -0.7822055 0
## 165 -1.4602412 -0.7822055 1
## 166 1.0769279 1.2692391 1
## 167 -0.2662793 -0.7822055 1
## 170 -0.5647697 -0.7822055 0
## 172 -1.4602412 -0.7822055 1
## 174 -1.4602412 -0.7822055 1
## 175 0.1814564 -0.7822055 0
## 176 1.3754183 -0.7822055 1
## 178 -0.2662793 -0.7822055 0
## 179 -1.4602412 -0.7822055 1
## 180 0.0322112 1.2692391 1
## 183 0.3307017 -0.7822055 0
## 185 0.4799469 1.2692391 1
# Fit the logistic regression model on the training set:
# YY is the response and every other column of training_set is a predictor.
classifier <- glm(YY ~ ., family = binomial, data = training_set)
# Echo the fitted model (coefficients and deviances).
classifier
##
## Call: glm(formula = YY ~ ., family = binomial, data = training_set)
##
## Coefficients:
## (Intercept) P.GEN SEXO
## 0.3272 1.5369 -0.3047
##
## Degrees of Freedom: 138 Total (i.e. Null); 136 Residual
## Null Deviance: 191.8
## Residual Deviance: 143.3 AIC: 149.3
# Full summary: standard errors, z values and p-values per coefficient.
summary(classifier)
##
## Call:
## glm(formula = YY ~ ., family = binomial, data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8591 -0.8987 0.2825 0.9061 1.9235
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3272 0.2141 1.528 0.127
## P.GEN 1.5369 0.2898 5.303 1.14e-07 ***
## SEXO -0.3047 0.2065 -1.476 0.140
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.82 on 138 degrees of freedom
## Residual deviance: 143.34 on 136 degrees of freedom
## AIC: 149.34
##
## Number of Fisher Scoring iterations: 5
# Predict on the testing set.
# Column 3 (YY) is dropped so only the predictors are passed to the model;
# type = "response" returns probabilities rather than log-odds.
prob_pred <- predict(classifier,
                     newdata = testing_set[, -3],
                     type = "response")
# Turn probabilities into 0/1 labels.
# NOTE(review): the 0.6875 cutoff is hard-coded; its rationale is not
# visible in this script -- confirm it is intended.
y_pred <- ifelse(prob_pred > 0.6875, 1, 0)
# Echo the predicted probabilities.
prob_pred
## 4 5 9 11 12 15
## 0.10768601 0.85043537 0.62574159 0.62574159 0.33439570 0.38079596
## 18 19 26 29 34 36
## 0.79143904 0.95032088 0.18284843 0.18123526 0.08974132 0.28876221
## 37 45 46 47 51 58
## 0.28876221 0.92735936 0.47956819 0.95904304 0.92735936 0.75610128
## 64 71 74 75 77 89
## 0.37824478 0.78964513 0.92662616 0.91250544 0.75610128 0.47956819
## 93 103 106 110 113 114
## 0.53007027 0.38079596 0.21319023 0.78964513 0.33439570 0.84905213
## 121 126 128 134 151 153
## 0.82127165 0.29099232 0.18284843 0.06173416 0.87437620 0.62827526
## 154 155 168 169 171 173
## 0.62827526 0.82127165 0.33439570 0.92735936 0.12871156 0.53007027
## 177 181 182 184
## 0.10873146 0.87318136 0.62827526 0.57996399
# Build the confusion matrix for the testing set:
# rows are the observed YY values, columns are the predicted labels.
cm_test <- table(testing_set[[3]], y_pred)
cm_test
## y_pred
## 0 1
## 0 19 2
## 1 9 16
# Visualización del conjunto de entrenamiento
#install.packages("ElemStatLearn")
library(ElemStatLearn)
# Decision-region plot of the fitted classifier over the training set.
set <- training_set
# Grid axes follow the column order of `set`: X1 spans column 1 and X2 spans
# column 2, each padded by 1 unit on both sides and sampled every 0.01.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# predict() matches newdata to model terms by column NAME, so the grid
# columns must carry the names of the columns they were built from.
# The previous hard-coded c('SEXO', 'P.GEN') labelled them in the opposite
# order to the data frame (P.GEN first, SEXO second), which fed each axis to
# the wrong predictor and drew the wrong decision region.
colnames(grid_set) <- colnames(set)[1:2]
prob_set <- predict(classifier, type = "response", newdata = grid_set)
# Same probability cutoff as used for the test-set predictions.
y_grid <- ifelse(prob_set > 0.6875, 1, 0)
# Axis labels are taken from the plotted columns so they cannot drift from
# the data; the title names the model actually fitted (a glm, not an SVM).
plot(set[, -3],
     main = "REGRESION LOGISTICA (CONJUNTO DE ENTRENAMIENTO)",
     xlab = colnames(set)[1], ylab = colnames(set)[2],
     xlim = range(X1), ylim = range(X2))
# Boundary between the predicted classes.
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)),
        add = TRUE)
# Background: predicted class at every grid point.
points(grid_set, pch = ".",
       col = ifelse(y_grid == 1, "springgreen3", "tomato"))
# Foreground: observations, filled by their observed YY (column 3).
points(set[, 1:2], pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))

# Visualización del conjunto de testing
# Decision-region plot of the fitted classifier over the testing set.
set <- testing_set
# Grid axes follow the column order of `set`: X1 spans column 1 and X2 spans
# column 2, each padded by 1 unit on both sides and sampled every 0.01.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# predict() matches newdata to model terms by column NAME, so the grid
# columns must carry the names of the columns they were built from.
# The previous hard-coded c('SEXO', 'P.GEN') labelled them in the opposite
# order to the data frame (P.GEN first, SEXO second), which fed each axis to
# the wrong predictor and drew the wrong decision region.
colnames(grid_set) <- colnames(set)[1:2]
prob_set <- predict(classifier, type = "response", newdata = grid_set)
# Same probability cutoff as used for the test-set predictions.
y_grid <- ifelse(prob_set > 0.6875, 1, 0)
# Axis labels are taken from the plotted columns: the old labels were
# swapped relative to the axes and contained a typo ("NOTAL").
plot(set[, -3],
     main = "YY VARIABLE DEPENDIENTE",
     xlab = colnames(set)[1], ylab = colnames(set)[2],
     xlim = range(X1), ylim = range(X2))
# Boundary between the predicted classes.
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)),
        add = TRUE)
# Background: predicted class at every grid point.
points(grid_set, pch = ".",
       col = ifelse(y_grid == 1, "springgreen3", "tomato"))
# Foreground: observations, filled by their observed YY (column 3).
points(set[, 1:2], pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))
LOS PUNTOS VERDES Y ROJOS REPRESENTAN EL SEXO, LOS CUALES NO PRESENTAN SEPARACIÓN ENTRE FEMENINO Y MASCULINO, POR LO TANTO, NO EXISTE UNA SIGNIFICANCIA DE LA VARIABLE SEXO RESPECTO A LOS PROMEDIOS GENERALES DE NOTAS.