¿EXISTE PROBABILIDAD DE ÉXITO ENTRE EL PROMEDIO DE NOTAS Y EL SEXO DE LOS ALUMNOS?

NO, SEGÚN GRÁFICA DE CLASIFICACIÓN SVM NO SE OBSERVA RELACIÓN

# CONCLUSIÓN: SEGÚN LO OBSERVADO POR EL MÉTODO DE CLASIFICACIÓN PARA LA VARIABLE DEPENDIENTE SEXO (M, F) Y LAS INDEPENDIENTES P.GEN Y YY, SE CONCLUYE QUE LA VARIABLE DEPENDIENTE NO GUARDA RELACIÓN CON EL COMPORTAMIENTO DE LAS VARIABLES INDEPENDIENTES. PERO, SEGÚN LA MATRIZ DE CONFUSIÓN, EL MODELO ARROJA DEMASIADOS FALSOS POSITIVOS.


# Regresión Logística

# Import the dataset. read.csv2() expects ';' as field separator and ','
# as decimal mark.
dataset <- read.csv2("../SVM MAQUINA SOPORTE VECTORIAL/SEXO_PGEN_YY.csv")

# Keep only columns 3 to 5 (printed below as P.GEN, SEXO and YY).
dataset <- dataset[, 3:5]

# Force each modelling column to numeric, going through character so that
# factor columns yield their labels rather than their internal codes.
for (column_name in c("SEXO", "P.GEN", "YY")) {
  dataset[[column_name]] <- as.numeric(as.character(dataset[[column_name]]))
}

# Show the cleaned data.
dataset
##     P.GEN SEXO YY
## 1     4.5    1  0
## 2     4.9    1  0
## 3     4.5    1  0
## 4     4.4    1  0
## 5     6.0    0  0
## 6     6.4    0  1
## 7     5.8    0  1
## 8     5.3    1  0
## 9     5.7    1  0
## 10    4.6    0  0
## 11    5.7    1  1
## 12    4.8    0  0
## 13    5.1    1  1
## 14    5.2    1  0
## 15    4.9    0  0
## 16    4.7    1  0
## 17    4.8    0  0
## 18    5.8    0  1
## 19    6.6    0  1
## 20    6.4    0  1
## 21    5.4    0  0
## 22    4.9    1  0
## 23    6.0    0  1
## 24    5.0    0  1
## 25    5.0    1  0
## 26    4.4    0  0
## 27    6.3    1  1
## 28    6.0    1  1
## 29    4.7    1  0
## 30    4.5    1  0
## 31    5.1    0  0
## 32    5.9    0  1
## 33    4.8    1  0
## 34    4.3    1  0
## 35    5.1    1  1
## 36    5.0    1  0
## 37    5.0    1  0
## 38    4.1    0  0
## 39    4.7    1  0
## 40    5.7    0  0
## 41    5.3    0  1
## 42    5.0    0  1
## 43    5.1    0  1
## 44    5.0    0  0
## 45    6.4    0  1
## 46    5.1    0  1
## 47    6.7    0  1
## 48    5.4    0  1
## 49    4.9    0  0
## 50    5.0    1  0
## 51    6.4    0  1
## 52    4.8    0  0
## 53    5.3    0  0
## 54    5.7    1  1
## 55    5.2    1  1
## 56    5.1    0  1
## 57    5.2    0  1
## 58    5.7    0  1
## 59    5.5    0  0
## 60    5.0    0  1
## 61    5.1    1  0
## 62    4.3    1  0
## 63    5.5    1  0
## 64    5.2    1  1
## 65    5.3    1  0
## 66    5.6    1  1
## 67    5.0    1  0
## 68    6.7    0  1
## 69    5.4    1  0
## 70    6.6    1  1
## 71    6.1    1  1
## 72    6.5    0  1
## 73    6.6    1  1
## 74    6.7    1  1
## 75    6.3    0  1
## 76    6.2    0  1
## 77    5.7    0  1
## 78    5.0    0  0
## 79    5.0    1  1
## 80    5.0    1  0
## 81    6.6    0  1
## 82    6.2    0  1
## 83    5.1    0  1
## 84    5.1    0  1
## 85    4.8    0  0
## 86    4.9    0  1
## 87    5.1    0  0
## 88    6.3    0  1
## 89    5.1    0  0
## 90    6.4    0  1
## 91    5.0    0  1
## 92    5.2    0  1
## 93    5.2    0  0
## 94    4.4    0  0
## 95    4.5    0  1
## 96    5.0    0  0
## 97    5.5    0  1
## 98    6.5    0  1
## 99    4.9    0  0
## 100   5.4    0  0
## 101   4.3    0  0
## 102   4.7    0  1
## 103   4.9    0  0
## 104   5.2    1  0
## 105   6.3    0  1
## 106   4.8    1  0
## 107   4.6    1  0
## 108   5.9    0  1
## 109   5.1    0  0
## 110   6.1    1  0
## 111   6.2    1  1
## 112   4.8    1  1
## 113   4.8    0  1
## 114   6.3    1  1
## 115   5.2    1  1
## 116   5.7    0  1
## 117   5.0    0  0
## 118   5.1    1  1
## 119   4.4    0  0
## 120   4.9    0  0
## 121   6.2    1  1
## 122   4.2    0  0
## 123   6.1    0  1
## 124   4.7    0  0
## 125   5.9    0  1
## 126   4.7    0  1
## 127   4.8    0  0
## 128   4.4    0  0
## 129   5.2    1  1
## 130   5.3    0  1
## 131   6.4    0  1
## 132   6.1    0  1
## 133   4.7    1  0
## 134   4.1    1  0
## 135   4.0    1  0
## 136   4.6    0  0
## 137   4.8    1  0
## 138   6.0    0  1
## 139   6.1    0  1
## 140   4.7    1  0
## 141   4.6    1  0
## 142   4.8    0  0
## 143   5.6    1  0
## 144   5.6    0  0
## 145   6.4    1  1
## 146   4.5    0  1
## 147   5.9    1  1
## 148   5.0    0  1
## 149   4.3    1  0
## 150   5.2    0  1
## 151   6.1    0  1
## 152   4.6    0  0
## 153   5.4    0  1
## 154   5.4    0  1
## 155   6.2    1  1
## 156   6.5    1  1
## 157   6.7    1  1
## 158   5.4    0  1
## 159   6.7    1  1
## 160   6.3    1  1
## 161   5.0    0  1
## 162   4.4    0  0
## 163   5.0    1  1
## 164   5.0    0  0
## 165   4.3    0  1
## 166   6.0    1  1
## 167   5.1    0  1
## 168   4.8    0  0
## 169   6.4    0  1
## 170   4.9    0  0
## 171   4.5    1  0
## 172   4.3    0  1
## 173   5.2    0  1
## 174   4.3    0  1
## 175   5.4    0  0
## 176   6.2    0  1
## 177   4.1    0  0
## 178   5.1    0  0
## 179   4.3    0  1
## 180   5.3    1  1
## 181   6.4    1  1
## 182   5.4    0  1
## 183   5.5    0  0
## 184   5.3    0  0
## 185   5.6    1  1
# Split the data into a training set and a test set.
# install.packages("caTools")
library(caTools)

# Fixed seed so the random split is reproducible.
set.seed(1)

# Logical mask: TRUE marks rows assigned to the training set (75 %).
# NOTE(review): the split is drawn on SEXO, while the model fitted later
# uses YY as its response -- confirm this is the intended column.
split <- sample.split(dataset$SEXO, SplitRatio = 0.75)
training_set <- subset(dataset, split)
testing_set <- subset(dataset, !split)
# Feature scaling.
# Columns 1:2 (P.GEN and SEXO) are standardised; YY (column 3) is left as is.
# The test set is transformed with the TRAINING set's centre and spread so
# that both sets are expressed on the scale the model is fitted on. Scaling
# the test set with its own mean/sd would shift its features relative to the
# training data and distort the predictions.
# NOTE(review): printed outputs further down that depend on testing_set were
# recorded with per-set scaling and need regenerating.
scaled_train <- scale(training_set[, 1:2])
training_set[, 1:2] <- scaled_train
testing_set[, 1:2] <- scale(
  testing_set[, 1:2],
  center = attr(scaled_train, "scaled:center"),
  scale = attr(scaled_train, "scaled:scale")
)

training_set
##          P.GEN       SEXO YY
## 1   -1.1617507  1.2692391  0
## 2   -0.5647697  1.2692391  0
## 3   -1.1617507  1.2692391  0
## 6    1.6739088 -0.7822055  1
## 7    0.7784374 -0.7822055  1
## 8    0.0322112  1.2692391  0
## 10  -1.0125055 -0.7822055  0
## 13  -0.2662793  1.2692391  1
## 14  -0.1170340  1.2692391  0
## 16  -0.8632602  1.2692391  0
## 17  -0.7140150 -0.7822055  0
## 20   1.6739088 -0.7822055  1
## 21   0.1814564 -0.7822055  0
## 22  -0.5647697  1.2692391  0
## 23   1.0769279 -0.7822055  1
## 24  -0.4155245 -0.7822055  1
## 25  -0.4155245  1.2692391  0
## 27   1.5246636  1.2692391  1
## 28   1.0769279  1.2692391  1
## 30  -1.1617507  1.2692391  0
## 31  -0.2662793 -0.7822055  0
## 32   0.9276826 -0.7822055  1
## 33  -0.7140150  1.2692391  0
## 35  -0.2662793  1.2692391  1
## 38  -1.7587316 -0.7822055  0
## 39  -0.8632602  1.2692391  0
## 40   0.6291922 -0.7822055  0
## 41   0.0322112 -0.7822055  1
## 42  -0.4155245 -0.7822055  1
## 43  -0.2662793 -0.7822055  1
## 44  -0.4155245 -0.7822055  0
## 48   0.1814564 -0.7822055  1
## 49  -0.5647697 -0.7822055  0
## 50  -0.4155245  1.2692391  0
## 52  -0.7140150 -0.7822055  0
## 53   0.0322112 -0.7822055  0
## 54   0.6291922  1.2692391  1
## 55  -0.1170340  1.2692391  1
## 56  -0.2662793 -0.7822055  1
## 57  -0.1170340 -0.7822055  1
## 59   0.3307017 -0.7822055  0
## 60  -0.4155245 -0.7822055  1
## 61  -0.2662793  1.2692391  0
## 62  -1.4602412  1.2692391  0
## 63   0.3307017  1.2692391  0
## 65   0.0322112  1.2692391  0
## 66   0.4799469  1.2692391  1
## 67  -0.4155245  1.2692391  0
## 68   2.1216445 -0.7822055  1
## 69   0.1814564  1.2692391  0
## 70   1.9723993  1.2692391  1
## 72   1.8231540 -0.7822055  1
## 73   1.9723993  1.2692391  1
## 76   1.3754183 -0.7822055  1
## 78  -0.4155245 -0.7822055  0
## 79  -0.4155245  1.2692391  1
## 80  -0.4155245  1.2692391  0
## 81   1.9723993 -0.7822055  1
## 82   1.3754183 -0.7822055  1
## 83  -0.2662793 -0.7822055  1
## 84  -0.2662793 -0.7822055  1
## 85  -0.7140150 -0.7822055  0
## 86  -0.5647697 -0.7822055  1
## 87  -0.2662793 -0.7822055  0
## 88   1.5246636 -0.7822055  1
## 90   1.6739088 -0.7822055  1
## 91  -0.4155245 -0.7822055  1
## 92  -0.1170340 -0.7822055  1
## 94  -1.3109959 -0.7822055  0
## 95  -1.1617507 -0.7822055  1
## 96  -0.4155245 -0.7822055  0
## 97   0.3307017 -0.7822055  1
## 98   1.8231540 -0.7822055  1
## 99  -0.5647697 -0.7822055  0
## 100  0.1814564 -0.7822055  0
## 101 -1.4602412 -0.7822055  0
## 102 -0.8632602 -0.7822055  1
## 104 -0.1170340  1.2692391  0
## 105  1.5246636 -0.7822055  1
## 107 -1.0125055  1.2692391  0
## 108  0.9276826 -0.7822055  1
## 109 -0.2662793 -0.7822055  0
## 111  1.3754183  1.2692391  1
## 112 -0.7140150  1.2692391  1
## 115 -0.1170340  1.2692391  1
## 116  0.6291922 -0.7822055  1
## 117 -0.4155245 -0.7822055  0
## 118 -0.2662793  1.2692391  1
## 119 -1.3109959 -0.7822055  0
## 120 -0.5647697 -0.7822055  0
## 122 -1.6094864 -0.7822055  0
## 123  1.2261731 -0.7822055  1
## 124 -0.8632602 -0.7822055  0
## 125  0.9276826 -0.7822055  1
## 127 -0.7140150 -0.7822055  0
## 129 -0.1170340  1.2692391  1
## 130  0.0322112 -0.7822055  1
## 131  1.6739088 -0.7822055  1
## 132  1.2261731 -0.7822055  1
## 133 -0.8632602  1.2692391  0
## 135 -1.9079769  1.2692391  0
## 136 -1.0125055 -0.7822055  0
## 137 -0.7140150  1.2692391  0
## 138  1.0769279 -0.7822055  1
## 139  1.2261731 -0.7822055  1
## 140 -0.8632602  1.2692391  0
## 141 -1.0125055  1.2692391  0
## 142 -0.7140150 -0.7822055  0
## 143  0.4799469  1.2692391  0
## 144  0.4799469 -0.7822055  0
## 145  1.6739088  1.2692391  1
## 146 -1.1617507 -0.7822055  1
## 147  0.9276826  1.2692391  1
## 148 -0.4155245 -0.7822055  1
## 149 -1.4602412  1.2692391  0
## 150 -0.1170340 -0.7822055  1
## 152 -1.0125055 -0.7822055  0
## 156  1.8231540  1.2692391  1
## 157  2.1216445  1.2692391  1
## 158  0.1814564 -0.7822055  1
## 159  2.1216445  1.2692391  1
## 160  1.5246636  1.2692391  1
## 161 -0.4155245 -0.7822055  1
## 162 -1.3109959 -0.7822055  0
## 163 -0.4155245  1.2692391  1
## 164 -0.4155245 -0.7822055  0
## 165 -1.4602412 -0.7822055  1
## 166  1.0769279  1.2692391  1
## 167 -0.2662793 -0.7822055  1
## 170 -0.5647697 -0.7822055  0
## 172 -1.4602412 -0.7822055  1
## 174 -1.4602412 -0.7822055  1
## 175  0.1814564 -0.7822055  0
## 176  1.3754183 -0.7822055  1
## 178 -0.2662793 -0.7822055  0
## 179 -1.4602412 -0.7822055  1
## 180  0.0322112  1.2692391  1
## 183  0.3307017 -0.7822055  0
## 185  0.4799469  1.2692391  1
# Fit a logistic regression (binomial GLM) on the training set: YY is the
# response and every other column of training_set is a predictor.
classifier <- glm(YY ~ ., family = binomial, data = training_set)

# Print the fitted coefficients and deviances.
classifier
## 
## Call:  glm(formula = YY ~ ., family = binomial, data = training_set)
## 
## Coefficients:
## (Intercept)        P.GEN         SEXO  
##      0.3272       1.5369      -0.3047  
## 
## Degrees of Freedom: 138 Total (i.e. Null);  136 Residual
## Null Deviance:       191.8 
## Residual Deviance: 143.3     AIC: 149.3
# Print the coefficient table (estimates, standard errors, z and p values)
# and the deviance summary of the fitted model.
summary(classifier)
## 
## Call:
## glm(formula = YY ~ ., family = binomial, data = training_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8591  -0.8987   0.2825   0.9061   1.9235  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.3272     0.2141   1.528    0.127    
## P.GEN         1.5369     0.2898   5.303 1.14e-07 ***
## SEXO         -0.3047     0.2065  -1.476    0.140    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 191.82  on 138  degrees of freedom
## Residual deviance: 143.34  on 136  degrees of freedom
## AIC: 149.34
## 
## Number of Fisher Scoring iterations: 5
# Predict on the test set. Column 3 (YY, the response) is dropped so only
# the predictors are passed; type = "response" returns probabilities.
test_features <- testing_set[, -3]
prob_pred <- predict(classifier, newdata = test_features, type = "response")

# Turn probabilities into class labels: 1 only above the 0.6875 cut-off.
y_pred <- ifelse(prob_pred <= 0.6875, 0, 1)

prob_pred
##          4          5          9         11         12         15 
## 0.10768601 0.85043537 0.62574159 0.62574159 0.33439570 0.38079596 
##         18         19         26         29         34         36 
## 0.79143904 0.95032088 0.18284843 0.18123526 0.08974132 0.28876221 
##         37         45         46         47         51         58 
## 0.28876221 0.92735936 0.47956819 0.95904304 0.92735936 0.75610128 
##         64         71         74         75         77         89 
## 0.37824478 0.78964513 0.92662616 0.91250544 0.75610128 0.47956819 
##         93        103        106        110        113        114 
## 0.53007027 0.38079596 0.21319023 0.78964513 0.33439570 0.84905213 
##        121        126        128        134        151        153 
## 0.82127165 0.29099232 0.18284843 0.06173416 0.87437620 0.62827526 
##        154        155        168        169        171        173 
## 0.62827526 0.82127165 0.33439570 0.92735936 0.12871156 0.53007027 
##        177        181        182        184 
## 0.10873146 0.87318136 0.62827526 0.57996399
# Build the confusion matrix: rows are the observed YY values of the test
# set, columns are the predicted labels.
observed_yy <- testing_set[[3]]
cm_test <- table(observed_yy + 0, y_pred, dnn = c("", "y_pred"))
cm_test
##    y_pred
##      0  1
##   0 19  2
##   1  9 16
# Visualisation of the training set: decision regions of the logistic model
# over the two scaled predictors, with the observations drawn on top.
#install.packages("ElemStatLearn")
# NOTE(review): nothing in this block calls into ElemStatLearn; confirm the
# package is still needed.
library(ElemStatLearn)
set <- training_set

# Fine grid spanning both predictors, padded by 1 unit on each side.
# X1 follows column 1 of the set (P.GEN), X2 follows column 2 (SEXO).
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)

# predict() looks predictors up by name, so the grid columns must carry the
# names of the columns they were built from, in the same order. Naming them
# the other way round feeds P.GEN values in as SEXO and vice versa.
colnames(grid_set) <- colnames(set)[1:2]

# Classify every grid point with the same 0.6875 cut-off used for y_pred.
prob_set <- predict(classifier, type = "response", newdata = grid_set)
y_grid <- ifelse(prob_set > 0.6875, 1, 0)

# Scatter plot frame: x axis is column 1 (P.GEN), y axis is column 2 (SEXO).
plot(set[, -3],
     main = "REGRESION LOGISTICA (CONJUNTO DE ENTRENAMIENTO)",
     xlab = "PROMEDIO GENERAL DE NOTAS (P.GEN)", ylab = "SEXO",
     xlim = range(X1), ylim = range(X2))
# expand.grid() varies X1 fastest, which matches the column-major fill of a
# length(X1) x length(X2) matrix.
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Background: predicted class of each grid point.
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "springgreen3", "tomato"))
# Foreground: observations, filled by their observed YY value (column 3).
points(set[, 1:2], pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))

# Visualisation of the test set: decision regions of the logistic model over
# the two scaled predictors, with the test observations drawn on top.
set <- testing_set

# Fine grid spanning both predictors, padded by 1 unit on each side.
# X1 follows column 1 of the set (P.GEN), X2 follows column 2 (SEXO).
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)

# predict() looks predictors up by name, so the grid columns must carry the
# names of the columns they were built from, in the same order. Naming them
# the other way round feeds P.GEN values in as SEXO and vice versa.
colnames(grid_set) <- colnames(set)[1:2]

# Classify every grid point with the same 0.6875 cut-off used for y_pred.
prob_set <- predict(classifier, type = "response", newdata = grid_set)
y_grid <- ifelse(prob_set > 0.6875, 1, 0)

# Scatter plot frame: x axis is column 1 (P.GEN), y axis is column 2 (SEXO).
plot(set[, -3],
     main = "REGRESION LOGISTICA (CONJUNTO DE TESTING)",
     xlab = "PROMEDIO GENERAL DE NOTAS (P.GEN)", ylab = "SEXO",
     xlim = range(X1), ylim = range(X2))
# expand.grid() varies X1 fastest, which matches the column-major fill of a
# length(X1) x length(X2) matrix.
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Background: predicted class of each grid point.
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "springgreen3", "tomato"))
# Foreground: observations, filled by their observed YY value (column 3).
points(set[, 1:2], pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))

LOS PUNTOS VERDES Y ROJOS REPRESENTAN EL SEXO, LOS CUALES NO PRESENTAN SEPARACIÓN ENTRE FEMENINO Y MASCULINO; POR LO TANTO, NO EXISTE UNA SIGNIFICANCIA DE LA VARIABLE SEXO RESPECTO A LOS PROMEDIOS GENERALES DE NOTAS.