Un el conjunto de datos de Admision tiene información de la admisión de estudiantes a una universidad. la variable admit, GRE(Graduate Record Exam scores), GPA (grade point average) y rank que se refiere al prestigio de la escuela secundaria a la que el alumno asistió. Un investigador estÔ interesado en averiguar cómo influyen estas variables en la admisión de una persona a la universidad.

mydata <- read.csv("binary.csv")
attach(mydata)
dim(mydata)
## [1] 400   4
head(mydata)
##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2
mydata$rank <- factor(mydata$rank)

Regresión Logística utilizando solamente la variable gpa

logit1var<- glm(admit ~  gpa, data = mydata, family = "binomial")
summary(logit1var)
## 
## Call:
## glm(formula = admit ~ gpa, family = "binomial", data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1131  -0.8874  -0.7566   1.3305   1.9824  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.3576     1.0353  -4.209 2.57e-05 ***
## gpa           1.0511     0.2989   3.517 0.000437 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 486.97  on 398  degrees of freedom
## AIC: 490.97
## 
## Number of Fisher Scoring iterations: 4

El modelo: Pr(admit =1) = e^(-4.35 + 1.05 gpa)/(1 + e^(-4.35 + 1.05 gpa))

newdata = data.frame(gpa=c(2.5,3.5, 4, 4.3, 4.8,5))
prediccion <- predict(logit1var, newdata, type = 'response')

prediccion
##         1         2         3         4         5         6 
## 0.1506112 0.3365500 0.4617866 0.5404564 0.6654628 0.7105293

Regresión logística utilizando todas las variables

mydata$rank <- factor(mydata$rank)
logit3var<- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(logit3var)
## 
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
##     data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6268  -0.8662  -0.6388   1.1490   2.0790  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *  
## gpa          0.804038   0.331819   2.423 0.015388 *  
## rank2       -0.675443   0.316490  -2.134 0.032829 *  
## rank3       -1.340204   0.345306  -3.881 0.000104 ***
## rank4       -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 458.52  on 394  degrees of freedom
## AIC: 470.52
## 
## Number of Fisher Scoring iterations: 4

El modelo en este caso queda dividido: Para rank = 1

Pr(admit =1) = e^(-3.98 + 0.002 gre + 0.8 gpa)/(1 + e^(-3.98 + 0.002 gre + 0.8 gpa))

Para rank = 2 P(admit =1) = e^(-3.98 + 0.002 gre + 0.8 gpa - 0.67)/(1 + e^(-3.98 + 0.002 gre + 0.8 gpa - 0.67))

datosNuevos = data.frame(gpa=c(2.5,3.5), gre = c(600, 700), rank = factor(c(3,1)))
prediccion3var <- predict(logit3var, datosNuevos, type = 'response')
prediccion3var
##         1         2 
## 0.1233120 0.6009081
datosNuevos2 = data.frame(gpa=c(2.5,3.5), gre = c(600, 700), rank = factor(c(1,3)))
prediccion3var2 <- predict(logit3var, datosNuevos2, type = 'response')
prediccion3var2
##         1         2 
## 0.3495018 0.2827313

GrƔfico de la Probabilidad de ser admitido para rank = 2 y gpa = 1

x = seq(1, 11, 0.1)
gpa = 1
y =  exp(-3.98 + 0.002* 1 + 0.8 *x - 0.67)/(1 + exp(-3.98 + 0.002 *1 + 0.8 *x - 0.67))

qplot(x , y , geom = "line", xlab = "gre",ylab = "Probabilidad de ser admitido", main = "Probabilidad de ser admitido cuando gpa = 1")

Vamos a probar el modelo para un conjunto de datos de testeo y ver como clasifica

d = dim(mydata)
s = floor(0.85* d[1])
indic = sample(d[1], size = s, replace = FALSE)
train_data <- mydata[indic,]
test_data <- mydata[-indic,]

El modelo de regresion logistica

modelo2 <- glm(admit ~  gpa + gre + rank, data = train_data, family = "binomial")
summary(modelo2)
## 
## Call:
## glm(formula = admit ~ gpa + gre + rank, family = "binomial", 
##     data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5503  -0.8608  -0.6167   1.1071   2.0873  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.789259   1.226322  -3.090  0.00200 ** 
## gpa          0.793673   0.362696   2.188  0.02865 *  
## gre          0.002084   0.001200   1.737  0.08241 .  
## rank2       -0.677027   0.334705  -2.023  0.04310 *  
## rank3       -1.539056   0.373246  -4.123 3.73e-05 ***
## rank4       -1.578693   0.449150  -3.515  0.00044 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.06  on 339  degrees of freedom
## Residual deviance: 385.26  on 334  degrees of freedom
## AIC: 397.26
## 
## Number of Fisher Scoring iterations: 4

Realizamos las predicciones para el conjunto test

prediccion.test <- predict(modelo2, test_data, type = 'response')

prediccion.test
##          6         17         20         22         33         40         46 
## 0.37714215 0.33822968 0.58899290 0.44768042 0.20106688 0.10738108 0.16360217 
##         53         56         59         64         81         86         88 
## 0.24023792 0.35166571 0.32389651 0.29821860 0.16690157 0.26548637 0.38838238 
##         89         91         94         99        101        106        120 
## 0.56771656 0.50800762 0.28245634 0.32696247 0.10718662 0.36188794 0.09092845 
##        128        147        159        164        169        170        179 
## 0.29781662 0.31698801 0.42039275 0.22863000 0.24753531 0.23058132 0.19884580 
##        187        190        196        199        204        213        217 
## 0.21478597 0.31741887 0.38932300 0.23199241 0.20074849 0.22618705 0.31449924 
##        222        230        231        240        242        252        255 
## 0.32198139 0.43740522 0.17533874 0.24386756 0.57886862 0.17822778 0.26263367 
##        261        264        269        271        288        294        299 
## 0.35868321 0.28875328 0.42475132 0.50056377 0.25804485 0.73663268 0.36279296 
##        304        312        313        315        328        335        357 
## 0.52732554 0.45554265 0.27668564 0.18292203 0.36877799 0.18529622 0.39833118 
##        364        379        393        396 
## 0.31484287 0.17967926 0.19852908 0.50006644

Clasificamos

long.test = length(prediccion.test)
clasificacion.test = rep(0, long.test)

clasificacion.test[which(prediccion.test > 0.6)] = 1

clasificacion.test
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
print(table( clasificacion.test , test_data$admit ))
##                   
## clasificacion.test  0  1
##                  0 40 19
##                  1  1  0
correctos.test <- mean(clasificacion.test == test_data$admit)
cat(correctos.test * 100,"% casos correctamente clasificados\n")
## 66.66667 % casos correctamente clasificados