Evidencia 2: Predicting admission chances with a linear-output neural network

Abraham Castañon - A01747966 Angie Zerón - A00834060

In this Markdown we analyze the probability that each applicant has of being admitted to a graduate program (master's or doctorate).

The dataset was obtained from Kaggle and all rights belong to its owner. Link: https://www.kaggle.com/datasets/mohansacharya/graduate-admissions/data

Variables: GRE.Score and TOEFL.Score (exam scores), University.Rating (1 to 5), SOP and LOR (strength of the statement of purpose and of the letter of recommendation, 1 to 5), CGPA (undergraduate GPA on a 10-point scale), Research (1 if the applicant has research experience, 0 otherwise), and Chance.of.Admit (the response, between 0 and 1).

Libraries used

library(foreign)
library(dplyr)        # data manipulation 
library(forcats)      # to work with categorical variables
library(ggplot2)      # data visualization 
library(ggpubr)       # data visualization 
library(readr)        # read specific csv files
library(janitor)      # data exploration and cleaning 
library(Hmisc)        # several useful functions for data analysis 
library(psych)        # functions for multivariate analysis 
library(naniar)       # summaries and visualization of missing values NAs
library(dlookr)       # data diagnosis, exploration, and transformation
library(corrplot)     # correlation plots
library(jtools)       # presentation of regression analysis 
library(lmtest)       # diagnostic checks - linear regression analysis 
library(car)          # diagnostic checks - linear regression analysis
library(olsrr)        # diagnostic checks - linear regression analysis 
library(stargazer)    # create publication-quality tables
library(effects)      # displays for linear and other regression models
library(tidyverse)    # collection of R packages designed for data science
library(caret)        # Classification and Regression Training
library(glmnet)       # regularized regression with cross-validation
library(mlbench)      # benchmark data sets for machine learning
library(magrittr)     # pipe operator %>%
library(neuralnet)    # training and plotting of feed-forward neural networks
library(keras)        # interface to Keras (not used below)
library(pROC)         # ROC curves (not used below)
data <- read.csv("Admission_Predict.csv")
data$GRE.Score = as.numeric(data$GRE.Score)
data$TOEFL.Score = as.numeric(data$TOEFL.Score)
head(data)
##   GRE.Score TOEFL.Score University.Rating SOP LOR CGPA Research Chance.of.Admit
## 1       337         118                 4 4.5 4.5 9.65        1            0.92
## 2       324         107                 4 4.0 4.5 8.87        1            0.76
## 3       316         104                 3 3.0 3.5 8.00        1            0.72
## 4       322         110                 3 3.5 2.5 8.67        1            0.80
## 5       314         103                 2 2.0 3.0 8.21        0            0.65
## 6       330         115                 5 4.5 3.0 9.34        1            0.90
str(data)
## 'data.frame':    400 obs. of  8 variables:
##  $ GRE.Score        : num  337 324 316 322 314 330 321 308 302 323 ...
##  $ TOEFL.Score      : num  118 107 104 110 103 115 109 101 102 108 ...
##  $ University.Rating: int  4 4 3 3 2 5 3 2 1 3 ...
##  $ SOP              : num  4.5 4 3 3.5 2 4.5 3 3 2 3.5 ...
##  $ LOR              : num  4.5 4.5 3.5 2.5 3 3 4 4 1.5 3 ...
##  $ CGPA             : num  9.65 8.87 8 8.67 8.21 9.34 8.2 7.9 8 8.6 ...
##  $ Research         : int  1 1 1 1 0 1 1 0 0 0 ...
##  $ Chance.of.Admit  : num  0.92 0.76 0.72 0.8 0.65 0.9 0.75 0.68 0.5 0.45 ...
summary(data)
##    GRE.Score      TOEFL.Score    University.Rating      SOP     
##  Min.   :290.0   Min.   : 92.0   Min.   :1.000     Min.   :1.0  
##  1st Qu.:308.0   1st Qu.:103.0   1st Qu.:2.000     1st Qu.:2.5  
##  Median :317.0   Median :107.0   Median :3.000     Median :3.5  
##  Mean   :316.8   Mean   :107.4   Mean   :3.087     Mean   :3.4  
##  3rd Qu.:325.0   3rd Qu.:112.0   3rd Qu.:4.000     3rd Qu.:4.0  
##  Max.   :340.0   Max.   :120.0   Max.   :5.000     Max.   :5.0  
##       LOR             CGPA          Research      Chance.of.Admit 
##  Min.   :1.000   Min.   :6.800   Min.   :0.0000   Min.   :0.3400  
##  1st Qu.:3.000   1st Qu.:8.170   1st Qu.:0.0000   1st Qu.:0.6400  
##  Median :3.500   Median :8.610   Median :1.0000   Median :0.7300  
##  Mean   :3.453   Mean   :8.599   Mean   :0.5475   Mean   :0.7244  
##  3rd Qu.:4.000   3rd Qu.:9.062   3rd Qu.:1.0000   3rd Qu.:0.8300  
##  Max.   :5.000   Max.   :9.920   Max.   :1.0000   Max.   :0.9700
colSums(is.na(data))
##         GRE.Score       TOEFL.Score University.Rating               SOP 
##                 0                 0                 0                 0 
##               LOR              CGPA          Research   Chance.of.Admit 
##                 0                 0                 0                 0
sum(is.na(data))
## [1] 0
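
Both checks report zero NAs. Since naniar is already loaded, a quick visual and tabular confirmation could be added as well (a minimal sketch, not part of the original analysis):

# Hedged sketch: missing-value summary with naniar (loaded above)
miss_var_summary(data)   # NA count and percentage per variable (all zero here)
gg_miss_var(data)        # bar chart of NAs per variable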

Data visualization

summary(data$Chance.of.Admit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3400  0.6400  0.7300  0.7244  0.8300  0.9700
ggplot(data = data, aes(Chance.of.Admit)) +
  geom_histogram(fill = 'lightblue', color='black')+
  labs(x = 'Admission probability') +
  ggtitle('Admissions and their probability')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data, aes(x = log(Chance.of.Admit)))+
  geom_histogram(fill = 'lightblue', color='black')+
  labs(x = 'Log of admission probability') +
  ggtitle('Log scale')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
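
The stat_bin() message above suggests choosing an explicit bin width; a minimal sketch (the value 0.05 is an arbitrary choice for illustration):

# Hedged sketch: same histogram with an explicit binwidth instead of the default 30 bins
ggplot(data, aes(Chance.of.Admit)) +
  geom_histogram(binwidth = 0.05, fill = 'lightblue', color = 'black') +
  labs(x = 'Admission probability') +
  ggtitle('Admissions and their probability')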

data2 <- read.csv("Admission_Predict.csv")

# Dichotomize Chance.of.Admit at 0.75 (1 = high chance, 0 = low) just for the density plots below
data2$Chance.of.Admit <- as.factor(ifelse(data2$Chance.of.Admit >= 0.75, 1, 0))
toefl_hist <- data2 %>% ggplot(aes(x=TOEFL.Score, fill=Chance.of.Admit)) +
                     geom_density()

gre_hist <- data2 %>% ggplot(aes(x=GRE.Score, fill=Chance.of.Admit)) +
                     geom_density()

comb_plot <- ggarrange(toefl_hist, gre_hist, common.legend = TRUE,
                       ncol = 2, nrow = 1)

annotate_figure(comb_plot, 
                top = text_grob("TOEFL & GRE scores by admission chance", 
               color = "red", face = "bold", size = 12))
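
corrplot is loaded above but not used; as a rough sketch (not part of the original analysis), the correlations between all numeric variables and Chance.of.Admit could be visualized like this:

# Hedged sketch: correlation matrix of all columns (they are all numeric)
M <- cor(data)
corrplot(M, method = "color", type = "upper",
         addCoef.col = "black", number.cex = 0.7, tl.col = "black")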

head(data)
##   GRE.Score TOEFL.Score University.Rating SOP LOR CGPA Research Chance.of.Admit
## 1       337         118                 4 4.5 4.5 9.65        1            0.92
## 2       324         107                 4 4.0 4.5 8.87        1            0.76
## 3       316         104                 3 3.0 3.5 8.00        1            0.72
## 4       322         110                 3 3.5 2.5 8.67        1            0.80
## 5       314         103                 2 2.0 3.0 8.21        0            0.65
## 6       330         115                 5 4.5 3.0 9.34        1            0.90

Fitting and Visualizing Neural Networks

set.seed(123)
train = createDataPartition(y = data$Chance.of.Admit, p=0.8, list=FALSE, times = 1)
data_train = data[train,]
data_test = data[-train,]
# First fit on the raw (unstandardized) data; the standardized refit (RNS) below
# is the one actually used for prediction
RN <- neuralnet(Chance.of.Admit ~ .,
               data = data_train,
               hidden = c(5,3),
               linear.output = T,
               lifesign = 'full',
               threshold = 0.05,
               stepmax=200000,
               rep=1)
#Train/test split as matrices, separating the response variable
training <- as.matrix(data_train[,1:7])
trainingtarget <- as.matrix(data_train[,8])
test <- as.matrix(data_test[,1:7])
testtarget <- as.matrix(data_test[,8])

#Standardize the predictors
m <- colMeans(training) # column means
s <- apply(training, 2, sd) # column standard deviations (MARGIN = 2 means by column; 1 would be by row)
training <- scale(training, center = m, scale = s)
test <- scale(test, center = m, scale = s) # scale the test set with the training means and sds
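
A quick sanity check (a sketch, not in the original script): after scale(), every training column should have mean approximately 0 and standard deviation 1, while the test set is scaled with the training means and standard deviations.

# Hedged check: confirm the training matrix is centered and scaled
round(colMeans(training), 3)        # ~0 for every column
round(apply(training, 2, sd), 3)    # 1 for every column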

Refitting the Model with Standardized Variables

# Standardize the response in the same way and rebuild a training data frame
data_train_S <- as.data.frame(cbind(training,(trainingtarget - mean(trainingtarget))/sd(trainingtarget)))
colnames(data_train_S) <- colnames(data_train)
RNS <- neuralnet(Chance.of.Admit ~ .,
               data = data_train_S,
               hidden = c(5,3),
               linear.output = T, 
               #linear.output should be TRUE for regression models and FALSE for classification models
               lifesign = 'full',
               rep=1,
               # threshold=0.02,
               stepmax=200000)
column_names <- colnames(data_train_S)

Visualizing the Fitted Neural Network

The plot shows the network fitted on the standardized variables (RNS); with the unstandardized data the weights span such different scales that the network cannot be plotted legibly.

plot(RNS,
     col.hidden = 'darkgreen',
     col.hidden.synapse = 'darkgreen',
     show.weights = TRUE,
     information = FALSE,  # hide the training-information banner below the plot
     fill = 'lightblue')
[Figure: plot of the fitted neural network]

Making Predictions

data_test_S <- as.data.frame(test)
colnames(data_test_S) <- colnames(data_test)[1:7]

RNSPredictions <- predict(RNS,data_test_S)
cor(RNSPredictions,(testtarget-mean(trainingtarget))/sd(trainingtarget))
##           [,1]
## [1,] 0.8190177

To compute regression metrics it is important to 'de-standardize' the predictions; those de-standardized values are what we compare against testtarget.

RNSPred <- RNSPredictions*sd(trainingtarget) + mean(trainingtarget) # back to the original scale
plot(RNSPred,testtarget)
abline(a=0, b=1)

RSSnn <- (RNSPred - testtarget)^2   # squared errors on the test set
sum(RSSnn)/nrow(testtarget)         # test MSE
## [1] 0.007923635
1 - sum(RSSnn)/sum((testtarget - mean(trainingtarget))^2)   # out-of-sample R^2
## [1] 0.6448442
LRM <- lm(Chance.of.Admit ~ CGPA+GRE.Score+TOEFL.Score+LOR+Research+University.Rating+SOP, data=data_train)
LRMPred <- predict(LRM, data_test[,1:7])
cor(LRMPred,data_test[,8])
## [1] 0.898695
plot(LRMPred,data_test[,8])
abline(a=0,b=1)

LRMRSS <- (LRMPred - data_test[,8])^2   # squared errors for the linear model
sum(LRMRSS)/nrow(testtarget)            # test MSE
## [1] 0.004388214
1 - sum(LRMRSS)/sum((data_test[,8] - mean(data_train[,8]))^2)   # out-of-sample R^2
## [1] 0.80331
# dev.off()
par(mfrow=c(1,2))
plot(data_test$Chance.of.Admit,RNSPred,col='red',main='Actual vs neural network',pch=19,cex=1)
abline(0,1,lwd=2)
legend('bottomright',legend='NN',pch=18,col='red', bty='n')
plot(data_test$Chance.of.Admit,LRMPred,col='blue',main='Actual vs linear regression',pch=15, cex=1)
abline(0,1,lwd=2)
legend('bottomright',legend='LM',pch=18,col='blue', bty='n', cex=.95)

We can see that the linear model performs better on the test set (R² of about 0.80 and MSE of about 0.0044, versus R² of about 0.64 and MSE of about 0.0079 for the neural network), but the neural network is by no means bad.
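
As a closing sketch (assuming the objects computed above are still in the workspace), the test metrics of both models can be gathered into one small table:

# Hedged sketch: side-by-side test MSE and R^2 for both models
metrics <- data.frame(
  model = c("Neural network", "Linear regression"),
  MSE = c(mean((RNSPred - testtarget)^2),
          mean((LRMPred - data_test[,8])^2)),
  R2  = c(1 - sum((RNSPred - testtarget)^2)/sum((testtarget - mean(trainingtarget))^2),
          1 - sum((LRMPred - data_test[,8])^2)/sum((data_test[,8] - mean(data_train[,8]))^2))
)
metrics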

The end :)