Admissions Model Project

Introduction

The task was to examine the admissions data and use it to build a prediction model for new admissions. Three predictive models were to be compared (Random Forest, Support Vector Machines, and an Ensemble model) in order to find the best one for our purposes. The code below reads the data set, explores it, and fits the three models.

# Load the admissions data set and preview its first three rows.
data <- read.csv(
  file = "binary-1.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
head(data, n = 3)

##   admit gre  gpa rank id
## 1     0 380 3.61    3  1
## 2     1 660 3.67    3  2
## 3     1 800 4.00    1  3

# Show each column's type together with a few sample values.
utils::str(data)

## 'data.frame':    400 obs. of  5 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
##  $ id   : int  1 2 3 4 5 6 7 8 9 10 ...

# List the column names of the data frame.
colnames(data)

## [1] "admit" "gre"   "gpa"   "rank"  "id"

# Store the outcome as a double, then count how many rows fall in each class.
data$admit <- as.numeric(data$admit)
table(data$admit)

## 
##   0   1 
## 273 127

# Create training and testing subsets: 80% of the rows are sampled into
# train (seeded for reproducibility).
set.seed(123456)
train <- sample_frac(data, 0.8)
# test keeps every row of data whose id does not appear in train.
test <- anti_join(data, train, by = "id")

# id was only needed to build the split; drop it from all three data frames.
data <- dplyr::select(data, -id)
train <- dplyr::select(train, -id)
test <- dplyr::select(test, -id)

Fitting the Models

1. Random Forest

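Next, we fit the Random Forest model on the training set. Its cross-validated risk (mean squared error, where lower is better) is about 0.20; note that this figure is a risk estimate, not an accuracy.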

# Fit a SuperLearner whose library contains only the random forest wrapper.
# Columns 2-4 of train are the predictors (gre, gpa, rank once id is dropped).
set.seed(123456)
rf.model <- SuperLearner(
  Y = train$admit,
  X = train[, 2:4],
  family = binomial(),
  SL.library = list("SL.randomForest")
)

## Loading required namespace: randomForest

# Print the fitted object: the call plus the risk and coefficient per learner.
print(rf.model)

## 
## Call:  
## SuperLearner(Y = train$admit, X = train[, 2:4], family = binomial(), SL.library = list("SL.randomForest")) 
## 
## 
## 
##                      Risk Coef
## SL.randomForest_All 0.197    1

2. SVM Classification Model

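Next, we fit the Support Vector Machines model on the training set. Its cross-validated risk (mean squared error, where lower is better) is about 0.21; note that this figure is a risk estimate, not an accuracy.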

# Fit a SuperLearner whose library contains only the SVM wrapper.
# Columns 2-4 of train are the predictors (gre, gpa, rank once id is dropped).
set.seed(123456)
svm.model <- SuperLearner(
  Y = train$admit,
  X = train[, 2:4],
  family = binomial(),
  SL.library = list("SL.svm")
)

## Loading required namespace: e1071

# Print the fitted object: the call plus the risk and coefficient per learner.
print(svm.model)

## 
## Call:  
## SuperLearner(Y = train$admit, X = train[, 2:4], family = binomial(), SL.library = list("SL.svm")) 
## 
## 
## 
##             Risk Coef
## SL.svm_All 0.209    1

3. The Ensemble Model

Next, we fit the Ensemble model on the training set, adding several more learners to the library. On the test set, the accuracy of the Ensemble model fell short of that of the Support Vector Machines model alone.

# Fit the ensemble: a SuperLearner that combines seven candidate learners.
# Columns 2-4 of train are the predictors (gre, gpa, rank once id is dropped).
set.seed(123456)
model <- SuperLearner(
  Y = train$admit,
  X = train[, 2:4],
  family = binomial(),
  SL.library = c(
    "SL.randomForest", "SL.svm", "SL.glm", "SL.ipredbagg",
    "SL.bayesglm", "SL.mean", "SL.lm"
  )
)

## Loading required namespace: ipred

# Print the fitted ensemble: the call plus the risk and weight per learner.
print(model)

## 
## Call:  
## SuperLearner(Y = train$admit, X = train[, 2:4], family = binomial(), SL.library = c("SL.randomForest",  
##     "SL.svm", "SL.glm", "SL.ipredbagg", "SL.bayesglm", "SL.mean", "SL.lm")) 
## 
## 
##                      Risk   Coef
## SL.randomForest_All 0.197 0.2881
## SL.svm_All          0.209 0.0000
## SL.glm_All          0.187 0.6752
## SL.ipredbagg_All    0.193 0.0000
## SL.bayesglm_All     0.187 0.0000
## SL.mean_All         0.216 0.0367
## SL.lm_All           0.189 0.0000

Predictions

Next, we generate predictions on the test set with each of the three models and recode the predicted probabilities as class labels.

# Score the held-out test rows (predictor columns 2-4) with each fitted model.
rf_predictions <- predict(rf.model, newdata = test[, 2:4])
svm_predictions <- predict(svm.model, newdata = test[, 2:4])
ensemble_predictions <- predict(model, newdata = test[, 2:4])

# Probabilities
# Show the names of the elements in the random forest prediction object.
print(names(rf_predictions))

## [1] "pred"            "library.predict"

# First six predicted probabilities from the random forest SuperLearner.
head(rf_predictions[["pred"]])

##       [,1]
## [1,] 0.050
## [2,] 0.118
## [3,] 0.042
## [4,] 0.412
## [5,] 0.908
## [6,] 0.402

# First six rows of the per-learner predictions (one column per learner).
head(rf_predictions[["library.predict"]])

##      SL.randomForest_All
## [1,]               0.050
## [2,]               0.118
## [3,]               0.042
## [4,]               0.412
## [5,]               0.908
## [6,]               0.402

# Show the names of the elements in the SVM prediction object.
print(names(svm_predictions))

## [1] "pred"            "library.predict"

# First six predicted probabilities from the SVM SuperLearner.
head(svm_predictions[["pred"]])

##       [,1]
## [1,] 0.251
## [2,] 0.313
## [3,] 0.263
## [4,] 0.343
## [5,] 0.545
## [6,] 0.272

# First six rows of the per-learner predictions (one column per learner).
head(svm_predictions[["library.predict"]])

##      SL.svm_All
## [1,]      0.251
## [2,]      0.313
## [3,]      0.263
## [4,]      0.343
## [5,]      0.545
## [6,]      0.272

# Show the names of the elements in the ensemble prediction object.
print(names(ensemble_predictions))

## [1] "pred"            "library.predict"

# First six predicted probabilities from the ensemble.
head(ensemble_predictions[["pred"]])

##        [,1]
## [1,] 0.1194
## [2,] 0.3168
## [3,] 0.0622
## [4,] 0.5481
## [5,] 0.6418
## [6,] 0.3772

# First six rows of the per-learner predictions (one column per learner).
head(ensemble_predictions[["library.predict"]])

##      SL.randomForest_All SL.svm_All SL.glm_All SL.ipredbagg_All SL.bayesglm_All
## [1,]               0.038      0.239     0.1438            0.173          0.1464
## [2,]               0.130      0.307     0.3969            0.255          0.3952
## [3,]               0.046      0.251     0.0557            0.112          0.0577
## [4,]               0.415      0.340     0.6179            0.493          0.6129
## [5,]               0.896      0.567     0.5515            0.784          0.5473
## [6,]               0.421      0.262     0.3623            0.410          0.3617
##      SL.mean_All SL.lm_All
## [1,]       0.309     0.153
## [2,]       0.309     0.403
## [3,]       0.309     0.000
## [4,]       0.309     0.569
## [5,]       0.309     0.522
## [6,]       0.309     0.376

# Recode probabilities: a predicted probability of at least 0.5 becomes
# class 1, anything lower becomes class 0.
rf_conv.preds <- ifelse(rf_predictions$pred >= 0.5, 1, 0)
svm_conv.preds <- ifelse(svm_predictions$pred >= 0.5, 1, 0)
# The ensemble classes must come from the ensemble's own predictions, not the
# random forest's; otherwise every ensemble metric computed from this object
# merely repeats the random forest result.
ensemble_conv.preds <- ifelse(ensemble_predictions$pred >= 0.5, 1, 0)

Confusion Matrix

Below are the confusion matrices for the three models.

# Cross-tabulate the random forest's predicted classes against the observed
# test outcomes and print caret's summary statistics.
rf_cm <- caret::confusionMatrix(
  data = factor(rf_conv.preds),
  reference = factor(test$admit)
)
print(rf_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 48 25
##          1  4  3
##                                         
##                Accuracy : 0.637         
##                  95% CI : (0.522, 0.742)
##     No Information Rate : 0.65          
##     P-Value [Acc > NIR] : 0.641170      
##                                         
##                   Kappa : 0.037         
##                                         
##  Mcnemar's Test P-Value : 0.000204      
##                                         
##             Sensitivity : 0.923         
##             Specificity : 0.107         
##          Pos Pred Value : 0.658         
##          Neg Pred Value : 0.429         
##              Prevalence : 0.650         
##          Detection Rate : 0.600         
##    Detection Prevalence : 0.912         
##       Balanced Accuracy : 0.515         
##                                         
##        'Positive' Class : 0             
##

# Cross-tabulate the SVM's predicted classes against the observed test
# outcomes and print caret's summary statistics.
svm_cm <- caret::confusionMatrix(
  data = factor(svm_conv.preds),
  reference = factor(test$admit)
)
print(svm_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 51 27
##          1  1  1
##                                         
##                Accuracy : 0.65          
##                  95% CI : (0.535, 0.753)
##     No Information Rate : 0.65          
##     P-Value [Acc > NIR] : 0.551         
##                                         
##                   Kappa : 0.021         
##                                         
##  Mcnemar's Test P-Value : 0.00000231    
##                                         
##             Sensitivity : 0.9808        
##             Specificity : 0.0357        
##          Pos Pred Value : 0.6538        
##          Neg Pred Value : 0.5000        
##              Prevalence : 0.6500        
##          Detection Rate : 0.6375        
##    Detection Prevalence : 0.9750        
##       Balanced Accuracy : 0.5082        
##                                         
##        'Positive' Class : 0             
##

# Cross-tabulate the classes stored in ensemble_conv.preds against the
# observed test outcomes and print caret's summary statistics.
ensemble_cm <- caret::confusionMatrix(
  data = factor(ensemble_conv.preds),
  reference = factor(test$admit)
)
print(ensemble_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 48 25
##          1  4  3
##                                         
##                Accuracy : 0.637         
##                  95% CI : (0.522, 0.742)
##     No Information Rate : 0.65          
##     P-Value [Acc > NIR] : 0.641170      
##                                         
##                   Kappa : 0.037         
##                                         
##  Mcnemar's Test P-Value : 0.000204      
##                                         
##             Sensitivity : 0.923         
##             Specificity : 0.107         
##          Pos Pred Value : 0.658         
##          Neg Pred Value : 0.429         
##              Prevalence : 0.650         
##          Detection Rate : 0.600         
##    Detection Prevalence : 0.912         
##       Balanced Accuracy : 0.515         
##                                         
##        'Positive' Class : 0             
##

Accuracy

Below is the test-set accuracy of each of the three models.

# Share of test rows whose class in rf_conv.preds equals the observed admit.
mean(test$admit == rf_conv.preds)

## [1] 0.637

# Share of test rows whose class in svm_conv.preds equals the observed admit.
mean(test$admit == svm_conv.preds)

## [1] 0.65

# Share of test rows whose class in ensemble_conv.preds equals the observed
# admit.
mean(test$admit == ensemble_conv.preds)

## [1] 0.637

Conclusion

For this project, we had to examine whether the ensemble model would improve on the two individual models that were run: Random Forest and the Support Vector Machines (SVM) classification model. Of the three models, the best was the SVM classification model, with 65% accuracy. In conclusion, the ensemble model did not improve on the two individual models, as its accuracy was lower than that of the SVM model. Note, however, that the ensemble results reported above are identical to the Random Forest results, so this comparison should be re-checked.