Introduction

The task was to examine the admissions data and use it to build a prediction model for new admissions. We compare three predictive models (Random Forest, Support Vector Machines, and an Ensemble model) to find the one that performs best. The code below reads the data set, explores it, and fits the three models.

# Load the admissions data and take a first look at its contents.
data <- read.csv(
  file = "binary-1.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
head(data, n = 3)
##   admit gre  gpa rank id
## 1     0 380 3.61    3  1
## 2     1 660 3.67    3  2
## 3     1 800 4.00    1  3
str(data)
## 'data.frame':    400 obs. of  5 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
##  $ id   : int  1 2 3 4 5 6 7 8 9 10 ...
names(data)
## [1] "admit" "gre"   "gpa"   "rank"  "id"
# Store the outcome as double instead of integer, then count each class.
data$admit <- as.numeric(data$admit)
table(as.numeric(data$admit))
## 
##   0   1 
## 273 127
# Create training and testing subsets
# 80% of the rows are sampled for training; the test set is every row whose
# id does not appear in the training sample.
set.seed(123456)
train <- sample_frac(data, 0.8)
test <- anti_join(data, train, by = "id")

# id was only needed to tell the two subsets apart, so drop it everywhere.
data <- dplyr::select(data, -id)
train <- dplyr::select(train, -id)
test <- dplyr::select(test, -id)

Fitting the Models

1. Random Forest

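Next, we fit the Random Forest model on the training set. Its cross-validated risk is about 0.20 (0.197); note that this risk is a loss measure for which lower is better, not a classification accuracy.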

# Fit a SuperLearner whose library contains only the random-forest wrapper.
# After id was dropped, columns 2:4 of train are the predictors gre, gpa and
# rank; admit is the outcome.
set.seed(123456)
rf.model <- SuperLearner(
  Y = train$admit,
  X = train[, 2:4],
  family = binomial(),
  SL.library = list("SL.randomForest")
)
## Loading required namespace: randomForest
# Print the fitted object: the call plus the Risk and Coef of each learner.
rf.model
## 
## Call:  
## SuperLearner(Y = train$admit, X = train[, 2:4], family = binomial(), SL.library = list("SL.randomForest")) 
## 
## 
## 
##                      Risk Coef
## SL.randomForest_All 0.197    1

2. SVM Classification Model

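Next, we fit the Support Vector Machines model on the training set. Its cross-validated risk is about 0.21 (0.209); note that this risk is a loss measure for which lower is better, not a classification accuracy.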

# Fit a SuperLearner whose library contains only the SVM wrapper, using the
# same seed, outcome (admit) and predictors (columns 2:4 of train) as the
# random-forest fit.
set.seed(123456)
svm.model <- SuperLearner(
  Y = train$admit,
  X = train[, 2:4],
  family = binomial(),
  SL.library = list("SL.svm")
)
## Loading required namespace: e1071
# Print the fitted object: the call plus the Risk and Coef of the learner.
svm.model
## 
## Call:  
## SuperLearner(Y = train$admit, X = train[, 2:4], family = binomial(), SL.library = list("SL.svm")) 
## 
## 
## 
##             Risk Coef
## SL.svm_All 0.209    1

3. The Ensemble Model

Next, we fit the Ensemble model on the training set, adding several more learners to its library. In the test-set evaluation reported below, the accuracy of the Ensemble model did not exceed that of the Support Vector Machines model alone.

# Fit the ensemble: a SuperLearner that combines seven candidate learners.
# The Coef column of the printed object shows the weight given to each
# learner; learners with a Coef of 0 do not contribute to the ensemble.
set.seed(123456)
model <- SuperLearner(
  Y = train$admit,
  X = train[, 2:4],
  family = binomial(),
  SL.library = c(
    "SL.randomForest",
    "SL.svm",
    "SL.glm",
    "SL.ipredbagg",
    "SL.bayesglm",
    "SL.mean",
    "SL.lm"
  )
)
## Loading required namespace: ipred
# Print the fitted ensemble: the call plus the Risk and Coef of each learner.
model
## 
## Call:  
## SuperLearner(Y = train$admit, X = train[, 2:4], family = binomial(), SL.library = c("SL.randomForest",  
##     "SL.svm", "SL.glm", "SL.ipredbagg", "SL.bayesglm", "SL.mean", "SL.lm")) 
## 
## 
##                      Risk   Coef
## SL.randomForest_All 0.197 0.2881
## SL.svm_All          0.209 0.0000
## SL.glm_All          0.187 0.6752
## SL.ipredbagg_All    0.193 0.0000
## SL.bayesglm_All     0.187 0.0000
## SL.mean_All         0.216 0.0367
## SL.lm_All           0.189 0.0000

Predictions

Next, we generate predicted probabilities on the test set with each of the three models and recode them into class labels using a 0.5 threshold.

# Score the held-out test predictors (columns 2:4 of test) with each fitted
# model. predict() dispatches to the SuperLearner method for these objects.
rf_predictions <- predict(rf.model, newdata = test[, 2:4])
svm_predictions <- predict(svm.model, newdata = test[, 2:4])
ensemble_predictions <- predict(model, newdata = test[, 2:4])

# Probabilities
# Each prediction object is a list with two elements: "pred" (the combined
# prediction) and "library.predict" (one column per learner in the library).

# Random forest
names(rf_predictions)
## [1] "pred"            "library.predict"
head(rf_predictions[["pred"]])
##       [,1]
## [1,] 0.050
## [2,] 0.118
## [3,] 0.042
## [4,] 0.412
## [5,] 0.908
## [6,] 0.402
head(rf_predictions[["library.predict"]])
##      SL.randomForest_All
## [1,]               0.050
## [2,]               0.118
## [3,]               0.042
## [4,]               0.412
## [5,]               0.908
## [6,]               0.402

# Support vector machine
names(svm_predictions)
## [1] "pred"            "library.predict"
head(svm_predictions[["pred"]])
##       [,1]
## [1,] 0.251
## [2,] 0.313
## [3,] 0.263
## [4,] 0.343
## [5,] 0.545
## [6,] 0.272
head(svm_predictions[["library.predict"]])
##      SL.svm_All
## [1,]      0.251
## [2,]      0.313
## [3,]      0.263
## [4,]      0.343
## [5,]      0.545
## [6,]      0.272

# Ensemble
names(ensemble_predictions)
## [1] "pred"            "library.predict"
head(ensemble_predictions[["pred"]])
##        [,1]
## [1,] 0.1194
## [2,] 0.3168
## [3,] 0.0622
## [4,] 0.5481
## [5,] 0.6418
## [6,] 0.3772
head(ensemble_predictions[["library.predict"]])
##      SL.randomForest_All SL.svm_All SL.glm_All SL.ipredbagg_All SL.bayesglm_All
## [1,]               0.038      0.239     0.1438            0.173          0.1464
## [2,]               0.130      0.307     0.3969            0.255          0.3952
## [3,]               0.046      0.251     0.0557            0.112          0.0577
## [4,]               0.415      0.340     0.6179            0.493          0.6129
## [5,]               0.896      0.567     0.5515            0.784          0.5473
## [6,]               0.421      0.262     0.3623            0.410          0.3617
##      SL.mean_All SL.lm_All
## [1,]       0.309     0.153
## [2,]       0.309     0.403
## [3,]       0.309     0.000
## [4,]       0.309     0.569
## [5,]       0.309     0.522
## [6,]       0.309     0.376
# Recode probabilities
# Turn each model's predicted probabilities into 0/1 class labels: a
# probability of at least 0.5 is labelled 1, anything lower is labelled 0.
rf_conv.preds <- ifelse(rf_predictions$pred >= 0.5, 1, 0)
svm_conv.preds <- ifelse(svm_predictions$pred >= 0.5, 1, 0)
# The ensemble labels must come from ensemble_predictions. They were
# previously derived from rf_predictions, which made every ensemble result a
# copy of the random-forest result.
# NOTE: the ensemble confusion matrix and accuracy printed further down were
# produced before this correction and need to be regenerated.
ensemble_conv.preds <- ifelse(ensemble_predictions$pred >= 0.5, 1, 0)

Confusion Matrix

Following we show the confusion matrix for the three models.

# Cross-tabulate the random-forest class labels against the observed outcomes.
# Both factors are pinned to the levels 0 and 1 so that predictions and
# reference always share the same level set; as.factor() infers levels from
# the values present and would drop a class the model never predicts.
# The positive class is left at its default, which the printed output reports
# as 0.
rf_cm <- caret::confusionMatrix(
  data = factor(rf_conv.preds, levels = c(0, 1)),
  reference = factor(test$admit, levels = c(0, 1))
)
rf_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 48 25
##          1  4  3
##                                         
##                Accuracy : 0.637         
##                  95% CI : (0.522, 0.742)
##     No Information Rate : 0.65          
##     P-Value [Acc > NIR] : 0.641170      
##                                         
##                   Kappa : 0.037         
##                                         
##  Mcnemar's Test P-Value : 0.000204      
##                                         
##             Sensitivity : 0.923         
##             Specificity : 0.107         
##          Pos Pred Value : 0.658         
##          Neg Pred Value : 0.429         
##              Prevalence : 0.650         
##          Detection Rate : 0.600         
##    Detection Prevalence : 0.912         
##       Balanced Accuracy : 0.515         
##                                         
##        'Positive' Class : 0             
## 
# Cross-tabulate the SVM class labels against the observed outcomes.
# Both factors are pinned to the levels 0 and 1 so that predictions and
# reference always share the same level set; as.factor() infers levels from
# the values present and would drop a class the model never predicts.
# The positive class is left at its default, which the printed output reports
# as 0.
svm_cm <- caret::confusionMatrix(
  data = factor(svm_conv.preds, levels = c(0, 1)),
  reference = factor(test$admit, levels = c(0, 1))
)
svm_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 51 27
##          1  1  1
##                                         
##                Accuracy : 0.65          
##                  95% CI : (0.535, 0.753)
##     No Information Rate : 0.65          
##     P-Value [Acc > NIR] : 0.551         
##                                         
##                   Kappa : 0.021         
##                                         
##  Mcnemar's Test P-Value : 0.00000231    
##                                         
##             Sensitivity : 0.9808        
##             Specificity : 0.0357        
##          Pos Pred Value : 0.6538        
##          Neg Pred Value : 0.5000        
##              Prevalence : 0.6500        
##          Detection Rate : 0.6375        
##    Detection Prevalence : 0.9750        
##       Balanced Accuracy : 0.5082        
##                                         
##        'Positive' Class : 0             
## 
# Cross-tabulate the ensemble class labels against the observed outcomes.
# Both factors are pinned to the levels 0 and 1 so that predictions and
# reference always share the same level set; as.factor() infers levels from
# the values present and would drop a class the model never predicts.
# The positive class is left at its default, which the printed output reports
# as 0.
ensemble_cm <- caret::confusionMatrix(
  data = factor(ensemble_conv.preds, levels = c(0, 1)),
  reference = factor(test$admit, levels = c(0, 1))
)
ensemble_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 48 25
##          1  4  3
##                                         
##                Accuracy : 0.637         
##                  95% CI : (0.522, 0.742)
##     No Information Rate : 0.65          
##     P-Value [Acc > NIR] : 0.641170      
##                                         
##                   Kappa : 0.037         
##                                         
##  Mcnemar's Test P-Value : 0.000204      
##                                         
##             Sensitivity : 0.923         
##             Specificity : 0.107         
##          Pos Pred Value : 0.658         
##          Neg Pred Value : 0.429         
##              Prevalence : 0.650         
##          Detection Rate : 0.600         
##    Detection Prevalence : 0.912         
##       Balanced Accuracy : 0.515         
##                                         
##        'Positive' Class : 0             
## 

Accuracy

The test-set accuracy of each of the three models is shown below.

# Accuracy: the share of test observations whose predicted class label equals
# the observed outcome, computed separately for each model.
mean(test$admit == rf_conv.preds)
## [1] 0.637
mean(test$admit == svm_conv.preds)
## [1] 0.65
mean(test$admit == ensemble_conv.preds)
## [1] 0.637

Conclusion

For this project, we examined whether the ensemble model improves on the two individual models that were run: Random Forest and the Support Vector Machines (SVM) classification model. Of the three models, the best was the SVM classification model, with 65% accuracy. In conclusion, the ensemble model did not improve on the two individual models, as its accuracy was lower than that of the SVM model. Note, however, that the ensemble's reported confusion matrix and accuracy are identical to those of the Random Forest model, so this comparison should be re-checked against the ensemble's own predictions.