Load Packages

library(caret)
library(randomForest)

Example 1: Student Success Data

# Load the student data. stringsAsFactors is set explicitly: since R 4.0.0
# read.table() no longer converts strings to factors by default, and the
# categorical columns must be factors for summary() and randomForest() below.
ss <- read.table("data/student_success_data.csv", header = TRUE, sep = ",",
                 stringsAsFactors = TRUE)

# Drop incomplete rows, then keep only rows whose G3 value lies in [0, 20].
ss <- na.omit(ss)
ss <- ss[(ss$G3 >= 0) & (ss$G3 <= 20), ]

# Binary response: "0" when G3 is below 10, "1" otherwise.
ss$Passed <- factor(ifelse(ss$G3 < 10, 0, 1))

# G3 is removed because Passed is derived from it; G1, G2 and absences are
# also excluded from the predictor set.
ss$G1 <- NULL
ss$G2 <- NULL
ss$G3 <- NULL
ss$absences <- NULL

summary(ss)
 school    sex          age        address famsize   Pstatus      Medu      
 GP :462   F:297   Min.   :15.00   R:155   GT3:393   A: 45   Min.   :1.000  
 MHS:101   M:266   1st Qu.:16.00   U:408   LE3:170   T:518   1st Qu.:2.000  
                   Median :16.00                             Median :3.000  
                   Mean   :16.61                             Mean   :2.874  
                   3rd Qu.:18.00                             3rd Qu.:4.000  
                   Max.   :22.00                             Max.   :4.000  
      Fedu             Mjob           Fjob            reason      guardian  
 Min.   :1.000   at_home : 66   at_home : 31   course    :215   father:129  
 1st Qu.:2.000   health  : 39   health  : 25   home      :158   mother:398  
 Median :3.000   other   :184   other   :275   other     : 48   other : 36  
 Mean   :2.686   services:182   services:173   reputation:142               
 3rd Qu.:4.000   teacher : 92   teacher : 59                                
 Max.   :4.000                                                              
   traveltime      studytime        failures      schoolsup famsup     paid     activities
 Min.   :1.000   Min.   :1.000   Min.   :0.0000   no :507   no :254   no :332   no :262   
 1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   yes: 56   yes:309   yes:231   yes:301   
 Median :1.000   Median :2.000   Median :0.0000                                           
 Mean   :1.481   Mean   :1.986   Mean   :0.2842                                           
 3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:0.0000                                           
 Max.   :4.000   Max.   :4.000   Max.   :3.0000                                           
 nursery   higher    internet  romantic      famrel         freetime         goout      
 no :102   no : 22   no : 93   no :341   Min.   :1.000   Min.   :1.000   Min.   :1.000  
 yes:461   yes:541   yes:470   yes:222   1st Qu.:4.000   1st Qu.:3.000   1st Qu.:2.000  
                                         Median :4.000   Median :3.000   Median :3.000  
                                         Mean   :3.938   Mean   :3.213   Mean   :3.021  
                                         3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000  
                                         Max.   :5.000   Max.   :5.000   Max.   :5.000  
      Dalc            Walc           health     Passed 
 Min.   :1.000   Min.   :1.000   Min.   :1.00   0:200  
 1st Qu.:1.000   1st Qu.:1.000   1st Qu.:3.00   1:363  
 Median :1.000   Median :2.000   Median :4.00          
 Mean   :1.401   Mean   :2.176   Mean   :3.67          
 3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:5.00          
 Max.   :5.000   Max.   :5.000   Max.   :5.00          

Basic Example of a Random Forest

# Baseline forest on every predictor in ss, using the package defaults.
# The seed is fixed so the fit can be reproduced; importance = TRUE asks the
# fit to assess predictor importance as well.
set.seed(1)
m1 <- randomForest(Passed ~ ., data = ss, importance = TRUE)
print(m1)

Call:
 randomForest(formula = Passed ~ ., data = ss, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 5

        OOB estimate of  error rate: 16.7%
Confusion matrix:
    0   1 class.error
0 142  58  0.29000000
1  36 327  0.09917355

Tuning on Number of Trees

# Error rate against number of trees, one curve per column of m1$err.rate
# (the legend entries are taken from those column names).
plot(m1)
legend(
  "topright",
  legend = colnames(m1$err.rate),
  col = 1:3,
  lty = 1:3,
  cex = 1
)

# Lowest OOB error along the tree sequence and the forest size reaching it.
m1_opt_err_rate <- min(m1$err.rate[, "OOB"])
m1_opt_ntrees <- which.min(m1$err.rate[, "OOB"])

cat(
  "Optimal Number of Trees: ", m1_opt_ntrees, "\n",
  "Minimum Error Rate:      ", m1_opt_err_rate,
  sep = ""
)
Optimal Number of Trees: 124
Minimum Error Rate:      0.1527531
# Refit with the same seed, stopping at the tree count that minimised the
# OOB error of m1.
set.seed(1)
m2 <- randomForest(Passed ~ ., data = ss, ntree = m1_opt_ntrees,
                   importance = TRUE)
print(m2)

Call:
 randomForest(formula = Passed ~ ., data = ss, ntree = m1_opt_ntrees,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 124
No. of variables tried at each split: 5

        OOB estimate of  error rate: 15.28%
Confusion matrix:
    0   1 class.error
0 146  54  0.27000000
1  32 331  0.08815427

Training Accuracy

# Score the same rows the forest was fitted on. Because the data is passed
# explicitly, predict() uses the whole forest rather than out-of-bag votes,
# so this is an in-sample (optimistic) figure.
train_predict <- predict(m2, newdata = ss)
mean(ss$Passed == train_predict)
[1] 1

Tuning on mtry and ntree

# Grid search over mtry. For each candidate a 500-tree forest is fitted and
# two things are recorded: the lowest OOB error along the tree sequence and
# the number of trees at which it occurs.
# NOTE: despite "acc" in the variable names and "Accuracy" in the printed
# label, the stored values are OOB *error rates* (lower is better). The names
# and label are kept unchanged because later code and the captured output
# refer to them.
n_predictors <- ncol(ss) - 1  # every column of ss except the response

# Preallocate instead of growing the vectors with c() on each iteration.
oob_acc_list <- numeric(n_predictors)
opt_ntree_list <- integer(n_predictors)

for (i in seq_len(n_predictors)) {
  set.seed(1)  # same seed for every fit so candidates differ only in mtry
  temp_mod <- randomForest(Passed ~ ., ss, ntree = 500, importance = TRUE,
                           mtry = i)
  oob_err <- temp_mod$err.rate[, "OOB"]
  oob_acc_list[i] <- min(oob_err)
  opt_ntree_list[i] <- which.min(oob_err)
}

# Best mtry, the tree count that achieved its minimum, and that minimum.
opt_mtry <- which.min(oob_acc_list)
opt_ntree <- opt_ntree_list[opt_mtry]
min_oob_acc <- min(oob_acc_list)

cat("Optimal Value of mtry:  ", opt_mtry, "\n",
    "Optimal Value of ntree: ", opt_ntree, "\n",
    "Minimum OOB Accuracy:   ", min_oob_acc, sep="")
Optimal Value of mtry:  3
Optimal Value of ntree: 445
Minimum OOB Accuracy:   0.1420959
# Minimum OOB error rate reached for each candidate mtry. The values in
# oob_acc_list come from the forests' err.rate, so the axis is labelled as an
# error rate. The dashed red line marks the candidate with the lowest error.
mtry_values <- seq_along(oob_acc_list)
plot(mtry_values, oob_acc_list,
     xlab = "Value of mtry", ylab = "Minimum OOB Error Rate")
lines(mtry_values, oob_acc_list)
abline(v = which.min(oob_acc_list), col = "red", lty = 2, lwd = 1)

# Final forest, fitted with the ntree and mtry values reported by the
# mtry/ntree search and the same seed as that search.
set.seed(1)
m3 <- randomForest(
  Passed ~ .,
  data = ss,
  ntree = 445,
  mtry = 3,
  importance = TRUE
)
print(m3)

Call:
 randomForest(formula = Passed ~ ., data = ss, ntree = 445, mtry = 3,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 445
No. of variables tried at each split: 3

        OOB estimate of  error rate: 14.21%
Confusion matrix:
    0   1 class.error
0 145  55  0.27500000
1  25 338  0.06887052

Estimating Out-Of-Sample Performance

# 20-fold cross-validation of a 445-tree forest through caret, with mtry
# pinned to 3 by a one-row tuning grid.
set.seed(1)
train(
  Passed ~ .,
  data = ss,
  method = "rf",
  ntree = 445,
  trControl = trainControl(method = "cv", number = 20),
  tuneGrid = expand.grid(mtry = 3)
)
Random Forest 

563 samples
 29 predictor
  2 classes: '0', '1' 

No pre-processing
Resampling: Cross-Validated (20 fold) 
Summary of sample sizes: 535, 535, 535, 535, 535, 534, ... 
Resampling results:

  Accuracy   Kappa    
  0.8349754  0.6226192

Tuning parameter 'mtry' was held constant at a value of 3
# Same forest settings, but each tree is grown on 80% of the rows drawn
# without replacement instead of a bootstrap sample.
set.seed(1)
randomForest(
  Passed ~ .,
  data = ss,
  ntree = 445,
  mtry = 3,
  importance = TRUE,
  replace = FALSE,
  sampsize = floor(0.8 * nrow(ss))
)

Call:
 randomForest(formula = Passed ~ ., data = ss, ntree = 445, mtry = 3,      importance = TRUE, replace = FALSE, sampsize = floor(0.8 *          nrow(ss)))
               Type of random forest: classification
                     Number of trees: 445
No. of variables tried at each split: 3

        OOB estimate of  error rate: 14.92%
Confusion matrix:
    0   1 class.error
0 147  53  0.26500000
1  31 332  0.08539945
# Repeat the 80%-subsample forest under 20 different seeds and collect the
# OOB error of the complete forest from each run.
n_runs <- 20
n_trees <- 445  # used both as ntree and as the err.rate row that is read

# Preallocate instead of growing the vector with c() on each iteration.
error_rate_list <- numeric(n_runs)

for (i in seq_len(n_runs)) {
  set.seed(i)  # a different seed per run
  temp_mod <- randomForest(Passed ~ ., ss, ntree = n_trees, mtry = 3,
                           importance = TRUE, replace = FALSE,
                           sampsize = floor(0.8 * nrow(ss)))

  # The last row of err.rate is the error with all n_trees trees included.
  error_rate_list[i] <- temp_mod$err.rate[n_trees, "OOB"]
}

mean(error_rate_list)
[1] 0.1590586
# Spread of the OOB error rates collected in error_rate_list.
boxplot(x = error_rate_list)

Feature Importance

# Importance matrix stored on the fitted forest: one column per class,
# followed by MeanDecreaseAccuracy and MeanDecreaseGini.
m3[["importance"]]
                       0            1 MeanDecreaseAccuracy MeanDecreaseGini
school      0.0111720672 0.0025577942          0.005694658         3.155759
sex         0.0233632155 0.0012521691          0.009095097         5.437597
age         0.0264599152 0.0032338034          0.011469429        11.435674
address     0.0149627498 0.0023202583          0.006837335         4.639802
famsize     0.0199481289 0.0008092124          0.007596645         4.387064
Pstatus     0.0046869379 0.0001193006          0.001765697         2.380665
Medu        0.0541793952 0.0174901597          0.030577784        14.968563
Fedu        0.0491084012 0.0177485314          0.028881618        15.012602
Mjob        0.0599594814 0.0131151192          0.029743839        17.688261
Fjob        0.0293255539 0.0039668412          0.013065376        10.509067
re