Project 3: Random Forests (RF)

Author

Margaret Gatongi

Load Data

# Import data

dropout_data <- read.csv("project3_data.csv")
View(dropout_data)

# Assign variable labels using attr()

attr(dropout_data$SES, "label") <- "Socioeconomic Status (1-100)"
attr(dropout_data$ParentEdu, "label") <- "Parental Education Level (1-5)"
attr(dropout_data$Attendance, "label") <- "Attendance Rate (percent)"
attr(dropout_data$HomeworkHours, "label") <- "Hours Spent on Homework (per week)"
attr(dropout_data$Motivation, "label") <- "Student Motivation (1-15)"
attr(dropout_data$PeerSupport, "label") <- "Peer Support (0-15)"
attr(dropout_data$TestAnxiety, "label") <- "Test Anxiety (0-15)"
attr(dropout_data$ExtraCurricular, "label") <- "Extracurricular Activities (number)"
attr(dropout_data$MathSelfEfficacy, "label") <- "Math Self-Efficacy (0-15)"
#attr(dropout_data$FavoriteNumber, "label") <- "Favorite Number (Random Unrelated Variable)"
attr(dropout_data$Dropout, "label") <- "Dropout (1=Dropout; 0=NotDropout)"

# Check the structure of variables in this dataset

str(dropout_data)
'data.frame':   600 obs. of  10 variables:
 $ SES             : num  30.75 91.18 42.72 6.82 66.56 ...
  ..- attr(*, "label")= chr "Socioeconomic Status (1-100)"
 $ ParentEdu       : int  4 4 5 1 2 3 4 2 1 3 ...
  ..- attr(*, "label")= chr "Parental Education Level (1-5)"
 $ Attendance      : num  71.3 89 91 90.3 99 ...
  ..- attr(*, "label")= chr "Attendance Rate (percent)"
 $ HomeworkHours   : num  3.76 28.54 13.03 27.18 25.02 ...
  ..- attr(*, "label")= chr "Hours Spent on Homework (per week)"
 $ Motivation      : num  2.24 6.87 4.57 8.06 1.2 ...
  ..- attr(*, "label")= chr "Student Motivation (1-15)"
 $ PeerSupport     : num  10.765 15 11.69 0.421 5.392 ...
  ..- attr(*, "label")= chr "Peer Support (0-15)"
 $ TestAnxiety     : num  13.11 4.1 3.66 10.92 0 ...
  ..- attr(*, "label")= chr "Test Anxiety (0-15)"
 $ ExtraCurricular : int  3 11 5 7 2 5 11 0 15 1 ...
  ..- attr(*, "label")= chr "Extracurricular Activities (number)"
 $ MathSelfEfficacy: num  0 15 8.08 15 12.42 ...
  ..- attr(*, "label")= chr "Math Self-Efficacy (0-15)"
 $ Dropout         : int  0 0 0 0 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "Dropout (1=Dropout; 0=NotDropout)"

Convert DV to a Factor

# Convert Dropout to a Factor

dropout_data$Dropout <- as.factor(dropout_data$Dropout)

# Re-attach the variable label, since as.factor() drops custom attributes; it describes what the values 0 and 1 represent.

attr(dropout_data$Dropout, "label") <- "Dropout (1=Dropout; 0=NotDropout)"

# Check the structure of variables in this dataset

str(dropout_data)
'data.frame':   600 obs. of  10 variables:
 $ SES             : num  30.75 91.18 42.72 6.82 66.56 ...
  ..- attr(*, "label")= chr "Socioeconomic Status (1-100)"
 $ ParentEdu       : int  4 4 5 1 2 3 4 2 1 3 ...
  ..- attr(*, "label")= chr "Parental Education Level (1-5)"
 $ Attendance      : num  71.3 89 91 90.3 99 ...
  ..- attr(*, "label")= chr "Attendance Rate (percent)"
 $ HomeworkHours   : num  3.76 28.54 13.03 27.18 25.02 ...
  ..- attr(*, "label")= chr "Hours Spent on Homework (per week)"
 $ Motivation      : num  2.24 6.87 4.57 8.06 1.2 ...
  ..- attr(*, "label")= chr "Student Motivation (1-15)"
 $ PeerSupport     : num  10.765 15 11.69 0.421 5.392 ...
  ..- attr(*, "label")= chr "Peer Support (0-15)"
 $ TestAnxiety     : num  13.11 4.1 3.66 10.92 0 ...
  ..- attr(*, "label")= chr "Test Anxiety (0-15)"
 $ ExtraCurricular : int  3 11 5 7 2 5 11 0 15 1 ...
  ..- attr(*, "label")= chr "Extracurricular Activities (number)"
 $ MathSelfEfficacy: num  0 15 8.08 15 12.42 ...
  ..- attr(*, "label")= chr "Math Self-Efficacy (0-15)"
 $ Dropout         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
  ..- attr(*, "label")= chr "Dropout (1=Dropout; 0=NotDropout)"
# View Levels: In R, classification functions treat the first level of a factor outcome as the reference (negative) category. Here the first level is "0", so R will treat "NotDropout" (0) as the reference category and model the likelihood of being a dropout (1) relative to not being a dropout (0).

levels(dropout_data$Dropout)
[1] "0" "1"

When calculating performance measures such as the Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC), caret requires factor levels that are syntactically valid R variable names, because the levels become column names for the predicted class probabilities. Labels like "0" and "1" fail this requirement even when the variable is already a factor, so we rename them with clearer labels: "NotDropout" and "Dropout".

levels(dropout_data$Dropout) <- c("NotDropout", "Dropout")
levels(dropout_data$Dropout)
[1] "NotDropout" "Dropout"   

Split the Data into Training and Test sets

# Load caret library

library(caret)
Loading required package: ggplot2
Loading required package: lattice
# Set a seed for reproducibility

set.seed(668)

# Split the data

split <- createDataPartition(dropout_data$Dropout, p = 0.8, list = FALSE)

# Create Training and Test Sets

train_data <- dropout_data[split, ]
test_data <- dropout_data[-split, ]

# Check the dimensions of the training and test sets

dim(train_data)  # Should be around 480 rows (80% of 600 observations)
[1] 480  10
dim(test_data)   # Should be around 120 rows (20% of 600 observations)
[1] 120  10
# Ensure the Dropout variable is a factor (Double Check!)

str(train_data)
'data.frame':   480 obs. of  10 variables:
 $ SES             : num  30.7 91.2 42.7 47.8 46.9 ...
 $ ParentEdu       : int  4 4 5 4 2 1 1 3 5 1 ...
 $ Attendance      : num  71.3 89 91 78.7 100 ...
 $ HomeworkHours   : num  3.76 28.54 13.03 27.53 24.43 ...
 $ Motivation      : num  2.24 6.87 4.57 1.65 1 ...
 $ PeerSupport     : num  10.77 15 11.69 4.84 6.46 ...
 $ TestAnxiety     : num  13.11 4.1 3.66 2.35 15 ...
 $ ExtraCurricular : int  3 11 5 11 0 15 6 7 6 7 ...
 $ MathSelfEfficacy: num  0 15 8.08 0 9.69 ...
 $ Dropout         : Factor w/ 2 levels "NotDropout","Dropout": 1 1 1 1 1 1 1 1 1 2 ...
str(test_data)
'data.frame':   120 obs. of  10 variables:
 $ SES             : num  6.82 66.56 31.38 33.69 71.73 ...
 $ ParentEdu       : int  1 2 3 3 4 1 1 2 4 3 ...
 $ Attendance      : num  90.3 99 51.1 86.2 52.2 ...
 $ HomeworkHours   : num  27.2 25 30.5 46.4 17.2 ...
 $ Motivation      : num  8.06 1.2 4.74 3.51 13.41 ...
 $ PeerSupport     : num  0.421 5.392 13.828 13.306 11.548 ...
 $ TestAnxiety     : num  10.92 0 5.43 13.84 2.19 ...
 $ ExtraCurricular : int  7 2 5 1 7 0 0 6 3 5 ...
 $ MathSelfEfficacy: num  15 12.42 15 11.59 4.76 ...
 $ Dropout         : Factor w/ 2 levels "NotDropout","Dropout": 1 1 1 1 1 1 2 1 1 1 ...
# (Convert it if needed)
# train_data$Dropout <- as.factor(train_data$Dropout)
# test_data$Dropout <- as.factor(test_data$Dropout)
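
As an optional sanity check, createDataPartition() stratifies the split on the outcome, so the class proportions should be nearly identical in both sets. A small sketch:

# Optional check: class proportions should match across the stratified split
prop.table(table(train_data$Dropout))
prop.table(table(test_data$Dropout))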

Dealing with Class Imbalance using SMOTE

# Check Class Distribution

table(train_data$Dropout)

NotDropout    Dropout 
       432         48 
# Install and Load themis Package

#install.packages("themis")
library(themis)
Loading required package: recipes
Loading required package: dplyr

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Attaching package: 'recipes'
The following object is masked from 'package:stats':

    step
# Apply SMOTE

set.seed(668)

train_data_smote <- smotenc(train_data, var = "Dropout", k = 5, over_ratio = 1)

# Check Class Distribution After SMOTE

table(train_data_smote$Dropout)

NotDropout    Dropout 
       432        432 
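
For reference, the same oversampling can be written as a recipes step; the sketch below assumes step_smotenc() with neighbors = 5 (the k above) and is not part of the original workflow:

# Equivalent recipes-based formulation: step_smotenc() mirrors smotenc()
smote_rec <- recipe(Dropout ~ ., data = train_data)
smote_rec <- step_smotenc(smote_rec, Dropout, over_ratio = 1, neighbors = 5)
train_smote_alt <- bake(prep(smote_rec), new_data = NULL)
table(train_smote_alt$Dropout)  # Should match the balanced counts above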

10-fold cross-validation

# Install and load the MLmetrics package to calculate additional performance metrics

# install.packages("MLmetrics") # Uncomment to install the package if not installed
library(MLmetrics)

Attaching package: 'MLmetrics'
The following objects are masked from 'package:caret':

    MAE, RMSE
The following object is masked from 'package:base':

    Recall
# Set up 10-fold cross-validation with class probabilities enabled

cv10 <- trainControl(
  method = "cv",            # Perform cross-validation
  number = 10,              # Use 10 folds
  classProbs = TRUE,        # Enable class probabilities (needed for ROC/AUC)
  summaryFunction = multiClassSummary  # Evaluate performance using multiple metrics
)
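
Because Dropout is binary, caret's twoClassSummary is a common alternative to multiClassSummary; it reports ROC AUC, sensitivity, and specificity, and pairs with metric = "ROC" in train(). A sketch (not used below):

# Alternative control object for binary outcomes
cv10_binary <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,                 # Still required for ROC
  summaryFunction = twoClassSummary  # Reports ROC, Sens, Spec
)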

RF Fit

# Grid search for mtry

mtry <- expand.grid(
  .mtry = c(2, 3, 4, 5)   # Reasonable range around sqrt(p) or p/3
)
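
As a quick check on where that range comes from, the usual heuristics place mtry near the square root of the number of predictors for classification; a sketch:

# Heuristic starting points for mtry with the 9 predictors in this data
p <- ncol(train_data_smote) - 1  # 9 predictors (excluding Dropout)
floor(sqrt(p))  # sqrt(p): randomForest's classification default (3 here)
floor(p / 3)    # p/3: the regression default (also 3 here)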

# Train Random Forest model with cv10, multiple mtry values, and ntree = 500.

set.seed(668)

RF_fit <- train(
  Dropout ~ ., 
  data = train_data_smote,
  method = "rf", # Random Forests
  trControl = cv10,
  metric = "AUC",
  tuneGrid = mtry, # Grid for mtry tuning
  ntree = 500,  # Set ntree manually (e.g., 100, 300, 500, 1000)
  importance = TRUE  # Enable calculation of Mean Decrease Accuracy
)

OOB Error Rate

# Extract the OOB error rates for each number of trees

oob_error <- RF_fit$finalModel$err.rate[, "OOB"]

# Create the OOB error plot

plot(oob_error, type = "l", 
     xlab = "Number of Trees", ylab = "OOB Error Rate",
     main = "OOB Error Rate vs. Number of Trees",
     col = "blue", lwd = 2)

RF Results

randomForest::varImpPlot(RF_fit$finalModel) # Plots both Mean Decrease Accuracy and Mean Decrease Gini
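
To see which mtry value the grid search selected and how each candidate performed, the train object can be inspected directly; a sketch (the column names assume multiClassSummary's output):

# Selected tuning parameter and cross-validated metrics per mtry value
RF_fit$bestTune
RF_fit$results[, c("mtry", "AUC", "Accuracy", "Kappa")]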

RF Fit Final Model

randomForest::varImpPlot(RF_fit$finalModel, type = 1) # type = 1 restricts the plot to Mean Decrease Accuracy
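
The numeric values behind the plot are available via randomForest::importance(); a small sketch:

# Numeric Mean Decrease Accuracy values (type = 1 matches the plot above)
randomForest::importance(RF_fit$finalModel, type = 1)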

Evaluate Model Performance on the Test Set

# Predict Outcome Using Model on Test Data

predictions <- predict(RF_fit, newdata = test_data) # For a caret train object, predict() returns class labels (e.g., "Dropout", "NotDropout") by default, so no type argument is needed here.

# Create Confusion Matrix to Assess Model Performance

confusionMatrix(data=predictions, test_data$Dropout, positive = "Dropout", mode = "everything") # MUST specify positive = "Dropout"; otherwise, the default setting treats "NotDropout" as the positive class.
Confusion Matrix and Statistics

            Reference
Prediction   NotDropout Dropout
  NotDropout        107       9
  Dropout             1       3
                                          
               Accuracy : 0.9167          
                 95% CI : (0.8521, 0.9593)
    No Information Rate : 0.9             
    P-Value [Acc > NIR] : 0.33609         
                                          
                  Kappa : 0.3421          
                                          
 Mcnemar's Test P-Value : 0.02686         
                                          
            Sensitivity : 0.25000         
            Specificity : 0.99074         
         Pos Pred Value : 0.75000         
         Neg Pred Value : 0.92241         
              Precision : 0.75000         
                 Recall : 0.25000         
                     F1 : 0.37500         
             Prevalence : 0.10000         
         Detection Rate : 0.02500         
   Detection Prevalence : 0.03333         
      Balanced Accuracy : 0.62037         
                                          
       'Positive' Class : Dropout         
                                          

Predicted Probabilities

# Generate Predicted Probabilities for the Test Data

predicted_probs <- predict(RF_fit, newdata = test_data, type = "prob")[, 2] # Selects the second column, which contains the predicted probabilities for the "Dropout" class (class 1).
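
Given the low sensitivity in the confusion matrix above (0.25), one option is to lower the classification cutoff below the default 0.5 so that more at-risk students are flagged, at the cost of specificity. A sketch with a hypothetical cutoff of 0.3 (the value is illustrative, not tuned):

# Hypothetical lower cutoff (0.3) to trade specificity for sensitivity
custom_preds <- factor(ifelse(predicted_probs >= 0.3, "Dropout", "NotDropout"),
                       levels = levels(test_data$Dropout))
confusionMatrix(custom_preds, test_data$Dropout, positive = "Dropout")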

# The ROCR package is used to generate performance curves such as the ROC curve.

#install.packages("ROCR")
library(ROCR)

# Create the ROC Curve

pred <- prediction(predicted_probs, test_data$Dropout, label.ordering = c("NotDropout", "Dropout")) # This creates an object that stores the predicted probabilities (predicted_probs) and the true labels (test_data$Dropout). It’s required for generating performance metrics. label.ordering = c("NotDropout", "Dropout") tells R to treat "NotDropout" as the negative class (coded as 0) and "Dropout" as the positive class (coded as 1).

perf <- performance(pred, "tpr", "fpr") # This function generates the ROC curve, plotting the true positive rate (tpr) against the false positive rate (fpr).

Plot ROC Curve

# Plot the ROC Curve

plot(perf, col = "blue", main = "ROC Curve")
abline(a = 0, b = 1, col = "red", lty = 2) # Adds a red diagonal line to represent random guessing (where the true positive rate equals the false positive rate).

AUC

# Calculate AUC (Area Under the Curve)

auc <- performance(pred, "auc") # Calculates the AUC
auc_value <- auc@y.values[[1]] # Extracts the AUC value.
print(paste("AUC =", auc_value)) # Prints the AUC value
[1] "AUC = 0.907021604938272"

Refit the CART model so the CART and Random Forest results can be compared simultaneously. Below is the previous code used for CART.

# Install and load the rpart package

#install.packages("rpart")
library(rpart)

# Fit the CART model using the rpart function

set.seed(668)

CART_final <- rpart(formula = Dropout ~ ., 
                data = train_data_smote,
                method = "class", 
                control = rpart.control(cp = 0.03888889),
                parms = list(split = "gini"))


# Generate Predicted Probabilities for the Test Data

CART_predicted_probs <- predict(CART_final, newdata = test_data, type = "prob")[, 2] # Selects the second column, which contains the predicted probabilities for the "Dropout" class (class 1).

# The ROCR package is used to generate performance curves such as the ROC curve.

#install.packages("ROCR")
library(ROCR)

# Create the ROC Curve

CART_pred <- prediction(CART_predicted_probs, test_data$Dropout, label.ordering = c("NotDropout", "Dropout")) # This creates an object that stores the predicted probabilities (CART_predicted_probs) and the true labels (test_data$Dropout). It’s required for generating performance metrics. label.ordering = c("NotDropout", "Dropout") tells R to treat "NotDropout" as the negative class (coded as 0) and "Dropout" as the positive class (coded as 1).

CART_perf <- performance(CART_pred, "tpr", "fpr") # This function generates the ROC curve, plotting the true positive rate (tpr) against the false positive rate (fpr).

Decision Tree ROC plus Random Forest ROC

# Plot the ROC Curve FOR Random Forest  (First Plot from RF)

plot(perf, col = "blue", main = "ROC Curve")
abline(a = 0, b = 1, col = "red", lty = 2) # Adds a red diagonal line to represent random guessing (where the true positive rate equals the false positive rate).

# Add the ROC curve for Decision Tree (Add another plot for Decision Tree)

plot(CART_perf, col = "purple", add = TRUE) # add = TRUE overlays this curve on the existing plot.

# Add a legend

legend("bottomright", legend = c("Random Forest", "Decision Tree"),
       col = c("blue", "purple"), lwd = 1)

Acknowledgement

This document was created by the author with assistance from class material provided to the EDUS 668 class by Dr. Chi-Ning Chang. The code and notes within the code blocks were provided by Dr. Chi-Ning Chang.