Project 3: Random Forests (RF)

Author

Margaret Gatongi

Load Data

# Import data

dropout_data <- read.csv("project3_data.csv")
View(dropout_data)

# Assign variable labels using attr()

attr(dropout_data$SES, "label") <- "Socioeconomic Status (1-100)"
attr(dropout_data$ParentEdu, "label") <- "Parental Education Level (1-5)"
attr(dropout_data$Attendance, "label") <- "Attendance Rate (percent)"
attr(dropout_data$HomeworkHours, "label") <- "Hours Spent on Homework (per week)"
attr(dropout_data$Motivation, "label") <- "Student Motivation (1-15)"
attr(dropout_data$PeerSupport, "label") <- "Peer Support (0-15)"
attr(dropout_data$TestAnxiety, "label") <- "Test Anxiety (0-15)"
attr(dropout_data$ExtraCurricular, "label") <- "Extracurricular Activities (number)"
attr(dropout_data$MathSelfEfficacy, "label") <- "Math Self-Efficacy (0-15)"
#attr(dropout_data$FavoriteNumber, "label") <- "Favorite Number (Random Unrelated Variable)"
attr(dropout_data$Dropout, "label") <- "Dropout (1=Dropout; 0=NotDropout)"

# Check the structure of variables in this dataset

str(dropout_data)
'data.frame':   600 obs. of  10 variables:
 $ SES             : num  30.75 91.18 42.72 6.82 66.56 ...
  ..- attr(*, "label")= chr "Socioeconomic Status (1-100)"
 $ ParentEdu       : int  4 4 5 1 2 3 4 2 1 3 ...
  ..- attr(*, "label")= chr "Parental Education Level (1-5)"
 $ Attendance      : num  71.3 89 91 90.3 99 ...
  ..- attr(*, "label")= chr "Attendance Rate (percent)"
 $ HomeworkHours   : num  3.76 28.54 13.03 27.18 25.02 ...
  ..- attr(*, "label")= chr "Hours Spent on Homework (per week)"
 $ Motivation      : num  2.24 6.87 4.57 8.06 1.2 ...
  ..- attr(*, "label")= chr "Student Motivation (1-15)"
 $ PeerSupport     : num  10.765 15 11.69 0.421 5.392 ...
  ..- attr(*, "label")= chr "Peer Support (0-15)"
 $ TestAnxiety     : num  13.11 4.1 3.66 10.92 0 ...
  ..- attr(*, "label")= chr "Test Anxiety (0-15)"
 $ ExtraCurricular : int  3 11 5 7 2 5 11 0 15 1 ...
  ..- attr(*, "label")= chr "Extracurricular Activities (number)"
 $ MathSelfEfficacy: num  0 15 8.08 15 12.42 ...
  ..- attr(*, "label")= chr "Math Self-Efficacy (0-15)"
 $ Dropout         : int  0 0 0 0 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "Dropout (1=Dropout; 0=NotDropout)"

Convert DV to a Factor

# Convert Dropout to a Factor

dropout_data$Dropout <- as.factor(dropout_data$Dropout)

# Re-attach the variable label, since as.factor() drops custom attributes; it describes what the values 0 and 1 represent.

attr(dropout_data$Dropout, "label") <- "Dropout (1=Dropout; 0=NotDropout)"

# Check the structure of variables in this dataset

str(dropout_data)
'data.frame':   600 obs. of  10 variables:
 $ SES             : num  30.75 91.18 42.72 6.82 66.56 ...
  ..- attr(*, "label")= chr "Socioeconomic Status (1-100)"
 $ ParentEdu       : int  4 4 5 1 2 3 4 2 1 3 ...
  ..- attr(*, "label")= chr "Parental Education Level (1-5)"
 $ Attendance      : num  71.3 89 91 90.3 99 ...
  ..- attr(*, "label")= chr "Attendance Rate (percent)"
 $ HomeworkHours   : num  3.76 28.54 13.03 27.18 25.02 ...
  ..- attr(*, "label")= chr "Hours Spent on Homework (per week)"
 $ Motivation      : num  2.24 6.87 4.57 8.06 1.2 ...
  ..- attr(*, "label")= chr "Student Motivation (1-15)"
 $ PeerSupport     : num  10.765 15 11.69 0.421 5.392 ...
  ..- attr(*, "label")= chr "Peer Support (0-15)"
 $ TestAnxiety     : num  13.11 4.1 3.66 10.92 0 ...
  ..- attr(*, "label")= chr "Test Anxiety (0-15)"
 $ ExtraCurricular : int  3 11 5 7 2 5 11 0 15 1 ...
  ..- attr(*, "label")= chr "Extracurricular Activities (number)"
 $ MathSelfEfficacy: num  0 15 8.08 15 12.42 ...
  ..- attr(*, "label")= chr "Math Self-Efficacy (0-15)"
 $ Dropout         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
  ..- attr(*, "label")= chr "Dropout (1=Dropout; 0=NotDropout)"
# View Levels: In R, classification functions treat the first level of a factor outcome as the reference (negative) category. Here the first level is "0", so R will treat "NotDropout" (0) as the reference category and model the likelihood of being a dropout (1) relative to not being a dropout (0).

levels(dropout_data$Dropout)
[1] "0" "1"

When calculating performance measures such as the Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC), caret requires factor levels that are syntactically valid R variable names, because the levels become column names for the predicted class probabilities. Labels like "0" and "1" fail this requirement even when the variable is already a factor, so we rename them with clearer labels: "NotDropout" and "Dropout".

levels(dropout_data$Dropout) <- c("NotDropout", "Dropout")
levels(dropout_data$Dropout)
[1] "NotDropout" "Dropout"   

Split the Data into Training and Test sets

# Load caret library

library(caret)
Loading required package: ggplot2
Loading required package: lattice
# Set a seed for reproducibility

set.seed(668)

# Split the data

split <- createDataPartition(dropout_data$Dropout, p = 0.8, list = FALSE)

# Create Training and Test Sets

train_data <- dropout_data[split, ]
test_data <- dropout_data[-split, ]

# Check the dimensions of the training and test sets

dim(train_data)  # Should be around 480 rows (80% of 600 observations)
[1] 480  10
dim(test_data)   # Should be around 120 rows (20% of 600 observations)
[1] 120  10
# Ensure the Dropout variable is a factor (Double Check!)

str(train_data)
'data.frame':   480 obs. of  10 variables:
 $ SES             : num  30.7 91.2 42.7 47.8 46.9 ...
 $ ParentEdu       : int  4 4 5 4 2 1 1 3 5 1 ...
 $ Attendance      : num  71.3 89 91 78.7 100 ...
 $ HomeworkHours   : num  3.76 28.54 13.03 27.53 24.43 ...
 $ Motivation      : num  2.24 6.87 4.57 1.65 1 ...
 $ PeerSupport     : num  10.77 15 11.69 4.84 6.46 ...
 $ TestAnxiety     : num  13.11 4.1 3.66 2.35 15 ...
 $ ExtraCurricular : int  3 11 5 11 0 15 6 7 6 7 ...
 $ MathSelfEfficacy: num  0 15 8.08 0 9.69 ...
 $ Dropout         : Factor w/ 2 levels "NotDropout","Dropout": 1 1 1 1 1 1 1 1 1 2 ...
str(test_data)
'data.frame':   120 obs. of  10 variables:
 $ SES             : num  6.82 66.56 31.38 33.69 71.73 ...
 $ ParentEdu       : int  1 2 3 3 4 1 1 2 4 3 ...
 $ Attendance      : num  90.3 99 51.1 86.2 52.2 ...
 $ HomeworkHours   : num  27.2 25 30.5 46.4 17.2 ...
 $ Motivation      : num  8.06 1.2 4.74 3.51 13.41 ...
 $ PeerSupport     : num  0.421 5.392 13.828 13.306 11.548 ...
 $ TestAnxiety     : num  10.92 0 5.43 13.84 2.19 ...
 $ ExtraCurricular : int  7 2 5 1 7 0 0 6 3 5 ...
 $ MathSelfEfficacy: num  15 12.42 15 11.59 4.76 ...
 $ Dropout         : Factor w/ 2 levels "NotDropout","Dropout": 1 1 1 1 1 1 2 1 1 1 ...
# (Convert it if needed)
# train_data$Dropout <- as.factor(train_data$Dropout)
# test_data$Dropout <- as.factor(test_data$Dropout)
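
As an optional sanity check, createDataPartition() stratifies the split on the outcome, so the class proportions should be nearly identical in both sets. A small sketch:

# Optional check: class proportions should match across the stratified split
prop.table(table(train_data$Dropout))
prop.table(table(test_data$Dropout))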

Dealing with Class Imbalance using SMOTE

# Check Class Distribution

table(train_data$Dropout)

NotDropout    Dropout 
       432         48 
# Install and Load themis Package

#install.packages("themis")
library(themis)
Loading required package: recipes
Loading required package: dplyr

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Attaching package: 'recipes'
The following object is masked from 'package:stats':

    step
# Apply SMOTE

set.seed(668)

train_data_smote <- smotenc(train_data, var = "Dropout", k = 5, over_ratio = 1)

# Check Class Distribution After SMOTE

table(train_data_smote$Dropout)

NotDropout    Dropout 
       432        432 
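
For reference, the same oversampling can be written as a recipes step; the sketch below assumes step_smotenc() with neighbors = 5 (the k above) and is not part of the original workflow:

# Equivalent recipes-based formulation: step_smotenc() mirrors smotenc()
smote_rec <- recipe(Dropout ~ ., data = train_data)
smote_rec <- step_smotenc(smote_rec, Dropout, over_ratio = 1, neighbors = 5)
train_smote_alt <- bake(prep(smote_rec), new_data = NULL)
table(train_smote_alt$Dropout)  # Should match the balanced counts above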

10-fold cross-validation

# Install and load the MLmetrics package to calculate additional performance metrics

# install.packages("MLmetrics") # Uncomment to install the package if not installed
library(MLmetrics)

Attaching package: 'MLmetrics'
The following objects are masked from 'package:caret':

    MAE, RMSE
The following object is masked from 'package:base':

    Recall
# Set up 10-fold cross-validation with class probabilities enabled

cv10 <- trainControl(
  method = "cv",            # Perform cross-validation
  number = 10,              # Use 10 folds
  classProbs = TRUE,        # Enable class probabilities (needed for ROC/AUC)
  summaryFunction = multiClassSummary  # Evaluate performance using multiple metrics
)
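
Because Dropout is binary, caret's twoClassSummary is a common alternative to multiClassSummary; it reports ROC AUC, sensitivity, and specificity, and pairs with metric = "ROC" in train(). A sketch (not used below):

# Alternative control object for binary outcomes
cv10_binary <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,                 # Still required for ROC
  summaryFunction = twoClassSummary  # Reports ROC, Sens, Spec
)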

RF Fit

# Grid search for mtry

mtry <- expand.grid(
  .mtry = c(2, 3, 4, 5)   # Reasonable range around sqrt(p) or p/3
)
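
As a quick check on where that range comes from, the usual heuristics place mtry near the square root of the number of predictors for classification; a sketch:

# Heuristic starting points for mtry with the 9 predictors in this data
p <- ncol(train_data_smote) - 1  # 9 predictors (excluding Dropout)
floor(sqrt(p))  # sqrt(p): randomForest's classification default (3 here)
floor(p / 3)    # p/3: the regression default (also 3 here)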

# Train Random Forest model with cv10, multiple mtry values, and ntree = 500.

set.seed(668)

RF_fit <- train(
  Dropout ~ ., 
  data = train_data_smote,
  method = "rf", # Random Forests
  trControl = cv10,
  metric = "AUC",
  tuneGrid = mtry, # Grid for mtry tuning
  ntree = 500,  # Set ntree manually (e.g., 100, 300, 500, 1000)
  importance = TRUE  # Enable calculation of Mean Decrease Accuracy
)

OOB Error Rate

# Extract the OOB error rates for each number of trees

oob_error <- RF_fit$finalModel$err.rate[, "OOB"]

# Create the OOB error plot

plot(oob_error, type = "l", 
     xlab = "Number of Trees", ylab = "OOB Error Rate",
     main = "OOB Error Rate vs. Number of Trees",
     col = "blue", lwd = 2)

RF Results

randomForest::varImpPlot(RF_fit$finalModel) # Plots both Mean Decrease Accuracy and Mean Decrease Gini
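
To see which mtry value the grid search selected and how each candidate performed, the train object can be inspected directly; a sketch (the column names assume multiClassSummary's output):

# Selected tuning parameter and cross-validated metrics per mtry value
RF_fit$bestTune
RF_fit$results[, c("mtry", "AUC", "Accuracy", "Kappa")]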

RF Fit Final Model

randomForest::varImpPlot(RF_fit$finalModel, type = 1) # type = 1 restricts the plot to Mean Decrease Accuracy
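
The numeric values behind the plot are available via randomForest::importance(); a small sketch:

# Numeric Mean Decrease Accuracy values (type = 1 matches the plot above)
randomForest::importance(RF_fit$finalModel, type = 1)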

Evaluate Model Performance on the Test Set

# Predict Outcome Using Model on Test Data

predictions <- predict(RF_fit, newdata = test_data) # For a caret train object, predict() returns class labels (e.g., "Dropout", "NotDropout") by default, so no type argument is needed here.

# Create Confusion Matrix to Assess Model Performance

confusionMatrix(data=predictions, test_data$Dropout, positive = "Dropout", mode = "everything") # MUST specify positive = "Dropout"; otherwise, the default setting treats "NotDropout" as the positive class.
Confusion Matrix and Statistics

            Reference
Prediction   NotDropout Dropout
  NotDropout        107       9
  Dropout             1       3
                                          
               Accuracy : 0.9167          
                 95% CI : (0.8521, 0.9593)
    No Information Rate : 0.9             
    P-Value [Acc > NIR] : 0.33609         
                                          
                  Kappa : 0.3421          
                                          
 Mcnemar's Test P-Value : 0.02686         
                                          
            Sensitivity : 0.25000         
            Specificity : 0.99074         
         Pos Pred Value : 0.75000         
         Neg Pred Value : 0.92241         
              Precision : 0.75000         
                 Recall : 0.25000         
                     F1 : 0.37500         
             Prevalence : 0.10000         
         Detection Rate : 0.02500         
   Detection Prevalence : 0.03333         
      Balanced Accuracy : 0.62037         
                                          
       'Positive' Class : Dropout         
                                          

Predicted Probabilities

# Generate Predicted Probabilities for the Test Data

predicted_probs <- predict(RF_fit, newdata = test_data, type = "prob")[, 2] # Selects the second column, which contains the predicted probabilities for the "Dropout" class (class 1).
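
Given the low sensitivity in the confusion matrix above (0.25), one option is to lower the classification cutoff below the default 0.5 so that more at-risk students are flagged, at the cost of specificity. A sketch with a hypothetical cutoff of 0.3 (the value is illustrative, not tuned):

# Hypothetical lower cutoff (0.3) to trade specificity for sensitivity
custom_preds <- factor(ifelse(predicted_probs >= 0.3, "Dropout", "NotDropout"),
                       levels = levels(test_data$Dropout))
confusionMatrix(custom_preds, test_data$Dropout, positive = "Dropout")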

# The ROCR package is used to generate performance curves such as the ROC curve.

#install.packages("ROCR")
library(ROCR)

# Create the ROC Curve

pred <- prediction(predicted_probs, test_data$Dropout, label.ordering = c("NotDropout", "Dropout")) # This creates an object that stores the predicted probabilities (predicted_probs) and the true labels (test_data$Dropout). It’s required for generating performance metrics. label.ordering = c("NotDropout", "Dropout") tells R to treat "NotDropout" as the negative class (coded as 0) and "Dropout" as the positive class (coded as 1).

perf <- performance(pred, "tpr", "fpr") # This function generates the ROC curve, plotting the true positive rate (tpr) against the false positive rate (fpr).

Plot ROC Curve

# Plot the ROC Curve

plot(perf, col = "blue", main = "ROC Curve")
abline(a = 0, b = 1, col = "red", lty = 2) # Adds a red diagonal line to represent random guessing (where the true positive rate equals the false positive rate).

AUC

# Calculate AUC (Area Under the Curve)

auc <- performance(pred, "auc") # Calculates the AUC
auc_value <- auc@y.values[[1]] # Extracts the AUC value.
print(paste("AUC =", auc_value)) # Prints the AUC value
[1] "AUC = 0.907021604938272"

Refit the CART model so the CART and Random Forest results can be compared simultaneously. Below is the previous code used for CART.

# Install and load the rpart package

#install.packages("rpart")
library(rpart)

# Fit the CART model using the rpart function

set.seed(668)

CART_final <- rpart(formula = Dropout ~ ., 
                data = train_data_smote,
                method = "class", 
                control = rpart.control(cp = 0.03888889),
                parms = list(split = "gini"))


# Generate Predicted Probabilities for the Test Data

CART_predicted_probs <- predict(CART_final, newdata = test_data, type = "prob")[, 2] # Selects the second column, which contains the predicted probabilities for the "Dropout" class (class 1).

# The ROCR package is used to generate performance curves such as the ROC curve.

#install.packages("ROCR")
library(ROCR)

# Create the ROC Curve

CART_pred <- prediction(CART_predicted_probs, test_data$Dropout, label.ordering = c("NotDropout", "Dropout")) # This creates an object that stores the predicted probabilities (CART_predicted_probs) and the true labels (test_data$Dropout). It’s required for generating performance metrics. label.ordering = c("NotDropout", "Dropout") tells R to treat "NotDropout" as the negative class (coded as 0) and "Dropout" as the positive class (coded as 1).

CART_perf <- performance(CART_pred, "tpr", "fpr") # This function generates the ROC curve, plotting the true positive rate (tpr) against the false positive rate (fpr).

Decision Tree ROC plus Random Forest ROC

# Plot the ROC Curve FOR Random Forest  (First Plot from RF)

plot(perf, col = "blue", main = "ROC Curve")
abline(a = 0, b = 1, col = "red", lty = 2) # Adds a red diagonal line to represent random guessing (where the true positive rate equals the false positive rate).

# Add the ROC curve for Decision Tree (Add another plot for Decision Tree)

plot(CART_perf, col = "purple", add = TRUE) # add = TRUE overlays this curve on the existing plot.

# Add a legend

legend("bottomright", legend = c("Random Forest", "Decision Tree"),
       col = c("blue", "purple"), lwd = 1)

Acknowledgement

This document was created by the author with assistance from class material provided to the EDUS 668 class by Dr. Chi-Ning Chang. The code and notes within the code blocks were provided by Dr. Chi-Ning Chang.