# Convert Dropout to a factor
dropout_data$Dropout <- as.factor(dropout_data$Dropout)

# Provide a description of what the values 0 and 1 represent
attr(dropout_data$Dropout, "label") <- "Dropout (1=Dropout; 0=NotDropout)"

# Check the structure of variables in this dataset
str(dropout_data)
# View levels: In R, when performing logistic regression with a factor variable, the
# first level of the factor is treated as the reference category. This means that R
# will treat "NotDropout" (0) as the reference category and will model the log-odds
# of being a dropout (1) compared to not being a dropout (0).
levels(dropout_data$Dropout)
[1] "0" "1"
When calculating performance measures such as the Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC), R runs into problems if the factor levels are "0" and "1": these are not valid R variable names, which causes errors when class probabilities are generated (for example, in caret). To avoid this, we rename the levels with clearer labels, "NotDropout" and "Dropout", as sketched below.
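The recoding step itself is not shown in the output above; a minimal sketch, assuming the factor levels are currently "0" and "1" in that order, is to overwrite the level names directly:

# Rename the factor levels to valid R variable names
# (order matters: the first level "0" becomes "NotDropout", the second "1" becomes "Dropout")
levels(dropout_data$Dropout) <- c("NotDropout", "Dropout")
levels(dropout_data$Dropout)  # Should now print "NotDropout" "Dropout"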
# Set a seed for reproducibility
set.seed(668)

# Split the data
split <- createDataPartition(dropout_data$Dropout, p = 0.8, list = FALSE)

# Create training and test sets
train_data <- dropout_data[split, ]
test_data  <- dropout_data[-split, ]

# Check the dimensions of the training and test sets
dim(train_data)  # About 80% of the rows
[1] 480 10
dim(test_data)  # About 20% of the rows
[1] 120 10
# Ensure the Dropout variable is a factor (double check!)
str(train_data)
# (Convert it if needed)
# train_data$Dropout <- as.factor(train_data$Dropout)
# test_data$Dropout  <- as.factor(test_data$Dropout)
Dealing with Class Imbalance using SMOTE
# Check class distribution
table(train_data$Dropout)
NotDropout Dropout
432 48
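To see the imbalance as proportions rather than counts, prop.table() can be wrapped around the same table (an optional check, not in the original code):

# Class proportions in the training set (roughly 90% NotDropout vs. 10% Dropout)
prop.table(table(train_data$Dropout))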
# Install and load the themis package
# install.packages("themis")   # Uncomment to install if not already installed
library(themis)
Loading required package: recipes
Loading required package: dplyr
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Attaching package: 'recipes'
The following object is masked from 'package:stats':
step
# Apply SMOTE
set.seed(668)
train_data_smote <- smotenc(train_data, var = "Dropout", k = 5, over_ratio = 1)

# Check class distribution after SMOTE
table(train_data_smote$Dropout)
NotDropout Dropout
432 432
10-fold cross-validation
# Install and load the MLmetrics package to calculate additional performance metrics
# install.packages("MLmetrics")   # Uncomment to install the package if not installed
library(MLmetrics)
Attaching package: 'MLmetrics'
The following objects are masked from 'package:caret':
MAE, RMSE
The following object is masked from 'package:base':
Recall
# Set up 10-fold cross-validation with class probabilities enabled
cv10 <- trainControl(
  method = "cv",                        # Perform cross-validation
  number = 10,                          # Use 10 folds
  classProbs = TRUE,                    # Enable class probabilities (needed for ROC/AUC)
  summaryFunction = multiClassSummary   # Evaluate performance using multiple metrics
)
RF Fit
# Grid search for mtry
mtry <- expand.grid(
  .mtry = c(2, 3, 4, 5)   # Reasonable range around sqrt(p) or p/3
)

# Train a Random Forest model with cv10, multiple mtry values, and ntree = 500
set.seed(668)
RF_fit <- train(
  Dropout ~ .,
  data = train_data_smote,
  method = "rf",        # Random Forests
  trControl = cv10,
  metric = "AUC",
  tuneGrid = mtry,      # Grid for mtry tuning
  ntree = 500,          # Set ntree manually (e.g., 100, 300, 500, 1000)
  importance = TRUE     # Enable calculation of Mean Decrease Accuracy
)
Print RF Fit
# Print the results
print(RF_fit)
Random Forest
864 samples
9 predictor
2 classes: 'NotDropout', 'Dropout'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 778, 776, 778, 778, 777, 778, ...
Resampling results across tuning parameters:
 mtry   logLoss       AUC     prAUC  Accuracy     Kappa        F1
    2 0.1177859 0.9994616 0.9682073 0.9895616 0.9791202 0.9893551
    3 0.1121234 0.9988704 0.9549210 0.9791627 0.9583227 0.9788972
    4 0.1108834 0.9982287 0.9427220 0.9745116 0.9490204 0.9742983
    5 0.1105733 0.9976460 0.9305571 0.9721860 0.9443692 0.9720031

 mtry Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value Precision
    2   0.9813953   0.9976744      0.9976744      0.9821650 0.9976744
    3   0.9721987   0.9860994      0.9861445      0.9732118 0.9861445
    4   0.9675476   0.9814482      0.9816448      0.9685025 0.9816448
    5   0.9652220   0.9791226      0.9792085      0.9659862 0.9792085

 mtry    Recall Detection_Rate Balanced_Accuracy
    2 0.9813953      0.4907244         0.9895349
    3 0.9721987      0.4861130         0.9791490
    4 0.9675476      0.4837875         0.9744979
    5 0.9652220      0.4826247         0.9721723
AUC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
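The tuning value selected by cross-validation can also be pulled out of the caret object programmatically, which is handy if it is needed later in a script:

# Extract the mtry value chosen by cross-validation (mtry = 2 here)
RF_fit$bestTune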
plot(RF_fit)
RF_fit$finalModel
Call:
randomForest(x = x, y = y, ntree = 500, mtry = param$mtry, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 0.81%
Confusion matrix:
NotDropout Dropout class.error
NotDropout 426 6 0.013888889
Dropout 1 431 0.002314815
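As a quick check, the OOB error rate is just the number of misclassified out-of-bag cases divided by the size of the SMOTE-balanced training set: (6 + 1) / 864 ≈ 0.0081, i.e., the 0.81% reported above.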
OOB Error Rate
# Extract the OOB error rates for each number of trees
oob_error <- RF_fit$finalModel$err.rate[, "OOB"]

# Create the OOB error plot
plot(oob_error, type = "l",
     xlab = "Number of Trees", ylab = "OOB Error Rate",
     main = "OOB Error Rate vs. Number of Trees",
     col = "blue", lwd = 2)
RF Results
randomForest::varImpPlot(RF_fit$finalModel)
RF Fit Final Model
randomForest::varImpPlot(RF_fit$finalModel, type =1)
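Beyond the plots, the numeric importance scores can be extracted as tables, either from the underlying randomForest object or through caret (an optional extra step, not in the original code):

# Raw importance scores (Mean Decrease Accuracy and Mean Decrease Gini) per predictor
randomForest::importance(RF_fit$finalModel)

# caret's scaled variable-importance summary for the same fitted model
caret::varImp(RF_fit)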
Evaluate Model Performance on the Test Set
# Predict outcomes using the model on the test data
predictions <- predict(RF_fit, newdata = test_data)
# For a caret-trained model, predict() returns the predicted class labels
# (e.g., "Dropout", "NotDropout") by default, not probabilities.

# Create a confusion matrix to assess model performance
confusionMatrix(data = predictions, test_data$Dropout,
                positive = "Dropout", mode = "everything")
# MUST specify positive = "Dropout"; otherwise, the default setting treats
# "NotDropout" as the positive class.
Confusion Matrix and Statistics
Reference
Prediction NotDropout Dropout
NotDropout 107 9
Dropout 1 3
Accuracy : 0.9167
95% CI : (0.8521, 0.9593)
No Information Rate : 0.9
P-Value [Acc > NIR] : 0.33609
Kappa : 0.3421
Mcnemar's Test P-Value : 0.02686
Sensitivity : 0.25000
Specificity : 0.99074
Pos Pred Value : 0.75000
Neg Pred Value : 0.92241
Precision : 0.75000
Recall : 0.25000
F1 : 0.37500
Prevalence : 0.10000
Detection Rate : 0.02500
Detection Prevalence : 0.03333
Balanced Accuracy : 0.62037
'Positive' Class : Dropout
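These metrics follow directly from the confusion matrix above: with 3 true positives, 9 false negatives, 1 false positive, and 107 true negatives, Sensitivity (Recall) = 3/12 = 0.25, Precision = 3/4 = 0.75, F1 = 2 × (0.75 × 0.25) / (0.75 + 0.25) = 0.375, and Balanced Accuracy = (0.25 + 0.99074) / 2 ≈ 0.62.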
Predicted Probabilities
# Generate predicted probabilities for the test data
predicted_probs <- predict(RF_fit, newdata = test_data, type = "prob")[, 2]
# Selects the second column, which contains the predicted probabilities for the
# "Dropout" class (class 1).

# The ROCR package is used to generate performance curves such as the ROC curve.
# install.packages("ROCR")
library(ROCR)

# Create the ROC curve
pred <- prediction(predicted_probs, test_data$Dropout,
                   label.ordering = c("NotDropout", "Dropout"))
# This creates an object that stores the predicted probabilities (predicted_probs)
# and the true labels (test_data$Dropout); it is required for generating performance
# metrics. label.ordering = c("NotDropout", "Dropout") tells R to treat "NotDropout"
# as the negative class (coded as 0) and "Dropout" as the positive class (coded as 1).

perf <- performance(pred, "tpr", "fpr")
# Generates the ROC curve data, plotting the true positive rate (tpr) against the
# false positive rate (fpr).
Plot ROC Curve
# Plot the ROC curve
plot(perf, col = "blue", main = "ROC Curve")
abline(a = 0, b = 1, col = "red", lty = 2)
# Adds a red diagonal line to represent random guessing (where the true positive
# rate equals the false positive rate).
AUC
# Calculate AUC (Area Under the Curve)
auc <- performance(pred, "auc")    # Calculates the AUC
auc_value <- auc@y.values[[1]]     # Extracts the AUC value
print(paste("AUC =", auc_value))   # Prints the AUC value
[1] "AUC = 0.907021604938272"
To compare the Decision Tree (CART) and Random Forest results simultaneously, both ROC curves are plotted on the same axes. Below is the previous code used for CART.
# Install and load the rpart package
# install.packages("rpart")
library(rpart)

# Fit the CART model using the rpart function
set.seed(668)
CART_final <- rpart(formula = Dropout ~ .,
                    data = train_data_smote,
                    method = "class",
                    control = rpart.control(cp = 0.03888889),
                    parms = list(split = "gini"))

# Generate predicted probabilities for the test data
CART_predicted_probs <- predict(CART_final, newdata = test_data, type = "prob")[, 2]
# Selects the second column, which contains the predicted probabilities for the
# "Dropout" class (class 1).

# The ROCR package is used to generate performance curves such as the ROC curve
# (already loaded above).
library(ROCR)

# Create the ROC curve
CART_pred <- prediction(CART_predicted_probs, test_data$Dropout,
                        label.ordering = c("NotDropout", "Dropout"))
# As before, label.ordering treats "NotDropout" as the negative class (0) and
# "Dropout" as the positive class (1).

CART_perf <- performance(CART_pred, "tpr", "fpr")
# Generates the ROC curve data (true positive rate vs. false positive rate).
Decision tree ROC plus Random Forest ROC.
# Plot the ROC curve for Random Forest (first plot, from RF)
plot(perf, col = "blue", main = "ROC Curve")
abline(a = 0, b = 1, col = "red", lty = 2)
# Adds a red diagonal line to represent random guessing (where the true positive
# rate equals the false positive rate).

# Add the ROC curve for the Decision Tree (overlay another curve on the same plot)
plot(CART_perf, col = "purple", add = TRUE)   # add = TRUE overlays the new curve

# Add a legend
legend("bottomright", legend = c("Random Forest", "Decision Tree"),
       col = c("blue", "purple"), lwd = 1)
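For a numeric comparison to accompany the plot, the CART AUC can be computed from CART_pred in the same way the Random Forest AUC was computed above (an optional extra step, not in the original code):

# Calculate AUC for the CART model so it can be compared with the Random Forest AUC
CART_auc <- performance(CART_pred, "auc")
CART_auc_value <- CART_auc@y.values[[1]]
print(paste("CART AUC =", CART_auc_value))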
Acknowledgement
This document was created by the author with assistance from class material provided to the EDUS 668 class by Dr. Chi-Ning Chang. The code and notes within the code blocks were provided by Dr. Chi-Ning Chang.