# Convert Dropout to a Factor
dropout_data$Dropout <- as.factor(dropout_data$Dropout)

# Provide a description of what the values 0 and 1 represent
attr(dropout_data$Dropout, "label") <- "Dropout (1=Dropout; 0=NotDropout)"

# Check the structure of variables in this dataset
str(dropout_data)
# View Levels: In R, when performing logistic regression with a factor
# variable, the first level of the factor is treated as the reference
# category. This means that R will treat "NotDropout" (0) as the reference
# category and will model the log-odds of being a dropout (1) compared to
# not being a dropout (0).
levels(dropout_data$Dropout)
[1] "0" "1"
When calculating performance measures like Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC), R runs into trouble if the factor levels are "0" and "1": caret turns class labels into column names for the predicted probabilities, and bare numbers are not syntactically valid R variable names. To fix this, we rename the levels with clearer labels, "NotDropout" and "Dropout."
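The renaming step itself is not shown above; a minimal sketch, assuming the 0/1 coding and preserving the level order:

# Rename factor levels to syntactically valid R names
# (level "0" becomes "NotDropout", level "1" becomes "Dropout")
levels(dropout_data$Dropout) <- c("NotDropout", "Dropout")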
# Load caret (needed for createDataPartition, train, and confusionMatrix)
library(caret)

# Set a seed for reproducibility
set.seed(668)

# Split the data (80% training / 20% test, stratified on Dropout)
split <- createDataPartition(dropout_data$Dropout, p = 0.8, list = FALSE)

# Create Training and Test Sets
train_data <- dropout_data[split, ]
test_data <- dropout_data[-split, ]

# Check the dimensions of the training and test sets
dim(train_data) # Should be about 80% of the 600 rows (480)
[1] 480 10
dim(test_data) # Should be the remaining 20% (120 rows)
[1] 120 10
# Ensure the Dropout variable is a factor (Double Check!)
str(train_data)
# (Convert it if needed)
# train_data$Dropout <- as.factor(train_data$Dropout)
# test_data$Dropout <- as.factor(test_data$Dropout)
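A programmatic check can replace eyeballing the str() output; a small optional addition using base R only:

# Stop with an error if Dropout is not a factor in either split
stopifnot(is.factor(train_data$Dropout), is.factor(test_data$Dropout))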
Dealing with Class Imbalance using SMOTE
# Check Class Distribution
table(train_data$Dropout)
NotDropout    Dropout 
       432         48 
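The imbalance is substantial: only 48 of 480 training cases (10%) are dropouts, a 9:1 ratio. Proportions make this explicit (an optional check, not part of the original output):

# Class proportions in the training set
prop.table(table(train_data$Dropout)) # ~0.90 NotDropout, ~0.10 Dropout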
# Install and Load themis Package
# install.packages("themis")
library(themis)
Loading required package: recipes
Loading required package: dplyr
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Attaching package: 'recipes'
The following object is masked from 'package:stats':
step
# Apply SMOTE
set.seed(668)
# smotenc() is SMOTE-NC, which handles mixed numeric/categorical predictors;
# over_ratio = 1 oversamples the minority class until the classes are balanced 1:1
train_data_smote <- smotenc(train_data, var = "Dropout", k = 5, over_ratio = 1)

# Check Class Distribution After SMOTE
table(train_data_smote$Dropout)
NotDropout    Dropout 
       432        432 
10-fold cross-validation
# Define the control method for cross-validation with class probabilities
cv10 <- trainControl(
  method = "cv", # Cross-validation
  number = 10, # 10-fold
  classProbs = TRUE, # Enable class probabilities for ROC/AUC
  summaryFunction = twoClassSummary # Use two-class metrics for evaluation
)
# Note: twoClassSummary treats the FIRST factor level ("NotDropout") as the
# event of interest, so "Sens" in the resampling output refers to NotDropout.
Explore Optimal CP (Complexity Parameter) in the CART model.
# CART with 10-fold cross-validation
set.seed(668)
CART_fit <- train(
  Dropout ~ .,
  data = train_data_smote,
  method = "rpart", # CART
  trControl = cv10,
  metric = "ROC"
)
Print CART Results
# Print the results
print(CART_fit)
CART
864 samples
9 predictor
2 classes: 'NotDropout', 'Dropout'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 778, 776, 778, 778, 777, 778, ...
Resampling results across tuning parameters:
  cp         ROC        Sens       Spec     
  0.0462963  0.7933060  0.8008985  0.7596195
  0.1018519  0.7663737  0.7319239  0.7826638
  0.4976852  0.5779598  0.8365751  0.3193446
ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.0462963.
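By default, caret evaluates only three candidate cp values for rpart. A denser search could be requested with train()'s standard tuneLength argument; a sketch reusing the objects above (CART_fit_fine is a name introduced here):

# Optional: evaluate 10 candidate cp values instead of the default 3
set.seed(668)
CART_fit_fine <- train(
  Dropout ~ .,
  data = train_data_smote,
  method = "rpart",
  trControl = cv10,
  metric = "ROC",
  tuneLength = 10 # size of the automatic cp grid
)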
CART Results
Run the final CART model with cp = 0.0462963.
# Install and load the rpart package
# install.packages("rpart")
library(rpart)

# Fit the CART model using the rpart function
set.seed(668)
CART_final <- rpart(
  formula = Dropout ~ .,
  data = train_data_smote,
  method = "class",
  control = rpart.control(cp = 0.0462963),
  parms = list(split = "gini")
)
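An equivalent shortcut: caret already refits the winning model on the full training data and stores it, so the hand-refit above should match it (gini is rpart's default split criterion):

# The tuned rpart object stored by caret
# CART_final <- CART_fit$finalModel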
Plot the Tree
# Install and Load rpart.plot Package
# install.packages("rpart.plot")
library(rpart.plot)

# Plot the Decision Tree
rpart.plot(CART_final)
Display class, probability, and percentage of observations in each node.
# extra = 104: show the class, the per-class probabilities, and the
# percentage of observations in each node
rpart.plot(CART_final, extra = 104)
Variable Importance:
# Extract and print variable importance (from rpart)
importance <- CART_final$variable.importance
print(importance)
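The importance scores are easier to compare visually; a small optional base-R sketch:

# Optional: visualize variable importance as a bar chart
barplot(sort(importance, decreasing = TRUE),
        las = 2, # rotate axis labels so variable names fit
        main = "CART Variable Importance")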
# Predict Outcome Using Model on Test Data
# type = "class" ensures that predict() returns the class labels
# (e.g., "Dropout", "NotDropout") for this rpart model, not probabilities.
predictions <- predict(CART_final, newdata = test_data, type = "class")

# Create Confusion Matrix to Assess Model Performance
# MUST specify positive = "Dropout"; otherwise, the default setting treats
# the first level, "NotDropout", as the positive class.
confusionMatrix(data = predictions, test_data$Dropout,
                positive = "Dropout", mode = "everything")
Confusion Matrix and Statistics

            Reference
Prediction   NotDropout Dropout
  NotDropout         86       7
  Dropout            22       5

               Accuracy : 0.7583
                 95% CI : (0.6717, 0.8318)
    No Information Rate : 0.9
    P-Value [Acc > NIR] : 1.00000

                  Kappa : 0.1369

 Mcnemar's Test P-Value : 0.00933

            Sensitivity : 0.41667
            Specificity : 0.79630
         Pos Pred Value : 0.18519
         Neg Pred Value : 0.92473
              Precision : 0.18519
                 Recall : 0.41667
                     F1 : 0.25641
             Prevalence : 0.10000
         Detection Rate : 0.04167
   Detection Prevalence : 0.22500
      Balanced Accuracy : 0.60648

       'Positive' Class : Dropout
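Storing the result gives programmatic access to individual statistics; a small usage sketch (cm is a name introduced here):

# Save the confusion matrix object and pull out selected metrics
cm <- confusionMatrix(predictions, test_data$Dropout,
                      positive = "Dropout", mode = "everything")
cm$byClass[c("Sensitivity", "Specificity", "Precision", "F1")]
cm$overall["Accuracy"]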
Model Performance
# Generate Predicted Probabilities for the Test Data
# [, 2] selects the second column, which contains the predicted
# probabilities for the "Dropout" class (class 1).
predicted_probs <- predict(CART_final, newdata = test_data, type = "prob")[, 2]

# The ROCR package is used to generate performance curves such as the ROC curve.
# install.packages("ROCR")
library(ROCR)

# Create the ROC Curve
# prediction() creates an object that stores the predicted probabilities
# (predicted_probs) and the true labels (test_data$Dropout); it is required
# for generating performance metrics.
# label.ordering = c("NotDropout", "Dropout") tells ROCR to treat "NotDropout"
# as the negative class (coded as 0) and "Dropout" as the positive class
# (coded as 1).
pred <- prediction(predicted_probs, test_data$Dropout,
                   label.ordering = c("NotDropout", "Dropout"))

# performance() generates the ROC curve, plotting the true positive rate (tpr)
# against the false positive rate (fpr).
perf <- performance(pred, "tpr", "fpr")
ROC Plot
# Plot the ROC Curve
plot(perf, col = "blue", main = "ROC Curve")
# Add a red diagonal line to represent random guessing (where the true
# positive rate equals the false positive rate).
abline(a = 0, b = 1, col = "red", lty = 2)
Calculate the AUC
# Calculate AUC (Area Under the Curve)
auc <- performance(pred, "auc") # Calculates the AUC
auc_value <- auc@y.values[[1]] # Extracts the AUC value
print(paste("AUC =", auc_value)) # Prints the AUC value
[1] "AUC = 0.565972222222222"
Acknowledgement
This document was created by the author with assistance from material provided to the EDUS 668 class by Dr. Chi-Ning Chang. The code and notes within the code blocks were provided by Dr. Chang.