Project 3: Classification And Regression Trees (CART)

Author

Margaret Gatongi

Assignment Details

Create a Quarto document to run CART (You can simply modify our Quarto document).
Use set.seed(668) for reproducibility.
Upload all the files to your Google folder named “CART.”
Share the folder with Nick.

Provide the link to the folder: (0.5 points)

Interpret your tree plot: (1 point)

Summarize your model performance: (2 points)

Import Data

# Import data

dropout_data <- read.csv("project3_data.csv")
View(dropout_data)

# Assign variable labels using attr()

attr(dropout_data$SES, "label") <- "Socioeconomic Status (1-100)"
attr(dropout_data$ParentEdu, "label") <- "Parental Education Level (1-5)"
attr(dropout_data$Attendance, "label") <- "Attendance Rate (percent)"
attr(dropout_data$HomeworkHours, "label") <- "Hours Spent on Homework (per week)"
attr(dropout_data$Motivation, "label") <- "Student Motivation (1-15)"
attr(dropout_data$PeerSupport, "label") <- "Peer Support (0-15)"
attr(dropout_data$TestAnxiety, "label") <- "Test Anxiety (0-15)"
attr(dropout_data$ExtraCurricular, "label") <- "Extracurricular Activities (number)"
attr(dropout_data$MathSelfEfficacy, "label") <- "Math Self-Efficacy (0-15)"
#(dropout_data$FavoriteNumber, "label") <- "Favorite Number (Random Unrelated Variable)"
attr(dropout_data$Dropout, "label") <- "Dropout (1=Dropout; 0=NotDropout)"

# Check the structure of variables in this dataset

str(dropout_data)

'data.frame':   600 obs. of  10 variables:
 $ SES             : num  30.75 91.18 42.72 6.82 66.56 ...
  ..- attr(*, "label")= chr "Socioeconomic Status (1-100)"
 $ ParentEdu       : int  4 4 5 1 2 3 4 2 1 3 ...
  ..- attr(*, "label")= chr "Parental Education Level (1-5)"
 $ Attendance      : num  71.3 89 91 90.3 99 ...
  ..- attr(*, "label")= chr "Attendance Rate (percent)"
 $ HomeworkHours   : num  3.76 28.54 13.03 27.18 25.02 ...
  ..- attr(*, "label")= chr "Hours Spent on Homework (per week)"
 $ Motivation      : num  2.24 6.87 4.57 8.06 1.2 ...
  ..- attr(*, "label")= chr "Student Motivation (1-15)"
 $ PeerSupport     : num  10.765 15 11.69 0.421 5.392 ...
  ..- attr(*, "label")= chr "Peer Support (0-15)"
 $ TestAnxiety     : num  13.11 4.1 3.66 10.92 0 ...
  ..- attr(*, "label")= chr "Test Anxiety (0-15)"
 $ ExtraCurricular : int  3 11 5 7 2 5 11 0 15 1 ...
  ..- attr(*, "label")= chr "Extracurricular Activities (number)"
 $ MathSelfEfficacy: num  0 15 8.08 15 12.42 ...
  ..- attr(*, "label")= chr "Math Self-Efficacy (0-15)"
 $ Dropout         : int  0 0 0 0 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "Dropout (1=Dropout; 0=NotDropout)"

Convert DV to a Factor

# Convert Dropout to a Factor

dropout_data$Dropout <- as.factor(dropout_data$Dropout)

# Provide a description of what the values 0 and 1 represent.

attr(dropout_data$Dropout, "label") <- "Dropout (1=Dropout; 0=NotDropout)"

# Check the structure of variables in this dataset

str(dropout_data)

'data.frame':   600 obs. of  10 variables:
 $ SES             : num  30.75 91.18 42.72 6.82 66.56 ...
  ..- attr(*, "label")= chr "Socioeconomic Status (1-100)"
 $ ParentEdu       : int  4 4 5 1 2 3 4 2 1 3 ...
  ..- attr(*, "label")= chr "Parental Education Level (1-5)"
 $ Attendance      : num  71.3 89 91 90.3 99 ...
  ..- attr(*, "label")= chr "Attendance Rate (percent)"
 $ HomeworkHours   : num  3.76 28.54 13.03 27.18 25.02 ...
  ..- attr(*, "label")= chr "Hours Spent on Homework (per week)"
 $ Motivation      : num  2.24 6.87 4.57 8.06 1.2 ...
  ..- attr(*, "label")= chr "Student Motivation (1-15)"
 $ PeerSupport     : num  10.765 15 11.69 0.421 5.392 ...
  ..- attr(*, "label")= chr "Peer Support (0-15)"
 $ TestAnxiety     : num  13.11 4.1 3.66 10.92 0 ...
  ..- attr(*, "label")= chr "Test Anxiety (0-15)"
 $ ExtraCurricular : int  3 11 5 7 2 5 11 0 15 1 ...
  ..- attr(*, "label")= chr "Extracurricular Activities (number)"
 $ MathSelfEfficacy: num  0 15 8.08 15 12.42 ...
  ..- attr(*, "label")= chr "Math Self-Efficacy (0-15)"
 $ Dropout         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
  ..- attr(*, "label")= chr "Dropout (1=Dropout; 0=NotDropout)"

# View Levels: In R, when performing logistic regression with a factor variable, the first level of the factor is treated as the reference category. This means that R will treat "NotDropout" (0) as the reference category and will model the log-odds of being a dropout (1) compared to not being a dropout (0).

levels(dropout_data$Dropout)

[1] "0" "1"

When calculating performance measures like Receiver Operating Characteristic (ROC) and Area Under the Curve (AUC), R has trouble using numbers like 0 and 1 as category labels, even if they are already factors, because they don’t work well as variable names in its system. To fix this, we can rename them with clearer labels like “NotDropout” and “Dropout.”

levels(dropout_data$Dropout) <- c("NotDropout", "Dropout")
levels(dropout_data$Dropout)

[1] "NotDropout" "Dropout"

Split the Data into Training and Test sets

# Load caret library

library(caret)

Loading required package: ggplot2

Loading required package: lattice

# Set a seed for reproducibility

set.seed(668)

# Split the data

split <- createDataPartition(dropout_data$Dropout, p = 0.8, list = FALSE)

# Create Training and Test Sets

train_data <- dropout_data[split, ]
test_data <- dropout_data[-split, ]

# Check the dimensions of the training and test sets

dim(train_data)  # Should be around 800 rows

[1] 480  10

dim(test_data)   # Should be around 200 rows

[1] 120  10

# Ensure the Dropout variable is a factor (Double Check!)

str(train_data)

'data.frame':   480 obs. of  10 variables:
 $ SES             : num  30.7 91.2 42.7 47.8 46.9 ...
 $ ParentEdu       : int  4 4 5 4 2 1 1 3 5 1 ...
 $ Attendance      : num  71.3 89 91 78.7 100 ...
 $ HomeworkHours   : num  3.76 28.54 13.03 27.53 24.43 ...
 $ Motivation      : num  2.24 6.87 4.57 1.65 1 ...
 $ PeerSupport     : num  10.77 15 11.69 4.84 6.46 ...
 $ TestAnxiety     : num  13.11 4.1 3.66 2.35 15 ...
 $ ExtraCurricular : int  3 11 5 11 0 15 6 7 6 7 ...
 $ MathSelfEfficacy: num  0 15 8.08 0 9.69 ...
 $ Dropout         : Factor w/ 2 levels "NotDropout","Dropout": 1 1 1 1 1 1 1 1 1 2 ...

str(test_data)

'data.frame':   120 obs. of  10 variables:
 $ SES             : num  6.82 66.56 31.38 33.69 71.73 ...
 $ ParentEdu       : int  1 2 3 3 4 1 1 2 4 3 ...
 $ Attendance      : num  90.3 99 51.1 86.2 52.2 ...
 $ HomeworkHours   : num  27.2 25 30.5 46.4 17.2 ...
 $ Motivation      : num  8.06 1.2 4.74 3.51 13.41 ...
 $ PeerSupport     : num  0.421 5.392 13.828 13.306 11.548 ...
 $ TestAnxiety     : num  10.92 0 5.43 13.84 2.19 ...
 $ ExtraCurricular : int  7 2 5 1 7 0 0 6 3 5 ...
 $ MathSelfEfficacy: num  15 12.42 15 11.59 4.76 ...
 $ Dropout         : Factor w/ 2 levels "NotDropout","Dropout": 1 1 1 1 1 1 2 1 1 1 ...

# (Convert it if needed)
# train_data$Dropout <- as.factor(train_data$Dropout)
# test_data$Dropout <- as.factor(test_data$Dropout)

Dealing with Class Imbalance using SMOTE

# Check Class Distribution

table(train_data$Dropout)


NotDropout    Dropout 
       432         48

# Install and Load themis Package

#install.packages("themis")
library(themis)

Loading required package: recipes

Loading required package: dplyr


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'recipes'

The following object is masked from 'package:stats':

    step

# Apply SMOTE

set.seed(668)

train_data_smote <- smotenc(train_data, var = "Dropout", k = 5, over_ratio = 1)

# Check Class Distribution After SMOTE

table(train_data_smote$Dropout)


NotDropout    Dropout 
       432        432

10-fold cross-validation

# Define the control method for cross-validation with class probabilities

cv10 <- trainControl(
  method = "cv", # Cross-validation
  number = 10,   # 10-fold
  classProbs = TRUE, # Enable class probabilities for ROC/AUC
  summaryFunction = twoClassSummary  # Use two-class metrics for evaluation 
)

Explore Optimal CP (Complexity Parameter) in the CART model.

# CART with 10-fold cross-validation

set.seed(668)

CART_fit <- train(
  Dropout ~ ., 
  data = train_data_smote,
  method = "rpart", # CART
  trControl = cv10,
  metric = "ROC"
)

Print CART Results

# Print the results

print(CART_fit)

CART 

864 samples
  9 predictor
  2 classes: 'NotDropout', 'Dropout' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 778, 776, 778, 778, 777, 778, ... 
Resampling results across tuning parameters:

  cp         ROC        Sens       Spec     
  0.0462963  0.7933060  0.8008985  0.7596195
  0.1018519  0.7663737  0.7319239  0.7826638
  0.4976852  0.5779598  0.8365751  0.3193446

ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.0462963.

CART Results

Run the final CART model with cp = 0.0462963.

# Install and load the rpart package

#install.packages("rpart")
library(rpart)

# Fit the CART model using the rpart function

set.seed(668)

CART_final <- rpart(formula = Dropout ~ ., 
                data = train_data_smote,
                method = "class", 
                control = rpart.control(cp = 0.0462963),
                parms = list(split = "gini"))

Plot the Tree

# Install and Load rpart.plot Package

#install.packages("rpart.plot")
library(rpart.plot)

# Plot the Decision Tree

rpart.plot(CART_final)

Display class, probability, and percentage of observations in each node.

rpart.plot(CART_final, extra = 104)

Variable Importance:

# Extract and print variable importance (from rpart)

importance <- CART_final$variable.importance

print(importance)

     PeerSupport              SES       Attendance    HomeworkHours 
      109.803533        59.278770        17.846855        10.889607 
 ExtraCurricular       Motivation MathSelfEfficacy 
       10.561084         8.469694         0.990941

Evaluate Model Performance on the Test Set

# Predict Outcome Using Model on Test Data

predictions <- predict(CART_final, newdata=test_data, type = "class") # We add type = "class" for this rpart generated model. In the predict() function, this ensures that the predictions are the class labels (e.g., "Dropout", "NotDropout"), not probabilities.

# Create Confusion Matrix to Assess Model Performance

confusionMatrix(data=predictions, test_data$Dropout, positive = "Dropout", mode = "everything") # MUST specify positive = "Dropout"; otherwise, the default setting treats "NotDropout" as the positive class.

Confusion Matrix and Statistics

            Reference
Prediction   NotDropout Dropout
  NotDropout         86       7
  Dropout            22       5
                                          
               Accuracy : 0.7583          
                 95% CI : (0.6717, 0.8318)
    No Information Rate : 0.9             
    P-Value [Acc > NIR] : 1.00000         
                                          
                  Kappa : 0.1369          
                                          
 Mcnemar's Test P-Value : 0.00933         
                                          
            Sensitivity : 0.41667         
            Specificity : 0.79630         
         Pos Pred Value : 0.18519         
         Neg Pred Value : 0.92473         
              Precision : 0.18519         
                 Recall : 0.41667         
                     F1 : 0.25641         
             Prevalence : 0.10000         
         Detection Rate : 0.04167         
   Detection Prevalence : 0.22500         
      Balanced Accuracy : 0.60648         
                                          
       'Positive' Class : Dropout

Model Performance

# Generate Predicted Probabilities for the Test Data

predicted_probs <- predict(CART_final, newdata = test_data, type = "prob")[, 2] # Selects the second column, which contains the predicted probabilities for the "Dropout" class (class 1).

# The ROCR package is used to generate performance curves such as the ROC curve.

#install.packages("ROCR")
library(ROCR)

# Create the ROC Curve

pred <- prediction(predicted_probs, test_data$Dropout, label.ordering = c("NotDropout", "Dropout")) # This creates an object that stores the predicted probabilities (predicted_probs) and the true labels (test_data$Dropout). It’s required for generating performance metrics.label.ordering = c("NotDropout", "Dropout") tells R to treat "NotDropout" as the negative class (coded as 0) and "Dropout" as the positive class (coded as 1).

perf <- performance(pred, "tpr", "fpr") # This function generates the ROC curve, plotting the true positive rate (tpr) against the false positive rate (fpr).

ROC Plot

# Plot the ROC Curve

plot(perf, col = "blue", main = "ROC Curve")
abline(a = 0, b = 1, col = "red", lty = 2) # Adds a red diagonal line to represent random guessing (where the true positive rate equals the false positive rate).

Calculate AUC Curve

# Calculate AUC (Area Under the Curve)

auc <- performance(pred, "auc") # Calculates the AUC
auc_value <- auc@y.values[[1]] # Extracts the AUC value.
print(paste("AUC =", auc_value)) # Prints the AUC value

[1] "AUC = 0.565972222222222"

Acknowledgement

This document was created by the author with assistance from class material provided to the EDUS 668 class by Dr. Chi-Ning Chang. The code and notes within the code blocks were provided by Dr. Chi-Ning Chang.