Capstone Retention Analysis

Load and Clean Data

setwd("C:/Users/avaar/OneDrive/Desktop/CAPSTONE")
df <- read_csv("final_cleaned_master_dataset_3.csv", col_types = cols(study_id = col_character()))
courses_df <- read_csv("merged_courses_data (1).csv")

## Rows: 40850 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Collection_Term_all, Course_Subject_all, Course_Number_all, Section...
## dbl (5): StudyID, Collection_Year_combined, Course_Hours_all, CRN_all, Cours...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df <- df %>%
  mutate(
    cumulative_gpa_fall = as.numeric(cumulative_gpa_fall),
    gpa_spring = as.numeric(gpa_spring),
    retained_in_fall = str_to_title(str_trim(retained_in_fall))
  ) %>%
  filter(retained_in_fall %in% c("Yes", "No"))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `cumulative_gpa_fall = as.numeric(cumulative_gpa_fall)`.
## Caused by warning:
## ! NAs introduced by coercion

df$retained_in_fall <- factor(df$retained_in_fall)

Prepare Data for Modeling

df_model <- df %>%
  filter(!is.na(gpa_spring),
         !is.na(expected_family_contribution),
         !is.na(cost_of_attendance),
         !is.na(financial_aid_disbursement_amount),
         !is.na(unmet_need_amount)) %>%
  mutate(
    retained_flag = ifelse(retained_in_fall == "Yes", 1, 0),
    expected_family_contribution = as.numeric(expected_family_contribution),
    cost_of_attendance = as.numeric(cost_of_attendance),
    financial_aid_disbursement_amount = as.numeric(financial_aid_disbursement_amount),
    unmet_need_amount = as.numeric(unmet_need_amount)
  ) %>%
  select(retained_flag, gpa_spring,
         expected_family_contribution, cost_of_attendance,
         financial_aid_disbursement_amount, unmet_need_amount)

Train/Test Split

set.seed(123)
train_proportion <- 0.7
train_index <- sample(seq_len(nrow(df_model)), size = floor(train_proportion * nrow(df_model)))
train_data <- df_model[train_index, ]
test_data <- df_model[-train_index, ]

cat("Training size:", nrow(train_data), "\n")

## Training size: 2683

cat("Testing size:", nrow(test_data), "\n")

## Testing size: 1150

cat("Training class distribution:\n")

## Training class distribution:

print(prop.table(table(train_data$retained_flag)))

## 
##         0         1 
## 0.1714499 0.8285501

cat("Testing class distribution:\n")

## Testing class distribution:

print(prop.table(table(test_data$retained_flag)))

## 
##         0         1 
## 0.1869565 0.8130435

# Check original balance
table(train_data$retained_flag)

## 
##    0    1 
##  460 2223

# Separate classes
majority <- train_data %>% filter(retained_flag == 1)
minority <- train_data %>% filter(retained_flag == 0)

# Upsample the minority class (repeat rows to match majority)
set.seed(42)
minority_upsampled <- minority %>%
  slice_sample(n = nrow(majority), replace = TRUE)

# Combine and shuffle
train_balanced <- bind_rows(majority, minority_upsampled) %>%
  slice_sample(n = nrow(.), replace = FALSE)

# Check new class balance
table(train_balanced$retained_flag)

## 
##    0    1 
## 2223 2223

Model Evaluation

# Train logistic regression on balanced training set
log_model_recall <- glm(retained_flag ~ ., data = train_balanced, family = "binomial")

# Predict probabilities on test set
log_probs <- predict(log_model_recall, newdata = test_data, type = "response")

# Apply lower threshold to favor recall
log_pred <- ifelse(log_probs > 0.35, 1, 0)
log_pred <- as.factor(log_pred)
actual <- test_data$retained_flag

# Confusion matrix
conf_matrix <- table(Predicted = log_pred, Actual = actual)
print(conf_matrix)

##          Actual
## Predicted   0   1
##         0  96 146
##         1 119 789

# Accuracy
accuracy <- mean(log_pred == actual)

# Metrics for Class 0 (Not Retained)
tp <- sum(log_pred == "0" & actual == "0")  # True positives
fp <- sum(log_pred == "0" & actual == "1")  # False positives
fn <- sum(log_pred == "1" & actual == "0")  # False negatives

# Calculate metrics
precision <- ifelse((tp + fp) > 0, tp / (tp + fp), NA)
recall <- ifelse((tp + fn) > 0, tp / (tp + fn), NA)
f1 <- ifelse(!is.na(precision) & !is.na(recall) & (precision + recall) > 0,
             2 * (precision * recall) / (precision + recall), NA)

# Print results
cat("🔹 Prioritizing Recall for Class 0 (Not Retained)\n")

## 🔹 Prioritizing Recall for Class 0 (Not Retained)

cat("Test Accuracy:", round(accuracy, 3), "\n")

## Test Accuracy: 0.77

cat("Precision (Class 0):", round(precision, 2), "\n")

## Precision (Class 0): 0.4

cat("Recall (Class 0):", round(recall, 2), "\n")

## Recall (Class 0): 0.45

cat("F1 Score (Class 0):", round(f1, 2), "\n")

## F1 Score (Class 0): 0.42

K 5-Fold Cross Validation

# Number of folds
k <- 5
set.seed(123)  

# Shuffle and split into k folds
n <- nrow(train_balanced)
folds <- sample(rep(1:k, length.out = n))

# Initialize vectors to store metrics
accuracies <- c()
precisions <- c()
recalls <- c()
f1_scores <- c()

# Run k-fold CV
for (i in 1:k) {
  test_idx <- which(folds == i)
  test_fold <- train_balanced[test_idx, ]
  train_fold <- train_balanced[-test_idx, ]
 # Fit model
  model <- glm(retained_flag ~ ., data = train_fold, family = "binomial")
  
  # Predict on test fold
  probs <- predict(model, newdata = test_fold, type = "response")
  pred <- ifelse(probs > 0.35, 1, 0)
  pred <- as.factor(pred)
  actual <- test_fold$retained_flag
  
  # Metrics for class 0
  tp <- sum(pred == "0" & actual == "0")
  fp <- sum(pred == "0" & actual == "1")
  fn <- sum(pred == "1" & actual == "0")
 precision <- ifelse(tp + fp > 0, tp / (tp + fp), NA)
  recall <- ifelse(tp + fn > 0, tp / (tp + fn), NA)
  f1 <- ifelse(!is.na(precision) & !is.na(recall) & (precision + recall) > 0,
               2 * (precision * recall) / (precision + recall), NA)
  acc <- mean(pred == actual)
  
  # Store results
  accuracies[i] <- acc
  precisions[i] <- precision
  recalls[i] <- recall
  f1_scores[i] <- f1
}

# Average results across folds
cat("🔁 5-Fold Cross-Validation Results\n")

## 🔁 5-Fold Cross-Validation Results

cat("Avg Accuracy:", round(mean(accuracies, na.rm = TRUE), 3), "\n")

## Avg Accuracy: 0.673

cat("Avg Precision (Class 0):", round(mean(precisions, na.rm = TRUE), 2), "\n")

## Avg Precision (Class 0): 0.78

cat("Avg Recall (Class 0):", round(mean(recalls, na.rm = TRUE), 2), "\n")

## Avg Recall (Class 0): 0.48

cat("Avg F1 Score (Class 0):", round(mean(f1_scores, na.rm = TRUE), 2), "\n")

## Avg F1 Score (Class 0): 0.59

Question 1: Academic and financial predictors model for retention

#clean Unmet: they are shown as data errors but could be scholarship money
unique(df$unmet_need_amount[df$unmet_need_amount < -100000])

##   [1]        NA  -9995877  -9984197  -9975697   -106606  -9969235  -9981690
##   [8]  -9985675   -114315  -9987397   -157132 -10000253   -112358  -9998197
##  [15]  -9967735  -9970235  -9968235   -130599  -9967235   -126310  -9982378
##  [22]  -9978197   -124377 -10002459  -9983185  -9985693   -100901 -10000197
##  [29]  -9981833   -141254  -9974235   -101249  -9972722  -9973429  -9975735
##  [36]  -9981506   -115239  -9981606 -10000310  -9981044  -9993697  -9972135
##  [43]  -9966735  -9978735  -9976197  -9992836  -9996197  -9977797  -9980197
##  [50]  -9998281  -9976697  -9987697  -9998503  -9970688  -9998423  -9981235
##  [57] -10000352  -9996288  -9974485  -9976235  -9975110  -9987781  -9973235
##  [64]   -133401  -9993725  -9971235  -9988812  -9977197  -9983785   -193561
##  [71]  -9974470  -9995763   -101700   -114779  -9962290   -182483  -9977235
##  [78]   -111055   -117358  -9997625  -9982697   -110491   -180355  -9979197
##  [85]  -9974735  -9980785  -9980749  -9978397  -9975435  -9981197  -9994565
##  [92]  -9990881  -9961790  -9982237  -9971406  -9977735  -9987470  -9998868
##  [99]  -9979935  -9973735  -9985785  -9984295  -9994266  -9973917  -9967285
## [106]  -9985310   -325889   -115904 -10005897  -9968735  -9971606  -9975797
## [113]  -9971735 -10005365  -9983612

#Set extreme negative unmet need to NA (threshold based on institutional data logic)
df$unmet_need_amount <- ifelse(df$unmet_need_amount < -100000, NA, df$unmet_need_amount)

# Filter data set with cleaned unmet need and known retention
df_model <- df %>%
  filter(retained_in_fall %in% c("Yes", "No")) %>%
  mutate(
    gpa_spring = as.numeric(gpa_spring),
    expected_family_contribution = as.numeric(expected_family_contribution),
    cost_of_attendance = as.numeric(cost_of_attendance),
    financial_aid_disbursement_amount = as.numeric(financial_aid_disbursement_amount),
    unmet_need_amount = as.numeric(unmet_need_amount)
  ) %>%
  filter(!is.na(gpa_spring),
         !is.na(expected_family_contribution),
         !is.na(cost_of_attendance),
         !is.na(financial_aid_disbursement_amount),
         !is.na(unmet_need_amount))

# Convert outcome to factor
df_model$retained_in_fall <- factor(df_model$retained_in_fall)

# Fit logistic regression model
model_acad_fin <- glm(retained_in_fall ~ gpa_spring +
                        expected_family_contribution + cost_of_attendance +
                        financial_aid_disbursement_amount + unmet_need_amount,
                      data = df_model,
                      family = "binomial")
summary(model_acad_fin)

## 
## Call:
## glm(formula = retained_in_fall ~ gpa_spring + expected_family_contribution + 
##     cost_of_attendance + financial_aid_disbursement_amount + 
##     unmet_need_amount, family = "binomial", data = df_model)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6365   0.3418   0.4524   0.6313   1.2889  
## 
## Coefficients: (1 not defined because of singularities)
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        1.151e+00  2.975e-01   3.868  0.00011 ***
## gpa_spring                         6.711e-01  3.770e-02  17.801  < 2e-16 ***
## expected_family_contribution       6.474e-06  3.203e-06   2.021  0.04326 *  
## cost_of_attendance                -3.863e-05  9.214e-06  -4.193 2.75e-05 ***
## financial_aid_disbursement_amount  1.424e-05  5.100e-06   2.791  0.00525 ** 
## unmet_need_amount                         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3497.3  on 3714  degrees of freedom
## Residual deviance: 3116.8  on 3710  degrees of freedom
## AIC: 3126.8
## 
## Number of Fisher Scoring iterations: 5

# Calculate odds ratios for interpretation
exp(cbind(Odds_Ratio = coef(model_acad_fin), confint(model_acad_fin)))

## Waiting for profiling to be done...

##                                   Odds_Ratio     2.5 %    97.5 %
## (Intercept)                        3.1606443 1.7707475 5.6865943
## gpa_spring                         1.9563932 1.8180299 2.1076611
## expected_family_contribution       1.0000065 1.0000004 1.0000129
## cost_of_attendance                 0.9999614 0.9999432 0.9999794
## financial_aid_disbursement_amount  1.0000142 1.0000042 1.0000242
## unmet_need_amount                         NA        NA        NA

# Fit a decision tree model
tree_model <- rpart(retained_in_fall ~ gpa_spring +
                      expected_family_contribution + cost_of_attendance +
                      financial_aid_disbursement_amount + unmet_need_amount,
                    data = df_model,
                    method = "class")
# View model summary
summary(tree_model)

## Call:
## rpart(formula = retained_in_fall ~ gpa_spring + expected_family_contribution + 
##     cost_of_attendance + financial_aid_disbursement_amount + 
##     unmet_need_amount, data = df_model, method = "class")
##   n= 3715 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.04122939      0 1.0000000 1.0000000 0.03507237
## 2 0.01000000      2 0.9175412 0.9235382 0.03398568
## 
## Variable importance
##                        gpa_spring                 unmet_need_amount 
##                                98                                 1 
## financial_aid_disbursement_amount 
##                                 1 
## 
## Node number 1: 3715 observations,    complexity param=0.04122939
##   predicted class=Yes  expected loss=0.1795424  P(node) =1
##     class counts:   667  3048
##    probabilities: 0.180 0.820 
##   left son=2 (652 obs) right son=3 (3063 obs)
##   Primary splits:
##       gpa_spring                        < 0.9215  to the left,  improve=124.509900, (0 missing)
##       unmet_need_amount                 < 20105.5 to the right, improve=  6.348405, (0 missing)
##       cost_of_attendance                < 32468.5 to the right, improve=  5.187697, (0 missing)
##       expected_family_contribution      < 2091.5  to the left,  improve=  3.794012, (0 missing)
##       financial_aid_disbursement_amount < 40365.5 to the left,  improve=  1.622693, (0 missing)
##   Surrogate splits:
##       unmet_need_amount < 33484.5 to the right, agree=0.825, adj=0.002, (0 split)
## 
## Node number 2: 652 observations,    complexity param=0.04122939
##   predicted class=Yes  expected loss=0.4601227  P(node) =0.1755047
##     class counts:   300   352
##    probabilities: 0.460 0.540 
##   left son=4 (341 obs) right son=5 (311 obs)
##   Primary splits:
##       gpa_spring                        < 0.2425  to the left,  improve=20.7686200, (0 missing)
##       unmet_need_amount                 < 20206   to the right, improve= 6.3640930, (0 missing)
##       financial_aid_disbursement_amount < 17346.5 to the left,  improve= 3.2515290, (0 missing)
##       cost_of_attendance                < 39542.5 to the right, improve= 1.0526350, (0 missing)
##       expected_family_contribution      < 63531   to the right, improve= 0.9141988, (0 missing)
##   Surrogate splits:
##       financial_aid_disbursement_amount < 28246.5 to the left,  agree=0.541, adj=0.039, (0 split)
##       cost_of_attendance                < 39570.5 to the left,  agree=0.538, adj=0.032, (0 split)
##       unmet_need_amount                 < -6625   to the right, agree=0.538, adj=0.032, (0 split)
##       expected_family_contribution      < 30019.5 to the left,  agree=0.537, adj=0.029, (0 split)
## 
## Node number 3: 3063 observations
##   predicted class=Yes  expected loss=0.1198172  P(node) =0.8244953
##     class counts:   367  2696
##    probabilities: 0.120 0.880 
## 
## Node number 4: 341 observations
##   predicted class=No   expected loss=0.4193548  P(node) =0.09179004
##     class counts:   198   143
##    probabilities: 0.581 0.419 
## 
## Node number 5: 311 observations
##   predicted class=Yes  expected loss=0.3279743  P(node) =0.08371467
##     class counts:   102   209
##    probabilities: 0.328 0.672

# Visualize decision tree without fallen leaves
rpart.plot(tree_model,
           type = 2,       # shows split variable and threshold
           extra = 104,    # shows class, probability, and percentage
           fallen.leaves = FALSE,  # disables fallen leaves for standard layout
           main = "Decision Tree for Student Retention (All Predictors)")

#Tuning the Decision Tree
# Create a sequence of cp values to test
cp_values <- seq(0.001, 0.05, by = 0.005)
results <- data.frame(cp = numeric(), Accuracy = numeric())

for (cp in cp_values) {
  # Train model with this 0.001
  model <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance + expected_family_contribution +
                   financial_aid_disbursement_amount + unmet_need_amount,
                 data = df_model,
                 method = "class",
                 control = rpart.control(cp =0.001))
  
  # Predict on training data 
  pred <- predict(model, type = "class")
  acc <- mean(pred == df_model$retained_in_fall) 
 
   # Store results
  results <- rbind(results, data.frame(cp =0.001, Accuracy = acc))
}
# View results
print(results)

##       cp  Accuracy
## 1  0.001 0.8602961
## 2  0.001 0.8602961
## 3  0.001 0.8602961
## 4  0.001 0.8602961
## 5  0.001 0.8602961
## 6  0.001 0.8602961
## 7  0.001 0.8602961
## 8  0.001 0.8602961
## 9  0.001 0.8602961
## 10 0.001 0.8602961

for (cp in cp_values) {
  # Train model with this 0.05
  model <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance + expected_family_contribution +
                   financial_aid_disbursement_amount + unmet_need_amount,
                 data = df_model,
                 method = "class",
                 control = rpart.control(cp =0.05))
  
  # Predict on training data 
  pred <- predict(model, type = "class")
  acc <- mean(pred == df_model$retained_in_fall)  
  
  # Store results
  results <- rbind(results, data.frame(cp =0.05, Accuracy = acc))
}
# View results
print(results)

##       cp  Accuracy
## 1  0.001 0.8602961
## 2  0.001 0.8602961
## 3  0.001 0.8602961
## 4  0.001 0.8602961
## 5  0.001 0.8602961
## 6  0.001 0.8602961
## 7  0.001 0.8602961
## 8  0.001 0.8602961
## 9  0.001 0.8602961
## 10 0.001 0.8602961
## 11 0.050 0.8204576
## 12 0.050 0.8204576
## 13 0.050 0.8204576
## 14 0.050 0.8204576
## 15 0.050 0.8204576
## 16 0.050 0.8204576
## 17 0.050 0.8204576
## 18 0.050 0.8204576
## 19 0.050 0.8204576
## 20 0.050 0.8204576

for (cp in cp_values) {
  # Train model with this 0.005
  model <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance + expected_family_contribution +
                   financial_aid_disbursement_amount + unmet_need_amount,
                 data = df_model,
                 method = "class",
                 control = rpart.control(cp =0.005))
  
  # Predict on training data 
  pred <- predict(model, type = "class")
  acc <- mean(pred == df_model$retained_in_fall) 
  
  # Store results
  results <- rbind(results, data.frame(cp =0.005, Accuracy = acc))
}
# View results
print(results)

##       cp  Accuracy
## 1  0.001 0.8602961
## 2  0.001 0.8602961
## 3  0.001 0.8602961
## 4  0.001 0.8602961
## 5  0.001 0.8602961
## 6  0.001 0.8602961
## 7  0.001 0.8602961
## 8  0.001 0.8602961
## 9  0.001 0.8602961
## 10 0.001 0.8602961
## 11 0.050 0.8204576
## 12 0.050 0.8204576
## 13 0.050 0.8204576
## 14 0.050 0.8204576
## 15 0.050 0.8204576
## 16 0.050 0.8204576
## 17 0.050 0.8204576
## 18 0.050 0.8204576
## 19 0.050 0.8204576
## 20 0.050 0.8204576
## 21 0.005 0.8422611
## 22 0.005 0.8422611
## 23 0.005 0.8422611
## 24 0.005 0.8422611
## 25 0.005 0.8422611
## 26 0.005 0.8422611
## 27 0.005 0.8422611
## 28 0.005 0.8422611
## 29 0.005 0.8422611
## 30 0.005 0.8422611

# Best cp value
best_cp <- results$cp[which.max(results$Accuracy)]
cat("Best cp:", best_cp, "\n")

## Best cp: 0.001

# Retrain using best cp
final_model <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance + expected_family_contribution +
                       financial_aid_disbursement_amount + unmet_need_amount,
                     data = df_model,
                     method = "class",
                     control = rpart.control(cp = best_cp))

# Visualize
rpart.plot(final_model)

## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Print complexity parameter table
printcp(final_model)

## 
## Classification tree:
## rpart(formula = retained_in_fall ~ gpa_spring + cost_of_attendance + 
##     expected_family_contribution + financial_aid_disbursement_amount + 
##     unmet_need_amount, data = df_model, method = "class", control = rpart.control(cp = best_cp))
## 
## Variables actually used in tree construction:
## [1] cost_of_attendance                expected_family_contribution     
## [3] financial_aid_disbursement_amount gpa_spring                       
## [5] unmet_need_amount                
## 
## Root node error: 667/3715 = 0.17954
## 
## n= 3715 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.0412294      0   1.00000 1.00000 0.035072
## 2  0.0059970      2   0.91754 0.93103 0.034096
## 3  0.0054973      4   0.90555 0.98201 0.034824
## 4  0.0052474      7   0.88906 0.98501 0.034866
## 5  0.0033733      9   0.87856 0.99400 0.034990
## 6  0.0029985     23   0.81709 0.97751 0.034761
## 7  0.0022489     25   0.81109 0.97751 0.034761
## 8  0.0014993     31   0.79610 0.97601 0.034740
## 9  0.0010709     32   0.79460 0.98801 0.034907
## 10 0.0010000     43   0.77811 1.01799 0.035317

#chose cp = 0.005997 using the 1 Standard Error Rule (1-SE Rule) 

# Optimal cp value based on 1-SE Rule
optimal_cp <- 0.005997

# Prune the tree
pruned_model <- prune(final_model, cp = optimal_cp)

# Plot
rpart.plot(pruned_model, type = 2, extra = 104, main = "Pruned Classification Tree")

Question 2: Do CASA program students have higher retention rates?

# Recode CASA variable for consistency
df$casa_student <- ifelse(df$casa_student %in% c("1", "1.000"), 1,
                          ifelse(df$casa_student %in% c("0", "0.000", "0.00%"), 0, NA))
df$casa_student <- factor(df$casa_student)
table(df$casa_student, useNA = "ifany")

## 
##    0    1 
## 4040  426

df_clean <- df %>%
filter(retained_in_fall %in% c("Yes", "No"))
df_clean$retained_in_fall <- factor(df_clean$retained_in_fall)

# Chi-square test
casa_table <- table(df_clean$casa_student, df_clean$retained_in_fall)
print(casa_table)

##    
##       No  Yes
##   0 1092 2948
##   1  154  272

chi_casa <- chisq.test(casa_table)
print(chi_casa)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  casa_table
## X-squared = 15.486, df = 1, p-value = 8.313e-05

#There is a  statistically significant association between CASA program participation and retention. 
#Retention rates differ between CASA and non-CASA students.

#logistic regression model
model_casa <- glm(retained_in_fall ~ casa_student,
                  data = df_clean,
                  family = "binomial")
summary(model_casa)

## 
## Call:
## glm(formula = retained_in_fall ~ casa_student, family = "binomial", 
##     data = df_clean)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6176  -1.4265   0.7939   0.7939   0.9473  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.99312    0.03543  28.034  < 2e-16 ***
## casa_student1 -0.42427    0.10689  -3.969 7.21e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5287.8  on 4465  degrees of freedom
## Residual deviance: 5272.6  on 4464  degrees of freedom
## AIC: 5276.6
## 
## Number of Fisher Scoring iterations: 4

# Calculate odds ratios for interpretation
model_casa_tidy <- tidy(model_casa, conf.int = TRUE, exponentiate = TRUE)
print(model_casa_tidy)

## # A tibble: 2 × 7
##   term          estimate std.error statistic   p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)      2.70     0.0354     28.0  6.26e-173    2.52      2.89 
## 2 casa_student1    0.654    0.107      -3.97 7.21e-  5    0.531     0.808

Question 3: Is high school GPA a strong predictor of college GPA and retention among Financial Cliff students?

# Subset to Financial Cliff students only
financial_cliff_df <- df %>%
  filter(unmet_need_amount >= 15000)

# Linear regression: college GPA ~ high school GPA
model_hsgpa_collegegpa <- lm(gpa_spring ~ high_school_gpa, data = financial_cliff_df)
summary(model_hsgpa_collegegpa)

## 
## Call:
## lm(formula = gpa_spring ~ high_school_gpa, data = financial_cliff_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7669 -0.8928  0.1472  0.9174  2.2689 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.7669     0.1301  21.267  < 2e-16 ***
## high_school_gpaB  -0.3741     0.1503  -2.489   0.0131 *  
## high_school_gpaC  -1.0358     0.1468  -7.053 4.69e-12 ***
## high_school_gpaD  -1.7527     0.3602  -4.865 1.45e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.164 on 619 degrees of freedom
##   (442 observations deleted due to missingness)
## Multiple R-squared:  0.116,  Adjusted R-squared:  0.1117 
## F-statistic: 27.06 on 3 and 619 DF,  p-value: < 2.2e-16

#Students with higher high school GPAs tend to have higher spring college GPAs. Compared to students with an A high school GPA,
#students with a B average ~0.37 GPA points lower, C students ~1.04 points lower, and D students ~1.75 points lower in spring GPA. 
#High school GPA explains about 12% of variation in spring college GPA outcomes.

# Logistic regression: retention ~ high school GPA
model_hsgpa_retention <- glm(retained_in_fall ~ high_school_gpa,
                             data = financial_cliff_df,
                             family = "binomial")
summary(model_hsgpa_retention)

## 
## Call:
## glm(formula = retained_in_fall ~ high_school_gpa, family = "binomial", 
##     data = financial_cliff_df)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.242  -1.094  -1.094   1.162   1.264  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)        0.1515     0.1839   0.824   0.4098  
## high_school_gpaB  -0.1152     0.2102  -0.548   0.5838  
## high_school_gpaC  -0.3518     0.2035  -1.729   0.0838 .
## high_school_gpaD  -0.3339     0.4660  -0.716   0.4737  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1463.9  on 1056  degrees of freedom
## Residual deviance: 1459.0  on 1053  degrees of freedom
##   (8 observations deleted due to missingness)
## AIC: 1467
## 
## Number of Fisher Scoring iterations: 3

# students with lower high school GPAs (especially C) tend to have slightly lower odds of being retained in fall compared to students with an A GPA.
#differences are not statistically significant except marginally for C students
#overall, high school GPA alone is not a strong predictor of fall retention
-
# Odds ratio interpretation
exp(cbind(OR = coef(model_hsgpa_retention), confint(model_hsgpa_retention)))

## Waiting for profiling to be done...

##                          OR      2.5 %    97.5 %
## (Intercept)      -1.1636364 -0.8120781 -1.672883
## high_school_gpaB -0.8912037 -0.5891040 -1.344905
## high_school_gpaC -0.7033926 -0.4710094 -1.047420
## high_school_gpaD -0.7161458 -0.2818191 -1.785610

#students with lower high school GPAs (B, C, D) tend to have lower odds of retention compared to A students,
#differences are not statistically significant in this model,  high school GPA alone is not a strong predictor of retention

Question 4: Improve grades in key gateway subjects (e.g., math and English) from D-level performance to C

master_df <- read.csv("final_cleaned_master_dataset_3.csv")
courses_df <- read.csv("merged_courses_data (1).csv")

# Standardize column name for joining
courses_df <- courses_df %>%
  rename(study_id = StudyID)

# Merge datasets on study_id
merged_df <- left_join(master_df, courses_df, by = "study_id")

# Filter Fall Math/English courses with a D grade
fall_d_students <- merged_df %>%
  filter(collection_term_x == "Fall",
         Course_Subject_all %in% c("MATH", "ENGL"),
         FinalGrade_all == "D") %>%
  mutate(retained_flag = ifelse(retained_in_fall == "Yes", 1, 0))

# Calculate current retention rate for D students
current_retention <- mean(fall_d_students$retained_flag, na.rm = TRUE)

# Create logistic regression model using full dataset with valid GPA and retention info
model_df <- merged_df %>%
  filter(!is.na(gpa_spring), retained_in_fall %in% c("Yes", "No")) %>%
  mutate(retained_flag = ifelse(retained_in_fall == "Yes", 1, 0))

glm_retention <- glm(retained_flag ~ gpa_spring,
                     data = model_df,
                     family = "binomial")

# Simulate GPA improvement: D → C (+0.7 bump capped at 4.0)
fall_d_students_simulated <- fall_d_students %>%
  mutate(simulated_gpa_spring = ifelse(!is.na(gpa_spring),
                                       pmin(gpa_spring + 0.7, 4.0),
                                       NA))

# Predict new retention probabilities using simulated GPA
fall_d_students_simulated$predicted_retention_prob <- predict(
  glm_retention,
  newdata = fall_d_students_simulated %>%
    mutate(gpa_spring = simulated_gpa_spring),
  type = "response"
)

# Estimated simulated retention rate
simulated_retention <- mean(fall_d_students_simulated$predicted_retention_prob, na.rm = TRUE)

# Output results
cat("Current Retention Rate for D Students: ", round(current_retention * 100, 1), "%\n")

## Current Retention Rate for D Students:  44.2 %

cat("Estimated Retention Rate if D's Improved to C's: ", round(simulated_retention * 100, 1), "%\n")

## Estimated Retention Rate if D's Improved to C's:  85.6 %

Question 5: Financial Cliff Students who have a D in Math and English

# Filter: D in Fall + Math/English + Unmet Need ≥ $15,000
cliff_d_students <- merged_df %>%
  filter(collection_term_x == "Fall",
         Course_Subject_all %in% c("MATH", "ENGL"),
         FinalGrade_all == "D",
         unmet_need_amount >= 15000) %>%
  mutate(retained_flag = ifelse(retained_in_fall == "Yes", 1, 0))
current_retention_cliff <- mean(cliff_d_students$retained_flag, na.rm = TRUE)

cliff_d_students_sim <- cliff_d_students %>%
  mutate(simulated_gpa_spring = ifelse(!is.na(gpa_spring), 
                                       pmin(gpa_spring + 0.7, 4.0), 
                                       NA))
# Predict retention using simulated GPA
cliff_d_students_sim$predicted_retention_prob <- predict(
  glm_retention,
  newdata = cliff_d_students_sim %>% 
    mutate(gpa_spring = simulated_gpa_spring),
  type = "response"
)

# Estimate average predicted retention under simulated improvement
simulated_retention_cliff <- mean(cliff_d_students_sim$predicted_retention_prob, na.rm = TRUE)
cat("Current Retention Rate for Financial Cliff D Students: ", 
    round(current_retention_cliff * 100, 1), "%\n")

## Current Retention Rate for Financial Cliff D Students:  25.2 %

cat("Estimated Retention Rate if Grades Improved to C: ", 
    round(simulated_retention_cliff * 100, 1), "%\n")

## Estimated Retention Rate if Grades Improved to C:  84.1 %

Question 6: CASA + Financial Cliff + D Students

# Convert CASA flag 
merged_df <- merged_df %>%
  mutate(casa_flag = ifelse(casa_student %in% c("1", "1.000"), 1, 0))

# Filter for target group
casa_cliff_d_students <- merged_df %>%
  filter(collection_term_x == "Fall",
         Course_Subject_all %in% c("MATH", "ENGL"),
         FinalGrade_all == "D",
         unmet_need_amount >= 15000,
         casa_flag == 1) %>%
  mutate(retained_flag = ifelse(retained_in_fall == "Yes", 1, 0))
current_retention_casa_cliff <- mean(casa_cliff_d_students$retained_flag, na.rm = TRUE)


#Simulate GPA Bump (D → C)
casa_cliff_d_sim <- casa_cliff_d_students %>%
  mutate(simulated_gpa_spring = ifelse(!is.na(gpa_spring), 
                                       pmin(gpa_spring + 0.7, 4.0), 
                                       NA))
#Predict Retention with Improved GPA
casa_cliff_d_sim$predicted_retention_prob <- predict(
  glm_retention,
  newdata = casa_cliff_d_sim %>%
    
    mutate(gpa_spring = simulated_gpa_spring),
  type = "response"
)
simulated_retention_casa_cliff <- mean(casa_cliff_d_sim$predicted_retention_prob, na.rm = TRUE)

#print 
cat("Current Retention Rate for CASA + Financial Cliff D Students: ", 
    round(current_retention_casa_cliff * 100, 1), "%\n")

## Current Retention Rate for CASA + Financial Cliff D Students:  16.7 %

cat("Estimated Retention Rate if Grades Improved to C: ", 
    round(simulated_retention_casa_cliff * 100, 1), "%\n")

## Estimated Retention Rate if Grades Improved to C:  78.2 %

Capstone Retention Analysis

Group D

2025-07-25

Load and Clean Data

Prepare Data for Modeling

Train/Test Split

Model Evaluation

K 5-Fold Cross Validation

Question 1: Academic and financial predictors model for retention

Question 2: Do CASA program students have higher retention rates?

Question 3: Is high school GPA a strong predictor of college GPA and retention among Financial Cliff students?

Question 4: Improve grades in key gateway subjects (e.g., math and English) from D-level performance to C

Question 5: Financial Cliff Students who have a D in Math and English

Question 6: CASA + Financial Cliff + D Students