setwd("C:/Users/avaar/OneDrive/Desktop/CAPSTONE")
df <- read_csv("final_cleaned_master_dataset_3.csv", col_types = cols(study_id = col_character()))
courses_df <- read_csv("merged_courses_data (1).csv")
## Rows: 40850 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Collection_Term_all, Course_Subject_all, Course_Number_all, Section...
## dbl (5): StudyID, Collection_Year_combined, Course_Hours_all, CRN_all, Cours...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- df %>%
mutate(
cumulative_gpa_fall = as.numeric(cumulative_gpa_fall),
gpa_spring = as.numeric(gpa_spring),
retained_in_fall = str_to_title(str_trim(retained_in_fall))
) %>%
filter(retained_in_fall %in% c("Yes", "No"))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `cumulative_gpa_fall = as.numeric(cumulative_gpa_fall)`.
## Caused by warning:
## ! NAs introduced by coercion
df$retained_in_fall <- factor(df$retained_in_fall)
df_model <- df %>%
filter(!is.na(gpa_spring),
!is.na(expected_family_contribution),
!is.na(cost_of_attendance),
!is.na(financial_aid_disbursement_amount),
!is.na(unmet_need_amount)) %>%
mutate(
retained_flag = ifelse(retained_in_fall == "Yes", 1, 0),
expected_family_contribution = as.numeric(expected_family_contribution),
cost_of_attendance = as.numeric(cost_of_attendance),
financial_aid_disbursement_amount = as.numeric(financial_aid_disbursement_amount),
unmet_need_amount = as.numeric(unmet_need_amount)
) %>%
select(retained_flag, gpa_spring,
expected_family_contribution, cost_of_attendance,
financial_aid_disbursement_amount, unmet_need_amount)
set.seed(123)
train_proportion <- 0.7
train_index <- sample(seq_len(nrow(df_model)), size = floor(train_proportion * nrow(df_model)))
train_data <- df_model[train_index, ]
test_data <- df_model[-train_index, ]
cat("Training size:", nrow(train_data), "\n")
## Training size: 2683
cat("Testing size:", nrow(test_data), "\n")
## Testing size: 1150
cat("Training class distribution:\n")
## Training class distribution:
print(prop.table(table(train_data$retained_flag)))
##
## 0 1
## 0.1714499 0.8285501
cat("Testing class distribution:\n")
## Testing class distribution:
print(prop.table(table(test_data$retained_flag)))
##
## 0 1
## 0.1869565 0.8130435
# Check original balance
table(train_data$retained_flag)
##
## 0 1
## 460 2223
# Separate classes
majority <- train_data %>% filter(retained_flag == 1)
minority <- train_data %>% filter(retained_flag == 0)
# Upsample the minority class (repeat rows to match majority)
set.seed(42)
minority_upsampled <- minority %>%
slice_sample(n = nrow(majority), replace = TRUE)
# Combine and shuffle
train_balanced <- bind_rows(majority, minority_upsampled) %>%
slice_sample(n = nrow(.), replace = FALSE)
# Check new class balance
table(train_balanced$retained_flag)
##
## 0 1
## 2223 2223
# Train logistic regression on balanced training set
log_model_recall <- glm(retained_flag ~ ., data = train_balanced, family = "binomial")
# Predict probabilities on test set
log_probs <- predict(log_model_recall, newdata = test_data, type = "response")
# Apply a lower classification threshold (0.35 instead of the default 0.5) to favor recall for class 0 (not retained)
log_pred <- ifelse(log_probs > 0.35, 1, 0)
log_pred <- as.factor(log_pred)
actual <- test_data$retained_flag
# Confusion matrix
conf_matrix <- table(Predicted = log_pred, Actual = actual)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 96 146
## 1 119 789
# Accuracy
accuracy <- mean(log_pred == actual)
# Metrics for Class 0 (Not Retained)
tp <- sum(log_pred == "0" & actual == "0") # True positives
fp <- sum(log_pred == "0" & actual == "1") # False positives
fn <- sum(log_pred == "1" & actual == "0") # False negatives
# Calculate metrics
precision <- ifelse((tp + fp) > 0, tp / (tp + fp), NA)
recall <- ifelse((tp + fn) > 0, tp / (tp + fn), NA)
f1 <- ifelse(!is.na(precision) & !is.na(recall) & (precision + recall) > 0,
2 * (precision * recall) / (precision + recall), NA)
# Print results
cat("🔹 Prioritizing Recall for Class 0 (Not Retained)\n")
## 🔹 Prioritizing Recall for Class 0 (Not Retained)
cat("Test Accuracy:", round(accuracy, 3), "\n")
## Test Accuracy: 0.77
cat("Precision (Class 0):", round(precision, 2), "\n")
## Precision (Class 0): 0.4
cat("Recall (Class 0):", round(recall, 2), "\n")
## Recall (Class 0): 0.45
cat("F1 Score (Class 0):", round(f1, 2), "\n")
## F1 Score (Class 0): 0.42
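# The 0.35 cutoff above was picked to favor recall for class 0. A minimal
# sketch (assumed, not part of the original analysis) of scanning candidate
# thresholds on the test set to see the recall/precision trade-off:
thresholds <- seq(0.20, 0.60, by = 0.05)
threshold_scan <- sapply(thresholds, function(t) {
  pred_t <- ifelse(log_probs > t, 1, 0)
  tp_t <- sum(pred_t == 0 & actual == 0)
  fp_t <- sum(pred_t == 0 & actual == 1)
  fn_t <- sum(pred_t == 1 & actual == 0)
  c(threshold = t,
    recall_0 = ifelse(tp_t + fn_t > 0, tp_t / (tp_t + fn_t), NA),
    precision_0 = ifelse(tp_t + fp_t > 0, tp_t / (tp_t + fp_t), NA))
})
round(t(threshold_scan), 3)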
# Number of folds
k <- 5
set.seed(123)
# Shuffle and split into k folds
n <- nrow(train_balanced)
folds <- sample(rep(1:k, length.out = n))
# Initialize vectors to store metrics
accuracies <- c()
precisions <- c()
recalls <- c()
f1_scores <- c()
# Run k-fold CV
for (i in 1:k) {
test_idx <- which(folds == i)
test_fold <- train_balanced[test_idx, ]
train_fold <- train_balanced[-test_idx, ]
# Fit model
model <- glm(retained_flag ~ ., data = train_fold, family = "binomial")
# Predict on test fold
probs <- predict(model, newdata = test_fold, type = "response")
pred <- ifelse(probs > 0.35, 1, 0)
pred <- as.factor(pred)
actual <- test_fold$retained_flag
# Metrics for class 0
tp <- sum(pred == "0" & actual == "0")
fp <- sum(pred == "0" & actual == "1")
fn <- sum(pred == "1" & actual == "0")
precision <- ifelse(tp + fp > 0, tp / (tp + fp), NA)
recall <- ifelse(tp + fn > 0, tp / (tp + fn), NA)
f1 <- ifelse(!is.na(precision) & !is.na(recall) & (precision + recall) > 0,
2 * (precision * recall) / (precision + recall), NA)
acc <- mean(pred == actual)
# Store results
accuracies[i] <- acc
precisions[i] <- precision
recalls[i] <- recall
f1_scores[i] <- f1
}
# Average results across folds
cat("🔁 5-Fold Cross-Validation Results\n")
## 🔁 5-Fold Cross-Validation Results
cat("Avg Accuracy:", round(mean(accuracies, na.rm = TRUE), 3), "\n")
## Avg Accuracy: 0.673
cat("Avg Precision (Class 0):", round(mean(precisions, na.rm = TRUE), 2), "\n")
## Avg Precision (Class 0): 0.78
cat("Avg Recall (Class 0):", round(mean(recalls, na.rm = TRUE), 2), "\n")
## Avg Recall (Class 0): 0.48
cat("Avg F1 Score (Class 0):", round(mean(f1_scores, na.rm = TRUE), 2), "\n")
## Avg F1 Score (Class 0): 0.59
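# Note: the folds above are drawn from the already-upsampled training set, so
# duplicated minority rows can appear in both the training and held-out folds.
# A minimal sketch (assumed) of upsampling within each fold instead, reporting
# class-0 recall at the same 0.35 cutoff:
set.seed(123)
folds_raw <- sample(rep(1:k, length.out = nrow(train_data)))
cv_recall_0 <- sapply(1:k, function(i) {
  test_fold  <- train_data[folds_raw == i, ]
  train_fold <- train_data[folds_raw != i, ]
  maj <- filter(train_fold, retained_flag == 1)
  mino <- filter(train_fold, retained_flag == 0)
  fold_bal <- bind_rows(maj, slice_sample(mino, n = nrow(maj), replace = TRUE))
  m <- glm(retained_flag ~ ., data = fold_bal, family = "binomial")
  p <- ifelse(predict(m, newdata = test_fold, type = "response") > 0.35, 1, 0)
  sum(p == 0 & test_fold$retained_flag == 0) / sum(test_fold$retained_flag == 0)
})
round(mean(cv_recall_0), 2)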
# Clean unmet need: extreme negative values appear to be data errors, though some could reflect scholarship money
unique(df$unmet_need_amount[df$unmet_need_amount < -100000])
## [1] NA -9995877 -9984197 -9975697 -106606 -9969235 -9981690
## [8] -9985675 -114315 -9987397 -157132 -10000253 -112358 -9998197
## [15] -9967735 -9970235 -9968235 -130599 -9967235 -126310 -9982378
## [22] -9978197 -124377 -10002459 -9983185 -9985693 -100901 -10000197
## [29] -9981833 -141254 -9974235 -101249 -9972722 -9973429 -9975735
## [36] -9981506 -115239 -9981606 -10000310 -9981044 -9993697 -9972135
## [43] -9966735 -9978735 -9976197 -9992836 -9996197 -9977797 -9980197
## [50] -9998281 -9976697 -9987697 -9998503 -9970688 -9998423 -9981235
## [57] -10000352 -9996288 -9974485 -9976235 -9975110 -9987781 -9973235
## [64] -133401 -9993725 -9971235 -9988812 -9977197 -9983785 -193561
## [71] -9974470 -9995763 -101700 -114779 -9962290 -182483 -9977235
## [78] -111055 -117358 -9997625 -9982697 -110491 -180355 -9979197
## [85] -9974735 -9980785 -9980749 -9978397 -9975435 -9981197 -9994565
## [92] -9990881 -9961790 -9982237 -9971406 -9977735 -9987470 -9998868
## [99] -9979935 -9973735 -9985785 -9984295 -9994266 -9973917 -9967285
## [106] -9985310 -325889 -115904 -10005897 -9968735 -9971606 -9975797
## [113] -9971735 -10005365 -9983612
#Set extreme negative unmet need to NA (threshold based on institutional data logic)
df$unmet_need_amount <- ifelse(df$unmet_need_amount < -100000, NA, df$unmet_need_amount)
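# Quick sanity check (assumed addition): how many unmet-need values are now
# missing after the recode (includes any NAs already present in the data)
sum(is.na(df$unmet_need_amount))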
# Filter data set with cleaned unmet need and known retention
df_model <- df %>%
filter(retained_in_fall %in% c("Yes", "No")) %>%
mutate(
gpa_spring = as.numeric(gpa_spring),
expected_family_contribution = as.numeric(expected_family_contribution),
cost_of_attendance = as.numeric(cost_of_attendance),
financial_aid_disbursement_amount = as.numeric(financial_aid_disbursement_amount),
unmet_need_amount = as.numeric(unmet_need_amount)
) %>%
filter(!is.na(gpa_spring),
!is.na(expected_family_contribution),
!is.na(cost_of_attendance),
!is.na(financial_aid_disbursement_amount),
!is.na(unmet_need_amount))
# Convert outcome to factor
df_model$retained_in_fall <- factor(df_model$retained_in_fall)
# Fit logistic regression model
model_acad_fin <- glm(retained_in_fall ~ gpa_spring +
expected_family_contribution + cost_of_attendance +
financial_aid_disbursement_amount + unmet_need_amount,
data = df_model,
family = "binomial")
summary(model_acad_fin)
##
## Call:
## glm(formula = retained_in_fall ~ gpa_spring + expected_family_contribution +
## cost_of_attendance + financial_aid_disbursement_amount +
## unmet_need_amount, family = "binomial", data = df_model)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6365 0.3418 0.4524 0.6313 1.2889
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.151e+00 2.975e-01 3.868 0.00011 ***
## gpa_spring 6.711e-01 3.770e-02 17.801 < 2e-16 ***
## expected_family_contribution 6.474e-06 3.203e-06 2.021 0.04326 *
## cost_of_attendance -3.863e-05 9.214e-06 -4.193 2.75e-05 ***
## financial_aid_disbursement_amount 1.424e-05 5.100e-06 2.791 0.00525 **
## unmet_need_amount NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3497.3 on 3714 degrees of freedom
## Residual deviance: 3116.8 on 3710 degrees of freedom
## AIC: 3126.8
##
## Number of Fisher Scoring iterations: 5
# Calculate odds ratios for interpretation
exp(cbind(Odds_Ratio = coef(model_acad_fin), confint(model_acad_fin)))
## Waiting for profiling to be done...
## Odds_Ratio 2.5 % 97.5 %
## (Intercept) 3.1606443 1.7707475 5.6865943
## gpa_spring 1.9563932 1.8180299 2.1076611
## expected_family_contribution 1.0000065 1.0000004 1.0000129
## cost_of_attendance 0.9999614 0.9999432 0.9999794
## financial_aid_disbursement_amount 1.0000142 1.0000042 1.0000242
## unmet_need_amount NA NA NA
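# The model above drops unmet_need_amount ("1 not defined because of
# singularities"), which suggests it is a linear combination of the other
# financial variables. A minimal sketch of checking that; the
# cost - EFC - aid identity is an assumption about how unmet need was derived:
with(df_model,
     cor(unmet_need_amount,
         cost_of_attendance - expected_family_contribution -
           financial_aid_disbursement_amount))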
# Fit a decision tree model
tree_model <- rpart(retained_in_fall ~ gpa_spring +
expected_family_contribution + cost_of_attendance +
financial_aid_disbursement_amount + unmet_need_amount,
data = df_model,
method = "class")
# View model summary
summary(tree_model)
## Call:
## rpart(formula = retained_in_fall ~ gpa_spring + expected_family_contribution +
## cost_of_attendance + financial_aid_disbursement_amount +
## unmet_need_amount, data = df_model, method = "class")
## n= 3715
##
## CP nsplit rel error xerror xstd
## 1 0.04122939 0 1.0000000 1.0000000 0.03507237
## 2 0.01000000 2 0.9175412 0.9235382 0.03398568
##
## Variable importance
## gpa_spring unmet_need_amount
## 98 1
## financial_aid_disbursement_amount
## 1
##
## Node number 1: 3715 observations, complexity param=0.04122939
## predicted class=Yes expected loss=0.1795424 P(node) =1
## class counts: 667 3048
## probabilities: 0.180 0.820
## left son=2 (652 obs) right son=3 (3063 obs)
## Primary splits:
## gpa_spring < 0.9215 to the left, improve=124.509900, (0 missing)
## unmet_need_amount < 20105.5 to the right, improve= 6.348405, (0 missing)
## cost_of_attendance < 32468.5 to the right, improve= 5.187697, (0 missing)
## expected_family_contribution < 2091.5 to the left, improve= 3.794012, (0 missing)
## financial_aid_disbursement_amount < 40365.5 to the left, improve= 1.622693, (0 missing)
## Surrogate splits:
## unmet_need_amount < 33484.5 to the right, agree=0.825, adj=0.002, (0 split)
##
## Node number 2: 652 observations, complexity param=0.04122939
## predicted class=Yes expected loss=0.4601227 P(node) =0.1755047
## class counts: 300 352
## probabilities: 0.460 0.540
## left son=4 (341 obs) right son=5 (311 obs)
## Primary splits:
## gpa_spring < 0.2425 to the left, improve=20.7686200, (0 missing)
## unmet_need_amount < 20206 to the right, improve= 6.3640930, (0 missing)
## financial_aid_disbursement_amount < 17346.5 to the left, improve= 3.2515290, (0 missing)
## cost_of_attendance < 39542.5 to the right, improve= 1.0526350, (0 missing)
## expected_family_contribution < 63531 to the right, improve= 0.9141988, (0 missing)
## Surrogate splits:
## financial_aid_disbursement_amount < 28246.5 to the left, agree=0.541, adj=0.039, (0 split)
## cost_of_attendance < 39570.5 to the left, agree=0.538, adj=0.032, (0 split)
## unmet_need_amount < -6625 to the right, agree=0.538, adj=0.032, (0 split)
## expected_family_contribution < 30019.5 to the left, agree=0.537, adj=0.029, (0 split)
##
## Node number 3: 3063 observations
## predicted class=Yes expected loss=0.1198172 P(node) =0.8244953
## class counts: 367 2696
## probabilities: 0.120 0.880
##
## Node number 4: 341 observations
## predicted class=No expected loss=0.4193548 P(node) =0.09179004
## class counts: 198 143
## probabilities: 0.581 0.419
##
## Node number 5: 311 observations
## predicted class=Yes expected loss=0.3279743 P(node) =0.08371467
## class counts: 102 209
## probabilities: 0.328 0.672
# Visualize decision tree without fallen leaves
rpart.plot(tree_model,
type = 2, # shows split variable and threshold
extra = 104, # shows class, probability, and percentage
fallen.leaves = FALSE, # disables fallen leaves for standard layout
main = "Decision Tree for Student Retention (All Predictors)")
#Tuning the Decision Tree
# Create a sequence of cp values to test
cp_values <- seq(0.001, 0.05, by = 0.005)
results <- data.frame(cp = numeric(), Accuracy = numeric())
for (cp in cp_values) {
# Train model with cp fixed at 0.001 (the loop variable cp is not used here)
model <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance + expected_family_contribution +
financial_aid_disbursement_amount + unmet_need_amount,
data = df_model,
method = "class",
control = rpart.control(cp =0.001))
# Predict on training data
pred <- predict(model, type = "class")
acc <- mean(pred == df_model$retained_in_fall)
# Store results
results <- rbind(results, data.frame(cp =0.001, Accuracy = acc))
}
# View results
print(results)
## cp Accuracy
## 1 0.001 0.8602961
## 2 0.001 0.8602961
## 3 0.001 0.8602961
## 4 0.001 0.8602961
## 5 0.001 0.8602961
## 6 0.001 0.8602961
## 7 0.001 0.8602961
## 8 0.001 0.8602961
## 9 0.001 0.8602961
## 10 0.001 0.8602961
for (cp in cp_values) {
# Train model with cp fixed at 0.05 (the loop variable cp is not used here)
model <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance + expected_family_contribution +
financial_aid_disbursement_amount + unmet_need_amount,
data = df_model,
method = "class",
control = rpart.control(cp =0.05))
# Predict on training data
pred <- predict(model, type = "class")
acc <- mean(pred == df_model$retained_in_fall)
# Store results
results <- rbind(results, data.frame(cp =0.05, Accuracy = acc))
}
# View results
print(results)
## cp Accuracy
## 1 0.001 0.8602961
## 2 0.001 0.8602961
## 3 0.001 0.8602961
## 4 0.001 0.8602961
## 5 0.001 0.8602961
## 6 0.001 0.8602961
## 7 0.001 0.8602961
## 8 0.001 0.8602961
## 9 0.001 0.8602961
## 10 0.001 0.8602961
## 11 0.050 0.8204576
## 12 0.050 0.8204576
## 13 0.050 0.8204576
## 14 0.050 0.8204576
## 15 0.050 0.8204576
## 16 0.050 0.8204576
## 17 0.050 0.8204576
## 18 0.050 0.8204576
## 19 0.050 0.8204576
## 20 0.050 0.8204576
for (cp in cp_values) {
# Train model with cp fixed at 0.005 (the loop variable cp is not used here)
model <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance + expected_family_contribution +
financial_aid_disbursement_amount + unmet_need_amount,
data = df_model,
method = "class",
control = rpart.control(cp =0.005))
# Predict on training data
pred <- predict(model, type = "class")
acc <- mean(pred == df_model$retained_in_fall)
# Store results
results <- rbind(results, data.frame(cp =0.005, Accuracy = acc))
}
# View results
print(results)
## cp Accuracy
## 1 0.001 0.8602961
## 2 0.001 0.8602961
## 3 0.001 0.8602961
## 4 0.001 0.8602961
## 5 0.001 0.8602961
## 6 0.001 0.8602961
## 7 0.001 0.8602961
## 8 0.001 0.8602961
## 9 0.001 0.8602961
## 10 0.001 0.8602961
## 11 0.050 0.8204576
## 12 0.050 0.8204576
## 13 0.050 0.8204576
## 14 0.050 0.8204576
## 15 0.050 0.8204576
## 16 0.050 0.8204576
## 17 0.050 0.8204576
## 18 0.050 0.8204576
## 19 0.050 0.8204576
## 20 0.050 0.8204576
## 21 0.005 0.8422611
## 22 0.005 0.8422611
## 23 0.005 0.8422611
## 24 0.005 0.8422611
## 25 0.005 0.8422611
## 26 0.005 0.8422611
## 27 0.005 0.8422611
## 28 0.005 0.8422611
## 29 0.005 0.8422611
## 30 0.005 0.8422611
# Best cp value
best_cp <- results$cp[which.max(results$Accuracy)]
cat("Best cp:", best_cp, "\n")
## Best cp: 0.001
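# The three loops above refit the tree with a fixed cp (0.001, 0.05, 0.005) on
# every pass, so the ten rows within each block are identical. A minimal sketch
# (assumed) of a sweep that actually varies cp across cp_values; it is still
# scored on training accuracy, so smaller cp values will tend to look best:
cp_sweep <- data.frame(cp = cp_values, Accuracy = NA_real_)
for (j in seq_along(cp_values)) {
  m <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance +
               expected_family_contribution + financial_aid_disbursement_amount +
               unmet_need_amount,
             data = df_model, method = "class",
             control = rpart.control(cp = cp_values[j]))
  cp_sweep$Accuracy[j] <- mean(predict(m, type = "class") == df_model$retained_in_fall)
}
print(cp_sweep)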
# Retrain using best cp
final_model <- rpart(retained_in_fall ~ gpa_spring + cost_of_attendance + expected_family_contribution +
financial_aid_disbursement_amount + unmet_need_amount,
data = df_model,
method = "class",
control = rpart.control(cp = best_cp))
# Visualize
rpart.plot(final_model)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Print complexity parameter table
printcp(final_model)
##
## Classification tree:
## rpart(formula = retained_in_fall ~ gpa_spring + cost_of_attendance +
## expected_family_contribution + financial_aid_disbursement_amount +
## unmet_need_amount, data = df_model, method = "class", control = rpart.control(cp = best_cp))
##
## Variables actually used in tree construction:
## [1] cost_of_attendance expected_family_contribution
## [3] financial_aid_disbursement_amount gpa_spring
## [5] unmet_need_amount
##
## Root node error: 667/3715 = 0.17954
##
## n= 3715
##
## CP nsplit rel error xerror xstd
## 1 0.0412294 0 1.00000 1.00000 0.035072
## 2 0.0059970 2 0.91754 0.93103 0.034096
## 3 0.0054973 4 0.90555 0.98201 0.034824
## 4 0.0052474 7 0.88906 0.98501 0.034866
## 5 0.0033733 9 0.87856 0.99400 0.034990
## 6 0.0029985 23 0.81709 0.97751 0.034761
## 7 0.0022489 25 0.81109 0.97751 0.034761
## 8 0.0014993 31 0.79610 0.97601 0.034740
## 9 0.0010709 32 0.79460 0.98801 0.034907
## 10 0.0010000 43 0.77811 1.01799 0.035317
# Chose cp = 0.005997 using the one-standard-error (1-SE) rule
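# A minimal sketch of picking that value programmatically from the cp table
# (1-SE rule: largest cp whose xerror is within one xstd of the minimum xerror):
cp_tab <- final_model$cptable
xerr_cut <- min(cp_tab[, "xerror"]) + cp_tab[which.min(cp_tab[, "xerror"]), "xstd"]
cp_1se <- max(cp_tab[cp_tab[, "xerror"] <= xerr_cut, "CP"])
cp_1se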
# Optimal cp value based on 1-SE Rule
optimal_cp <- 0.005997
# Prune the tree
pruned_model <- prune(final_model, cp = optimal_cp)
# Plot
rpart.plot(pruned_model, type = 2, extra = 104, main = "Pruned Classification Tree")
# Recode CASA variable for consistency
df$casa_student <- ifelse(df$casa_student %in% c("1", "1.000"), 1,
ifelse(df$casa_student %in% c("0", "0.000", "0.00%"), 0, NA))
df$casa_student <- factor(df$casa_student)
table(df$casa_student, useNA = "ifany")
##
## 0 1
## 4040 426
df_clean <- df %>%
filter(retained_in_fall %in% c("Yes", "No"))
df_clean$retained_in_fall <- factor(df_clean$retained_in_fall)
# Chi-square test
casa_table <- table(df_clean$casa_student, df_clean$retained_in_fall)
print(casa_table)
##
## No Yes
## 0 1092 2948
## 1 154 272
chi_casa <- chisq.test(casa_table)
print(chi_casa)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: casa_table
## X-squared = 15.486, df = 1, p-value = 8.313e-05
#There is a statistically significant association between CASA program participation and retention.
#Retention rates differ between CASA and non-CASA students; in this sample, CASA participants were retained at a lower rate (roughly 64% vs. 73%).
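# A small assumed follow-up: retention rate within each CASA group, computed
# from the same contingency table
round(prop.table(casa_table, margin = 1), 3)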
#logistic regression model
model_casa <- glm(retained_in_fall ~ casa_student,
data = df_clean,
family = "binomial")
summary(model_casa)
##
## Call:
## glm(formula = retained_in_fall ~ casa_student, family = "binomial",
## data = df_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6176 -1.4265 0.7939 0.7939 0.9473
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.99312 0.03543 28.034 < 2e-16 ***
## casa_student1 -0.42427 0.10689 -3.969 7.21e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5287.8 on 4465 degrees of freedom
## Residual deviance: 5272.6 on 4464 degrees of freedom
## AIC: 5276.6
##
## Number of Fisher Scoring iterations: 4
# Calculate odds ratios for interpretation
model_casa_tidy <- tidy(model_casa, conf.int = TRUE, exponentiate = TRUE)
print(model_casa_tidy)
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.70 0.0354 28.0 6.26e-173 2.52 2.89
## 2 casa_student1 0.654 0.107 -3.97 7.21e- 5 0.531 0.808
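# A minimal sketch (assumed) of translating the odds ratio into predicted
# retention probabilities for non-CASA vs CASA students:
predict(model_casa,
        newdata = data.frame(casa_student = factor(c(0, 1), levels = c(0, 1))),
        type = "response")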
# Subset to Financial Cliff students only
financial_cliff_df <- df %>%
filter(unmet_need_amount >= 15000)
# Linear regression: college GPA ~ high school GPA
model_hsgpa_collegegpa <- lm(gpa_spring ~ high_school_gpa, data = financial_cliff_df)
summary(model_hsgpa_collegegpa)
##
## Call:
## lm(formula = gpa_spring ~ high_school_gpa, data = financial_cliff_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7669 -0.8928 0.1472 0.9174 2.2689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7669 0.1301 21.267 < 2e-16 ***
## high_school_gpaB -0.3741 0.1503 -2.489 0.0131 *
## high_school_gpaC -1.0358 0.1468 -7.053 4.69e-12 ***
## high_school_gpaD -1.7527 0.3602 -4.865 1.45e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.164 on 619 degrees of freedom
## (442 observations deleted due to missingness)
## Multiple R-squared: 0.116, Adjusted R-squared: 0.1117
## F-statistic: 27.06 on 3 and 619 DF, p-value: < 2.2e-16
#Students with higher high school GPAs tend to have higher spring college GPAs. Compared to students with an A high school GPA,
#students with a B average score about 0.37 points lower, C students about 1.04 points lower, and D students about 1.75 points lower in spring GPA.
#High school GPA explains about 12% of variation in spring college GPA outcomes.
# Logistic regression: retention ~ high school GPA
model_hsgpa_retention <- glm(retained_in_fall ~ high_school_gpa,
data = financial_cliff_df,
family = "binomial")
summary(model_hsgpa_retention)
##
## Call:
## glm(formula = retained_in_fall ~ high_school_gpa, family = "binomial",
## data = financial_cliff_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.242 -1.094 -1.094 1.162 1.264
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1515 0.1839 0.824 0.4098
## high_school_gpaB -0.1152 0.2102 -0.548 0.5838
## high_school_gpaC -0.3518 0.2035 -1.729 0.0838 .
## high_school_gpaD -0.3339 0.4660 -0.716 0.4737
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1463.9 on 1056 degrees of freedom
## Residual deviance: 1459.0 on 1053 degrees of freedom
## (8 observations deleted due to missingness)
## AIC: 1467
##
## Number of Fisher Scoring iterations: 3
# Students with lower high school GPAs (especially C) tend to have slightly lower odds of being retained in fall than students with an A GPA.
# The differences are not statistically significant except, marginally, for C students.
# Overall, high school GPA alone is not a strong predictor of fall retention.
# Odds ratio interpretation
exp(cbind(OR = coef(model_hsgpa_retention), confint(model_hsgpa_retention)))
## Waiting for profiling to be done...
##                         OR     2.5 %   97.5 %
## (Intercept)      1.1636364 0.8120781 1.672883
## high_school_gpaB 0.8912037 0.5891040 1.344905
## high_school_gpaC 0.7033926 0.4710094 1.047420
## high_school_gpaD 0.7161458 0.2818191 1.785610
#Students with lower high school GPAs (B, C, D) tend to have lower odds of retention than A students,
#but the differences are not statistically significant in this model; high school GPA alone is not a strong predictor of retention.
master_df <- read.csv("final_cleaned_master_dataset_3.csv")
courses_df <- read.csv("merged_courses_data (1).csv")
# Standardize column name for joining
courses_df <- courses_df %>%
rename(study_id = StudyID)
# Merge datasets on study_id
merged_df <- left_join(master_df, courses_df, by = "study_id")
# Filter Fall Math/English courses with a D grade
fall_d_students <- merged_df %>%
filter(collection_term_x == "Fall",
Course_Subject_all %in% c("MATH", "ENGL"),
FinalGrade_all == "D") %>%
mutate(retained_flag = ifelse(retained_in_fall == "Yes", 1, 0))
# Calculate current retention rate for D students
current_retention <- mean(fall_d_students$retained_flag, na.rm = TRUE)
# Create logistic regression model using full dataset with valid GPA and retention info
model_df <- merged_df %>%
filter(!is.na(gpa_spring), retained_in_fall %in% c("Yes", "No")) %>%
mutate(retained_flag = ifelse(retained_in_fall == "Yes", 1, 0))
glm_retention <- glm(retained_flag ~ gpa_spring,
data = model_df,
family = "binomial")
# Simulate GPA improvement: D → C (+0.7 bump capped at 4.0)
fall_d_students_simulated <- fall_d_students %>%
mutate(simulated_gpa_spring = ifelse(!is.na(gpa_spring),
pmin(gpa_spring + 0.7, 4.0),
NA))
# Predict new retention probabilities using simulated GPA
fall_d_students_simulated$predicted_retention_prob <- predict(
glm_retention,
newdata = fall_d_students_simulated %>%
mutate(gpa_spring = simulated_gpa_spring),
type = "response"
)
# Estimated simulated retention rate
simulated_retention <- mean(fall_d_students_simulated$predicted_retention_prob, na.rm = TRUE)
# Output results
cat("Current Retention Rate for D Students: ", round(current_retention * 100, 1), "%\n")
## Current Retention Rate for D Students: 44.2 %
cat("Estimated Retention Rate if D's Improved to C's: ", round(simulated_retention * 100, 1), "%\n")
## Estimated Retention Rate if D's Improved to C's: 85.6 %
# Filter: D in Fall + Math/English + Unmet Need ≥ $15,000
cliff_d_students <- merged_df %>%
filter(collection_term_x == "Fall",
Course_Subject_all %in% c("MATH", "ENGL"),
FinalGrade_all == "D",
unmet_need_amount >= 15000) %>%
mutate(retained_flag = ifelse(retained_in_fall == "Yes", 1, 0))
current_retention_cliff <- mean(cliff_d_students$retained_flag, na.rm = TRUE)
cliff_d_students_sim <- cliff_d_students %>%
mutate(simulated_gpa_spring = ifelse(!is.na(gpa_spring),
pmin(gpa_spring + 0.7, 4.0),
NA))
# Predict retention using simulated GPA
cliff_d_students_sim$predicted_retention_prob <- predict(
glm_retention,
newdata = cliff_d_students_sim %>%
mutate(gpa_spring = simulated_gpa_spring),
type = "response"
)
# Estimate average predicted retention under simulated improvement
simulated_retention_cliff <- mean(cliff_d_students_sim$predicted_retention_prob, na.rm = TRUE)
cat("Current Retention Rate for Financial Cliff D Students: ",
round(current_retention_cliff * 100, 1), "%\n")
## Current Retention Rate for Financial Cliff D Students: 25.2 %
cat("Estimated Retention Rate if Grades Improved to C: ",
round(simulated_retention_cliff * 100, 1), "%\n")
## Estimated Retention Rate if Grades Improved to C: 84.1 %
# Convert CASA flag
merged_df <- merged_df %>%
mutate(casa_flag = ifelse(casa_student %in% c("1", "1.000"), 1, 0))
# Filter for target group
casa_cliff_d_students <- merged_df %>%
filter(collection_term_x == "Fall",
Course_Subject_all %in% c("MATH", "ENGL"),
FinalGrade_all == "D",
unmet_need_amount >= 15000,
casa_flag == 1) %>%
mutate(retained_flag = ifelse(retained_in_fall == "Yes", 1, 0))
current_retention_casa_cliff <- mean(casa_cliff_d_students$retained_flag, na.rm = TRUE)
#Simulate GPA Bump (D → C)
casa_cliff_d_sim <- casa_cliff_d_students %>%
mutate(simulated_gpa_spring = ifelse(!is.na(gpa_spring),
pmin(gpa_spring + 0.7, 4.0),
NA))
#Predict Retention with Improved GPA
casa_cliff_d_sim$predicted_retention_prob <- predict(
glm_retention,
newdata = casa_cliff_d_sim %>%
mutate(gpa_spring = simulated_gpa_spring),
type = "response"
)
simulated_retention_casa_cliff <- mean(casa_cliff_d_sim$predicted_retention_prob, na.rm = TRUE)
#print
cat("Current Retention Rate for CASA + Financial Cliff D Students: ",
round(current_retention_casa_cliff * 100, 1), "%\n")
## Current Retention Rate for CASA + Financial Cliff D Students: 16.7 %
cat("Estimated Retention Rate if Grades Improved to C: ",
round(simulated_retention_casa_cliff * 100, 1), "%\n")
## Estimated Retention Rate if Grades Improved to C: 78.2 %
##Tier 1 Unmet Need ≥ $15,000 AND GPA < 2.0 Urgent academic + financial support
##Tier 2 Unmet Need ≥ $15,000 AND GPA ≥ 2.0 Financial support needed
##Tier 3 Unmet Need < $15,000 OR GPA < 2.0 Moderate concern (either GPA or need)
df <- read_csv("final_cleaned_master_dataset_3.csv")
## Rows: 6835 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): collection_term_x, high_school_gpa, majors, college, cumulative_gp...
## dbl (36): study_id, collection_year_x, expected_family_contribution, cost_of...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean and prepare key variables
df <- df %>%
mutate(
gpa_spring = as.numeric(gpa_spring),
unmet_need_amount = as.numeric(unmet_need_amount),
retained_in_fall = ifelse(retained_in_fall == "Yes", 1, 0)
) %>%
filter(unmet_need_amount > 0, !is.na(gpa_spring))
# Assign intervention tiers
df <- df %>%
mutate(
intervention_tier = case_when(
unmet_need_amount >= 15000 & gpa_spring < 2.0 ~ "Tier 1: Urgent (Low GPA + High Need)",
unmet_need_amount >= 15000 & gpa_spring >= 2.0 ~ "Tier 2: Financial Need Only",
unmet_need_amount < 15000 | gpa_spring < 2.0 ~ "Tier 3: Moderate Concern"
),
retention_status = ifelse(retained_in_fall == 1, "Retained", "Not Retained")
)
# Create final visualization
ggplot(df, aes(x = unmet_need_amount, y = gpa_spring,
color = intervention_tier, shape = retention_status)) +
geom_point(size = 2.5, alpha = 0.7) +
# Horizontal GPA threshold line at 2.0
geom_hline(yintercept = 2.0, linetype = "dashed", color = "gray40", linewidth = 0.8) +
# Vertical Unmet Need threshold line at $15,000
geom_vline(xintercept = 15000, linetype = "dashed", color = "gray40", linewidth = 0.8) +
scale_color_manual(values = c(
"Tier 1: Urgent (Low GPA + High Need)" = "red",
"Tier 2: Financial Need Only" = "orange",
"Tier 3: Moderate Concern" = "blue"
)) +
scale_shape_manual(values = c("Not Retained" = 16, "Retained" = 17)) +
scale_x_continuous(labels = scales::dollar_format()) +
labs(
title = "Financial Risk Tiers with Retention Status (All Students)",
subtitle = "Dashed lines mark GPA < 2.0 and Unmet Need ≥ $15,000",
x = "Unmet Financial Need ($)",
y = "Spring GPA",
color = "Intervention Tier",
shape = "Retention Status"
) +
theme_minimal()
# Filter for Tier 1 + Not Retained
tier1_not_retained <- df %>%
filter(
intervention_tier == "Tier 1: Urgent (Low GPA + High Need)",
retained_in_fall == 0
)
tier1_not_retained %>%
summarise(
total_students = n(),
avg_gpa = mean(gpa_spring, na.rm = TRUE),
avg_unmet_need = mean(unmet_need_amount, na.rm = TRUE)
)
## # A tibble: 1 × 3
## total_students avg_gpa avg_unmet_need
## <int> <dbl> <dbl>
## 1 273 0.795 21090.
# plot
ggplot(tier1_not_retained, aes(x = unmet_need_amount, y = gpa_spring)) +
geom_point(color = "red", size = 3, alpha = 0.8) +
geom_hline(yintercept = 2.0, linetype = "dashed", color = "gray40") +
geom_vline(xintercept = 15000, linetype = "dashed", color = "gray40") +
labs(
title = "Tier 1 Students Who Were Not Retained",
subtitle = "High unmet need and low GPA students most at risk (GPA < 2.0, Need ≥ $15,000)",
x = "Unmet Financial Need ($)",
y = "Spring GPA"
) +
scale_x_continuous(labels = scales::dollar_format()) +
theme_minimal()
# Filter Tier 1 students
tier1_df <- df %>%
filter(intervention_tier == "Tier 1: Urgent (Low GPA + High Need)")
# Boxplot: GPA by Retention Status
ggplot(tier1_df, aes(x = as.factor(retained_in_fall), y = gpa_spring, fill = as.factor(retained_in_fall))) +
geom_boxplot() +
scale_fill_manual(values = c("0" = "red", "1" = "green"), labels = c("Not Retained", "Retained")) +
labs(
title = "Spring GPA Distribution of Tier 1 Students",
subtitle = "Compared by Retention Status",
x = "Retention Status",
y = "Spring GPA",
fill = "Status"
) +
theme_minimal()
## Question 8: GPA Interventions
# Simulate GPA lift from D to C for Fall Math/English students with unmet need ≥ $15K
cliff_d_students <- merged_df %>%
filter(
collection_term_x == "Fall",
Course_Subject_all %in% c("MATH", "ENGL"),
FinalGrade_all == "D",
unmet_need_amount >= 15000
) %>%
mutate(
retained_flag = ifelse(retained_in_fall == "Yes", 1, 0),
simulated_gpa = ifelse(!is.na(gpa_spring), pmin(gpa_spring + 0.7, 4.0), NA),
original_tier = case_when(
unmet_need_amount >= 15000 & gpa_spring < 2.0 ~ "Tier 1: Urgent",
unmet_need_amount >= 15000 & gpa_spring >= 2.0 ~ "Tier 2: Financial Need Only",
unmet_need_amount < 15000 | gpa_spring < 2.0 ~ "Tier 3: Moderate Concern"
),
new_tier = case_when(
unmet_need_amount >= 15000 & simulated_gpa < 2.0 ~ "Tier 1: Urgent",
unmet_need_amount >= 15000 & simulated_gpa >= 2.0 ~ "Tier 2: Financial Need Only",
unmet_need_amount < 15000 | simulated_gpa < 2.0 ~ "Tier 3: Moderate Concern"
)
)
# Summary: How many students shifted tiers after simulation?
tier_shift_summary <- cliff_d_students %>%
count(original_tier, new_tier)
print(tier_shift_summary)
## original_tier new_tier n
## 1 Tier 1: Urgent Tier 1: Urgent 44
## 2 Tier 1: Urgent Tier 2: Financial Need Only 23
## 3 Tier 2: Financial Need Only Tier 2: Financial Need Only 50
## 4 <NA> <NA> 93
# plot All Simulated Students by New Tier
ggplot(cliff_d_students, aes(x = unmet_need_amount, y = simulated_gpa,
color = new_tier, shape = as.factor(retained_flag))) +
geom_point(size = 3, alpha = 0.8) +
geom_hline(yintercept = 2.0, linetype = "dashed", color = "gray40") +
geom_vline(xintercept = 15000, linetype = "dashed", color = "gray40") +
scale_color_manual(values = c(
"Tier 1: Urgent" = "red",
"Tier 2: Financial Need Only" = "orange",
"Tier 3: Moderate Concern" = "blue"
)) +
scale_shape_manual(values = c("0" = 16, "1" = 17), labels = c("Not Retained", "Retained")) +
scale_x_continuous(labels = scales::dollar_format()) +
labs(
title = "Simulated GPA Impact on Tier Assignment (Math/English D Students)",
subtitle = "Dashed lines indicate GPA < 2.0 and Unmet Need ≥ $15,000",
x = "Unmet Financial Need ($)",
y = "Simulated Spring GPA (D → C Boost)",
color = "Simulated Tier",
shape = "Retention Status"
) +
theme_minimal()
## Warning: Removed 93 rows containing missing values (`geom_point()`).
#plot: Focus on Tier 1 Students Not Retained After Simulation
tier1_sim_not_retained <- cliff_d_students %>%
filter(new_tier == "Tier 1: Urgent", retained_flag == 0)
ggplot(tier1_sim_not_retained, aes(x = unmet_need_amount, y = simulated_gpa)) +
geom_point(color = "red", size = 2.5, alpha = 0.8) +
geom_hline(yintercept = 2.0, linetype = "dashed", color = "gray40") +
geom_vline(xintercept = 15000, linetype = "dashed", color = "gray40") +
labs(
title = "Simulated GPA for Tier 1 Students Who Were Not Retained",
subtitle = "GPA < 2.0 and Unmet Need ≥ $15,000 after simulated C-grade boost",
x = "Unmet Financial Need ($)",
y = "Simulated Spring GPA (D → C Boost)"
) +
scale_x_continuous(labels = scales::dollar_format()) +
theme_minimal()
# Boxplot of Simulated GPA by Retention for Tier 1
tier1_all <- cliff_d_students %>%
filter(new_tier == "Tier 1: Urgent")
ggplot(tier1_all, aes(x = as.factor(retained_flag), y = simulated_gpa, fill = as.factor(retained_flag))) +
geom_boxplot() +
scale_fill_manual(values = c("0" = "red", "1" = "green"),
labels = c("Not Retained", "Retained")) +
labs(
title = "Simulated GPA Distribution (Tier 1 Students)",
subtitle = "Compared by Retention Status after GPA Improvement",
x = "Retention Status",
y = "Simulated Spring GPA",
fill = "Status"
) +
theme_minimal()
# Cluster-style plot to visualize tier shift after simulated GPA boost
ggplot(cliff_d_students, aes(x = unmet_need_amount, y = simulated_gpa,
color = new_tier, shape = as.factor(retained_flag))) +
geom_point(size = 2.5, alpha = 0.8) +
geom_hline(yintercept = 2.0, linetype = "dashed", color = "gray40", linewidth = 0.8) +
geom_vline(xintercept = 15000, linetype = "dashed", color = "gray40", linewidth = 0.8) +
scale_color_manual(values = c(
"Tier 1: Urgent" = "red",
"Tier 2: Financial Need Only" = "orange",
"Tier 3: Moderate Concern" = "blue"
)) +
scale_shape_manual(values = c("0" = 16, "1" = 17),
labels = c("Not Retained", "Retained")) +
scale_x_continuous(labels = scales::dollar_format()) +
labs(
title = "Post-Intervention Tier Assignment with Retention Status",
subtitle = "Dashed lines mark GPA < 2.0 and Unmet Need ≥ $15,000 (After GPA Boost)",
x = "Unmet Financial Need ($)",
y = "Simulated Spring GPA",
color = "Simulated Tier",
shape = "Retention Status"
) +
theme_minimal(base_size = 14)
## Warning: Removed 93 rows containing missing values (`geom_point()`).
# Create a before snapshot
cliff_d_students_before <- cliff_d_students %>%
mutate(
display_gpa = gpa_spring,
tier_label = case_when(
unmet_need_amount >= 15000 & gpa_spring < 2.0 ~ "Tier 1: Urgent",
unmet_need_amount >= 15000 & gpa_spring >= 2.0 ~ "Tier 2: Financial Need Only",
TRUE ~ "Tier 3: Moderate Concern"
),
stage = "Before GPA Intervention"
)
# Create an after snapshot
cliff_d_students_after <- cliff_d_students %>%
mutate(
display_gpa = simulated_gpa,
tier_label = new_tier,
stage = "After GPA Intervention"
)
# Combine the before and after snapshots for high-need Math/English D students
cliff_combined <- bind_rows(cliff_d_students_before, cliff_d_students_after)
# Order facet panels: Before on the left, After on the right
cliff_combined$stage <- factor(cliff_combined$stage,
levels = c("Before GPA Intervention", "After GPA Intervention"))
# Plot the tier assignment shift
ggplot(cliff_combined, aes(x = unmet_need_amount, y = display_gpa,
color = tier_label, shape = as.factor(retained_flag))) +
geom_point(size = 2.8, alpha = 0.85) +
geom_hline(yintercept = 2.0, linetype = "dashed", color = "gray40", linewidth = 0.8) +
geom_vline(xintercept = 15000, linetype = "dashed", color = "gray40", linewidth = 0.8) +
facet_wrap(~stage) +
scale_color_manual(values = c(
"Tier 1: Urgent" = "red",
"Tier 2: Financial Need Only" = "orange",
"Tier 3: Moderate Concern" = "blue"
)) +
scale_shape_manual(values = c("0" = 16, "1" = 17),
labels = c("Not Retained", "Retained")) +
scale_x_continuous(labels = scales::dollar_format()) +
labs(
title = "Tier Assignment Before and After GPA Intervention",
subtitle = "Dashed lines show GPA < 2.0 and Unmet Need ≥ $15,000",
x = "Unmet Financial Need ($)",
y = "Spring GPA (Observed vs Simulated)",
color = "Tier",
shape = "Retention Status"
) +
theme_minimal(base_size = 13)
## Warning: Removed 186 rows containing missing values (`geom_point()`).
##Tier Shift Percentage
# Filter: Fall term, Math/English D students with unmet need ≥ $15,000
cliff_d_students <- merged_df %>%
filter(
collection_term_x == "Fall",
Course_Subject_all %in% c("MATH", "ENGL"),
FinalGrade_all == "D",
unmet_need_amount >= 15000
) %>%
mutate(
retained_flag = ifelse(retained_in_fall == "Yes", 1, 0),
# Save original tier
original_tier = case_when(
unmet_need_amount >= 15000 & gpa_spring < 2.0 ~ "Tier 1: Urgent",
unmet_need_amount >= 15000 & gpa_spring >= 2.0 ~ "Tier 2: Financial Need Only",
unmet_need_amount < 15000 | gpa_spring < 2.0 ~ "Tier 3: Moderate Concern"
),
# Simulate GPA improvement: D to C (~0.7 boost)
simulated_gpa = ifelse(!is.na(gpa_spring), pmin(gpa_spring + 0.7, 4.0), NA),
# Recalculate tier based on improved GPA
new_tier = case_when(
unmet_need_amount >= 15000 & simulated_gpa < 2.0 ~ "Tier 1: Urgent",
unmet_need_amount >= 15000 & simulated_gpa >= 2.0 ~ "Tier 2: Financial Need Only",
unmet_need_amount < 15000 | simulated_gpa < 2.0 ~ "Tier 3: Moderate Concern"
)
)
# Calculate percentage of students by original → new tier
tier_shift_pct <- cliff_d_students %>%
filter(!is.na(original_tier) & !is.na(new_tier)) %>%
count(original_tier, new_tier) %>%
group_by(original_tier) %>%
mutate(percentage = round(100 * n / sum(n), 1))
# Plot stacked percentage bar chart
ggplot(tier_shift_pct, aes(x = original_tier, y = percentage, fill = new_tier)) +
geom_bar(stat = "identity", position = "stack", width = 0.6) +
geom_text(aes(label = paste0(percentage, "%")),
position = position_stack(vjust = 0.5),
size = 4, color = "white") +
scale_fill_manual(values = c(
"Tier 1: Urgent" = "red",
"Tier 2: Financial Need Only" = "orange",
"Tier 3: Moderate Concern" = "blue"
)) +
scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 15)) +
labs(
title = "Tier Reassignment After GPA Boost",
subtitle = "Percentage of Students in Each New Tier (Originally Tier 1, 2, or 3)",
x = "Original Tier",
y = "Percentage of Students",
fill = "New Tier"
) +
theme_minimal(base_size = 13)