# This project applies logistic regression to examine which factors are associated with passing or failing the bar exam. The response variable is binary—whether a student passes or fails—making logistic regression an appropriate modeling approach for this analysis.
# In this study, I exclude bar exam component scores such as UBE, MBE, MEE, MPT, and Written Scaled Score as predictor variables. These measures are directly used to determine pass/fail outcomes, so including them would amount to using the outcome itself as a predictor, which would not provide meaningful insight. Additionally, variations in scoring across exam administrations further limit their usefulness for predictive modeling. Instead, I focus on modeling pass/fail outcomes directly using pre-exam information.
# The predictors used in this analysis consist of variables available prior to taking the bar exam. These include LSAT scores, undergraduate GPA, final law school GPA, and indicators of academic support such as mentoring, writing resources, accommodations, and participation in student success programs. These variables are more relevant for identifying opportunities where institutions can intervene earlier to support student success.
# I begin by cleaning and preparing the dataset, followed by fitting a logistic regression model to identify significant predictors. To improve model parsimony, I apply stepwise selection to retain only the most influential variables. Model performance is then evaluated using classification accuracy, a confusion matrix, residual diagnostics, and odds ratios to aid interpretation.
# This analysis provides insight into which students may benefit from additional support before taking the bar exam and offers guidance on how institutions can improve bar passage rates through targeted early interventions.
# Load Libraries
library(MASS)
library(tidyverse)
library(car)
# Load Dataset
df <- read.csv("https://raw.githubusercontent.com/tmatis12/datafiles/main/Updated_Bar_Data_For_Review_Final.csv")
# There are also binary or yes/no type variables like Accommodations, Probation, BarPrepMentor, and StudentSuccessInitiative, which are listed as “character” for now, and we’ll need to recode them into numbers later. The variable Pass shows who passed the bar exam
# View summary
summary(df)
## Year PassFail Age LSAT
## Min. :2021 Length:476 Min. :23.10 Min. :141.0
## 1st Qu.:2022 Class :character 1st Qu.:26.70 1st Qu.:153.0
## Median :2023 Mode :character Median :28.20 Median :156.0
## Mean :2023 Mean :29.13 Mean :155.3
## 3rd Qu.:2024 3rd Qu.:30.10 3rd Qu.:157.0
## Max. :2024 Max. :65.70 Max. :168.0
##
## UGPA CivPro LPI LPII
## Min. :2.010 Length:476 Length:476 Length:476
## 1st Qu.:3.250 Class :character Class :character Class :character
## Median :3.490 Mode :character Mode :character Mode :character
## Mean :3.451
## 3rd Qu.:3.710
## Max. :4.140
##
## GPA_1L GPA_Final FinalRankPercentile Accommodations
## Min. :2.200 Min. :2.44 Min. :0.0000 Length:476
## 1st Qu.:2.781 1st Qu.:3.05 1st Qu.:0.2600 Class :character
## Median :3.083 Median :3.27 Median :0.5150 Mode :character
## Mean :3.086 Mean :3.28 Mean :0.5067
## 3rd Qu.:3.383 3rd Qu.:3.52 3rd Qu.:0.7500
## Max. :4.000 Max. :3.99 Max. :0.9900
## NA's :4
## Probation LegalAnalysis_TexasPractice AdvLegalPerfSkills
## Length:476 Length:476 Length:476
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## AdvLegalAnalysis BarPrepCompany BarPrepCompletion OptIntoWritingGuide
## Length:476 Length:476 Min. :0.0200 Length:476
## Class :character Class :character 1st Qu.:0.8000 Class :character
## Mode :character Mode :character Median :0.8900 Mode :character
## Mean :0.8635
## 3rd Qu.:0.9800
## Max. :1.0000
## NA's :23
## X.LawSchoolBarPrepWorkshops StudentSuccessInitiative BarPrepMentor
## Min. :0.000 Length:476 Length:476
## 1st Qu.:0.000 Class :character Class :character
## Median :0.000 Mode :character Mode :character
## Mean :1.532
## 3rd Qu.:3.000
## Max. :5.000
##
## MPRE MPT MEE WrittenScaledScore
## Min. : 76.00 Min. :1.000 Min. :2.000 Min. :111.7
## 1st Qu.: 89.50 1st Qu.:3.000 1st Qu.:3.330 1st Qu.:138.0
## Median : 99.00 Median :3.500 Median :3.670 Median :146.9
## Mean : 99.46 Mean :3.651 Mean :3.719 Mean :146.6
## 3rd Qu.:107.00 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:155.7
## Max. :145.00 Max. :5.500 Max. :5.330 Max. :181.2
## NA's :273
## MBE UBE
## Min. :103.6 Min. :227.3
## 1st Qu.:138.7 1st Qu.:278.5
## Median :147.1 Median :293.5
## Mean :146.2 Mean :292.9
## 3rd Qu.:154.0 3rd Qu.:306.8
## Max. :187.9 Max. :358.7
##
# I changed some of the yes/no answers into numbers so they make more sense in the analysis. It’s easier to compare and model the data when things like “Accommodations” or “Mentor” are just 1s and 0s. This helps the model look at who had extra support and who didn’t, and how that might relate to passing the bar exam.
# Recode Pass/Fail (1 = Pass, 0 = Fail)
df$Pass <- ifelse(df$PassFail == "P", 1, 0)
# Recode Y/N Variables
df$Accom <- ifelse(df$Accommodations == "Y", 1, 0)
df$Prob <- ifelse(df$Probation == "Y", 1, 0)
df$Mentor <- ifelse(df$BarPrepMentor == "Y", 1, 0)
df$SSI <- ifelse(df$StudentSuccessInitiative == "Y", 1, 0)
df$WriteGuide <- ifelse(df$OptIntoWritingGuide == "Y", 1, 0)
# The bar exam score variables like UBE, MBE, and the others are removed because they are actually part of what decides if someone passes or not. It wouldn’t make sense to use them to predict something they already help determine. We want to focus only on things we know before the student takes the bar exam.
# Remove Bar Exam Score Variables
drop_vars <- c("MPRE", "MPT", "MEE", "WrittenScaledScore", "MBE", "UBE")
df <- df[, !(names(df) %in% drop_vars)]
# Remove original PassFail column
df$PassFail <- NULL
# Remove missing values
df <- na.omit(df)
#To start the analysis, I built a full logistic regression model that includes all the student background and support-related variables. These predictors were selected because they are available before a student takes the bar exam and may be helpful in identifying who could benefit from extra support. The model includes LSAT, undergraduate GPA, final law school GPA, and whether the student had accommodations, was on probation, had a mentor, received writing support, or participated in student success programs. This full model helps us see which of these factors might be connected to passing the bar.
# Fit full model
full_model <- glm(Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob +
Mentor + SSI + WriteGuide,
data = df, family = binomial)
# The equation below represents the full logistic regression model used in this project. It predicts the log odds of passing the bar exam based on several student-related factors. Each variable has a beta coefficient (β) that shows its influence. For example, if a student’s LSAT score or GPA goes up, and that coefficient is positive, it means their chances of passing also go up. The variables in this equation like LSAT, UGPA, GPA_Final, Accom, Prob, Mentor, SSI, and WriteGuide — were chosen because they can be known before the bar exam and might help the school identify which students need more support ahead of time. This formula is what the logistic model uses to estimate probabilities for each student.
# Outcome distribution
table(df$Pass)
##
## 0 1
## 51 398
# Stepwise selection was used to simplify the model by removing predictors that didn’t improve performance. After testing different combinations, the final model kept only LSAT and GPA_Final, since they gave the best results with the lowest AIC.
# Stepwise selection
step_model <- step(full_model, direction = "both")
## Start: AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + SSI +
## WriteGuide
##
##
## Step: AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + WriteGuide
##
##
## Step: AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + WriteGuide
##
## Df Deviance AIC
## - Accom 1 238.28 250.28
## - WriteGuide 1 238.31 250.31
## - Prob 1 238.73 250.73
## - UGPA 1 239.75 251.75
## <none> 238.28 252.28
## - LSAT 1 248.76 260.76
## - GPA_Final 1 287.47 299.47
##
## Step: AIC=250.28
## Pass ~ LSAT + UGPA + GPA_Final + Prob + WriteGuide
##
## Df Deviance AIC
## - WriteGuide 1 238.31 248.31
## - Prob 1 238.73 248.73
## - UGPA 1 239.75 249.75
## <none> 238.28 250.28
## + Accom 1 238.28 252.28
## - LSAT 1 248.81 258.81
## - GPA_Final 1 287.79 297.79
##
## Step: AIC=248.31
## Pass ~ LSAT + UGPA + GPA_Final + Prob
##
## Df Deviance AIC
## - Prob 1 238.79 246.79
## - UGPA 1 239.92 247.92
## <none> 238.31 248.31
## + WriteGuide 1 238.28 250.28
## + Accom 1 238.31 250.31
## - LSAT 1 248.83 256.83
## - GPA_Final 1 289.06 297.06
##
## Step: AIC=246.79
## Pass ~ LSAT + UGPA + GPA_Final
##
## Df Deviance AIC
## - UGPA 1 240.34 246.34
## <none> 238.79 246.79
## + Prob 1 238.31 248.31
## + WriteGuide 1 238.73 248.73
## + Accom 1 238.78 248.78
## - LSAT 1 250.28 256.28
## - GPA_Final 1 301.84 307.84
##
## Step: AIC=246.34
## Pass ~ LSAT + GPA_Final
##
## Df Deviance AIC
## <none> 240.34 246.34
## + UGPA 1 238.79 246.79
## + Prob 1 239.92 247.92
## + WriteGuide 1 240.13 248.13
## + Accom 1 240.34 248.34
## - LSAT 1 250.38 254.38
## - GPA_Final 1 308.29 312.29
# Model summary
summary(step_model)
##
## Call:
## glm(formula = Pass ~ LSAT + GPA_Final, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -35.32684 7.80717 -4.525 6.04e-06 ***
## LSAT 0.14462 0.04624 3.128 0.00176 **
## GPA_Final 4.80107 0.70053 6.853 7.21e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 317.84 on 448 degrees of freedom
## Residual deviance: 240.34 on 446 degrees of freedom
## AIC: 246.34
##
## Number of Fisher Scoring iterations: 6
# This formula shows how much better the model fits compared to just guessing the average outcome. The bigger the difference between null deviance and residual deviance, the better the model is at explaining the results.
# This formula shows how much better the model fits compared to just guessing the average outcome. The bigger the difference between null deviance and residual deviance, the better the model is at explaining the results.
# Predict probabilities
df$Pred_Prob <- predict(step_model, type = "response")
# Classify outcomes
df$Pred_Class <- ifelse(df$Pred_Prob > 0.5, 1, 0)
# Confusion matrix
conf_matrix <- table(Predicted = df$Pred_Class, Actual = df$Pass)
conf_matrix
## Actual
## Predicted 0 1
## 0 7 11
## 1 44 387
# The model’s classification accuracy was calculated by comparing the predicted results to the actual outcomes. The accuracy was high, meaning the model correctly predicted whether a student passed or failed most of the time.
# Accuracy
accuracy <- mean(df$Pred_Class == df$Pass)
accuracy
## [1] 0.8775056
barplot(table(df$Pass),
main = "Bar Exam Outcomes",
names.arg = c("Fail", "Pass"),
col = c("red", "lightgreen"),
ylab = "Number of Students")

hist(df$Pred_Prob,
main = "Predicted Probabilities of Passing",
xlab = "Predicted Probability",
col = "darkblue",
border = "black")

boxplot(Pred_Prob ~ Pass, data = df,
main = "Predicted Probabilities by Outcome",
xlab = "Actual Outcome (0 = Fail, 1 = Pass)",
ylab = "Predicted Probability",
col = c("orange", "darkgreen"))

# Diagnostic plots for GLM
par(mfrow = c(2, 2)) # Show 4 plots at once
plot(step_model)

par(mfrow = c(1, 1)) # Reset layout
#Residuals vs Fitted
plot(step_model, which = 1,
main = "Residuals vs Fitted")

#Normal Q-Q Plot of Deviance Residuals
plot(step_model, which = 2,
main = "Normal Q-Q Plot of Deviance Residuals")

#Scale-Location Plot
plot(step_model, which = 3,
main = "Scale Location Plot")

#Cook's Distance
plot(step_model, which = 4,
main = "Cook's Distance")

#Odds Ratios and Confidence Intervals
OR_CI <- exp(cbind(Odds_Ratio = coef(step_model),confint.default(step_model)))
summary(as.data.frame(OR_CI))
## Odds_Ratio 2.5 % 97.5 %
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.5778 1st Qu.: 0.5277 1st Qu.: 0.6326
## Median : 1.1556 Median : 1.0555 Median : 1.2652
## Mean : 40.9321 Mean :10.6240 Mean :160.4709
## 3rd Qu.: 61.3981 3rd Qu.:15.9359 3rd Qu.:240.7064
## Max. :121.6406 Max. :30.8164 Max. :480.1476
#Multicollinearity Check (VIF)
summary(data.frame(VIF = vif(step_model)))
## VIF
## Min. :1.036
## 1st Qu.:1.036
## Median :1.036
## Mean :1.036
## 3rd Qu.:1.036
## Max. :1.036