# This project applies logistic regression to examine which factors are associated with passing or failing the bar exam. The response variable is binary—whether a student passes or fails—making logistic regression an appropriate modeling approach for this analysis.

# In this study, I exclude bar exam component scores such as UBE, MBE, MEE, MPT, and Written Scaled Score as predictor variables. These measures are directly used to determine pass/fail outcomes, so including them would amount to using the outcome itself as a predictor, which would not provide meaningful insight. Additionally, variations in scoring across exam administrations further limit their usefulness for predictive modeling. Instead, I focus on modeling pass/fail outcomes directly using pre-exam information.

# The predictors used in this analysis consist of variables available prior to taking the bar exam. These include LSAT scores, undergraduate GPA, final law school GPA, and indicators of academic support such as mentoring, writing resources, accommodations, and participation in student success programs. These variables are more relevant for identifying opportunities where institutions can intervene earlier to support student success.

# I begin by cleaning and preparing the dataset, followed by fitting a logistic regression model to identify significant predictors. To improve model parsimony, I apply stepwise selection to retain only the most influential variables. Model performance is then evaluated using classification accuracy, a confusion matrix, residual diagnostics, and odds ratios to aid interpretation.

# This analysis provides insight into which students may benefit from additional support before taking the bar exam and offers guidance on how institutions can improve bar passage rates through targeted early interventions.

# Load Libraries
library(MASS)
library(tidyverse)
library(car)

# Load Dataset
# Read the bar-exam CSV directly from its GitHub-hosted raw URL.
df <- read.csv(
  file = "https://raw.githubusercontent.com/tmatis12/datafiles/main/Updated_Bar_Data_For_Review_Final.csv"
)

# Yes/no type variables such as Accommodations, Probation, BarPrepMentor, and StudentSuccessInitiative are read in as "character" for now and are recoded into numbers later. The PassFail variable records who passed the bar exam.

# View summary
# One summary per column: quartiles for numeric columns, length/class for character columns.
summary(object = df)
##       Year        PassFail              Age             LSAT      
##  Min.   :2021   Length:476         Min.   :23.10   Min.   :141.0  
##  1st Qu.:2022   Class :character   1st Qu.:26.70   1st Qu.:153.0  
##  Median :2023   Mode  :character   Median :28.20   Median :156.0  
##  Mean   :2023                      Mean   :29.13   Mean   :155.3  
##  3rd Qu.:2024                      3rd Qu.:30.10   3rd Qu.:157.0  
##  Max.   :2024                      Max.   :65.70   Max.   :168.0  
##                                                                   
##       UGPA          CivPro              LPI                LPII          
##  Min.   :2.010   Length:476         Length:476         Length:476        
##  1st Qu.:3.250   Class :character   Class :character   Class :character  
##  Median :3.490   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :3.451                                                           
##  3rd Qu.:3.710                                                           
##  Max.   :4.140                                                           
##                                                                          
##      GPA_1L        GPA_Final    FinalRankPercentile Accommodations    
##  Min.   :2.200   Min.   :2.44   Min.   :0.0000      Length:476        
##  1st Qu.:2.781   1st Qu.:3.05   1st Qu.:0.2600      Class :character  
##  Median :3.083   Median :3.27   Median :0.5150      Mode  :character  
##  Mean   :3.086   Mean   :3.28   Mean   :0.5067                        
##  3rd Qu.:3.383   3rd Qu.:3.52   3rd Qu.:0.7500                        
##  Max.   :4.000   Max.   :3.99   Max.   :0.9900                        
##  NA's   :4                                                            
##   Probation         LegalAnalysis_TexasPractice AdvLegalPerfSkills
##  Length:476         Length:476                  Length:476        
##  Class :character   Class :character            Class :character  
##  Mode  :character   Mode  :character            Mode  :character  
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##  AdvLegalAnalysis   BarPrepCompany     BarPrepCompletion OptIntoWritingGuide
##  Length:476         Length:476         Min.   :0.0200    Length:476         
##  Class :character   Class :character   1st Qu.:0.8000    Class :character   
##  Mode  :character   Mode  :character   Median :0.8900    Mode  :character   
##                                        Mean   :0.8635                       
##                                        3rd Qu.:0.9800                       
##                                        Max.   :1.0000                       
##                                        NA's   :23                           
##  X.LawSchoolBarPrepWorkshops StudentSuccessInitiative BarPrepMentor     
##  Min.   :0.000               Length:476               Length:476        
##  1st Qu.:0.000               Class :character         Class :character  
##  Median :0.000               Mode  :character         Mode  :character  
##  Mean   :1.532                                                          
##  3rd Qu.:3.000                                                          
##  Max.   :5.000                                                          
##                                                                         
##       MPRE             MPT             MEE        WrittenScaledScore
##  Min.   : 76.00   Min.   :1.000   Min.   :2.000   Min.   :111.7     
##  1st Qu.: 89.50   1st Qu.:3.000   1st Qu.:3.330   1st Qu.:138.0     
##  Median : 99.00   Median :3.500   Median :3.670   Median :146.9     
##  Mean   : 99.46   Mean   :3.651   Mean   :3.719   Mean   :146.6     
##  3rd Qu.:107.00   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:155.7     
##  Max.   :145.00   Max.   :5.500   Max.   :5.330   Max.   :181.2     
##  NA's   :273                                                        
##       MBE             UBE       
##  Min.   :103.6   Min.   :227.3  
##  1st Qu.:138.7   1st Qu.:278.5  
##  Median :147.1   Median :293.5  
##  Mean   :146.2   Mean   :292.9  
##  3rd Qu.:154.0   3rd Qu.:306.8  
##  Max.   :187.9   Max.   :358.7  
## 
# I changed some of the yes/no answers into numbers so they make more sense in the analysis. It’s easier to compare and model the data when things like “Accommodations” or “Mentor” are just 1s and 0s. This helps the model look at who had extra support and who didn’t, and how that might relate to passing the bar exam.

# Helper: turn a character flag column into a 0/1 indicator.
#   x          vector of raw codes (e.g. "Y"/"N" or "P"/"F")
#   yes_value  the code that maps to 1; every other non-missing code maps to 0
#   label      column name, used only in the warning message
# Returns a numeric vector the same length as x; NA stays NA, exactly as with
# ifelse(x == yes_value, 1, 0). A warning is raised when the indicator ends up
# constant, because a constant predictor cannot be estimated by glm() and
# usually means the raw column is coded differently from what was assumed.
recode_flag <- function(x, yes_value, label) {
  flag <- ifelse(x == yes_value, 1, 0)
  if (length(unique(flag[!is.na(flag)])) < 2L) {
    warning(
      "Indicator built from '", label, "' is constant after recoding on \"",
      yes_value, "\"; first raw values seen: ",
      paste(utils::head(unique(x), 5L), collapse = ", "),
      call. = FALSE
    )
  }
  flag
}

# Recode Pass/Fail (1 = Pass, 0 = Fail)
df$Pass <- recode_flag(df$PassFail, "P", "PassFail")

# Recode Y/N Variables
# NOTE(review): the stepwise trace printed later in this file removes SSI and
# Mentor with no change in AIC and without a drop table, which is what step()
# does for terms with zero degrees of freedom. Presumably these two indicators
# are constant after recoding -- check the raw coding of
# StudentSuccessInitiative and BarPrepMentor (the warning above will flag it).
df$Accom <- recode_flag(df$Accommodations, "Y", "Accommodations")
df$Prob <- recode_flag(df$Probation, "Y", "Probation")
df$Mentor <- recode_flag(df$BarPrepMentor, "Y", "BarPrepMentor")
df$SSI <- recode_flag(df$StudentSuccessInitiative, "Y", "StudentSuccessInitiative")
df$WriteGuide <- recode_flag(df$OptIntoWritingGuide, "Y", "OptIntoWritingGuide")

# The bar exam score variables like UBE, MBE, and the others are removed because they are actually part of what decides if someone passes or not. It wouldn’t make sense to use them to predict something they already help determine. We want to focus only on things we know before the student takes the bar exam.

# Remove Bar Exam Score Variables
drop_vars <- c("MPRE", "MPT", "MEE", "WrittenScaledScore", "MBE", "UBE")
df <- df[, !(names(df) %in% drop_vars), drop = FALSE]

# Remove original PassFail column
df$PassFail <- NULL

# Remove missing values
# Only rows with a missing value in a variable that enters the model are
# dropped. A blanket na.omit(df) also discards rows whose only missing values
# are in columns the model never uses: the summary above shows 4 NAs in GPA_1L
# and 23 in BarPrepCompletion, which appears to account for all 27 rows lost
# (476 -> 449) under the blanket approach.
# NOTE: the results printed further down in this file (n = 449) were produced
# with the blanket na.omit(); re-run the script to refresh them.
model_vars <- c("Pass", "LSAT", "UGPA", "GPA_Final",
                "Accom", "Prob", "Mentor", "SSI", "WriteGuide")
df <- df[complete.cases(df[, model_vars]), ]

#To start the analysis, I built a full logistic regression model that includes all the student background and support-related variables. These predictors were selected because they are available before a student takes the bar exam and may be helpful in identifying who could benefit from extra support. The model includes LSAT, undergraduate GPA, final law school GPA, and whether the student had accommodations, was on probation, had a mentor, received writing support, or participated in student success programs. This full model helps us see which of these factors might be connected to passing the bar.

# Fit full model
# Logistic regression (binomial family) of Pass on all eight pre-exam predictors.
full_model <- glm(
  formula = Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + SSI + WriteGuide,
  family = binomial,
  data = df
)

# The equation below represents the full logistic regression model used in this project:
#
#   log(p / (1 - p)) = β0 + β1·LSAT + β2·UGPA + β3·GPA_Final + β4·Accom + β5·Prob + β6·Mentor + β7·SSI + β8·WriteGuide
#
# where p is the probability that a student passes the bar exam. The model predicts the log odds of passing based on several student-related factors. Each variable has a beta coefficient (β) that shows its influence. For example, if a student’s LSAT score or GPA goes up, and that coefficient is positive, it means their chances of passing also go up. The variables in this equation (LSAT, UGPA, GPA_Final, Accom, Prob, Mentor, SSI, and WriteGuide) were chosen because they can be known before the bar exam and might help the school identify which students need more support ahead of time. This formula is what the logistic model uses to estimate probabilities for each student.

# Outcome distribution
# Counts of failures (0) and passes (1) in the data used for modelling.
table(df[["Pass"]])
## 
##   0   1 
##  51 398
# Stepwise selection was used to simplify the model by removing predictors that didn’t improve performance. After testing different combinations, the final model kept only LSAT and GPA_Final, since they gave the best results with the lowest AIC.

# Stepwise selection
# Start from the full model and let step() drop or re-add terms in both
# directions, keeping the change with the lowest AIC at each step.
step_model <- step(object = full_model, direction = "both")
## Start:  AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + SSI + 
##     WriteGuide
## 
## 
## Step:  AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + WriteGuide
## 
## 
## Step:  AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + WriteGuide
## 
##              Df Deviance    AIC
## - Accom       1   238.28 250.28
## - WriteGuide  1   238.31 250.31
## - Prob        1   238.73 250.73
## - UGPA        1   239.75 251.75
## <none>            238.28 252.28
## - LSAT        1   248.76 260.76
## - GPA_Final   1   287.47 299.47
## 
## Step:  AIC=250.28
## Pass ~ LSAT + UGPA + GPA_Final + Prob + WriteGuide
## 
##              Df Deviance    AIC
## - WriteGuide  1   238.31 248.31
## - Prob        1   238.73 248.73
## - UGPA        1   239.75 249.75
## <none>            238.28 250.28
## + Accom       1   238.28 252.28
## - LSAT        1   248.81 258.81
## - GPA_Final   1   287.79 297.79
## 
## Step:  AIC=248.31
## Pass ~ LSAT + UGPA + GPA_Final + Prob
## 
##              Df Deviance    AIC
## - Prob        1   238.79 246.79
## - UGPA        1   239.92 247.92
## <none>            238.31 248.31
## + WriteGuide  1   238.28 250.28
## + Accom       1   238.31 250.31
## - LSAT        1   248.83 256.83
## - GPA_Final   1   289.06 297.06
## 
## Step:  AIC=246.79
## Pass ~ LSAT + UGPA + GPA_Final
## 
##              Df Deviance    AIC
## - UGPA        1   240.34 246.34
## <none>            238.79 246.79
## + Prob        1   238.31 248.31
## + WriteGuide  1   238.73 248.73
## + Accom       1   238.78 248.78
## - LSAT        1   250.28 256.28
## - GPA_Final   1   301.84 307.84
## 
## Step:  AIC=246.34
## Pass ~ LSAT + GPA_Final
## 
##              Df Deviance    AIC
## <none>            240.34 246.34
## + UGPA        1   238.79 246.79
## + Prob        1   239.92 247.92
## + WriteGuide  1   240.13 248.13
## + Accom       1   240.34 248.34
## - LSAT        1   250.38 254.38
## - GPA_Final   1   308.29 312.29
# Model summary
summary(step_model)
## 
## Call:
## glm(formula = Pass ~ LSAT + GPA_Final, family = binomial, data = df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -35.32684    7.80717  -4.525 6.04e-06 ***
## LSAT          0.14462    0.04624   3.128  0.00176 ** 
## GPA_Final     4.80107    0.70053   6.853 7.21e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 317.84  on 448  degrees of freedom
## Residual deviance: 240.34  on 446  degrees of freedom
## AIC: 246.34
## 
## Number of Fisher Scoring iterations: 6
# The drop in deviance shows how much better the model fits compared to just guessing the average outcome:
#
#   G = (null deviance) - (residual deviance) = 317.84 - 240.34 = 77.50, on 448 - 446 = 2 degrees of freedom.
#
# The bigger the difference between null deviance and residual deviance, the better the model is at explaining the results.

# Predict probabilities
# Fitted probability of passing for every row, from the selected model.
df[["Pred_Prob"]] <- predict(object = step_model, type = "response")

# Classify outcomes
# Label a student as a predicted pass (1) only when the fitted probability
# is strictly above 0.5; otherwise predicted fail (0).
df[["Pred_Class"]] <- ifelse(df[["Pred_Prob"]] > 0.5, yes = 1, no = 0)

# Confusion matrix
# Rows are predicted labels, columns are observed outcomes.
conf_matrix <- with(df, table(Predicted = Pred_Class, Actual = Pass))
conf_matrix
##          Actual
## Predicted   0   1
##         0   7  11
##         1  44 387
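# The model’s classification accuracy was calculated by comparing the predicted results to the actual outcomes. The accuracy was high (about 87.8%), meaning the model correctly predicted whether a student passed or failed most of the time. However, because 398 of the 449 students passed, simply predicting "pass" for everyone would already be right about 88.6% of the time, and at the 0.5 cutoff the model identifies only 7 of the 51 students who failed. Accuracy alone therefore overstates how well the model finds at-risk students.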

# Accuracy
# Share of rows whose predicted label equals the observed outcome.
accuracy <- with(df, mean(Pred_Class == Pass))
accuracy
## [1] 0.8775056
# Bar chart of observed outcomes (counts of Pass == 0 and Pass == 1).
barplot(
  height = table(df[["Pass"]]),
  names.arg = c("Fail", "Pass"),
  col = c("red", "lightgreen"),
  ylab = "Number of Students",
  main = "Bar Exam Outcomes"
)

# Histogram of the fitted pass probabilities.
hist(
  x = df[["Pred_Prob"]],
  col = "darkblue",
  border = "black",
  xlab = "Predicted Probability",
  main = "Predicted Probabilities of Passing"
)

# Fitted probabilities split by the observed outcome.
boxplot(
  Pred_Prob ~ Pass,
  data = df,
  col = c("orange", "darkgreen"),
  xlab = "Actual Outcome (0 = Fail, 1 = Pass)",
  ylab = "Predicted Probability",
  main = "Predicted Probabilities by Outcome"
)

# Diagnostic plots for GLM
# First all default diagnostic panels together on a 2 x 2 grid ...
par(mfrow = c(2, 2))
plot(step_model)

# ... then back to a single panel per page.
par(mfrow = c(1, 1))

# Individual diagnostic panels, one per page, each with its own title:
#   1 = Residuals vs Fitted, 2 = Normal Q-Q, 3 = Scale-Location,
#   4 = Cook's Distance
panel_titles <- c(
  "Residuals vs Fitted",
  "Normal Q-Q Plot of Deviance Residuals",
  "Scale Location Plot",
  "Cook's Distance"
)
for (panel in seq_along(panel_titles)) {
  plot(step_model, which = panel,
       main = panel_titles[panel])
}

# Odds Ratios and Confidence Intervals
# Exponentiate the coefficients and their Wald intervals (confint.default) to
# obtain odds ratios with 95% confidence limits, one row per coefficient.
OR_CI <- exp(cbind(Odds_Ratio = coef(step_model), confint.default(step_model)))

# Print the table itself. Calling summary() on it would report the
# min/median/mean *across* coefficients, which has no interpretation.
OR_CI
# Per-coefficient values, recovered from the coefficient table above and the
# summary that was previously printed here:
#   (Intercept): odds ratio effectively 0
#   LSAT       : odds ratio   1.1556, 95% CI [ 1.0555,   1.2652]
#   GPA_Final  : odds ratio 121.6406, 95% CI [30.8164, 480.1476]
# Each odds ratio is per one-unit increase (one LSAT point, one full GPA point).
# Multicollinearity Check (VIF)
# Print the variance inflation factor of each predictor directly; wrapping the
# result in summary(data.frame(...)) hides which predictor each value
# belongs to.
vif(step_model)
# The summary previously printed here had min = max = 1.036, so both LSAT and
# GPA_Final have a VIF of about 1.036 -- no sign of multicollinearity.