1. Load Libraries and Data

# Load Libraries
library(MASS)
library(tidyverse)
library(car)

# Load Dataset
df <- read.csv("https://raw.githubusercontent.com/tmatis12/datafiles/main/Updated_Bar_Data_For_Review_Final.csv")

# View summary
summary(df)
##       Year        PassFail              Age             LSAT      
##  Min.   :2021   Length:476         Min.   :23.10   Min.   :141.0  
##  1st Qu.:2022   Class :character   1st Qu.:26.70   1st Qu.:153.0  
##  Median :2023   Mode  :character   Median :28.20   Median :156.0  
##  Mean   :2023                      Mean   :29.13   Mean   :155.3  
##  3rd Qu.:2024                      3rd Qu.:30.10   3rd Qu.:157.0  
##  Max.   :2024                      Max.   :65.70   Max.   :168.0  
##                                                                   
##       UGPA          CivPro              LPI                LPII          
##  Min.   :2.010   Length:476         Length:476         Length:476        
##  1st Qu.:3.250   Class :character   Class :character   Class :character  
##  Median :3.490   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :3.451                                                           
##  3rd Qu.:3.710                                                           
##  Max.   :4.140                                                           
##                                                                          
##      GPA_1L        GPA_Final    FinalRankPercentile Accommodations    
##  Min.   :2.200   Min.   :2.44   Min.   :0.0000      Length:476        
##  1st Qu.:2.781   1st Qu.:3.05   1st Qu.:0.2600      Class :character  
##  Median :3.083   Median :3.27   Median :0.5150      Mode  :character  
##  Mean   :3.086   Mean   :3.28   Mean   :0.5067                        
##  3rd Qu.:3.383   3rd Qu.:3.52   3rd Qu.:0.7500                        
##  Max.   :4.000   Max.   :3.99   Max.   :0.9900                        
##  NA's   :4                                                            
##   Probation         LegalAnalysis_TexasPractice AdvLegalPerfSkills
##  Length:476         Length:476                  Length:476        
##  Class :character   Class :character            Class :character  
##  Mode  :character   Mode  :character            Mode  :character  
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##  AdvLegalAnalysis   BarPrepCompany     BarPrepCompletion OptIntoWritingGuide
##  Length:476         Length:476         Min.   :0.0200    Length:476         
##  Class :character   Class :character   1st Qu.:0.8000    Class :character   
##  Mode  :character   Mode  :character   Median :0.8900    Mode  :character   
##                                        Mean   :0.8635                       
##                                        3rd Qu.:0.9800                       
##                                        Max.   :1.0000                       
##                                        NA's   :23                           
##  X.LawSchoolBarPrepWorkshops StudentSuccessInitiative BarPrepMentor     
##  Min.   :0.000               Length:476               Length:476        
##  1st Qu.:0.000               Class :character         Class :character  
##  Median :0.000               Mode  :character         Mode  :character  
##  Mean   :1.532                                                          
##  3rd Qu.:3.000                                                          
##  Max.   :5.000                                                          
##                                                                         
##       MPRE             MPT             MEE        WrittenScaledScore
##  Min.   : 76.00   Min.   :1.000   Min.   :2.000   Min.   :111.7     
##  1st Qu.: 89.50   1st Qu.:3.000   1st Qu.:3.330   1st Qu.:138.0     
##  Median : 99.00   Median :3.500   Median :3.670   Median :146.9     
##  Mean   : 99.46   Mean   :3.651   Mean   :3.719   Mean   :146.6     
##  3rd Qu.:107.00   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:155.7     
##  Max.   :145.00   Max.   :5.500   Max.   :5.330   Max.   :181.2     
##  NA's   :273                                                        
##       MBE             UBE       
##  Min.   :103.6   Min.   :227.3  
##  1st Qu.:138.7   1st Qu.:278.5  
##  Median :147.1   Median :293.5  
##  Mean   :146.2   Mean   :292.9  
##  3rd Qu.:154.0   3rd Qu.:306.8  
##  Max.   :187.9   Max.   :358.7  
## 
# Recode Pass/Fail (1 = Pass, 0 = Fail)
df$Pass <- ifelse(df$PassFail == "P", 1, 0)

# Recode Y/N Variables
df$Accom <- ifelse(df$Accommodations == "Y", 1, 0)
df$Prob <- ifelse(df$Probation == "Y", 1, 0)
df$Mentor <- ifelse(df$BarPrepMentor == "Y", 1, 0)
df$SSI <- ifelse(df$StudentSuccessInitiative == "Y", 1, 0)
df$WriteGuide <- ifelse(df$OptIntoWritingGuide == "Y", 1, 0)

# Remove Bar Exam Score Variables
drop_vars <- c("MPRE", "MPT", "MEE", "WrittenScaledScore", "MBE", "UBE")
df <- df[, !(names(df) %in% drop_vars)]

# Remove original PassFail column
df$PassFail <- NULL

# Remove missing values
df <- na.omit(df)

# Fit full model
full_model <- glm(Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob +
                    Mentor + SSI + WriteGuide,
                  data = df, family = binomial)

# Outcome distribution
table(df$Pass)
## 
##   0   1 
##  51 398
# Stepwise selection
step_model <- step(full_model, direction = "both")
## Start:  AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + SSI + 
##     WriteGuide
## 
## 
## Step:  AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + WriteGuide
## 
## 
## Step:  AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + WriteGuide
## 
##              Df Deviance    AIC
## - Accom       1   238.28 250.28
## - WriteGuide  1   238.31 250.31
## - Prob        1   238.73 250.73
## - UGPA        1   239.75 251.75
## <none>            238.28 252.28
## - LSAT        1   248.76 260.76
## - GPA_Final   1   287.47 299.47
## 
## Step:  AIC=250.28
## Pass ~ LSAT + UGPA + GPA_Final + Prob + WriteGuide
## 
##              Df Deviance    AIC
## - WriteGuide  1   238.31 248.31
## - Prob        1   238.73 248.73
## - UGPA        1   239.75 249.75
## <none>            238.28 250.28
## + Accom       1   238.28 252.28
## - LSAT        1   248.81 258.81
## - GPA_Final   1   287.79 297.79
## 
## Step:  AIC=248.31
## Pass ~ LSAT + UGPA + GPA_Final + Prob
## 
##              Df Deviance    AIC
## - Prob        1   238.79 246.79
## - UGPA        1   239.92 247.92
## <none>            238.31 248.31
## + WriteGuide  1   238.28 250.28
## + Accom       1   238.31 250.31
## - LSAT        1   248.83 256.83
## - GPA_Final   1   289.06 297.06
## 
## Step:  AIC=246.79
## Pass ~ LSAT + UGPA + GPA_Final
## 
##              Df Deviance    AIC
## - UGPA        1   240.34 246.34
## <none>            238.79 246.79
## + Prob        1   238.31 248.31
## + WriteGuide  1   238.73 248.73
## + Accom       1   238.78 248.78
## - LSAT        1   250.28 256.28
## - GPA_Final   1   301.84 307.84
## 
## Step:  AIC=246.34
## Pass ~ LSAT + GPA_Final
## 
##              Df Deviance    AIC
## <none>            240.34 246.34
## + UGPA        1   238.79 246.79
## + Prob        1   239.92 247.92
## + WriteGuide  1   240.13 248.13
## + Accom       1   240.34 248.34
## - LSAT        1   250.38 254.38
## - GPA_Final   1   308.29 312.29
# Model summary
summary(step_model)
## 
## Call:
## glm(formula = Pass ~ LSAT + GPA_Final, family = binomial, data = df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -35.32684    7.80717  -4.525 6.04e-06 ***
## LSAT          0.14462    0.04624   3.128  0.00176 ** 
## GPA_Final     4.80107    0.70053   6.853 7.21e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 317.84  on 448  degrees of freedom
## Residual deviance: 240.34  on 446  degrees of freedom
## AIC: 246.34
## 
## Number of Fisher Scoring iterations: 6
# Predict probabilities
df$Pred_Prob <- predict(step_model, type = "response")

# Classify outcomes
df$Pred_Class <- ifelse(df$Pred_Prob > 0.5, 1, 0)

# Confusion matrix
conf_matrix <- table(Predicted = df$Pred_Class, Actual = df$Pass)
conf_matrix
##          Actual
## Predicted   0   1
##         0   7  11
##         1  44 387
# Accuracy
accuracy <- mean(df$Pred_Class == df$Pass)
accuracy
## [1] 0.8775056
barplot(table(df$Pass),
        main = "Bar Exam Outcomes",
        names.arg = c("Fail", "Pass"),
        col = c("red", "lightgreen"),
        ylab = "Number of Students")

hist(df$Pred_Prob,
     main = "Predicted Probabilities of Passing",
     xlab = "Predicted Probability",
     col = "darkblue",
     border = "black")

boxplot(Pred_Prob ~ Pass, data = df,
        main = "Predicted Probabilities by Outcome",
        xlab = "Actual Outcome (0 = Fail, 1 = Pass)",
        ylab = "Predicted Probability",
        col = c("orange", "darkgreen"))

# Diagnostic plots for GLM
par(mfrow = c(2, 2))  # Show 4 plots at once
plot(step_model)

par(mfrow = c(1, 1))  # Reset layout

#Residuals vs Fitted
plot(step_model, which = 1,
     main = "Residuals vs Fitted")

#Normal Q-Q Plot of Deviance Residuals
plot(step_model, which = 2,
     main = "Normal Q-Q Plot of Deviance Residuals")

#Scale-Location Plot
plot(step_model, which = 3,
     main = "Scale Location Plot")

#Cook's Distance
plot(step_model, which = 4,
     main = "Cook's Distance")

#Odds Ratios and Confidence Intervals
OR_CI <- exp(cbind(Odds_Ratio = coef(step_model),confint.default(step_model)))
summary(as.data.frame(OR_CI))  
##    Odds_Ratio           2.5 %             97.5 %        
##  Min.   :  0.0000   Min.   : 0.0000   Min.   :  0.0000  
##  1st Qu.:  0.5778   1st Qu.: 0.5277   1st Qu.:  0.6326  
##  Median :  1.1556   Median : 1.0555   Median :  1.2652  
##  Mean   : 40.9321   Mean   :10.6240   Mean   :160.4709  
##  3rd Qu.: 61.3981   3rd Qu.:15.9359   3rd Qu.:240.7064  
##  Max.   :121.6406   Max.   :30.8164   Max.   :480.1476
#Multicollinearity Check (VIF)
summary(data.frame(VIF = vif(step_model))) 
##       VIF       
##  Min.   :1.036  
##  1st Qu.:1.036  
##  Median :1.036  
##  Mean   :1.036  
##  3rd Qu.:1.036  
##  Max.   :1.036