1. Load Libraries and Data
# Load Libraries
library(MASS)
library(tidyverse)
library(car)
# Load Dataset
df <- read.csv("https://raw.githubusercontent.com/tmatis12/datafiles/main/Updated_Bar_Data_For_Review_Final.csv")
# View summary
summary(df)
## Year PassFail Age LSAT
## Min. :2021 Length:476 Min. :23.10 Min. :141.0
## 1st Qu.:2022 Class :character 1st Qu.:26.70 1st Qu.:153.0
## Median :2023 Mode :character Median :28.20 Median :156.0
## Mean :2023 Mean :29.13 Mean :155.3
## 3rd Qu.:2024 3rd Qu.:30.10 3rd Qu.:157.0
## Max. :2024 Max. :65.70 Max. :168.0
##
## UGPA CivPro LPI LPII
## Min. :2.010 Length:476 Length:476 Length:476
## 1st Qu.:3.250 Class :character Class :character Class :character
## Median :3.490 Mode :character Mode :character Mode :character
## Mean :3.451
## 3rd Qu.:3.710
## Max. :4.140
##
## GPA_1L GPA_Final FinalRankPercentile Accommodations
## Min. :2.200 Min. :2.44 Min. :0.0000 Length:476
## 1st Qu.:2.781 1st Qu.:3.05 1st Qu.:0.2600 Class :character
## Median :3.083 Median :3.27 Median :0.5150 Mode :character
## Mean :3.086 Mean :3.28 Mean :0.5067
## 3rd Qu.:3.383 3rd Qu.:3.52 3rd Qu.:0.7500
## Max. :4.000 Max. :3.99 Max. :0.9900
## NA's :4
## Probation LegalAnalysis_TexasPractice AdvLegalPerfSkills
## Length:476 Length:476 Length:476
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## AdvLegalAnalysis BarPrepCompany BarPrepCompletion OptIntoWritingGuide
## Length:476 Length:476 Min. :0.0200 Length:476
## Class :character Class :character 1st Qu.:0.8000 Class :character
## Mode :character Mode :character Median :0.8900 Mode :character
## Mean :0.8635
## 3rd Qu.:0.9800
## Max. :1.0000
## NA's :23
## X.LawSchoolBarPrepWorkshops StudentSuccessInitiative BarPrepMentor
## Min. :0.000 Length:476 Length:476
## 1st Qu.:0.000 Class :character Class :character
## Median :0.000 Mode :character Mode :character
## Mean :1.532
## 3rd Qu.:3.000
## Max. :5.000
##
## MPRE MPT MEE WrittenScaledScore
## Min. : 76.00 Min. :1.000 Min. :2.000 Min. :111.7
## 1st Qu.: 89.50 1st Qu.:3.000 1st Qu.:3.330 1st Qu.:138.0
## Median : 99.00 Median :3.500 Median :3.670 Median :146.9
## Mean : 99.46 Mean :3.651 Mean :3.719 Mean :146.6
## 3rd Qu.:107.00 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:155.7
## Max. :145.00 Max. :5.500 Max. :5.330 Max. :181.2
## NA's :273
## MBE UBE
## Min. :103.6 Min. :227.3
## 1st Qu.:138.7 1st Qu.:278.5
## Median :147.1 Median :293.5
## Mean :146.2 Mean :292.9
## 3rd Qu.:154.0 3rd Qu.:306.8
## Max. :187.9 Max. :358.7
##
# Recode Pass/Fail (1 = Pass, 0 = Fail)
df$Pass <- ifelse(df$PassFail == "P", 1, 0)
# Recode Y/N Variables
df$Accom <- ifelse(df$Accommodations == "Y", 1, 0)
df$Prob <- ifelse(df$Probation == "Y", 1, 0)
df$Mentor <- ifelse(df$BarPrepMentor == "Y", 1, 0)
df$SSI <- ifelse(df$StudentSuccessInitiative == "Y", 1, 0)
df$WriteGuide <- ifelse(df$OptIntoWritingGuide == "Y", 1, 0)
# Remove Bar Exam Score Variables
drop_vars <- c("MPRE", "MPT", "MEE", "WrittenScaledScore", "MBE", "UBE")
df <- df[, !(names(df) %in% drop_vars)]
# Remove original PassFail column
df$PassFail <- NULL
# Remove missing values
df <- na.omit(df)
# Fit full model
full_model <- glm(Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob +
Mentor + SSI + WriteGuide,
data = df, family = binomial)
# Outcome distribution
table(df$Pass)
##
## 0 1
## 51 398
# Stepwise selection
step_model <- step(full_model, direction = "both")
## Start: AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + SSI +
## WriteGuide
##
##
## Step: AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + Mentor + WriteGuide
##
##
## Step: AIC=252.28
## Pass ~ LSAT + UGPA + GPA_Final + Accom + Prob + WriteGuide
##
## Df Deviance AIC
## - Accom 1 238.28 250.28
## - WriteGuide 1 238.31 250.31
## - Prob 1 238.73 250.73
## - UGPA 1 239.75 251.75
## <none> 238.28 252.28
## - LSAT 1 248.76 260.76
## - GPA_Final 1 287.47 299.47
##
## Step: AIC=250.28
## Pass ~ LSAT + UGPA + GPA_Final + Prob + WriteGuide
##
## Df Deviance AIC
## - WriteGuide 1 238.31 248.31
## - Prob 1 238.73 248.73
## - UGPA 1 239.75 249.75
## <none> 238.28 250.28
## + Accom 1 238.28 252.28
## - LSAT 1 248.81 258.81
## - GPA_Final 1 287.79 297.79
##
## Step: AIC=248.31
## Pass ~ LSAT + UGPA + GPA_Final + Prob
##
## Df Deviance AIC
## - Prob 1 238.79 246.79
## - UGPA 1 239.92 247.92
## <none> 238.31 248.31
## + WriteGuide 1 238.28 250.28
## + Accom 1 238.31 250.31
## - LSAT 1 248.83 256.83
## - GPA_Final 1 289.06 297.06
##
## Step: AIC=246.79
## Pass ~ LSAT + UGPA + GPA_Final
##
## Df Deviance AIC
## - UGPA 1 240.34 246.34
## <none> 238.79 246.79
## + Prob 1 238.31 248.31
## + WriteGuide 1 238.73 248.73
## + Accom 1 238.78 248.78
## - LSAT 1 250.28 256.28
## - GPA_Final 1 301.84 307.84
##
## Step: AIC=246.34
## Pass ~ LSAT + GPA_Final
##
## Df Deviance AIC
## <none> 240.34 246.34
## + UGPA 1 238.79 246.79
## + Prob 1 239.92 247.92
## + WriteGuide 1 240.13 248.13
## + Accom 1 240.34 248.34
## - LSAT 1 250.38 254.38
## - GPA_Final 1 308.29 312.29
# Model summary
summary(step_model)
##
## Call:
## glm(formula = Pass ~ LSAT + GPA_Final, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -35.32684 7.80717 -4.525 6.04e-06 ***
## LSAT 0.14462 0.04624 3.128 0.00176 **
## GPA_Final 4.80107 0.70053 6.853 7.21e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 317.84 on 448 degrees of freedom
## Residual deviance: 240.34 on 446 degrees of freedom
## AIC: 246.34
##
## Number of Fisher Scoring iterations: 6
# Predict probabilities
df$Pred_Prob <- predict(step_model, type = "response")
# Classify outcomes
df$Pred_Class <- ifelse(df$Pred_Prob > 0.5, 1, 0)
# Confusion matrix
conf_matrix <- table(Predicted = df$Pred_Class, Actual = df$Pass)
conf_matrix
## Actual
## Predicted 0 1
## 0 7 11
## 1 44 387
# Accuracy
accuracy <- mean(df$Pred_Class == df$Pass)
accuracy
## [1] 0.8775056
barplot(table(df$Pass),
main = "Bar Exam Outcomes",
names.arg = c("Fail", "Pass"),
col = c("red", "lightgreen"),
ylab = "Number of Students")

hist(df$Pred_Prob,
main = "Predicted Probabilities of Passing",
xlab = "Predicted Probability",
col = "darkblue",
border = "black")

boxplot(Pred_Prob ~ Pass, data = df,
main = "Predicted Probabilities by Outcome",
xlab = "Actual Outcome (0 = Fail, 1 = Pass)",
ylab = "Predicted Probability",
col = c("orange", "darkgreen"))

# Diagnostic plots for GLM
par(mfrow = c(2, 2)) # Show 4 plots at once
plot(step_model)

par(mfrow = c(1, 1)) # Reset layout
#Residuals vs Fitted
plot(step_model, which = 1,
main = "Residuals vs Fitted")

#Normal Q-Q Plot of Deviance Residuals
plot(step_model, which = 2,
main = "Normal Q-Q Plot of Deviance Residuals")

#Scale-Location Plot
plot(step_model, which = 3,
main = "Scale Location Plot")

#Cook's Distance
plot(step_model, which = 4,
main = "Cook's Distance")

#Odds Ratios and Confidence Intervals
OR_CI <- exp(cbind(Odds_Ratio = coef(step_model),confint.default(step_model)))
summary(as.data.frame(OR_CI))
## Odds_Ratio 2.5 % 97.5 %
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.5778 1st Qu.: 0.5277 1st Qu.: 0.6326
## Median : 1.1556 Median : 1.0555 Median : 1.2652
## Mean : 40.9321 Mean :10.6240 Mean :160.4709
## 3rd Qu.: 61.3981 3rd Qu.:15.9359 3rd Qu.:240.7064
## Max. :121.6406 Max. :30.8164 Max. :480.1476
#Multicollinearity Check (VIF)
summary(data.frame(VIF = vif(step_model)))
## VIF
## Min. :1.036
## 1st Qu.:1.036
## Median :1.036
## Mean :1.036
## 3rd Qu.:1.036
## Max. :1.036