The TTU School of Law wants to know what factors affect whether a graduate passes the bar exam. The dataset includes 600 graduates who took the Uniform Bar Examination (UBE) between 2021 and 2025. In Texas, a score of 270 out of 400 is required to pass. I selected PassFail as my main response variable because the school’s primary concern is figuring out which students are likely to fail. I also look at UBE score as a continuous outcome to understand what affects overall performance, not just whether a student passes or fails. Before running any models, I expected these variables to be important:
LSAT and UGPA because students who performed well academically before law school are likely to perform better on a high-stakes exam like the bar.
GPA_Final because students who maintained strong grades throughout law school are probably more prepared for the exam.
BarPrepCompletion because students who complete more of their bar prep course are likely more ready when exam day comes.
# Pull the bar-exam dataset from the course GitHub repository and print its
# structure (column names, types, and the first few values of each column).
df <- read.csv(
  file = "https://raw.githubusercontent.com/tmatis12/datafiles/refs/heads/main/BarData_2025.csv"
)
str(object = df)
## 'data.frame': 600 obs. of 28 variables:
## $ Year : int 2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
## $ PassFail : chr "F" "F" "F" "F" ...
## $ Age : num 29.1 29.6 29 36.2 28.9 30.8 29.1 42.9 28.3 27.1 ...
## $ LSAT : int 152 155 157 156 145 154 149 160 152 150 ...
## $ UGPA : chr "3.42" "2.82" "3.46" "3.13" ...
## $ CivPro : chr "B+" "B+" "C" "D+" ...
## $ LPI : chr "A" "B" "B" "C" ...
## $ LPII : chr "A" "B" "B" "C+" ...
## $ GPA_1L : num 3.21 2.43 2.62 2.27 2.29 ...
## $ GPA_Final : num 3.29 3.2 2.91 2.77 2.9 2.82 3 3.09 3.21 2.74 ...
## $ FinalRankPercentile : num 0.46 0.33 0.08 0.02 0.08 0.05 0.15 0.22 0.34 0.01 ...
## $ Accommodations : chr "N" "Y" "N" "N" ...
## $ Probation : chr "N" "Y" "N" "Y" ...
## $ LegalAnalysis_TexasPractice: chr "Y" "Y" "Y" "Y" ...
## $ AdvLegalPerfSkills : chr "Y" "Y" "Y" "Y" ...
## $ AdvLegalAnalysis : chr "Y" "Y" "Y" "Y" ...
## $ BarPrepCompany : chr "Barbri" "Barbri" "Barbri" "Barbri" ...
## $ BarPrepCompletion : num 0.96 0.98 0.48 1 0.77 0.02 0.9 0.76 0.77 0.88 ...
## $ OptIntoWritingGuide : chr "" "" "" "" ...
## $ X.LawSchoolBarPrepWorkshops: int 3 0 3 0 5 1 5 5 1 5 ...
## $ StudentSuccessInitiative : chr "N" "Cochran" "Smith" "Baldwin" ...
## $ BarPrepMentor : chr "N" "N" "N" "N" ...
## $ MPRE : num 103 76 99 81 99 NA 90 97 100 78 ...
## $ MPT : num 3 3 3 2.5 3.5 3 2.5 2.5 3 2.5 ...
## $ MEE : num 2.67 3.17 2.67 3 2.67 2 3.5 3 2.67 3.83 ...
## $ WrittenScaledScore : num 126 133 126 126 130 ...
## $ MBE : num 133 133 118 140 125 ...
## $ UBE : num 259 266 244 266 256 ...
# UGPA is read in as text; coerce it to numeric (entries that cannot be parsed
# become NA). Encode PassFail as a factor with "F" as the first (reference)
# level so that binomial models estimate the odds of passing.
df[["UGPA"]] <- as.numeric(df[["UGPA"]])
df[["PassFail"]] <- factor(df[["PassFail"]], levels = c("F", "P"))
# Letter grade -> grade points on a 4.0 scale (each notch is 0.3 or 0.4 apart).
# Grades that are not listed here (e.g. "CR") have no entry, so looking them up
# yields NA.
grademap <- c(
  "A" = 4.0, "A-" = 3.7,
  "B+" = 3.3, "B" = 3.0, "B-" = 2.7,
  "C+" = 2.3, "C" = 2.0, "C-" = 1.7,
  "D+" = 1.3, "D" = 1.0, "D-" = 0.7, # was .07, a typo for 0.7
  "F" = 0
)
# Translate the three letter-grade columns into grade points by indexing the
# lookup vector with each grade; unmatched grades come back as NA. Then preview
# the first rows to confirm the new *_Num columns.
df[["CivPro_Num"]] <- grademap[df[["CivPro"]]]
df[["LPI_Num"]] <- grademap[df[["LPI"]]]
df[["LPII_Num"]] <- grademap[df[["LPII"]]]
head(x = df)
## Year PassFail Age LSAT UGPA CivPro LPI LPII GPA_1L GPA_Final
## 1 2021 F 29.1 152 3.42 B+ A A 3.206 3.29
## 2 2021 F 29.6 155 2.82 B+ B B 2.431 3.20
## 3 2021 F 29.0 157 3.46 C B B 2.620 2.91
## 4 2021 F 36.2 156 3.13 D+ C C+ 2.275 2.77
## 5 2021 F 28.9 145 3.49 C C+ C+ 2.293 2.90
## 6 2021 F 30.8 154 2.85 B+ F CR 2.538 2.82
## FinalRankPercentile Accommodations Probation LegalAnalysis_TexasPractice
## 1 0.46 N N Y
## 2 0.33 Y Y Y
## 3 0.08 N N Y
## 4 0.02 N Y Y
## 5 0.08 N Y Y
## 6 0.05 N N Y
## AdvLegalPerfSkills AdvLegalAnalysis BarPrepCompany BarPrepCompletion
## 1 Y Y Barbri 0.96
## 2 Y Y Barbri 0.98
## 3 Y Y Barbri 0.48
## 4 Y Y Barbri 1.00
## 5 Y Y Themis 0.77
## 6 Y Y Themis 0.02
## OptIntoWritingGuide X.LawSchoolBarPrepWorkshops StudentSuccessInitiative
## 1 3 N
## 2 0 Cochran
## 3 3 Smith
## 4 0 Baldwin
## 5 5 Baldwin
## 6 1 Rosen
## BarPrepMentor MPRE MPT MEE WrittenScaledScore MBE UBE CivPro_Num LPI_Num
## 1 N 103 3.0 2.67 125.5 133.3 258.8 3.3 4.0
## 2 N 76 3.0 3.17 133.1 132.7 265.8 3.3 3.0
## 3 N 99 3.0 2.67 125.5 118.2 243.7 2.0 3.0
## 4 N 81 2.5 3.00 125.5 140.1 265.6 1.3 2.0
## 5 N 99 3.5 2.67 130.5 125.4 255.9 2.0 2.3
## 6 N NA 3.0 2.00 115.4 113.5 228.9 3.3 0.0
## LPII_Num
## 1 4.0
## 2 3.0
## 3 3.0
## 4 2.3
## 5 2.3
## 6 NA
# List every column name, including the derived *_Num grade-point columns.
colnames(df)
## [1] "Year" "PassFail"
## [3] "Age" "LSAT"
## [5] "UGPA" "CivPro"
## [7] "LPI" "LPII"
## [9] "GPA_1L" "GPA_Final"
## [11] "FinalRankPercentile" "Accommodations"
## [13] "Probation" "LegalAnalysis_TexasPractice"
## [15] "AdvLegalPerfSkills" "AdvLegalAnalysis"
## [17] "BarPrepCompany" "BarPrepCompletion"
## [19] "OptIntoWritingGuide" "X.LawSchoolBarPrepWorkshops"
## [21] "StudentSuccessInitiative" "BarPrepMentor"
## [23] "MPRE" "MPT"
## [25] "MEE" "WrittenScaledScore"
## [27] "MBE" "UBE"
## [29] "CivPro_Num" "LPI_Num"
## [31] "LPII_Num"
# Count missing values per column; rows with an NA in a model's variables are
# dropped when that model is fitted.
colSums(is.na(df))
## Year PassFail
## 0 0
## Age LSAT
## 0 0
## UGPA CivPro
## 1 0
## LPI LPII
## 0 0
## GPA_1L GPA_Final
## 8 0
## FinalRankPercentile Accommodations
## 0 0
## Probation LegalAnalysis_TexasPractice
## 0 0
## AdvLegalPerfSkills AdvLegalAnalysis
## 0 0
## BarPrepCompany BarPrepCompletion
## 0 26
## OptIntoWritingGuide X.LawSchoolBarPrepWorkshops
## 0 0
## StudentSuccessInitiative BarPrepMentor
## 0 0
## MPRE MPT
## 397 0
## MEE WrittenScaledScore
## 0 0
## MBE UBE
## 0 0
## CivPro_Num LPI_Num
## 7 9
## LPII_Num
## 56
# Split the data by outcome. PassFail is a two-level factor ("F", "P"), so one
# subset() call per group is sufficient; the earlier version assigned each
# subset twice, and the first assignment was immediately overwritten.
df_pass <- subset(df, PassFail == "P")
df_fail <- subset(df, PassFail == "F")
# Compare UBE scores for passers and failers; the dashed horizontal line marks
# the 270-point passing threshold.
boxplot(
  list(df_pass[["UBE"]], df_fail[["UBE"]]),
  names = c("Pass", "Fail"),
  col = c("green", "red"),
  main = "Side-by-side BoxPlot of UBE Scores by Pass/Fail",
  xlab = "Pass/Fail of Exam",
  ylab = "UBE Score"
)
abline(h = 270, lty = 2, col = "darkblue")
# Summary statistics of the four hypothesized predictors among passers.
summary(df_pass[c("LSAT", "UGPA", "GPA_Final", "BarPrepCompletion")])
## LSAT UGPA GPA_Final BarPrepCompletion
## Min. :141.0 Min. :2.010 Min. :2.440 Min. :0.2700
## 1st Qu.:154.0 1st Qu.:3.283 1st Qu.:3.078 1st Qu.:0.8100
## Median :156.0 Median :3.550 Median :3.306 Median :0.9000
## Mean :155.9 Mean :3.487 Mean :3.312 Mean :0.8781
## 3rd Qu.:158.0 3rd Qu.:3.760 3rd Qu.:3.540 3rd Qu.:0.9900
## Max. :171.0 Max. :4.140 Max. :3.990 Max. :1.0000
## NA's :1 NA's :23
# Summary statistics of the four hypothesized predictors among graduates who
# failed.
summary(df_fail[,c("LSAT","UGPA","GPA_Final","BarPrepCompletion")])
## LSAT UGPA GPA_Final BarPrepCompletion
## Min. :144.0 Min. :2.560 Min. :2.460 Min. :0.0000
## 1st Qu.:151.0 1st Qu.:3.220 1st Qu.:2.770 1st Qu.:0.7000
## Median :154.0 Median :3.460 Median :2.960 Median :0.8200
## Mean :153.7 Mean :3.405 Mean :2.946 Mean :0.7487
## 3rd Qu.:157.0 3rd Qu.:3.620 3rd Qu.:3.100 3rd Qu.:0.9300
## Max. :161.0 Max. :4.060 Max. :3.420 Max. :1.0000
## NA's :3
# For each predictor, draw the pass group (green, left) next to the fail group
# (red, right), then restore the single-panel layout.

# LSAT
par(mfrow = c(1, 2))
with(df_pass, hist(LSAT, col = "green", main = "LSAT - Pass", xlab = "LSAT Score"))
with(df_fail, hist(LSAT, col = "red", main = "LSAT - Fail", xlab = "LSAT Score"))
par(mfrow = c(1, 1))

# Final law school GPA
par(mfrow = c(1, 2))
with(df_pass, hist(GPA_Final, col = "green", main = "Final GPA - Pass", xlab = "GPA"))
with(df_fail, hist(GPA_Final, col = "red", main = "Final GPA - Fail", xlab = "GPA"))
par(mfrow = c(1, 1))

# Bar prep completion
par(mfrow = c(1, 2))
with(
  df_pass,
  hist(BarPrepCompletion, col = "green", main = "Bar Prep - Pass", xlab = "Proportion Completed")
)
with(
  df_fail,
  hist(BarPrepCompletion, col = "red", main = "Bar Prep - Fail", xlab = "Proportion Completed")
)
par(mfrow = c(1, 1))
The boxplot shows a clear separation in UBE scores between passing and failing students, with the dashed line marking the 270 threshold; this separation is expected, because pass/fail is defined by that cutoff. On average, passing students have higher LSAT scores (mean 155.9 vs. 153.7), higher final GPAs (3.31 vs. 2.95), and complete more of their bar prep programs (88% vs. 75%). These patterns motivate the hypotheses tested below.
I hypothesize that the student’s LSAT score and UGPA are significant in predicting UBE score. These are admission-level variables that reflect baseline academic ability before law school.
# Linear model for UBE score with LSAT, UGPA, and their interaction
# (LSAT * UGPA expands to both main effects plus LSAT:UGPA).
model <- lm(formula = UBE ~ LSAT * UGPA, data = df)
summary(model)
##
## Call:
## lm(formula = UBE ~ LSAT * UGPA, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.392 -13.162 0.862 14.074 53.757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -618.7953 349.9206 -1.768 0.0775 .
## LSAT 5.5922 2.2303 2.507 0.0124 *
## UGPA 188.4714 97.5901 1.931 0.0539 .
## LSAT:UGPA -1.1317 0.6223 -1.819 0.0695 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.37 on 595 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.103, Adjusted R-squared: 0.09845
## F-statistic: 22.77 on 3 and 595 DF, p-value: 5.777e-14
# Diagnostics for the interaction model: residuals vs. fitted (which = 1) and
# the normal Q-Q plot (which = 2).
plot(model, which = 1)
plot(model, which = 2)
# Additive model without the LSAT:UGPA interaction.
model2 <- lm(formula = UBE ~ LSAT + UGPA, data = df)
summary(model2)
##
## Call:
## lm(formula = UBE ~ LSAT + UGPA, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.221 -13.466 1.022 14.406 54.180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.1607 36.1339 0.392 0.695
## LSAT 1.5557 0.2183 7.125 3.02e-12 ***
## UGPA 11.0483 2.2807 4.844 1.62e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.41 on 596 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.09799, Adjusted R-squared: 0.09496
## F-statistic: 32.37 on 2 and 596 DF, p-value: 4.499e-14
library(car)
## Loading required package: carData
# Variance inflation factors (from the car package) to check LSAT and UGPA for
# collinearity.
vif(model2)
## LSAT UGPA
## 1.026953 1.026953
# Cook's distance plot (which = 4) to look for influential observations.
plot(model2,4)
The interaction term in the first model tests whether the effect of LSAT on UBE score depends on UGPA (and vice versa). It is not significant at the 0.05 level (p = 0.0695), so model2 is preferred for simplicity. The residual plots check constant variance and normality. The VIF values (about 1.03) are far below 5, indicating no serious collinearity. The Cook’s distance plot checks for influential observations.
I hypothesize that the student’s LSAT score and UGPA are significant in predicting whether a student will pass the bar exam.
# Logistic regression for passing with LSAT, UGPA, and their interaction; the
# sequential analysis of deviance tests each term as it is added.
model3 <- glm(formula = PassFail ~ LSAT * UGPA, family = binomial, data = df)
anova(model3, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: PassFail
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 598 394.26
## LSAT 1 17.3830 597 376.88 3.056e-05 ***
## UGPA 1 6.6384 596 370.24 0.00998 **
## LSAT:UGPA 1 0.1542 595 370.09 0.69455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Additive logistic regression for passing (no LSAT:UGPA interaction).
model4 <- glm(formula = PassFail ~ LSAT + UGPA, family = binomial, data = df)
summary(model4)
##
## Call:
## glm(formula = PassFail ~ LSAT + UGPA, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -27.32850 6.24420 -4.377 1.21e-05 ***
## LSAT 0.16872 0.03717 4.539 5.66e-06 ***
## UGPA 0.98478 0.37537 2.624 0.0087 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 394.26 on 598 degrees of freedom
## Residual deviance: 370.24 on 596 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 376.24
##
## Number of Fisher Scoring iterations: 5
# Sequential analysis of deviance: each term is tested after the terms listed
# before it.
anova(model4,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: PassFail
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 598 394.26
## LSAT 1 17.3830 597 376.88 3.056e-05 ***
## UGPA 1 6.6384 596 370.24 0.00998 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Likelihood-ratio test of model4 against the intercept-only model. The upper
# tail is requested directly (more accurate than 1 - pchisq() for very small
# p-values) and the degrees of freedom are taken from the fit instead of being
# hard-coded.
pchisq(
  model4$null.deviance - model4$deviance,
  df = model4$df.null - model4$df.residual,
  lower.tail = FALSE
)
## [1] 6.0788e-06
# McFadden's pseudo R-squared: 1 - residual deviance / null deviance.
1-model4$deviance/model4$null.deviance
## [1] 0.06092768
# Exponentiate the log-odds coefficients to obtain odds ratios (multiplicative
# change in the odds of passing per one-unit increase in the predictor).
exp(coef(model4))
## (Intercept) LSAT UGPA
## 1.353270e-12 1.183792e+00 2.677230e+00
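The first computed value is the p-value of the likelihood-ratio test for overall model significance (p ≈ 6.1e-06). The second is McFadden’s pseudo R-squared (1 − residual deviance / null deviance ≈ 0.061); it measures the proportional reduction in deviance relative to the intercept-only model, not the share of variance explained as in linear regression. Odds ratios above 1 indicate increased odds of passing the bar exam: each additional LSAT point multiplies the odds of passing by about 1.18, and each additional UGPA point by about 2.68, holding the other predictor constant.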
I hypothesize that final law school GPA and bar prep completion predict bar passage beyond what admission scores tell us. These are in-school factors the administration can monitor and act on directly.
# Logistic regression for passing on the two in-school predictors. Rows with a
# missing BarPrepCompletion value are dropped by glm().
model5 <- glm(
  formula = PassFail ~ GPA_Final + BarPrepCompletion,
  family = binomial,
  data = df
)
summary(model5)
##
## Call:
## glm(formula = PassFail ~ GPA_Final + BarPrepCompletion, family = binomial,
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.7340 1.9906 -6.899 5.22e-12 ***
## GPA_Final 4.3935 0.6443 6.819 9.16e-12 ***
## BarPrepCompletion 2.7019 0.8151 3.315 0.000918 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 375.82 on 573 degrees of freedom
## Residual deviance: 284.65 on 571 degrees of freedom
## (26 observations deleted due to missingness)
## AIC: 290.65
##
## Number of Fisher Scoring iterations: 6
# Sequential analysis of deviance: BarPrepCompletion is tested after GPA_Final
# is already in the model.
anova(model5,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: PassFail
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 573 375.82
## GPA_Final 1 80.025 572 295.80 < 2.2e-16 ***
## BarPrepCompletion 1 11.153 571 284.65 0.0008388 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# McFadden's pseudo R-squared: 1 - residual deviance / null deviance.
1-model5$deviance/model5$null.deviance
## [1] 0.2426095
# Odds ratios for model5. Each is per one-unit increase, i.e. a full GPA point
# for GPA_Final and 0 -> 1 (0% -> 100%) for BarPrepCompletion.
exp(coef(model5))
## (Intercept) GPA_Final BarPrepCompletion
## 1.084963e-06 8.092208e+01 1.490751e+01
# Compare the two logistic models by AIC (lower is better).
# NOTE(review): model4 and model5 drop different rows for missing values, so
# they are not fitted to the same observations and the AIC values are only
# approximately comparable.
AIC(model4,model5)
## Warning in AIC.default(model4, model5): models are not all fitted to the same
## number of observations
## df AIC
## model4 3 376.2396
## model5 3 290.6463
# Adjusted R-squared of the additive linear model.
summary(model2)$adj.r.squared
## [1] 0.09496233
# Adjusted R-squared of the linear model with the LSAT:UGPA interaction.
summary(model)$adj.r.squared
## [1] 0.09845218
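The AIC table compares the two logistic models; a lower AIC value indicates a better-fitting model. Because model5 drops the 26 graduates with a missing BarPrepCompletion value while model4 drops only the one graduate with a missing UGPA, the two models were fitted to different observations (hence the warning), so the AIC difference should be read as approximate. The adjusted R-squared compares the two linear models, with a higher value indicating that the model explains more variation in UBE score.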
Models 1 and 2 — LSAT and UGPA: Both LSAT and UGPA are positively associated with UBE score and with the probability of passing, although together they explain only about 10% of the variation in UBE score. The interaction term was not significant at the 0.05 level in either the linear model (p = 0.0695) or the logistic model (p = 0.695), so the simpler additive models (model2 and model4) are preferred. My hypothesis for this group of predictors was supported.
Model 3 (model5 in the code) — GPA_Final and BarPrepCompletion: Final law school GPA is a significant predictor of bar passage, suggesting that sustained academic performance throughout law school carries into the exam. Bar prep completion is also positively associated with passing: students who finish more of their commercial prep program have higher odds of passing. My hypothesis for this group was also supported. In the AIC comparison, model5 (AIC 290.6) fits pass/fail outcomes better than model4 (AIC 376.2), although the two models were fitted to slightly different sets of graduates.
Limitations: This dataset comes from a single law school over five cohorts. Results reflect associations, not causation. Year-to-year differences in UBE equating may introduce variation not accounted for in the models. Variables such as bar prep company were excluded to keep each model focused on a specific hypothesis.
Recommendation 1: Monitor bar prep completion and intervene early. Model 3 (model5 in the code) shows that BarPrepCompletion is associated with passing even after controlling for final GPA. The school could implement a midpoint check-in during the bar prep period to identify students who are significantly behind and connect them with academic support. This is directly tied to a significant predictor and is actionable within a single semester.
Recommendation 2: Offer bar readiness support to students with low Final GPA. Model 3 (model5 in the code) shows that GPA_Final is one of the strongest predictors of bar passage. Students graduating in the bottom quarter of their class could be offered a structured bar readiness program during their final semester, before commercial prep begins. Early preparation is more likely to be effective than last-minute intervention.
Recommendation 3: Use LSAT as an early warning tool, not a barrier. The logistic model with LSAT and UGPA (model4 in the code) shows that LSAT is associated with bar passage. Students admitted with lower LSAT scores could be flagged for additional academic support starting in 1L, such as tutoring in bar-tested subjects. This flag should guide resource allocation and not lower expectations, since many lower-LSAT students do pass the bar exam.
# Pull the bar-exam dataset from the course GitHub repository and print its
# structure.
df <- read.csv(
  file = "https://raw.githubusercontent.com/tmatis12/datafiles/refs/heads/main/BarData_2025.csv"
)
str(object = df)

# UGPA is read in as text; coerce it to numeric (entries that cannot be parsed
# become NA). Encode PassFail as a factor with "F" as the first (reference)
# level so that binomial models estimate the odds of passing.
df[["UGPA"]] <- as.numeric(df[["UGPA"]])
df[["PassFail"]] <- factor(df[["PassFail"]], levels = c("F", "P"))
# Letter grade -> grade points on a 4.0 scale (each notch is 0.3 or 0.4 apart).
# Grades that are not listed here (e.g. "CR") have no entry, so looking them up
# yields NA.
grademap <- c(
  "A" = 4.0, "A-" = 3.7,
  "B+" = 3.3, "B" = 3.0, "B-" = 2.7,
  "C+" = 2.3, "C" = 2.0, "C-" = 1.7,
  "D+" = 1.3, "D" = 1.0, "D-" = 0.7, # was .07, a typo for 0.7
  "F" = 0
)
# Translate the three letter-grade columns into grade points by indexing the
# lookup vector with each grade; unmatched grades come back as NA.
df[["CivPro_Num"]] <- grademap[df[["CivPro"]]]
df[["LPI_Num"]] <- grademap[df[["LPI"]]]
df[["LPII_Num"]] <- grademap[df[["LPII"]]]

# Quick inspection: first rows, column names, and missing values per column.
head(x = df)
colnames(x = df)
colSums(x = is.na(df))
# Split the data by outcome. PassFail is a two-level factor ("F", "P"), so one
# subset() call per group is sufficient; the earlier version assigned each
# subset twice, and the first assignment was immediately overwritten.
df_pass <- subset(df, PassFail == "P")
df_fail <- subset(df, PassFail == "F")
# Compare UBE scores for passers and failers; the dashed horizontal line marks
# the 270-point passing threshold.
boxplot(
  list(df_pass[["UBE"]], df_fail[["UBE"]]),
  names = c("Pass", "Fail"),
  col = c("green", "red"),
  main = "Side-by-side BoxPlot of UBE Scores by Pass/Fail",
  xlab = "Pass/Fail of Exam",
  ylab = "UBE Score"
)
abline(h = 270, lty = 2, col = "darkblue")

# Summary statistics of the four hypothesized predictors, by outcome.
summary(df_pass[c("LSAT", "UGPA", "GPA_Final", "BarPrepCompletion")])
summary(df_fail[c("LSAT", "UGPA", "GPA_Final", "BarPrepCompletion")])

# For each predictor, draw the pass group (green, left) next to the fail group
# (red, right), then restore the single-panel layout.

# LSAT
par(mfrow = c(1, 2))
with(df_pass, hist(LSAT, col = "green", main = "LSAT - Pass", xlab = "LSAT Score"))
with(df_fail, hist(LSAT, col = "red", main = "LSAT - Fail", xlab = "LSAT Score"))
par(mfrow = c(1, 1))

# Final law school GPA
par(mfrow = c(1, 2))
with(df_pass, hist(GPA_Final, col = "green", main = "Final GPA - Pass", xlab = "GPA"))
with(df_fail, hist(GPA_Final, col = "red", main = "Final GPA - Fail", xlab = "GPA"))
par(mfrow = c(1, 1))

# Bar prep completion
par(mfrow = c(1, 2))
with(
  df_pass,
  hist(BarPrepCompletion, col = "green", main = "Bar Prep - Pass", xlab = "Proportion Completed")
)
with(
  df_fail,
  hist(BarPrepCompletion, col = "red", main = "Bar Prep - Fail", xlab = "Proportion Completed")
)
par(mfrow = c(1, 1))
# Linear model for UBE score with LSAT, UGPA, and their interaction
# (LSAT * UGPA expands to both main effects plus LSAT:UGPA).
model <- lm(formula = UBE ~ LSAT * UGPA, data = df)
summary(model)

# Diagnostics: residuals vs. fitted (which = 1) and normal Q-Q (which = 2).
plot(model, which = 1)
plot(model, which = 2)

# Additive model without the interaction term.
model2 <- lm(formula = UBE ~ LSAT + UGPA, data = df)
summary(model2)

# Collinearity check (vif() comes from the car package) and Cook's distance
# plot (which = 4) for influential observations.
library(car)
vif(model2)
plot(model2, which = 4)
# Logistic regression for passing with LSAT, UGPA, and their interaction; the
# sequential analysis of deviance tests each term as it is added.
model3 <- glm(formula = PassFail ~ LSAT * UGPA, family = binomial, data = df)
anova(model3, test = "Chisq")

# Additive logistic regression (no interaction).
model4 <- glm(formula = PassFail ~ LSAT + UGPA, family = binomial, data = df)
summary(model4)
anova(model4, test = "Chisq")

# Likelihood-ratio test of model4 against the intercept-only model. The upper
# tail is requested directly (more accurate than 1 - pchisq() for very small
# p-values) and the degrees of freedom are taken from the fit instead of being
# hard-coded.
pchisq(
  model4$null.deviance - model4$deviance,
  df = model4$df.null - model4$df.residual,
  lower.tail = FALSE
)

# McFadden's pseudo R-squared: 1 - residual deviance / null deviance.
1 - model4$deviance / model4$null.deviance

# Odds ratios: multiplicative change in the odds of passing per one-unit
# increase in each predictor.
exp(coef(model4))
# Logistic regression for passing on the two in-school predictors. Rows with a
# missing BarPrepCompletion value are dropped by glm().
model5 <- glm(
  formula = PassFail ~ GPA_Final + BarPrepCompletion,
  family = binomial,
  data = df
)
summary(model5)
anova(model5, test = "Chisq")

# McFadden's pseudo R-squared and odds ratios for model5.
1 - model5$deviance / model5$null.deviance
exp(coef(model5))

# AIC is only comparable between models fitted to the same observations.
# model4 and model5 drop different rows for missing values (AIC() warns about
# this), so refit both on the rows that are complete for every variable used
# by either model before comparing them.
aic_vars <- c("PassFail", "LSAT", "UGPA", "GPA_Final", "BarPrepCompletion")
df_common <- df[complete.cases(df[, aic_vars]), ]
model4_common <- glm(
  formula = PassFail ~ LSAT + UGPA,
  family = binomial,
  data = df_common
)
model5_common <- glm(
  formula = PassFail ~ GPA_Final + BarPrepCompletion,
  family = binomial,
  data = df_common
)
AIC(model4_common, model5_common)

# Adjusted R-squared of the additive and interaction linear models.
summary(model2)$adj.r.squared
summary(model)$adj.r.squared