The TTU Law School has shared a de-identified dataset of bar examination outcomes covering five recent cohorts of graduates (BarData_2025.csv, 600 candidates spanning the years 2021–2025). The administration is interested in identifying actionable factors that distinguish students who pass the Uniform Bar Examination from those who do not. Their goal is to design interventions that improve future passage rates without expanding the program or modifying admissions criteria.
In this paper we develop a regression model from the data, interpret the results for specific bar preparation factors, and translate those results into specific, actionable recommendations for the law school.
The raw source data file can be found at this link: BarData_2025.csv
The data dictionary describing the data items in the data set can be found here:BarData_Dictionary_2021to2025
What we are trying to accomplish with this paper is to specify the particular actions that will increase student success on the bar exam. To that end, we first look at the current interventions and aids that are used outside the classroom to affect examination success. We also look at direct classroom performance to examine how strongly it affects student performance on the bar exam. We will therefore look at the data in two groupings:
With the existing external aids, we want to observe whether these items are having the intended effect. This is especially important because they most likely incur a cost, and we need to ensure that they are cost effective. By also looking at the coursework, we can see which course or courses have the greatest effect on the bar passage rate.
The null and alternative hypothesis statements for both groups are:
\[\begin{array}{l} {H_0}:{\mu _{pass}} = {\mu _{fail}}\\ {H_A}:{\mu _{pass}} \ne {\mu _{fail}} \end{array}\]
For this paper the following tools were used:
The dataset was read as a CSV file from the GitHub source. We first need to condition and clean the data. We perform the following tasks at this step:
# Read the bar exam data set as a CSV straight from the GitHub source.
df <- read.csv("https://raw.githubusercontent.com/tmatis12/datafiles/refs/heads/main/BarData_2025.csv")

## Data Manipulation #####
# Blank cells become NA *before* any type conversion: once a column has been
# turned into a factor or a number, comparing it with "" cannot match anything.
for (col in c("OptIntoWritingGuide", "BarPrepCompany", "BarPrepCompletion", "MPRE")) {
  df[[col]][which(df[[col]] == "")] <- NA
}

# Numeric measures.
for (col in c("UGPA", "BarPrepCompletion", "MPRE")) {
  df[[col]] <- as.numeric(df[[col]])
}

# Outcome: "F" is the first level, so a binomial glm models the odds of "P".
df$PassFail <- factor(df$PassFail, levels = c("F", "P"))

# Yes/no indicators all share the same level order.
yes_no_cols <- c(
  "Accommodations", "Probation", "LegalAnalysis_TexasPractice",
  "AdvLegalAnalysis", "AdvLegalPerfSkills", "OptIntoWritingGuide"
)
df[yes_no_cols] <- lapply(df[yes_no_cols], factor, levels = c("Y", "N"))

# Bar prep vendors; any value outside this list becomes NA.
df$BarPrepCompany <- factor(
  df$BarPrepCompany,
  levels = c("Helix", "Barbri", "Themis", "JD Advising", "Quimbee", "Kaplan")
)

# Letter grade -> grade points on the 4.0 scale.
# D- is 0.7; it had been typed as .07, which placed a D- almost at an F.
grademap <- c(
  "A" = 4.0, "A-" = 3.7, "B+" = 3.3, "B" = 3.0, "B-" = 2.7,
  "C+" = 2.3, "C" = 2.0, "C-" = 1.7, "D+" = 1.3, "D" = 1.0, "D-" = 0.7,
  "F" = 0
)

# Look grades up by their text (as.character() guards against a factor being
# indexed by its integer codes) and drop the names the lookup leaves behind.
# Grades that are missing or not in the map come out as NA.
df$CivPro_Num <- unname(grademap[as.character(df$CivPro)])
df$LPI_Num <- unname(grademap[as.character(df$LPI)])
df$LPII_Num <- unname(grademap[as.character(df$LPII)])
The next step is to partition and parse the data into subgroups of interest by pass and fail and the entire population for some characteristics. Although some of these partition groups may not be used, we will have them if they are needed.
# Break the data set into characteristic groups.
# Some of these may not be used in the analysis, but they are available if needed.
# subset() is used throughout so that a missing value in a condition drops the
# row instead of producing an all-NA row, as df[cond, ] would.

# Student Success Initiative: any value other than "N" counts as participation.
df_student_success <- subset(df, StudentSuccessInitiative != "N")
df_no_student_success <- subset(df, StudentSuccessInitiative == "N")

# Bar prep mentor, yes or no.
df_bar_prep_mentor <- subset(df, BarPrepMentor != "N")
df_no_bar_prep_mentor <- subset(df, BarPrepMentor == "N")

# Success seminar participants, split by pass / fail.
df_pass_success_seminar <- subset(df, StudentSuccessInitiative != "N" & PassFail == "P")
df_fail_success_seminar <- subset(df, StudentSuccessInitiative != "N" & PassFail == "F")

# Success seminar non-participants, split by pass / fail.
df_pass_no_success_seminar <- subset(df, StudentSuccessInitiative == "N" & PassFail == "P")
df_fail_no_success_seminar <- subset(df, StudentSuccessInitiative == "N" & PassFail == "F")

# Bar prep classes: only Barbri and Themis, since they cover about 90% of students.
df_total_withtest_prep <- subset(df, BarPrepCompany %in% c("Barbri", "Themis"))

# Completed at least 50% of the prep course.
df_total_withtest_prep_used <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion >= 0.50
)

# Completed less than 50% of the prep course. The comparison is strict so a
# student at exactly 50% is not counted in both groups.
df_total_withtest_prep_underutil <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion < 0.50
)

# Completion group crossed with pass / fail.
df_pass_withtest_prep <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion >= 0.50 & PassFail == "P"
)
df_pass_lesstest_prep <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion < 0.50 & PassFail == "P"
)
df_fail_withtest_prep <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion >= 0.50 & PassFail == "F"
)
df_fail_lesstest_prep <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion < 0.50 & PassFail == "F"
)

### Subset DataFrame by PassFail #####
df_pass <- subset(df, PassFail == "P") # students who passed
df_fail <- subset(df, PassFail == "F") # students who failed
To get a general idea of the overall pass/fail rate of the dataset, we first generate a pie chart. This gives us some visual cue of the magnitude of the issue we are addressing.
# Pie chart of the overall pass / fail split.
pass_val <- nrow(df_pass)
fail_val <- nrow(df_fail)

# Combine pass & fail counts into one vector and label the slices.
vals <- c(pass_val, fail_val)
labs <- c("PASS", "FAIL")

# Colors: green for pass, red for fail.
cols <- c("green4", "firebrick1")

# Build the title from the computed counts so it cannot drift from the data.
pie(
  vals,
  labels = labs,
  col = cols,
  main = sprintf("Pass & Fail Breakdown: Pass(%d) Fail(%d)", pass_val, fail_val)
)
We can see from this pie chart that there is an approximately 10.2% failure rate that we are trying to address. After applying any recommendations, we can compare the new outcomes to the existing reality.
Before beginning any analysis of deviance or ANOVA, we compare the individual predictor variables directly to the numerical bar score (UBE) to validate that we are choosing applicable predictors. We first draw box plots for a visual comparison.
# Side-by-side boxplot of UBE scores for passers and failers, as a first
# visual look at the data before any formal analysis.
ube_by_outcome <- list(Pass = df_pass$UBE, Fail = df_fail$UBE)
boxplot(
  ube_by_outcome,
  col = c("green4", "red2"),
  ylab = "UBE Score",
  xlab = "Figure 1 Pass/Fail of Exam",
  main = "UBE Scores by Pass/Fail"
)
Figure 1 is a boxplot of UBE scores by pass and fail that shows a clear
difference in the median value of the UBE between the two populations.
The failing scores are tightly clustered around the failing median
value. The median failing score is near the passing score of 270. We can
conclude that an increase of 10 to 15 points would turn many of these
failing scores into passing scores.
# UBE scores for the whole cohort next to the Barbri/Themis test-prep subset.
ube_all_vs_prep <- list(
  "All" = df$UBE,
  "With Test Prep" = df_total_withtest_prep$UBE
)
boxplot(
  ube_all_vs_prep,
  col = c("gold", "darkblue"),
  ylab = "UBE Score",
  xlab = "Figure 2 Effect of Test Prep",
  main = "BoxPlot of UBE Scores of All & Test Prep"
)
Figure 2 is a boxplot of the effect of the test preparation class on the
median score. There appears to be minimal difference between the
population's median score and the median score of those who were
enrolled in test prep. This data set used the large groups of
Barbri (289 students) and Themis (266 students). This result may be
attributed to the fact that the two groups combined represent 555 students,
which is almost the entire population of 600.
# UBE scores of the test-prep students, split by the two completion groups.
ube_by_usage <- list(
  ">50%" = df_total_withtest_prep_used$UBE,
  "<50%" = df_total_withtest_prep_underutil$UBE
)
boxplot(
  ube_by_usage,
  col = c("deepskyblue", "tomato"),
  ylab = "UBE Score",
  xlab = "Figure 3 Effect of Time using Test Prep",
  main = "UBE Scores Based on Usage of Test Prep"
)
Figure 3 shows the true effect of the bar prep classes. This figure
shows two subgroups: one group that attended more than 50% of the course
and a second that attended less than 50% of the sessions. Attendance of
more than 50% is clearly associated with an increase in the
median UBE score. Clearly, increased attendance at the bar prep seminars
is associated with a higher UBE score.
# UBE scores for the whole cohort next to the Student Success participants.
ube_all_vs_success <- list(
  "All" = df$UBE,
  "Student Success" = df_student_success$UBE
)
boxplot(
  ube_all_vs_success,
  col = c("palegreen", "coral"),
  ylab = "UBE Score",
  xlab = "Figure 4 Effect of Involvement with Student Success",
  main = "UBE Scores Based on Student Success Seminar"
)
Figure 4 shows the effect for students who were involved with the
student success seminars. The median UBE score is lower for students in
this subgroup in comparison to the overall population.
We now will do a correlation computation to the UBE to get a more analytical picture of our data. We also do a cor.test() for each pairing.
# Pair UBE with first-year GPA; data.frame() names the columns df.UBE and df.GPA_1L.
df_a<-data.frame(df$UBE,df$GPA_1L)
# Correlation matrix using, for each pair, the rows where both values are present.
cor(df_a, use = "pairwise.complete.obs")
## df.UBE df.GPA_1L
## df.UBE 1.0000000 0.5247093
## df.GPA_1L 0.5247093 1.0000000
# Test whether the correlation differs from 0 (no method given, so Pearson).
cor.test(df$UBE,df$GPA_1L)
##
## Pearson's product-moment correlation
##
## data: df$UBE and df$GPA_1L
## t = 14.972, df = 590, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4637336 0.5807378
## sample estimates:
## cor
## 0.5247093
The correlation value of 0.52471 means that there is a moderately strong relationship between UBE score and the first-year GPA. The output from cor.test() gives an extremely small p-value of < 2.2e-16, which means that GPA_1L is a statistically significant predictor of the UBE score. The t-value is large at 14.972, which means there is strong evidence for the alternative hypothesis that first-year GPA is positively associated with the UBE score. So students who perform better in the first year score higher on the bar exam.
# Pair UBE with final GPA; data.frame() names the columns df.UBE and df.GPA_Final.
df_b<-data.frame(df$UBE,df$GPA_Final)
# Correlation matrix using, for each pair, the rows where both values are present.
cor(df_b, use = "pairwise.complete.obs")
## df.UBE df.GPA_Final
## df.UBE 1.0000000 0.5744117
## df.GPA_Final 0.5744117 1.0000000
# Test whether the correlation differs from 0 (no method given, so Pearson).
cor.test(df$UBE, df$GPA_Final)
##
## Pearson's product-moment correlation
##
## data: df$UBE and df$GPA_Final
## t = 17.16, df = 598, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5181930 0.6256879
## sample estimates:
## cor
## 0.5744117
The correlation value of 0.57441 means that there is a moderately strong relationship between UBE score and the final GPA. The output from cor.test() gives an extremely small p-value of < 2.2e-16, which means that GPA_Final is a statistically significant predictor of the UBE score. The t-value is large at 17.16, which means there is strong evidence for the alternative hypothesis that final GPA is positively associated with the UBE score. So students who have a higher final GPA score higher on the bar exam.
# Pair UBE with final rank percentile; data.frame() names the columns
# df.UBE and df.FinalRankPercentile.
df_c<-data.frame(df$UBE,df$FinalRankPercentile)
#We must use Spearman since this is a rank ordering
cor( df_c, use= "pairwise.complete.obs",method=c("spearman"))
## df.UBE df.FinalRankPercentile
## df.UBE 1.0000000 0.5628156
## df.FinalRankPercentile 0.5628156 1.0000000
# NOTE(review): df_c has no column called UBE (it is df.UBE), so this prints NULL.
str(df_c$UBE)
## NULL
# NOTE(review): no method is given, so this runs the default Pearson test,
# not the Spearman correlation computed by cor() above.
cor.test(df$UBE,df$FinalRankPercentile)
##
## Pearson's product-moment correlation
##
## data: df$UBE and df$FinalRankPercentile
## t = 17.077, df = 598, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5161441 0.6239829
## sample estimates:
## cor
## 0.5725346
The Spearman correlation value of 0.56282 means that there is a moderately strong relationship between UBE score and the final class rank. Note that the cor.test() output above is from the default Pearson test (r = 0.57253, t = 17.077) rather than a Spearman test; the two coefficients are close and lead to the same conclusion. The test gives an extremely small p-value of < 2.2e-16, which means that final class rank is a statistically significant predictor of the UBE score. The t-value is large at 17.077, which means there is strong evidence for the alternative hypothesis that final class ranking is positively associated with the UBE score. So students who have a higher final ranking score higher on the bar exam.
# Pair UBE with the numeric CivPro grade (grade points mapped from the letter grade).
df_d<-data.frame(df$UBE,df$CivPro_Num)
# Correlation matrix using, for each pair, the rows where both values are present.
cor(df_d,use = "pairwise.complete.obs")
## df.UBE df.CivPro_Num
## df.UBE 1.0000000 0.3692164
## df.CivPro_Num 0.3692164 1.0000000
# Test whether the correlation differs from 0 (no method given, so Pearson).
cor.test(df$UBE,df$CivPro_Num)
##
## Pearson's product-moment correlation
##
## data: df$UBE and df$CivPro_Num
## t = 9.6583, df = 591, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2975460 0.4367486
## sample estimates:
## cor
## 0.3692164
The correlation value of 0.36922 means that there is a moderate relationship between UBE score and performance in the CivPro class. Notice that this correlation is smaller than the prior correlations. The output from cor.test() gives an extremely small p-value of < 2.2e-16, which means that the grade in this class is a statistically significant predictor of the UBE score. The t-value is large at 9.6583, which means there is strong evidence for the alternative hypothesis that class performance in CivPro is positively associated with the UBE score. So students who have a higher grade in CivPro score higher on the bar exam.
# Pair UBE with the numeric LPI grade (grade points mapped from the letter grade).
df_e<-data.frame(df$UBE,df$LPI_Num)
# Correlation matrix using, for each pair, the rows where both values are present.
cor(df_e,use = "pairwise.complete.obs")
## df.UBE df.LPI_Num
## df.UBE 1.0000000 0.2254762
## df.LPI_Num 0.2254762 1.0000000
# Test whether the correlation differs from 0 (no method given, so Pearson).
cor.test(df$UBE,df$LPI_Num)
##
## Pearson's product-moment correlation
##
## data: df$UBE and df$LPI_Num
## t = 5.6168, df = 589, p-value = 3.002e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1475066 0.3006606
## sample estimates:
## cor
## 0.2254762
The correlation value of 0.22548 means that there is a weak-to-moderate relationship between UBE score and performance in the LPI class. Notice that this correlation is smaller than the prior correlations, including CivPro. The output from cor.test() gives an extremely small p-value of 3.002e-08, which means that the grade in this class is a statistically significant predictor of the UBE score. The t-value is large at 5.6168, which means there is strong evidence for the alternative hypothesis that class performance in LPI is positively associated with the UBE score. So students who have a higher grade in LPI score higher on the bar exam. However, performance in this class does not have as strong an effect as performance in the CivPro class.
# Pair UBE with the numeric LPII grade (grade points mapped from the letter grade).
df_f<-data.frame(df$UBE,df$LPII_Num)
# Correlation matrix using, for each pair, the rows where both values are present.
cor(df_f,use = "pairwise.complete.obs")
## df.UBE df.LPII_Num
## df.UBE 1.0000000 0.2834006
## df.LPII_Num 0.2834006 1.0000000
# Test whether the correlation differs from 0 (no method given, so Pearson).
cor.test(df$UBE,df$LPII_Num)
##
## Pearson's product-moment correlation
##
## data: df$UBE and df$LPII_Num
## t = 6.8799, df = 542, p-value = 1.659e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2041989 0.3589161
## sample estimates:
## cor
## 0.2834006
The correlation value of 0.28340 means that there is a weak-to-moderate relationship between UBE score and performance in the LPII class. Notice that this correlation is smaller than the GPA, class rank and CivPro correlations, although slightly larger than the one for LPI. The output from cor.test() gives an extremely small p-value of 1.659e-11, which means that the grade in this class is a statistically significant predictor of the UBE score. The t-value is large at 6.8799, which means there is strong evidence for the alternative hypothesis that class performance in LPII is positively associated with the UBE score. So students who have a higher grade in LPII score higher on the bar exam. However, performance in this class does not have as strong an effect as performance in the CivPro class.
# Logistic regression (binomial family) of PassFail on first-year GPA,
# final GPA and final rank percentile.
model6<-glm(PassFail~GPA_1L+GPA_Final+FinalRankPercentile,data=df,family=binomial)
# Print the coefficient table and deviance summary.
summary(model6)
##
## Call:
## glm(formula = PassFail ~ GPA_1L + GPA_Final + FinalRankPercentile,
## family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.5103 5.9328 -1.266 0.206
## GPA_1L 0.8172 0.7036 1.161 0.245
## GPA_Final 2.1351 2.1235 1.005 0.315
## FinalRankPercentile 1.9524 2.6021 0.750 0.453
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 384.02 on 591 degrees of freedom
## Residual deviance: 304.01 on 588 degrees of freedom
## (8 observations deleted due to missingness)
## AIC: 312.01
##
## Number of Fisher Scoring iterations: 6
# Analysis-of-deviance table with chi-square tests; terms are added
# sequentially in formula order, so each p-value is conditional on earlier terms.
anova(model6,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: PassFail
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 591 384.02
## GPA_1L 1 62.839 590 321.18 2.243e-15 ***
## GPA_Final 1 16.610 589 304.57 4.590e-05 ***
## FinalRankPercentile 1 0.559 588 304.01 0.4548
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Logistic regression (binomial family) of PassFail on Age and final rank percentile.
model7<-glm(PassFail~Age+FinalRankPercentile,data=df,family=binomial)
# Print the coefficient table and deviance summary.
summary(model7)
##
## Call:
## glm(formula = PassFail ~ Age + FinalRankPercentile, family = binomial,
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.37728 0.93025 2.556 0.0106 *
## Age -0.07446 0.03172 -2.348 0.0189 *
## FinalRankPercentile 5.50743 0.76163 7.231 4.79e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 394.48 on 599 degrees of freedom
## Residual deviance: 307.88 on 597 degrees of freedom
## AIC: 313.88
##
## Number of Fisher Scoring iterations: 6
# Analysis-of-deviance table with chi-square tests; terms are added
# sequentially in formula order (Age first, then FinalRankPercentile).
anova(model7,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: PassFail
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 599 394.48
## Age 1 4.331 598 390.14 0.03743 *
## FinalRankPercentile 1 82.264 597 307.88 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Logistic regression (binomial family) of PassFail on the numeric grade
# points for CivPro, LPII and LPI.
model8<-glm(PassFail~CivPro_Num+LPII_Num+LPI_Num,data=df,family=binomial)
# Print the coefficient table and deviance summary.
summary(model8)
##
## Call:
## glm(formula = PassFail ~ CivPro_Num + LPII_Num + LPI_Num, family = binomial,
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.68136 0.85201 -1.973 0.0485 *
## CivPro_Num 1.19743 0.23509 5.093 3.52e-07 ***
## LPII_Num 0.04673 0.27952 0.167 0.8672
## LPI_Num 0.18428 0.28600 0.644 0.5194
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 338.11 on 541 degrees of freedom
## Residual deviance: 301.92 on 538 degrees of freedom
## (58 observations deleted due to missingness)
## AIC: 309.92
##
## Number of Fisher Scoring iterations: 5
# Analysis-of-deviance table with chi-square tests; terms are added
# sequentially in formula order (CivPro_Num, then LPII_Num, then LPI_Num).
anova(model8,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: PassFail
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 541 338.11
## CivPro_Num 1 35.493 540 302.62 2.559e-09 ***
## LPII_Num 1 0.292 539 302.33 0.5892
## LPI_Num 1 0.412 538 301.92 0.5209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Logistic regression (binomial family) of PassFail on bar prep company,
# fitted on the Barbri/Themis subset.
# NOTE(review): the formula refers to columns as df_total_withtest_prep$...,
# so the variables are taken from that object directly rather than through
# data=, and the coefficient labels carry the data-frame prefix. Bare column
# names (PassFail ~ BarPrepCompany) would be the usual form.
model9<-glm(df_total_withtest_prep$PassFail~df_total_withtest_prep$BarPrepCompany,data=df_total_withtest_prep,family=binomial)
# Print the coefficient table and deviance summary.
summary(model9)
##
## Call:
## glm(formula = df_total_withtest_prep$PassFail ~ df_total_withtest_prep$BarPrepCompany,
## family = binomial, data = df_total_withtest_prep)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 2.272508 0.202123 11.243
## df_total_withtest_prep$BarPrepCompanyThemis -0.006587 0.291553 -0.023
## Pr(>|z|)
## (Intercept) <2e-16 ***
## df_total_withtest_prep$BarPrepCompanyThemis 0.982
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 345.21 on 554 degrees of freedom
## Residual deviance: 345.21 on 553 degrees of freedom
## AIC: 349.21
##
## Number of Fisher Scoring iterations: 5
# Analysis-of-deviance table with a chi-square test for the bar prep company term.
anova(model9,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: df_total_withtest_prep$PassFail
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 554 345.21
## df_total_withtest_prep$BarPrepCompany 1 0.00051033 553 345.21
## Pr(>Chi)
## NULL
## df_total_withtest_prep$BarPrepCompany 0.982
A GLM analysis was done on several combinations of predictors. The predictors were chosen based on the previous correlation analysis. The sequential analysis of deviance shows that the first-year GPA and final GPA are highly significant. Final class rank is not significant once both GPAs are already in the model (p = 0.455), most likely because it carries much the same information as final GPA, but it is highly significant when modeled together with Age only (p < 2e-16). Age is a weaker predictor, but it is still statistically significant at the 5% level (p = 0.019 in the model with final class rank). Among the first-year courses, the grade in CivPro is highly significant, while LPI and LPII add nothing significant once CivPro is in the model. The same result seen in the correlation analysis is seen here also: the CivPro course grade has the biggest effect on UBE passage of all the courses. The choice between Barbri and Themis makes no detectable difference to passage (p = 0.982).
The coefficients for GPA_1L, GPA_Final and FinalRankPercentile are all positive, indicating a positive effect on the odds of passing the UBE. Age's coefficient is negative, indicating that higher age is related to lower passing rates. All the coursework coefficients (CivPro_Num, LPII_Num and LPI_Num) are also positive, indicating that higher grades are associated with a higher pass rate on the UBE.
The actionable recommendations for the law school would be the following:
Full source code for Bar Passage Project
# Read the bar exam data set as a CSV straight from the GitHub source.
df <- read.csv("https://raw.githubusercontent.com/tmatis12/datafiles/refs/heads/main/BarData_2025.csv")

## Data Manipulation #####
# Blank cells become NA *before* any type conversion: once a column has been
# turned into a factor or a number, comparing it with "" cannot match anything.
for (col in c("OptIntoWritingGuide", "BarPrepCompany", "BarPrepCompletion", "MPRE")) {
  df[[col]][which(df[[col]] == "")] <- NA
}

# Numeric measures.
for (col in c("UGPA", "BarPrepCompletion", "MPRE")) {
  df[[col]] <- as.numeric(df[[col]])
}

# Outcome: "F" is the first level, so a binomial glm models the odds of "P".
df$PassFail <- factor(df$PassFail, levels = c("F", "P"))

# Yes/no indicators all share the same level order.
yes_no_cols <- c(
  "Accommodations", "Probation", "LegalAnalysis_TexasPractice",
  "AdvLegalAnalysis", "AdvLegalPerfSkills", "OptIntoWritingGuide"
)
df[yes_no_cols] <- lapply(df[yes_no_cols], factor, levels = c("Y", "N"))

# Bar prep vendors; any value outside this list becomes NA.
df$BarPrepCompany <- factor(
  df$BarPrepCompany,
  levels = c("Helix", "Barbri", "Themis", "JD Advising", "Quimbee", "Kaplan")
)

# Letter grade -> grade points on the 4.0 scale.
# D- is 0.7; it had been typed as .07, which placed a D- almost at an F.
grademap <- c(
  "A" = 4.0, "A-" = 3.7, "B+" = 3.3, "B" = 3.0, "B-" = 2.7,
  "C+" = 2.3, "C" = 2.0, "C-" = 1.7, "D+" = 1.3, "D" = 1.0, "D-" = 0.7,
  "F" = 0
)

# Look grades up by their text (as.character() guards against a factor being
# indexed by its integer codes) and drop the names the lookup leaves behind.
# Grades that are missing or not in the map come out as NA.
df$CivPro_Num <- unname(grademap[as.character(df$CivPro)])
df$LPI_Num <- unname(grademap[as.character(df$LPI)])
df$LPII_Num <- unname(grademap[as.character(df$LPII)])
# Break the data set into characteristic groups.
# Some of these may not be used in the analysis, but they are available if needed.
# subset() is used throughout so that a missing value in a condition drops the
# row instead of producing an all-NA row, as df[cond, ] would.

# Student Success Initiative: any value other than "N" counts as participation.
df_student_success <- subset(df, StudentSuccessInitiative != "N")
df_no_student_success <- subset(df, StudentSuccessInitiative == "N")

# Bar prep mentor, yes or no.
df_bar_prep_mentor <- subset(df, BarPrepMentor != "N")
df_no_bar_prep_mentor <- subset(df, BarPrepMentor == "N")

# Success seminar participants, split by pass / fail.
df_pass_success_seminar <- subset(df, StudentSuccessInitiative != "N" & PassFail == "P")
df_fail_success_seminar <- subset(df, StudentSuccessInitiative != "N" & PassFail == "F")

# Success seminar non-participants, split by pass / fail.
df_pass_no_success_seminar <- subset(df, StudentSuccessInitiative == "N" & PassFail == "P")
df_fail_no_success_seminar <- subset(df, StudentSuccessInitiative == "N" & PassFail == "F")

# Bar prep classes: only Barbri and Themis, since they cover about 90% of students.
df_total_withtest_prep <- subset(df, BarPrepCompany %in% c("Barbri", "Themis"))

# Completed at least 50% of the prep course.
df_total_withtest_prep_used <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion >= 0.50
)

# Completed less than 50% of the prep course. The comparison is strict so a
# student at exactly 50% is not counted in both groups.
df_total_withtest_prep_underutil <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion < 0.50
)

# Completion group crossed with pass / fail.
df_pass_withtest_prep <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion >= 0.50 & PassFail == "P"
)
df_pass_lesstest_prep <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion < 0.50 & PassFail == "P"
)
df_fail_withtest_prep <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion >= 0.50 & PassFail == "F"
)
df_fail_lesstest_prep <- subset(
  df,
  BarPrepCompany %in% c("Barbri", "Themis") & BarPrepCompletion < 0.50 & PassFail == "F"
)

### Subset DataFrame by PassFail #####
df_pass <- subset(df, PassFail == "P") # students who passed
df_fail <- subset(df, PassFail == "F") # students who failed
# Pie chart of the overall pass / fail split.
pass_val <- nrow(df_pass)
fail_val <- nrow(df_fail)

# Combine pass & fail counts into one vector and label the slices.
vals <- c(pass_val, fail_val)
labs <- c("PASS", "FAIL")

# Colors: green for pass, red for fail.
cols <- c("green4", "firebrick1")

# Build the title from the computed counts so it cannot drift from the data.
pie(
  vals,
  labels = labs,
  col = cols,
  main = sprintf("Pass & Fail Breakdown: Pass(%d) Fail(%d)", pass_val, fail_val)
)
# Exploratory boxplots of UBE score, drawn before any formal analysis.
# Each plot takes a named list; the list names become the group labels.

# Passers vs failers.
ube_by_outcome <- list(Pass = df_pass$UBE, Fail = df_fail$UBE)
boxplot(
  ube_by_outcome,
  col = c("green4", "red2"),
  ylab = "UBE Score",
  xlab = "Pass/Fail of Exam",
  main = "UBE Scores by Pass/Fail"
)

# Whole cohort vs the Barbri/Themis test-prep subset.
ube_all_vs_prep <- list(
  "All" = df$UBE,
  "With Test Prep" = df_total_withtest_prep$UBE
)
boxplot(
  ube_all_vs_prep,
  col = c("gold", "darkblue"),
  ylab = "UBE Score",
  xlab = "Effect of Test Prep",
  main = "BoxPlot of UBE Scores of All & Test Prep"
)

# Test-prep students split by the two completion groups.
ube_by_usage <- list(
  ">50%" = df_total_withtest_prep_used$UBE,
  "<50%" = df_total_withtest_prep_underutil$UBE
)
boxplot(
  ube_by_usage,
  col = c("deepskyblue", "tomato"),
  ylab = "UBE Score",
  xlab = "Effect of Time using Test Prep",
  main = "UBE Scores Based on Usage of Test Prep"
)

# Whole cohort vs Student Success participants.
ube_all_vs_success <- list(
  "All" = df$UBE,
  "Student Success" = df_student_success$UBE
)
boxplot(
  ube_all_vs_success,
  col = c("palegreen", "coral"),
  ylab = "UBE Score",
  xlab = "Effect of Involvement with Student Success",
  main = "UBE Scores Based on Student Success Seminar"
)
### Pairwise Correlations ###
# Look at the correlation of several numeric variables with the UBE score to
# decide which ones to use in the analysis. cor.test() is run on each pairing
# to get an idea of what to expect from anova, glm or lm.
# data.frame() names the columns after the expressions, e.g. df.UBE.

# First-year GPA.
df_a <- data.frame(df$UBE, df$GPA_1L)
cor(df_a, use = "pairwise.complete.obs")
cor.test(df$UBE, df$GPA_1L)

# Final GPA.
df_b <- data.frame(df$UBE, df$GPA_Final)
cor(df_b, use = "pairwise.complete.obs")
cor.test(df$UBE, df$GPA_Final)

# Final rank percentile: a rank ordering, so Spearman is used for both the
# coefficient and the test (the test previously fell back to Pearson).
df_c <- data.frame(df$UBE, df$FinalRankPercentile)
cor(df_c, use = "pairwise.complete.obs", method = "spearman")
# The column is called df.UBE; df_c$UBE does not exist and printed NULL.
str(df_c$df.UBE)
# exact = FALSE: an exact p-value cannot be computed when ranks are tied.
cor.test(df$UBE, df$FinalRankPercentile, method = "spearman", exact = FALSE)

# Civil Procedure grade points.
df_d <- data.frame(df$UBE, df$CivPro_Num)
cor(df_d, use = "pairwise.complete.obs")
cor.test(df$UBE, df$CivPro_Num)

# Legal Practice I grade points.
df_e <- data.frame(df$UBE, df$LPI_Num)
cor(df_e, use = "pairwise.complete.obs")
cor.test(df$UBE, df$LPI_Num)

# Legal Practice II grade points.
df_f <- data.frame(df$UBE, df$LPII_Num)
cor(df_f, use = "pairwise.complete.obs")
cor.test(df$UBE, df$LPII_Num)
### Admission Criterion ####
#
# #Using PassFail as the response variable
# model3<-glm(PassFail~LSAT*UGPA,data=df,family=binomial)
# anova(model3,test="Chisq")
# model4<-glm(PassFail~LSAT+UGPA,data=df,family=binomial)
# summary(model4)
# anova(model4,test="Chisq")
# 1-pchisq(394.26-370.24,2) #chi-sq probability of model significance
# 1-370.24/394.26 #pseudo R^2
# Academic-performance models. Each one is a logistic regression (binomial
# family) of PassFail. For every fit we print the coefficient summary, draw
# the first two diagnostic plots, and print the sequential analysis-of-deviance
# table with chi-square tests.

# Model 5: first-year GPA and final GPA.
model5 <- glm(
  PassFail ~ GPA_1L + GPA_Final,
  family = binomial,
  data = df
)
summary(model5)
plot(model5, which = 1) # residuals vs fitted (constant-variance check)
plot(model5, which = 2) # normal probability plot
anova(model5, test = "Chisq")

# Model 6: model 5 plus final rank percentile.
model6 <- glm(
  PassFail ~ GPA_1L + GPA_Final + FinalRankPercentile,
  family = binomial,
  data = df
)
summary(model6)
plot(model6, which = 1) # residuals vs fitted (constant-variance check)
plot(model6, which = 2) # normal probability plot
anova(model6, test = "Chisq")

# Model 7: age and final rank percentile.
model7 <- glm(
  PassFail ~ Age + FinalRankPercentile,
  family = binomial,
  data = df
)
summary(model7)
plot(model7, which = 1) # residuals vs fitted (constant-variance check)
plot(model7, which = 2) # normal probability plot
anova(model7, test = "Chisq")

# Model 8: numeric grade points for CivPro, LPII and LPI.
model8 <- glm(
  PassFail ~ CivPro_Num + LPII_Num + LPI_Num,
  family = binomial,
  data = df
)
summary(model8)
plot(model8, which = 1) # residuals vs fitted (constant-variance check)
plot(model8, which = 2) # normal probability plot
anova(model8, test = "Chisq")
# Intervention models. The formulas use bare column names resolved through
# data=. Writing frame$column inside a formula bypasses data=, breaks
# predict() on new data, and clutters the coefficient labels with the
# data-frame prefix.

# Model 9: does the bar prep company (Barbri vs Themis) relate to passage?
model9 <- glm(PassFail ~ BarPrepCompany, data = df_total_withtest_prep, family = binomial)
summary(model9)
plot(model9, 1) # residuals vs fitted (constant-variance check)
plot(model9, 2) # normal probability plot
anova(model9, test = "Chisq")
table(df_total_withtest_prep$PassFail)
table(df_total_withtest_prep$BarPrepCompany)

# Model 10: among Student Success participants, does the recorded
# StudentSuccessInitiative value relate to passage?
model10 <- glm(PassFail ~ StudentSuccessInitiative, data = df_student_success, family = binomial)
summary(model10)
plot(model10, 1) # residuals vs fitted (constant-variance check)
plot(model10, 2) # normal probability plot
anova(model10, test = "Chisq")
table(df_student_success$PassFail)
table(df_student_success$StudentSuccessInitiative)

# Collapse StudentSuccessInitiative to a two-level indicator:
# "N" stays "N", every other value becomes "Y".
df$StudentSuccessInitiative_MOD <- ifelse(df$StudentSuccessInitiative == "N", "N", "Y")
df$StudentSuccessInitiative_MOD <- factor(df$StudentSuccessInitiative_MOD, levels = c("N", "Y"))
table(df$StudentSuccessInitiative_MOD)

# Model 11: participation (yes/no) against passage, on the full data set.
model11 <- glm(PassFail ~ StudentSuccessInitiative_MOD, data = df, family = binomial)
summary(model11)
plot(model11, 1) # residuals vs fitted (constant-variance check)
plot(model11, 2) # normal probability plot
anova(model11, test = "Chisq")