library(pacman); p_load(ggplot2, dplyr, tidyverse)
# Cohen's d function
Cohensd <- function(M1, M2, SD1, SD2, N1 = 1, N2 = 1, rnd = 3){
SDP = sqrt((SD1^2 + SD2^2)/2)
SDPW = sqrt((((N1 - 1) * SD1^2) + ((N2 - 1) * SD2^2))/(N1 + N2))
d = (M2 - M1)/SDP
delta = (M2 - M1)/SD1
g = (M2 - M1)/SDPW
if (N1 & N2 <= 1) {cat(paste0("With group means of ", M1, " and ", M2," with SDs of ", SD1, " and ", SD2, ", Cohen's d is ", round(d, rnd), " Glass' Delta is ", round(delta, rnd), ". \n"))} else {cat(paste0("With group means of ", M1, " and ", M2," with SDs of ", SD1, " and ", SD2, " Cohen's d is ", round(d, rnd), " Glass' Delta is ", round(delta, rnd), " and Hedge's g is ", round(g, rnd), ". \n"))}}
The LSAT has recently been discussed in the context of affirmative action and diversity in admissions. Since it is the only unbiased part of law school applications, it is critically important. But, it is also important to understand the relevant gaps.
My data for the LSAT is derive from the three most recent available technical reports with regional, gender, and racial and ethnic breakdowns. Archive links are provided at the end.
LSATData <- data.frame(
Year = c(2006, 2007, #Report 1
2008, 2009, 2010, 2011, #Report 2
2012, 2013, 2014, 2015, 2016, 2017, 2018, #Report 3; Asian; end of range used, so if year was listed as "2005-2006", 2006 would be used. Only adequately represented and well-defined groups were used.
2006, 2007,
2008, 2009, 2010, 2011,
2012, 2013, 2014, 2015, 2016, 2017, 2018, #Black
2006, 2007,
2008, 2009, 2010, 2011,
2012, 2013, 2014, 2015, 2016, 2017, 2018, #Hispanic
2006, 2007,
2008, 2009, 2010, 2011,
2012, 2013, 2014, 2015, 2016, 2017, 2018 #White
),
Race = c("Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian",
"Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black",
"Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispanic",
"White", "White", "White", "White", "White", "White", "White", "White", "White", "White", "White", "White", "White"),
Mean = c(152.06, 152.11,
152.04, 152.02, 152.37, 152.36,
152.67, 152.21, 152.63, 152.42, 152.33, 152.72, 152.85,
142.31, 142.22,
142.16, 142.26, 142.05, 141.87,
141.83, 141.98, 141.76, 141.82, 141.73, 141.68, 141.70,
146.46, 146.40,
146.34, 146.58, 146.44, 146.25,
146.34, 146.01, 145.71, 145.92, 145.80, 145.90, 145.84,
152.71, 152.90,
152.57, 152.88, 152.86, 152.77,
152.80, 152.77, 152.75, 152.76, 152.81, 152.82, 153.18), #For mean and SD, most recent data was used so if "2011-2012" numbers were reported in a later report, it was used instead of the same data provided in an earlier report
SD = c(10.04, 10.13,
10.00, 10.15, 10.74, 10.42,
10.52, 10.59, 10.84, 10.43, 10.61, 10.49, 10.91,
8.39, 8.63,
8.40, 8.51, 8.74, 8.64,
8.68, 8.60, 8.97, 8.67, 8.88, 8.78, 8.97,
9.40, 9.45,
9.27, 9.50, 9.65, 9.28,
9.26, 9.07, 9.44, 9.01, 9.18, 9.13, 9.31,
10.20, 9.31,
8.96, 8.95, 9.33, 9.17,
9.27, 9.06, 9.39, 9.15, 9.40, 9.05, 9.27),
N = c(8976, 9109,
9070, 9728, 10749, 9260,
7522, 6414, 6147, 5734, 5800, 5991, 7043,
11288, 11844,
12184, 13253, 14618, 13524,
11473, 9839, 9273, 9114, 9309, 9778, 10997,
5588, 5972,
6324, 6933, 9296, 8179,
7236, 6611, 6225, 6445, 6919, 6978, 8056,
72700, 70249,
70143, 73540, 80108, 69321,
57149, 47819, 42064, 40591, 41778, 42455, 48608))
ggplot(LSATData, aes(x = Year, y = Mean, color = Race)) +
geom_point(size = 1.5) +
geom_line(linewidth = .75) +
geom_errorbar(aes(
ymin = Mean - (1.96 * SD/sqrt(N)),
ymax = Mean + (1.96 * SD/sqrt(N))),
size = .75, width = .2) +
scale_colour_personal_d("PaletteOne") +
lims(
y = c(138, 155)) +
labs(
y = "", #Mean LSAT Score
title = "LSAT Means by Race between 2006 and 2018") +
theme(
legend.position = c(.047, .067),
legend.title = element_blank(),
legend.margin = margin(t = -5, r = 1, l = 1, b = 1),
legend.key = element_rect(fill = NA)) +
scale_x_continuous("", labels = as.character(LSATData$Year), breaks = LSATData$Year) +
my_theme()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
The Hispanic-White gap and Black-White gaps in 2006 and 2018, respectively, were
Cohensd(152.71, 142.31, 10.20, 8.39, 72700, 11288)
## With group means of 152.71 and 142.31 with SDs of 10.2 and 8.39 Cohen's d is -1.114 Glass' Delta is -1.02 and Hedge's g is -1.043.
Cohensd(153.18, 141.70, 9.27, 8.97, 48608, 10997)
## With group means of 153.18 and 141.7 with SDs of 9.27 and 8.97 Cohen's d is -1.259 Glass' Delta is -1.238 and Hedge's g is -1.246.
Cohensd(152.71, 146.46, 10.20, 9.40, 72700, 5588)
## With group means of 152.71 and 146.46 with SDs of 10.2 and 9.4 Cohen's d is -0.637 Glass' Delta is -0.613 and Hedge's g is -0.616.
Cohensd(153.18, 145.84, 9.27, 9.31, 48608, 8056)
## With group means of 153.18 and 145.84 with SDs of 9.27 and 9.31 Cohen's d is -0.79 Glass' Delta is -0.792 and Hedge's g is -0.791.
Or in other words, pretty much constant. So, let’s aggregate results across years by race and let’s do some simulation of the distributions, ignoring truncation of the LSAT score distribution at 120 or 180 or skewness. Skewness doesn’t seem to be an issue, nor does truncation in the empirical plots, but they will have differential relevance by racial group due to mean differences. For example, for Puerto Ricans, truncation at 120 would have a major effect in the 2011-2012 data (see first, second, and third reports, Figure 14). Notably, there is a bit of an impact of this for the Black group, although the extent to which the other end is an issue is far more muted. In other words, very low scores are more common than very high ones.
GroupedData <- LSATData %>% #Aggregation across years
group_by(Race) %>%
summarize(weighted_mean = round(weighted.mean(Mean, N), digits = 3),
weighted_sd = round(weighted.mean(SD, N), digits = 3),
total_N = sum(N),
Threshold170 = round(1 - pnorm(170, mean = weighted_mean, sd = weighted_sd), digits = 5),
N170 = round(Threshold170 * total_N),
.groups = "drop"); GroupedData
set.seed(1)
SimulatedData <- GroupedData %>% #Simulate to plot distributions, assuming normality, which is observed in the reports
mutate(SimulatedValues = map2(weighted_mean,
weighted_sd,
~ rnorm(total_N[1], mean = .x, sd = .y))) %>%
unnest(SimulatedValues)
SimDataSummary <- SimulatedData %>% #Get exact confidence intervals from simulated data for a bar plot with error bars
group_by(Race) %>%
summarize(N = n(),
N170 = sum(SimulatedValues > 170),
.groups = "drop") %>%
mutate(CI = purrr::map2(N,
N170,
~ binom.test(.y, .x)$conf.int)) %>% #CIs for proportions above threshold (170)
tidyr::unnest_wider(CI, names_sep = "_") %>%
rename(ymin = CI_1,
ymax = CI_2)
Here are violin plots for the LSAT score distribution with a dashed line at 170, the lowest median for an elite law school. See:
ggplot(SimulatedData,
aes(x = SimulatedValues,
y = Race,
fill = Race)) +
geom_violin(color = "black", linewidth = 1) +
geom_vline(xintercept = 170, linetype = "dashed", linewidth = .73, color = "gray50") +
scale_fill_personal_d("PaletteOne") +
labs(
x = "",
y = "",
title = "LSAT Score Distributions by Race",
caption = "Plot Based on all LSAT Data from 2006-2018") +
theme(
legend.position = "none",
axis.title.x = element_text(vjust = .5)) +
my_theme()
And here are the numbers for each group with scores beyond 170.
ggplot(GroupedData,
aes(x = Race,
y = N170,
fill = Race)) +
geom_col(color = "black", linewidth = 1) +
scale_y_continuous(expand = c(0, 0)) +
labs(
x = "",
y = "",
title = "Estimated Numbers of People in Each Group with Scores Above 170",
caption = "Plot Based on all LSAT Data from 2006-2018") +
scale_fill_personal_d("PaletteOne") +
theme(
legend.position = "none") +
my_theme()
2005-2006 through 2011-2012: https://web.archive.org/web/20230315031150/https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=919c559897c97500076b540f7c7482d30a8e008c
2007-2008 through 2013-2014: https://web.archive.org/web/20230315031303/https://www.lsac.org/data-research/research/lsat-performance-regional-gender-and-racialethnic-breakdowns-2007-2008
2011-2012 through 2017-2018: https://web.archive.org/web/20230315031200/https://www.lsac.org/data-research/research/lsat-performance-regional-gender-and-racial-and-ethnic-breakdowns-2011-2018
ggplot(SimDataSummary,
aes(x = Race,
y = GroupedData$N170,
fill = Race)) +
geom_col(color = "black", linewidth = 1) +
scale_y_continuous(expand = c(0, 0)) +
geom_errorbar(aes(ymin = ymin * GroupedData$total_N,
ymax = ymax * GroupedData$total_N),
width = 0.2, color = "gray45", linewidth = 1.5) +
labs(
x = "",
y = "",
title = "Estimated Numbers of People in Each Group with Scores Above 170
with a 95% CI based on Simulated Data",
caption = "Plot Based on all LSAT Data from 2006-2018") +
scale_fill_personal_d("PaletteTwo") +
my_theme() +
theme(
legend.position = "none")
Data from https://www.aamc.org/data-reports/students-residents/interactive-data/2022-facts-applicants-and-matriculants-data, and specifically, https://web.archive.org/web/20230302162451/https://www.aamc.org/media/6066/download. There is no sex by race breakout.
The MCAT sections with provided score are the
I will be focusing on the total score, which is the sum of the four sections, each of which is scored from 118 to 132, for a minimum score of 472 and a maximum score of 528. For scores of top medical schools, see here: https://web.archive.org/web/20230316012724/https://ingeniusprep.com/blog/average-mcat-scores/. For the elite threshold, I will be using 520.
ApplicantData <- data.frame(
"Race" = c("Asian", "Black", "Hispanic", "White"),
"Mean" = c(509.2, 497.4, 500.1, 507.9),
"SD" = c(9.2, 10.0, 10.2, 8.5),
"N" = c(12736, 4924, 3257, 22917))
MatriculantData <- data.frame(
"Race" = c("Asian", "Black", "Hispanic", "White"),
"Mean" = c(514.4, 505.7, 506.1, 512.6),
"SD" = c(6.0, 6.4, 7.3, 5.9),
"N" = c(5604, 1856, 1444, 9599))
The Asian-White, Black-White, and Hispanic-White gaps for applicants and then matriculants were
"Applicants"
## [1] "Applicants"
Cohensd(507.9, 509.2, 8.5, 9.2, 22917, 12736) #Asian
## With group means of 507.9 and 509.2 with SDs of 8.5 and 9.2 Cohen's d is 0.147 Glass' Delta is 0.153 and Hedge's g is 0.148.
Cohensd(507.9, 497.4, 8.5, 10.0, 22917, 4924) #Black
## With group means of 507.9 and 497.4 with SDs of 8.5 and 10 Cohen's d is -1.131 Glass' Delta is -1.235 and Hedge's g is -1.195.
Cohensd(507.9, 500.1, 8.5, 10.2, 22917, 3257) #Hispanic
## With group means of 507.9 and 500.1 with SDs of 8.5 and 10.2 Cohen's d is -0.831 Glass' Delta is -0.918 and Hedge's g is -0.894.
"Matriculants"
## [1] "Matriculants"
Cohensd(512.6, 514.4, 5.9, 6.0, 9599, 5604) #Asian
## With group means of 512.6 and 514.4 with SDs of 5.9 and 6 Cohen's d is 0.303 Glass' Delta is 0.305 and Hedge's g is 0.303.
Cohensd(512.6, 505.7, 5.9, 6.4, 9599, 1856) #Black
## With group means of 512.6 and 505.7 with SDs of 5.9 and 6.4 Cohen's d is -1.121 Glass' Delta is -1.169 and Hedge's g is -1.153.
Cohensd(512.6, 506.1, 5.9, 7.3, 9599, 1444) #Hispanic
## With group means of 512.6 and 506.1 with SDs of 5.9 and 7.3 Cohen's d is -0.979 Glass' Delta is -1.102 and Hedge's g is -1.065.
Based on the changes in applicant versus matriculant scores, Whites are favored over Asians (matriculant gap more greatly favors Asians than it does with applicants), Blacks are favored over Whites (matriculant gap is unchanged, when it should be reduced), and favors Hispanics over Whites (matriculant gap is incresaed, when it should be reduced). This is naive though, since it assumes no compensatory race-varying factors or compositional differences, where the first assumption is ten able but the latter is not for Asians because of how broadly that group is defined.
How many people in each group had elite scores in the 2023 testing group?
ApplicantData$Threshold520 = round(1 - pnorm(520, ApplicantData$Mean, ApplicantData$SD), digits = 5)
ApplicantData$N520 = round(ApplicantData$Threshold520 * ApplicantData$N); ApplicantData
MatriculantData$Threshold520 = round(1 - pnorm(520, MatriculantData$Mean, MatriculantData$SD), digits = 5)
MatriculantData$N520 = round(MatriculantData$Threshold520 * MatriculantData$N); MatriculantData
ggplot(ApplicantData,
aes(x = Race,
y = N520,
fill = Race)) +
geom_col(color = "black", linewidth = 1) +
scale_y_continuous(expand = c(0, 0)) +
labs(
x = "",
y = "",
title = "Estimated Numbers of People in Each Applicant Group with Scores Above 520",
caption = "Plot Based on MCAT Data from 2022-2023") +
scale_fill_personal_d("PaletteOne") +
my_theme() +
theme(
legend.position = "none")
ggplot(MatriculantData,
aes(x = Race,
y = N520,
fill = Race)) +
geom_col(color = "black", linewidth = 1) +
scale_y_continuous(expand = c(0, 0)) +
labs(
x = "",
y = "",
title = "Estimated Numbers of People in Each Matriculant Group with Scores Above 520",
caption = "Plot Based on MCAT Data from 2022-2023") +
scale_fill_personal_d("PaletteOne") +
my_theme() +
theme(
legend.position = "none")
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 purrr_1.0.1
## [5] readr_2.1.4 tidyr_1.3.0 tibble_3.1.8 tidyverse_2.0.0
## [9] dplyr_1.1.0 ggplot2_3.4.1 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] highr_0.10 bslib_0.4.2 compiler_4.2.2 pillar_1.8.1
## [5] jquerylib_0.1.4 tools_4.2.2 digest_0.6.31 timechange_0.2.0
## [9] jsonlite_1.8.4 evaluate_0.20 lifecycle_1.0.3 gtable_0.3.1
## [13] pkgconfig_2.0.3 rlang_1.0.6 cli_3.6.0 rstudioapi_0.14
## [17] yaml_2.3.7 xfun_0.37 fastmap_1.1.0 withr_2.5.0
## [21] knitr_1.42 hms_1.1.2 generics_0.1.3 vctrs_0.5.2
## [25] sass_0.4.5 grid_4.2.2 tidyselect_1.2.0 glue_1.6.2
## [29] R6_2.5.1 fansi_1.0.4 rmarkdown_2.20 farver_2.1.1
## [33] tzdb_0.3.0 magrittr_2.0.3 ellipsis_0.3.2 scales_1.2.1
## [37] htmltools_0.5.4 colorspace_2.1-0 labeling_0.4.2 utf8_1.2.3
## [41] stringi_1.7.12 munsell_0.5.0 cachem_1.0.6