# Libraries
pacman::p_load(haven, estimatr, texreg, janitor, tidyverse, skimr, compareGroups)
# Read the analysis data; each row is one student-course observation
df <- readr::read_rds("df.rds")
# Student-level slice: collapse the student-course data to one row per
# student (stdid). Used for descriptives, overlaps, and dropout/switching
# analyses.
#   * Student-level fields take the value found on the student's first row.
#   * fem_share is the student's mean of fac_female over their rows, times 10.
#   * has_* flags are 0/1 integers marking which data sources are non-missing.
df_stu <- df %>%
  group_by(stdid) %>%
  summarise(
    across(c(female, univcode, department_id), first),
    fem_share = 10 * mean(fac_female, na.rm = TRUE), # ×10, as in Eq. 3
    across(
      c(
        father_college, mother_college, reservation_stu,
        # baseline psych & test
        b_math_anxiety, b_math_confidence,
        b_math_g1_score, b_physics_g1_score,
        b_stay_branch,
        # endline psych, test, and longer-run outcomes
        e_math_anxiety, e_math_confidence,
        e_math_g3_score, e_physics_g3_score,
        e_dropped_out, e_switchmajor, e_attend_grad_school,
        e_college_satisfied, e_stay_branch, e_salary_expected,
        e_stem_belonging_z
      ),
      first
    ),
    .groups = "drop"
  ) %>%
  mutate(
    # every student in this slice appears in the administrative data
    has_admin = 1L,
    # a test/psych flag is 1 unless BOTH of its component measures are missing
    has_b_test = as.integer(!(is.na(b_math_g1_score) & is.na(b_physics_g1_score))),
    has_e_test = as.integer(!(is.na(e_math_g3_score) & is.na(e_physics_g3_score))),
    has_b_psych = as.integer(!(is.na(b_math_anxiety) & is.na(b_math_confidence))),
    has_e_psych = as.integer(!(is.na(e_math_anxiety) & is.na(e_math_confidence)))
  )
We thank the reviewer for asking us to clarify how students entered the survey sample. All survey respondents are drawn from the same 11 colleges and same first-year cohorts in computer science and electrical engineering programs used in the main administrative analysis. The baseline survey was administered in person at program entry, and the endline survey was administered in person approximately two years later. Participation rates were approximately 95 percent in both survey rounds, with no meaningful difference by gender (female students: [X]% baseline, [X]% endline; male students: [X]% baseline, [X]% endline).
The standardized assessment in mathematics and science was administered to a randomly selected half of first-year students within each college and program. This 50% sub-sampling, which is part of the broader assessment design described in Loyalka et al. (2021), was performed before students participated in these courses and is independent of the departments’ randomized assignment of students to sections within each course. The psychological survey items used for STEM-related anxiety and confidence were administered to the same randomly selected half of students at the same time as the standardized math (or science) assessment. For full context, the other half of students were administered higher-order thinking skills assessments, the details of which can be found in Loyalka et al. (2021). To make the overlap between samples clear, we have added Table X, which reports sample composition by analytic stage.
# Student-level slice for overlap counts.
# Rows with a missing `female` are removed first, then the data are collapsed
# to one row per student (stdid) using each field's first-row value.
# NOTE(review): the psych flags below look at math anxiety only, whereas the
# student-level slice defined earlier in this file also counts math
# confidence -- confirm which definition is intended.
df_stu <- df %>%
  drop_na(female) %>%
  group_by(stdid) %>%
  summarise(
    across(
      c(
        female, univcode, department_id,
        b_math_g1_score, b_physics_g1_score,
        e_math_g3_score, e_physics_g3_score,
        b_math_anxiety, e_math_anxiety
      ),
      first
    ),
    .groups = "drop"
  ) %>%
  mutate(
    # every retained student is in the administrative sample
    has_admin = 1L,
    # tested = at least one of the math / physics scores is present
    has_b_test = as.integer(!(is.na(b_math_g1_score) & is.na(b_physics_g1_score))),
    has_e_test = as.integer(!(is.na(e_math_g3_score) & is.na(e_physics_g3_score))),
    has_b_psych = as.integer(!is.na(b_math_anxiety)),
    has_e_psych = as.integer(!is.na(e_math_anxiety)),
    # product of two 0/1 integer flags: 1 only when both rounds were taken
    has_both_test = has_b_test * has_e_test
  )
# Sample composition - overall
# Rename each 0/1 availability flag to its table label, total every flag over
# students, then stack the totals into a two-column (Sample, All students)
# table.
overlap_overall <- df_stu %>%
  drop_na(female) %>%
  select(
    `Sample used for course-grade analysis` = has_admin,
    `Baseline standardized assessment` = has_b_test,
    `Endline standardized assessment` = has_e_test,
    `Both baseline and endline assessment` = has_both_test,
    `Baseline psychological survey` = has_b_psych,
    `Endline psychological survey` = has_e_psych
  ) %>%
  summarise(across(everything(), sum)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Sample",
    values_to = "All students"
  )
# Sample composition - by gender
# Same flag totals as the overall table, computed within each value of
# `female`, then reshaped so each gender becomes its own column.
overlap_by_gender <- df_stu %>%
  drop_na(female) %>%
  select(
    female,
    `Sample used for course-grade analysis` = has_admin,
    `Baseline standardized assessment` = has_b_test,
    `Endline standardized assessment` = has_e_test,
    `Both baseline and endline assessment` = has_both_test,
    `Baseline psychological survey` = has_b_psych,
    `Endline psychological survey` = has_e_psych
  ) %>%
  group_by(female) %>%
  summarise(across(everything(), sum), .groups = "drop") %>%
  pivot_longer(
    cols = -female,
    names_to = "Sample",
    values_to = "n"
  ) %>%
  # label the gender indicator: 1 -> "Female", anything else -> "Male"
  mutate(female = ifelse(female == 1, "Female", "Male")) %>%
  pivot_wider(names_from = female, values_from = n)
# Participation rates (for the response paragraph)
# By gender: the number of students in the administrative sample and the
# percentage of them (one decimal place) with a baseline test, an endline
# test, and both tests.
participation_rate <- df_stu %>%
  drop_na(female) %>%
  group_by(female) %>%
  summarise(
    n_admin = sum(has_admin),
    # n_admin (computed just above) is the denominator for every rate
    pct_b_test = round(100 * sum(has_b_test) / n_admin, 1),
    pct_e_test = round(100 * sum(has_e_test) / n_admin, 1),
    pct_both_test = round(100 * sum(has_both_test) / n_admin, 1),
    .groups = "drop"
  ) %>%
  mutate(female = ifelse(female == 1, "Female", "Male"))
# Combine the overall and by-gender counts on the sample label and keep only
# the first four rows (the psychological-survey rows are not shown).
table_e18 <- overlap_overall %>%
  left_join(overlap_by_gender, by = "Sample") %>%
  slice_head(n = 4)
knitr::kable(table_e18)
| Sample | All students | Male | Female |
|---|---|---|---|
| Sample used for course-grade analysis | 1793 | 1216 | 577 |
| Baseline standardized assessment | 827 | 557 | 270 |
| Endline standardized assessment | 892 | 605 | 287 |
| Both baseline and endline assessment | 807 | 542 | 265 |
We have also placed the full survey instrument for the psychological scale items in the online appendix at https://rpubs.com/saurabhkhanna/a-stem-prpofessor-like-me.
We thank the reviewer for this constructive suggestion. We implement a permutation-based test of within-course random assignment. The test holds the existing classroom structure fixed and randomly reshuffles students across classrooms within each course-college-semester stratum, which mirrors the colleges’ assignment procedure. For example, in a stratum consisting of a given course offered in a given college and semester, enrolled students are randomly redistributed across the existing classrooms in that stratum; students are not shuffled across courses, colleges, or semesters. We repeated this procedure B = 1,000 times to construct, for each baseline characteristic, the distribution of mean differences between female- and male-taught classrooms that would be expected under random assignment. We tested the same baseline characteristics used in Table E2: baseline standardized mathematics and science scores, baseline STEM-related anxiety, baseline STEM-related confidence, father’s college attendance, mother’s college attendance, and eligibility for caste-based affirmative action. The test was restricted to strata in which both female- and male-taught classrooms exist, since strata with a single faculty gender contribute no information to the comparison.
# Fix the RNG seed so the permutation draws are reproducible
set.seed(20260519)
# Number of permutation draws
B <- 1000
# Baseline covariates to test (same set as Table E2)
covars <- c(
  "b_math_g1_score",
  "b_physics_g1_score",
  "b_math_anxiety",
  "b_math_confidence",
  "father_college",
  "mother_college",
  "reservation_stu"
)
# Build the permutation-test sample.
# Step 1: define the "course" stratum -- a course offered at a college in a
# semester -- as a single "::"-separated key.
df_perm <- df %>%
  mutate(stratum = paste(course_name, univcode, semester_clean, sep = "::"))
# Step 2: standardize each baseline covariate (mean 0, SD 1) so that all
# observed/permuted differences are expressed in SD units. The scaling uses
# every row of `df`, i.e. it is done BEFORE the stratum filter below.
df_perm <- df_perm %>%
  mutate(across(all_of(covars),
                ~ as.numeric(scale(.x, center = TRUE, scale = TRUE))))
# Step 3: keep only strata that have BOTH female- and male-taught classrooms
# (strata with only one faculty gender give no within-stratum variation to
# shuffle). Missing fac_female values are ignored when counting genders:
# without na.rm = TRUE, n_distinct() treats NA as a distinct value, so a
# stratum with one observed gender plus an NA would wrongly qualify.
strata_ok <- df_perm %>%
  group_by(stratum) %>%
  summarise(
    both_genders = n_distinct(fac_female, na.rm = TRUE) == 2,
    .groups = "drop"
  ) %>%
  filter(both_genders) %>%
  pull(stratum)
df_perm <- df_perm %>% filter(stratum %in% strata_ok)
cat("Strata with both faculty genders:", length(strata_ok), "\n")
## Strata with both faculty genders: 136
cat("Total student-course observations used in permutation test:", nrow(df_perm), "\n\n")
## Total student-course observations used in permutation test: 16390
# Observed imbalance: for every baseline covariate, the mean among rows with
# fac_female == 1 minus the mean among rows with fac_female == 0, ignoring
# missing covariate values. The result is a numeric vector named by covariate.
obs_diff <- vapply(
  covars,
  function(v) {
    x <- df_perm[[v]]
    female_taught_mean <- mean(x[df_perm$fac_female == 1], na.rm = TRUE)
    male_taught_mean <- mean(x[df_perm$fac_female == 0], na.rm = TRUE)
    female_taught_mean - male_taught_mean
  },
  numeric(1)
)
# Null distribution: in each of B draws, reshuffle the fac_female labels
# among the rows of every stratum (labels never move across strata), then
# recompute the female- minus male-taught mean difference for each covariate.
# This mimics what we'd expect if students were truly randomly assigned to
# classrooms (each taught by faculty of either gender) within that course.
perm_diff <- replicate(B, {
  # permuted labels, returned in the original row order of df_perm
  shuffled <- df_perm %>%
    group_by(stratum) %>%
    mutate(fac_female_perm = sample(fac_female)) %>%
    pull(fac_female_perm)
  in_female <- shuffled == 1
  in_male <- shuffled == 0
  vapply(
    covars,
    function(v) {
      x <- df_perm[[v]]
      mean(x[in_female], na.rm = TRUE) - mean(x[in_male], na.rm = TRUE)
    },
    numeric(1)
  )
})
# perm_diff is a (covariates x B) matrix
# Two-sided permutation p-value for each covariate.
# The statistic is a pooled difference in means while labels are shuffled only
# within strata, so its permutation distribution need not be centred at zero.
# Comparing |permuted| with |observed| would then measure distance from zero
# rather than distance from what random assignment produces. Extremeness is
# therefore measured as the deviation from the permutation mean:
#   how often is |perm - mean(perm)| at least as large as |obs - mean(perm)|?
# The p-value uses the (1 + b) / (1 + m) Monte Carlo estimator, which counts
# the observed assignment as one of the draws and can never be exactly zero.
# Draws with a missing difference are dropped before counting.
p_perm <- vapply(seq_along(covars), function(i) {
  draws <- perm_diff[i, ]
  draws <- draws[!is.na(draws)]
  null_center <- mean(draws)
  n_extreme <- sum(abs(draws - null_center) >= abs(obs_diff[[i]] - null_center))
  (1 + n_extreme) / (1 + length(draws))
}, numeric(1))
# Per-covariate summary: observed difference (4 d.p.), permutation p-value
# (3 d.p.), and the number of permutation draws, rendered as a captioned table.
balance_perm_tbl <- tibble(
  Covariate = covars,
  `Observed diff (F - M)` = round(obs_diff, digits = 4),
  `Permutation p-value` = round(p_perm, digits = 3),
  Permutations = B
)
knitr::kable(
  balance_perm_tbl,
  caption = "Permutation-based balance test of random assignment"
)
| Covariate | Observed diff (F - M) | Permutation p-value | Permutations |
|---|---|---|---|
| b_math_g1_score | -0.0765 | 0.745 | 1000 |
| b_physics_g1_score | -0.0191 | 1.000 | 1000 |
| b_math_anxiety | -0.0004 | 0.973 | 1000 |
| b_math_confidence | -0.0070 | 0.873 | 1000 |
| father_college | 0.0568 | 0.891 | 1000 |
| mother_college | 0.0670 | 0.764 | 1000 |
| reservation_stu | -0.0451 | 0.575 | 1000 |
# Worst-case (max absolute difference) observed across all covariates vs.
# the corresponding max in each permutation. A "joint" check.
# worst_obs / worst_perm keep the raw max |diff| (observed, and per draw).
worst_obs <- max(abs(obs_diff), na.rm = TRUE)
worst_perm <- apply(abs(perm_diff), 2, max, na.rm = TRUE)
# The joint p-value is computed on deviations from each covariate's
# permutation mean, because the within-stratum permutation distribution of a
# pooled mean difference need not be centred at zero; comparing raw |diff|
# values would measure distance from zero instead of distance from what
# random assignment produces.
# perm_center has one entry per covariate (row); subtracting it from the
# (covariates x B) matrix recycles down the columns, i.e. centres each row.
perm_center <- rowMeans(perm_diff, na.rm = TRUE)
worst_obs_dev <- max(abs(obs_diff - perm_center), na.rm = TRUE)
worst_perm_dev <- apply(abs(perm_diff - perm_center), 2, max, na.rm = TRUE)
# (1 + b) / (1 + m): counts the observed assignment as one draw, never zero
joint_p <- (1 + sum(worst_perm_dev >= worst_obs_dev)) / (1 + length(worst_perm_dev))
# Print the joint-test summary to the console: a header line, the observed
# max |diff| (worst_obs, rounded to 4 d.p.), and the joint p-value (joint_p,
# rounded to 3 d.p.). The `##` lines are the output recorded when the
# document was rendered.
cat("Joint test (max |diff| across all covariates):\n")
## Joint test (max |diff| across all covariates):
cat(" Observed max |diff|:", round(worst_obs, 4), "\n")
## Observed max |diff|: 0.0765
cat(" Joint p-value :", round(joint_p, 3), "\n")
## Joint p-value : 0.911
# Long-format permutation draws for plotting: one row per (draw, covariate)
perm_long <- t(perm_diff) %>%
  as_tibble() %>%
  setNames(covars) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Covariate",
    values_to = "diff"
  )
# Observed difference per covariate, drawn as a reference line in each panel
obs_df <- tibble(Covariate = covars, obs = obs_diff)
# One histogram panel per covariate (free axes) with the observed value in red
ggplot(data = perm_long, mapping = aes(x = diff)) +
  geom_histogram(bins = 30, fill = "grey75", colour = "white") +
  geom_vline(
    data = obs_df,
    mapping = aes(xintercept = obs),
    colour = "red",
    linewidth = 0.7
  ) +
  facet_wrap(vars(Covariate), scales = "free", ncol = 3) +
  theme_minimal(base_size = 10) +
  labs(
    x = "Mean difference (female- vs male-taught classrooms) under permutation null",
    y = "Frequency",
    caption = paste0(
      "Red line = observed difference. Null distribution built from ",
      B, " within-stratum permutations."
    )
  )
The following items were administered separately for mathematics and physics (science) at baseline (program entry) and endline (approximately two years later). Items were adapted from the OECD PISA Mathematics Anxiety and Mathematics Self-Concept scales, with parallel wording for physics.
Response scale (all items): 1 = strongly agree, 2 = agree, 3 = disagree, 4 = strongly disagree.
“We are interested in your subjective experience in maths. Please indicate whether or not you agree with the following statements.”
“We are interested in your subjective experience in physics. Please indicate whether or not you agree with the following statements.”