knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
library(readr)
library(dplyr)
library(ggplot2)
library(janitor)
library(broom)
if (!dir.exists("figs")) dir.create("figs")
admissions_raw <- read_csv("GradSchool_Admissions.csv")
# Clean column names and recode admit/rank as labeled factors
admissions <- admissions_raw |>
  clean_names() |>
  mutate(
    admit = factor(admit, levels = c(0, 1),
                   labels = c("rejected", "admitted")),
    rank = factor(rank,
                  levels = sort(unique(rank)),
                  labels = paste("Rank", sort(unique(rank))))
  )
glimpse(admissions)
## Rows: 400
## Columns: 4
## $ admit <fct> rejected, admitted, admitted, admitted, rejected, admitted, admi…
## $ gre   <dbl> 380, 660, 800, 640, 520, 760, 560, 400, 540, 700, 800, 440, 760,…
## $ gpa   <dbl> 3.61, 3.67, 4.00, 3.19, 2.93, 3.00, 2.98, 3.08, 3.39, 3.92, 4.00…
## $ rank  <fct> Rank 3, Rank 3, Rank 1, Rank 4, Rank 4, Rank 2, Rank 1, Rank 2, …
summary(admissions)
##        admit          gre             gpa            rank
##  rejected:273   Min.   :220.0   Min.   :2.260   Rank 1: 61
##  admitted:127   1st Qu.:520.0   1st Qu.:3.130   Rank 2:151
##                 Median :580.0   Median :3.395   Rank 3:121
##                 Mean   :587.7   Mean   :3.390   Rank 4: 67
##                 3rd Qu.:660.0   3rd Qu.:3.670
##                 Max.   :800.0   Max.   :4.000
# Admission counts
admissions |>
  count(admit)
## # A tibble: 2 × 2
##   admit        n
##   <fct>    <int>
## 1 rejected   273
## 2 admitted   127
# Admission proportion by rank
admissions |>
  count(rank, admit) |>
  group_by(rank) |>
  mutate(prop = n / sum(n))
## # A tibble: 8 × 4
## # Groups:   rank [4]
##   rank   admit        n  prop
##   <fct>  <fct>    <int> <dbl>
## 1 Rank 1 rejected    28 0.459
## 2 Rank 1 admitted    33 0.541
## 3 Rank 2 rejected    97 0.642
## 4 Rank 2 admitted    54 0.358
## 5 Rank 3 rejected    93 0.769
## 6 Rank 3 admitted    28 0.231
## 7 Rank 4 rejected    55 0.821
## 8 Rank 4 admitted    12 0.179
From the quick summary, most applicants in this dataset were rejected: 273 of 400, or about 68%. Undergraduate rank also looks relevant at this descriptive stage, with admission rates falling steadily from about 54% at Rank 1 institutions to about 18% at Rank 4.
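As a quick check on the overall rate, a minimal sketch that assumes only the admissions data frame built above:
# Overall admission rate; should equal 127/400 = 0.3175
mean(admissions$admit == "admitted")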
# GRE distribution
ggplot(admissions, aes(x = gre)) +
  geom_histogram(bins = 30) +
  labs(title = "Distribution of GRE Scores",
       x = "GRE", y = "Count")
# GPA distribution
ggplot(admissions, aes(x = gpa)) +
  geom_histogram(bins = 30) +
  labs(title = "Distribution of GPA",
       x = "GPA", y = "Count")
GRE scores span a wide range (220 to 800), but most applicants cluster near the middle of the scale, with only a few very high or very low scores.
GPAs are concentrated at the top of the scale: nearly everyone falls between 3.0 and 4.0, so the dataset includes few low-GPA applicants.
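Since the setup chunk creates a figs directory, these histograms could also be written to disk for reuse. A minimal sketch (the file name here is illustrative, not one used elsewhere in this analysis):
# Save the most recent plot into figs/ (file name is illustrative)
ggsave("figs/gpa_histogram.png", width = 6, height = 4)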
ggplot(admissions, aes(x = rank, fill = admit)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Admission Proportion by Undergraduate Rank",
       x = "Undergraduate Rank",
       y = "Proportion")
This plot shows a clear gradient across undergraduate rank: applicants from Rank 1 and Rank 2 schools are admitted at substantially higher rates than those from Rank 3 or Rank 4 schools. On its own this is a descriptive association rather than proof that rank drives admission, which is what the models below are meant to probe.
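One way to check that this association is larger than chance alone would produce is a chi-squared test on the rank-by-admit table. A sketch, with stats:: spelled out because janitor also exports a chisq.test wrapper:
# Test whether admission status is independent of undergraduate rank
stats::chisq.test(table(admissions$rank, admissions$admit))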
# Null model (intercept only)
m0 <- glm(admit ~ 1,
          data = admissions,
          family = binomial)

# Model 1: GRE + GPA
m1 <- glm(admit ~ gre + gpa,
          data = admissions,
          family = binomial)

# Model 2: GRE + GPA + rank
m2 <- glm(admit ~ gre + gpa + rank,
          data = admissions,
          family = binomial)
# Compare by AIC
AIC(m0, m1, m2)
##    df      AIC
## m0  1 501.9765
## m1  3 486.3440
## m2  6 470.5175
# Likelihood ratio tests (model improvement)
anova(m0, m1, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: admit ~ 1
## Model 2: admit ~ gre + gpa
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
## 1       399     499.98
## 2       397     480.34  2   19.633 5.456e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m1, m2, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: admit ~ gre + gpa
## Model 2: admit ~ gre + gpa + rank
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)
## 1       397     480.34
## 2       394     458.52  3   21.826 7.088e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Detailed output for best model
summary(m2)
##
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = binomial, data = admissions)
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *
## gpa          0.804038   0.331819   2.423 0.015388 *
## rankRank 2  -0.675443   0.316490  -2.134 0.032829 *
## rankRank 3  -1.340204   0.345306  -3.881 0.000104 ***
## rankRank 4  -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 458.52 on 394 degrees of freedom
## AIC: 470.52
##
## Number of Fisher Scoring iterations: 4
I fit three logistic regression models to see which factors predict admission to graduate school:
First, comparing the null model to the GRE + GPA model, the improvement in fit was highly significant (χ² = 19.63, df = 2, p < 0.001): GRE and GPA together help explain who gets admitted.
Second, adding the rank of the undergraduate program improved the fit again (χ² = 21.83, df = 3, p < 0.001), meaning the rank of the school carries information even after controlling for test scores and grades.
Model 2 also has the lowest AIC (470.5, versus 486.3 for Model 1 and 502.0 for the null model), so it is the model I carry forward: GRE, GPA, and undergraduate rank all appear to play a role in admission chances.
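Because the coefficients in summary(m2) are on the log-odds scale, exponentiating them gives odds ratios that are easier to read. A sketch using broom, which is loaded above but not otherwise used:
# Odds ratios and 95% confidence intervals for the final model
tidy(m2, exponentiate = TRUE, conf.int = TRUE)
For instance, exp(0.804) is about 2.23, so a one-point increase in GPA is associated with roughly 2.2 times the odds of admission, holding GRE and rank fixed.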
# Predicted probabilities and 0.5-cutoff classifications from the final model
admissions_pred <- admissions |>
  mutate(
    p_hat = predict(m2, type = "response"),
    admit_pred = ifelse(p_hat >= 0.5, "admitted", "rejected"),
    admit_pred = factor(admit_pred, levels = levels(admit))
  )

conf_mat <- table(
  observed = admissions_pred$admit,
  predicted = admissions_pred$admit_pred
)
conf_mat
##           predicted
## observed   rejected admitted
##   rejected      254       19
##   admitted       97       30
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
accuracy
## [1] 0.71
Using the final model (GRE + GPA + rank), I generated predicted probabilities and converted them to admitted/rejected calls with a 0.5 cutoff. Overall accuracy is 71%, which only modestly beats the 68% a model would get by predicting "rejected" for everyone (273/400). The confusion matrix shows why: the model identifies rejections well (254 of 273) but catches only 30 of the 127 actual admits. That limitation is understandable, since admissions decisions depend on more than GRE, GPA, and school rank.
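Those class-specific rates can be read off conf_mat directly, a short sketch using the objects defined above:
# Sensitivity: 30 / 127, about 24% of actual admits are caught
conf_mat["admitted", "admitted"] / sum(conf_mat["admitted", ])
# Specificity: 254 / 273, about 93% of rejections are caught
conf_mat["rejected", "rejected"] / sum(conf_mat["rejected", ])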
Overall, the analysis suggests that higher GRE scores, higher GPAs, and attending a higher-ranked undergraduate institution are all associated with better odds of graduate school admission. Rank remains important even after controlling for GRE and GPA, which fits how competitive programs typically evaluate applicants. The model predicts outcomes reasonably well but far from perfectly, and real decisions likely also depend on factors missing from this dataset, such as recommendation letters and personal statements.