library(MatchIt) #propensity score matching
library(dplyr)
library(tidyverse)
library(mosaic)
library(hrbrthemes)
# count transfer and non-transfer students
df %>%
group_by(IfTransfer) %>%
summarize(unique_count = n_distinct(ID))
## # A tibble: 2 × 2
## IfTransfer unique_count
## <lgl> <int>
## 1 FALSE 98731
## 2 TRUE 9742
# Group by ID, admitted year and current program
# find the last academic period student attended Humber where GPA is not null
# find last term GPA
# find number course completed by last term
last_GPA<- df %>%
filter(!is.na(Current.GPA)) %>%
group_by(ID, Admitted.Year, Current.Program.Code) %>%
reframe(Last_Term = max(Academic.Period[!is.na(Current.GPA)]),
Last_Term_GPA = Current.GPA[which.max(Academic.Period)],
Course_Completed_bylastterm = Current.Courses.Completed[which.max(Academic.Period)],
Admitted_GPA= Admitted.GPA,
IfTransfer= IfTransfer,
Birth_Year= Birth.Year,
Gender= Gender,
First.Generation= First.Generation,
Immigration_Status = Immigration.Status)
#replace 0 with NA in Admission GPA
last_GPA <- mutate(last_GPA, Admitted_GPA = ifelse( Admitted_GPA== 0, NA, Admitted_GPA))
last_GPA<- distinct(last_GPA)
last_GPA$Age<- 2023-last_GPA$Birth_Year
colnames(last_GPA)[colnames(last_GPA) %in% c('Admitted.Year', 'Current.Program.Code','First.Generation')] <- c('Admitted_Year', 'Current_Program', 'First_Gen')
# naive mean comparison between transfer and non-transfer students
t.test(Last_Term_GPA~IfTransfer, data=last_GPA)
##
## Welch Two Sample t-test
##
## data: Last_Term_GPA by IfTransfer
## t = -28.987, df = 9886.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -8.288895 -7.238855
## sample estimates:
## mean in group FALSE mean in group TRUE
## 64.62397 72.38784
t.test(Age~IfTransfer, last_GPA)
##
## Welch Two Sample t-test
##
## data: Age by IfTransfer
## t = -20.695, df = 9608, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -1.632524 -1.350018
## sample estimates:
## mean in group FALSE mean in group TRUE
## 27.79174 29.28301
t.test(Admitted_GPA~IfTransfer, last_GPA)
##
## Welch Two Sample t-test
##
## data: Admitted_GPA by IfTransfer
## t = -10.172, df = 8464.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -1.3012332 -0.8807342
## sample estimates:
## mean in group FALSE mean in group TRUE
## 75.78763 76.87862
# temporarily remove rows with na in Admitted.GPA column for MatchIt to work
na_omit_last_GPA<-last_GPA[!is.na(last_GPA$Admitted_GPA),]
# how many transfer students excluded because Admitted GPA is na
count(last_GPA$IfTransfer)- count(na_omit_last_GPA$IfTransfer)
## n_TRUE
## 1089
# find non-transfer matches based on age, gender, first generation, admitted GPA, number course completed by last term, current program, and cohort
transfer_match <- matchit(IfTransfer~ Age+Gender+First_Gen+Admitted_GPA+Course_Completed_bylastterm+Current_Program+Admitted_Year, na_omit_last_GPA, ratio = 5)
# check covariate balance
summary(transfer_match)
# extract the matched data and analyze
transfer_matched <- match.data(transfer_match)
lm_transfer <- lm(Last_Term_GPA~IfTransfer, transfer_matched)
coef(lm_transfer) # estimated transfer students have 3.8 higher GPA than non-transfer
## (Intercept) IfTransferTRUE
## 68.759895 3.798168
# get confidence intervals by bootstrapping
boot_transfer <- do(1000)*lm(Last_Term_GPA~IfTransfer, resample(transfer_matched))
confint(boot_transfer) # 95% CI is between 3.2 and 4.4 higher GPA
## name lower upper level method estimate
## 1 Intercept 6.852599e+01 6.898897e+01 0.95 percentile 68.75989527
## 2 IfTransferTRUE 3.180150e+00 4.390031e+00 0.95 percentile 3.79816809
## 3 sigma 2.178810e+01 2.231333e+01 0.95 percentile 22.03459062
## 4 r.squared 2.883419e-03 5.538927e-03 0.95 percentile 0.00410997
## 5 F 1.177524e+02 2.268014e+02 0.95 percentile 168.04863592
t.test(Last_Term_GPA~IfTransfer, transfer_matched)
##
## Welch Two Sample t-test
##
## data: Last_Term_GPA by IfTransfer
## t = -12.604, df = 9446.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -4.388848 -3.207488
## sample estimates:
## mean in group FALSE mean in group TRUE
## 68.75990 72.55806