library(MatchIt) #propensity score matching
library(dplyr)
library(tidyverse)
library(mosaic)
library(hrbrthemes)
# count transfer and non-transfer students
df %>%
  group_by(IfTransfer) %>%
  summarize(unique_count = n_distinct(ID))
## # A tibble: 2 × 2
##   IfTransfer unique_count
##   <lgl>             <int>
## 1 FALSE             98731
## 2 TRUE               9742
# Group by ID, admitted year and current program
# find the last academic period student attended Humber where GPA is not null 
# find last term GPA
# find number course completed by last term
last_GPA<- df %>%
  filter(!is.na(Current.GPA)) %>%
  group_by(ID, Admitted.Year, Current.Program.Code)  %>%
  reframe(Last_Term = max(Academic.Period[!is.na(Current.GPA)]),
    Last_Term_GPA = Current.GPA[which.max(Academic.Period)],
    Course_Completed_bylastterm = Current.Courses.Completed[which.max(Academic.Period)],
     Admitted_GPA= Admitted.GPA,
    IfTransfer= IfTransfer,
    Birth_Year= Birth.Year,
    Gender= Gender,
    First.Generation= First.Generation,
    Immigration_Status = Immigration.Status)

#replace 0 with NA in Admission GPA
last_GPA <- mutate(last_GPA, Admitted_GPA = ifelse( Admitted_GPA== 0, NA, Admitted_GPA)) 

last_GPA<- distinct(last_GPA)
last_GPA$Age<- 2023-last_GPA$Birth_Year

colnames(last_GPA)[colnames(last_GPA) %in% c('Admitted.Year', 'Current.Program.Code','First.Generation')] <- c('Admitted_Year', 'Current_Program', 'First_Gen')
# naive mean comparison between transfer and non-transfer students
t.test(Last_Term_GPA~IfTransfer, data=last_GPA)
## 
##  Welch Two Sample t-test
## 
## data:  Last_Term_GPA by IfTransfer
## t = -28.987, df = 9886.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -8.288895 -7.238855
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            64.62397            72.38784
t.test(Age~IfTransfer, last_GPA)
## 
##  Welch Two Sample t-test
## 
## data:  Age by IfTransfer
## t = -20.695, df = 9608, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -1.632524 -1.350018
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            27.79174            29.28301
t.test(Admitted_GPA~IfTransfer, last_GPA)
## 
##  Welch Two Sample t-test
## 
## data:  Admitted_GPA by IfTransfer
## t = -10.172, df = 8464.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -1.3012332 -0.8807342
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            75.78763            76.87862
# temporarily remove rows with na in Admitted.GPA column for MatchIt to work
na_omit_last_GPA<-last_GPA[!is.na(last_GPA$Admitted_GPA),]

# how many transfer students excluded because Admitted GPA is na
count(last_GPA$IfTransfer)- count(na_omit_last_GPA$IfTransfer)
## n_TRUE 
##   1089
# find non-transfer matches based on age, gender, first generation, admitted GPA, number course completed by last term, current program, and cohort
transfer_match <- matchit(IfTransfer~ Age+Gender+First_Gen+Admitted_GPA+Course_Completed_bylastterm+Current_Program+Admitted_Year, na_omit_last_GPA, ratio = 5)

# check covariate balance
summary(transfer_match)

# extract the matched data and analyze
transfer_matched <- match.data(transfer_match)
lm_transfer <- lm(Last_Term_GPA~IfTransfer, transfer_matched)
coef(lm_transfer) # estimated transfer students have 3.8 higher GPA than non-transfer
##    (Intercept) IfTransferTRUE 
##      68.759895       3.798168
# get confidence intervals by bootstrapping
boot_transfer <- do(1000)*lm(Last_Term_GPA~IfTransfer, resample(transfer_matched))
confint(boot_transfer) # 95% CI is between 3.2 and 4.4 higher GPA
##             name        lower        upper level     method     estimate
## 1      Intercept 6.852599e+01 6.898897e+01  0.95 percentile  68.75989527
## 2 IfTransferTRUE 3.180150e+00 4.390031e+00  0.95 percentile   3.79816809
## 3          sigma 2.178810e+01 2.231333e+01  0.95 percentile  22.03459062
## 4      r.squared 2.883419e-03 5.538927e-03  0.95 percentile   0.00410997
## 5              F 1.177524e+02 2.268014e+02  0.95 percentile 168.04863592
t.test(Last_Term_GPA~IfTransfer, transfer_matched)
## 
##  Welch Two Sample t-test
## 
## data:  Last_Term_GPA by IfTransfer
## t = -12.604, df = 9446.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -4.388848 -3.207488
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            68.75990            72.55806