library(dplyr)
library(knitr)

Non-direct entry student who are considering PS for the first time

Gap Year, late bloomer and late bloomer plus, are categories of non-direct entry students who are considering PS for the first time. Account.ID is associate with each unique individual despite their Applicant.ID differ from one application cycle to another. So I grouped by Account.ID and filtered for people who ONLY have non-direct entry record.

selected_columns <- c("Applicant.Id", "Account.Id", "App.Cycle", "Age","Applicant.Type", "HS.Graduation.Year","Enrolment.Stage","Program.Credential","Entry.Level")

# filter for Account.ID who only have non-direct entry record
df <- all_data %>% 
  select(all_of(selected_columns)) %>%
  group_by(Account.Id) %>%
  filter(all(Applicant.Type == "NON-DIRECT"))

Gap Year

Gap year applicants: age 18-19, graduated from HS and have taken a year off.

I filtered out applicants without HS graduation year, and included only “application cycle - HS graduation year = 1” and age 18-19; then got a count of unique Applicant ID for each application cycle using this data. I also calculated the count of unique Application ID for each application cycle using unfiltered data. The percentage of gap year applicants in all applicants for each application cycle is then calculated.

I then calculated the count of how many unique applicants registered with a college in each application cycle. If we didn’t control for “App.Cycle - HS.Graduation.Year == 1”, there could potentially be some non-direct applicants who apply to PS in many application cycles and we would have included the later application cycles which are not the first time they are considering PS. However, there is only one application cycle where “App.Cycle - HS.Graduation.Year == 1”, so no further filtering was applied.

# Subset data
gap_year_data <- df %>%
  filter(!is.null(HS.Graduation.Year), App.Cycle - HS.Graduation.Year == 1 , Age>=18, Age <= 19) %>%
  select(Applicant.Id, App.Cycle, Enrolment.Stage)
## Adding missing grouping variables: `Account.Id`
filtered1<-gap_year_data %>%
  group_by(App.Cycle) %>%
  summarize(Gap_year_Count = n_distinct(Applicant.Id))

unfiltered1<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join gap year count and all applicant count in one table
gap_year_counts <- left_join(filtered1, unfiltered1, by = "App.Cycle")

# percentage of gap year students in all applicants
gap_year_counts$Gap_year_Percentage<-gap_year_counts$Gap_year_Count/gap_year_counts$All_Count

# number of registered gap year students
gap_year_registration <- gap_year_data %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

gap_year_all <- left_join(gap_year_counts, gap_year_registration, by = "App.Cycle")

# percentage of gap year students registered
gap_year_all$Conversion_rate<-gap_year_all$Registration_Count/gap_year_all$Gap_year_Count

kable(gap_year_all)
App.Cycle Gap_year_Count All_Count Gap_year_Percentage Registration_Count Conversion_rate
2017 14636 143455 0.1020250 10544 0.7204154
2018 7436 136145 0.0546182 5348 0.7192039
2019 7509 124468 0.0603288 5420 0.7218005
2020 7606 132906 0.0572284 4896 0.6437023
2021 7325 118473 0.0618284 4843 0.6611604
2022 7718 109777 0.0703062 5327 0.6902047
2023 6714 80062 0.0838600 32 0.0047662

Late Bloomer

Late bloomer applicants: age 19-24, HS graduates Here I filtered for the smallest value of application cycle for each Account.ID to reflect THE FIRST time of PS consideration

# Here I filtered for the smallest value of application cycle for each applicant to reflect THE FIRST time of PS consideration
LB_data <- df %>%
  filter(!is.null(HS.Graduation.Year), Age >=19, Age <= 24) %>%
  group_by(Account.Id) %>%  
  filter(App.Cycle == min(App.Cycle))

filtered2<-LB_data %>%
  group_by(App.Cycle) %>%
  summarize(LB_Count = n_distinct(Applicant.Id))

unfiltered2<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join LB count and all applicant count in one table
LB_counts <- left_join(filtered2, unfiltered2, by = "App.Cycle")

# percentage of LB students in all applicants
LB_counts$LB_Percentage<-LB_counts$LB_Count/LB_counts$All_Count

# number of registered LB students
LB_registration <- LB_data %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

LB_all <- left_join(LB_counts, LB_registration, by = "App.Cycle")

# percentage of LB students registered
LB_all$Conversion_rate<-LB_all$Registration_Count/LB_all$LB_Count

kable(LB_all)
App.Cycle LB_Count All_Count LB_Percentage Registration_Count Conversion_rate
2017 83720 143455 0.5835976 51871 0.6195772
2018 56525 136145 0.4151823 33911 0.5999292
2019 43980 124468 0.3533438 25974 0.5905866
2020 38136 132906 0.2869396 21563 0.5654237
2021 31760 118473 0.2680780 17862 0.5624055
2022 30418 109777 0.2770890 17784 0.5846538
2023 24404 80062 0.3048138 35 0.0014342

Late Bloomer Plus

Late bloomer plus: age > 24, may or may not have graduated from HS

# Here I filtered for the smallest value of application cycle for each applicant to reflect THE FIRST time of PS consideration
LBPLUS_data <- df %>%
  filter(Age >=24) %>%
  group_by(Account.Id) %>%   
  filter(App.Cycle == min(App.Cycle))

filtered2<-LBPLUS_data %>%
  group_by(App.Cycle) %>%
  summarize(LBPLUS_Count = n_distinct(Applicant.Id))

unfiltered2<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join LBPLUS count and all applicant count in one table
LBPLUS_counts <- left_join(filtered2, unfiltered2, by = "App.Cycle")

# percentage of LBPLUS students in all applicants
LBPLUS_counts$LBPLUS_Percentage<-LBPLUS_counts$LBPLUS_Count/LBPLUS_counts$All_Count

# number of registered LBPLUS students
LBPLUS_registration <- LBPLUS_data %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

LBPLUS_all <- left_join(LBPLUS_counts, LBPLUS_registration, by = "App.Cycle")

# percentage of LBPLUS students registered
LBPLUS_all$Conversion_rate<-LBPLUS_all$Registration_Count/LBPLUS_all$LBPLUS_Count

kable(LBPLUS_all)
App.Cycle LBPLUS_Count All_Count LBPLUS_Percentage Registration_Count Conversion_rate
2017 64955 143455 0.4527901 32431 0.4992841
2018 59330 136145 0.4357854 29092 0.4903422
2019 54502 124468 0.4378796 24942 0.4576346
2020 65076 132906 0.4896393 28853 0.4433739
2021 54428 118473 0.4594127 23435 0.4305688
2022 48237 109777 0.4394090 21186 0.4392064
2023 31778 80062 0.3969174 24 0.0007552

Data manipulation to find applicants who applied in multiple application cycles

Transfer students, repeat learning, pathway students are categories of non-direct entry students who are assumed to have previous PS experience. We don’t have program completion information in the OCAS data, to I used “registration” record as a proxy for having received PS education.

Here I grouped by Account.ID to for each unique individual, how many application cycles they have submitted an application, then filtered for the ones with more than 1 application cycles.

# subset columns
df2 <- all_data %>% 
  select(all_of(selected_columns),"College.Name")

# Use Account.ID to track individuals with multiple app.cycle applications since Applicant.ID change from one year to another for the same individual.
unique_cycle_count<-df2 %>%
  group_by(Account.Id) %>%
  summarize(UniqueAppCycles = n_distinct(App.Cycle))

with_cycle_count<-left_join(df2, unique_cycle_count, by="Account.Id")

d<-with_cycle_count %>%
  filter(UniqueAppCycles>1)

Transfer students

Transfer students: age >= 20 Here I filtered for the Account.ID that has at least one registration record, and then further filtered for applicants who has a registration record prior to a new round of application. Lastly, filter out the application cycles equal or prior to the first registration record so we get to applicants WITH PRIOR PS experience.

# find students who registered with a college (proxy for some university/college education)
d2<-d %>%
  group_by(Account.Id) %>%
  filter(
    "REGISTRATION" %in% Enrolment.Stage)%>% 
  arrange(Account.Id, App.Cycle, Enrolment.Stage)

# find students who has registration record in an application cycle prior to a new application record; then filter out any application cycle equal or prior to the one with registration record
transfer_d <- d2 %>%
  filter(Age>=20) %>%
  group_by(Account.Id) %>%
  mutate(apply_after_registration = any(min(App.Cycle[Enrolment.Stage == "REGISTRATION"]) < max(App.Cycle[Enrolment.Stage == "APPLICATION"]))) %>%
  filter(apply_after_registration==TRUE)  %>%
  filter(!(App.Cycle <= min(App.Cycle[Enrolment.Stage == "REGISTRATION"])))

Applicants may potentially apply in many application cycles after the application cycle in which they registered. No further filtering is needed because we only want to know whether the applicants had prior PS education, so each year they apply it’d be recorded as a unique applicant.

# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered4<-transfer_d %>%
  group_by(App.Cycle) %>%
  summarize(Transfer_Count = n_distinct(Applicant.Id))

unfiltered4<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join transfer count and all applicant count in one table
transfer_counts <- left_join(filtered4, unfiltered4, by = "App.Cycle")

# percentage of transfer students in all applicants
transfer_counts$Transfer_Percentage<-transfer_counts$Transfer_Count/transfer_counts$All_Count

# number of registered transfer students
transfer_registration <- transfer_d %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

transfer_all <- left_join(transfer_counts, transfer_registration, by = "App.Cycle")

# percentage of transfer students registered
transfer_all$Conversion_rate<-transfer_all$Registration_Count/transfer_all$Transfer_Count

kable(transfer_all)
App.Cycle Transfer_Count All_Count Transfer_Percentage Registration_Count Conversion_rate
2018 10670 136145 0.0783723 6148 0.5761949
2019 16070 124468 0.1291095 8879 0.5525202
2020 20916 132906 0.1573744 11140 0.5326066
2021 23239 118473 0.1961544 11557 0.4973106
2022 23759 109777 0.2164297 11863 0.4993055
2023 19295 80062 0.2410007 138 0.0071521

Repeat learners

Repeat learning - age >= 20, prior PS experience at Humber. Here I filtered for Account.ID with a Humber registration prior to another application record, then filtered out application cycles prior to the first registration record.

# find applicants who registered at Humber before having another application record
repeat_d <- d2 %>%
  filter(Age>=20) %>%
  group_by(Account.Id) %>%
  mutate(has_smaller_registration = any(App.Cycle[College.Name == "HUMBER" & Enrolment.Stage == "REGISTRATION"] < max(App.Cycle[Enrolment.Stage == "APPLICATION"]))) %>%
  filter(has_smaller_registration == TRUE) %>%
  filter(!(App.Cycle <= min(App.Cycle[Enrolment.Stage == "REGISTRATION"]))) %>%
  arrange(Account.Id, App.Cycle, Enrolment.Stage)
# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered5<-repeat_d %>%
  group_by(App.Cycle) %>%
  summarize(Repeat_Count = n_distinct(Applicant.Id))

unfiltered5<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join repeat count and all applicant count in one table
repeat_counts <- left_join(filtered5, unfiltered5, by = "App.Cycle")

# percentage of repeat students in all applicants
repeat_counts$Repeat_Percentage<-repeat_counts$Repeat_Count/repeat_counts$All_Count

# number of registered repeat students
repeat_registration <- repeat_d %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

repeat_all <- left_join(repeat_counts, repeat_registration, by = "App.Cycle")

# percentage of repeat students registered
repeat_all$Conversion_rate<-repeat_all$Registration_Count/repeat_all$Repeat_Count

kable(repeat_all)
App.Cycle Repeat_Count All_Count Repeat_Percentage Registration_Count Conversion_rate
2018 1240 136145 0.0091079 785 0.6330645
2019 1886 124468 0.0151525 1112 0.5896076
2020 2458 132906 0.0184943 1466 0.5964199
2021 2719 118473 0.0229504 1447 0.5321809
2022 2652 109777 0.0241581 1428 0.5384615
2023 2040 80062 0.0254803 9 0.0044118

Pathway Students

Pathway students - age >= 20, completed a PS program and are looking to continue in a degree program through a pathway We don’t have program completion or pathway information in OCAS, again I’m using registration record as a proxy for program completion. Using the data where “only applicant with a registration before another application, and only the app cycle after the first registration”, I filtered for applied program credential being degree

pathway_d <- transfer_d %>%
  filter(Program.Credential %in% c("DEGREE"))
# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered6<-pathway_d %>%
  group_by(App.Cycle) %>%
  summarize(Pathway_Count = n_distinct(Applicant.Id))

unfiltered6<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join pathway count and all applicant count in one table
pathway_counts <- left_join(filtered6, unfiltered6, by = "App.Cycle")

# percentage of pathway students in all applicants
pathway_counts$Pathway_Percentage<-pathway_counts$Pathway_Count/pathway_counts$All_Count

# number of registered pathway students
pathway_registration <- pathway_d %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

pathway_all <- left_join(pathway_counts, pathway_registration, by = "App.Cycle")

# percentage of pathway students registered
pathway_all$Conversion_rate<-pathway_all$Registration_Count/pathway_all$Pathway_Count

kable(pathway_all)
App.Cycle Pathway_Count All_Count Pathway_Percentage Registration_Count Conversion_rate
2018 1576 136145 0.0115759 528 0.3350254
2019 2455 124468 0.0197239 788 0.3209776
2020 3291 132906 0.0247619 1059 0.3217867
2021 3665 118473 0.0309353 1125 0.3069577
2022 3505 109777 0.0319284 1044 0.2978602
2023 3294 80062 0.0411431 2 0.0006072

Graduate Students

Graduate students - age 22-30. Considering graduate certificate.

For anyone that apply for a graduate certificate we can safely assume they already completed a university or college degree. So I used all applicants data to begin with.

# in remaining App cycles filter only for GC program applications
graduate_d <- all_data %>%
  filter(Age >= 22 & Age <= 30)%>%
  filter(Program.Credential == "GRADUATE CERTIFICATE")
# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered7<-graduate_d %>%
  group_by(App.Cycle) %>%
  summarize(Graduate_Count = n_distinct(Applicant.Id))

unfiltered7<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join graduate count and all applicant count in one table
graduate_counts <- left_join(filtered7, unfiltered7, by = "App.Cycle")

# percentage of graduate students in all applicants
graduate_counts$Graduate_Percentage<-graduate_counts$Graduate_Count/graduate_counts$All_Count

# number of registered graduate students
graduate_registration <- graduate_d %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

graduate_all <- left_join(graduate_counts, graduate_registration, by = "App.Cycle")

# percentage of graduate students registered
graduate_all$Conversion_rate<-graduate_all$Registration_Count/graduate_all$Graduate_Count

kable(graduate_all)
App.Cycle Graduate_Count All_Count Graduate_Percentage Registration_Count Conversion_rate
2017 16186 143455 0.1128298 6825 0.4216607
2018 16791 136145 0.1233317 6711 0.3996784
2019 16213 124468 0.1302584 6374 0.3931413
2020 17224 132906 0.1295954 6980 0.4052485
2021 15823 118473 0.1335579 5690 0.3596031
2022 13645 109777 0.1242974 4753 0.3483327
2023 9694 80062 0.1210812 13 0.0013410