library(dplyr)
library(knitr)
Gap Year, late bloomer and late bloomer plus, are categories of non-direct entry students who are considering PS for the first time. Account.ID is associate with each unique individual despite their Applicant.ID differ from one application cycle to another. So I grouped by Account.ID and filtered for people who ONLY have non-direct entry record.
selected_columns <- c("Applicant.Id", "Account.Id", "App.Cycle", "Age","Applicant.Type", "HS.Graduation.Year","Enrolment.Stage","Program.Credential","Entry.Level")
# filter for Account.ID who only have non-direct entry record
df <- all_data %>%
select(all_of(selected_columns)) %>%
group_by(Account.Id) %>%
filter(all(Applicant.Type == "NON-DIRECT"))
Gap year applicants: age 18-19, graduated from HS and have taken a year off.
I filtered out applicants without HS graduation year, and included only “application cycle - HS graduation year = 1” and age 18-19; then got a count of unique Applicant ID for each application cycle using this data. I also calculated the count of unique Application ID for each application cycle using unfiltered data. The percentage of gap year applicants in all applicants for each application cycle is then calculated.
I then calculated the count of how many unique applicants registered with a college in each application cycle. If we didn’t control for “App.Cycle - HS.Graduation.Year == 1”, there could potentially be some non-direct applicants who apply to PS in many application cycles and we would have included the later application cycles which are not the first time they are considering PS. However, there is only one application cycle where “App.Cycle - HS.Graduation.Year == 1”, so no further filtering was applied.
# Subset data
gap_year_data <- df %>%
filter(!is.null(HS.Graduation.Year), App.Cycle - HS.Graduation.Year == 1 , Age>=18, Age <= 19) %>%
select(Applicant.Id, App.Cycle, Enrolment.Stage)
## Adding missing grouping variables: `Account.Id`
filtered1<-gap_year_data %>%
group_by(App.Cycle) %>%
summarize(Gap_year_Count = n_distinct(Applicant.Id))
unfiltered1<-df %>%
group_by(App.Cycle) %>%
summarize(All_Count = n_distinct(Applicant.Id))
# Join gap year count and all applicant count in one table
gap_year_counts <- left_join(filtered1, unfiltered1, by = "App.Cycle")
# percentage of gap year students in all applicants
gap_year_counts$Gap_year_Percentage<-gap_year_counts$Gap_year_Count/gap_year_counts$All_Count
# number of registered gap year students
gap_year_registration <- gap_year_data %>%
filter(Enrolment.Stage == "REGISTRATION") %>%
group_by(App.Cycle) %>%
summarize(Registration_Count = n_distinct(Applicant.Id))
gap_year_all <- left_join(gap_year_counts, gap_year_registration, by = "App.Cycle")
# percentage of gap year students registered
gap_year_all$Conversion_rate<-gap_year_all$Registration_Count/gap_year_all$Gap_year_Count
kable(gap_year_all)
| App.Cycle | Gap_year_Count | All_Count | Gap_year_Percentage | Registration_Count | Conversion_rate |
|---|---|---|---|---|---|
| 2017 | 14636 | 143455 | 0.1020250 | 10544 | 0.7204154 |
| 2018 | 7436 | 136145 | 0.0546182 | 5348 | 0.7192039 |
| 2019 | 7509 | 124468 | 0.0603288 | 5420 | 0.7218005 |
| 2020 | 7606 | 132906 | 0.0572284 | 4896 | 0.6437023 |
| 2021 | 7325 | 118473 | 0.0618284 | 4843 | 0.6611604 |
| 2022 | 7718 | 109777 | 0.0703062 | 5327 | 0.6902047 |
| 2023 | 6714 | 80062 | 0.0838600 | 32 | 0.0047662 |
Late bloomer applicants: age 19-24, HS graduates Here I filtered for the smallest value of application cycle for each Account.ID to reflect THE FIRST time of PS consideration
# Here I filtered for the smallest value of application cycle for each applicant to reflect THE FIRST time of PS consideration
LB_data <- df %>%
filter(!is.null(HS.Graduation.Year), Age >=19, Age <= 24) %>%
group_by(Account.Id) %>%
filter(App.Cycle == min(App.Cycle))
filtered2<-LB_data %>%
group_by(App.Cycle) %>%
summarize(LB_Count = n_distinct(Applicant.Id))
unfiltered2<-df %>%
group_by(App.Cycle) %>%
summarize(All_Count = n_distinct(Applicant.Id))
# Join LB count and all applicant count in one table
LB_counts <- left_join(filtered2, unfiltered2, by = "App.Cycle")
# percentage of LB students in all applicants
LB_counts$LB_Percentage<-LB_counts$LB_Count/LB_counts$All_Count
# number of registered LB students
LB_registration <- LB_data %>%
filter(Enrolment.Stage == "REGISTRATION") %>%
group_by(App.Cycle) %>%
summarize(Registration_Count = n_distinct(Applicant.Id))
LB_all <- left_join(LB_counts, LB_registration, by = "App.Cycle")
# percentage of LB students registered
LB_all$Conversion_rate<-LB_all$Registration_Count/LB_all$LB_Count
kable(LB_all)
| App.Cycle | LB_Count | All_Count | LB_Percentage | Registration_Count | Conversion_rate |
|---|---|---|---|---|---|
| 2017 | 83720 | 143455 | 0.5835976 | 51871 | 0.6195772 |
| 2018 | 56525 | 136145 | 0.4151823 | 33911 | 0.5999292 |
| 2019 | 43980 | 124468 | 0.3533438 | 25974 | 0.5905866 |
| 2020 | 38136 | 132906 | 0.2869396 | 21563 | 0.5654237 |
| 2021 | 31760 | 118473 | 0.2680780 | 17862 | 0.5624055 |
| 2022 | 30418 | 109777 | 0.2770890 | 17784 | 0.5846538 |
| 2023 | 24404 | 80062 | 0.3048138 | 35 | 0.0014342 |
Late bloomer plus: age > 24, may or may not have graduated from HS
# Here I filtered for the smallest value of application cycle for each applicant to reflect THE FIRST time of PS consideration
LBPLUS_data <- df %>%
filter(Age >=24) %>%
group_by(Account.Id) %>%
filter(App.Cycle == min(App.Cycle))
filtered2<-LBPLUS_data %>%
group_by(App.Cycle) %>%
summarize(LBPLUS_Count = n_distinct(Applicant.Id))
unfiltered2<-df %>%
group_by(App.Cycle) %>%
summarize(All_Count = n_distinct(Applicant.Id))
# Join LBPLUS count and all applicant count in one table
LBPLUS_counts <- left_join(filtered2, unfiltered2, by = "App.Cycle")
# percentage of LBPLUS students in all applicants
LBPLUS_counts$LBPLUS_Percentage<-LBPLUS_counts$LBPLUS_Count/LBPLUS_counts$All_Count
# number of registered LBPLUS students
LBPLUS_registration <- LBPLUS_data %>%
filter(Enrolment.Stage == "REGISTRATION") %>%
group_by(App.Cycle) %>%
summarize(Registration_Count = n_distinct(Applicant.Id))
LBPLUS_all <- left_join(LBPLUS_counts, LBPLUS_registration, by = "App.Cycle")
# percentage of LBPLUS students registered
LBPLUS_all$Conversion_rate<-LBPLUS_all$Registration_Count/LBPLUS_all$LBPLUS_Count
kable(LBPLUS_all)
| App.Cycle | LBPLUS_Count | All_Count | LBPLUS_Percentage | Registration_Count | Conversion_rate |
|---|---|---|---|---|---|
| 2017 | 64955 | 143455 | 0.4527901 | 32431 | 0.4992841 |
| 2018 | 59330 | 136145 | 0.4357854 | 29092 | 0.4903422 |
| 2019 | 54502 | 124468 | 0.4378796 | 24942 | 0.4576346 |
| 2020 | 65076 | 132906 | 0.4896393 | 28853 | 0.4433739 |
| 2021 | 54428 | 118473 | 0.4594127 | 23435 | 0.4305688 |
| 2022 | 48237 | 109777 | 0.4394090 | 21186 | 0.4392064 |
| 2023 | 31778 | 80062 | 0.3969174 | 24 | 0.0007552 |
Transfer students, repeat learning, pathway students are categories of non-direct entry students who are assumed to have previous PS experience. We don’t have program completion information in the OCAS data, to I used “registration” record as a proxy for having received PS education.
Here I grouped by Account.ID to for each unique individual, how many application cycles they have submitted an application, then filtered for the ones with more than 1 application cycles.
# subset columns
df2 <- all_data %>%
select(all_of(selected_columns),"College.Name")
# Use Account.ID to track individuals with multiple app.cycle applications since Applicant.ID change from one year to another for the same individual.
unique_cycle_count<-df2 %>%
group_by(Account.Id) %>%
summarize(UniqueAppCycles = n_distinct(App.Cycle))
with_cycle_count<-left_join(df2, unique_cycle_count, by="Account.Id")
d<-with_cycle_count %>%
filter(UniqueAppCycles>1)
Transfer students: age >= 20 Here I filtered for the Account.ID that has at least one registration record, and then further filtered for applicants who has a registration record prior to a new round of application. Lastly, filter out the application cycles equal or prior to the first registration record so we get to applicants WITH PRIOR PS experience.
# find students who registered with a college (proxy for some university/college education)
d2<-d %>%
group_by(Account.Id) %>%
filter(
"REGISTRATION" %in% Enrolment.Stage)%>%
arrange(Account.Id, App.Cycle, Enrolment.Stage)
# find students who has registration record in an application cycle prior to a new application record; then filter out any application cycle equal or prior to the one with registration record
transfer_d <- d2 %>%
filter(Age>=20) %>%
group_by(Account.Id) %>%
mutate(apply_after_registration = any(min(App.Cycle[Enrolment.Stage == "REGISTRATION"]) < max(App.Cycle[Enrolment.Stage == "APPLICATION"]))) %>%
filter(apply_after_registration==TRUE) %>%
filter(!(App.Cycle <= min(App.Cycle[Enrolment.Stage == "REGISTRATION"])))
Applicants may potentially apply in many application cycles after the application cycle in which they registered. No further filtering is needed because we only want to know whether the applicants had prior PS education, so each year they apply it’d be recorded as a unique applicant.
# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered4<-transfer_d %>%
group_by(App.Cycle) %>%
summarize(Transfer_Count = n_distinct(Applicant.Id))
unfiltered4<-df %>%
group_by(App.Cycle) %>%
summarize(All_Count = n_distinct(Applicant.Id))
# Join transfer count and all applicant count in one table
transfer_counts <- left_join(filtered4, unfiltered4, by = "App.Cycle")
# percentage of transfer students in all applicants
transfer_counts$Transfer_Percentage<-transfer_counts$Transfer_Count/transfer_counts$All_Count
# number of registered transfer students
transfer_registration <- transfer_d %>%
filter(Enrolment.Stage == "REGISTRATION") %>%
group_by(App.Cycle) %>%
summarize(Registration_Count = n_distinct(Applicant.Id))
transfer_all <- left_join(transfer_counts, transfer_registration, by = "App.Cycle")
# percentage of transfer students registered
transfer_all$Conversion_rate<-transfer_all$Registration_Count/transfer_all$Transfer_Count
kable(transfer_all)
| App.Cycle | Transfer_Count | All_Count | Transfer_Percentage | Registration_Count | Conversion_rate |
|---|---|---|---|---|---|
| 2018 | 10670 | 136145 | 0.0783723 | 6148 | 0.5761949 |
| 2019 | 16070 | 124468 | 0.1291095 | 8879 | 0.5525202 |
| 2020 | 20916 | 132906 | 0.1573744 | 11140 | 0.5326066 |
| 2021 | 23239 | 118473 | 0.1961544 | 11557 | 0.4973106 |
| 2022 | 23759 | 109777 | 0.2164297 | 11863 | 0.4993055 |
| 2023 | 19295 | 80062 | 0.2410007 | 138 | 0.0071521 |
Repeat learning - age >= 20, prior PS experience at Humber. Here I filtered for Account.ID with a Humber registration prior to another application record, then filtered out application cycles prior to the first registration record.
# find applicants who registered at Humber before having another application record
repeat_d <- d2 %>%
filter(Age>=20) %>%
group_by(Account.Id) %>%
mutate(has_smaller_registration = any(App.Cycle[College.Name == "HUMBER" & Enrolment.Stage == "REGISTRATION"] < max(App.Cycle[Enrolment.Stage == "APPLICATION"]))) %>%
filter(has_smaller_registration == TRUE) %>%
filter(!(App.Cycle <= min(App.Cycle[Enrolment.Stage == "REGISTRATION"]))) %>%
arrange(Account.Id, App.Cycle, Enrolment.Stage)
# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered5<-repeat_d %>%
group_by(App.Cycle) %>%
summarize(Repeat_Count = n_distinct(Applicant.Id))
unfiltered5<-df %>%
group_by(App.Cycle) %>%
summarize(All_Count = n_distinct(Applicant.Id))
# Join repeat count and all applicant count in one table
repeat_counts <- left_join(filtered5, unfiltered5, by = "App.Cycle")
# percentage of repeat students in all applicants
repeat_counts$Repeat_Percentage<-repeat_counts$Repeat_Count/repeat_counts$All_Count
# number of registered repeat students
repeat_registration <- repeat_d %>%
filter(Enrolment.Stage == "REGISTRATION") %>%
group_by(App.Cycle) %>%
summarize(Registration_Count = n_distinct(Applicant.Id))
repeat_all <- left_join(repeat_counts, repeat_registration, by = "App.Cycle")
# percentage of repeat students registered
repeat_all$Conversion_rate<-repeat_all$Registration_Count/repeat_all$Repeat_Count
kable(repeat_all)
| App.Cycle | Repeat_Count | All_Count | Repeat_Percentage | Registration_Count | Conversion_rate |
|---|---|---|---|---|---|
| 2018 | 1240 | 136145 | 0.0091079 | 785 | 0.6330645 |
| 2019 | 1886 | 124468 | 0.0151525 | 1112 | 0.5896076 |
| 2020 | 2458 | 132906 | 0.0184943 | 1466 | 0.5964199 |
| 2021 | 2719 | 118473 | 0.0229504 | 1447 | 0.5321809 |
| 2022 | 2652 | 109777 | 0.0241581 | 1428 | 0.5384615 |
| 2023 | 2040 | 80062 | 0.0254803 | 9 | 0.0044118 |
Pathway students - age >= 20, completed a PS program and are looking to continue in a degree program through a pathway We don’t have program completion or pathway information in OCAS, again I’m using registration record as a proxy for program completion. Using the data where “only applicant with a registration before another application, and only the app cycle after the first registration”, I filtered for applied program credential being degree
pathway_d <- transfer_d %>%
filter(Program.Credential %in% c("DEGREE"))
# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered6<-pathway_d %>%
group_by(App.Cycle) %>%
summarize(Pathway_Count = n_distinct(Applicant.Id))
unfiltered6<-df %>%
group_by(App.Cycle) %>%
summarize(All_Count = n_distinct(Applicant.Id))
# Join pathway count and all applicant count in one table
pathway_counts <- left_join(filtered6, unfiltered6, by = "App.Cycle")
# percentage of pathway students in all applicants
pathway_counts$Pathway_Percentage<-pathway_counts$Pathway_Count/pathway_counts$All_Count
# number of registered pathway students
pathway_registration <- pathway_d %>%
filter(Enrolment.Stage == "REGISTRATION") %>%
group_by(App.Cycle) %>%
summarize(Registration_Count = n_distinct(Applicant.Id))
pathway_all <- left_join(pathway_counts, pathway_registration, by = "App.Cycle")
# percentage of pathway students registered
pathway_all$Conversion_rate<-pathway_all$Registration_Count/pathway_all$Pathway_Count
kable(pathway_all)
| App.Cycle | Pathway_Count | All_Count | Pathway_Percentage | Registration_Count | Conversion_rate |
|---|---|---|---|---|---|
| 2018 | 1576 | 136145 | 0.0115759 | 528 | 0.3350254 |
| 2019 | 2455 | 124468 | 0.0197239 | 788 | 0.3209776 |
| 2020 | 3291 | 132906 | 0.0247619 | 1059 | 0.3217867 |
| 2021 | 3665 | 118473 | 0.0309353 | 1125 | 0.3069577 |
| 2022 | 3505 | 109777 | 0.0319284 | 1044 | 0.2978602 |
| 2023 | 3294 | 80062 | 0.0411431 | 2 | 0.0006072 |
Graduate students - age 22-30. Considering graduate certificate.
For anyone that apply for a graduate certificate we can safely assume they already completed a university or college degree. So I used all applicants data to begin with.
# in remaining App cycles filter only for GC program applications
graduate_d <- all_data %>%
filter(Age >= 22 & Age <= 30)%>%
filter(Program.Credential == "GRADUATE CERTIFICATE")
# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered7<-graduate_d %>%
group_by(App.Cycle) %>%
summarize(Graduate_Count = n_distinct(Applicant.Id))
unfiltered7<-df %>%
group_by(App.Cycle) %>%
summarize(All_Count = n_distinct(Applicant.Id))
# Join graduate count and all applicant count in one table
graduate_counts <- left_join(filtered7, unfiltered7, by = "App.Cycle")
# percentage of graduate students in all applicants
graduate_counts$Graduate_Percentage<-graduate_counts$Graduate_Count/graduate_counts$All_Count
# number of registered graduate students
graduate_registration <- graduate_d %>%
filter(Enrolment.Stage == "REGISTRATION") %>%
group_by(App.Cycle) %>%
summarize(Registration_Count = n_distinct(Applicant.Id))
graduate_all <- left_join(graduate_counts, graduate_registration, by = "App.Cycle")
# percentage of graduate students registered
graduate_all$Conversion_rate<-graduate_all$Registration_Count/graduate_all$Graduate_Count
kable(graduate_all)
| App.Cycle | Graduate_Count | All_Count | Graduate_Percentage | Registration_Count | Conversion_rate |
|---|---|---|---|---|---|
| 2017 | 16186 | 143455 | 0.1128298 | 6825 | 0.4216607 |
| 2018 | 16791 | 136145 | 0.1233317 | 6711 | 0.3996784 |
| 2019 | 16213 | 124468 | 0.1302584 | 6374 | 0.3931413 |
| 2020 | 17224 | 132906 | 0.1295954 | 6980 | 0.4052485 |
| 2021 | 15823 | 118473 | 0.1335579 | 5690 | 0.3596031 |
| 2022 | 13645 | 109777 | 0.1242974 | 4753 | 0.3483327 |
| 2023 | 9694 | 80062 | 0.1210812 | 13 | 0.0013410 |