library(dplyr)
library(knitr)

Non-direct entry student who are considering PS for the first time

Gap Year, late bloomer and late bloomer plus, are categories of non-direct entry students who are considering PS for the first time. Account.ID is associate with each unique individual despite their Applicant.ID differ from one application cycle to another. So I grouped by Account.ID and filtered for people who ONLY have non-direct entry record.

selected_columns <- c("Applicant.Id", "Account.Id", "App.Cycle", "Age","Applicant.Type", "HS.Graduation.Year","Enrolment.Stage","Program.Credential","Entry.Level")

# filter for Account.ID who only have non-direct entry record
df <- all_data %>% 
  select(all_of(selected_columns)) %>%
  group_by(Account.Id) %>%
  filter(all(Applicant.Type == "NON-DIRECT"))

Gap Year

Gap year applicants: age 18-19, graduated from HS and have taken a year off.

I filtered out applicants without HS graduation year, and included only “application cycle - HS graduation year = 1” and age 18-19; then got a count of unique Applicant ID for each application cycle using this data. I also calculated the count of unique Application ID for each application cycle using unfiltered data. The percentage of gap year applicants in all applicants for each application cycle is then calculated.

I then calculated the count of how many unique applicants registered with a college in each application cycle. If we didn’t control for “App.Cycle - HS.Graduation.Year == 1”, there could potentially be some non-direct applicants who apply to PS in many application cycles and we would have included the later application cycles which are not the first time they are considering PS. However, there is only one application cycle where “App.Cycle - HS.Graduation.Year == 1”, so no further filtering was applied.

# Subset data
gap_year_data <- df %>%
  filter(!is.null(HS.Graduation.Year), App.Cycle - HS.Graduation.Year == 1 , Age>=18, Age <= 19) %>%
  select(Applicant.Id, App.Cycle, Enrolment.Stage)

## Adding missing grouping variables: `Account.Id`

filtered1<-gap_year_data %>%
  group_by(App.Cycle) %>%
  summarize(Gap_year_Count = n_distinct(Applicant.Id))

unfiltered1<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join gap year count and all applicant count in one table
gap_year_counts <- left_join(filtered1, unfiltered1, by = "App.Cycle")

# percentage of gap year students in all applicants
gap_year_counts$Gap_year_Percentage<-gap_year_counts$Gap_year_Count/gap_year_counts$All_Count

# number of registered gap year students
gap_year_registration <- gap_year_data %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

gap_year_all <- left_join(gap_year_counts, gap_year_registration, by = "App.Cycle")

# percentage of gap year students registered
gap_year_all$Conversion_rate<-gap_year_all$Registration_Count/gap_year_all$Gap_year_Count

kable(gap_year_all)

App.Cycle	Gap_year_Count	All_Count	Gap_year_Percentage	Registration_Count	Conversion_rate
2017	14636	143455	0.1020250	10544	0.7204154
2018	7436	136145	0.0546182	5348	0.7192039
2019	7509	124468	0.0603288	5420	0.7218005
2020	7606	132906	0.0572284	4896	0.6437023
2021	7325	118473	0.0618284	4843	0.6611604
2022	7718	109777	0.0703062	5327	0.6902047
2023	6714	80062	0.0838600	32	0.0047662

Late Bloomer

Late bloomer applicants: age 19-24, HS graduates Here I filtered for the smallest value of application cycle for each Account.ID to reflect THE FIRST time of PS consideration

# Here I filtered for the smallest value of application cycle for each applicant to reflect THE FIRST time of PS consideration
LB_data <- df %>%
  filter(!is.null(HS.Graduation.Year), Age >=19, Age <= 24) %>%
  group_by(Account.Id) %>%  
  filter(App.Cycle == min(App.Cycle))

filtered2<-LB_data %>%
  group_by(App.Cycle) %>%
  summarize(LB_Count = n_distinct(Applicant.Id))

unfiltered2<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join LB count and all applicant count in one table
LB_counts <- left_join(filtered2, unfiltered2, by = "App.Cycle")

# percentage of LB students in all applicants
LB_counts$LB_Percentage<-LB_counts$LB_Count/LB_counts$All_Count

# number of registered LB students
LB_registration <- LB_data %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

LB_all <- left_join(LB_counts, LB_registration, by = "App.Cycle")

# percentage of LB students registered
LB_all$Conversion_rate<-LB_all$Registration_Count/LB_all$LB_Count

kable(LB_all)

App.Cycle	LB_Count	All_Count	LB_Percentage	Registration_Count	Conversion_rate
2017	83720	143455	0.5835976	51871	0.6195772
2018	56525	136145	0.4151823	33911	0.5999292
2019	43980	124468	0.3533438	25974	0.5905866
2020	38136	132906	0.2869396	21563	0.5654237
2021	31760	118473	0.2680780	17862	0.5624055
2022	30418	109777	0.2770890	17784	0.5846538
2023	24404	80062	0.3048138	35	0.0014342

Late Bloomer Plus

Late bloomer plus: age > 24, may or may not have graduated from HS

# Here I filtered for the smallest value of application cycle for each applicant to reflect THE FIRST time of PS consideration
LBPLUS_data <- df %>%
  filter(Age >=24) %>%
  group_by(Account.Id) %>%   
  filter(App.Cycle == min(App.Cycle))

filtered2<-LBPLUS_data %>%
  group_by(App.Cycle) %>%
  summarize(LBPLUS_Count = n_distinct(Applicant.Id))

unfiltered2<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join LBPLUS count and all applicant count in one table
LBPLUS_counts <- left_join(filtered2, unfiltered2, by = "App.Cycle")

# percentage of LBPLUS students in all applicants
LBPLUS_counts$LBPLUS_Percentage<-LBPLUS_counts$LBPLUS_Count/LBPLUS_counts$All_Count

# number of registered LBPLUS students
LBPLUS_registration <- LBPLUS_data %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

LBPLUS_all <- left_join(LBPLUS_counts, LBPLUS_registration, by = "App.Cycle")

# percentage of LBPLUS students registered
LBPLUS_all$Conversion_rate<-LBPLUS_all$Registration_Count/LBPLUS_all$LBPLUS_Count

kable(LBPLUS_all)

App.Cycle	LBPLUS_Count	All_Count	LBPLUS_Percentage	Registration_Count	Conversion_rate
2017	64955	143455	0.4527901	32431	0.4992841
2018	59330	136145	0.4357854	29092	0.4903422
2019	54502	124468	0.4378796	24942	0.4576346
2020	65076	132906	0.4896393	28853	0.4433739
2021	54428	118473	0.4594127	23435	0.4305688
2022	48237	109777	0.4394090	21186	0.4392064
2023	31778	80062	0.3969174	24	0.0007552

Data manipulation to find applicants who applied in multiple application cycles

Transfer students, repeat learning, pathway students are categories of non-direct entry students who are assumed to have previous PS experience. We don’t have program completion information in the OCAS data, to I used “registration” record as a proxy for having received PS education.

Here I grouped by Account.ID to for each unique individual, how many application cycles they have submitted an application, then filtered for the ones with more than 1 application cycles.

# subset columns
df2 <- all_data %>% 
  select(all_of(selected_columns),"College.Name")

# Use Account.ID to track individuals with multiple app.cycle applications since Applicant.ID change from one year to another for the same individual.
unique_cycle_count<-df2 %>%
  group_by(Account.Id) %>%
  summarize(UniqueAppCycles = n_distinct(App.Cycle))

with_cycle_count<-left_join(df2, unique_cycle_count, by="Account.Id")

d<-with_cycle_count %>%
  filter(UniqueAppCycles>1)

Transfer students

Transfer students: age >= 20 Here I filtered for the Account.ID that has at least one registration record, and then further filtered for applicants who has a registration record prior to a new round of application. Lastly, filter out the application cycles equal or prior to the first registration record so we get to applicants WITH PRIOR PS experience.

# find students who registered with a college (proxy for some university/college education)
d2<-d %>%
  group_by(Account.Id) %>%
  filter(
    "REGISTRATION" %in% Enrolment.Stage)%>% 
  arrange(Account.Id, App.Cycle, Enrolment.Stage)

# find students who has registration record in an application cycle prior to a new application record; then filter out any application cycle equal or prior to the one with registration record
transfer_d <- d2 %>%
  filter(Age>=20) %>%
  group_by(Account.Id) %>%
  mutate(apply_after_registration = any(min(App.Cycle[Enrolment.Stage == "REGISTRATION"]) < max(App.Cycle[Enrolment.Stage == "APPLICATION"]))) %>%
  filter(apply_after_registration==TRUE)  %>%
  filter(!(App.Cycle <= min(App.Cycle[Enrolment.Stage == "REGISTRATION"])))

Applicants may potentially apply in many application cycles after the application cycle in which they registered. No further filtering is needed because we only want to know whether the applicants had prior PS education, so each year they apply it’d be recorded as a unique applicant.

# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered4<-transfer_d %>%
  group_by(App.Cycle) %>%
  summarize(Transfer_Count = n_distinct(Applicant.Id))

unfiltered4<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join transfer count and all applicant count in one table
transfer_counts <- left_join(filtered4, unfiltered4, by = "App.Cycle")

# percentage of transfer students in all applicants
transfer_counts$Transfer_Percentage<-transfer_counts$Transfer_Count/transfer_counts$All_Count

# number of registered transfer students
transfer_registration <- transfer_d %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

transfer_all <- left_join(transfer_counts, transfer_registration, by = "App.Cycle")

# percentage of transfer students registered
transfer_all$Conversion_rate<-transfer_all$Registration_Count/transfer_all$Transfer_Count

kable(transfer_all)

App.Cycle	Transfer_Count	All_Count	Transfer_Percentage	Registration_Count	Conversion_rate
2018	10670	136145	0.0783723	6148	0.5761949
2019	16070	124468	0.1291095	8879	0.5525202
2020	20916	132906	0.1573744	11140	0.5326066
2021	23239	118473	0.1961544	11557	0.4973106
2022	23759	109777	0.2164297	11863	0.4993055
2023	19295	80062	0.2410007	138	0.0071521

Repeat learners

Repeat learning - age >= 20, prior PS experience at Humber. Here I filtered for Account.ID with a Humber registration prior to another application record, then filtered out application cycles prior to the first registration record.

# find applicants who registered at Humber before having another application record
repeat_d <- d2 %>%
  filter(Age>=20) %>%
  group_by(Account.Id) %>%
  mutate(has_smaller_registration = any(App.Cycle[College.Name == "HUMBER" & Enrolment.Stage == "REGISTRATION"] < max(App.Cycle[Enrolment.Stage == "APPLICATION"]))) %>%
  filter(has_smaller_registration == TRUE) %>%
  filter(!(App.Cycle <= min(App.Cycle[Enrolment.Stage == "REGISTRATION"]))) %>%
  arrange(Account.Id, App.Cycle, Enrolment.Stage)

# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered5<-repeat_d %>%
  group_by(App.Cycle) %>%
  summarize(Repeat_Count = n_distinct(Applicant.Id))

unfiltered5<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join repeat count and all applicant count in one table
repeat_counts <- left_join(filtered5, unfiltered5, by = "App.Cycle")

# percentage of repeat students in all applicants
repeat_counts$Repeat_Percentage<-repeat_counts$Repeat_Count/repeat_counts$All_Count

# number of registered repeat students
repeat_registration <- repeat_d %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

repeat_all <- left_join(repeat_counts, repeat_registration, by = "App.Cycle")

# percentage of repeat students registered
repeat_all$Conversion_rate<-repeat_all$Registration_Count/repeat_all$Repeat_Count

kable(repeat_all)

App.Cycle	Repeat_Count	All_Count	Repeat_Percentage	Registration_Count	Conversion_rate
2018	1240	136145	0.0091079	785	0.6330645
2019	1886	124468	0.0151525	1112	0.5896076
2020	2458	132906	0.0184943	1466	0.5964199
2021	2719	118473	0.0229504	1447	0.5321809
2022	2652	109777	0.0241581	1428	0.5384615
2023	2040	80062	0.0254803	9	0.0044118

Pathway Students

Pathway students - age >= 20, completed a PS program and are looking to continue in a degree program through a pathway We don’t have program completion or pathway information in OCAS, again I’m using registration record as a proxy for program completion. Using the data where “only applicant with a registration before another application, and only the app cycle after the first registration”, I filtered for applied program credential being degree

pathway_d <- transfer_d %>%
  filter(Program.Credential %in% c("DEGREE"))

# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered6<-pathway_d %>%
  group_by(App.Cycle) %>%
  summarize(Pathway_Count = n_distinct(Applicant.Id))

unfiltered6<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join pathway count and all applicant count in one table
pathway_counts <- left_join(filtered6, unfiltered6, by = "App.Cycle")

# percentage of pathway students in all applicants
pathway_counts$Pathway_Percentage<-pathway_counts$Pathway_Count/pathway_counts$All_Count

# number of registered pathway students
pathway_registration <- pathway_d %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

pathway_all <- left_join(pathway_counts, pathway_registration, by = "App.Cycle")

# percentage of pathway students registered
pathway_all$Conversion_rate<-pathway_all$Registration_Count/pathway_all$Pathway_Count

kable(pathway_all)

App.Cycle	Pathway_Count	All_Count	Pathway_Percentage	Registration_Count	Conversion_rate
2018	1576	136145	0.0115759	528	0.3350254
2019	2455	124468	0.0197239	788	0.3209776
2020	3291	132906	0.0247619	1059	0.3217867
2021	3665	118473	0.0309353	1125	0.3069577
2022	3505	109777	0.0319284	1044	0.2978602
2023	3294	80062	0.0411431	2	0.0006072

Graduate Students

Graduate students - age 22-30. Considering graduate certificate.

For anyone that apply for a graduate certificate we can safely assume they already completed a university or college degree. So I used all applicants data to begin with.

# in remaining App cycles filter only for GC program applications
graduate_d <- all_data %>%
  filter(Age >= 22 & Age <= 30)%>%
  filter(Program.Credential == "GRADUATE CERTIFICATE")

# same as before, count distinct Applicant.Id by application cycle then calculate percentage
filtered7<-graduate_d %>%
  group_by(App.Cycle) %>%
  summarize(Graduate_Count = n_distinct(Applicant.Id))

unfiltered7<-df %>%
  group_by(App.Cycle) %>%
  summarize(All_Count = n_distinct(Applicant.Id))

# Join graduate count and all applicant count in one table
graduate_counts <- left_join(filtered7, unfiltered7, by = "App.Cycle")

# percentage of graduate students in all applicants
graduate_counts$Graduate_Percentage<-graduate_counts$Graduate_Count/graduate_counts$All_Count

# number of registered graduate students
graduate_registration <- graduate_d %>%
  filter(Enrolment.Stage == "REGISTRATION") %>%
  group_by(App.Cycle) %>%
  summarize(Registration_Count = n_distinct(Applicant.Id))

graduate_all <- left_join(graduate_counts, graduate_registration, by = "App.Cycle")

# percentage of graduate students registered
graduate_all$Conversion_rate<-graduate_all$Registration_Count/graduate_all$Graduate_Count

kable(graduate_all)

App.Cycle	Graduate_Count	All_Count	Graduate_Percentage	Registration_Count	Conversion_rate
2017	16186	143455	0.1128298	6825	0.4216607
2018	16791	136145	0.1233317	6711	0.3996784
2019	16213	124468	0.1302584	6374	0.3931413
2020	17224	132906	0.1295954	6980	0.4052485
2021	15823	118473	0.1335579	5690	0.3596031
2022	13645	109777	0.1242974	4753	0.3483327
2023	9694	80062	0.1210812	13	0.0013410

Non-direct demographic category

2023-10-17