Relationship between post secondary graduates and job openings

Author

Richard Martin

Code
#constants----------------------
letters <- "[:alpha:]"
#functions------------------------------
apply_props <- function(tbbl, val){
  tbbl|>
    mutate(count=prop*val)
}
my_dt <- function(tbbl) {
  DT::datatable(tbbl,
    filter = 'top',
    extensions = "Buttons",
    rownames = FALSE,
    options = list(
      columnDefs = list(list(className = "dt-center", targets = "_all")),
      paging = TRUE,
      scrollX = TRUE,
      scrollY = TRUE,
      searching = TRUE,
      ordering = TRUE,
      dom = "Btip",
      buttons = list(
        list(extend = "csv", filename = "some_file_name"),
        list(extend = "excel", filename = "some_file_name")
      ),
      pageLength = 20
    )
  )
}
#read in the data-------------------------------------

cip_counts_col_names <- unlist(as.vector(read_csv(here("data","cip_counts_bc.csv"), skip=12, n_max = 1, col_names = FALSE)))

cip_counts <- read_csv(here("data","cip_counts_bc.csv"), skip=14, na = "..", col_names = cip_counts_col_names)|>
  mutate(CIP=ex_between(`Field of study 5`,"[","]"),
         CIP=str_remove_all(CIP, letters),
         CIP=str_pad(CIP, width=5, side = "left", pad = "0"),
         field_of_study=word(`Field of study 5`, sep="\\["), .before = everything())|>
  select(-`Field of study 5`)|>
  pivot_longer(cols = starts_with("2"))|>
  group_by(CIP, field_of_study)|>
  summarise(mean_grads=mean(value, na.rm = TRUE))

cip_noc_long <- vroom::vroom(here("data","cip_2_noc_canada.csv"), skip = 13)[-1,]
colnames(cip_noc_long)[1] <- "field_of_study"
cip_noc_long <- cip_noc_long|>
  janitor::remove_empty("cols")|>
  pivot_longer(cols=-field_of_study, names_to = "noc", values_to = "count")|>
  mutate(count=as.numeric(str_remove_all(count,",")),
        CIP=str_sub(field_of_study, 1, 5),
        field_of_study=str_sub(field_of_study, 7))

jo <- readxl::read_excel(here("data","job_openings_occupation.xlsx"), skip = 3)|>
  janitor::remove_constant()|>
  filter(NOC!="#T",
         Variable=="Job Openings",
         `Geographic Area`=="British Columbia"
         )|>
  select(-Variable, -`Geographic Area`)|>
  mutate(NOC=str_remove(NOC,"#"))|>
  unite("noc", NOC, Description, sep=" ")|>
  pivot_longer(cols = starts_with("2"))|>
  group_by(noc)|>
  summarize(mean_job_openings=mean(value, na.rm = TRUE))

paths_to <- readxl::read_excel(here("data","top_paths_to_and_from.xlsx"), sheet = "Paths to Occupation all ages")|>
  group_by(`For workers in this occupation`)|>
  slice_max(`this proportion`, n=1, with_ties = FALSE)|>
  select(noc=`For workers in this occupation`, path_narrowness=`this proportion`)

paths_from <- readxl::read_excel(here("data","top_paths_to_and_from.xlsx"), sheet = "Paths from Education all ages")|>
  group_by(`in this field of study`)|>
  slice_max(`this proportion`, n=1, with_ties = FALSE)|>
  summarize(path_narrowness=mean(`this proportion`, na.rm=TRUE))|>
  mutate(CIP=str_sub(`in this field of study`, 1,5), .keep="unused")


#process the data------------------------------------

supply_by_occupation <- cip_noc_long|>
  group_by(CIP, field_of_study)|>
  mutate(prop=count/sum(count, na.rm = TRUE))|>
  select(-count)|>
  nest()|>
  inner_join(cip_counts, by="CIP")|>
  mutate(data=map2(data, mean_grads, apply_props))|>
  select(data)|>
  unnest(data)|>
  group_by(noc)|>
  summarize(supply_by_occupation=sum(count, na.rm = TRUE))

demand_by_field_of_study <- cip_noc_long|>
  group_by(noc)|>
  mutate(prop=count/sum(count, na.rm = TRUE))|>
  select(-count)|>
  nest()|>
  inner_join(jo, by="noc")|>
  mutate(data=map2(data, mean_job_openings, apply_props))|>
  select(data)|>
  unnest(data)|>
  group_by(CIP, field_of_study)|>
  summarize(demand_by_field_of_study=sum(count, na.rm = TRUE))

by_field_of_study <- inner_join(demand_by_field_of_study, cip_counts, by="CIP")|>
  ungroup()|>
  mutate(excess_demand=demand_by_field_of_study - mean_grads)|>
  inner_join(paths_from)

by_occupation <- inner_join(supply_by_occupation, jo)|>
  mutate(excess_demand=mean_job_openings - supply_by_occupation)|>
  inner_join(paths_to)

by_occupation_scatter <- ggplot(by_occupation, aes(excess_demand, 
                                                   path_narrowness,
                                                   colour=path_narrowness,
                                                   text=noc))+
  geom_vline(xintercept = 0)+
  geom_point()+
  scale_y_continuous(trans="log10")+
  scale_colour_viridis_c()+
  labs(x="Excess Supply |                                                                                                        Excess Demand",
       y="Path Narrowness")

by_field_of_study_scatter <- ggplot(by_field_of_study, aes(excess_demand, 
                                                           path_narrowness, 
                                                           colour=path_narrowness,
                                                           text=field_of_study.x))+
  geom_vline(xintercept = 0)+
  geom_point()+
  scale_y_continuous(trans="log10")+
  scale_colour_viridis_c()+
  labs(x="Excess Supply    |                                                                                                        Excess Demand",
       y="Path Narrowness")

by_occupation_col <- by_occupation|>
  mutate(noc=fct_reorder(noc, path_narrowness))|>
  slice_max(abs(excess_demand), n=40)|>
  ggplot(aes(excess_demand, 
             noc, 
             fill=path_narrowness, 
             text=paste0(
               noc,
               "\n Excess Demand = ",
               round(excess_demand),
               "\n Path narrowness = ",
               round(path_narrowness,3))))+
  geom_col()+
  labs(fill="Path Narrowness",
       x="Excess demand",
       y=NULL,
       title="Excess demand by occupation")+
  theme_minimal()+
  scale_fill_viridis_c()+
  labs(x="Excess demand")

by_field_of_study_col <- by_field_of_study|>
  slice_max(abs(excess_demand), n=40)|>
  ggplot(aes(excess_demand, 
             fct_reorder(field_of_study.x, path_narrowness), 
             fill=path_narrowness,
              text=paste0(
               field_of_study.x,
               "\n Excess Demand = ",
               round(excess_demand),
               "\n Path narrowness = ",
               round(path_narrowness,3))))+
  geom_col()+
  labs(fill="Path Narrowness",
       x="Excess demand",
       y=NULL,
       title="Excess demand by field of study")+
   scale_fill_viridis_c()+
  theme_minimal()+
  scale_x_continuous(labels = scales::comma)

Introduction

A vast majority of British Columbia job openings over the next ten years are expected to require some level of post secondary training. The point of this exercise is to investigate how well matched the predicted supply of post secondary graduates is to these job openings.

Main assumptions:

  1. Statistics Canada table 37-10-0183-01 provides counts of post secondary graduates by detailed field of study (CIP counts) over the period 2010-2021. We are going to assume that the annual supply of graduates will continue over the forecast period at the historic mean.

  2. Statistics Canada table 98-10-0403-01 provides counts of employment by detailed field of study and occupation (CIP-NOC counts) based on the Canadian 2021 Census. From this table we can derive proportions:

    • For a given field of study, what proportion of graduates (historically) have ended up in occupation \(X\). We multiply the post secondary graduate counts by these proportions, allocating new graduates to occupations according to the historic proportions. When summed across all fields of study this represents a prediction of the supply of new graduates by occupation.

    • For a given occupation, what proportion of workers (historically) have a highest attainment in field of study \(Y\). We multiply the LMO job openings by these proportions, allocating job openings to fields of study according to the historic proportions. When summed across all occupations this represents a prediction of the demand of new graduates by field of study.

We assume that the above proportions are stable and will continue to hold over the forecast period.

Excess demand and supply

By occupation: once we have the prediction of supply of new graduates by occupation, we can compare this to the LMO job openings (by occupation) to derive a measure of excess demand or supply. Of particular concern are occupations where the path to the occupation is narrow; occupations for which an imbalance would be expected to persist.

By field of study: once we have the prediction of demand for new graduates by field of study, we can compare this to the counts of new graduates (by field of study) to derive a measure of excess demand or supply. Of particular concern are fields of study where the path from the field of study is narrow; fields of study where an imbalance would be expected to persist.

Note that in no sense are the historic proportions the “right” proportions, nor do we expect the labour market to remain unbalanced when paths are wide. For wide path occupations or fields of study, any imbalance would likely be ephemeral with the current CIP-NOC proportions adjusting to clear the market.

What is Path Narrowness?

In other work we have quantified the narrowness of the path

  1. from education
  2. to occupation

The measure of path narrowness we utilize is the proportion coming from the largest contributor. e.g. 

  1. For people who’s highest attainment is in optometry, 91% end up working as optometrists: Narrowness = .91
  2. For optometrists, 96% have a highest attainment in optometry: Narrowness = .96

In other words, a vast majority follow the path optometry -> optometrist, but there are leakages out of optometry (9% choose an alternative occupation) and leakages into optometrists (for 4% their highest attainment is not their degree in optometry.)

At the other end of the spectrum, for those who’s highest attainment is a Bachelor’s degree in liberal arts the modal occupation is retail sales: Narrowness = .04

Narrowness of path is a measure of rigidity in the labour market: imbalances between demand and supply are more likely to persist when paths are narrow.

Missing data:

Note that the post secondary graduate data is incomplete. Below are roughly 100 fields of study with zero completions over a 12 year period.

Code
my_dt(cip_counts|>filter(mean_grads==0 | is.na(mean_grads)))

By occupation

Code
ggplotly(by_occupation_scatter, tooltip="text")

Now lets look for the largest difference in supply (new graduates) and demand (job openings) by occupation. The colour of the graph represents the path narrowness associated with the occupation.

Code
ggplotly(by_occupation_col, tooltip="text")|>
  plotly::config(displayModeBar = FALSE)

The by occupation data:

Code
by_occupation|>
  arrange(desc(excess_demand))|>
  my_dt()|>
  DT::formatRound(columns=c('supply_by_occupation', 'mean_job_openings', 'excess_demand'), digits=0)

By field of study

Code
ggplotly(by_field_of_study_scatter, tooltip="text")

Now lets look for the largest difference in supply (new graduates) and demand (job openings) by occupation. The colour of the graph represents the path narrowness associated with the occupation.

Code
ggplotly(by_field_of_study_col, tooltip="text")|>
  plotly::config(displayModeBar = FALSE)

The by field of study data:

Code
by_field_of_study|>
  select(CIP, field_of_study=field_of_study.x, demand_by_field_of_study, mean_grads, excess_demand, path_narrowness)|>
  arrange(desc(excess_demand))|>
  my_dt()|>
  DT::formatRound(columns=c("demand_by_field_of_study", "mean_grads", "excess_demand"), digits=0)