For Canadians looking at job opportunities, it is nice to have an idea how the job market will perform in the future for your chosen occupation or industry. Many countries have open data sets that offer this kind of data. In these exercises provided by Lauro Silva we will use R to analyze the future Canadian job prospects through 2024.
Blog: http://www.r-exercises.com/2017/06/01/canada-labour-maket-future-perspectives-exercises/#comment-6427
Datasource: http://open.canada.ca/data/en/dataset/e80851b8-de68-43bd-a85c-c72e1b3a3890?sort=time_descend&pagelimit=100
Data Downloaded: 2017-06-02
Download and read all data sets into R from Canadian Occupational Projection System (COPS) - 2015 to 2024 projections.
library(tidyverse)
library(forcats)
I could load them all individually but R has bulk load functionality.
Note: I downloaded all of the data locally before running code below. Use the data source at the top to get the most current data.
file_list <- list.files(".", "*.csv")
for (i in 1:length(file_list)) assign(file_list[i], read_csv(file_list[i]))
Some of the dataframes that were loaded have bad names. This section will give them better names and will also add a categorical variable to identify the dataset for future use.
employment <- mutate(Employment_emploi_2015_2024.csv, data_set = "Employment")
employment_growth <- mutate(Employment_Growth_croissance_emploi_2015_2024.csv,
data_set = "Employment Growth")
flmc_cfmt <- mutate(`FLMC_CFMT_2015-2024.csv`, data_set = "FLMC_CFMT")
immigration <- mutate(immigration_2015_2024.csv, data_set = "Immigration")
industrial_employment <- mutate(`Industrial_employment_emploi_par_secteur_2015-2024.csv`,
data_set = "Industrial Employment")
industrial_growth <- mutate(`Industrial_employment_Growth_Croissant_du_emploi_par_secteur_2015-2024.csv`,
data_set = "Industrial Growth")
industrial_jo_pe <- mutate(JO_PE_2015_2024.csv, data_set = "JO_PE")
industrial_other_replacements <- mutate(Industrial_other_replacement_autre_remplacement_par_secteur_2015_2024.csv,
data_set = "Other Replacement")
industrial_retirements <- mutate(`Industrial_retirements_retraites_par_secteur_2015-2024.csv`,
data_set = "Retirements")
job_openings_projected_demand <- mutate(JO_PE_2015_2024.csv, data_set = "Job Openings")
job_openings_projected_demand$`2014` <- as.integer(NA)
job_seekers <- mutate(Job_Seekers_Chercheurs_emploi_2015_2024.csv, data_set = "Job Seekers")
labor_force_population <- mutate(`Labour_Force_Population_active_2015-2024.csv`,
data_set = "Active Labor Force")
labor_force_education <- mutate(`Labour_Force_Population_Active_Education_2015-2024.csv`,
data_set = "Labor Force Education")
occupational_groupings <- mutate(`NOC_occ_grouping_eng_2015-2024.csv`, data_set = "Occupational Groupings")
other_replacements <- mutate(Other_replacement_autre_remplacement_2015_2024.csv,
data_set = "Other Replacements")
other_seekers <- mutate(Other_seekers_autres_chercheurs_2015_2024.csv, data_set = "Other Seekers")
participation_rate_01 <- mutate(`Participation_rate_Taux_d_activite_2015-2024.csv`,
data_set = "Participation Rate - Normal")
participation_rate_02 <- mutate(`Participation_rate_Taux_d_activite_Education_2015-2024.csv`,
data_set = "Participation Rate - Education")
population <- mutate(`Population_2015-2024.csv`, data_set = "Population")
retirements <- mutate(Retirements_retraites_2015_2024.csv, data_set = "Retirements")
retirement_rates <- mutate(Retirement_rates_Taux_de_retraite_2015_2024.csv,
data_set = "Retirement Rates")
rlmc_crmt <- mutate(RLMC_CRMT_2015_2024.csv, data_set = "RLMC_CRMT")
school_leavers <- mutate(School_Leavers_Sortants_scolaires_2015_2024.csv, data_set = "School Leavers")
summary_data <- mutate(Summary_sommaire_2015_2024.csv, data_set = "Summary")
rm(Employment_emploi_2015_2024.csv, Employment_Growth_croissance_emploi_2015_2024.csv,
`FLMC_CFMT_2015-2024.csv`, immigration_2015_2024.csv, `Industrial_employment_emploi_par_secteur_2015-2024.csv`,
`Industrial_employment_Growth_Croissant_du_emploi_par_secteur_2015-2024.csv`,
JO_PE_2015_2024.csv, Job_Seekers_Chercheurs_emploi_2015_2024.csv, `Labour_Force_Population_active_2015-2024.csv`,
`Labour_Force_Population_Active_Education_2015-2024.csv`, `NOC_occ_grouping_eng_2015-2024.csv`,
`NOC_occ_grouping_fr_2015-2024.csv`, Industrial_other_replacement_autre_remplacement_par_secteur_2015_2024.csv,
`Industrial_retirements_retraites_par_secteur_2015-2024.csv`, `Participation_rate_Taux_d_activite_2015-2024.csv`,
`Participation_rate_Taux_d_activite_Education_2015-2024.csv`, `Population_2015-2024.csv`,
Retirement_rates_Taux_de_retraite_2015_2024.csv, Retirements_retraites_2015_2024.csv,
RLMC_CRMT_2015_2024.csv, School_Leavers_Sortants_scolaires_2015_2024.csv,
Summary_sommaire_2015_2024.csv, Industrial_JO_PE_par_secteur_2015_2024.csv)
Load library tidyr. Use gather() to rearrange any occupation related data set that present time series data into a tidy data format.
I have already loaded tidyr by loading the tidyverse library.
I interpret “occupational related” to mean any dataset in this group that is in a yearly time series and contains an occupation_name variable.
Note: I am sure there is a more elegant approach to this but a quick search of google was fruitless so I’ll just plow through it.
These are the dataframes that were identified as “occupational related data” employment, employment growth, immigration, jo_pe, job seekers other_replacements, other_seekers, retirement rates, retirements, school_leavers
occ_related_data <- rbind(employment, employment_growth, immigration, job_seekers,
other_replacements, other_seekers, retirement_rates, retirements, school_leavers,
job_openings_projected_demand)
2014 column to integer typeocc_related_data$`2014` <- as.integer(occ_related_data$`2014`)
## Warning: NAs introduced by coercion
occ_related_data$data_set <- as.factor(occ_related_data$data_set)
occ_related_data$Code <- as.factor(occ_related_data$Code)
occ_related_data$Occupation_Name <- as.factor(occ_related_data$Occupation_Name)
Use gather() to rearrange ALL other occupation related data sets that present time series data into a tidy data format, and pile out them in a unique data frame. Note: gather() is one of my favorite functions since I work with economic data quite a bit. Economic data tends to be presented in the wide format, but is not easy to work with for analysis purposes so we have to make it skinny.
glimpse(occ_related_data)
## Observations: 3,080
## Variables: 14
## $ data_set <fctr> Employment, Employment, Employment, Employmen...
## $ Code <fctr> N0000, SL_M, SL_A, SL_B, SL_C, SL_D, ST_SM, S...
## $ Occupation_Name <fctr> All Occupations, Management Occupations, Occu...
## $ 2014 <int> 17802300, 1610800, 3370500, 5920500, 4940800, ...
## $ 2015 <dbl> 17945100, 1614500, 3418600, 5963600, 4975700, ...
## $ 2016 <dbl> 18105400, 1620100, 3472000, 6018300, 5007100, ...
## $ 2017 <dbl> 18280600, 1628500, 3527400, 6078900, 5041100, ...
## $ 2018 <dbl> 18465000, 1638200, 3582200, 6142300, 5078500, ...
## $ 2019 <dbl> 18598000, 1644500, 3627000, 6189200, 5101400, ...
## $ 2020 <dbl> 18716400, 1649700, 3668500, 6231000, 5120500, ...
## $ 2021 <dbl> 18836000, 1655100, 3709700, 6273000, 5140700, ...
## $ 2022 <dbl> 18971000, 1661500, 3752700, 6320100, 5165900, ...
## $ 2023 <dbl> 19110700, 1668700, 3796500, 6368100, 5192500, ...
## $ 2024 <dbl> 19252000, 1676400, 3840100, 6416700, 5219900, ...
occ_data_gathered <- gather(occ_related_data, key = "Year", value = "Value",
-data_set, -Code, -Occupation_Name)
occ_data_gathered$Year <- as.integer(occ_data_gathered$Year)
occ_data_gathered$Value <- as.integer(occ_data_gathered$Value)
glimpse(occ_data_gathered)
## Observations: 33,880
## Variables: 5
## $ data_set <fctr> Employment, Employment, Employment, Employmen...
## $ Code <fctr> N0000, SL_M, SL_A, SL_B, SL_C, SL_D, ST_SM, S...
## $ Occupation_Name <fctr> All Occupations, Management Occupations, Occu...
## $ Year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014...
## $ Value <int> 17802300, 1610800, 3370500, 5920500, 4940800, ...
Remove lines that present NA values, columns in French, and the “X” in front every year.
This is mostly complete. Just need to remove the NA’s.
occ_data_complete <- na.omit(occ_data_gathered)
summary(occ_data_complete)
## data_set Code
## Employment : 3388 N0000 : 106
## Immigration : 3388 N0010 : 106
## Other Replacements: 3388 N0111 : 106
## Retirement Rates : 3388 N0112 : 106
## Retirements : 3388 N0113 : 106
## School Leavers : 3388 N0121 : 106
## (Other) :12320 (Other):32012
## Occupation_Name Year
## Legislators and senior management : 212 Min. :2014
## Accommodation service managers : 106 1st Qu.:2017
## Accounting and related clerks : 106 Median :2019
## Accounting technician and bookkeepers: 106 Mean :2019
## Administrative assistants : 106 3rd Qu.:2022
## Administrative officers : 106 Max. :2024
## (Other) :31906
## Value
## Min. : -67900
## 1st Qu.: 100
## Median : 500
## Mean : 28014
## 3rd Qu.: 1900
## Max. :19252000
##
NA’s are gone.
occupational_final <- occ_data_complete
glimpse(occupational_final)
## Observations: 32,648
## Variables: 5
## $ data_set <fctr> Employment, Employment, Employment, Employmen...
## $ Code <fctr> N0000, SL_M, SL_A, SL_B, SL_C, SL_D, ST_SM, S...
## $ Occupation_Name <fctr> All Occupations, Management Occupations, Occu...
## $ Year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014...
## $ Value <int> 17802300, 1610800, 3370500, 5920500, 4940800, ...
Let’s do the same with Industry data sets. Start by taking one of the industrial data sets that present data in a time series and use gather().
I am going to interpret this to mean that any dataframe that contains an Industry column is in the Industry group.
We have already created individual Industry datasets with an identifying variable so I’ll jump ahead a bit to binding all the Industrial datasets together.
Exercises 6 & 7 will also be completed in this section.
industries <- rbind(industrial_employment, industrial_growth, industrial_other_replacements,
industrial_retirements)
# Make 2014 an integer
industries$`2014` <- as.integer(industries$`2014`)
## Warning: NAs introduced by coercion
data_set column at front.industrial_data <- industries %>% select(-Secteur) %>% select(data_set, everything())
(dim(industrial_data))
## [1] 136 14
So we start with 136x14 dataset
Don’t worry about the NA’s introduced by coercion message. This is actually one of the cool things about R, it does an automatic conversion instead of throwing an error.
industrial_gathered <- gather(industrial_data, key = "Year", value = "Value",
-data_set, -Code, -Industry)
industrial_gathered$Year <- as.integer(industrial_gathered$Year)
(dim(industrial_gathered))
## [1] 1496 5
Now we trimmed it down to 1496x5. Very good.
summary() to see if there are NA’ssummary(industrial_gathered)
## data_set Code Industry Year
## Length:1496 Length:1496 Length:1496 Min. :2014
## Class :character Class :character Class :character 1st Qu.:2016
## Mode :character Mode :character Mode :character Median :2019
## Mean :2019
## 3rd Qu.:2022
## Max. :2024
##
## Value
## Min. : -12100
## 1st Qu.: 1400
## Median : 5500
## Mean : 287874
## 3rd Qu.: 67650
## Max. :19252000
## NA's :34
There are some NA’s in Value. Now is the time to drop them.
industrial_gathered <- na.omit(industrial_gathered)
summary(industrial_gathered)
## data_set Code Industry Year
## Length:1462 Length:1462 Length:1462 Min. :2014
## Class :character Class :character Class :character 1st Qu.:2016
## Mode :character Mode :character Mode :character Median :2019
## Mean :2019
## 3rd Qu.:2022
## Max. :2024
## Value
## Min. : -12100
## 1st Qu.: 1400
## Median : 5500
## Mean : 287874
## 3rd Qu.: 67650
## Max. :19252000
NA’s are gone
industrial_final <- industrial_gathered
industrial_final$data_set <- as_factor(industrial_final$data_set)
industrial_final$Code <- as.factor(industrial_final$Code)
industrial_final$Industry <- as.factor(industrial_final$Industry)
industrial_final formatglimpse(industrial_final)
## Observations: 1,462
## Variables: 5
## $ data_set <fctr> Industrial Employment, Industrial Employment, Indust...
## $ Code <fctr> IN00, IN01, IN02, IN03, IN04, IN05, IN06, IN07, IN08...
## $ Industry <fctr> All Industries, Agriculture, Forestry and Logging wi...
## $ Year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014,...
## $ Value <int> 17802200, 305100, 49100, 15900, 106700, 77200, 123700...
Looks good
Do the same procedure of Exercise 5 to all other industries data sets. Pile out them in a new data frame. ###Done previously
Remove NAs, and French columns. In addition, set year and value as numeric, and take a look at your new tidy data set about industries. ###Done previously
Find out the industries that have the lowest number of jobseekers, and create a new data set by sub setting the previous one.
Since this specifically mentions industries I am not going to use the job_seekers df because job_seekers refers to occupations, not industries.
I’ll use industrial_final as a base df. None of these data sets specifically mentions job seekers so I am going to focus on Industrial Growth.
NOTE: I use eight since the most popular color palettes have eight colors.
bottom_8_industries <- industrial_final %>% filter(data_set == "Industrial Growth") %>%
group_by(Industry) %>% summarize(job_seeker_total = sum(Value)) %>% arrange(job_seeker_total) %>%
head(8) %>% select(Industry)
## Warning: package 'bindrcpp' was built under R version 3.3.3
bottom_8_industries
## # A tibble: 8 x 1
## Industry
## <fctr>
## 1 Agriculture
## 2 Paper Manufacturing
## 3 Printing and Related Support Activities
## 4 Computer, Electronic and Electrical Products
## 5 Fishing, Hunting and Trapping
## 6 Support Activities for Mining and Oil and Gas Extraction
## 7 Other Manufacturing
## 8 Forestry and Logging with support activities
bottom_8_subset <- industrial_final %>% filter(data_set == "Industrial Growth") %>%
filter(Industry %in% bottom_8_industries$Industry)
bottom_8_subset$Code <- as.character(bottom_8_subset$Code)
bottom_8_subset$Code <- as.factor(bottom_8_subset$Code)
bottom_8_subset$Industry <- as.character(bottom_8_subset$Industry)
bottom_8_subset$Industry <- as.factor(bottom_8_subset$Industry)
glimpse(bottom_8_subset)
## Observations: 80
## Variables: 5
## $ data_set <fctr> Industrial Growth, Industrial Growth, Industrial Gro...
## $ Code <fctr> IN01, IN02, IN03, IN06, IN11, IN12, IN16, IN19, IN01...
## $ Industry <fctr> Agriculture, Forestry and Logging with support activ...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2016,...
## $ Value <int> -2700, 200, -900, -12100, -1300, -600, -2300, 800, -1...
Ten years of data for eight industries = 80 rows :-)
Plot the recently create data set using a line for each industry.
ggplot(bottom_8_subset, aes(x = Year, y = Value, color = Industry)) + geom_line() +
ylim(-3000, 3000) + scale_fill_brewer(palette = "Set1")
## Warning: Removed 1 rows containing missing values (geom_path).
### Done
All of these lean towards stability in the future though each industry requires fewer people than in the past except for Agriculture, but demand is low in agriculture also.
The mid-point is zero so none of these industries are expected to grow by even one person so those industries are, in fact, in decline. Agriculture can be a labor-intensive industry so growth of 2000 or less, year-on-year, until 2024 indicates a more mechanized approach to agriculture.
Create a similar plot for the top 5 occupations in terms of low amount of jobseekers.
There is a specific dataset for job seekers that we will use for this analysis. It has to be processed a little to be usable.
NOTE: Since this explicitly mentions it wants five occupations I will use top 5 instead of top 8.
job_seekers$`2014` <- as.integer(job_seekers$`2014`)
## Warning: NAs introduced by coercion
# job_seekers$Code <- as.factor(job_seekers$Code)
job_seekers$Occupation_Name <- as.factor(job_seekers$Occupation_Name)
job_seekers <- job_seekers %>% select(-Code, -Nom_de_la_profession)
job_seekers_gathered <- gather(job_seekers, key = "Year", value = "Value", -Occupation_Name)
job_seekers_gathered$Year <- as.integer(job_seekers_gathered$Year)
job_seekers_gathered$Value <- as.integer(job_seekers_gathered$Value)
# There are some NA's that need to be trimmed
job_seekers_gathered <- na.omit(job_seekers_gathered)
summary(job_seekers_gathered)
## Occupation_Name Year
## Legislators and senior management : 20 Min. :2015
## Accommodation service managers : 10 1st Qu.:2017
## Accounting and related clerks : 10 Median :2020
## Accounting technician and bookkeepers: 10 Mean :2020
## Administrative assistants : 10 3rd Qu.:2022
## Administrative officers : 10 Max. :2024
## (Other) :3010
## Value
## Min. : 0
## 1st Qu.: 700
## Median : 1300
## Mean : 7565
## 3rd Qu.: 2800
## Max. :608000
##
job_seekers_final <- job_seekers_gathered
glimpse(job_seekers_final)
## Observations: 3,080
## Variables: 3
## $ Occupation_Name <fctr> All Occupations, Management Occupations, Occu...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Value <int> 563600, 62200, 121000, 185400, 141200, 53700, ...
js_bottom_5 <- job_seekers_final %>% group_by(Occupation_Name) %>% summarise(js_sum = sum(Value)) %>%
arrange(js_sum) %>% head(5)
js_bottom_5$Occupation_Name <- as.character(js_bottom_5$Occupation_Name)
js_bottom_5$Occupation_Name <- as.factor(js_bottom_5$Occupation_Name)
js_bottom_5
## # A tibble: 5 x 2
## Occupation_Name js_sum
## <fctr> <int>
## 1 Service station attendants 200
## 2 Other workers in fishing and trapping and hunting occupations 1100
## 3 Instructors of persons with disabilities 1400
## 4 Bartenders 1600
## 5 Opticians 1700
fewest_job_seekers <- job_seekers_final %>% filter(Occupation_Name %in% js_bottom_5$Occupation_Name)
fewest_job_seekers$Occupation_Name <- as.factor(fewest_job_seekers$Occupation_Name)
fjs_1 <- fewest_job_seekers %>% mutate(Occupation_Name = fct_recode(Occupation_Name,
`Other Workers: Fishing, Hunting, Trapping` = "Other workers in fishing and trapping and hunting occupations"))
fjs_1$jittered_values <- jitter(fjs_1$Value)
ggplot(fjs_1, aes(x = Year, y = jittered_values, color = Occupation_Name)) +
geom_line() + ylim(-100, 300) + scale_y_continuous(breaks = seq(from = -50,
to = 300, by = 50)) + ylab("Per Year Job Growth") + scale_fill_brewer(palette = "Set1")
These exercises should be thought of as similar to etudes in music. While there is some insight gained from the excersizes it is primarily to practice techniques that are commonly used on economic data such as being comfortable with the gather() function and plotting out the results for visual inspection.
The overall data set is very deep and a lot of useful insight can be gained by a more thorough analysis. I look forward to Mr. Silva’s next set of exercises.