library(tidyverse)
library(ggplot2)
library(DT)

Overview

The raw Canada Labour Force Characteristics dataset used in this project is far from being able to be used for data analysis purposes. To change this several data cleaning and transformation tactics are employed. Extraneous rows are removed in lines 36, 65, 73, and 74. Columns are renamed and transformed multiple times through out the clean up process, sometimes through the use of regex. For the table views, the data is made wider via pivot_wider. In the data visualization portion subsets of the data are made longer via pivot_longer. Additionally, during the transformation process new columns are created via mutate.

Loading the data

# Load Canada unemployment csv from Github URL
#df_cs_raw <- read.csv(url("https://raw.githubusercontent.com/mattlucich/data-transformation/une/canada-unemployment.csv"))

# Load Canada unemployment csv from Github repo (if you prefer)
df_cu_raw <- read.csv("canada-unemployment.csv", skip = 9)

# Confirm we've skipped the correct amount of rows by viewing the column names
colnames(df_cu_raw)
## [1] "Geography.4"                      "Labour.force.characteristics"    
## [3] "February.2020"                    "January.2021"                    
## [5] "February.2021"                    "February.2020.to.February.2021"  
## [7] "January.2021.to.February.2021"    "January.2021.to.February.2021.1" 
## [9] "February.2020.to.February.2021.1"
# Preview the data
datatable(df_cu_raw, caption = "Raw Canada Labour Force Characteristics data")

Cleaning and transforming the data

# Remove unneeded columns
df_cu_raw <- df_cu_raw %>% select (-c(January.2021,
                         February.2020.to.February.2021,
                         January.2021.to.February.2021,
                         January.2021.to.February.2021.1,
                         February.2020.to.February.2021.1))

# Rename columns
df_cu_raw <- df_cu_raw %>% rename(province = "Geography.4",
                                  labour_stat = "Labour.force.characteristics",
                                  feb_2020 = "February.2020",
                                  feb_2021 = "February.2021")

# Filter out extraneous rows
df_cu_raw <- df_cu_raw %>% filter(feb_2020 != "")

# If province is empty fill in value from previous row
for (i in 1:10) {
  df_cu_raw <- df_cu_raw %>% mutate(province = ifelse(province=="", lag(province), province))
}

# Remove sub headers and Canada rows
df_cu_raw <- df_cu_raw %>% filter(!feb_2020 %in% c("Persons", "Percentage"))
df_cu_raw <- df_cu_raw %>% filter(!province %in% c("Canada"))

# Remove unneeded stats
remove_stats <- c("Labour force (x 1,000) 7", "Employment (x 1,000) 8", 
                    "Full-time employment (x 1,000) 9", "Part-time employment  (x 1,000) 10",
                    "Unemployment (x 1,000) 11")
df_cu_raw <- df_cu_raw %>% filter(!labour_stat %in% remove_stats)

# Make the dataframe wider (turn values from labour_stat into columns)
df_cu_raw <- df_cu_raw %>% pivot_wider(names_from = labour_stat, 
                                        values_from = c(feb_2020, feb_2021))

# Rename population columns
df_cu_raw <- df_cu_raw %>% rename(feb_2020_population = "feb_2020_Population (x 1,000) 5 6",
                                  feb_2021_population = "feb_2021_Population (x 1,000) 5 6")

# Remove extraneous two digit numbers at end of column names via Regex 
colnames(df_cu_raw) <- sub("(\\d{2}$)", "", colnames(df_cu_raw))

# Trim whitespace from column names
colnames(df_cu_raw) <- trimws(colnames(df_cu_raw))

# Convert column names to lowercase and replace blank spaces with underscores
df_cu_raw <- df_cu_raw %>% 
                rename_with(~ tolower(gsub(" ", "_", .x, fixed = TRUE)))

# Remove unnecessary commas
df_cu_raw <- as.data.frame(lapply(df_cu_raw, function(y) gsub(",", "", y)))

# Convert all columns except "province" to numeric
df_cu_raw <- df_cu_raw %>% mutate(across(!province, as.numeric))

# Check the data types of each column
str(df_cu_raw)
## 'data.frame':    10 obs. of  9 variables:
##  $ province                   : chr  "Newfoundland and Labrador" "Prince Edward Island" "Nova Scotia" "New Brunswick" ...
##  $ feb_2020_population        : num  446 132 812 646 7037 ...
##  $ feb_2020_unemployment_rate : num  12.4 8 8 7.3 4.5 5.5 5.1 6.3 7.5 5.1
##  $ feb_2020_participation_rate: num  57.4 67.5 62.5 61 64.8 65.1 66.9 68.8 70.2 65
##  $ feb_2020_employment_rate   : num  50.3 62.1 57.5 56.6 61.9 61.6 63.5 64.5 65 61.7
##  $ feb_2021_population        : num  445 133 817 648 7086 ...
##  $ feb_2021_unemployment_rate : num  15.3 9.2 8.1 8.9 6.4 9.2 6.8 7.3 9.9 6.9
##  $ feb_2021_participation_rate: num  54.9 64.4 61.9 60.8 63.6 64.4 66.1 66.7 69.1 65.3
##  $ feb_2021_employment_rate   : num  46.5 58.5 56.9 55.4 59.5 58.5 61.6 61.8 62.2 60.7
# Create YoY percentage point change columns
df_cu_raw <- df_cu_raw %>% 
        mutate(yoy_unemploy_rate_pp_change = round((feb_2021_unemployment_rate -   
                                                      feb_2020_unemployment_rate), 2),
              yoy_part_rate_pp_change = round((feb_2021_participation_rate - 
                                                 feb_2020_participation_rate), 2),
              yoy_employ_rate_pp_change = round((feb_2021_employment_rate - 
                                                   feb_2020_employment_rate), 2))

# Reorder columns for easier comparisons
df_cu <- df_cu_raw[, c(1, 2, 6, 3, 7, 10, 4, 8, 11, 5, 9, 12)]

Analysis: Canada Labour Force Characteristics

# Review the data
datatable(df_cu, caption = "Canada Labour Force Characteristics")
# Select only unemployment rate columns and province
df_cu_unemploy <- df_cu %>% select(province, feb_2020_unemployment_rate, feb_2021_unemployment_rate)

# Rename columns to improve chart formatting
df_cu_unemploy <- df_cu_unemploy %>% rename("2020" = feb_2020_unemployment_rate,
                                            "2021" = feb_2021_unemployment_rate)

# Convert to long format 
df_cu_unemploy <- df_cu_unemploy %>% pivot_longer(c("2020", "2021"), 
                                     names_to = "year", values_to = "unemployment_rate")

# YoY unemployment rate by province
ggplot(df_cu_unemploy, aes(province, unemployment_rate)) +   
  geom_bar(aes(fill = year), position = "dodge", stat="identity") + 
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("YoY Unemployment by Province") + 
  ylab("Unemployment Rate") + xlab("Province")

# Select only participation rate columns and province
df_cu_part <- df_cu %>% select(province, feb_2020_participation_rate, feb_2021_participation_rate)

# Rename columns to improve chart formatting
df_cu_part <- df_cu_part %>% rename("2020" = feb_2020_participation_rate,
                                    "2021" = feb_2021_participation_rate)

# Convert to long format
df_cu_part <- df_cu_part %>% pivot_longer(c("2020", "2021"), 
                                     names_to = "year", values_to = "participation_rate")

# YoY Participation rate by province
ggplot(df_cu_part, aes(province, participation_rate)) +   
  geom_bar(aes(fill = year), position = "dodge", stat="identity") + 
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("YoY Participation by Province") + 
  ylab("Participation Rate") + xlab("Province")

Conclusion

Due to the relatively labor-intensive data cleaning and transformation process, the data analysis portion of this dataset was cursory. Though attention to detail was put into considering what columns to include in the final table view and how to order them. The data visualizations provide a clear year over year picture of how crucial labour force characteristics have changed by province. Recommendations for further analysis include focusing on provinces with both the largest and smallest unemployment rate changes year over year then research factors for possible causes. A similar analysis should be conducted for participation rate as well.

References

Statistics Canada (StatCan) "Labour force characteristics by province, monthly, seasonally adjusted"


Creative Commons License
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.