library(tidyverse)
library(ggplot2)
library(DT)
The raw Canada Labour Force Characteristics dataset used in this project is far from being able to be used for data analysis purposes. To change this several data cleaning and transformation tactics are employed. Extraneous rows are removed in lines 36, 65, 73, and 74. Columns are renamed and transformed multiple times through out the clean up process, sometimes through the use of regex. For the table views, the data is made wider via pivot_wider. In the data visualization portion subsets of the data are made longer via pivot_longer. Additionally, during the transformation process new columns are created via mutate.
# Load Canada unemployment csv from Github URL
#df_cs_raw <- read.csv(url("https://raw.githubusercontent.com/mattlucich/data-transformation/une/canada-unemployment.csv"))
# Load Canada unemployment csv from Github repo (if you prefer)
df_cu_raw <- read.csv("canada-unemployment.csv", skip = 9)
# Confirm we've skipped the correct amount of rows by viewing the column names
colnames(df_cu_raw)
## [1] "Geography.4" "Labour.force.characteristics"
## [3] "February.2020" "January.2021"
## [5] "February.2021" "February.2020.to.February.2021"
## [7] "January.2021.to.February.2021" "January.2021.to.February.2021.1"
## [9] "February.2020.to.February.2021.1"
# Preview the data
datatable(df_cu_raw, caption = "Raw Canada Labour Force Characteristics data")
# Remove unneeded columns
df_cu_raw <- df_cu_raw %>% select (-c(January.2021,
February.2020.to.February.2021,
January.2021.to.February.2021,
January.2021.to.February.2021.1,
February.2020.to.February.2021.1))
# Rename columns
df_cu_raw <- df_cu_raw %>% rename(province = "Geography.4",
labour_stat = "Labour.force.characteristics",
feb_2020 = "February.2020",
feb_2021 = "February.2021")
# Filter out extraneous rows
df_cu_raw <- df_cu_raw %>% filter(feb_2020 != "")
# If province is empty fill in value from previous row
for (i in 1:10) {
df_cu_raw <- df_cu_raw %>% mutate(province = ifelse(province=="", lag(province), province))
}
# Remove sub headers and Canada rows
df_cu_raw <- df_cu_raw %>% filter(!feb_2020 %in% c("Persons", "Percentage"))
df_cu_raw <- df_cu_raw %>% filter(!province %in% c("Canada"))
# Remove unneeded stats
remove_stats <- c("Labour force (x 1,000) 7", "Employment (x 1,000) 8",
"Full-time employment (x 1,000) 9", "Part-time employment (x 1,000) 10",
"Unemployment (x 1,000) 11")
df_cu_raw <- df_cu_raw %>% filter(!labour_stat %in% remove_stats)
# Make the dataframe wider (turn values from labour_stat into columns)
df_cu_raw <- df_cu_raw %>% pivot_wider(names_from = labour_stat,
values_from = c(feb_2020, feb_2021))
# Rename population columns
df_cu_raw <- df_cu_raw %>% rename(feb_2020_population = "feb_2020_Population (x 1,000) 5 6",
feb_2021_population = "feb_2021_Population (x 1,000) 5 6")
# Remove extraneous two digit numbers at end of column names via Regex
colnames(df_cu_raw) <- sub("(\\d{2}$)", "", colnames(df_cu_raw))
# Trim whitespace from column names
colnames(df_cu_raw) <- trimws(colnames(df_cu_raw))
# Convert column names to lowercase and replace blank spaces with underscores
df_cu_raw <- df_cu_raw %>%
rename_with(~ tolower(gsub(" ", "_", .x, fixed = TRUE)))
# Remove unnecessary commas
df_cu_raw <- as.data.frame(lapply(df_cu_raw, function(y) gsub(",", "", y)))
# Convert all columns except "province" to numeric
df_cu_raw <- df_cu_raw %>% mutate(across(!province, as.numeric))
# Check the data types of each column
str(df_cu_raw)
## 'data.frame': 10 obs. of 9 variables:
## $ province : chr "Newfoundland and Labrador" "Prince Edward Island" "Nova Scotia" "New Brunswick" ...
## $ feb_2020_population : num 446 132 812 646 7037 ...
## $ feb_2020_unemployment_rate : num 12.4 8 8 7.3 4.5 5.5 5.1 6.3 7.5 5.1
## $ feb_2020_participation_rate: num 57.4 67.5 62.5 61 64.8 65.1 66.9 68.8 70.2 65
## $ feb_2020_employment_rate : num 50.3 62.1 57.5 56.6 61.9 61.6 63.5 64.5 65 61.7
## $ feb_2021_population : num 445 133 817 648 7086 ...
## $ feb_2021_unemployment_rate : num 15.3 9.2 8.1 8.9 6.4 9.2 6.8 7.3 9.9 6.9
## $ feb_2021_participation_rate: num 54.9 64.4 61.9 60.8 63.6 64.4 66.1 66.7 69.1 65.3
## $ feb_2021_employment_rate : num 46.5 58.5 56.9 55.4 59.5 58.5 61.6 61.8 62.2 60.7
# Create YoY percentage point change columns
df_cu_raw <- df_cu_raw %>%
mutate(yoy_unemploy_rate_pp_change = round((feb_2021_unemployment_rate -
feb_2020_unemployment_rate), 2),
yoy_part_rate_pp_change = round((feb_2021_participation_rate -
feb_2020_participation_rate), 2),
yoy_employ_rate_pp_change = round((feb_2021_employment_rate -
feb_2020_employment_rate), 2))
# Reorder columns for easier comparisons
df_cu <- df_cu_raw[, c(1, 2, 6, 3, 7, 10, 4, 8, 11, 5, 9, 12)]
# Review the data
datatable(df_cu, caption = "Canada Labour Force Characteristics")
# Select only unemployment rate columns and province
df_cu_unemploy <- df_cu %>% select(province, feb_2020_unemployment_rate, feb_2021_unemployment_rate)
# Rename columns to improve chart formatting
df_cu_unemploy <- df_cu_unemploy %>% rename("2020" = feb_2020_unemployment_rate,
"2021" = feb_2021_unemployment_rate)
# Convert to long format
df_cu_unemploy <- df_cu_unemploy %>% pivot_longer(c("2020", "2021"),
names_to = "year", values_to = "unemployment_rate")
# YoY unemployment rate by province
ggplot(df_cu_unemploy, aes(province, unemployment_rate)) +
geom_bar(aes(fill = year), position = "dodge", stat="identity") +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("YoY Unemployment by Province") +
ylab("Unemployment Rate") + xlab("Province")
# Select only participation rate columns and province
df_cu_part <- df_cu %>% select(province, feb_2020_participation_rate, feb_2021_participation_rate)
# Rename columns to improve chart formatting
df_cu_part <- df_cu_part %>% rename("2020" = feb_2020_participation_rate,
"2021" = feb_2021_participation_rate)
# Convert to long format
df_cu_part <- df_cu_part %>% pivot_longer(c("2020", "2021"),
names_to = "year", values_to = "participation_rate")
# YoY Participation rate by province
ggplot(df_cu_part, aes(province, participation_rate)) +
geom_bar(aes(fill = year), position = "dodge", stat="identity") +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("YoY Participation by Province") +
ylab("Participation Rate") + xlab("Province")
Due to the relatively labor-intensive data cleaning and transformation process, the data analysis portion of this dataset was cursory. Though attention to detail was put into considering what columns to include in the final table view and how to order them. The data visualizations provide a clear year over year picture of how crucial labour force characteristics have changed by province. Recommendations for further analysis include focusing on provinces with both the largest and smallest unemployment rate changes year over year then research factors for possible causes. A similar analysis should be conducted for participation rate as well.
Statistics Canada (StatCan) "Labour force characteristics by province, monthly, seasonally adjusted"
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.