In this Project, we are going to use the experimental estimates of household income well-being during the COVID-19 pandemic as released by Statistics Canada. The data tables from Statistics Canada contain information about various income and savings aspects of the population. The population is divided into quintiles as defined below:
A quintile is a statistical value of a data set that represents 20% of a given population, so the first quintile represents the lowest fifth of the data (1% to 20%); the second quintile represents the second fifth (21% to 40%) and so on
We can extract the data from the Statistics Canada page using the datapasta package
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## here() starts at C:/Users/Owner/Desktop/Sem1/DAB501
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(tidytext)
library(ggrepel)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(RedditExtractoR)
Loading the income group dataset below, using the datapasta
addin in RStudio.
income_data <- data.frame(
stringsAsFactors = FALSE,
check.names = FALSE,
Row_Names = c(NA,"Household disposable income","Lowest income quintile",
"Second income quintile","Third income quintile",
"Fourth income quintile","Highest income quintile","Compensation of employees",
"Lowest income quintile","Second income quintile","Third income quintile",
"Fourth income quintile","Highest income quintile","Net mixed income",
"Lowest income quintile","Second income quintile",
"Third income quintile","Fourth income quintile","Highest income quintile",
"Current transfers received","Lowest income quintile","Second income quintile",
"Third income quintile","Fourth income quintile",
"Highest income quintile","Household final consumption expenditure",
"Lowest income quintile","Second income quintile","Third income quintile",
"Fourth income quintile","Highest income quintile"),
`2019` = c("dollars","81,351","25,430","52,011","71,153",
"95,200","162,963","74,834","10,980","33,699","62,253",
"100,049","167,190","13,081","4,509","9,204","11,613","15,021",
"25,057","22,348","19,106","24,061","24,142","22,101","22,331",
"83,043","52,733","63,448","76,738","92,381","129,915"),
Quarter.1.2020 = c("dollars","19,958","6,129",
"12,847","17,648","23,188","39,979","18,646","2,644",
"8,228","15,418","24,870","42,069","3,112","1,026","2,164",
"2,732","3,582","6,056","6,083","4,969","6,449",
"6,611","5,904","6,480","19,388","15,050","15,460",
"18,285","21,124","27,023"),
Quarter.2.2020 = c("dollars","22,755","8,187",
"15,919","20,655","26,178","42,837","17,303","2,338",
"7,366","14,059","23,101","39,652","2,918","768","2,022",
"2,494","3,437","5,869","9,587","7,234","10,102",
"10,646","10,015","9,937","17,903","13,932","14,580",
"17,095","19,531","24,377"),
Quarter.3.2020 = c("dollars","23,576","8,386",
"15,798","21,109","27,392","45,198","18,555","2,596",
"8,167","15,148","24,869","41,994","3,924","1,646","2,805",
"3,636","4,419","7,112","7,759","6,252","8,182",
"8,355","7,686","8,323","20,488","15,660","16,415",
"19,484","22,436","28,446"),
`Q1.2020/2019` = c("% difference","-1.9","-3.6",
"-1.2","-0.8","-2.6","-1.9","-0.3","-3.7","-2.3","-0.9",
"-0.6","0.6","-4.8","-9.0","-6.0","-5.9","-4.6",
"-3.3","8.9","4.0","7.2","9.5","6.9","16.1","-6.6",
"14.2","-2.5","-4.7","-8.5","-16.8"),
`Q2.2020/Q1.2020` = c("% difference","14.0",
"33.6","23.9","17.0","12.9","7.1","-7.2","-11.6",
"-10.5","-8.8","-7.1","-5.7","-6.2","-25.1","-6.6",
"-8.7","-4.0","-3.1","57.6","45.6","56.6","61.0",
"69.6","53.3","-7.7","-7.4","-5.7","-6.5","-7.5",
"-9.8"),
`Q3.2020/Q2.2020` = c("% difference","3.6","2.4",
"-0.8","2.2","4.6","5.5","7.2","11.0","10.9",
"7.7","7.7","5.9","34.5","114.3","38.7","45.8",
"28.6","21.2","-19.1","-13.6","-19.0","-21.5","-23.3",
"-16.2","14.4","12.4","12.6","14.0","14.9","16.7")
) %>%
slice(-1) %>%
select(-c(6:8))
income_data <- income_data %>%
mutate(categories = case_when(
Row_Names %in% c("Lowest income quintile", "Second income quintile",
"Third income quintile", "Fourth income quintile",
"Highest income quintile") ~ NA_character_,
TRUE ~ Row_Names
)) %>%
fill(categories, .direction = "down")%>%
filter(Row_Names %in% c("Lowest income quintile", "Second income quintile",
"Third income quintile", "Fourth income quintile",
"Highest income quintile")) %>%
relocate(categories) %>%
clean_names()
income_data <- income_data %>%
mutate(across(.cols = c(x2019, starts_with("quarter")), .fns = parse_number))
income_data <- income_data %>%
rename(`2019` = x2019) %>%
pivot_longer(
cols = c(`2019`, quarter_1_2020, quarter_2_2020, quarter_3_2020),
names_to = "quarter",
values_to = "dollars"
)
income_data <- income_data %>%
pivot_wider(names_from = categories,
values_from = dollars) %>%
clean_names() %>%
relocate(quarter)
income_data <- income_data %>%
separate(col = quarter,
into = c("year", "quarter"), sep = "_")
## Warning: Expected 2 pieces. Additional pieces discarded in 15 rows [2, 3, 4, 6,
## 7, 8, 10, 11, 12, 14, 15, 16, 18, 19, 20].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 5 rows [1, 5, 9,
## 13, 17].
income_data <- income_data %>%
mutate(year = case_when(
year == "quarter" ~ 2020,
TRUE ~ as.numeric(year)
)) %>%
rename(quintile = row_names) %>%
mutate(quintile = factor(quintile,
levels = c("Lowest income quintile", "Second income quintile",
"Third income quintile", "Fourth income quintile",
"Highest income quintile")))
## Warning in eval_tidy(pair$rhs, env = default_env): NAs introduced by coercion
income_data <- income_data %>%
mutate(q_year = case_when(
!is.na(quarter) ~ paste0("Q ", quarter, " ", year),
is.na(quarter) ~ as.character(year)
)) %>%
relocate(q_year)
income_data %>% kbl() %>% kable_paper("hover", full_width = F)
q_year | year | quarter | quintile | household_disposable_income | compensation_of_employees | net_mixed_income | current_transfers_received | household_final_consumption_expenditure |
---|---|---|---|---|---|---|---|---|
2019 | 2019 | NA | Lowest income quintile | 25430 | 10980 | 4509 | 19106 | 52733 |
Q 1 2020 | 2020 | 1 | Lowest income quintile | 6129 | 2644 | 1026 | 4969 | 15050 |
Q 2 2020 | 2020 | 2 | Lowest income quintile | 8187 | 2338 | 768 | 7234 | 13932 |
Q 3 2020 | 2020 | 3 | Lowest income quintile | 8386 | 2596 | 1646 | 6252 | 15660 |
2019 | 2019 | NA | Second income quintile | 52011 | 33699 | 9204 | 24061 | 63448 |
Q 1 2020 | 2020 | 1 | Second income quintile | 12847 | 8228 | 2164 | 6449 | 15460 |
Q 2 2020 | 2020 | 2 | Second income quintile | 15919 | 7366 | 2022 | 10102 | 14580 |
Q 3 2020 | 2020 | 3 | Second income quintile | 15798 | 8167 | 2805 | 8182 | 16415 |
2019 | 2019 | NA | Third income quintile | 71153 | 62253 | 11613 | 24142 | 76738 |
Q 1 2020 | 2020 | 1 | Third income quintile | 17648 | 15418 | 2732 | 6611 | 18285 |
Q 2 2020 | 2020 | 2 | Third income quintile | 20655 | 14059 | 2494 | 10646 | 17095 |
Q 3 2020 | 2020 | 3 | Third income quintile | 21109 | 15148 | 3636 | 8355 | 19484 |
2019 | 2019 | NA | Fourth income quintile | 95200 | 100049 | 15021 | 22101 | 92381 |
Q 1 2020 | 2020 | 1 | Fourth income quintile | 23188 | 24870 | 3582 | 5904 | 21124 |
Q 2 2020 | 2020 | 2 | Fourth income quintile | 26178 | 23101 | 3437 | 10015 | 19531 |
Q 3 2020 | 2020 | 3 | Fourth income quintile | 27392 | 24869 | 4419 | 7686 | 22436 |
2019 | 2019 | NA | Highest income quintile | 162963 | 167190 | 25057 | 22331 | 129915 |
Q 1 2020 | 2020 | 1 | Highest income quintile | 39979 | 42069 | 6056 | 6480 | 27023 |
Q 2 2020 | 2020 | 2 | Highest income quintile | 42837 | 39652 | 5869 | 9937 | 24377 |
Q 3 2020 | 2020 | 3 | Highest income quintile | 45198 | 41994 | 7112 | 8323 | 28446 |
income_data_subset<-income_data[income_data$q_year!="2019",]
ggplot(income_data_subset,aes(x=quintile,y=household_disposable_income))+
geom_bar(aes(fill=q_year),stat="identity")+
geom_text(aes(label=household_disposable_income))+
facet_grid(. ~ q_year )+
coord_flip()+
labs(title="Household Disposable Income Distribution 2019-2020")+
theme(
legend.position = "none",
axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank())
Interpreting the plot.
Household Disposable Income is the amount of money that a household has to spend or save after income taxes have been deducted.
There was an increase in all the Income quintiles in Q2 2020 from Q1 2020.Lowest Income Quintile increased the least in comparison. The increase in the rest of the Quintiles was from two to three thousand.
For Q3 2020, Highest Income Quintile again increased by three thousand, while the rest of the quintiles also increased, but only a little. Second Income Quintile decreased by about 200.
ggplot(income_data_subset,aes(q_year,
household_final_consumption_expenditure,color=quintile,group=quintile))+
geom_point()+
geom_line()+
geom_text_repel(aes(label=household_final_consumption_expenditure))+
labs(title="Household Final Consumption Expenditure 2019-2020")+
theme(
axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank())
Interpreting the plot.
Final Consumption expenditure is the total expenses by a household. There was a decrease in it, in all the quintiles, in Q2 of 2020. The most decrease was in Highest Income Quintile.
In Q3 2020, there was an increase, making the household expenditure greater than that in Q1 of 2020, in all the Quintiles.
According to the Statistics Canada article:
Over the first three quarters of 2020, disposable income for the lowest-income households increased 36.8%, more than for any other households.
Using the income_data
and the formula for percentage change (below) determine the percent increase in the income of lowest-income quintile. This number must be 36.8% as above. $$
% =
$$
income_data_subset %>%
filter(quintile=="Lowest income quintile" & q_year %in% c("Q 1 2020","Q 3 2020")) %>%
select(q_year,quintile,household_disposable_income) %>%
mutate(
percent=((household_disposable_income -lag(household_disposable_income))/lag(household_disposable_income))*100
) %>% drop_na() %>%pull()
## [1] 36.82493
Using the same steps as above, we can get the table from this page of Statistics Canada in R. This contains the age group instead of population quintiles. You’ll need to use the datapasta
addin in RStudio to do this step.
age_data<-data.frame(
stringsAsFactors = FALSE,
check.names = FALSE,
Row_Names = c(NA,"Household disposable income","Younger than 35 years",
"35 to 44 years","45 to 54 years","55 to 64 years",
"65 years and older","Compensation of employees","Younger than 35 years",
"35 to 44 years","45 to 54 years","55 to 64 years","65 years and older",
"Net mixed income","Younger than 35 years","35 to 44 years",
"45 to 54 years","55 to 64 years","65 years and older",
"Current transfers received","Younger than 35 years","35 to 44 years","45 to 54 years",
"55 to 64 years","65 years and older",
"Household final consumption expenditure","Younger than 35 years","35 to 44 years","45 to 54 years",
"55 to 64 years","65 years and older"),
`2019` = c("dollars","81,351","75,337","93,641","104,868",
"87,365","60,193","74,834","85,308","109,778","122,610",
"85,218","11,887","13,081","9,744","14,820","16,664","14,903",
"10,957","22,348","11,584","14,307","13,097","19,075","41,935",
"83,043","75,494","92,286","107,072","90,092","63,789"),
Quarter.1.2020 = c("dollars","19,958","18,259",
"22,859","25,456","21,283","15,243","18,646","20,990",
"27,546","30,663","21,227","2,957","3,112","2,245",
"3,514","3,920","3,528","2,700","6,083","3,475","4,006",
"3,807","5,279","10,934","19,388","17,546","21,532",
"25,094","21,210","14,785"),
Quarter.2.2020 = c("dollars","22,755","21,934",
"25,728","29,400","24,251","16,630","17,303","19,340",
"25,369","28,392","20,088","2,739","2,918","2,141",
"3,324","3,710","3,085","2,615","9,587","8,169","8,246",
"9,119","8,948","12,033","17,903","16,089","19,898",
"23,187","19,714","13,625"),
Quarter.3.2020 = c("dollars","23,576","21,953",
"26,781","30,453","25,721","17,286","18,555","20,993",
"26,937","30,486","21,424","2,983","3,924","2,773",
"4,384","4,855","4,778","3,308","7,759","5,362","5,839",
"6,162","6,928","11,997","20,488","18,674","22,933",
"26,516","22,421","15,422"),
`Q1.2020/2019` = c("% difference","-1.9","-3.1",
"-2.4","-2.9","-2.6","1.3","-0.3","-1.6","0.4","0.0",
"-0.4","-0.5","-4.8","-7.8","-5.2","-5.9","-5.3",
"-1.4","8.9","20.0","12.0","16.3","10.7","4.3","-6.6",
"-7.0","-6.7","-6.3","-5.8","-7.3"),
`Q2.2020/Q1.2020` = c("% difference","14.0",
"20.1","12.6","15.5","13.9","9.1","-7.2","-7.9","-7.9",
"-7.4","-5.4","-7.4","-6.2","-4.6","-5.4","-5.4",
"-12.6","-3.1","57.6","135.1","105.8","139.5",
"69.5","10.1","-7.7","-8.3","-7.6","-7.6","-7.1",
"-7.8"),
`Q3.2020/Q2.2020` = c("% difference","3.6","0.1",
"4.1","3.6","6.1","3.9","7.2","8.5","6.2","7.4",
"6.7","8.9","34.5","29.5","31.9","30.9","54.9",
"26.5","-19.1","-34.4","-29.2","-32.4","-22.6",
"-0.3","14.4","16.1","15.3","14.4","13.7","13.2")
)%>%
slice(-1) %>%
select(-c(6:8))
age_data <- age_data %>%
mutate(categories = case_when(
Row_Names %in% c("Younger than 35 years", "35 to 44 years",
"45 to 54 years", "55 to 64 years",
"65 years and older") ~ NA_character_,
TRUE ~ Row_Names
)) %>%
fill(categories, .direction = "down")%>%
filter(Row_Names %in% c("Younger than 35 years", " 35 to 44 years",
"45 to 54 years", "55 to 64 years",
"65 years and older")) %>%
relocate(categories) %>%
clean_names()
age_data <- age_data %>%
mutate(across(.cols = c(x2019, starts_with("quarter")), .fns = parse_number))
age_data <- age_data %>%
rename(`2019` = x2019) %>%
pivot_longer(
cols = c(`2019`, quarter_1_2020, quarter_2_2020, quarter_3_2020),
names_to = "quarter",
values_to = "dollars"
)
age_data <- age_data %>%
pivot_wider(names_from = categories,
values_from = dollars) %>%
clean_names() %>%
relocate(quarter)
age_data <- age_data %>%
separate(col = quarter,
into = c("year", "quarter"), sep = "_")
## Warning: Expected 2 pieces. Additional pieces discarded in 12 rows [2, 3, 4, 6,
## 7, 8, 10, 11, 12, 14, 15, 16].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [1, 5, 9,
## 13].
age_data <- age_data %>%
mutate(year = case_when(
year == "quarter" ~ 2020,
TRUE ~ as.numeric(year)
)) %>%
rename(age_group = row_names) %>%
mutate(age_group = factor(age_group,
levels = c("Younger than 35 years", " 35 to 44 years",
"45 to 54 years", "55 to 64 years",
"65 years and older")))
## Warning in eval_tidy(pair$rhs, env = default_env): NAs introduced by coercion
age_data <- age_data %>%
mutate(q_year = case_when(
!is.na(quarter) ~ paste0("Q ", quarter, " ", year),
is.na(quarter) ~ as.character(year)
)) %>%
relocate(q_year)
age_data %>% kbl() %>% kable_paper("hover", full_width = F)
q_year | year | quarter | age_group | household_disposable_income | compensation_of_employees | net_mixed_income | current_transfers_received | household_final_consumption_expenditure |
---|---|---|---|---|---|---|---|---|
2019 | 2019 | NA | Younger than 35 years | 75337 | 85308 | 9744 | 11584 | 75494 |
Q 1 2020 | 2020 | 1 | Younger than 35 years | 18259 | 20990 | 2245 | 3475 | 17546 |
Q 2 2020 | 2020 | 2 | Younger than 35 years | 21934 | 19340 | 2141 | 8169 | 16089 |
Q 3 2020 | 2020 | 3 | Younger than 35 years | 21953 | 20993 | 2773 | 5362 | 18674 |
2019 | 2019 | NA | 45 to 54 years | 104868 | 122610 | 16664 | 13097 | 107072 |
Q 1 2020 | 2020 | 1 | 45 to 54 years | 25456 | 30663 | 3920 | 3807 | 25094 |
Q 2 2020 | 2020 | 2 | 45 to 54 years | 29400 | 28392 | 3710 | 9119 | 23187 |
Q 3 2020 | 2020 | 3 | 45 to 54 years | 30453 | 30486 | 4855 | 6162 | 26516 |
2019 | 2019 | NA | 55 to 64 years | 87365 | 85218 | 14903 | 19075 | 90092 |
Q 1 2020 | 2020 | 1 | 55 to 64 years | 21283 | 21227 | 3528 | 5279 | 21210 |
Q 2 2020 | 2020 | 2 | 55 to 64 years | 24251 | 20088 | 3085 | 8948 | 19714 |
Q 3 2020 | 2020 | 3 | 55 to 64 years | 25721 | 21424 | 4778 | 6928 | 22421 |
2019 | 2019 | NA | 65 years and older | 60193 | 11887 | 10957 | 41935 | 63789 |
Q 1 2020 | 2020 | 1 | 65 years and older | 15243 | 2957 | 2700 | 10934 | 14785 |
Q 2 2020 | 2020 | 2 | 65 years and older | 16630 | 2739 | 2615 | 12033 | 13625 |
Q 3 2020 | 2020 | 3 | 65 years and older | 17286 | 2983 | 3308 | 11997 | 15422 |
income_data
income_data %>%
filter(year == 2020) %>%
select(q_year:household_disposable_income) %>%
group_by(quintile) %>%
mutate(change_in_income = c(NA, diff(household_disposable_income)),
percent_change_in_income = ((change_in_income/lag(household_disposable_income))*100) %>% round(1)) %>%
ungroup() %>%
filter(q_year != "Q 1 2020") %>%
ggplot(aes(x = quintile, y = percent_change_in_income, fill = q_year)) +
geom_col(position = position_dodge()) +
geom_label(aes(label = paste0(percent_change_in_income, "%"))) +
coord_flip() +
labs(x=NULL,
y=NULL)
Lowest Income quintile had the largest increase in Q 2 in 2020 of about 33.6 %.
Interpreting the plot in context of Universal Basic Income.
The graph shows the increases in household disposable incomes in the second and third quarters of 2020.The percent change was the highest for the lowest income households and the least for high income households in the Quarter 2 of 2020, which was the middle of the pandemic. It shows the government of Canada helped out the households with low income levels. It means that the income that low income households get after tax has been deducted substantially increased
The reddit thread you want to get data from is here.
library(RedditExtractoR)
rcoms <- reddit_content(URL = "https://old.reddit.com/r/canada/comments/l6lubc/3_in_5_canadians_support_universal_basic_income/")
##
|
| | 0%
|
|======================================================================| 100%
We are selecting a few interesting columns and then keeping the top 10 comments by the comment_score
.
#rcoms <- rcoms %>%
# select(id, structure, comm_date, comment_score,
# controversiality, comment) %>%
# as_tibble() %>%
# mutate(num_of_underscores = str_count(structure, "_")) %>%
# filter(num_of_underscores == 0) %>%
# slice_max(order_by = comment_score, n = 30)
#rcoms %>% head() %>% kbl() %>% kable_paper("hover", full_width = F)
Commenting this chunk to get more rows for better text analysis.
Breaking the comment paragraphs into individual words and removing common words aka stopwords
library(tidytext)
rcoms_tokens <- rcoms %>%
unnest_tokens(word, comment) %>%
anti_join(stop_words)
## Joining, by = "word"
rcoms_tokens %>%
head() %>%
kbl() %>%
kable_paper("hover", full_width = F)
We are removing the profane words by filtering badwords
(obtained online). Also, adding the positive and negative sentiments in a new column as expressed in the bing
dictionary.
# Source: https://rpubs.com/Nikotino/58395
badwords <- readLines("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt")
rcoms_tokens <- rcoms_tokens %>%
filter(!word %in% badwords) %>%
inner_join(get_sentiments("bing"))
## Joining, by = "word"
rcoms_tokens %>%
head() %>%
kbl() %>%
kable_paper("hover", full_width = F)
library(ggwordcloud)
rcoms_tokens %>%
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(label=word,size=n, color = sentiment)) +
geom_text_wordcloud()+
scale_size_area(max_size = 20)+
facet_grid(.~sentiment)+
theme_minimal()
## Selecting by n
As per the analysis, there is a substantial difference between household income and the expenditure of low income families, as opposed to high income families and other quinitiles. While the difference exist for almost all the quintiles for the pandemic year 2020. Due to this, Yes, I think there should be a Universal Basic Income in Canada because even with the government’s help the gap is there,and as also seen by the text analysis,people made more comments that have positive sentiments towards a Basic Income than negative sentiments.