# Visualization - VHLSS
# Topic: Household Income Inequality by Province
# Year: 2022
# Reference: Nguyen Chi Dung
# Set the working directory and load the household data (ho3) from VHLSS 2022:
# Clear R environment:
# NOTE(review): this removes every object in the calling workspace; kept so
# the script keeps starting from a clean slate as before.
rm(list = ls())

# Folder that holds the VHLSS 2022 project files. Defined once so the working
# directory and the data import cannot drift apart.
data_dir <- "D:/0 - My documents/TOOLS/R/Household Income Inequality/HH Income Inequality_2022"

# Set working directory
setwd(data_dir)

# Pacman: Load necessary packages
library("pacman")
pacman::p_load(
  rio,
  dplyr,
  summarytools,
  janitor,
  stringi,
  stringr
)

# Load data
ho3 <- import(file.path(data_dir, "SL_ThongTinHo.dta"))
View(ho3)

# Write function extract_description to describe data:
#' Extract variable descriptions (labels) from a labelled data frame
#'
#' Reads the "label" attribute of every column and returns it next to the
#' column name, transliterated to plain ASCII with the "Latin-ASCII" rule.
#'
#' @param df_selected A data frame whose columns may carry a "label"
#'   attribute.
#' @return A data.frame with one row per column of `df_selected` and two
#'   character columns: `var_name` (column name) and `description`
#'   (ASCII label). Columns without a "label" attribute get `NA`.
extract_description <- function(df_selected) {
  labels <- vapply(
    df_selected,
    function(x) {
      # exact = TRUE: never fall back to a partially matching attribute such
      # as "labels" when the column has no "label" attribute of its own.
      label <- attr(x, "label", exact = TRUE)
      if (is.null(label)) NA_character_ else as.character(label)[1L]
    },
    character(1),
    USE.NAMES = FALSE
  )

  data.frame(
    var_name = names(df_selected),
    description = stringi::stri_trans_general(labels, "Latin-ASCII"),
    stringsAsFactors = FALSE
  )
}
# Description for data
extract_description(ho3)

# Write function add_zero to standardize province codes:
#' Left-pad codes with zeros to a common width
#'
#' Converts `x` to character and prefixes each value with as many "0"
#' characters as needed to reach the width of the longest value in `x`.
#'
#' @param x A vector of codes (anything `as.character()` accepts).
#' @return A character vector the same length as `x`. Missing values stay
#'   `NA` and are ignored when the target width is determined, so a single
#'   missing code no longer turns every result into `NA`.
add_zero <- function(x) {
  x_text <- as.character(x)
  is_missing <- is.na(x_text)

  # Nothing to pad: empty input or only missing values.
  if (all(is_missing)) {
    return(x_text)
  }

  n_chars <- nchar(x_text[!is_missing], type = "chars")
  n_pad <- max(n_chars) - n_chars

  full_code <- x_text
  full_code[!is_missing] <- paste0(strrep("0", n_pad), x_text[!is_missing])
  full_code
}
# Add the zero-padded province code to the household table
ho3 <- mutate(ho3, tinh_n = add_zero(matinh))

# Province lookup table: one row per distinct (matinh, tentinh) pair
df_province <- distinct(select(ho3, matinh, tentinh))
View(df_province)

# Clean the column names, then rename matinh (old) to province_code (new)
df_province <- rename(
  janitor::clean_names(df_province),
  province_code = matinh
)
names(df_province)
View(df_province)

# Add the Vietnamese name, an ASCII "English" name without the
# "Tinh " / "Thanh pho " text, and the zero-padded province code
df_province <- df_province %>%
  mutate(
    province_vie = tentinh,
    province_eng = province_vie %>%
      stri_trans_general("Latin-ASCII") %>%
      str_replace_all("Tinh |Thanh pho ", "") %>%
      str_replace_all(" - ", "-"),
    province_code = add_zero(province_code)
  )
View(df_province)
# Attach the province lookup columns to the household rows:
# the padded code tinh_n in ho3 is matched to province_code in df_province
ho3 <- full_join(ho3, df_province, by = c("tinh_n" = "province_code"))
View(ho3)
# Calculate income (thunhap)
# Step 1: total income per household (idho) within each province, using only
# rows with income > 0 and excluding the "test chuong trinh" province entry.
df_thu_hh <- ho3 %>%
  filter(thunhap > 0) %>%
  filter(province_eng != "test chuong trinh") %>%
  group_by(idho, province_eng) %>%
  summarise(total_thunhap = sum(thunhap, na.rm = TRUE), .groups = "drop")

# Step 2: per-province mean and quartiles of household income, divided by
# 1000 and rounded to one decimal. Provinces are ordered by median (th50) and
# that order is frozen as the factor levels so plots keep it.
df_thunhap_hh <- df_thu_hh %>%
  filter(total_thunhap > 0) %>%
  group_by(province_eng) %>%
  summarise(
    avg_income = mean(total_thunhap),
    th25 = quantile(total_thunhap, 0.25),
    th50 = quantile(total_thunhap, 0.50),
    th75 = quantile(total_thunhap, 0.75),
    .groups = "drop"
  ) %>%
  mutate(across(where(is.numeric), function(x) round(x / 1000, 1))) %>%
  arrange(th50) %>%
  mutate(province_eng = factor(province_eng, province_eng))

# Inspect the summary table built above (the script defines no
# df_thunhap_dumb, so viewing that name stopped with "object not found").
View(df_thunhap_hh)
#----------------------------------------------------------------------------------------------------------------------
# Data Visualization
# Ref: https://www.economist.com/united-states/2019/06/29/will-transparent-pricing-make-americas-health-care-cheaper
# https://www.stata.com/meeting/switzerland20/slides/Switzerland20_Gamma.pdf
#----------------------------------------------------------------------------------------------------------------------
# Load some R packages for Data Visualization:
library(ggeconodist) # install.packages("ggeconodist", repos = "https://cinc.rud.is")
library(ggplot2)
library(showtext)
# Select the Roboto Condensed font and let showtext render text on all
# graphics devices. showtext_auto() is the current name of showtext.auto().
showtext_auto()
my_font <- "Roboto Condensed"
# Registers the font under the same family name used in the plot themes
font_add_google(name = my_font, family = my_font)
# Econodist chart: one row per province, bar from th25 to th75 with the
# median (th50) marked in firebrick; axes are flipped so provinces run
# down the side and the value axis is drawn on the "right" position.
graph_thunhap_hh <-
  ggplot(df_thunhap_hh, aes(x = province_eng)) +
  geom_econodist(
    aes(ymin = th25, median = th50, ymax = th75),
    stat = "identity",
    median_col = "firebrick",
    median_point_size = 1,
    show.legend = TRUE
  ) +
  coord_flip() +
  theme_econodist() +
  scale_y_continuous(
    position = "right",
    limits = c(0, 400),
    breaks = seq(0, 400, 50),
    expand = c(0, 0)
  ) +
  labs(
    title = "Gaps in Household Income (millions VND) by Province, 2022",
    caption = "N = 48.192| Data Source: VHLSS 2022"
  ) +
  # All theme tweaks in one call; every text element uses my_font
  theme(
    plot.margin = unit(c(0.7, 1, 0.5, 0.5), "cm"),
    axis.title.y = element_blank(),
    axis.text.y = element_text(family = my_font, size = 9),
    axis.text.x = element_text(family = my_font, size = 10),
    plot.caption = element_text(family = my_font, size = 8, hjust = 1),
    plot.title = element_text(
      family = my_font, size = 18, face = "bold", color = "grey10"
    )
  )

# Print the plot
graph_thunhap_hh
# Draw the chart on a fresh page with left-aligned title/caption and the
# econodist legend placed below the title.
# grid functions are namespace-qualified because no library() call visible in
# this script attaches grid; the calls then work whether or not another
# package attaches it.
grid::grid.newpage()
graph_thunhap_hh %>%
  left_align(c("title", "caption")) %>%
  add_econodist_legend(
    # Legend labels name the 25th/75th percentiles and the median
    econodist_legend_grob(
      tenth_lab = "25th Percentile",
      ninetieth_lab = "75th Percentile",
      med_lab = "Median",
      med_col = "firebrick",
      family = my_font,
      label_size = 10.5
    ),
    below = "title"
  ) %>%
  grid::grid.draw()