Plot dưới đây được tạo từ bộ dữ liệu VHLSS 2020:
Biểu đồ này cho thấy thu nhập trung vị (điểm màu đỏ) của các hộ gia đình ở TP. HCM là gần gấp đôi thu nhập trung vị của hộ gia đình ở Hà Nội.
Dưới đây là R codes của plot ở trên:
# Clear R environment:
rm(list = ls())
# Load some R packages:
library(haven)
library(stringi)
library(stringr)
library(dplyr)
# Load data (download from https://www.mediafire.com/file/b6hm4zngz2q5tc6/VHLSS_2020.zip/file):
read_dta("E:/VHLSS 2020/VHLSS2020_Household_Data/HO3.dta") -> ho3
# Function extracts variable description:
<- function(df_selected) {
extract_description
sapply(df_selected, function(x) {attributes(x) %>% .$label}) %>%
data.frame() %>%
mutate(description = stri_trans_general(`.`, "Latin-ASCII")) -> df_des
%>%
df_des mutate(var_name = row.names(df_des)) %>%
select(var_name, description) -> df_des
row.names(df_des) <- NULL
return(df_des)
}
# Description for data:
extract_description(ho3)
# Function creates full code by adding zeros:
<- function(x) {
add_zero
tibble(x_text = as.character(x)) %>%
mutate(n_digits = str_count(x_text),
n_max = max(n_digits),
delta = n_max - n_digits,
pre = strrep("0", times = delta),
full_code = str_c(pre, x_text)) %>%
pull(full_code) %>%
return()
}
# Use the function:
%>% mutate(tinh_n = add_zero(tinh)) -> ho3
ho3
#-----------------------------
# Prepare data for ploting
#-----------------------------
# Extract province info:
%>%
ho3 pull(tinh) %>%
attributes() %>%
$labels %>%
.data.frame() -> df_province
# Rename for DF:
names(df_province) <- "province_code"
# Create some columns and relabel for provinces:
%>%
df_province mutate(province_vie = row.names(df_province)) %>%
mutate(province_eng = stri_trans_general(province_vie, "Latin-ASCII")) %>%
mutate(province_eng = str_replace_all(province_eng, "Tinh |Thanh pho ", "")) %>%
mutate(province_eng = str_replace_all(province_eng, " - ", "-")) %>%
mutate(province_code = add_zero(province_code)) -> df_province
%>% full_join(df_province, by = c("tinh_n" = "province_code")) -> ho3
ho3
%>%
ho3 group_by(province_eng) %>%
summarise(th25 = quantile(thunhap, 0.25),
th50 = quantile(thunhap, 0.50),
th75 = quantile(thunhap, 0.75)) %>%
mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>%
ungroup() %>%
arrange(th50) %>%
mutate(province_eng = factor(province_eng, province_eng)) -> df_thunhap
#----------------------------------------------------------------------------------------------------------------------
# Data Visualization
# Ref: https://www.economist.com/united-states/2019/06/29/will-transparent-pricing-make-americas-health-care-cheaper
# https://www.stata.com/meeting/switzerland20/slides/Switzerland20_Gamma.pdf
#----------------------------------------------------------------------------------------------------------------------
# Load some R packages for Data Visualization:
library(ggeconodist) # install.packages("ggeconodist", repos = "https://cinc.rud.is")
library(ggplot2)
library(showtext)
# Select Ubuntu Condensed font:
showtext.auto()
<- "Roboto Condensed"
my_font
font_add_google(name = my_font, family = my_font)
%>%
df_thunhap ggplot(aes(x = province_eng)) +
geom_econodist(aes(ymin = th25, median = th50, ymax = th75),
median_col = "firebrick",
stat = "identity",
median_point_size = 1.3,
show.legend = TRUE) +
coord_flip() +
theme_econodist() +
scale_y_continuous(expand = c(0, 0), limits = c(0, 450), breaks = seq(0, 450, 50), position = "right") +
labs(title = "Household Income Inequality by Province, 2020",
caption = "Data Source: VHLSS 2020 by GSO|Graphic Designer: Nguyen Chi Dung") +
theme(plot.margin = unit(c(0.7, 1, 0.5, 0.5), "cm")) +
theme(axis.title.y = element_blank()) +
theme(axis.text.y = element_text(family = my_font, size = 9)) +
theme(axis.text.x = element_text(family = my_font, size = 10)) +
theme(plot.caption = element_text(family = my_font, size = 8, hjust = 1)) +
theme(plot.title = element_text(family = my_font, size = 18, face = "bold", color = "grey10")) -> f1
grid.newpage()
%>%
f1 left_align(c("title", "caption")) %>%
add_econodist_legend(
econodist_legend_grob(
tenth_lab = "25th Percentile",
ninetieth_lab = "75th Percentile",
med_lab = "Median",
med_col = "firebrick",
family = my_font,
label_size = 10.5,
), below = "title"
%>%
) grid.draw()