Motivations

Plot dưới đây được tạo từ bộ dữ liệu VHLSS 2020:

Biểu đồ này cho thấy thu nhập trung vị (điểm màu đỏ) của các hộ gia đình ở TP. HCM là gần gấp đôi thu nhập trung vị của hộ gia đình ở Hà Nội.

R codes

Dưới đây là R codes của plot ở trên:

# Clear R environment: 
rm(list = ls())

# Load some R packages: 

library(haven)
library(stringi)
library(stringr)
library(dplyr)

# Load data (download from https://www.mediafire.com/file/b6hm4zngz2q5tc6/VHLSS_2020.zip/file): 
read_dta("E:/VHLSS 2020/VHLSS2020_Household_Data/HO3.dta") -> ho3

# Function extracts variable description: 

extract_description <- function(df_selected) {
  
  sapply(df_selected, function(x) {attributes(x) %>% .$label}) %>% 
    data.frame() %>% 
    mutate(description = stri_trans_general(`.`, "Latin-ASCII")) -> df_des
  
  df_des %>% 
    mutate(var_name = row.names(df_des)) %>% 
    select(var_name, description) -> df_des
  
  row.names(df_des) <- NULL
  
  return(df_des)
  
}

# Description for data: 
extract_description(ho3)

# Function creates full code by adding zeros: 

add_zero <- function(x) {
  
  tibble(x_text = as.character(x)) %>% 
    mutate(n_digits = str_count(x_text),
           n_max = max(n_digits), 
           delta = n_max - n_digits, 
           pre = strrep("0", times = delta), 
           full_code = str_c(pre, x_text)) %>% 
    pull(full_code) %>% 
    return()
}

# Use the function: 

ho3 %>% mutate(tinh_n = add_zero(tinh)) -> ho3


#-----------------------------
#  Prepare data for ploting
#-----------------------------

# Extract province info:

ho3 %>% 
  pull(tinh) %>% 
  attributes() %>% 
  .$labels %>% 
  data.frame() -> df_province

# Rename for DF: 
names(df_province) <- "province_code"

# Create some columns and relabel for provinces: 

df_province %>% 
  mutate(province_vie = row.names(df_province)) %>% 
  mutate(province_eng = stri_trans_general(province_vie, "Latin-ASCII")) %>% 
  mutate(province_eng = str_replace_all(province_eng, "Tinh |Thanh pho ", "")) %>% 
  mutate(province_eng = str_replace_all(province_eng, " - ", "-")) %>% 
  mutate(province_code = add_zero(province_code)) -> df_province


ho3 %>% full_join(df_province, by = c("tinh_n" = "province_code")) -> ho3


ho3 %>% 
  group_by(province_eng) %>% 
  summarise(th25 = quantile(thunhap, 0.25), 
            th50 = quantile(thunhap, 0.50), 
            th75 = quantile(thunhap, 0.75)) %>% 
  mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>% 
  ungroup() %>% 
  arrange(th50) %>% 
  mutate(province_eng = factor(province_eng, province_eng)) -> df_thunhap 


#----------------------------------------------------------------------------------------------------------------------
#                            Data Visualization
# Ref: https://www.economist.com/united-states/2019/06/29/will-transparent-pricing-make-americas-health-care-cheaper
#      https://www.stata.com/meeting/switzerland20/slides/Switzerland20_Gamma.pdf
#----------------------------------------------------------------------------------------------------------------------

# Load some R packages for Data Visualization: 

library(ggeconodist) # install.packages("ggeconodist", repos = "https://cinc.rud.is")
library(ggplot2)
library(showtext)

# Select Ubuntu Condensed font: 
showtext.auto()

my_font <- "Roboto Condensed"

font_add_google(name = my_font, family = my_font)


df_thunhap %>% 
  ggplot(aes(x = province_eng)) + 
  geom_econodist(aes(ymin = th25, median = th50, ymax = th75), 
                 median_col = "firebrick", 
                 stat = "identity", 
                 median_point_size = 1.3, 
                 show.legend = TRUE) + 
  coord_flip() +
  theme_econodist() + 
  scale_y_continuous(expand = c(0, 0), limits = c(0, 450), breaks = seq(0, 450, 50), position = "right") + 
  labs(title = "Household Income Inequality by Province, 2020", 
       caption = "Data Source: VHLSS 2020 by GSO|Graphic Designer: Nguyen Chi Dung") +  
  theme(plot.margin = unit(c(0.7, 1, 0.5, 0.5), "cm")) + 
  theme(axis.title.y = element_blank()) + 
  theme(axis.text.y = element_text(family = my_font, size = 9)) + 
  theme(axis.text.x = element_text(family = my_font, size = 10)) + 
  theme(plot.caption = element_text(family = my_font, size = 8, hjust = 1)) + 
  theme(plot.title = element_text(family = my_font, size = 18, face = "bold", color = "grey10")) -> f1

grid.newpage()

f1 %>% 
  left_align(c("title", "caption")) %>% 
  add_econodist_legend(
    econodist_legend_grob(
      tenth_lab = "25th Percentile", 
      ninetieth_lab = "75th Percentile", 
      med_lab = "Median", 
      med_col = "firebrick", 
      family = my_font, 
      label_size = 10.5,
    ), 
    below = "title"
  ) %>% 
  grid.draw()
LS0tDQp0aXRsZTogJ0RhdGEgVmlzdWFsaXphdGlvbiBmcm9tIFZITFNTIDIwMjA6IEhvdXNlaG9sZCBJbmNvbWUgSW5lcXVhbGl0eSBieSBQcm92aW5jZScNCmF1dGhvcjogJ0F1dGhvcjogTmd1eWVuIENoaSBEdW5nJw0Kc3VidGl0bGU6ICJEYWlseSBHcmFwaCBTZXJpZXMiDQpvdXRwdXQ6DQogIGh0bWxfZG9jdW1lbnQ6IA0KICAgIGNvZGVfZG93bmxvYWQ6IHRydWUNCiAgICAjIGNvZGVfZm9sZGluZzogaGlkZQ0KICAgIGhpZ2hsaWdodDogemVuYnVybg0KICAgICMgbnVtYmVyX3NlY3Rpb25zOiB5ZXMNCiAgICB0aGVtZTogImZsYXRseSINCiAgICB0b2M6IFRSVUUNCiAgICB0b2NfZmxvYXQ6IFRSVUUNCi0tLQ0KDQpgYGB7ciBzZXR1cCxpbmNsdWRlPUZBTFNFfQ0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSwgY2FjaGUgPSBUUlVFLCBmaWcuc2hvd3RleHQgPSBUUlVFKQ0KDQpgYGANCg0KIyBNb3RpdmF0aW9ucw0KDQpQbG90IGTGsOG7m2kgxJHDonkgxJHGsOG7o2MgdOG6oW8gdOG7qyBi4buZIGThu68gbGnhu4d1IFtWSExTUyAyMDIwXShodHRwczovL3d3dy5uaWVuZ2lhbXRob25na2UubmV0L2toJUUxJUJBJUEzby1zJUMzJUExdC92aGxzcy0yMDIwKTogDQoNCiFbXShDOlxcVXNlcnNcXEFkbWluXFxEb2N1bWVudHNcXHBpY18xMjA3LmpwZykNCg0KDQpCaeG7g3UgxJHhu5MgbsOgeSBjaG8gdGjhuqV5IHRodSBuaOG6rXAgdHJ1bmcgduG7iyAoxJFp4buDbSBtw6B1IMSR4buPKSBj4bunYSBjw6FjIGjhu5kgZ2lhIMSRw6xuaCDhu58gVFAuIEhDTSBsw6AgZ+G6p24gZ+G6pXAgxJHDtGkgdGh1IG5o4bqtcCB0cnVuZyB24buLIGPhu6dhIGjhu5kgZ2lhIMSRw6xuaCDhu58gSMOgIE7hu5lpLiANCg0KIyBSIGNvZGVzDQoNCkTGsOG7m2kgxJHDonkgbMOgIFIgY29kZXMgY+G7p2EgcGxvdCDhu58gdHLDqm46IA0KDQpgYGB7ciwgZXZhbD1GQUxTRX0NCg0KIyBDbGVhciBSIGVudmlyb25tZW50OiANCnJtKGxpc3QgPSBscygpKQ0KDQojIExvYWQgc29tZSBSIHBhY2thZ2VzOiANCg0KbGlicmFyeShoYXZlbikNCmxpYnJhcnkoc3RyaW5naSkNCmxpYnJhcnkoc3RyaW5ncikNCmxpYnJhcnkoZHBseXIpDQoNCiMgTG9hZCBkYXRhIChkb3dubG9hZCBmcm9tIGh0dHBzOi8vd3d3Lm1lZGlhZmlyZS5jb20vZmlsZS9iNmhtNHpuZ3oycTV0YzYvVkhMU1NfMjAyMC56aXAvZmlsZSk6IA0KcmVhZF9kdGEoIkU6L1ZITFNTIDIwMjAvVkhMU1MyMDIwX0hvdXNlaG9sZF9EYXRhL0hPMy5kdGEiKSAtPiBobzMNCg0KIyBGdW5jdGlvbiBleHRyYWN0cyB2YXJpYWJsZSBkZXNjcmlwdGlvbjogDQoNCmV4dHJhY3RfZGVzY3JpcHRpb24gPC0gZnVuY3Rpb24oZGZfc2VsZWN0ZWQpIHsNCiAgDQogIHNhcHBseShkZl9zZWxlY3RlZCwgZnVuY3Rpb24oeCkge2F0dHJpYnV0ZXMoeCkgJT4lIC4kbGFiZWx9KSAlPiUgDQogICAgZGF0YS5mcmFtZSgpICU+JSANCiAgICBtdXRhdGUoZGVzY3JpcHRpb24gPSBzdHJpX3RyYW5zX2dlbmVyYWwoYC5gLCAiTGF0aW4tQVNDSUkiKSkgLT4gZGZfZGVzDQogIA0KICBkZl9kZXMgJT4lIA0KICAgIG11dGF0ZSh2YXJfbmFtZSA9IHJvdy5uYW1lcyhkZl9kZXMpKSAlPiUgDQogICAgc2VsZWN0KHZhcl9uYW1lLCBkZXNjcmlwdGlvbikgLT4gZGZfZGVzDQogIA0KICByb3cubmFtZXMoZGZfZGVzKSA8LSBOVUxMDQogIA0KICByZXR1cm4oZGZfZGVzKQ0KICANCn0NCg0KIyBEZXNjcmlwdGlvbiBmb3IgZGF0YTogDQpleHRyYWN0X2Rlc2NyaXB0aW9uKGhvMykNCg0KIyBGdW5jdGlvbiBjcmVhdGVzIGZ1bGwgY29kZSBieSBhZGRpbmcgemVyb3M6IA0KDQphZGRfemVybyA8LSBmdW5jdGlvbih4KSB7DQogIA0KICB0aWJibGUoeF90ZXh0ID0gYXMuY2hhcmFjdGVyKHgpKSAlPiUgDQogICAgbXV0YXRlKG5fZGlnaXRzID0gc3RyX2NvdW50KHhfdGV4dCksDQogICAgICAgICAgIG5fbWF4ID0gbWF4KG5fZGlnaXRzKSwgDQogICAgICAgICAgIGRlbHRhID0gbl9tYXggLSBuX2RpZ2l0cywgDQogICAgICAgICAgIHByZSA9IHN0cnJlcCgiMCIsIHRpbWVzID0gZGVsdGEpLCANCiAgICAgICAgICAgZnVsbF9jb2RlID0gc3RyX2MocHJlLCB4X3RleHQpKSAlPiUgDQogICAgcHVsbChmdWxsX2NvZGUpICU+JSANCiAgICByZXR1cm4oKQ0KfQ0KDQojIFVzZSB0aGUgZnVuY3Rpb246IA0KDQpobzMgJT4lIG11dGF0ZSh0aW5oX24gPSBhZGRfemVybyh0aW5oKSkgLT4gaG8zDQoNCg0KIy0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tDQojICBQcmVwYXJlIGRhdGEgZm9yIHBsb3RpbmcNCiMtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQ0KDQojIEV4dHJhY3QgcHJvdmluY2UgaW5mbzoNCg0KaG8zICU+JSANCiAgcHVsbCh0aW5oKSAlPiUgDQogIGF0dHJpYnV0ZXMoKSAlPiUgDQogIC4kbGFiZWxzICU+JSANCiAgZGF0YS5mcmFtZSgpIC0+IGRmX3Byb3ZpbmNlDQoNCiMgUmVuYW1lIGZvciBERjogDQpuYW1lcyhkZl9wcm92aW5jZSkgPC0gInByb3ZpbmNlX2NvZGUiDQoNCiMgQ3JlYXRlIHNvbWUgY29sdW1ucyBhbmQgcmVsYWJlbCBmb3IgcHJvdmluY2VzOiANCg0KZGZfcHJvdmluY2UgJT4lIA0KICBtdXRhdGUocHJvdmluY2VfdmllID0gcm93Lm5hbWVzKGRmX3Byb3ZpbmNlKSkgJT4lIA0KICBtdXRhdGUocHJvdmluY2VfZW5nID0gc3RyaV90cmFuc19nZW5lcmFsKHByb3ZpbmNlX3ZpZSwgIkxhdGluLUFTQ0lJIikpICU+JSANCiAgbXV0YXRlKHByb3ZpbmNlX2VuZyA9IHN0cl9yZXBsYWNlX2FsbChwcm92aW5jZV9lbmcsICJUaW5oIHxUaGFuaCBwaG8gIiwgIiIpKSAlPiUgDQogIG11dGF0ZShwcm92aW5jZV9lbmcgPSBzdHJfcmVwbGFjZV9hbGwocHJvdmluY2VfZW5nLCAiIC0gIiwgIi0iKSkgJT4lIA0KICBtdXRhdGUocHJvdmluY2VfY29kZSA9IGFkZF96ZXJvKHByb3ZpbmNlX2NvZGUpKSAtPiBkZl9wcm92aW5jZQ0KDQoNCmhvMyAlPiUgZnVsbF9qb2luKGRmX3Byb3ZpbmNlLCBieSA9IGMoInRpbmhfbiIgPSAicHJvdmluY2VfY29kZSIpKSAtPiBobzMNCg0KDQpobzMgJT4lIA0KICBncm91cF9ieShwcm92aW5jZV9lbmcpICU+JSANCiAgc3VtbWFyaXNlKHRoMjUgPSBxdWFudGlsZSh0aHVuaGFwLCAwLjI1KSwgDQogICAgICAgICAgICB0aDUwID0gcXVhbnRpbGUodGh1bmhhcCwgMC41MCksIA0KICAgICAgICAgICAgdGg3NSA9IHF1YW50aWxlKHRodW5oYXAsIDAuNzUpKSAlPiUgDQogIG11dGF0ZV9pZihpcy5udW1lcmljLCBmdW5jdGlvbih4KSB7cm91bmQoeCAvIDEwMDAsIDEpfSkgJT4lIA0KICB1bmdyb3VwKCkgJT4lIA0KICBhcnJhbmdlKHRoNTApICU+JSANCiAgbXV0YXRlKHByb3ZpbmNlX2VuZyA9IGZhY3Rvcihwcm92aW5jZV9lbmcsIHByb3ZpbmNlX2VuZykpIC0+IGRmX3RodW5oYXAgDQoNCg0KIy0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCiMgICAgICAgICAgICAgICAgICAgICAgICAgICAgRGF0YSBWaXN1YWxpemF0aW9uDQojIFJlZjogaHR0cHM6Ly93d3cuZWNvbm9taXN0LmNvbS91bml0ZWQtc3RhdGVzLzIwMTkvMDYvMjkvd2lsbC10cmFuc3BhcmVudC1wcmljaW5nLW1ha2UtYW1lcmljYXMtaGVhbHRoLWNhcmUtY2hlYXBlcg0KIyAgICAgIGh0dHBzOi8vd3d3LnN0YXRhLmNvbS9tZWV0aW5nL3N3aXR6ZXJsYW5kMjAvc2xpZGVzL1N3aXR6ZXJsYW5kMjBfR2FtbWEucGRmDQojLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQ0KDQojIExvYWQgc29tZSBSIHBhY2thZ2VzIGZvciBEYXRhIFZpc3VhbGl6YXRpb246IA0KDQpsaWJyYXJ5KGdnZWNvbm9kaXN0KSAjIGluc3RhbGwucGFja2FnZXMoImdnZWNvbm9kaXN0IiwgcmVwb3MgPSAiaHR0cHM6Ly9jaW5jLnJ1ZC5pcyIpDQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KHNob3d0ZXh0KQ0KDQojIFNlbGVjdCBVYnVudHUgQ29uZGVuc2VkIGZvbnQ6IA0Kc2hvd3RleHQuYXV0bygpDQoNCm15X2ZvbnQgPC0gIlJvYm90byBDb25kZW5zZWQiDQoNCmZvbnRfYWRkX2dvb2dsZShuYW1lID0gbXlfZm9udCwgZmFtaWx5ID0gbXlfZm9udCkNCg0KDQpkZl90aHVuaGFwICU+JSANCiAgZ2dwbG90KGFlcyh4ID0gcHJvdmluY2VfZW5nKSkgKyANCiAgZ2VvbV9lY29ub2Rpc3QoYWVzKHltaW4gPSB0aDI1LCBtZWRpYW4gPSB0aDUwLCB5bWF4ID0gdGg3NSksIA0KICAgICAgICAgICAgICAgICBtZWRpYW5fY29sID0gImZpcmVicmljayIsIA0KICAgICAgICAgICAgICAgICBzdGF0ID0gImlkZW50aXR5IiwgDQogICAgICAgICAgICAgICAgIG1lZGlhbl9wb2ludF9zaXplID0gMS4zLCANCiAgICAgICAgICAgICAgICAgc2hvdy5sZWdlbmQgPSBUUlVFKSArIA0KICBjb29yZF9mbGlwKCkgKw0KICB0aGVtZV9lY29ub2Rpc3QoKSArIA0KICBzY2FsZV95X2NvbnRpbnVvdXMoZXhwYW5kID0gYygwLCAwKSwgbGltaXRzID0gYygwLCA0NTApLCBicmVha3MgPSBzZXEoMCwgNDUwLCA1MCksIHBvc2l0aW9uID0gInJpZ2h0IikgKyANCiAgbGFicyh0aXRsZSA9ICJIb3VzZWhvbGQgSW5jb21lIEluZXF1YWxpdHkgYnkgUHJvdmluY2UsIDIwMjAiLCANCiAgICAgICBjYXB0aW9uID0gIkRhdGEgU291cmNlOiBWSExTUyAyMDIwIGJ5IEdTT3xHcmFwaGljIERlc2lnbmVyOiBOZ3V5ZW4gQ2hpIER1bmciKSArICANCiAgdGhlbWUocGxvdC5tYXJnaW4gPSB1bml0KGMoMC43LCAxLCAwLjUsIDAuNSksICJjbSIpKSArIA0KICB0aGVtZShheGlzLnRpdGxlLnkgPSBlbGVtZW50X2JsYW5rKCkpICsgDQogIHRoZW1lKGF4aXMudGV4dC55ID0gZWxlbWVudF90ZXh0KGZhbWlseSA9IG15X2ZvbnQsIHNpemUgPSA5KSkgKyANCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoZmFtaWx5ID0gbXlfZm9udCwgc2l6ZSA9IDEwKSkgKyANCiAgdGhlbWUocGxvdC5jYXB0aW9uID0gZWxlbWVudF90ZXh0KGZhbWlseSA9IG15X2ZvbnQsIHNpemUgPSA4LCBoanVzdCA9IDEpKSArIA0KICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KGZhbWlseSA9IG15X2ZvbnQsIHNpemUgPSAxOCwgZmFjZSA9ICJib2xkIiwgY29sb3IgPSAiZ3JleTEwIikpIC0+IGYxDQoNCmdyaWQubmV3cGFnZSgpDQoNCmYxICU+JSANCiAgbGVmdF9hbGlnbihjKCJ0aXRsZSIsICJjYXB0aW9uIikpICU+JSANCiAgYWRkX2Vjb25vZGlzdF9sZWdlbmQoDQogICAgZWNvbm9kaXN0X2xlZ2VuZF9ncm9iKA0KICAgICAgdGVudGhfbGFiID0gIjI1dGggUGVyY2VudGlsZSIsIA0KICAgICAgbmluZXRpZXRoX2xhYiA9ICI3NXRoIFBlcmNlbnRpbGUiLCANCiAgICAgIG1lZF9sYWIgPSAiTWVkaWFuIiwgDQogICAgICBtZWRfY29sID0gImZpcmVicmljayIsIA0KICAgICAgZmFtaWx5ID0gbXlfZm9udCwgDQogICAgICBsYWJlbF9zaXplID0gMTAuNSwNCiAgICApLCANCiAgICBiZWxvdyA9ICJ0aXRsZSINCiAgKSAlPiUgDQogIGdyaWQuZHJhdygpDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQpgYGANCg0K