Motivation

Visualization - VHLSS

Topic: Household Income Inequality by Province

Year: 2022

Reference: Nguyen Chi Dung

Data Processing

Clean Data

Set the working directory and load the household data (ho3) from VHLSS 2022:

  # Run this script in a fresh R session (e.g. Session > Restart R) instead of
  # calling rm(list = ls()): that call only removes objects and leaves attached
  # packages and options in place, so it does not give a clean state.

  # Project folder, defined once so that the working directory and the import
  # path below cannot drift apart.
  data_dir <- "D:/0 - My documents/TOOLS/R/Household Income Inequality/HH Income Inequality_2022"

  # Set working directory
  setwd(data_dir)

  # Pacman: load (and install if missing) the necessary packages
  library("pacman")
  pacman::p_load(
    rio,
    dplyr,
    summarytools,
    janitor,
    stringi,
    stringr)

  # Load data
  ho3 <- import(file.path(data_dir, "SL_ThongTinHo.dta"))

  # The data viewer is only available in interactive sessions
  if (interactive()) {
    View(ho3)
  }

Write function extract_description to describe data:

  # Function extracts variable descriptions (the "label" attribute of each
  # column) and transliterates them to plain ASCII.
  #
  # Args:
  #   df_selected: a data frame whose columns may carry a "label" attribute.
  #
  # Returns:
  #   A data frame with one row per column of df_selected and two character
  #   columns: var_name (the column name) and description (the ASCII label,
  #   NA when the column has no label).
  extract_description <- function(df_selected) {

    # vapply guarantees a character vector even when some columns have no
    # label; exact = TRUE stops "label" from partially matching "labels".
    labels <- vapply(
      df_selected,
      function(x) {
        lab <- attr(x, "label", exact = TRUE)
        if (is.null(lab)) NA_character_ else as.character(lab)[1]
      },
      character(1)
    )

    data.frame(
      var_name = names(df_selected),
      description = stringi::stri_trans_general(unname(labels), "Latin-ASCII"),
      row.names = NULL,
      stringsAsFactors = FALSE
    )
  }
  
  # Show the variable descriptions of the household data
  ho3 %>% extract_description()

Write function add_zero to standardize province codes:

  # Function creates full codes by left-padding with zeros so that every code
  # has as many characters as the longest code in x.
  #
  # Args:
  #   x: a vector of codes (numeric or character).
  #
  # Returns:
  #   A character vector of the same length as x. Missing codes stay NA and do
  #   not affect the padding of the other codes.
  add_zero <- function(x) {

    x_text <- as.character(x)
    n_digits <- nchar(x_text, type = "chars", keepNA = TRUE)

    # Nothing to pad for empty or all-missing input (avoids max() on no data)
    if (length(x_text) == 0L || all(is.na(n_digits))) {
      return(x_text)
    }

    # na.rm = TRUE: one missing code must not turn every result into NA
    n_max <- max(n_digits, na.rm = TRUE)
    delta <- ifelse(is.na(n_digits), 0L, n_max - n_digits)

    full_code <- paste0(strrep("0", delta), x_text)
    full_code[is.na(x_text)] <- NA_character_
    full_code
  }
  
  # Add the zero-padded province code as column tinh_n
  ho3 <- mutate(ho3, tinh_n = add_zero(matinh))

Prepare data for plotting

  # Province lookup table: one row per distinct (code, name) pair found in ho3,
  # with the name kept as-is (province_vie) and as a cleaned ASCII label
  # (province_eng), and the code zero-padded to match ho3$tinh_n.
  df_province <- ho3 %>%
    select(matinh, tentinh) %>%
    distinct() %>%
    janitor::clean_names() %>%
    #      NEW name       # OLD name
    rename(province_code = matinh) %>%
    mutate(
      province_vie = tentinh,
      # Strip diacritics, then drop the "Tinh " / "Thanh pho " prefixes and
      # tighten " - " to "-" to get a short plot label
      province_eng = stri_trans_general(province_vie, "Latin-ASCII"),
      province_eng = str_replace_all(province_eng, "Tinh |Thanh pho ", ""),
      province_eng = str_replace_all(province_eng, " - ", "-"),
      province_code = add_zero(province_code)
    )

  # The data viewer is only available in interactive sessions
  if (interactive()) {
    View(df_province)
  }

  # A code with several name spellings would duplicate ho3 rows in the join
  if (anyDuplicated(df_province$province_code) > 0) {
    warning(
      "Some province codes map to more than one name; ",
      "joining will duplicate rows of ho3.",
      call. = FALSE
    )
  }

  # Attach the province labels to ho3. A left join of only the new columns
  # keeps every ho3 row and avoids creating tentinh.x / tentinh.y duplicates.
  ho3 <- ho3 %>%
    left_join(
      df_province %>% select(province_code, province_vie, province_eng),
      by = c("tinh_n" = "province_code")
    )

  if (interactive()) {
    View(ho3)
  }
  
  # Calculate income (thunhap) per household: keep records with positive
  # income, drop the "test chuong trinh" placeholder province, and sum income
  # within each household (idho) and province.
  df_thu_hh <- ho3 %>%
    filter(thunhap > 0) %>%
    filter(province_eng != "test chuong trinh") %>%
    group_by(idho, province_eng) %>%
    summarise(total_thunhap = sum(thunhap, na.rm = TRUE), .groups = "drop")

  # Province-level summary: mean and quartiles of household income, divided by
  # 1000 and rounded to one decimal (the plot title reports millions VND, so
  # thunhap is presumably recorded in thousand VND - TODO confirm).
  # Provinces are ordered by median so the factor levels drive the plot order.
  df_thunhap_hh <- df_thu_hh %>%
    filter(total_thunhap > 0) %>%
    group_by(province_eng) %>%
    summarise(
      avg_income = mean(total_thunhap),
      th25 = quantile(total_thunhap, 0.25),
      th50 = quantile(total_thunhap, 0.50),
      th75 = quantile(total_thunhap, 0.75),
      .groups = "drop"
    ) %>%
    mutate(across(where(is.numeric), function(x) round(x / 1000, 1))) %>%
    arrange(th50) %>%
    mutate(province_eng = factor(province_eng, levels = province_eng))

  # The data viewer is only available in interactive sessions
  if (interactive()) {
    View(df_thunhap_hh)
  }

Data visualization

  #----------------------------------------------------------------------------------------------------------------------
  #                            Data Visualization
  # Ref: https://www.economist.com/united-states/2019/06/29/will-transparent-pricing-make-americas-health-care-cheaper
  #      https://www.stata.com/meeting/switzerland20/slides/Switzerland20_Gamma.pdf
  #----------------------------------------------------------------------------------------------------------------------
  # Load some R packages for Data Visualization:

  library(ggeconodist) # install.packages("ggeconodist", repos = "https://cinc.rud.is")
  library(ggplot2)
  library(showtext)

  # Render text with showtext so the Google font registered below is used
  showtext_auto()

  # Select Roboto Condensed font:
  my_font <- "Roboto Condensed"

  font_add_google(name = my_font, family = my_font)

  # Economist-style distribution chart: one bar per province spanning the 25th
  # to 75th percentile of household income, with the median marked.
  graph_thunhap_hh <-
    df_thunhap_hh %>%
    ggplot(aes(x = province_eng)) +
    geom_econodist(
      aes(ymin = th25, median = th50, ymax = th75),
      median_col = "firebrick",
      stat = "identity",
      median_point_size = 1,
      show.legend = TRUE
    ) +
    coord_flip() +
    theme_econodist() +
    # Fixed 0-400 axis: bars reaching beyond 400 are dropped by ggplot2
    scale_y_continuous(
      expand = c(0, 0),
      limits = c(0, 400),
      breaks = seq(0, 400, 50),
      position = "right"
    ) +
    labs(
      title = "Gaps in Household Income (millions VND) by Province, 2022",
      caption = "N = 48.192| Data Source: VHLSS 2022"
    ) +
    theme(
      plot.margin = unit(c(0.7, 1, 0.5, 0.5), "cm"),
      axis.title.y = element_blank(),
      axis.text.y = element_text(family = my_font, size = 9),
      axis.text.x = element_text(family = my_font, size = 10),
      plot.caption = element_text(family = my_font, size = 8, hjust = 1),
      plot.title = element_text(family = my_font, size = 18, face = "bold", color = "grey10")
    )

  graph_thunhap_hh

  # grid is not attached by the packages above, so call it by namespace
  grid::grid.newpage()

  # Left-align title and caption, add the percentile legend under the title,
  # and draw the assembled figure
  graph_thunhap_hh %>%
    left_align(c("title", "caption")) %>%
    add_econodist_legend(
      econodist_legend_grob(
        tenth_lab = "25th Percentile",
        ninetieth_lab = "75th Percentile",
        med_lab = "Median",
        med_col = "firebrick",
        family = my_font,
        label_size = 10.5
      ),
      below = "title"
    ) %>%
    grid::grid.draw()