2020

Data Processing

  # Clear R environment: 
  # NOTE(review): this removes objects from the global environment only;
  # packages that are already attached stay attached.
  rm(list = ls())
  
  # Set working directory
  # The project folder is stored once so that setwd() and the data import
  # below cannot drift apart.
  project_dir <- "D:/0 - My documents/TOOLS/R/Household Income Inequality/HH Income Inequality_2020"
  setwd(project_dir)

  # Pacman: Load necessary packages
  # skimr is included because skim() is called further down in this section.
  library("pacman")
  pacman::p_load(
    rio,
    dplyr,
    summarytools,
    skimr,
    janitor,
    stringi,
    stringr)
  
  # Load data
  ho3 <- import(file.path(project_dir, "HO3_VHLSS2020.dta"))
  # The data viewer is only useful (and only reliably available) in an
  # interactive session, so skip it when the script is run non-interactively.
  if (interactive()) View(ho3)
  
  # Function extracts variable description
  #
  # Builds a two-column lookup table from the "label" attribute of every
  # column in a data frame.
  #
  # df_selected: a data frame whose columns may carry a "label" attribute.
  #
  # Returns a data.frame with one row per column of `df_selected`:
  #   var_name     - the column name
  #   description  - the column's label transliterated to ASCII, or NA when
  #                  the column has no "label" attribute
  extract_description <- function(df_selected) {
    
    labels <- vapply(
      df_selected,
      function(x) {
        # exact = TRUE matters: `attributes(x)$label` would partially match
        # a "labels" (value labels) attribute when no "label" is present.
        lab <- attr(x, "label", exact = TRUE)
        if (is.null(lab)) NA_character_ else as.character(lab)[1]
      },
      character(1)
    )
    
    data.frame(
      var_name = names(df_selected),
      description = stri_trans_general(unname(labels), "Latin-ASCII"),
      stringsAsFactors = FALSE
    )
    
  }
  
  # Print the variable-description table for the loaded data:
  ho3 %>% extract_description()
  
  # Function creates full code by adding zeros
  #
  # Converts `x` to character and left-pads every non-missing element with
  # "0" so that all of them have the width of the longest non-missing element.
  #
  # x: a vector of codes (anything as.character() can convert).
  #
  # Returns a character vector of the same length as `x`. Missing values stay
  # NA and do not affect the padding of the other elements; an empty or
  # all-NA input is returned (as character) without padding.
  # NOTE(review): the width is taken from the as.character() form, so very
  # large doubles that print in scientific notation are padded in that form.
  add_zero <- function(x) {
    
    x_text <- as.character(x)
    ok <- !is.na(x_text)
    
    # Nothing to pad: avoids max() on an empty set.
    if (!any(ok)) {
      return(x_text)
    }
    
    widths <- nchar(x_text[ok], type = "chars")
    x_text[ok] <- paste0(strrep("0", max(widths) - widths), x_text[ok])
    x_text
  }
  
  # Use the function: zero-pad the province code and check its distribution
  ho3 <- ho3 %>% mutate(tinh_n = add_zero(tinh))
  freq(ho3$tinh_n)
  
  #------------------------
  #  Create Household ID
  #------------------------
  
  # Create h_code column: 
  # Pad the remaining location codes (tinh_n already exists from the step
  # above) and concatenate all five padded codes into one household key.
  # Because each part is padded to a fixed width, the concatenation cannot
  # confuse e.g. "1" + "23" with "12" + "3".
  ho3 <- ho3 %>% 
    mutate(huyen_n = add_zero(huyen), 
           xa_n = add_zero(xa), 
           diaban_n = add_zero(diaban), 
           hoso_n = add_zero(hoso)) %>% 
    mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n)) 
  # Only open the data viewer when someone is there to look at it.
  if (interactive()) View(ho3)
  
  
  #-----------------------------
  #  Prepare data for plotting
  #-----------------------------
  
  
  # Extract province info
  # The "labels" attribute of `tinh` is used as a named vector: the names are
  # the province label text and the values are the province codes.
  province_labels <- attr(ho3$tinh, "labels", exact = TRUE)
  if (is.null(province_labels)) {
    stop("Column `tinh` has no \"labels\" attribute to build the province table from.",
         call. = FALSE)
  }
  
  # Create some columns and relabel for provinces
  # Columns are named explicitly instead of relying on the auto-generated
  # column name that data.frame() + clean_names() would produce.
  df_province <- data.frame(
    province_code = add_zero(unname(province_labels)),
    province_vie = names(province_labels),
    stringsAsFactors = FALSE
  )
  
  # English display name: strip diacritics, drop the administrative prefix,
  # and tighten " - " to "-".
  df_province <- 
    df_province %>% 
    mutate(province_eng = stri_trans_general(province_vie, "Latin-ASCII")) %>% 
    mutate(province_eng = str_replace_all(province_eng, "Tinh |Thanh pho ","")) %>% 
    mutate(province_eng = str_replace_all(province_eng, " - ", "-"))
  
  names(df_province)
  
  # Only open the data viewer when the session is interactive.
  if (interactive()) View(df_province)
  
  freq(df_province$province_code)
  
  # Join province names onto the survey records.
  # A left join keeps exactly the rows of ho3; a full join would also append
  # all-NA rows for any labelled province that has no records.
  ho3 <- 
    ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
  
  # Level: HH Income
  # One row per household: sums of chisxkd, chikhac and thunhap over the
  # records with positive thunhap.
  df_thuchi_hh <- ho3 %>% 
    filter(thunhap > 0) %>% # Person have income > 0
    group_by(h_code, province_eng) %>%  # Group by h_code of HH
    summarise(total_chi_SXKD = sum(chisxkd, na.rm = TRUE), 
              total_chi_khac = sum(chikhac, na.rm = TRUE), 
              total_thunhap = sum(thunhap, na.rm = TRUE)) %>% 
    ungroup()
  
  # Only open the data viewer when the session is interactive.
  if (interactive()) View(df_thuchi_hh)
  
  # Namespace-qualified so the call does not depend on skimr being attached.
  skimr::skim(df_thuchi_hh)
  
  # Per-province mean and quartiles of household income, divided by 1000 and
  # rounded to one decimal.
  # NOTE(review): the plot title reports millions VND, so the raw values are
  # presumably in thousands VND - confirm against the survey codebook.
  df_thunhap_hh <- df_thuchi_hh %>% 
    filter(total_thunhap > 0) %>% 
    group_by(province_eng) %>% 
    summarise(avg_income = mean(total_thunhap), 
              th25 = quantile(total_thunhap, 0.25), 
              th50 = quantile(total_thunhap, 0.50), 
              th75 = quantile(total_thunhap, 0.75)) %>% 
    mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>% 
    ungroup() %>% 
    arrange(th50) %>% 
    # Fix the factor levels in median order so the plot axis follows it.
    mutate(province_eng = factor(province_eng, province_eng))

Data Visualization

    #-----------------------------
    #  Data visualization: ggeconodist - HH Income
    #----------------------------- 
    
    # Load some R packages for Data Visualization: 
    
    library(ggeconodist) # install.packages("ggeconodist", repos = "https://cinc.rud.is")
    library(ggplot2)
    library(showtext)
    
    # Render text through showtext and register the Roboto Condensed font
    # from Google Fonts (showtext_auto() replaces the deprecated
    # showtext.auto()).
    showtext_auto()
    
    my_font <- "Roboto Condensed"
    
    font_add_google(name = my_font, family = my_font)
    
    # One econodist bar per province: 25th percentile, median, 75th percentile.
    # The quartiles are precomputed, hence stat = "identity".
    graph_thunhap_hh <- 
      df_thunhap_hh %>% 
      ggplot(aes(x = province_eng)) + 
      geom_econodist(aes(ymin = th25, median = th50, ymax = th75),
                     median_col = "firebrick",
                     stat = "identity",
                     median_point_size = 1,
                     show.legend = TRUE) +
      coord_flip() +
      theme_econodist() +
      scale_y_continuous(expand = c(0, 0), limits = c(0, 350),
                         breaks = seq(0, 350, 50), position = "right") +
      labs(title = "Gaps in Household Income (millions VND) by Province, 2020",
           caption = "N = 46.979| Data Source: VHLSS 2020") +
      theme(
        plot.margin = unit(c(0.7, 1, 0.5, 0.5), "cm"),
        axis.title.y = element_blank(),
        axis.text.y = element_text(family = my_font, size = 9),
        axis.text.x = element_text(family = my_font, size = 10),
        plot.caption = element_text(family = my_font, size = 8, hjust = 1),
        plot.title = element_text(family = my_font, size = 18, face = "bold", color = "grey10")
      )
    
    graph_thunhap_hh  
    
    # Start a fresh page, then draw the plot again with the title and caption
    # left-aligned and the econodist legend placed below the title.
    # grid functions are namespace-qualified so they resolve without
    # attaching grid.
    grid::grid.newpage()
    
    graph_thunhap_hh %>% left_align(c("title", "caption")) %>% 
      add_econodist_legend(
        econodist_legend_grob(
          tenth_lab = "25th Percentile", 
          ninetieth_lab = "75th Percentile", 
          med_lab = "Median", 
          med_col = "firebrick", 
          family = my_font, 
          label_size = 10.5
        ), 
        below = "title"
      ) %>% 
      grid::grid.draw()

2018

Data Processing

  # Clear R environment: 
  # NOTE(review): this removes objects from the global environment only;
  # packages that are already attached stay attached.
  rm(list = ls())
  
  # Set working directory
  # The project folder is stored once so that setwd() and the data import
  # below cannot drift apart.
  project_dir <- "D:/0 - My documents/TOOLS/R/Household Income Inequality/HH Income Inequality_2018"
  setwd(project_dir)
  
  # Pacman: Load necessary packages
  library("pacman")
  pacman::p_load(
    rio,
    dplyr,
    summarytools,
    skimr,
    janitor,
    tidyverse,
    stringi,
    stringr)
  
  # Load data
  ho3 <- import(file.path(project_dir, "HO3_VHLSS2018.dta"))
  
  # Only open the data viewer when the session is interactive.
  if (interactive()) View(ho3)
  
  skim(ho3)
  
  # Plain summary: the former second argument was a call to warning(), which
  # would have been passed positionally as summary()'s `maxsum`.
  summary(ho3)
  
  # Function extracts variable description
  #
  # Builds a two-column lookup table from the "label" attribute of every
  # column in a data frame.
  #
  # df_selected: a data frame whose columns may carry a "label" attribute.
  #
  # Returns a data.frame with one row per column of `df_selected`:
  #   var_name     - the column name
  #   description  - the column's label transliterated to ASCII, or NA when
  #                  the column has no "label" attribute
  extract_description <- function(df_selected) {
    
    labels <- vapply(
      df_selected,
      function(x) {
        # exact = TRUE matters: `attributes(x)$label` would partially match
        # a "labels" (value labels) attribute when no "label" is present.
        lab <- attr(x, "label", exact = TRUE)
        if (is.null(lab)) NA_character_ else as.character(lab)[1]
      },
      character(1)
    )
    
    data.frame(
      var_name = names(df_selected),
      description = stri_trans_general(unname(labels), "Latin-ASCII"),
      stringsAsFactors = FALSE
    )
    
  }
  
  # Print the variable-description table for the loaded data
  ho3 %>% extract_description()
  
  # Function creates full code by adding zeros
  #
  # Converts `x` to character and left-pads every non-missing element with
  # "0" so that all of them have the width of the longest non-missing element.
  #
  # x: a vector of codes (anything as.character() can convert).
  #
  # Returns a character vector of the same length as `x`. Missing values stay
  # NA and do not affect the padding of the other elements; an empty or
  # all-NA input is returned (as character) without padding.
  # NOTE(review): the width is taken from the as.character() form, so very
  # large doubles that print in scientific notation are padded in that form.
  add_zero <- function(x) {
    
    x_text <- as.character(x)
    ok <- !is.na(x_text)
    
    # Nothing to pad: avoids max() on an empty set.
    if (!any(ok)) {
      return(x_text)
    }
    
    widths <- nchar(x_text[ok], type = "chars")
    x_text[ok] <- paste0(strrep("0", max(widths) - widths), x_text[ok])
    x_text
  }
  
  # Use the function: zero-pad the province code and check its distribution
  ho3 <- ho3 %>% mutate(tinh_n = add_zero(tinh))
  
  freq(ho3$tinh_n)
  
    #------------------------
    #  Create Household ID
    #------------------------
  
  # Create h_code column: 
  # Pad the remaining location codes (tinh_n already exists from the step
  # above) and concatenate all five padded codes into one household key.
  # Because each part is padded to a fixed width, the concatenation cannot
  # confuse e.g. "1" + "23" with "12" + "3".
  ho3 <- ho3 %>% 
    mutate(huyen_n = add_zero(huyen), 
           xa_n = add_zero(xa), 
           diaban_n = add_zero(diaban), 
           hoso_n = add_zero(hoso)) %>% 
    mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n)) 
  
  # Only open the data viewer when someone is there to look at it.
  if (interactive()) View(ho3)
  
    #-----------------------------
    #  Prepare data for plotting
    #-----------------------------
  
  # Extract province info
  # The "labels" attribute of `tinh` is used as a named vector: the names are
  # the province label text and the values are the province codes.
  province_labels <- attr(ho3$tinh, "labels", exact = TRUE)
  if (is.null(province_labels)) {
    stop("Column `tinh` has no \"labels\" attribute to build the province table from.",
         call. = FALSE)
  }
  
  # Create some columns and relabel for provinces
  # Columns are named explicitly instead of relying on the auto-generated
  # column name that data.frame() + clean_names() would produce.
  df_province <- data.frame(
    province_code = add_zero(unname(province_labels)),
    province_vie = names(province_labels),
    stringsAsFactors = FALSE
  )
  
  # English display name: strip diacritics, drop the administrative prefix,
  # and tighten " - " to "-".
  df_province <- 
    df_province %>% 
    mutate(province_eng = stri_trans_general(province_vie, "Latin-ASCII")) %>% 
    mutate(province_eng = str_replace_all(province_eng, "Tinh |Thanh pho ","")) %>% 
    mutate(province_eng = str_replace_all(province_eng, " - ", "-"))
  
  names(df_province)
  
  # Only open the data viewer when the session is interactive.
  if (interactive()) View(df_province)
  
  freq(df_province$province_code)
  
  # Join province names onto the survey records.
  # A left join keeps exactly the rows of ho3; a full join would also append
  # all-NA rows for any labelled province that has no records.
  ho3 <- 
    ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
  
  # Level: HH Income
  # One row per household: sums of chisxkd, chikhac and thunhap over the
  # records with positive thunhap.
  df_thuchi_hh <- ho3 %>% 
    filter(thunhap > 0) %>% # Person have income > 0
    group_by(h_code, province_eng) %>%  # Group by h_code of HH
    summarise(total_chi_SXKD = sum(chisxkd, na.rm = TRUE), 
              total_chi_khac = sum(chikhac, na.rm = TRUE), 
              total_thunhap = sum(thunhap, na.rm = TRUE)) %>% 
    ungroup()
  
  # Only open the data viewer when the session is interactive.
  if (interactive()) View(df_thuchi_hh)
  
  skim(df_thuchi_hh)
  
  # Per-province mean and quartiles of household income, divided by 1000 and
  # rounded to one decimal.
  # NOTE(review): the plot title reports millions VND, so the raw values are
  # presumably in thousands VND - confirm against the survey codebook.
  df_thunhap_hh <- df_thuchi_hh %>% 
    filter(total_thunhap > 0) %>% 
    group_by(province_eng) %>% 
    summarise(avg_income = mean(total_thunhap), 
              th25 = quantile(total_thunhap, 0.25), 
              th50 = quantile(total_thunhap, 0.50), 
              th75 = quantile(total_thunhap, 0.75)) %>% 
    mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>% 
    ungroup() %>% 
    arrange(th50) %>% 
    # Fix the factor levels in median order so the plot axis follows it.
    mutate(province_eng = factor(province_eng, province_eng))
  
  if (interactive()) View(df_thunhap_hh)
  
  skim(df_thunhap_hh)
  
  summary(df_thunhap_hh)

Data Visualization

  #-----------------------------
  #  Data visualization: ggeconodist - HH Income
  #----------------------------- 

  # Load some R packages for Data Visualization: 
  
  library(ggeconodist) # install.packages("ggeconodist", repos = "https://cinc.rud.is")
  library(ggplot2)
  library(showtext)
  
  # Render text through showtext and register the Roboto Condensed font
  # from Google Fonts (showtext_auto() replaces the deprecated
  # showtext.auto()).
  showtext_auto()
  
  my_font <- "Roboto Condensed"
  
  font_add_google(name = my_font, family = my_font)
  
  # One econodist bar per province: 25th percentile, median, 75th percentile.
  # The quartiles are precomputed, hence stat = "identity".
  graph_thunhap_hh <- 
    df_thunhap_hh %>% 
    ggplot(aes(x = province_eng)) + 
    geom_econodist(aes(ymin = th25, median = th50, ymax = th75),
                   median_col = "firebrick",
                   stat = "identity",
                   median_point_size = 1,
                   show.legend = TRUE) +
    coord_flip() +
    theme_econodist() +
    scale_y_continuous(expand = c(0, 0), limits = c(0, 350),
                       breaks = seq(0, 350, 50), position = "right") +
    labs(title = "Gaps in Household Income (millions VND) by Province, 2018",
         caption = "N = 46.963 | Data Source: VHLSS 2018") +
    theme(
      plot.margin = unit(c(0.7, 1, 0.5, 0.5), "cm"),
      axis.title.y = element_blank(),
      axis.text.y = element_text(family = my_font, size = 9),
      axis.text.x = element_text(family = my_font, size = 10),
      plot.caption = element_text(family = my_font, size = 8, hjust = 1),
      plot.title = element_text(family = my_font, size = 18, face = "bold", color = "grey10")
    )
  
  graph_thunhap_hh  
  
  # Start a fresh page, then draw the plot again with the title and caption
  # left-aligned and the econodist legend placed below the title.
  # grid functions are namespace-qualified so they resolve without
  # attaching grid.
  grid::grid.newpage()
  
  graph_thunhap_hh %>% left_align(c("title", "caption")) %>% 
    add_econodist_legend(
      econodist_legend_grob(
        tenth_lab = "25th Percentile", 
        ninetieth_lab = "75th Percentile", 
        med_lab = "Median", 
        med_col = "firebrick", 
        family = my_font, 
        label_size = 10.5
      ), 
      below = "title"
    ) %>% 
    grid::grid.draw()

Household Income Inequality by Province (2020 & 2018)

Visualization - VHLSS

Author: Thao Bui

2024-02-06

2020

Data Processing

Data Visualization

2018

Data Processing

Data Visualization