Motivation

Visualization - VHLSS
Topic: Gaps in Household Income by Province
Year: 2014 - 2022

Data Processing

Set-up

Set working directory, and load necessary packages

# Clear R environment: 
rm(list = ls())

# Set working directory
setwd("D:/0 - My documents/TOOLS/R/Household Income Inequality")

# Pacman: Load necessary packages
library("pacman")
pacman::p_load(
  rio,
  dplyr,
  summarytools,
  skimr,
  janitor,
  tidyverse,
  stringi,
  stringr)

Data 2022

## 2022 - HH
{
  df_2022_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/SL_ThongTinHo.dta")
  
  View(df_2022_ho3)
  skim(df_2022_ho3)
  
  df_2022_ho3 <- df_2022_ho3 %>% 
    select(idho, matinh, tentinh, thunhap)
  
  # Function creates full code by adding zeros
  add_zero <- function(x) {
    
    tibble(x_text = as.character(x)) %>% 
      mutate(n_digits = str_count(x_text),
             n_max = max(n_digits), 
             delta = n_max - n_digits, 
             pre = strrep("0", times = delta), 
             full_code = str_c(pre, x_text)) %>% 
      pull(full_code) %>% 
      return()
  }
  
  # Use the function: 
  df_2022_ho3 <- df_2022_ho3 %>% mutate(tinh_n = add_zero(matinh))
  View(df_2022_ho3)
  
  # Extract province info
  
  df_province <- df_2022_ho3 %>% 
    select(matinh, tentinh) %>% 
    distinct()
  
  View(df_province)
  
  # Create some columns and relabel for provinces
  df_province <- 
    df_province %>% 
    rename(province_vie = tentinh,
           province_code  = matinh) %>% 
    mutate(province_eng = stri_trans_general(province_vie, "Latin-ASCII")) %>% 
    mutate(province_eng = str_replace_all(province_eng, "Tinh |Thanh pho ","")) %>% 
    mutate(province_eng = str_replace_all(province_eng, " - ", "-")) %>% 
    mutate(province_code = add_zero(province_code))
  
  View(df_province)
  
  # Left join data
  df_2022_ho3 <- 
    df_2022_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
  
  View(df_2022_ho3)
  
  # Caculate income
  df_2022_ho3_income <- df_2022_ho3 %>% 
    filter(thunhap > 0) %>% # Person have income > 0
    filter(province_eng !="test chuong trinh") %>% 
    group_by(idho, province_eng) %>%  # Group by idho of HH
    summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>% 
    ungroup()
  
  View(df_2022_ho3_income)
  
  df_2022_province_income <- df_2022_ho3_income %>% 
    filter(total_thunhap > 0) %>% 
    group_by(province_eng) %>% 
    summarise(avg_income = mean(total_thunhap), 
              th25 = quantile(total_thunhap, 0.25), 
              th50 = quantile(total_thunhap, 0.50), 
              th75 = quantile(total_thunhap, 0.75)) %>% 
    mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>% 
    ungroup() %>% 
    arrange(th50) %>% 
    mutate(province_eng = factor(province_eng, province_eng))
  
  View(df_2022_province_income)
  
  df_2022_province_income <- df_2022_province_income %>% 
    mutate(year = 2022)
}

Data 2020

## 2020 - HH
{
  df_2020_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/HO3_VHLSS2020.dta")
  View(df_2020_ho3)
  
  # Function creates full code by adding zeros
  add_zero <- function(x) {
    
    tibble(x_text = as.character(x)) %>% 
      mutate(n_digits = str_count(x_text),
             n_max = max(n_digits), 
             delta = n_max - n_digits, 
             pre = strrep("0", times = delta), 
             full_code = str_c(pre, x_text)) %>% 
      pull(full_code) %>% 
      return()
  }
  
  # Use the function: 
  df_2020_ho3 <- df_2020_ho3 %>% mutate(tinh_n = add_zero(tinh))
  freq(df_2020_ho3$tinh_n)
  
  # Create h_code column: 
  df_2020_ho3 <- df_2020_ho3 %>% 
    mutate(tinh_n = add_zero(tinh), 
           huyen_n = add_zero(huyen), 
           xa_n = add_zero(xa), 
           diaban_n = add_zero(diaban), 
           hoso_n = add_zero(hoso)) %>% 
    mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n)) 
  
  View(df_2020_ho3)
  
  # Left join data
  df_2020_ho3 <- 
    df_2020_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
  
  View(df_2020_ho3)
  
  # Caculate income
  df_2020_ho3_income <- df_2020_ho3 %>% 
    filter(thunhap > 0) %>% # Person have income > 0
    group_by(h_code, province_eng) %>%  # Group by h_code of HH
    summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>% 
    ungroup()
  
  df_2020_province_income <- df_2020_ho3_income %>% 
    filter(total_thunhap > 0) %>% 
    group_by(province_eng) %>% 
    summarise(avg_income = mean(total_thunhap), 
              th25 = quantile(total_thunhap, 0.25), 
              th50 = quantile(total_thunhap, 0.50), 
              th75 = quantile(total_thunhap, 0.75)) %>% 
    mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>% 
    ungroup() %>% 
    arrange(th50) %>% 
    mutate(province_eng = factor(province_eng, province_eng))
  
  df_2020_province_income <- df_2020_province_income %>% 
    mutate(year = 2020)
}

Data 2018

## 2018 - HH
{
  df_2018_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/HO3_VHLSS2018.dta")
  
  # Function creates full code by adding zeros
  add_zero <- function(x) {
    
    tibble(x_text = as.character(x)) %>% 
      mutate(n_digits = str_count(x_text),
             n_max = max(n_digits), 
             delta = n_max - n_digits, 
             pre = strrep("0", times = delta), 
             full_code = str_c(pre, x_text)) %>% 
      pull(full_code) %>% 
      return()
  }
  
  # Use the function: 
  df_2018_ho3 <- df_2018_ho3 %>% mutate(tinh_n = add_zero(tinh))
  freq(df_2018_ho3$tinh_n)
  
  # Create h_code column: 
  df_2018_ho3 <- df_2018_ho3 %>% 
    mutate(tinh_n = add_zero(tinh), 
           huyen_n = add_zero(huyen), 
           xa_n = add_zero(xa), 
           diaban_n = add_zero(diaban), 
           hoso_n = add_zero(hoso)) %>% 
    mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n)) 
  
  View(df_2018_ho3)
  
  # Left join data
  df_2018_ho3 <- 
    df_2018_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
  
  View(df_2018_ho3)
  
  # Caculate income
  df_2018_ho3_income <- df_2018_ho3 %>% 
    filter(thunhap > 0) %>% # Person have income > 0
    group_by(h_code, province_eng) %>%  # Group by h_code of HH
    summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>% 
    ungroup()
  
  df_2018_province_income <- df_2018_ho3_income %>% 
    filter(total_thunhap > 0) %>% 
    group_by(province_eng) %>% 
    summarise(avg_income = mean(total_thunhap), 
              th25 = quantile(total_thunhap, 0.25), 
              th50 = quantile(total_thunhap, 0.50), 
              th75 = quantile(total_thunhap, 0.75)) %>% 
    mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>% 
    ungroup() %>% 
    arrange(th50) %>% 
    mutate(province_eng = factor(province_eng, province_eng))
  
  df_2018_province_income <- df_2018_province_income %>% 
    mutate(year = 2018)
  
}

Data 2016

## 2016 - HH
{
  df_2016_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/Ho3_VHLSS2016.dta")
  
  # Function creates full code by adding zeros
  add_zero <- function(x) {
    
    tibble(x_text = as.character(x)) %>% 
      mutate(n_digits = str_count(x_text),
             n_max = max(n_digits), 
             delta = n_max - n_digits, 
             pre = strrep("0", times = delta), 
             full_code = str_c(pre, x_text)) %>% 
      pull(full_code) %>% 
      return()
  }
  
  # Use the function: 
  df_2016_ho3 <- df_2016_ho3 %>% mutate(tinh_n = add_zero(tinh))
  freq(df_2016_ho3$tinh_n)
  
  # Create h_code column: 
  df_2016_ho3 <- df_2016_ho3 %>% 
    mutate(tinh_n = add_zero(tinh), 
           huyen_n = add_zero(huyen), 
           xa_n = add_zero(xa), 
           diaban_n = add_zero(diaban), 
           hoso_n = add_zero(hoso)) %>% 
    mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n)) 
  
  View(df_2016_ho3)
  
  # Left join data
  df_2016_ho3 <- 
    df_2016_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
  
  View(df_2016_ho3)
  
  # Caculate income
  df_2016_ho3_income <- df_2016_ho3 %>% 
    filter(thunhap > 0) %>% # Person have income > 0
    group_by(h_code, province_eng) %>%  # Group by h_code of HH
    summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>% 
    ungroup()
  
  df_2016_province_income <- df_2016_ho3_income %>% 
    filter(total_thunhap > 0) %>% 
    group_by(province_eng) %>% 
    summarise(avg_income = mean(total_thunhap), 
              th25 = quantile(total_thunhap, 0.25), 
              th50 = quantile(total_thunhap, 0.50), 
              th75 = quantile(total_thunhap, 0.75)) %>% 
    mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>% 
    ungroup() %>% 
    arrange(th50) %>% 
    mutate(province_eng = factor(province_eng, province_eng))
  
  df_2016_province_income <- df_2016_province_income %>% 
    mutate(year = 2016)
}

Data 2014

## 2014 - HH
{
  df_2014_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/Ho3_VHLSS2014.dta")
  
  # Function creates full code by adding zeros
  add_zero <- function(x) {
    
    tibble(x_text = as.character(x)) %>% 
      mutate(n_digits = str_count(x_text),
             n_max = max(n_digits), 
             delta = n_max - n_digits, 
             pre = strrep("0", times = delta), 
             full_code = str_c(pre, x_text)) %>% 
      pull(full_code) %>% 
      return()
  }
  
  # Use the function: 
  df_2014_ho3 <- df_2014_ho3 %>% mutate(tinh_n = add_zero(tinh))
  freq(df_2014_ho3$tinh_n)
  
  # Create h_code column: 
  df_2014_ho3 <- df_2014_ho3 %>% 
    mutate(tinh_n = add_zero(tinh), 
           huyen_n = add_zero(huyen), 
           xa_n = add_zero(xa), 
           diaban_n = add_zero(diaban), 
           hoso_n = add_zero(hoso)) %>% 
    mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n)) 
  
  View(df_2014_ho3)
  
  # Left join data
  df_2014_ho3 <- 
    df_2014_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
  
  View(df_2014_ho3)
  
  # Caculate income
  df_2014_ho3_income <- df_2014_ho3 %>% 
    filter(thunhap > 0) %>% # Person have income > 0
    group_by(h_code, province_eng) %>%  # Group by h_code of HH
    summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>% 
    ungroup()
  
  df_2014_province_income <- df_2014_ho3_income %>% 
    filter(total_thunhap > 0) %>% 
    group_by(province_eng) %>% 
    summarise(avg_income = mean(total_thunhap), 
              th25 = quantile(total_thunhap, 0.25), 
              th50 = quantile(total_thunhap, 0.50), 
              th75 = quantile(total_thunhap, 0.75)) %>% 
    mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>% 
    ungroup() %>% 
    arrange(th50) %>% 
    mutate(province_eng = factor(province_eng, province_eng))
  
  df_2014_province_income <- df_2014_province_income %>% 
    mutate(year = 2014)
  
  View(df_2014_province_income)
}

Append data

## Append data
{
  append_df_5years <- bind_rows(df_2022_province_income,
                                df_2020_province_income,
                                df_2018_province_income,
                                df_2016_province_income,
                                df_2014_province_income )
}

Visualization

Change structure

  # Change wide to long
  append_df_5years_long <- append_df_5years %>% 
    pivot_longer(
      cols = th25:th75,
      names_to = "Quantile",
      values_to = "Values"
    )
  
  skim(append_df_5years_long)
  
  # Convert the numeric column to integers
  append_df_5years_long$year <- as.integer(append_df_5years_long$year)
  
  # Mutate new variable
  append_df_5years_long <- append_df_5years_long %>%
    mutate(
      Quartiles = case_when(
        Quantile == "th25" ~ "Q1",
        Quantile == "th50" ~ "Q2",
        Quantile == "th75" ~ "Q3",
        TRUE ~ Quantile  # Keep the original value if none of the conditions match
      )
    )

Graph

  # Load packages
  library("pacman")
  pacman::p_load(
    gganimate,
    ggplot2,
    dplyr,
    gapminder,
    ggthemes,
    gifski,
    showtext,
    transformr)
  
  # Select Ubuntu Condensed font: 
  showtext.auto()  
  
  my_font <- "Roboto Condensed"
  
  font_add_google(name = my_font, family = my_font)
  
  graph <- ggplot(append_df_5years_long,
                             aes(x= Values,
                                 y = province_eng))+
    geom_line(color = "grey", linewidth = 1.5,alpha = 0.5)+
    geom_point(aes(color = Quartiles), size = 2)+
    scale_x_continuous(expand = c(0, 0), limits = c(0, 400), breaks = seq(0, 400, 50), position = "top")+
    scale_color_manual(values = c("#6E9FC6", "#AE123A", "#2A5783"))+
    theme_minimal()+
    labs(title = "Gaps in houshold Income (millions VND) by Province",
         caption = "Data Source: VHLSS")+
    theme(plot.margin = unit(c(0.7, 1, 0.5, 0.5), "cm"))+
    theme(legend.position="bottom")+
    theme(axis.title.y = element_blank()) +
    theme(axis.text.y = element_text(size = 9)) + 
    theme(axis.text.x = element_text(size = 9)) +
    theme(plot.title = element_text(size = 13, face = "bold", color = "grey10"))+
    theme(axis.title.y = element_blank())+
    theme(axis.title.x = element_blank())
  
  graph

Animation

graph.animation <-  graph +
    transition_time(year) +
    labs(subtitle = "Year: {frame_time}")
  
  animate(graph.animation, height = 1100, width = 700, fps = 25, duration = 7, end_pause = 90, res = 100)
  
  anim_save("Result/Gaps HH Income.gif")