Motivation

From 2014 to 2022, the Southeast and Red River Delta regions had the highest average incomes in the country, while the Northwest had the lowest. The North Central Coast region notably climbed from the second lowest to the fourth position in average household income, surpassing the Central Highlands and Northeast regions.

Data Processing

# Environment setup ----
# NOTE: the former `rm(list = ls())` call was removed on purpose. This script
# reads `append_df_5years_long` and calls `add_zero()`, and neither of them is
# created in this file, so wiping the global environment here made the later
# steps fail with "object not found".
# Fail fast with a clear message when the income table is missing.
# NOTE(review): `add_zero()` is presumably a user-defined helper from an
# earlier script - confirm it is defined before the province codes are padded.
if (!exists("append_df_5years_long")) {
  stop(
    "`append_df_5years_long` must be created before running this script.",
    call. = FALSE
  )
}

# Set working directory (relative paths such as "Result/area.gif" depend on it)
setwd("D:/0 - My documents/TOOLS/R/Household Income Inequality")

 # Pacman: Load necessary packages
  library("pacman")
  pacman::p_load(
    rio,
    dplyr,
    summarytools,
    skimr,
    janitor,
    tidyverse,
    stringi,
    stringr)
  
  # Read the province/region table from the Stata file
  vung <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/Vung.dta")

  View(vung)

  # English region names, keyed by the ASCII-folded Vietnamese region name
  region_names_eng <- c(
    "Bac Trung Bo" = "North Central Coast",
    "Dong Bac bo" = "Northeast",
    "Dong bang song Hong" = "Red River Delta",
    "Nam Trung Bo" = "South Central Coast",
    "Tay Bac Bo" = "Northwest",
    "Tay Nam Bo" = "Mekong River Delta",
    "Tay Nguyen" = "Central Highlands",
    "Vung Dong Nam Bo" = "Southeast"
  )

  # Derive ASCII province names, padded province codes and English region names
  vung <- vung %>%
    rename(province_vie = tentinh, province_code = codetinh) %>%
    mutate(
      # Strip diacritics, drop the "Tinh "/"Thanh pho " prefix, tighten " - "
      province_eng = province_vie %>%
        stri_trans_general("Latin-ASCII") %>%
        str_replace_all("Tinh |Thanh pho ", "") %>%
        str_replace_all(" - ", "-"),
      province_code = add_zero(province_code),
      area_eng = stri_trans_general(area, "Latin-ASCII"),
      # Translate known regions; keep the original value when there is no match
      area_eng1 = coalesce(unname(region_names_eng[area_eng]), area_eng)
    )
  
  # Quick checks on the region columns.
  # `area` is a column of `vung`, not a standalone object at this point in the
  # script, so it has to be accessed through `vung`.
  skim(vung$area)

  freq(vung$area_eng)

  names(vung)

  # Keep only the join key and the English region name
  vung <- vung %>%
    select(province_eng, area_eng1)
  
  # Attach the region to every province-year income record ----

  # One row per province/region pair, so the join does not multiply income rows
  province_area <- vung %>%
    distinct(province_eng, area_eng1)

  # Provinces with income data but no region would silently end up in an NA
  # region; report them instead of hiding the mismatch.
  provinces_without_area <- setdiff(
    unique(append_df_5years_long$province_eng),
    province_area$province_eng
  )
  if (length(provinces_without_area) > 0) {
    warning(
      "No region found for: ",
      paste(provinces_without_area, collapse = ", "),
      call. = FALSE
    )
  }

  # Left join: keep every income record and add its region. (The previous
  # full join also added lookup-only provinces as rows with missing year and
  # income.)
  append_df_5years_area <- append_df_5years_long %>%
    left_join(province_area, by = "province_eng")

  View(append_df_5years_area)

  names(append_df_5years_area)

  # Keep the analysis columns and drop duplicated records. No grouping is left
  # behind; the aggregation step sets its own groups.
  append_df_5years_area <- append_df_5years_area %>%
    select(province_eng, area_eng1, avg_income, year) %>%
    distinct()
  
  summary(append_df_5years_area$avg_income)

  skim(append_df_5years_area$avg_income)

  # Correct the single out-of-scale record (148316880.2 -> 148.3).
  # NOTE(review): presumably this value was recorded in VND instead of
  # millions of VND - confirm against the raw data.
  # The match uses a tolerance because an exact `==` on a floating-point value
  # silently fails when the stored number carries more decimals than the
  # literal typed here.
  append_df_5years_area <- append_df_5years_area %>%
    mutate(
      avg_income = if_else(
        near(avg_income, 148316880.2, tol = 0.1),
        148.3,
        avg_income
      )
    )
  
  # Mean income per region and year ----
  # `.groups = "drop"` returns an ungrouped table; the default would leave
  # `area` grouped by `area_eng1` and carry that grouping into later steps.
  # Missing incomes are not removed: a group containing NA yields NA.
  area <- append_df_5years_area %>%
    group_by(area_eng1, year) %>%
    summarise(avg_income = mean(avg_income), .groups = "drop")

  # "Whole areas" series: for each year, the unweighted mean of the regional
  # means computed above (not a province- or population-weighted average).
  wholecountry <- area %>%
    group_by(year) %>%
    summarise(avg_income = mean(avg_income), .groups = "drop") %>%
    mutate(area_eng1 = "Whole areas") %>%
    select(area_eng1, year, avg_income)

  View(wholecountry)

  # Append the overall series to the regional series
  area <- bind_rows(area, wholecountry)
  summary(area$avg_income)

Data Visualization

library("pacman")
  pacman::p_load(
    ggthemes,
    grid)
  
  # Tick labels for the years 2014-2024: every second year up to 2022 is
  # printed, all other ticks are left blank.
  axis_years <- seq(2014, 2024, 1)
  label_y <- ifelse(
    axis_years <= 2022 & axis_years %% 2 == 0,
    as.character(axis_years),
    ""
  )

  # Helper: the 2022 rows of `area` for the requested series, used to anchor
  # the end-of-line text labels.
  labels_2022 <- function(series) {
    area %>%
      filter(year == 2022, area_eng1 %in% series)
  }

  # One label frame per group of series that shares a vertical label offset
  df_text <- labels_2022(c("Southeast", "Red River Delta", "Northwest"))
  df_text_mekong <- labels_2022("Mekong River Delta")
  df_text_whole <- labels_2022("Whole areas")
  df_text_north <- labels_2022("North Central Coast")
  df_text_south <- labels_2022("South Central Coast")
  df_text_central <- labels_2022("Central Highlands")
  df_text_northeast <- labels_2022("Northeast")

  skim(area)
  freq(area$area_eng1)
  
  # Static chart: regional income lines, North Central Coast highlighted ----

  # Line styles are keyed by series name. Unnamed `values` are assigned by
  # position, i.e. by the alphabetical order of the series present in the
  # data, so an unexpected or missing series would silently shift every style.
  series_names <- c(
    "Central Highlands", "Mekong River Delta", "North Central Coast",
    "Northeast", "Northwest", "Red River Delta", "South Central Coast",
    "Southeast", "Whole areas"
  )
  highlighted_series <- "North Central Coast"
  overall_series <- "Whole areas"

  line_types <- setNames(rep("solid", length(series_names)), series_names)
  line_types[overall_series] <- "dashed"

  line_widths <- setNames(rep(0.5, length(series_names)), series_names)
  line_widths[highlighted_series] <- 1
  line_widths[overall_series] <- 2.5

  line_colours <- setNames(rep("#8DBBDC", length(series_names)), series_names)
  line_colours[highlighted_series] <- "#2A5783"
  line_colours[overall_series] <- "grey80"

  # Helper: text label just to the right of the 2022 point, shifted
  # vertically by `dy` to keep neighbouring labels from overlapping.
  end_label <- function(data, dy = 0, size = 2.75) {
    geom_text(
      data = data,
      aes(year + 0.05, avg_income + dy, label = area_eng1),
      size = size, hjust = 0, show.legend = FALSE
    )
  }

  graph_area <- area %>%
    ggplot(aes(x = year, y = avg_income, group = area_eng1, color = area_eng1)) +
    geom_line(aes(linetype = area_eng1, linewidth = area_eng1), show.legend = FALSE) +
    scale_linetype_manual(values = line_types) +
    scale_linewidth_manual(values = line_widths) +
    scale_color_manual(values = line_colours) +
    theme_fivethirtyeight() +
    scale_y_continuous(limits = c(70, 250)) +
    # The x axis runs to 2024 to leave room for the labels after 2022
    scale_x_continuous(
      limits = c(2014, 2024), breaks = seq(2014, 2024, 1),
      labels = label_y, expand = c(0, 0)
    ) +
    # Add line labels
    end_label(df_text) +
    end_label(df_text_mekong, dy = 15) +
    end_label(df_text_whole, dy = 8, size = 4) +
    end_label(df_text_north, dy = -4) +
    end_label(df_text_south) +
    end_label(df_text_central, dy = -3) +
    end_label(df_text_northeast, dy = -8) +
    labs(
      title = "Gaps in Household Income (millions VND) by Area\n2014-2022",
      caption = "Data source: VHLSS | Author: Thao Bui",
      subtitle = "From 2014 to 2022, the Southeast and Red River Delta regions had the highest average incomes in the country,\nwhile the Northwest had the lowest.\nThe North Central Coast region notably climbed from the second lowest to the fourth position in average household\nincome, surpassing the Central Highlands and Northeast regions."
    ) +
    theme(
      plot.margin = unit(c(0.7, 0.5, 0.5, 0.5), "cm"),
      panel.grid.major.x = element_line(color = "white", linewidth = 0.5),
      panel.grid.major.y = element_line(color = "white", linewidth = 0.5),
      panel.grid.minor.y = element_line(color = "white", linewidth = 0.1),
      plot.title = element_text(size = 12, color = "#2A5783"),
      plot.subtitle = element_text(size = 7, color = "grey40"),
      plot.caption = element_text(size = 7, color = "grey40"),
      axis.text.x = element_text(size = 7, color = "grey40"),
      axis.text.y = element_text(size = 7, color = "grey40")
    )

  graph_area

  # Accent rectangles drawn on top of the printed plot
  grid.rect(x = 0.012, y = 0.9, hjust = 1, vjust = 0, gp = gpar(fill = "#2A5783", lwd = 0, col = "transparent"))
  grid.rect(x = 1, y = 1 - 0.008, hjust = 1, vjust = 0,  gp = gpar(fill = "#2A5783", lwd = 0, col = "transparent"))

Data Animation

library(gganimate)
  library(gifski)
  library(transformr)
  
  freq(area$area_eng1)

  # Animated chart: one colour per series, revealed year by year ----

  # Line styles are keyed by series name. Unnamed `values` are assigned by
  # position (alphabetical order of the series present in the data), which
  # silently shifts every style if a series is missing or unexpected.
  ani_line_colours <- c(
    "Central Highlands" = "#BFB2FF",
    "Mekong River Delta" = "#FF8E32",
    "North Central Coast" = "#2A5783",
    "Northeast" = "#8F7EE5",
    "Northwest" = "#6551CC",
    "Red River Delta" = "#CC5800",
    "South Central Coast" = "#FFCA99",
    "Southeast" = "#993F00",
    "Whole areas" = "grey70"
  )
  ani_series <- names(ani_line_colours)

  # Every series is a solid line of width 1, except the dashed, thicker
  # "Whole areas" series.
  ani_line_types <- setNames(rep("solid", length(ani_series)), ani_series)
  ani_line_types["Whole areas"] <- "dashed"

  ani_line_widths <- setNames(rep(1, length(ani_series)), ani_series)
  ani_line_widths["Whole areas"] <- 2

  graph_area_ani <- area %>%
    ggplot(aes(x = year, y = avg_income, group = area_eng1, color = area_eng1)) +
    geom_line(aes(linetype = area_eng1, linewidth = area_eng1), show.legend = TRUE) +
    scale_linetype_manual(values = ani_line_types) +
    scale_linewidth_manual(values = ani_line_widths) +
    scale_color_manual(values = ani_line_colours) +
    theme_fivethirtyeight() +
    scale_y_continuous(limits = c(70, 250)) +
    scale_x_continuous(limits = c(2014, 2022)) +
    labs(
      title = "Gaps in Household Income (millions VND) by Area, 2014-2022",
      caption = "Data source: VHLSS | Author: Thao Bui",
      subtitle = "From 2014 to 2022, the Southeast and Red River Delta regions had the highest average incomes in the country, while the Northwest had the lowest.\nThe North Central Coast region notably climbed from the second lowest to the fourth position in average household income, surpassing the Central\nHighlands and Northeast regions."
    ) +
    theme(
      plot.margin = unit(c(0.7, 0.5, 0.5, 0.5), "cm"),
      panel.grid.major.x = element_line(color = "white", linewidth = 0.5),
      panel.grid.major.y = element_line(color = "white", linewidth = 0.5),
      panel.grid.minor.y = element_line(color = "white", linewidth = 0.1),
      plot.title = element_text(size = 20, color = "grey10"),
      plot.subtitle = element_text(size = 12, color = "grey40"),
      plot.caption = element_text(size = 11, color = "grey40"),
      axis.text.x = element_text(size = 12, color = "grey40"),
      axis.text.y = element_text(size = 12, color = "grey40"),
      legend.title = element_blank(),
      legend.position = "bottom",
      legend.text = element_text(size = 12, color = "grey40")
    )

  graph_area_ani

  # Add points and reveal the series progressively along the year axis
  graph.animation.1 <- graph_area_ani +
    geom_point() +
    transition_reveal(year)

  # Keep the rendered animation in a variable so that the file written below
  # is this render, not whatever animation happened to be rendered last.
  area_gif <- animate(graph.animation.1, height = 550, width = 910)
  area_gif

  # Make sure the output folder exists before saving (relative to the
  # working directory)
  dir.create("Result", showWarnings = FALSE)
  anim_save("Result/area.gif", animation = area_gif)