Visualization - VHLSS
Topic: Gaps in Household Income by Province
Year: 2014 - 2022
Set working directory, and load necessary packages
## 2022 - HH
{
df_2022_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/SL_ThongTinHo.dta")
View(df_2022_ho3)
skim(df_2022_ho3)
df_2022_ho3 <- df_2022_ho3 %>%
select(idho, matinh, tentinh, thunhap)
# Function creates full code by adding zeros
add_zero <- function(x) {
tibble(x_text = as.character(x)) %>%
mutate(n_digits = str_count(x_text),
n_max = max(n_digits),
delta = n_max - n_digits,
pre = strrep("0", times = delta),
full_code = str_c(pre, x_text)) %>%
pull(full_code) %>%
return()
}
# Use the function:
df_2022_ho3 <- df_2022_ho3 %>% mutate(tinh_n = add_zero(matinh))
View(df_2022_ho3)
# Extract province info
df_province <- df_2022_ho3 %>%
select(matinh, tentinh) %>%
distinct()
View(df_province)
# Create some columns and relabel for provinces
df_province <-
df_province %>%
rename(province_vie = tentinh,
province_code = matinh) %>%
mutate(province_eng = stri_trans_general(province_vie, "Latin-ASCII")) %>%
mutate(province_eng = str_replace_all(province_eng, "Tinh |Thanh pho ","")) %>%
mutate(province_eng = str_replace_all(province_eng, " - ", "-")) %>%
mutate(province_code = add_zero(province_code))
View(df_province)
# Left join data
df_2022_ho3 <-
df_2022_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
View(df_2022_ho3)
# Caculate income
df_2022_ho3_income <- df_2022_ho3 %>%
filter(thunhap > 0) %>% # Person have income > 0
filter(province_eng !="test chuong trinh") %>%
group_by(idho, province_eng) %>% # Group by idho of HH
summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>%
ungroup()
View(df_2022_ho3_income)
df_2022_province_income <- df_2022_ho3_income %>%
filter(total_thunhap > 0) %>%
group_by(province_eng) %>%
summarise(avg_income = mean(total_thunhap),
th25 = quantile(total_thunhap, 0.25),
th50 = quantile(total_thunhap, 0.50),
th75 = quantile(total_thunhap, 0.75)) %>%
mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>%
ungroup() %>%
arrange(th50) %>%
mutate(province_eng = factor(province_eng, province_eng))
View(df_2022_province_income)
df_2022_province_income <- df_2022_province_income %>%
mutate(year = 2022)
}## 2020 - HH
{
df_2020_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/HO3_VHLSS2020.dta")
View(df_2020_ho3)
# Function creates full code by adding zeros
add_zero <- function(x) {
tibble(x_text = as.character(x)) %>%
mutate(n_digits = str_count(x_text),
n_max = max(n_digits),
delta = n_max - n_digits,
pre = strrep("0", times = delta),
full_code = str_c(pre, x_text)) %>%
pull(full_code) %>%
return()
}
# Use the function:
df_2020_ho3 <- df_2020_ho3 %>% mutate(tinh_n = add_zero(tinh))
freq(df_2020_ho3$tinh_n)
# Create h_code column:
df_2020_ho3 <- df_2020_ho3 %>%
mutate(tinh_n = add_zero(tinh),
huyen_n = add_zero(huyen),
xa_n = add_zero(xa),
diaban_n = add_zero(diaban),
hoso_n = add_zero(hoso)) %>%
mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n))
View(df_2020_ho3)
# Left join data
df_2020_ho3 <-
df_2020_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
View(df_2020_ho3)
# Caculate income
df_2020_ho3_income <- df_2020_ho3 %>%
filter(thunhap > 0) %>% # Person have income > 0
group_by(h_code, province_eng) %>% # Group by h_code of HH
summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>%
ungroup()
df_2020_province_income <- df_2020_ho3_income %>%
filter(total_thunhap > 0) %>%
group_by(province_eng) %>%
summarise(avg_income = mean(total_thunhap),
th25 = quantile(total_thunhap, 0.25),
th50 = quantile(total_thunhap, 0.50),
th75 = quantile(total_thunhap, 0.75)) %>%
mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>%
ungroup() %>%
arrange(th50) %>%
mutate(province_eng = factor(province_eng, province_eng))
df_2020_province_income <- df_2020_province_income %>%
mutate(year = 2020)
}## 2018 - HH
{
df_2018_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/HO3_VHLSS2018.dta")
# Function creates full code by adding zeros
add_zero <- function(x) {
tibble(x_text = as.character(x)) %>%
mutate(n_digits = str_count(x_text),
n_max = max(n_digits),
delta = n_max - n_digits,
pre = strrep("0", times = delta),
full_code = str_c(pre, x_text)) %>%
pull(full_code) %>%
return()
}
# Use the function:
df_2018_ho3 <- df_2018_ho3 %>% mutate(tinh_n = add_zero(tinh))
freq(df_2018_ho3$tinh_n)
# Create h_code column:
df_2018_ho3 <- df_2018_ho3 %>%
mutate(tinh_n = add_zero(tinh),
huyen_n = add_zero(huyen),
xa_n = add_zero(xa),
diaban_n = add_zero(diaban),
hoso_n = add_zero(hoso)) %>%
mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n))
View(df_2018_ho3)
# Left join data
df_2018_ho3 <-
df_2018_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
View(df_2018_ho3)
# Caculate income
df_2018_ho3_income <- df_2018_ho3 %>%
filter(thunhap > 0) %>% # Person have income > 0
group_by(h_code, province_eng) %>% # Group by h_code of HH
summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>%
ungroup()
df_2018_province_income <- df_2018_ho3_income %>%
filter(total_thunhap > 0) %>%
group_by(province_eng) %>%
summarise(avg_income = mean(total_thunhap),
th25 = quantile(total_thunhap, 0.25),
th50 = quantile(total_thunhap, 0.50),
th75 = quantile(total_thunhap, 0.75)) %>%
mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>%
ungroup() %>%
arrange(th50) %>%
mutate(province_eng = factor(province_eng, province_eng))
df_2018_province_income <- df_2018_province_income %>%
mutate(year = 2018)
}## 2016 - HH
{
df_2016_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/Ho3_VHLSS2016.dta")
# Function creates full code by adding zeros
add_zero <- function(x) {
tibble(x_text = as.character(x)) %>%
mutate(n_digits = str_count(x_text),
n_max = max(n_digits),
delta = n_max - n_digits,
pre = strrep("0", times = delta),
full_code = str_c(pre, x_text)) %>%
pull(full_code) %>%
return()
}
# Use the function:
df_2016_ho3 <- df_2016_ho3 %>% mutate(tinh_n = add_zero(tinh))
freq(df_2016_ho3$tinh_n)
# Create h_code column:
df_2016_ho3 <- df_2016_ho3 %>%
mutate(tinh_n = add_zero(tinh),
huyen_n = add_zero(huyen),
xa_n = add_zero(xa),
diaban_n = add_zero(diaban),
hoso_n = add_zero(hoso)) %>%
mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n))
View(df_2016_ho3)
# Left join data
df_2016_ho3 <-
df_2016_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
View(df_2016_ho3)
# Caculate income
df_2016_ho3_income <- df_2016_ho3 %>%
filter(thunhap > 0) %>% # Person have income > 0
group_by(h_code, province_eng) %>% # Group by h_code of HH
summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>%
ungroup()
df_2016_province_income <- df_2016_ho3_income %>%
filter(total_thunhap > 0) %>%
group_by(province_eng) %>%
summarise(avg_income = mean(total_thunhap),
th25 = quantile(total_thunhap, 0.25),
th50 = quantile(total_thunhap, 0.50),
th75 = quantile(total_thunhap, 0.75)) %>%
mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>%
ungroup() %>%
arrange(th50) %>%
mutate(province_eng = factor(province_eng, province_eng))
df_2016_province_income <- df_2016_province_income %>%
mutate(year = 2016)
}## 2014 - HH
{
df_2014_ho3 <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/Ho3_VHLSS2014.dta")
# Function creates full code by adding zeros
add_zero <- function(x) {
tibble(x_text = as.character(x)) %>%
mutate(n_digits = str_count(x_text),
n_max = max(n_digits),
delta = n_max - n_digits,
pre = strrep("0", times = delta),
full_code = str_c(pre, x_text)) %>%
pull(full_code) %>%
return()
}
# Use the function:
df_2014_ho3 <- df_2014_ho3 %>% mutate(tinh_n = add_zero(tinh))
freq(df_2014_ho3$tinh_n)
# Create h_code column:
df_2014_ho3 <- df_2014_ho3 %>%
mutate(tinh_n = add_zero(tinh),
huyen_n = add_zero(huyen),
xa_n = add_zero(xa),
diaban_n = add_zero(diaban),
hoso_n = add_zero(hoso)) %>%
mutate(h_code = str_c(tinh_n, huyen_n, xa_n, diaban_n, hoso_n))
View(df_2014_ho3)
# Left join data
df_2014_ho3 <-
df_2014_ho3 %>% left_join(df_province, by = c("tinh_n" = "province_code"))
View(df_2014_ho3)
# Caculate income
df_2014_ho3_income <- df_2014_ho3 %>%
filter(thunhap > 0) %>% # Person have income > 0
group_by(h_code, province_eng) %>% # Group by h_code of HH
summarise(total_thunhap = sum(thunhap, na.rm = TRUE)) %>%
ungroup()
df_2014_province_income <- df_2014_ho3_income %>%
filter(total_thunhap > 0) %>%
group_by(province_eng) %>%
summarise(avg_income = mean(total_thunhap),
th25 = quantile(total_thunhap, 0.25),
th50 = quantile(total_thunhap, 0.50),
th75 = quantile(total_thunhap, 0.75)) %>%
mutate_if(is.numeric, function(x) {round(x / 1000, 1)}) %>%
ungroup() %>%
arrange(th50) %>%
mutate(province_eng = factor(province_eng, province_eng))
df_2014_province_income <- df_2014_province_income %>%
mutate(year = 2014)
View(df_2014_province_income)
} # Change wide to long
append_df_5years_long <- append_df_5years %>%
pivot_longer(
cols = th25:th75,
names_to = "Quantile",
values_to = "Values"
)
skim(append_df_5years_long)
# Convert the numeric column to integers
append_df_5years_long$year <- as.integer(append_df_5years_long$year)
# Mutate new variable
append_df_5years_long <- append_df_5years_long %>%
mutate(
Quartiles = case_when(
Quantile == "th25" ~ "Q1",
Quantile == "th50" ~ "Q2",
Quantile == "th75" ~ "Q3",
TRUE ~ Quantile # Keep the original value if none of the conditions match
)
) # Load packages
library("pacman")
pacman::p_load(
gganimate,
ggplot2,
dplyr,
gapminder,
ggthemes,
gifski,
showtext,
transformr)
# Select Ubuntu Condensed font:
showtext.auto()
my_font <- "Roboto Condensed"
font_add_google(name = my_font, family = my_font)
graph <- ggplot(append_df_5years_long,
aes(x= Values,
y = province_eng))+
geom_line(color = "grey", linewidth = 1.5,alpha = 0.5)+
geom_point(aes(color = Quartiles), size = 2)+
scale_x_continuous(expand = c(0, 0), limits = c(0, 400), breaks = seq(0, 400, 50), position = "top")+
scale_color_manual(values = c("#6E9FC6", "#AE123A", "#2A5783"))+
theme_minimal()+
labs(title = "Gaps in houshold Income (millions VND) by Province",
caption = "Data Source: VHLSS")+
theme(plot.margin = unit(c(0.7, 1, 0.5, 0.5), "cm"))+
theme(legend.position="bottom")+
theme(axis.title.y = element_blank()) +
theme(axis.text.y = element_text(size = 9)) +
theme(axis.text.x = element_text(size = 9)) +
theme(plot.title = element_text(size = 13, face = "bold", color = "grey10"))+
theme(axis.title.y = element_blank())+
theme(axis.title.x = element_blank())
graph