# From 2014 to 2022, the Southeast and Red River Delta regions had the highest average household incomes in the country, while the Northwest had the lowest. The North Central Coast region notably climbed from the second-lowest to the fourth position in average household income, surpassing the Central Highlands and Northeast regions.
# Session setup ----
# NOTE: the former `rm(list = ls())` call was removed on purpose. It emptied
# the global environment right before the script uses `add_zero()` and
# `append_df_5years_long`, neither of which is created in the visible part of
# this script, so clearing the workspace here removed objects the script
# needs. Restart R to get a clean session instead of deleting objects in code.

# Set working directory. The relative output path "Result/area.gif" used when
# the animation is saved is resolved against this folder.
setwd("D:/0 - My documents/TOOLS/R/Household Income Inequality")

# Pacman: load the packages used for import, wrangling and diagnostics
library("pacman")
pacman::p_load(
  rio,
  dplyr,
  summarytools,
  skimr,
  janitor,
  tidyverse,
  stringi,
  stringr
)
# Load data: province -> region lookup table (Stata .dta file read via rio)
vung <- import("D:/0 - My documents/TOOLS/R/Household Income Inequality/Data/Vung.dta")
# Open the data viewer only when someone is there to look at it; View() is
# pointless (and may fail) when the script runs in batch mode.
if (interactive()) {
  View(vung)
}

# Build ASCII/English province and region names
vung <-
  vung %>%
  rename(
    province_vie = tentinh,
    province_code = codetinh
  ) %>%
  mutate(
    # Strip Vietnamese diacritics so the names can be matched as plain ASCII
    province_eng = stri_trans_general(province_vie, "Latin-ASCII"),
    # Drop the "Tinh " (province) / "Thanh pho " (city) prefixes
    province_eng = str_replace_all(province_eng, "Tinh |Thanh pho ", ""),
    # Normalise " - " to "-" inside compound names
    province_eng = str_replace_all(province_eng, " - ", "-"),
    # NOTE(review): add_zero() is not defined in the visible part of this
    # script; presumably it left-pads the province code with zeros - confirm.
    province_code = add_zero(province_code),
    area_eng = stri_trans_general(area, "Latin-ASCII"),
    # Translate the region names; the left-hand strings must match the data
    # exactly (including their inconsistent capitalisation).
    area_eng1 = case_when(
      area_eng == "Bac Trung Bo" ~ "North Central Coast",
      area_eng == "Dong Bac bo" ~ "Northeast",
      area_eng == "Dong bang song Hong" ~ "Red River Delta",
      area_eng == "Nam Trung Bo" ~ "South Central Coast",
      area_eng == "Tay Bac Bo" ~ "Northwest",
      area_eng == "Tay Nam Bo" ~ "Mekong River Delta",
      area_eng == "Tay Nguyen" ~ "Central Highlands",
      area_eng == "Vung Dong Nam Bo" ~ "Southeast",
      TRUE ~ area_eng # Keep the original value if none of the conditions match
    )
  )

# Quick checks of the region columns. This used to be `skim(area)`, but no
# object called `area` exists at this point of the script - `area` is a column
# of `vung`, so it has to be skimmed through the data frame.
skim(vung, area)
freq(vung$area_eng)
names(vung)

# Keep only the join key and the English region name
vung <- vung %>%
  select(province_eng, area_eng1)
# Start from the long province-by-year income table.
# NOTE(review): `append_df_5years_long` is not created in the visible part of
# this script; it must already exist in the session.
append_df_5years_area <- append_df_5years_long

# Attach the region of every province.
# NOTE(review): this is a FULL join (the previous comment said "left join"):
# provinces present in only one of the two tables are kept, with NA in the
# columns coming from the other table. Behaviour is left unchanged; switch to
# left_join() if only provinces with income data should survive.
append_df_5years_area <-
  append_df_5years_area %>%
  full_join(vung, by = "province_eng")
if (interactive()) {
  View(append_df_5years_area)
}
names(append_df_5years_area)

# Keep the analysis columns and drop exact duplicate rows. The former
# group_by() in front of distinct() did not change which rows were kept and
# only left the table grouped, so it was removed.
append_df_5years_area <- append_df_5years_area %>%
  select(province_eng, area_eng1, avg_income, year) %>%
  distinct()

summary(append_df_5years_area$avg_income)
skim(append_df_5years_area$avg_income)

# Repair the single record whose income is on a different scale than the rest
# (presumably stored in VND instead of millions of VND - confirm with the data
# owner). near() with a tolerance replaces the exact `==` test, which can miss
# the record when the stored double carries more decimals than the literal.
append_df_5years_area <- append_df_5years_area %>%
  mutate(
    avg_income = ifelse(
      near(avg_income, 148316880.2, tol = 0.05),
      148.3,
      avg_income
    )
  )
# Regional series: mean of the province incomes per region and year.
# `.groups = "drop_last"` spells out what summarise() did implicitly before
# (the result stays grouped by area_eng1) and silences the grouping message.
area <- append_df_5years_area %>%
  group_by(area_eng1, year) %>%
  summarise(avg_income = mean(avg_income), .groups = "drop_last")

# Country-wide series.
# NOTE(review): this is the unweighted mean of the regional means, not a mean
# over provinces or households - confirm that this is the intended definition.
wholecountry <- area %>%
  group_by(year) %>%
  summarise(avg_income = mean(avg_income)) %>%
  mutate(area_eng1 = "Whole areas") %>%
  select(area_eng1, year, avg_income)
# Viewer only makes sense in an interactive session
if (interactive()) {
  View(wholecountry)
}

# Append the country-wide rows to the regional rows
area <- bind_rows(area, wholecountry)
summary(area$avg_income)

# Plotting packages. This call used to be glued onto the end of the summary()
# line above, which is a syntax error in R; it now sits on its own line.
library("pacman")
pacman::p_load(
  ggthemes,
  grid
)
# X-axis tick labels for the breaks 2014..2024: even years up to 2022 are
# printed, every other tick is left blank.
label_y <- vapply(
  2014:2024,
  function(yr) if (yr %% 2 == 0 && yr <= 2022) as.character(yr) else "",
  character(1)
)

# Rows of `data` for the year 2022 that belong to the given regions; they are
# used to place the region names at the right-hand end of the lines.
labels_2022 <- function(data, regions) {
  filter(data, year == 2022, area_eng1 %in% regions)
}

# Regions whose label sits directly at the end of the line
df_text <- labels_2022(area, c("Southeast", "Red River Delta", "Northwest"))
# Regions that get their own frame so the label can be nudged individually
df_text_mekong <- labels_2022(area, "Mekong River Delta")
df_text_whole <- labels_2022(area, "Whole areas")
df_text_north <- labels_2022(area, "North Central Coast")
df_text_south <- labels_2022(area, "South Central Coast")
df_text_central <- labels_2022(area, "Central Highlands")
df_text_northeast <- labels_2022(area, "Northeast")

# Diagnostics: summary of the plotting data and the list of region levels
skim(area)
freq(area$area_eng1)
# Static line chart: every region in light blue, North Central Coast
# highlighted in dark blue, the country-wide series as a thick dashed grey line.

# Region levels as they appear in `area$area_eng1`. The manual scales below are
# named by region, so the styling no longer depends on the alphabetical
# position of each level (the previous positional vectors silently shifted to
# the wrong region whenever a level was added, dropped or renamed).
static_regions <- c(
  "Central Highlands", "Mekong River Delta", "North Central Coast",
  "Northeast", "Northwest", "Red River Delta", "South Central Coast",
  "Southeast", "Whole areas"
)

static_linetype <- setNames(rep("solid", length(static_regions)), static_regions)
static_linetype["Whole areas"] <- "dashed"

static_linewidth <- setNames(rep(0.5, length(static_regions)), static_regions)
static_linewidth["North Central Coast"] <- 1
static_linewidth["Whole areas"] <- 2.5

static_color <- setNames(rep("#8DBBDC", length(static_regions)), static_regions)
static_color["North Central Coast"] <- "#2A5783"
static_color["Whole areas"] <- "grey80"

# One text layer that writes the region name just right of the 2022 point.
#   data: rows to label (one per region)
#   dy:   vertical nudge in income units, used to keep labels from overlapping
#   size: text size
region_label <- function(data, dy = 0, size = 2.75) {
  geom_text(
    data = data,
    aes(x = year + 0.05, y = avg_income + dy, label = area_eng1),
    size = size, hjust = 0, show.legend = FALSE
  )
}

graph_area <- area %>%
  ggplot(aes(x = year, y = avg_income, group = area_eng1, color = area_eng1)) +
  geom_line(aes(linetype = area_eng1, linewidth = area_eng1), show.legend = FALSE) +
  scale_linetype_manual(values = static_linetype) +
  scale_linewidth_manual(values = static_linewidth) +
  scale_color_manual(values = static_color) +
  theme_fivethirtyeight() +
  scale_y_continuous(limits = c(70, 250)) +
  # The axis runs to 2024 to leave room for the labels right of the lines
  scale_x_continuous(
    limits = c(2014, 2024), breaks = seq(2014, 2024, 1),
    labels = label_y, expand = c(0, 0)
  ) +
  # Add line labels
  region_label(df_text) +
  region_label(df_text_mekong, dy = 15) +
  region_label(df_text_whole, dy = 8, size = 4) +
  region_label(df_text_north, dy = -4) +
  region_label(df_text_south) +
  region_label(df_text_central, dy = -3) +
  region_label(df_text_northeast, dy = -8) +
  labs(
    title = "Gaps in Household Income (millions VND) by Area\n2014-2022",
    caption = "Data source: VHLSS | Author: Thao Bui",
    subtitle = "From 2014 to 2022, the Southeast and Red River Delta regions had the highest average incomes in the country,\nwhile the Northwest had the lowest.\nThe North Central Coast region notably climbed from the second lowest to the fourth position in average household\nincome, surpassing the Central Highlands and Northeast regions."
  ) +
  theme(
    plot.margin = unit(c(0.7, 0.5, 0.5, 0.5), "cm"),
    panel.grid.major.x = element_line(color = "white", linewidth = 0.5),
    panel.grid.major.y = element_line(color = "white", linewidth = 0.5),
    panel.grid.minor.y = element_line(color = "white", linewidth = 0.1),
    plot.title = element_text(size = 12, color = "#2A5783"),
    plot.subtitle = element_text(size = 7, color = "grey40"),
    plot.caption = element_text(size = 7, color = "grey40"),
    axis.text.x = element_text(size = 7, color = "grey40"),
    axis.text.y = element_text(size = 7, color = "grey40")
  )

# Print explicitly so the plot is also drawn when the file is source()d
print(graph_area)

# Decorative dark-blue accent bars drawn on top of the printed plot: a short
# vertical bar in the top-left corner and a thin strip along the top edge.
grid.rect(x = 0.012, y = 0.9, hjust = 1, vjust = 0, gp = gpar(fill = "#2A5783", lwd = 0, col = "transparent"))
grid.rect(x = 1, y = 1 - 0.008, hjust = 1, vjust = 0, gp = gpar(fill = "#2A5783", lwd = 0, col = "transparent"))

# Animation packages. library(gganimate) used to be glued onto the end of the
# second grid.rect() line, which is a syntax error in R.
library(gganimate)
library(gifski)
library(transformr)
# Animated version of the regional income chart, written to Result/area.gif.

# Diagnostic: list the region levels that the scales below have to cover
freq(area$area_eng1)

# Styling keyed by region name, so it does not depend on the alphabetical
# position of each level in the data (the previous positional vectors shifted
# to the wrong region whenever a level was added, dropped or renamed).
anim_color <- c(
  "Central Highlands" = "#BFB2FF",
  "Mekong River Delta" = "#FF8E32",
  "North Central Coast" = "#2A5783",
  "Northeast" = "#8F7EE5",
  "Northwest" = "#6551CC",
  "Red River Delta" = "#CC5800",
  "South Central Coast" = "#FFCA99",
  "Southeast" = "#993F00",
  "Whole areas" = "grey70"
)
# Every region is a solid line of width 1; the country-wide series is a
# thicker dashed line.
anim_linetype <- setNames(rep("solid", length(anim_color)), names(anim_color))
anim_linetype["Whole areas"] <- "dashed"
anim_linewidth <- setNames(rep(1, length(anim_color)), names(anim_color))
anim_linewidth["Whole areas"] <- 2

graph_area_ani <- area %>%
  ggplot(aes(x = year, y = avg_income, group = area_eng1, color = area_eng1)) +
  geom_line(aes(linetype = area_eng1, linewidth = area_eng1), show.legend = TRUE) +
  scale_linetype_manual(values = anim_linetype) +
  scale_linewidth_manual(values = anim_linewidth) +
  scale_color_manual(values = anim_color) +
  theme_fivethirtyeight() +
  scale_y_continuous(limits = c(70, 250)) +
  scale_x_continuous(limits = c(2014, 2022)) +
  labs(
    title = "Gaps in Household Income (millions VND) by Area, 2014-2022",
    caption = "Data source: VHLSS | Author: Thao Bui",
    subtitle = "From 2014 to 2022, the Southeast and Red River Delta regions had the highest average incomes in the country, while the Northwest had the lowest.\nThe North Central Coast region notably climbed from the second lowest to the fourth position in average household income, surpassing the Central\nHighlands and Northeast regions."
  ) +
  theme(
    plot.margin = unit(c(0.7, 0.5, 0.5, 0.5), "cm"),
    panel.grid.major.x = element_line(color = "white", linewidth = 0.5),
    panel.grid.major.y = element_line(color = "white", linewidth = 0.5),
    panel.grid.minor.y = element_line(color = "white", linewidth = 0.1),
    plot.title = element_text(size = 20, color = "grey10"),
    plot.subtitle = element_text(size = 12, color = "grey40"),
    plot.caption = element_text(size = 11, color = "grey40"),
    axis.text.x = element_text(size = 12, color = "grey40"),
    axis.text.y = element_text(size = 12, color = "grey40"),
    legend.title = element_blank(),
    legend.position = "bottom",
    legend.text = element_text(size = 12, color = "grey40")
  )

# Static preview of the chart that is about to be animated
print(graph_area_ani)

# Reveal the lines year by year, with a point marking the current position
graph.animation.1 <- graph_area_ani +
  geom_point() +
  transition_reveal(year)

# Render once, keep the result, and hand exactly that object to anim_save()
# instead of relying on the implicit last_animation().
area_animation <- animate(graph.animation.1, height = 550, width = 910)
print(area_animation)

# Make sure the output folder exists; anim_save() does not create it
dir.create("Result", showWarnings = FALSE)
anim_save("Result/area.gif", animation = area_animation)