#===========================
# Prepare data for plotting
#===========================
# Import data:
# NOTE: the former `rm(list = ls())` call was removed on purpose. It deletes
# every object in the caller's global environment (destructive when this file
# is source()'d into a live session) yet does not give a clean session:
# attached packages and options() stay as they were. Restart R instead.
library(readstata13)

# Raw extract, read from a path relative to the working directory.
# The dataset has 2368167 observations and 92 columns.
ipumsi_raw <- read.dta13("ipumsi_00002.dta")
# Prepare data for plotting:
library(tidyverse)

# 1. Convert every factor column to plain character.
# 2. Strip any literal "+" from the age labels (presumably the marker of an
#    open-ended top age code - confirm against the codebook).
# 3. Keep a numeric copy of the age for binning; labels that are still not
#    numeric after step 2 become NA (with a coercion warning).
ipumsi <- ipumsi_raw %>%
  mutate_if(is.factor, as.character) %>%
  mutate(
    vn1999a_age = str_replace_all(vn1999a_age, "\\+", ""),
    age_num = as.numeric(vn1999a_age)
  )
# Age groups (reference: https://rpubs.com/chidungkt/505486):
# Nineteen labels in display order: eighteen zero-padded five-year bands
# ("00-04", "05-09", ..., "85-89") followed by the open-ended "90+".
age_grouped <- c(
  sprintf("%02d-%02d", seq(0, 85, 5), seq(4, 89, 5)),
  "90+"
)

# Assign each person to a band. Both bounds of every band are inclusive, so a
# value that matches no band (NA, or a non-integer age lying between two
# bands) is left as NA in `age_group`.
ipumsi <- ipumsi %>%
  mutate(
    age_group = case_when(
      age_num <= 4             ~ age_grouped[1],
      between(age_num, 5, 9)   ~ age_grouped[2],
      between(age_num, 10, 14) ~ age_grouped[3],
      between(age_num, 15, 19) ~ age_grouped[4],
      between(age_num, 20, 24) ~ age_grouped[5],
      between(age_num, 25, 29) ~ age_grouped[6],
      between(age_num, 30, 34) ~ age_grouped[7],
      between(age_num, 35, 39) ~ age_grouped[8],
      between(age_num, 40, 44) ~ age_grouped[9],
      between(age_num, 45, 49) ~ age_grouped[10],
      between(age_num, 50, 54) ~ age_grouped[11],
      between(age_num, 55, 59) ~ age_grouped[12],
      between(age_num, 60, 64) ~ age_grouped[13],
      between(age_num, 65, 69) ~ age_grouped[14],
      between(age_num, 70, 74) ~ age_grouped[15],
      between(age_num, 75, 79) ~ age_grouped[16],
      between(age_num, 80, 84) ~ age_grouped[17],
      between(age_num, 85, 89) ~ age_grouped[18],
      age_num >= 90            ~ age_grouped[19]
    )
  )
# Head count per sex and age band, shaped for a pyramid plot: one row per
# (sex, band), with `n` holding the count.
df_age_group <- ipumsi %>%
  # Rows whose age could not be binned carry NA in `age_group`; if kept they
  # would be counted and drawn as a spurious "NA" bar on the age axis.
  filter(!is.na(age_group)) %>%
  count(vn1999a_sex, age_group) %>%
  mutate(
    # Ordered factor so the bands plot youngest-to-oldest, not alphabetically.
    age_group = factor(age_group, levels = age_grouped),
    # Negate the female counts so their bars extend to the other side of zero.
    n = case_when(vn1999a_sex == "female" ~ -n, TRUE ~ n)
  )
#======================
# Data Visualization
#======================
# Two fill colours for the bars (used by scale_fill_manual() in the plot).
my_colors <- c("#3E606F", "#8C3F4D")

# Graph font: "Roboto Condensed" fetched from Google Fonts and registered
# under the family name stored in `my_font`; showtext_auto() then switches on
# showtext rendering for subsequent plots.
# Reference: https://rpubs.com/chidungkt/744221
library(showtext)
my_font <- "roboto"
font_add_google(name = "Roboto Condensed", family = my_font)
showtext_auto()
# Tick labels for the count axis: magnitudes in thousands, mirrored around
# zero ("150K" ... "0K" ... "150K") so both sides read as positive counts.
label_x <- paste0(abs(seq(-150, 150, 50)), "K")
# Draft of the population pyramid: one horizontal bar per sex and age band,
# with the negated (female) counts on one side of zero and the rest on the
# other.
df_age_group %>%
  ggplot(aes(age_group, n, fill = vn1999a_sex)) +
  geom_col() +
  # Zoom through the coordinate system instead of using scale limits: limits
  # set on scale_y_continuous() censor out-of-range values, so a bar longer
  # than 150K would be dropped entirely rather than clipped at the panel edge.
  coord_flip(ylim = c(-150000, 150000)) +
  scale_y_continuous(breaks = seq(-150000, 150000, 50000), labels = label_x) +
  # Labels are applied positionally; this assumes the sex values sort as
  # female, then male - confirm against the data.
  scale_fill_manual(values = my_colors, name = "", labels = c("Female", "Male")) +
  theme_minimal() +
  theme(
    # Keep only faint dotted guide lines at the major count breaks.
    # NOTE: `size` in element_line() is deprecated in favour of `linewidth`
    # from ggplot2 3.4.0; kept here so older installations still run.
    panel.grid.major.x = element_line(linetype = "dotted", size = 0.2, color = "grey40"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 10, face = "bold", color = "grey30", family = my_font),
    axis.text = element_text(size = 11, family = my_font),
    plot.title = element_text(family = my_font, size = 20),
    plot.subtitle = element_text(family = my_font, size = 12, color = "gray30"),
    plot.caption = element_text(family = my_font, size = 9, colour = "grey30", face = "italic"),
    plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "An Approximation of Population Pyramids of Vietnam in 1999",
    subtitle = "A population pyramid illustrates the age-sex structure of a country's population and may\nprovide insights about political and social stability, as well as economic development.",
    caption = "Data Source: Minnesota Population Center"
  )