Population Pyramids

R Codes for Collecting Data and Visualization

#==================================================================================
#   Scrapping population data with inputs are: 
#   1. Given year, 
#   2. Country code selected from https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
#==================================================================================

# Clear workspace: 
rm(list = ls())

# Load some packages for scrapping data and data manipulation: 
library(rvest)
library(magrittr)
library(tidyverse)

# A function for collecting population data: 

get_population <- function(countryCode_selected, year_selected) {
  
  base_url <- "https://www.census.gov/data-tools/demo/idb/region.php?N=%20Results%20&T=10&A=separate&RT=0&Y="
  link <- paste0(paste0(paste0(base_url, year_selected), "&R=-1&C="), countryCode_selected)
  
  link %>% 
    read_html() -> html_content  
  
  html_content %>% 
    html_table(fill = TRUE) %>% 
    .[[1]] -> pop_df
  
  names(pop_df) <- str_replace_all(names(pop_df), " ", "_")
  
  country_name <- html_content %>% 
    html_nodes('.query_source a') %>% 
    html_text() 
  
  if (length(country_name) == 2) {
    country_name <- country_name[2]
  }
  
  
  final_df <- pop_df %>% 
    select(-Year, -Age, -contains("Percent"), -Sex_Ratio) %>% 
    mutate_all(function(x) {str_replace_all(x, "[^0-9]", "") %>% as.numeric()}) %>% 
    mutate(Year = pop_df$Year, Country = country_name[1], Age = pop_df$Age) %>% 
    select(Year, Country, Age, everything()) %>% 
    mutate(Percent_Both_Sexes = Both_Sexes_Population / Both_Sexes_Population[1], 
           Percent_Male = Male_Population / Male_Population[1], 
           Percent_Female = Female_Population / Female_Population[1], 
           Sex_Ratio = Male_Population / Female_Population, 
           Age = case_when(Age == "0-4" ~ "00-04", Age == "5-9" ~ "05-09", TRUE ~ Age))
  
  return(final_df)
  
}


# Function for getting population data for Vietnam by given year: 

pop_data_ByYear <- function(year_selected) {
  get_population("VM", year_selected) %>% 
    slice(-c(1, 20:22)) %>% 
    select(Year, Age, Male_Population, Female_Population) %>% 
    mutate(Female_Population = -1*Female_Population) %>% 
    gather(Gender, Value, -Age, -Year) %>% 
    return()
}


# Use the function for getting data: 
lapply(c(2018, 2005, 2000, 1995), pop_data_ByYear) -> total_list
total_df <- do.call("rbind", total_list)

#=========================
#  Data Visualization
#=========================

library(extrafont)

# Colors and font selected: 
my_colors <- c("#2E74C0", "#CB454A")
my_font <- "Roboto Condensed"


#---------------------------------
#   Version 1: Using geom_col()
#---------------------------------

total_df %>% 
  ggplot(aes(Age, Value, fill = Gender)) + 
  geom_col(position = "stack", width = 0.85) + 
  scale_y_continuous(expand = c(0, 0.5)) + 
  facet_wrap(~ Year) + 
  coord_flip() + 
  scale_y_continuous(breaks = seq(-5000000, 5000000, 1000000), 
                     limits = c(-5000000, 5000000), 
                     labels = c(paste0(seq(5, 0, -1), "M"), paste0(1:5, "M"))) + 
  theme_minimal() + 
  scale_fill_manual(values = my_colors, name = "", labels = c("Female", "Male")) + 
  # guides(fill = guide_legend(reverse = TRUE)) + 
  theme(panel.grid.major.x = element_line(linetype = "dotted", size = 0.2, color = "grey40")) + 
  theme(panel.grid.major.y = element_blank()) + 
  theme(panel.grid.minor.y = element_blank()) + 
  theme(panel.grid.minor.x = element_blank()) + 
  theme(legend.position = "top") + 
  theme(axis.text.y = element_text(size = 10, family = my_font)) + 
  theme(axis.text.x = element_text(size = 10, family = my_font)) + 
  theme(plot.title = element_text(family = my_font, size = 28)) + 
  theme(plot.subtitle = element_text(family = my_font, size = 13, color = "gray40")) + 
  theme(plot.caption = element_text(family = my_font, size = 12, colour = "grey40", face = "italic")) + 
  theme(plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")) + 
  theme(legend.text = element_text(size = 12, face = "bold", color = "grey30", family = my_font)) + 
  theme(strip.text = element_text(color = "grey20", size = 13, face = "bold", family = my_font)) +  
  labs(x = NULL, y = NULL, 
       title = "Population Pyramids of Vietnam: 1995 - 2018",
       subtitle = "The age-sex structure of a country's population and may provide insights about political and social stability,\nas well as economic development. Countries with young populations need to invest more in schools,\nwhile countries with older populations need to invest more in the health sector.",
       caption = "Data Source: https://www.census.gov")


#-----------------------------------------
#   Version 2: Using geom_linerange()
#-----------------------------------------

total_df %>% 
  ggplot(aes(x = Age, color = Gender))+
  geom_linerange(data = total_df %>% filter(Gender == "Male_Population"),
                 aes(ymin = -0.3, ymax = -0.3 + Value), size = 3.5, alpha = 1) +
  geom_linerange(data = total_df %>% filter(Gender != "Male_Population"), 
                 aes(ymin = 0.3, ymax = 0.3 + Value), size = 3.5, alpha = 1) + 
  geom_label(aes(x = Age, y = 0, label = Age), inherit.aes = TRUE, family = my_font, 
             size = 3.5, label.padding = unit(0, "lines"), label.size = 0, 
             label.r = unit(0.0, "lines"), fill = "#f5f5f2", alpha = 1, color = "gray20") + 
  theme_minimal() + 
  coord_flip() + 
  facet_wrap( ~ Year) + 
  scale_y_continuous(breaks = seq(-5000000, 5000000, 1000000), 
                     limits = c(-5000000, 5000000), 
                     labels = c(paste0(seq(5, 0, -1), "M"), paste0(1:5, "M"))) + 
  scale_color_manual(name = "", values = c(Female_Population = "#3E606F", Male_Population = "#8C3F4D"), 
                     labels = c("Male", "Female")) + 
  theme(panel.grid.major.y = element_blank()) + 
  theme(plot.title = element_text(face = "bold", size = 28, family = my_font, margin = margin(b = 10), hjust = 0, color = "grey20")) + 
  theme(plot.subtitle = element_text(size = 14, margin = margin(b = 20), hjust = 0, family = my_font, color = "grey30")) + 
  theme(plot.caption = element_text(size = 13, color = "grey50", family = my_font)) + 
  theme(axis.text.y = element_blank()) + 
  theme(axis.text.x = element_text(size = 12, color = "grey20", family = my_font, face = "bold")) + 
  theme(plot.background = element_rect(fill = "#EFF2F4", color = NA)) + 
  theme(panel.grid.major.x = element_line(linetype = "dotted", size = 0.2, color = "grey40")) + 
  theme(plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")) + 
  theme(legend.position = "top") + 
  theme(legend.text = element_text(size = 13, face = "bold", color = "grey30", family = my_font)) + 
  theme(legend.text.align = 1) + 
  theme(strip.text = element_text(color = "grey20", size = 14, face = "bold", family = my_font)) + 
  # theme(strip.text = element_text(color = "grey20", size = 14, face = "bold", hjust = 0.026) + 
  labs(x = NULL, y = NULL, 
       title = "Population Pyramids of Vietnam: 1995 - 2018",
       subtitle = "The age-sex structure of a country's population and may provide insights about political and social stability,\nas well as economic development. Countries with young populations need to invest more in schools,\nwhile countries with older populations need to invest more in the health sector.",
       caption = "Data Source: https://www.census.gov")
---
title: "Population Structure of Vietnam: 1995 - 2018"
author: "Nguyen Chi Dung"
subtitle: "Daily Graph Series"
output:
  html_document:
    code_download: yes
    # code_folding: hide
    highlight: zenburn
    theme: flatly
    toc: yes
    toc_float: yes
  word_document:
    toc: yes
---

```{r setup,include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.retina=2)
```

# Population Pyramids

![](C:\\Users\\Zbook\\Desktop\\pic\\pid1.jpg)

![](C:\\Users\\Zbook\\Desktop\\pic\\pid2.jpg)

# R Codes for Collecting Data and Visualization

```{r, eval=FALSE}

#==================================================================================
#   Scrapping population data with inputs are: 
#   1. Given year, 
#   2. Country code selected from https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
#==================================================================================

# Clear workspace: 
rm(list = ls())

# Load some packages for scrapping data and data manipulation: 
library(rvest)
library(magrittr)
library(tidyverse)

# A function for collecting population data: 

get_population <- function(countryCode_selected, year_selected) {
  
  base_url <- "https://www.census.gov/data-tools/demo/idb/region.php?N=%20Results%20&T=10&A=separate&RT=0&Y="
  link <- paste0(paste0(paste0(base_url, year_selected), "&R=-1&C="), countryCode_selected)
  
  link %>% 
    read_html() -> html_content  
  
  html_content %>% 
    html_table(fill = TRUE) %>% 
    .[[1]] -> pop_df
  
  names(pop_df) <- str_replace_all(names(pop_df), " ", "_")
  
  country_name <- html_content %>% 
    html_nodes('.query_source a') %>% 
    html_text() 
  
  if (length(country_name) == 2) {
    country_name <- country_name[2]
  }
  
  
  final_df <- pop_df %>% 
    select(-Year, -Age, -contains("Percent"), -Sex_Ratio) %>% 
    mutate_all(function(x) {str_replace_all(x, "[^0-9]", "") %>% as.numeric()}) %>% 
    mutate(Year = pop_df$Year, Country = country_name[1], Age = pop_df$Age) %>% 
    select(Year, Country, Age, everything()) %>% 
    mutate(Percent_Both_Sexes = Both_Sexes_Population / Both_Sexes_Population[1], 
           Percent_Male = Male_Population / Male_Population[1], 
           Percent_Female = Female_Population / Female_Population[1], 
           Sex_Ratio = Male_Population / Female_Population, 
           Age = case_when(Age == "0-4" ~ "00-04", Age == "5-9" ~ "05-09", TRUE ~ Age))
  
  return(final_df)
  
}


# Function for getting population data for Vietnam by given year: 

pop_data_ByYear <- function(year_selected) {
  get_population("VM", year_selected) %>% 
    slice(-c(1, 20:22)) %>% 
    select(Year, Age, Male_Population, Female_Population) %>% 
    mutate(Female_Population = -1*Female_Population) %>% 
    gather(Gender, Value, -Age, -Year) %>% 
    return()
}


# Use the function for getting data: 
lapply(c(2018, 2005, 2000, 1995), pop_data_ByYear) -> total_list
total_df <- do.call("rbind", total_list)

#=========================
#  Data Visualization
#=========================

library(extrafont)

# Colors and font selected: 
my_colors <- c("#2E74C0", "#CB454A")
my_font <- "Roboto Condensed"


#---------------------------------
#   Version 1: Using geom_col()
#---------------------------------

total_df %>% 
  ggplot(aes(Age, Value, fill = Gender)) + 
  geom_col(position = "stack", width = 0.85) + 
  scale_y_continuous(expand = c(0, 0.5)) + 
  facet_wrap(~ Year) + 
  coord_flip() + 
  scale_y_continuous(breaks = seq(-5000000, 5000000, 1000000), 
                     limits = c(-5000000, 5000000), 
                     labels = c(paste0(seq(5, 0, -1), "M"), paste0(1:5, "M"))) + 
  theme_minimal() + 
  scale_fill_manual(values = my_colors, name = "", labels = c("Female", "Male")) + 
  # guides(fill = guide_legend(reverse = TRUE)) + 
  theme(panel.grid.major.x = element_line(linetype = "dotted", size = 0.2, color = "grey40")) + 
  theme(panel.grid.major.y = element_blank()) + 
  theme(panel.grid.minor.y = element_blank()) + 
  theme(panel.grid.minor.x = element_blank()) + 
  theme(legend.position = "top") + 
  theme(axis.text.y = element_text(size = 10, family = my_font)) + 
  theme(axis.text.x = element_text(size = 10, family = my_font)) + 
  theme(plot.title = element_text(family = my_font, size = 28)) + 
  theme(plot.subtitle = element_text(family = my_font, size = 13, color = "gray40")) + 
  theme(plot.caption = element_text(family = my_font, size = 12, colour = "grey40", face = "italic")) + 
  theme(plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")) + 
  theme(legend.text = element_text(size = 12, face = "bold", color = "grey30", family = my_font)) + 
  theme(strip.text = element_text(color = "grey20", size = 13, face = "bold", family = my_font)) +  
  labs(x = NULL, y = NULL, 
       title = "Population Pyramids of Vietnam: 1995 - 2018",
       subtitle = "The age-sex structure of a country's population and may provide insights about political and social stability,\nas well as economic development. Countries with young populations need to invest more in schools,\nwhile countries with older populations need to invest more in the health sector.",
       caption = "Data Source: https://www.census.gov")


#-----------------------------------------
#   Version 2: Using geom_linerange()
#-----------------------------------------

total_df %>% 
  ggplot(aes(x = Age, color = Gender))+
  geom_linerange(data = total_df %>% filter(Gender == "Male_Population"),
                 aes(ymin = -0.3, ymax = -0.3 + Value), size = 3.5, alpha = 1) +
  geom_linerange(data = total_df %>% filter(Gender != "Male_Population"), 
                 aes(ymin = 0.3, ymax = 0.3 + Value), size = 3.5, alpha = 1) + 
  geom_label(aes(x = Age, y = 0, label = Age), inherit.aes = TRUE, family = my_font, 
             size = 3.5, label.padding = unit(0, "lines"), label.size = 0, 
             label.r = unit(0.0, "lines"), fill = "#f5f5f2", alpha = 1, color = "gray20") + 
  theme_minimal() + 
  coord_flip() + 
  facet_wrap( ~ Year) + 
  scale_y_continuous(breaks = seq(-5000000, 5000000, 1000000), 
                     limits = c(-5000000, 5000000), 
                     labels = c(paste0(seq(5, 0, -1), "M"), paste0(1:5, "M"))) + 
  scale_color_manual(name = "", values = c(Female_Population = "#3E606F", Male_Population = "#8C3F4D"), 
                     labels = c("Male", "Female")) + 
  theme(panel.grid.major.y = element_blank()) + 
  theme(plot.title = element_text(face = "bold", size = 28, family = my_font, margin = margin(b = 10), hjust = 0, color = "grey20")) + 
  theme(plot.subtitle = element_text(size = 14, margin = margin(b = 20), hjust = 0, family = my_font, color = "grey30")) + 
  theme(plot.caption = element_text(size = 13, color = "grey50", family = my_font)) + 
  theme(axis.text.y = element_blank()) + 
  theme(axis.text.x = element_text(size = 12, color = "grey20", family = my_font, face = "bold")) + 
  theme(plot.background = element_rect(fill = "#EFF2F4", color = NA)) + 
  theme(panel.grid.major.x = element_line(linetype = "dotted", size = 0.2, color = "grey40")) + 
  theme(plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm")) + 
  theme(legend.position = "top") + 
  theme(legend.text = element_text(size = 13, face = "bold", color = "grey30", family = my_font)) + 
  theme(legend.text.align = 1) + 
  theme(strip.text = element_text(color = "grey20", size = 14, face = "bold", family = my_font)) + 
  # theme(strip.text = element_text(color = "grey20", size = 14, face = "bold", hjust = 0.026) + 
  labs(x = NULL, y = NULL, 
       title = "Population Pyramids of Vietnam: 1995 - 2018",
       subtitle = "The age-sex structure of a country's population and may provide insights about political and social stability,\nas well as economic development. Countries with young populations need to invest more in schools,\nwhile countries with older populations need to invest more in the health sector.",
       caption = "Data Source: https://www.census.gov")
  


```

