# Load packages ----
# pacman::p_load() installs any missing package and then attaches it.
# requireNamespace() only checks that pacman is installed; it is attached
# explicitly afterwards so the session ends up in the same state whether or
# not pacman had to be installed first.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# janitor is only ever called as janitor::clean_names(), so it needs to be
# installed but not attached.
if (!requireNamespace("janitor", quietly = TRUE)) {
  install.packages("janitor")
}

pacman::p_load(tidyverse, zoo, highcharter)


# Read in the population data ----
# The CSV location defaults to the original local download. Set the
# POPULATION_CSV environment variable to read a copy stored elsewhere.
population_csv <- Sys.getenv(
  "POPULATION_CSV",
  unset = "/Users/kendavidn/Dropbox/Mac (2)/Downloads/population.csv"
)
if (!file.exists(population_csv)) {
  stop("Population CSV not found: ", population_csv, call. = FALSE)
}

# Column types are pinned to what readr guessed on a previous run
# (58252 rows, 4 columns: Entity and Code as character; Year and
# `Population (historical estimates)` as double). Pinning them also silences
# the column-specification message readr prints when it has to guess.
pop <-
  read_csv(
    population_csv,
    col_types = cols(
      Entity = col_character(),
      Code = col_character(),
      Year = col_double(),
      `Population (historical estimates)` = col_double()
    )
  ) %>%
  # The rest of the script refers to the cleaned column names:
  # entity, code, year, population_historical_estimates
  janitor::clean_names()
# Annual population growth rate per entity ----
# Result: one row per entity and year with the interpolated population, the
# year-on-year change (growth), that change as a percentage of the previous
# year's population (growth_rate), and a 20-year trailing rolling mean of
# that percentage (growth_rate_smooth).
annual_growth_rate <-
  pop %>%
  # drop rows where code is empty, and keep only years after year 0
  filter(!is.na(code), year > 0) %>%
  rename(population = population_historical_estimates) %>%
  select(-code) %>%
  group_by(entity) %>%
  # expand each entity so it has a row for every year from 100 to 2020
  complete(year = seq(100, 2020, 1)) %>%
  arrange(entity, year) %>%
  # For missing years, interpolate linearly. That is, if for a specific
  # country, say Afghanistan, we have data for year 100 (200,000 people) and
  # year 300 (300,000 people) but NAs in between, we estimate the population
  # in years 101, 102, 103, etc. Interpolation is keyed on `year` rather than
  # on row position, so it stays correct even if an entity's rows are not
  # evenly spaced in time.
  mutate(population = na.approx(population, x = year, na.rm = FALSE)) %>%
  # year-on-year growth in absolute terms
  mutate(growth = population - lag(population)) %>%
  # growth as a percentage of the previous year's population
  mutate(growth_rate = 100 * growth / lag(population)) %>%
  # Smooth the growth rate with a rolling mean over the past 20 years.
  # align = "right" makes each window end at the current year; fill = NA
  # pads the start of each series, where no full window is available yet.
  mutate(
    growth_rate_smooth = rollmean(
      growth_rate,
      k = 20,
      fill = NA,
      align = "right"
    )
  ) %>%
  # the per-entity grouping is only needed for the steps above
  ungroup()

  
# Pick the key countries to plot, grouped roughly by region:
# India, China; USA, Mexico; Brazil, Argentina; Russia, Turkey;
# Nigeria, Egypt; United Kingdom, France, Italy
select_countries <-
  annual_growth_rate %>%
  filter(
    entity %in% c(
      "India", "China",
      "United States", "Mexico",
      "Brazil", "Argentina",
      "Russia", "Turkey",
      "Nigeria", "Egypt",
      "United Kingdom", "France", "Italy"
    )
  )

# Static line chart of the smoothed growth rate, one coloured line per country.
select_countries %>%
  ggplot(aes(x = year, y = growth_rate_smooth, color = entity)) +
  # growth_rate_smooth is NA at the start of every series (the rolling window
  # is not full yet); na.rm = TRUE drops those rows without the
  # "Removed ... rows containing missing values" warning.
  geom_line(na.rm = TRUE) +
  labs(
    title = "Annual population growth rate",
    subtitle = "For key countries in each continent",
    x = "Year",
    # growth_rate is computed as 100 * growth / previous population
    y = "Annual growth rate (%)"
  )

# Interactive highcharter version of the same chart:
# year on the x axis, smoothed growth rate on the y axis, one line per entity.
hchart(
  select_countries,
  "line",
  hcaes(x = year, y = growth_rate_smooth, group = entity)
)

# Top 30 most populous countries in 2020

# The 30 entities with the largest population in 2020.
top_30 <- pop %>%
  # annual_growth_rate drops rows without a code, so drop them here too;
  # otherwise such entities would take up top-30 slots but never show up in
  # the chart built from annual_growth_rate.
  filter(!is.na(code)) %>%
  filter(year == 2020) %>%
  arrange(desc(population_historical_estimates)) %>%
  head(30)
# NOTE(review): if the file also contains aggregate rows that do carry a code
# (e.g. a world total), they are still ranked here -- confirm against the data
# and exclude them if present.


# Interactive line chart of the smoothed growth rate, restricted to the
# entities listed in top_30; one line per entity.
hchart(
  filter(annual_growth_rate, entity %in% top_30[["entity"]]),
  "line",
  hcaes(x = year, y = growth_rate_smooth, group = entity)
)